2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
62 #include <asm/uaccess.h>
65 #include <linux/sysctl.h>
68 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
69 const struct in6_addr *dest);
70 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
71 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
72 static unsigned int ip6_mtu(const struct dst_entry *dst);
73 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
74 static void ip6_dst_destroy(struct dst_entry *);
75 static void ip6_dst_ifdown(struct dst_entry *,
76 struct net_device *dev, int how);
77 static int ip6_dst_gc(struct dst_ops *ops);
79 static int ip6_pkt_discard(struct sk_buff *skb);
80 static int ip6_pkt_discard_out(struct sk_buff *skb);
81 static void ip6_link_failure(struct sk_buff *skb);
82 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
83 struct sk_buff *skb, u32 mtu);
84 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
87 #ifdef CONFIG_IPV6_ROUTE_INFO
88 static struct rt6_info *rt6_add_route_info(struct net_device *dev,
89 const struct in6_addr *prefix, int prefixlen,
90 const struct in6_addr *gwaddr, unsigned int pref);
91 static struct rt6_info *rt6_get_route_info(struct net_device *dev,
92 const struct in6_addr *prefix, int prefixlen,
93 const struct in6_addr *gwaddr);
96 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
98 struct rt6_info *rt = (struct rt6_info *) dst;
99 struct inet_peer *peer;
102 if (!(rt->dst.flags & DST_HOST))
105 peer = rt6_get_peer_create(rt);
107 u32 *old_p = __DST_METRICS_PTR(old);
108 unsigned long prev, new;
111 if (inet_metrics_new(peer))
112 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
114 new = (unsigned long) p;
115 prev = cmpxchg(&dst->_metrics, old, new);
118 p = __DST_METRICS_PTR(prev);
119 if (prev & DST_METRICS_READ_ONLY)
126 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
130 struct in6_addr *p = &rt->rt6i_gateway;
132 if (!ipv6_addr_any(p))
133 return (const void *) p;
135 return &ipv6_hdr(skb)->daddr;
139 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
143 struct rt6_info *rt = (struct rt6_info *) dst;
146 daddr = choose_neigh_daddr(rt, skb, daddr);
147 n = __ipv6_neigh_lookup(dst->dev, daddr);
150 return neigh_create(&nd_tbl, daddr, dst->dev);
153 static struct dst_ops ip6_dst_ops_template = {
155 .protocol = cpu_to_be16(ETH_P_IPV6),
158 .check = ip6_dst_check,
159 .default_advmss = ip6_default_advmss,
161 .cow_metrics = ipv6_cow_metrics,
162 .destroy = ip6_dst_destroy,
163 .ifdown = ip6_dst_ifdown,
164 .negative_advice = ip6_negative_advice,
165 .link_failure = ip6_link_failure,
166 .update_pmtu = ip6_rt_update_pmtu,
167 .redirect = rt6_do_redirect,
168 .local_out = __ip6_local_out,
169 .neigh_lookup = ip6_neigh_lookup,
/*
 * MTU callback for blackhole dsts (installed as .mtu in
 * ip6_dst_blackhole_ops).
 *
 * Returns the raw RTAX_MTU metric of @dst when it is non-zero, otherwise the
 * MTU of the device the dst is attached to.
 */
172 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
174 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
/* GNU "?:" shorthand: yields mtu when non-zero, else dst->dev->mtu. */
176 return mtu ? : dst->dev->mtu;
179 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
180 struct sk_buff *skb, u32 mtu)
184 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
189 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
195 static struct dst_ops ip6_dst_blackhole_ops = {
197 .protocol = cpu_to_be16(ETH_P_IPV6),
198 .destroy = ip6_dst_destroy,
199 .check = ip6_dst_check,
200 .mtu = ip6_blackhole_mtu,
201 .default_advmss = ip6_default_advmss,
202 .update_pmtu = ip6_rt_blackhole_update_pmtu,
203 .redirect = ip6_rt_blackhole_redirect,
204 .cow_metrics = ip6_rt_blackhole_cow_metrics,
205 .neigh_lookup = ip6_neigh_lookup,
208 static const u32 ip6_template_metrics[RTAX_MAX] = {
209 [RTAX_HOPLIMIT - 1] = 0,
212 static const struct rt6_info ip6_null_entry_template = {
214 .__refcnt = ATOMIC_INIT(1),
216 .obsolete = DST_OBSOLETE_FORCE_CHK,
217 .error = -ENETUNREACH,
218 .input = ip6_pkt_discard,
219 .output = ip6_pkt_discard_out,
221 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
222 .rt6i_protocol = RTPROT_KERNEL,
223 .rt6i_metric = ~(u32) 0,
224 .rt6i_ref = ATOMIC_INIT(1),
227 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
229 static int ip6_pkt_prohibit(struct sk_buff *skb);
230 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
232 static const struct rt6_info ip6_prohibit_entry_template = {
234 .__refcnt = ATOMIC_INIT(1),
236 .obsolete = DST_OBSOLETE_FORCE_CHK,
238 .input = ip6_pkt_prohibit,
239 .output = ip6_pkt_prohibit_out,
241 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
242 .rt6i_protocol = RTPROT_KERNEL,
243 .rt6i_metric = ~(u32) 0,
244 .rt6i_ref = ATOMIC_INIT(1),
247 static const struct rt6_info ip6_blk_hole_entry_template = {
249 .__refcnt = ATOMIC_INIT(1),
251 .obsolete = DST_OBSOLETE_FORCE_CHK,
253 .input = dst_discard,
254 .output = dst_discard,
256 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
257 .rt6i_protocol = RTPROT_KERNEL,
258 .rt6i_metric = ~(u32) 0,
259 .rt6i_ref = ATOMIC_INIT(1),
264 /* allocate dst with ip6_dst_ops */
265 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
266 struct net_device *dev,
268 struct fib6_table *table)
270 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
271 0, DST_OBSOLETE_FORCE_CHK, flags);
274 struct dst_entry *dst = &rt->dst;
276 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
277 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
278 rt->rt6i_genid = rt_genid(net);
279 INIT_LIST_HEAD(&rt->rt6i_siblings);
280 rt->rt6i_nsiblings = 0;
285 static void ip6_dst_destroy(struct dst_entry *dst)
287 struct rt6_info *rt = (struct rt6_info *)dst;
288 struct inet6_dev *idev = rt->rt6i_idev;
289 struct dst_entry *from = dst->from;
291 if (!(rt->dst.flags & DST_HOST))
292 dst_destroy_metrics_generic(dst);
295 rt->rt6i_idev = NULL;
302 if (rt6_has_peer(rt)) {
303 struct inet_peer *peer = rt6_peer_ptr(rt);
308 void rt6_bind_peer(struct rt6_info *rt, int create)
310 struct inet_peer_base *base;
311 struct inet_peer *peer;
313 base = inetpeer_base_ptr(rt->_rt6i_peer);
317 peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
319 if (!rt6_set_peer(rt, peer))
324 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
327 struct rt6_info *rt = (struct rt6_info *)dst;
328 struct inet6_dev *idev = rt->rt6i_idev;
329 struct net_device *loopback_dev =
330 dev_net(dev)->loopback_dev;
332 if (dev != loopback_dev) {
333 if (idev && idev->dev == dev) {
334 struct inet6_dev *loopback_idev =
335 in6_dev_get(loopback_dev);
337 rt->rt6i_idev = loopback_idev;
/*
 * Test whether a route has passed its expiry time.
 *
 * A route carrying RTF_EXPIRES is judged by comparing jiffies against its own
 * rt->dst.expires.  A route without that flag but with a dst.from pointer is
 * judged by recursing into the route it points to.
 * NOTE(review): presumably returns true for "expired" and false otherwise;
 * confirm against the return statements.
 */
344 static bool rt6_check_expired(const struct rt6_info *rt)
346 if (rt->rt6i_flags & RTF_EXPIRES) {
/* time_after() is the wrap-safe "jiffies is later than expires" test. */
347 if (time_after(jiffies, rt->dst.expires))
349 } else if (rt->dst.from) {
/* No expiry of its own: defer to the route referenced by dst.from. */
350 return rt6_check_expired((struct rt6_info *) rt->dst.from);
/*
 * Report whether @daddr is a multicast, link-local or loopback address.
 *
 * The input and output lookup paths in this file add RT6_LOOKUP_F_IFACE to
 * their lookup flags when this returns true.
 */
355 static bool rt6_need_strict(const struct in6_addr *daddr)
/* Non-zero (true) when any of the three address-type bits is set. */
357 return ipv6_addr_type(daddr) &
358 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
361 /* Multipath route selection:
362 * Hash based function using packet header and flowlabel.
363 * Adapted from fib_info_hashfn()
/*
 * @candidate_count: number of routes to choose between; used as the modulus,
 *                   so it must be non-zero (rt6_multipath_select() passes
 *                   rt6i_nsiblings + 1).
 * @fl6:             flow whose protocol, addresses, ports or ICMP type/code,
 *                   and flow label are mixed into the hash.
 *
 * Returns an index in the range [0, candidate_count).
 */
365 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
366 const struct flowi6 *fl6)
368 unsigned int val = fl6->flowi6_proto;
/* Mix in both endpoint addresses. */
370 val ^= ipv6_addr_hash(&fl6->daddr);
371 val ^= ipv6_addr_hash(&fl6->saddr);
373 /* Only meaningful when the packet is not encapsulated */
374 switch (fl6->flowi6_proto) {
/* Port-carrying protocols contribute their source and destination ports. */
378 val ^= (__force u16)fl6->fl6_sport;
379 val ^= (__force u16)fl6->fl6_dport;
/* ICMP-style flows contribute type and code instead of ports. */
383 val ^= (__force u16)fl6->fl6_icmp_type;
384 val ^= (__force u16)fl6->fl6_icmp_code;
387 /* RFC 6438 recommends using the flow label */
388 val ^= (__force u32)fl6->flowlabel;
390 /* Perhaps we need to tune this function? */
/* Fold higher bits down before reducing modulo the candidate count. */
391 val = val ^ (val >> 7) ^ (val >> 12);
392 return val % candidate_count;
395 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
398 struct rt6_info *sibling, *next_sibling;
401 route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
402 /* Don't change the route if route_choosen == 0
403 * (the sibling list does not include this route itself)
406 list_for_each_entry_safe(sibling, next_sibling,
407 &match->rt6i_siblings, rt6i_siblings) {
409 if (route_choosen == 0) {
418 * Route lookup. Any table->tb6_lock is implied.
421 static inline struct rt6_info *rt6_device_match(struct net *net,
423 const struct in6_addr *saddr,
427 struct rt6_info *local = NULL;
428 struct rt6_info *sprt;
430 if (!oif && ipv6_addr_any(saddr))
433 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
434 struct net_device *dev = sprt->dst.dev;
437 if (dev->ifindex == oif)
439 if (dev->flags & IFF_LOOPBACK) {
440 if (!sprt->rt6i_idev ||
441 sprt->rt6i_idev->dev->ifindex != oif) {
442 if (flags & RT6_LOOKUP_F_IFACE && oif)
444 if (local && (!oif ||
445 local->rt6i_idev->dev->ifindex == oif))
451 if (ipv6_chk_addr(net, saddr, dev,
452 flags & RT6_LOOKUP_F_IFACE))
461 if (flags & RT6_LOOKUP_F_IFACE)
462 return net->ipv6.ip6_null_entry;
468 #ifdef CONFIG_IPV6_ROUTER_PREF
469 static void rt6_probe(struct rt6_info *rt)
471 struct neighbour *neigh;
473 * Okay, this does not seem to be appropriate
474 * for now, however, we need to check if it
475 * is really so; aka Router Reachability Probing.
477 * Router Reachability Probe MUST be rate-limited
478 * to no more than one per minute.
480 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
483 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
485 write_lock(&neigh->lock);
486 if (neigh->nud_state & NUD_VALID)
491 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
492 struct in6_addr mcaddr;
493 struct in6_addr *target;
496 neigh->updated = jiffies;
497 write_unlock(&neigh->lock);
500 target = (struct in6_addr *)&rt->rt6i_gateway;
501 addrconf_addr_solict_mult(target, &mcaddr);
502 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
505 write_unlock(&neigh->lock);
507 rcu_read_unlock_bh();
510 static inline void rt6_probe(struct rt6_info *rt)
516 * Default Router Selection (RFC 2461 6.3.6)
518 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
520 struct net_device *dev = rt->dst.dev;
521 if (!oif || dev->ifindex == oif)
523 if ((dev->flags & IFF_LOOPBACK) &&
524 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
529 static inline bool rt6_check_neigh(struct rt6_info *rt)
531 struct neighbour *neigh;
534 if (rt->rt6i_flags & RTF_NONEXTHOP ||
535 !(rt->rt6i_flags & RTF_GATEWAY))
539 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
541 read_lock(&neigh->lock);
542 if (neigh->nud_state & NUD_VALID)
544 #ifdef CONFIG_IPV6_ROUTER_PREF
545 else if (!(neigh->nud_state & NUD_FAILED))
548 read_unlock(&neigh->lock);
550 rcu_read_unlock_bh();
555 static int rt6_score_route(struct rt6_info *rt, int oif,
560 m = rt6_check_dev(rt, oif);
561 if (!m && (strict & RT6_LOOKUP_F_IFACE))
563 #ifdef CONFIG_IPV6_ROUTER_PREF
564 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
566 if (!rt6_check_neigh(rt) && (strict & RT6_LOOKUP_F_REACHABLE))
571 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
572 int *mpri, struct rt6_info *match)
576 if (rt6_check_expired(rt))
579 m = rt6_score_route(rt, oif, strict);
584 if (strict & RT6_LOOKUP_F_REACHABLE)
588 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
596 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
597 struct rt6_info *rr_head,
598 u32 metric, int oif, int strict)
600 struct rt6_info *rt, *match;
604 for (rt = rr_head; rt && rt->rt6i_metric == metric;
605 rt = rt->dst.rt6_next)
606 match = find_match(rt, oif, strict, &mpri, match);
607 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
608 rt = rt->dst.rt6_next)
609 match = find_match(rt, oif, strict, &mpri, match);
614 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
616 struct rt6_info *match, *rt0;
621 fn->rr_ptr = rt0 = fn->leaf;
623 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
626 (strict & RT6_LOOKUP_F_REACHABLE)) {
627 struct rt6_info *next = rt0->dst.rt6_next;
629 /* no entries matched; do round-robin */
630 if (!next || next->rt6i_metric != rt0->rt6i_metric)
637 net = dev_net(rt0->dst.dev);
638 return match ? match : net->ipv6.ip6_null_entry;
641 #ifdef CONFIG_IPV6_ROUTE_INFO
642 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
643 const struct in6_addr *gwaddr)
645 struct route_info *rinfo = (struct route_info *) opt;
646 struct in6_addr prefix_buf, *prefix;
648 unsigned long lifetime;
651 if (len < sizeof(struct route_info)) {
655 /* Sanity check for prefix_len and length */
656 if (rinfo->length > 3) {
658 } else if (rinfo->prefix_len > 128) {
660 } else if (rinfo->prefix_len > 64) {
661 if (rinfo->length < 2) {
664 } else if (rinfo->prefix_len > 0) {
665 if (rinfo->length < 1) {
670 pref = rinfo->route_pref;
671 if (pref == ICMPV6_ROUTER_PREF_INVALID)
674 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
676 if (rinfo->length == 3)
677 prefix = (struct in6_addr *)rinfo->prefix;
679 /* this function is safe */
680 ipv6_addr_prefix(&prefix_buf,
681 (struct in6_addr *)rinfo->prefix,
683 prefix = &prefix_buf;
686 rt = rt6_get_route_info(dev, prefix, rinfo->prefix_len, gwaddr);
688 if (rt && !lifetime) {
694 rt = rt6_add_route_info(dev, prefix, rinfo->prefix_len, gwaddr, pref);
696 rt->rt6i_flags = RTF_ROUTEINFO |
697 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
700 if (!addrconf_finite_timeout(lifetime))
701 rt6_clean_expires(rt);
703 rt6_set_expires(rt, jiffies + HZ * lifetime);
711 #define BACKTRACK(__net, saddr) \
713 if (rt == __net->ipv6.ip6_null_entry) { \
714 struct fib6_node *pn; \
716 if (fn->fn_flags & RTN_TL_ROOT) \
719 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
720 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
723 if (fn->fn_flags & RTN_RTINFO) \
729 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
730 struct fib6_table *table,
731 struct flowi6 *fl6, int flags)
733 struct fib6_node *fn;
736 read_lock_bh(&table->tb6_lock);
737 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
740 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
741 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
742 rt = rt6_multipath_select(rt, fl6);
743 BACKTRACK(net, &fl6->saddr);
745 dst_use(&rt->dst, jiffies);
746 read_unlock_bh(&table->tb6_lock);
/*
 * Exported route lookup: hands the flow and flags to fib6_rule_lookup() with
 * ip6_pol_route_lookup() as the lookup function, and returns the dst entry
 * that call produces.
 */
751 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
754 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
756 EXPORT_SYMBOL_GPL(ip6_route_lookup);
758 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
759 const struct in6_addr *saddr, int oif, int strict)
761 struct flowi6 fl6 = {
765 struct dst_entry *dst;
766 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
769 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
770 flags |= RT6_LOOKUP_F_HAS_SADDR;
773 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
775 return (struct rt6_info *) dst;
782 EXPORT_SYMBOL(rt6_lookup);
784 /* ip6_ins_rt is called with table->tb6_lock NOT held.
785 It takes a new route entry; if the addition fails for any reason, the
786 route is freed. In any case, if the caller does not hold it, it may
/*
 * Insert @rt into the FIB table recorded in rt->rt6i_table.
 *
 * @rt:   route to add; its rt6i_table must already be set.
 * @info: netlink context passed through to fib6_add().
 *
 * The table's tb6_lock is taken for writing (with bottom halves disabled)
 * around the fib6_add() call, so the caller must not already hold it.
 * NOTE(review): presumably returns the fib6_add() result stored in err;
 * confirm.
 */
790 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
793 struct fib6_table *table;
795 table = rt->rt6i_table;
796 write_lock_bh(&table->tb6_lock);
797 err = fib6_add(&table->tb6_root, rt, info);
798 write_unlock_bh(&table->tb6_lock);
/*
 * Insert @rt into its FIB table on behalf of an in-kernel caller.
 *
 * Builds an nl_info whose nl_net is the namespace of the route's device and
 * delegates to __ip6_ins_rt(), returning its result.
 */
803 int ip6_ins_rt(struct rt6_info *rt)
805 struct nl_info info = {
806 .nl_net = dev_net(rt->dst.dev),
808 return __ip6_ins_rt(rt, &info);
811 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
812 const struct in6_addr *daddr,
813 const struct in6_addr *saddr)
821 rt = ip6_rt_copy(ort, daddr);
824 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
825 if (ort->rt6i_dst.plen != 128 &&
826 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
827 rt->rt6i_flags |= RTF_ANYCAST;
828 rt->rt6i_gateway = *daddr;
831 rt->rt6i_flags |= RTF_CACHE;
833 #ifdef CONFIG_IPV6_SUBTREES
834 if (rt->rt6i_src.plen && saddr) {
835 rt->rt6i_src.addr = *saddr;
836 rt->rt6i_src.plen = 128;
/*
 * Create a cache clone of @ort for destination @daddr.
 *
 * The copy is made with ip6_rt_copy() and then tagged with RTF_CACHE.
 * NOTE(review): presumably the flag is only set when the copy succeeded and
 * the (possibly NULL) copy is returned; confirm.
 */
844 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
845 const struct in6_addr *daddr)
847 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
850 rt->rt6i_flags |= RTF_CACHE;
854 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
855 struct flowi6 *fl6, int flags)
857 struct fib6_node *fn;
858 struct rt6_info *rt, *nrt;
862 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
864 strict |= flags & RT6_LOOKUP_F_IFACE;
867 read_lock_bh(&table->tb6_lock);
870 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
873 rt = rt6_select(fn, oif, strict | reachable);
874 if (rt->rt6i_nsiblings && oif == 0)
875 rt = rt6_multipath_select(rt, fl6);
876 BACKTRACK(net, &fl6->saddr);
877 if (rt == net->ipv6.ip6_null_entry ||
878 rt->rt6i_flags & RTF_CACHE)
882 read_unlock_bh(&table->tb6_lock);
884 if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
885 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
886 else if (!(rt->dst.flags & DST_HOST))
887 nrt = rt6_alloc_clone(rt, &fl6->daddr);
892 rt = nrt ? : net->ipv6.ip6_null_entry;
896 err = ip6_ins_rt(nrt);
905 * Race condition! In the gap, when table->tb6_lock was
906 * released someone could insert this route. Relookup.
917 read_unlock_bh(&table->tb6_lock);
919 rt->dst.lastuse = jiffies;
/*
 * Input-path adapter for ip6_pol_route(): the flow's incoming interface index
 * (fl6->flowi6_iif) is supplied as the interface argument of the lookup.
 */
925 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
926 struct flowi6 *fl6, int flags)
928 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/*
 * Route lookup for a received flow.
 *
 * @dev:   device the packet arrived on.
 * @fl6:   flow description used for the lookup.
 * @flags: RT6_LOOKUP_F_* flags from the caller.
 *
 * RT6_LOOKUP_F_IFACE is added when the destination satisfies
 * rt6_need_strict() and the device type is not ARPHRD_PIMREG.  The lookup
 * itself is fib6_rule_lookup() with ip6_pol_route_input() as the lookup
 * function.
 */
931 static struct dst_entry *ip6_route_input_lookup(struct net *net,
932 struct net_device *dev,
933 struct flowi6 *fl6, int flags)
935 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
936 flags |= RT6_LOOKUP_F_IFACE;
938 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/*
 * Resolve the route for a received packet and attach it to the skb.
 *
 * A flow key is built from the packet: incoming interface index, flow info
 * from the IPv6 header, the skb mark and the header's next-header value.
 * The lookup runs in the namespace of skb->dev with RT6_LOOKUP_F_HAS_SADDR
 * set, and the resulting dst is stored with skb_dst_set().
 */
941 void ip6_route_input(struct sk_buff *skb)
943 const struct ipv6hdr *iph = ipv6_hdr(skb);
944 struct net *net = dev_net(skb->dev);
945 int flags = RT6_LOOKUP_F_HAS_SADDR;
946 struct flowi6 fl6 = {
947 .flowi6_iif = skb->dev->ifindex,
950 .flowlabel = ip6_flowinfo(iph),
951 .flowi6_mark = skb->mark,
952 .flowi6_proto = iph->nexthdr,
955 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/*
 * Output-path adapter for ip6_pol_route(): the flow's outgoing interface
 * index (fl6->flowi6_oif) is supplied as the interface argument of the lookup.
 */
958 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
959 struct flowi6 *fl6, int flags)
961 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
964 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
969 fl6->flowi6_iif = LOOPBACK_IFINDEX;
971 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
972 flags |= RT6_LOOKUP_F_IFACE;
974 if (!ipv6_addr_any(&fl6->saddr))
975 flags |= RT6_LOOKUP_F_HAS_SADDR;
977 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
979 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
982 EXPORT_SYMBOL(ip6_route_output);
984 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
986 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
987 struct dst_entry *new = NULL;
989 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
993 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
994 rt6_init_peer(rt, net->ipv6.peers);
997 new->input = dst_discard;
998 new->output = dst_discard;
1000 if (dst_metrics_read_only(&ort->dst))
1001 new->_metrics = ort->dst._metrics;
1003 dst_copy_metrics(new, &ort->dst);
1004 rt->rt6i_idev = ort->rt6i_idev;
1006 in6_dev_hold(rt->rt6i_idev);
1008 rt->rt6i_gateway = ort->rt6i_gateway;
1009 rt->rt6i_flags = ort->rt6i_flags;
1010 rt->rt6i_metric = 0;
1012 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1013 #ifdef CONFIG_IPV6_SUBTREES
1014 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1020 dst_release(dst_orig);
1021 return new ? new : ERR_PTR(-ENOMEM);
1025 * Destination cache support functions
1028 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1030 struct rt6_info *rt;
1032 rt = (struct rt6_info *) dst;
1034 /* All IPV6 dsts are created with ->obsolete set to the value
1035 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1036 * into this function always.
1038 if (rt->rt6i_genid != rt_genid(dev_net(rt->dst.dev)))
1041 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
1047 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1049 struct rt6_info *rt = (struct rt6_info *) dst;
1052 if (rt->rt6i_flags & RTF_CACHE) {
1053 if (rt6_check_expired(rt)) {
1065 static void ip6_link_failure(struct sk_buff *skb)
1067 struct rt6_info *rt;
1069 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1071 rt = (struct rt6_info *) skb_dst(skb);
1073 if (rt->rt6i_flags & RTF_CACHE)
1074 rt6_update_expires(rt, 0);
1075 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1076 rt->rt6i_node->fn_sernum = -1;
1080 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1081 struct sk_buff *skb, u32 mtu)
1083 struct rt6_info *rt6 = (struct rt6_info*)dst;
1086 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1087 struct net *net = dev_net(dst->dev);
1089 rt6->rt6i_flags |= RTF_MODIFIED;
1090 if (mtu < IPV6_MIN_MTU) {
1091 u32 features = dst_metric(dst, RTAX_FEATURES);
1093 features |= RTAX_FEATURE_ALLFRAG;
1094 dst_metric_set(dst, RTAX_FEATURES, features);
1096 dst_metric_set(dst, RTAX_MTU, mtu);
1097 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
/*
 * Apply a path-MTU update to the route used by the flow described in @skb.
 *
 * @skb:  buffer whose data pointer is interpreted as an IPv6 header; its
 *        addresses and flow info define the flow.
 * @net:  namespace in which the route is looked up.
 * @mtu:  new path MTU in network byte order (converted with ntohl()).
 * @oif:  outgoing interface index for the lookup.
 * @mark: flow mark; when zero, IP6_REPLY_MARK(net, skb->mark) is used.
 * @uid:  user id recorded in the flow key.
 */
1101 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1102 int oif, u32 mark, kuid_t uid)
1104 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1105 struct dst_entry *dst;
/* Start from an all-zero flow key and fill in only the fields needed. */
1108 memset(&fl6, 0, sizeof(fl6));
1109 fl6.flowi6_oif = oif;
1110 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1111 fl6.flowi6_flags = 0;
1112 fl6.daddr = iph->daddr;
1113 fl6.saddr = iph->saddr;
1114 fl6.flowlabel = ip6_flowinfo(iph);
1115 fl6.flowi6_uid = uid;
/* Output lookup without a socket, then update the PMTU on the result. */
1117 dst = ip6_route_output(net, NULL, &fl6);
1119 ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
1122 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/*
 * Socket flavour of ip6_update_pmtu(): the namespace, bound device index,
 * mark and uid are all taken from @sk.
 */
1124 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1126 ip6_update_pmtu(skb, sock_net(sk), mtu,
1127 sk->sk_bound_dev_if, sk->sk_mark, sock_i_uid(sk));
1129 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/*
 * Process a redirect for the flow described in @skb.
 *
 * @skb:  buffer whose data pointer is interpreted as an IPv6 header; its
 *        addresses and flow info define the flow.
 * @net:  namespace in which the route is looked up.
 * @oif:  outgoing interface index for the lookup.
 * @mark: flow mark, used as given.
 */
1131 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1133 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1134 struct dst_entry *dst;
/* Start from an all-zero flow key and fill in only the fields needed. */
1137 memset(&fl6, 0, sizeof(fl6));
1138 fl6.flowi6_oif = oif;
1139 fl6.flowi6_mark = mark;
1140 fl6.flowi6_flags = 0;
1141 fl6.daddr = iph->daddr;
1142 fl6.saddr = iph->saddr;
1143 fl6.flowlabel = ip6_flowinfo(iph);
/* Output lookup without a socket, then apply the redirect to the result. */
1145 dst = ip6_route_output(net, NULL, &fl6);
1147 rt6_do_redirect(dst, NULL, skb);
1150 EXPORT_SYMBOL_GPL(ip6_redirect);
/*
 * Socket flavour of ip6_redirect(): the namespace, bound device index and
 * mark are all taken from @sk.
 */
1152 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1154 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1156 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/*
 * Default advertised MSS for @dst (installed as .default_advmss in the dst_ops
 * tables of this file).
 *
 * Starts from dst_mtu(dst) less the IPv6 and TCP header sizes, and raises the
 * value to the ip6_rt_min_advmss sysctl of the device's namespace when it
 * falls below that minimum.  Values above IPV6_MAXPLEN - sizeof(struct tcphdr)
 * are handled specially, as the comment on that check explains.
 */
1158 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1160 struct net_device *dev = dst->dev;
1161 unsigned int mtu = dst_mtu(dst);
1162 struct net *net = dev_net(dev);
/* Remove the fixed IPv6 and TCP headers to get the payload size. */
1164 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
/* Enforce the per-namespace lower bound. */
1166 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1167 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1170 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1171 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1172 * IPV6_MAXPLEN is also valid and means: "any MSS,
1173 * rely only on pmtu discovery"
1175 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1180 static unsigned int ip6_mtu(const struct dst_entry *dst)
1182 struct inet6_dev *idev;
1183 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1191 idev = __in6_dev_get(dst->dev);
1193 mtu = idev->cnf.mtu6;
1199 static struct dst_entry *icmp6_dst_gc_list;
1200 static DEFINE_SPINLOCK(icmp6_dst_lock);
1202 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1205 struct dst_entry *dst;
1206 struct rt6_info *rt;
1207 struct inet6_dev *idev = in6_dev_get(dev);
1208 struct net *net = dev_net(dev);
1210 if (unlikely(!idev))
1211 return ERR_PTR(-ENODEV);
1213 rt = ip6_dst_alloc(net, dev, 0, NULL);
1214 if (unlikely(!rt)) {
1216 dst = ERR_PTR(-ENOMEM);
1220 rt->dst.flags |= DST_HOST;
1221 rt->dst.output = ip6_output;
1222 atomic_set(&rt->dst.__refcnt, 1);
1223 rt->rt6i_dst.addr = fl6->daddr;
1224 rt->rt6i_dst.plen = 128;
1225 rt->rt6i_idev = idev;
1226 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1228 spin_lock_bh(&icmp6_dst_lock);
1229 rt->dst.next = icmp6_dst_gc_list;
1230 icmp6_dst_gc_list = &rt->dst;
1231 spin_unlock_bh(&icmp6_dst_lock);
1233 fib6_force_start_gc(net);
1235 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1241 int icmp6_dst_gc(void)
1243 struct dst_entry *dst, **pprev;
1246 spin_lock_bh(&icmp6_dst_lock);
1247 pprev = &icmp6_dst_gc_list;
1249 while ((dst = *pprev) != NULL) {
1250 if (!atomic_read(&dst->__refcnt)) {
1259 spin_unlock_bh(&icmp6_dst_lock);
1264 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1267 struct dst_entry *dst, **pprev;
1269 spin_lock_bh(&icmp6_dst_lock);
1270 pprev = &icmp6_dst_gc_list;
1271 while ((dst = *pprev) != NULL) {
1272 struct rt6_info *rt = (struct rt6_info *) dst;
1273 if (func(rt, arg)) {
1280 spin_unlock_bh(&icmp6_dst_lock);
/*
 * Garbage-collection hook for the per-namespace IPv6 dst_ops.
 *
 * @ops: the dst_ops embedded in struct net as ipv6.ip6_dst_ops; the owning
 *       namespace is recovered from it with container_of().
 *
 * Returns non-zero when the most recently sampled entry count exceeds the
 * ip6_rt_max_size sysctl.
 */
1283 static int ip6_dst_gc(struct dst_ops *ops)
1285 unsigned long now = jiffies;
1286 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
/* Snapshot the tunables and the time of the previous run. */
1287 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1288 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1289 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1290 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1291 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1294 entries = dst_entries_get_fast(ops);
/*
 * True when the previous run was less than rt_min_interval ago and the
 * entry count is within rt_max_size.
 * NOTE(review): presumably this skips the collection below; confirm.
 */
1295 if (time_after(rt_last_gc + rt_min_interval, now) &&
1296 entries <= rt_max_size)
/* Bump the expiry argument, run the FIB GC with it, and record the time. */
1299 net->ipv6.ip6_rt_gc_expire++;
1300 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1301 net->ipv6.ip6_rt_last_gc = now;
1302 entries = dst_entries_get_slow(ops);
/* Below the GC threshold: reset the expiry value to half the timeout. */
1303 if (entries < ops->gc_thresh)
1304 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
/* Decay the expiry value by a 1/2^rt_elasticity fraction of itself. */
1306 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1307 return entries > rt_max_size;
/*
 * Hop limit to use for packets sent over @dst.
 *
 * The raw RTAX_HOPLIMIT metric is used when it is non-zero.  A zero metric
 * falls back to configuration: either the hop_limit of the device's
 * inet6_dev or the namespace-wide devconf_all->hop_limit.
 * NOTE(review): presumably the per-device value is chosen when the device has
 * an inet6_dev and the namespace default otherwise; confirm.
 */
1310 int ip6_dst_hoplimit(struct dst_entry *dst)
1312 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1313 if (hoplimit == 0) {
1314 struct net_device *dev = dst->dev;
1315 struct inet6_dev *idev;
1318 idev = __in6_dev_get(dev);
1320 hoplimit = idev->cnf.hop_limit;
1322 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1327 EXPORT_SYMBOL(ip6_dst_hoplimit);
1333 int ip6_route_add(struct fib6_config *cfg)
1336 struct net *net = cfg->fc_nlinfo.nl_net;
1337 struct rt6_info *rt = NULL;
1338 struct net_device *dev = NULL;
1339 struct inet6_dev *idev = NULL;
1340 struct fib6_table *table;
1343 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1345 #ifndef CONFIG_IPV6_SUBTREES
1346 if (cfg->fc_src_len)
1349 if (cfg->fc_ifindex) {
1351 dev = dev_get_by_index(net, cfg->fc_ifindex);
1354 idev = in6_dev_get(dev);
1359 if (cfg->fc_metric == 0)
1360 cfg->fc_metric = IP6_RT_PRIO_USER;
1363 if (cfg->fc_nlinfo.nlh &&
1364 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1365 table = fib6_get_table(net, cfg->fc_table);
1367 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1368 table = fib6_new_table(net, cfg->fc_table);
1371 table = fib6_new_table(net, cfg->fc_table);
1377 rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1384 if (cfg->fc_flags & RTF_EXPIRES)
1385 rt6_set_expires(rt, jiffies +
1386 clock_t_to_jiffies(cfg->fc_expires));
1388 rt6_clean_expires(rt);
1390 if (cfg->fc_protocol == RTPROT_UNSPEC)
1391 cfg->fc_protocol = RTPROT_BOOT;
1392 rt->rt6i_protocol = cfg->fc_protocol;
1394 addr_type = ipv6_addr_type(&cfg->fc_dst);
1396 if (addr_type & IPV6_ADDR_MULTICAST)
1397 rt->dst.input = ip6_mc_input;
1398 else if (cfg->fc_flags & RTF_LOCAL)
1399 rt->dst.input = ip6_input;
1401 rt->dst.input = ip6_forward;
1403 rt->dst.output = ip6_output;
1405 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1406 rt->rt6i_dst.plen = cfg->fc_dst_len;
1407 if (rt->rt6i_dst.plen == 128)
1408 rt->dst.flags |= DST_HOST;
1410 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1411 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1416 dst_init_metrics(&rt->dst, metrics, 0);
1418 #ifdef CONFIG_IPV6_SUBTREES
1419 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1420 rt->rt6i_src.plen = cfg->fc_src_len;
1423 rt->rt6i_metric = cfg->fc_metric;
1425 /* We cannot add true routes via loopback here,
1426 they would result in kernel looping; promote them to reject routes
1428 if ((cfg->fc_flags & RTF_REJECT) ||
1429 (dev && (dev->flags & IFF_LOOPBACK) &&
1430 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1431 !(cfg->fc_flags & RTF_LOCAL))) {
1432 /* hold loopback dev/idev if we haven't done so. */
1433 if (dev != net->loopback_dev) {
1438 dev = net->loopback_dev;
1440 idev = in6_dev_get(dev);
1446 rt->dst.output = ip6_pkt_discard_out;
1447 rt->dst.input = ip6_pkt_discard;
1448 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1449 switch (cfg->fc_type) {
1451 rt->dst.error = -EINVAL;
1454 rt->dst.error = -EACCES;
1457 rt->dst.error = -EAGAIN;
1460 rt->dst.error = -ENETUNREACH;
1466 if (cfg->fc_flags & RTF_GATEWAY) {
1467 const struct in6_addr *gw_addr;
1470 gw_addr = &cfg->fc_gateway;
1471 rt->rt6i_gateway = *gw_addr;
1472 gwa_type = ipv6_addr_type(gw_addr);
1474 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1475 struct rt6_info *grt;
1477 /* IPv6 strictly inhibits the use of non-link-local
1478 addresses as the nexthop address.
1479 Otherwise, the router will not be able to send redirects.
1480 It is very good, but in some (rare!) circumstances
1481 (SIT, PtP, NBMA NOARP links) it is handy to allow
1482 some exceptions. --ANK
1485 if (!(gwa_type & IPV6_ADDR_UNICAST))
1488 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1490 err = -EHOSTUNREACH;
1494 if (dev != grt->dst.dev) {
1500 idev = grt->rt6i_idev;
1502 in6_dev_hold(grt->rt6i_idev);
1504 if (!(grt->rt6i_flags & RTF_GATEWAY))
1512 if (!dev || (dev->flags & IFF_LOOPBACK))
1520 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1521 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1525 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1526 rt->rt6i_prefsrc.plen = 128;
1528 rt->rt6i_prefsrc.plen = 0;
1530 rt->rt6i_flags = cfg->fc_flags;
1537 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1538 int type = nla_type(nla);
1541 if (type > RTAX_MAX) {
1546 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1552 rt->rt6i_idev = idev;
1553 rt->rt6i_table = table;
1555 cfg->fc_nlinfo.nl_net = dev_net(dev);
1557 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * Delete @rt from the FIB table recorded in rt->rt6i_table.
 *
 * The namespace is derived from rt->dst.dev and the route is first compared
 * against that namespace's ip6_null_entry.  fib6_del() itself runs with the
 * table's tb6_lock held for writing (bottom halves disabled); @info is
 * handed to fib6_del() unchanged.
 */
1569 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1572 struct fib6_table *table;
1573 struct net *net = dev_net(rt->dst.dev);
1575 if (rt == net->ipv6.ip6_null_entry) {
1580 table = rt->rt6i_table;
1581 write_lock_bh(&table->tb6_lock);
1582 err = fib6_del(rt, info);
1583 write_unlock_bh(&table->tb6_lock);
/*
 * Delete @rt using an nl_info whose nl_net is the namespace of rt->dst.dev.
 * Returns whatever __ip6_del_rt() returns.
 */
1590 int ip6_del_rt(struct rt6_info *rt)
1592 struct nl_info info = {
1593 .nl_net = dev_net(rt->dst.dev),
1595 return __ip6_del_rt(rt, &info);
/*
 * Delete the route described by @cfg.
 *
 * The table is looked up from cfg->fc_table in cfg->fc_nlinfo.nl_net and the
 * node for the destination/source prefixes is located under the table's read
 * lock.  Entries on the node's leaf chain are filtered on:
 *   - cfg->fc_ifindex, when non-zero, against the route device's ifindex;
 *   - cfg->fc_gateway, when RTF_GATEWAY is set in cfg->fc_flags;
 *   - cfg->fc_metric, when non-zero, against rt6i_metric.
 * The read lock is released before __ip6_del_rt() is called; that function
 * takes the route table's tb6_lock for writing on its own.
 */
1598 static int ip6_route_del(struct fib6_config *cfg)
1600 struct fib6_table *table;
1601 struct fib6_node *fn;
1602 struct rt6_info *rt;
1605 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1609 read_lock_bh(&table->tb6_lock);
1611 fn = fib6_locate(&table->tb6_root,
1612 &cfg->fc_dst, cfg->fc_dst_len,
1613 &cfg->fc_src, cfg->fc_src_len);
1616 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1617 if (cfg->fc_ifindex &&
1619 rt->dst.dev->ifindex != cfg->fc_ifindex))
1621 if (cfg->fc_flags & RTF_GATEWAY &&
1622 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1624 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1627 read_unlock_bh(&table->tb6_lock);
1629 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1632 read_unlock_bh(&table->tb6_lock);
/*
 * Process an ICMPv6 Redirect received in @skb for the route @dst.
 *
 * The option length is computed from skb->tail and the transport header,
 * minus the fixed rd_msg size.  The following conditions are diagnosed with
 * rate-limited debug messages: packet too short, multicast destination,
 * target that is neither equal to the destination nor a link-local unicast
 * address, invalid ND options, invalid link-layer address length, and @dst
 * being the namespace's ip6_null_entry.  The receiving device's forwarding
 * and accept_redirects settings are also consulted.
 *
 * On the accepting path the route is confirmed with dst_confirm(), the
 * neighbour entry for the target is obtained via __neigh_lookup() and
 * updated to NUD_STALE, and a copy of the route for msg->dest is built with
 * ip6_rt_copy(), flagged RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE (with
 * RTF_GATEWAY cleared again on one path, presumably the on-link case --
 * confirm), pointed at the neighbour's primary key and inserted with
 * ip6_ins_rt().  A NETEVENT_REDIRECT notification carrying the old and new
 * dst, the destination and the neighbour is then raised, and the neighbour
 * reference is released.
 */
1637 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1639 struct net *net = dev_net(skb->dev);
1640 struct netevent_redirect netevent;
1641 struct rt6_info *rt, *nrt = NULL;
1642 struct ndisc_options ndopts;
1643 struct inet6_dev *in6_dev;
1644 struct neighbour *neigh;
1646 int optlen, on_link;
1649 optlen = skb->tail - skb->transport_header;
1650 optlen -= sizeof(*msg);
1653 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
1657 msg = (struct rd_msg *)icmp6_hdr(skb);
1659 if (ipv6_addr_is_multicast(&msg->dest)) {
1660 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
1665 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
1667 } else if (ipv6_addr_type(&msg->target) !=
1668 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
1669 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
1673 in6_dev = __in6_dev_get(skb->dev);
1676 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
1680 * The IP source address of the Redirect MUST be the same as the current
1681 * first-hop router for the specified ICMP Destination Address.
1684 if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
1685 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
1690 if (ndopts.nd_opts_tgt_lladdr) {
1691 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
1694 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
1699 rt = (struct rt6_info *) dst;
1700 if (rt == net->ipv6.ip6_null_entry) {
1701 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1705 /* Redirect received -> path was valid.
1706 * Look, redirects are sent only in response to data packets,
1707 * so that this nexthop apparently is reachable. --ANK
1709 dst_confirm(&rt->dst);
1711 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
1716 * We have finally decided to accept it.
1719 neigh_update(neigh, lladdr, NUD_STALE,
1720 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1721 NEIGH_UPDATE_F_OVERRIDE|
1722 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1723 NEIGH_UPDATE_F_ISROUTER))
1726 nrt = ip6_rt_copy(rt, &msg->dest);
1730 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1732 nrt->rt6i_flags &= ~RTF_GATEWAY;
1734 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1736 if (ip6_ins_rt(nrt))
1739 netevent.old = &rt->dst;
1740 netevent.new = &nrt->dst;
1741 netevent.daddr = &msg->dest;
1742 netevent.neigh = neigh;
1743 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1745 if (rt->rt6i_flags & RTF_CACHE) {
1746 rt = (struct rt6_info *) dst_clone(&rt->dst);
1751 neigh_release(neigh);
1755 * Misc support functions
/*
 * Build a host-route copy of @ort for destination @dest.
 *
 * A new rt6_info is allocated on ort's device and namespace.  Copied from
 * @ort: the dst input/output handlers, metrics, dst.error, rt6i_idev (a
 * reference is taken with in6_dev_hold()), gateway, flags, preferred source
 * and table; with CONFIG_IPV6_SUBTREES the source key is copied as well.
 * Set fresh on the copy: DST_HOST, destination key = *dest with prefix
 * length 128, lastuse = jiffies and metric 0.  rt6_set_from() is applied
 * only when @ort carries both RTF_DEFAULT and RTF_ADDRCONF.
 */
1758 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1759 const struct in6_addr *dest)
1761 struct net *net = dev_net(ort->dst.dev);
1762 struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1766 rt->dst.input = ort->dst.input;
1767 rt->dst.output = ort->dst.output;
1768 rt->dst.flags |= DST_HOST;
1770 rt->rt6i_dst.addr = *dest;
1771 rt->rt6i_dst.plen = 128;
1772 dst_copy_metrics(&rt->dst, &ort->dst);
1773 rt->dst.error = ort->dst.error;
1774 rt->rt6i_idev = ort->rt6i_idev;
1776 in6_dev_hold(rt->rt6i_idev);
1777 rt->dst.lastuse = jiffies;
1779 rt->rt6i_gateway = ort->rt6i_gateway;
1780 rt->rt6i_flags = ort->rt6i_flags;
1781 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1782 (RTF_DEFAULT | RTF_ADDRCONF))
1783 rt6_set_from(rt, ort);
1784 rt->rt6i_metric = 0;
1786 #ifdef CONFIG_IPV6_SUBTREES
1787 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1789 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1790 rt->rt6i_table = ort->rt6i_table;
1795 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Look up a route-information route for @prefix/@prefixlen on @dev.
 *
 * The search runs in the table selected by
 * addrconf_rt_table(dev, RT6_TABLE_INFO), under that table's read lock.
 * Entries on the located node's leaf chain are filtered on: same ifindex as
 * @dev, both RTF_ROUTEINFO and RTF_GATEWAY set, and gateway equal to
 * @gwaddr.
 */
1796 static struct rt6_info *rt6_get_route_info(struct net_device *dev,
1797 const struct in6_addr *prefix, int prefixlen,
1798 const struct in6_addr *gwaddr)
1800 struct fib6_node *fn;
1801 struct rt6_info *rt = NULL;
1802 struct fib6_table *table;
1804 table = fib6_get_table(dev_net(dev),
1805 addrconf_rt_table(dev, RT6_TABLE_INFO));
1809 read_lock_bh(&table->tb6_lock);
1810 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1814 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1815 if (rt->dst.dev->ifindex != dev->ifindex)
1817 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1819 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1825 read_unlock_bh(&table->tb6_lock);
/*
 * Install a route-information route for @prefix/@prefixlen via @gwaddr on
 * @dev and return the entry found by a subsequent rt6_get_route_info().
 *
 * The route is configured with metric IP6_RT_PRIO_USER, the table chosen by
 * addrconf_rt_table(dev, RT6_TABLE_INFO) and flags
 * RTF_GATEWAY|RTF_ADDRCONF|RTF_ROUTEINFO|RTF_UP plus RTF_PREF(@pref);
 * RTF_DEFAULT is added for the zero-length-prefix case described below.
 *
 * The return value of ip6_route_add() is not examined here, so the caller
 * only learns about a failure through the lookup result.
 */
1829 static struct rt6_info *rt6_add_route_info(struct net_device *dev,
1830 const struct in6_addr *prefix, int prefixlen,
1831 const struct in6_addr *gwaddr, unsigned int pref)
1833 struct fib6_config cfg = {
1834 .fc_table = addrconf_rt_table(dev, RT6_TABLE_INFO),
1835 .fc_metric = IP6_RT_PRIO_USER,
1836 .fc_ifindex = dev->ifindex,
1837 .fc_dst_len = prefixlen,
1838 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1839 RTF_UP | RTF_PREF(pref),
1840 .fc_nlinfo.portid = 0,
1841 .fc_nlinfo.nlh = NULL,
1842 .fc_nlinfo.nl_net = dev_net(dev),
1845 cfg.fc_dst = *prefix;
1846 cfg.fc_gateway = *gwaddr;
1848 /* We should treat it as a default route if prefix length is 0. */
1850 cfg.fc_flags |= RTF_DEFAULT;
1852 ip6_route_add(&cfg);
1854 return rt6_get_route_info(dev, prefix, prefixlen, gwaddr);
/*
 * Find the autoconfigured default route via router @addr on @dev.
 *
 * The root leaf chain of the table selected by
 * addrconf_rt_table(dev, RT6_TABLE_MAIN) is walked under the table's read
 * lock.  A match needs the same device, both RTF_ADDRCONF and RTF_DEFAULT
 * set, and a gateway equal to @addr.
 */
1858 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1860 struct rt6_info *rt;
1861 struct fib6_table *table;
1863 table = fib6_get_table(dev_net(dev),
1864 addrconf_rt_table(dev, RT6_TABLE_MAIN));
1868 read_lock_bh(&table->tb6_lock);
1869 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1870 if (dev == rt->dst.dev &&
1871 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1872 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1877 read_unlock_bh(&table->tb6_lock);
/*
 * Install an autoconfigured default route via @gwaddr on @dev and return the
 * entry found by a subsequent rt6_get_dflt_router().
 *
 * The route uses metric IP6_RT_PRIO_USER, the table chosen by
 * addrconf_rt_table(dev, RT6_TABLE_DFLT) and flags
 * RTF_GATEWAY|RTF_ADDRCONF|RTF_DEFAULT|RTF_UP|RTF_EXPIRES plus the router
 * preference encoded with RTF_PREF().
 *
 * The return value of ip6_route_add() is not examined here; a failed add
 * shows up only as the result of the lookup.
 */
1881 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1882 struct net_device *dev,
1885 struct fib6_config cfg = {
1886 .fc_table = addrconf_rt_table(dev, RT6_TABLE_DFLT),
1887 .fc_metric = IP6_RT_PRIO_USER,
1888 .fc_ifindex = dev->ifindex,
1889 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1890 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1891 .fc_nlinfo.portid = 0,
1892 .fc_nlinfo.nlh = NULL,
1893 .fc_nlinfo.nl_net = dev_net(dev),
1896 cfg.fc_gateway = *gwaddr;
1898 ip6_route_add(&cfg);
1900 return rt6_get_dflt_router(gwaddr, dev);
/*
 * fib6_clean_all() callback used by rt6_purge_dflt_routers().
 *
 * The test selects routes that have RTF_DEFAULT or RTF_ADDRCONF set and
 * whose idev is either missing or has accept_ra different from 2.
 * @arg is not used by the condition.
 *
 * NOTE(review): the mask test is true when *either* flag is set, whereas
 * rt6_get_dflt_router() requires both -- confirm that purging every
 * RTF_ADDRCONF route (not only default routers) is intended.
 */
1904 int rt6_addrconf_purge(struct rt6_info *rt, void *arg) {
1905 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1906 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2))
/*
 * Run rt6_addrconf_purge() over every route of @net via fib6_clean_all().
 */
1911 void rt6_purge_dflt_routers(struct net *net)
1913 fib6_clean_all(net, rt6_addrconf_purge, 0, NULL);
/*
 * Translate a legacy ioctl in6_rtmsg into a fib6_config.
 *
 * @cfg is zeroed first, so every field not assigned below stays 0/NULL.
 * The route always targets RT6_TABLE_MAIN in namespace @net; rtmsg_info is
 * carried over as the expiry value (fc_expires).
 */
1916 static void rtmsg_to_fib6_config(struct net *net,
1917 struct in6_rtmsg *rtmsg,
1918 struct fib6_config *cfg)
1920 memset(cfg, 0, sizeof(*cfg));
1922 cfg->fc_table = RT6_TABLE_MAIN;
1923 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1924 cfg->fc_metric = rtmsg->rtmsg_metric;
1925 cfg->fc_expires = rtmsg->rtmsg_info;
1926 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1927 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1928 cfg->fc_flags = rtmsg->rtmsg_flags;
1930 cfg->fc_nlinfo.nl_net = net;
1932 cfg->fc_dst = rtmsg->rtmsg_dst;
1933 cfg->fc_src = rtmsg->rtmsg_src;
1934 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * ioctl entry point for SIOCADDRT / SIOCDELRT.
 *
 * Both commands are gated on CAP_NET_ADMIN in the namespace's user
 * namespace.  The user-supplied in6_rtmsg at @arg is copied in, converted
 * with rtmsg_to_fib6_config(), and handed to ip6_route_add() or
 * ip6_route_del().
 */
1937 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1939 struct fib6_config cfg;
1940 struct in6_rtmsg rtmsg;
1944 case SIOCADDRT: /* Add a route */
1945 case SIOCDELRT: /* Delete a route */
1946 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1948 err = copy_from_user(&rtmsg, arg,
1949 sizeof(struct in6_rtmsg));
1953 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1958 err = ip6_route_add(&cfg);
1961 err = ip6_route_del(&cfg);
1975 * Drop the packet on the floor
/*
 * Shared helper behind ip6_pkt_discard*() and ip6_pkt_prohibit*().
 *
 * @code is the ICMPv6 Destination Unreachable code sent back for @skb.
 * @ipstats_mib_noroutes selects the statistics handling: for
 * IPSTATS_MIB_INNOROUTES the destination address type is examined and an
 * unspecified destination (IPV6_ADDR_ANY) is counted under
 * IPSTATS_MIB_INADDRERRORS; the IPSTATS_MIB_OUTNOROUTES arm increments the
 * counter named by @ipstats_mib_noroutes itself.  Counters are charged to
 * the namespace and idev of the skb's dst.
 */
1978 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1981 struct dst_entry *dst = skb_dst(skb);
1982 switch (ipstats_mib_noroutes) {
1983 case IPSTATS_MIB_INNOROUTES:
1984 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1985 if (type == IPV6_ADDR_ANY) {
1986 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1987 IPSTATS_MIB_INADDRERRORS);
1991 case IPSTATS_MIB_OUTNOROUTES:
1992 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1993 ipstats_mib_noroutes);
1996 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/*
 * Input-side discard handler: drops @skb through ip6_pkt_drop() with code
 * ICMPV6_NOROUTE, accounted as IPSTATS_MIB_INNOROUTES.
 */
2001 static int ip6_pkt_discard(struct sk_buff *skb)
2003 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/*
 * Output-side discard handler: sets skb->dev to the device of the skb's dst
 * and drops @skb through ip6_pkt_drop() with code ICMPV6_NOROUTE, accounted
 * as IPSTATS_MIB_OUTNOROUTES.
 */
2006 static int ip6_pkt_discard_out(struct sk_buff *skb)
2008 skb->dev = skb_dst(skb)->dev;
2009 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2012 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/*
 * Input-side prohibit handler: drops @skb through ip6_pkt_drop() with code
 * ICMPV6_ADM_PROHIBITED, accounted as IPSTATS_MIB_INNOROUTES.
 */
2014 static int ip6_pkt_prohibit(struct sk_buff *skb)
2016 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/*
 * Output-side prohibit handler: sets skb->dev to the device of the skb's
 * dst and drops @skb through ip6_pkt_drop() with code
 * ICMPV6_ADM_PROHIBITED, accounted as IPSTATS_MIB_OUTNOROUTES.
 */
2019 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2021 skb->dev = skb_dst(skb)->dev;
2022 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2028 * Allocate a dst for local (unicast / anycast) address.
/*
 * Create the host route that represents local address @addr of @idev.
 *
 * The dst is allocated against the namespace's loopback device; the
 * allocation-failure path logs a rate-limited warning and returns
 * ERR_PTR(-ENOMEM).  The route is a /128 DST_HOST entry using ip6_input /
 * ip6_output, bound to @idev, starting from flags RTF_UP|RTF_NONEXTHOP with
 * RTF_ANYCAST or RTF_LOCAL OR-ed in on separate paths, and assigned to
 * RT6_TABLE_LOCAL.  Its reference count is set to 1 before it is returned.
 */
2031 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2032 const struct in6_addr *addr,
2035 struct net *net = dev_net(idev->dev);
2036 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2039 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2040 return ERR_PTR(-ENOMEM);
2045 rt->dst.flags |= DST_HOST;
2046 rt->dst.input = ip6_input;
2047 rt->dst.output = ip6_output;
2048 rt->rt6i_idev = idev;
2050 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2052 rt->rt6i_flags |= RTF_ANYCAST;
2054 rt->rt6i_flags |= RTF_LOCAL;
2056 rt->rt6i_dst.addr = *addr;
2057 rt->rt6i_dst.plen = 128;
2058 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2060 atomic_set(&rt->dst.__refcnt, 1);
/*
 * Choose the source address to use with route @rt towards @daddr.
 *
 * A route with a preferred source (rt6i_prefsrc.plen != 0) supplies that
 * address directly in *@saddr.  The other path asks ipv6_dev_get_saddr(),
 * passing the device of the route's idev (or NULL when the route has no
 * idev) together with the caller's preference flags.
 */
2065 int ip6_route_get_saddr(struct net *net,
2066 struct rt6_info *rt,
2067 const struct in6_addr *daddr,
2069 struct in6_addr *saddr)
2071 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2073 if (rt->rt6i_prefsrc.plen)
2074 *saddr = rt->rt6i_prefsrc.addr;
2076 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2077 daddr, prefs, saddr);
2081 /* remove deleted ip from prefsrc entries */
/*
 * Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all():
 * the device to restrict the walk to and the address being removed.
 */
2082 struct arg_dev_net_ip {
2083 struct net_device *dev;
2085 struct in6_addr *addr;
/*
 * fib6_clean_all() callback: forget a preferred source address.
 *
 * @arg is a struct arg_dev_net_ip.  A route is affected when it is on the
 * given device (or no device was given), is not the namespace's
 * ip6_null_entry, and its rt6i_prefsrc address equals the address being
 * removed.  For such a route the preferred-source prefix length is reset
 * to 0, which is the value ip6_route_get_saddr() treats as "no prefsrc".
 */
2088 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2090 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2091 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2092 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2094 if (((void *)rt->dst.dev == dev || !dev) &&
2095 rt != net->ipv6.ip6_null_entry &&
2096 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2097 /* remove prefsrc entry */
2098 rt->rt6i_prefsrc.plen = 0;
/*
 * Run fib6_remove_prefsrc() over every route in the namespace of the
 * interface that owns @ifp, with the walk argument's device set to that
 * interface.
 */
2103 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2105 struct net *net = dev_net(ifp->idev->dev);
2106 struct arg_dev_net_ip adni = {
2107 .dev = ifp->idev->dev,
2111 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/*
 * Argument bundle handed to fib6_ifdown(): the device going down (NULL is
 * accepted by the callback as "any device").
 */
2114 struct arg_dev_net {
2115 struct net_device *dev;
/*
 * Route-walk callback used by rt6_ifdown().
 *
 * @arg is a struct arg_dev_net.  The condition selects routes that are on
 * the given device (every route when the device is NULL) and are not the
 * namespace's ip6_null_entry.
 */
2119 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2121 const struct arg_dev_net *adn = arg;
2122 const struct net_device *dev = adn->dev;
2124 if ((rt->dst.dev == dev || !dev) &&
2125 rt != adn->net->ipv6.ip6_null_entry)
/*
 * Apply fib6_ifdown() to all FIB routes of @net via fib6_clean_all() and
 * then to the ICMPv6 dst list via icmp6_clean_all(), for device @dev.
 */
2131 void rt6_ifdown(struct net *net, struct net_device *dev)
2133 struct arg_dev_net adn = {
2138 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2139 icmp6_clean_all(fib6_ifdown, &adn);
/*
 * Argument bundle handed to rt6_mtu_change_route(): the device whose MTU
 * changed (the callback also reads an mtu member for the new value).
 */
2142 struct rt6_mtu_change_arg {
2143 struct net_device *dev;
/*
 * fib6_clean_all() callback applied by rt6_mtu_change() to every route.
 *
 * @p_arg is a struct rt6_mtu_change_arg.  A route's RTAX_MTU metric is set
 * to the new device MTU when all of the following hold:
 *   - the route is on the device whose MTU changed;
 *   - RTAX_MTU is not locked on the route;
 *   - the route MTU is >= the new MTU (a decrease), or it is lower than the
 *     new MTU but equal to the idev's current cnf.mtu6, i.e. the route was
 *     tracking the old device MTU (an increase).
 */
2147 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2149 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2150 struct inet6_dev *idev;
2152 /* In IPv6 pmtu discovery is not optional,
2153 so that RTAX_MTU lock cannot disable it.
2154 We still use this lock to block changes
2155 caused by addrconf/ndisc.
2158 idev = __in6_dev_get(arg->dev);
2162 /* For administrative MTU increase, there is no way to discover
2163 IPv6 PMTU increase, so PMTU increase should be updated here.
2164 Since RFC 1981 doesn't include administrative MTU increase
2165 update PMTU increase is a MUST. (i.e. jumbo frame)
2168 If new MTU is less than route PMTU, this new MTU will be the
2169 lowest MTU in the path, update the route PMTU to reflect PMTU
2170 decreases; if new MTU is greater than route PMTU, and the
2171 old MTU is the lowest MTU in the path, update the route PMTU
2172 to reflect the increase. In this case if the other nodes' MTU
2173 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2176 if (rt->dst.dev == arg->dev &&
2177 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2178 (dst_mtu(&rt->dst) >= arg->mtu ||
2179 (dst_mtu(&rt->dst) < arg->mtu &&
2180 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2181 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/*
 * Propagate a device MTU change: runs rt6_mtu_change_route() over every
 * route in the namespace of @dev.
 */
2186 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2188 struct rt6_mtu_change_arg arg = {
2193 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/*
 * Netlink attribute policy used by nlmsg_parse() in rtm_to_fib6_config()
 * and inet6_rtm_getroute().
 *
 * NOTE(review): RTA_TABLE and RTA_MARK are read with nla_get_u32() by those
 * functions but have no entry here, so their payload length does not appear
 * to be validated by the policy -- confirm and consider adding NLA_U32
 * entries.
 */
2196 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2197 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2198 [RTA_OIF] = { .type = NLA_U32 },
2199 [RTA_IIF] = { .type = NLA_U32 },
2200 [RTA_PRIORITY] = { .type = NLA_U32 },
2201 [RTA_METRICS] = { .type = NLA_NESTED },
2202 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2203 [RTA_UID] = { .type = NLA_U32 },
/*
 * Translate an rtnetlink route request (@nlh, received in @skb) into a
 * fib6_config.
 *
 * The message is parsed against rtm_ipv6_policy and @cfg is zeroed before
 * being filled.  fc_flags starts as RTF_UP; RTF_REJECT is added for the
 * unreachable/blackhole/prohibit/throw route types and RTF_LOCAL for
 * RTN_LOCAL.  RTA_GATEWAY additionally sets RTF_GATEWAY.
 *
 * RTA_DST and RTA_SRC are length-checked against the number of bytes needed
 * for the announced prefix length ((len + 7) >> 3) and only that many bytes
 * are copied.  RTA_METRICS and RTA_MULTIPATH are not copied: @cfg keeps
 * pointers into the message together with their lengths, so @cfg must not
 * outlive @nlh.
 */
2206 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2207 struct fib6_config *cfg)
2210 struct nlattr *tb[RTA_MAX+1];
2213 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2218 rtm = nlmsg_data(nlh);
2219 memset(cfg, 0, sizeof(*cfg));
2221 cfg->fc_table = rtm->rtm_table;
2222 cfg->fc_dst_len = rtm->rtm_dst_len;
2223 cfg->fc_src_len = rtm->rtm_src_len;
2224 cfg->fc_flags = RTF_UP;
2225 cfg->fc_protocol = rtm->rtm_protocol;
2226 cfg->fc_type = rtm->rtm_type;
2228 if (rtm->rtm_type == RTN_UNREACHABLE ||
2229 rtm->rtm_type == RTN_BLACKHOLE ||
2230 rtm->rtm_type == RTN_PROHIBIT ||
2231 rtm->rtm_type == RTN_THROW)
2232 cfg->fc_flags |= RTF_REJECT;
2234 if (rtm->rtm_type == RTN_LOCAL)
2235 cfg->fc_flags |= RTF_LOCAL;
2237 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2238 cfg->fc_nlinfo.nlh = nlh;
2239 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2241 if (tb[RTA_GATEWAY]) {
2242 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2243 cfg->fc_flags |= RTF_GATEWAY;
2247 int plen = (rtm->rtm_dst_len + 7) >> 3;
2249 if (nla_len(tb[RTA_DST]) < plen)
2252 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2256 int plen = (rtm->rtm_src_len + 7) >> 3;
2258 if (nla_len(tb[RTA_SRC]) < plen)
2261 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2264 if (tb[RTA_PREFSRC])
2265 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2268 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2270 if (tb[RTA_PRIORITY])
2271 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2273 if (tb[RTA_METRICS]) {
2274 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2275 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2279 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2281 if (tb[RTA_MULTIPATH]) {
2282 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2283 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
/*
 * Add (@add non-zero) or delete (@add zero) every next hop listed in the
 * RTA_MULTIPATH payload referenced by @cfg.
 *
 * For each rtnexthop entry a private copy of @cfg is made, the ifindex is
 * overridden when the entry carries one, and an RTA_GATEWAY attribute found
 * among the entry's nested attributes replaces the gateway and sets
 * RTF_GATEWAY.  Each next hop is then processed as an ordinary single route
 * through ip6_route_add() or ip6_route_del().
 *
 * NLM_F_EXCL is cleared on the request header while iterating, for the
 * reason given in the comment inside the loop.  Note that this modifies the
 * caller's netlink header in place.
 */
2291 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2293 struct fib6_config r_cfg;
2294 struct rtnexthop *rtnh;
2297 int err = 0, last_err = 0;
2300 rtnh = (struct rtnexthop *)cfg->fc_mp;
2301 remaining = cfg->fc_mp_len;
2303 /* Parse a Multipath Entry */
2304 while (rtnh_ok(rtnh, remaining)) {
2305 memcpy(&r_cfg, cfg, sizeof(*cfg));
2306 if (rtnh->rtnh_ifindex)
2307 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2309 attrlen = rtnh_attrlen(rtnh);
2311 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2313 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2315 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
2316 r_cfg.fc_flags |= RTF_GATEWAY;
2319 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2322 /* If we are trying to remove a route, do not stop the
2323 * loop when ip6_route_del() fails (because next hop is
2324 * already gone), we should try to remove all next hops.
2327 /* If add fails, we should try to delete all
2328 * next hops that have been already added.
2334 /* Because each route is added like a single route we remove
2335 * this flag after the first nexthop (if there is a collision,
2336 * we have already fail to add the first nexthop:
2337 * fib6_add_rt2node() has reject it).
2339 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL;
2340 rtnh = rtnh_next(rtnh, &remaining);
/*
 * RTM_DELROUTE handler: converts the request with rtm_to_fib6_config() and
 * deletes either all next hops of a multipath request
 * (ip6_route_multipath(&cfg, 0)) or a single route (ip6_route_del()).
 */
2346 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh)
2348 struct fib6_config cfg;
2351 err = rtm_to_fib6_config(skb, nlh, &cfg);
2356 return ip6_route_multipath(&cfg, 0);
2358 return ip6_route_del(&cfg);
/*
 * RTM_NEWROUTE handler: converts the request with rtm_to_fib6_config() and
 * adds either all next hops of a multipath request
 * (ip6_route_multipath(&cfg, 1)) or a single route (ip6_route_add()).
 */
2361 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh)
2363 struct fib6_config cfg;
2366 err = rtm_to_fib6_config(skb, nlh, &cfg);
2371 return ip6_route_multipath(&cfg, 1);
2373 return ip6_route_add(&cfg);
/*
 * Upper bound on the size of a route notification built by rt6_fill_node():
 * the aligned rtmsg header plus one slot per attribute listed below.
 * inet6_rt_notify() sizes its skb with this value and warns when
 * rt6_fill_node() still reports -EMSGSIZE, so a new attribute emitted by
 * rt6_fill_node() needs a matching term here.
 */
2376 static inline size_t rt6_nlmsg_size(void)
2378 return NLMSG_ALIGN(sizeof(struct rtmsg))
2379 + nla_total_size(16) /* RTA_SRC */
2380 + nla_total_size(16) /* RTA_DST */
2381 + nla_total_size(16) /* RTA_GATEWAY */
2382 + nla_total_size(16) /* RTA_PREFSRC */
2383 + nla_total_size(4) /* RTA_TABLE */
2384 + nla_total_size(4) /* RTA_IIF */
2385 + nla_total_size(4) /* RTA_OIF */
2386 + nla_total_size(4) /* RTA_PRIORITY */
2387 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2388 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * Build one rtnetlink route message of type @type describing @rt in @skb.
 *
 * @prefix requests prefix routes only: a route without RTF_PREFIX_RT is
 * then treated as success (see the comment at the check).  On the branches
 * that use @dst / @src, those addresses are emitted as RTA_DST / RTA_SRC
 * with a prefix length of 128; otherwise the route's own keys are emitted
 * when their prefix length is non-zero.  @iif is reported as RTA_IIF on the
 * input-interface path.  @nowait is only forwarded to ip6mr_get_route() for
 * multicast destinations (CONFIG_IPV6_MROUTE).
 *
 * rtm_type is derived from the route: reject routes are mapped from
 * dst.error to a blackhole/prohibit/throw/unreachable type, RTF_LOCAL or a
 * loopback device yields RTN_LOCAL, anything else RTN_UNICAST.
 * rtm_protocol starts from rt6i_protocol and is overridden for RTF_DYNAMIC
 * (RTPROT_REDIRECT) and RTF_ADDRCONF routes (RTPROT_RA or RTPROT_KERNEL).
 * RTF_CACHE routes are marked RTM_F_CLONED.
 *
 * The remaining time reported through rtnl_put_cacheinfo() is
 * dst.expires - jiffies for RTF_EXPIRES routes and 0 otherwise.
 *
 * On success the result of nlmsg_end() is returned.  Any failing
 * nla_put*() jumps to nla_put_failure, where the partially built message is
 * cancelled with nlmsg_cancel().
 */
2391 static int rt6_fill_node(struct net *net,
2392 struct sk_buff *skb, struct rt6_info *rt,
2393 struct in6_addr *dst, struct in6_addr *src,
2394 int iif, int type, u32 portid, u32 seq,
2395 int prefix, int nowait, unsigned int flags)
2398 struct nlmsghdr *nlh;
2402 if (prefix) { /* user wants prefix routes only */
2403 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2404 /* success since this is not a prefix route */
2409 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2413 rtm = nlmsg_data(nlh);
2414 rtm->rtm_family = AF_INET6;
2415 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2416 rtm->rtm_src_len = rt->rt6i_src.plen;
2419 table = rt->rt6i_table->tb6_id;
2421 table = RT6_TABLE_UNSPEC;
2422 rtm->rtm_table = table;
2423 if (nla_put_u32(skb, RTA_TABLE, table))
2424 goto nla_put_failure;
2425 if (rt->rt6i_flags & RTF_REJECT) {
2426 switch (rt->dst.error) {
2428 rtm->rtm_type = RTN_BLACKHOLE;
2431 rtm->rtm_type = RTN_PROHIBIT;
2434 rtm->rtm_type = RTN_THROW;
2437 rtm->rtm_type = RTN_UNREACHABLE;
2441 else if (rt->rt6i_flags & RTF_LOCAL)
2442 rtm->rtm_type = RTN_LOCAL;
2443 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2444 rtm->rtm_type = RTN_LOCAL;
2446 rtm->rtm_type = RTN_UNICAST;
2448 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2449 rtm->rtm_protocol = rt->rt6i_protocol;
2450 if (rt->rt6i_flags & RTF_DYNAMIC)
2451 rtm->rtm_protocol = RTPROT_REDIRECT;
2452 else if (rt->rt6i_flags & RTF_ADDRCONF) {
2453 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2454 rtm->rtm_protocol = RTPROT_RA;
2456 rtm->rtm_protocol = RTPROT_KERNEL;
2459 if (rt->rt6i_flags & RTF_CACHE)
2460 rtm->rtm_flags |= RTM_F_CLONED;
2463 if (nla_put(skb, RTA_DST, 16, dst))
2464 goto nla_put_failure;
2465 rtm->rtm_dst_len = 128;
2466 } else if (rtm->rtm_dst_len)
2467 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2468 goto nla_put_failure;
2469 #ifdef CONFIG_IPV6_SUBTREES
2471 if (nla_put(skb, RTA_SRC, 16, src))
2472 goto nla_put_failure;
2473 rtm->rtm_src_len = 128;
2474 } else if (rtm->rtm_src_len &&
2475 nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2476 goto nla_put_failure;
2479 #ifdef CONFIG_IPV6_MROUTE
2480 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2481 int err = ip6mr_get_route(net, skb, rtm, nowait);
2486 goto nla_put_failure;
2488 if (err == -EMSGSIZE)
2489 goto nla_put_failure;
2494 if (nla_put_u32(skb, RTA_IIF, iif))
2495 goto nla_put_failure;
2497 struct in6_addr saddr_buf;
2498 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2499 nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2500 goto nla_put_failure;
2503 if (rt->rt6i_prefsrc.plen) {
2504 struct in6_addr saddr_buf;
2505 saddr_buf = rt->rt6i_prefsrc.addr;
2506 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2507 goto nla_put_failure;
2510 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2511 goto nla_put_failure;
2513 if (rt->rt6i_flags & RTF_GATEWAY) {
2514 if (nla_put(skb, RTA_GATEWAY, 16, &rt->rt6i_gateway) < 0)
2515 goto nla_put_failure;
2519 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2520 goto nla_put_failure;
2521 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2522 goto nla_put_failure;
2524 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2526 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2527 goto nla_put_failure;
2529 return nlmsg_end(skb, nlh);
2532 nlmsg_cancel(skb, nlh);
/*
 * Dump callback: emit @rt as one RTM_NEWROUTE part of a multi-part
 * (NLM_F_MULTI) reply.
 *
 * @p_arg is a struct rt6_rtnl_dump_arg.  When the dump request is large
 * enough to contain an rtmsg, its RTM_F_PREFIX flag decides whether only
 * prefix routes are wanted; that choice is forwarded to rt6_fill_node().
 */
2536 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2538 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2541 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2542 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2543 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2547 return rt6_fill_node(arg->net,
2548 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2549 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2550 prefix, 0, NLM_F_MULTI);
/*
 * RTM_GETROUTE handler: resolve a route for the flow described by the
 * request attributes and unicast the answer back to the requester.
 *
 * The request is parsed against rtm_ipv6_policy.  RTA_SRC and RTA_DST are
 * length-checked against sizeof(struct in6_addr) before being copied into
 * the flow.  RTA_IIF, RTA_OIF, RTA_MARK and RTA_UID fill in the matching
 * values; without RTA_UID the uid is INVALID_UID for input lookups (iif
 * set) and the caller's uid otherwise.
 *
 * With an input interface the device is looked up by index and the route
 * is resolved through ip6_route_input_lookup(); otherwise the output
 * interface is stored in the flow and ip6_route_output() is used.
 *
 * The reply skb reserves headroom for dummy headers, takes the route as its
 * dst, is filled in by rt6_fill_node() and sent with rtnl_unicast().
 */
2553 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh)
2555 struct net *net = sock_net(in_skb->sk);
2556 struct nlattr *tb[RTA_MAX+1];
2557 struct rt6_info *rt;
2558 struct sk_buff *skb;
2561 int err, iif = 0, oif = 0;
2563 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2568 memset(&fl6, 0, sizeof(fl6));
2571 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2574 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2578 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2581 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2585 iif = nla_get_u32(tb[RTA_IIF]);
2588 oif = nla_get_u32(tb[RTA_OIF]);
2591 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
2594 fl6.flowi6_uid = make_kuid(current_user_ns(),
2595 nla_get_u32(tb[RTA_UID]));
2597 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
2600 struct net_device *dev;
2603 dev = __dev_get_by_index(net, iif);
2609 fl6.flowi6_iif = iif;
2611 if (!ipv6_addr_any(&fl6.saddr))
2612 flags |= RT6_LOOKUP_F_HAS_SADDR;
2614 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2617 fl6.flowi6_oif = oif;
2619 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2622 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2629 /* Reserve room for dummy headers, this skb can pass
2630 through good chunk of routing engine.
2632 skb_reset_mac_header(skb);
2633 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2635 skb_dst_set(skb, &rt->dst);
2637 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2638 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
2639 nlh->nlmsg_seq, 0, 0, 0);
2645 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/*
 * Broadcast a route change (@event) for @rt to the RTNLGRP_IPV6_ROUTE
 * netlink group.
 *
 * The message is sized with rt6_nlmsg_size() and built by rt6_fill_node();
 * the sequence number is taken from the originating request in @info when
 * there is one, else 0.  An -EMSGSIZE result from rt6_fill_node() triggers
 * a WARN_ON because it means rt6_nlmsg_size() under-estimated.  On the
 * error path the failure is recorded on the group with rtnl_set_sk_err().
 */
2650 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2652 struct sk_buff *skb;
2653 struct net *net = info->nl_net;
2658 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2660 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2664 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2665 event, info->portid, seq, 0, 0, 0);
2667 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2668 WARN_ON(err == -EMSGSIZE);
2672 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2673 info->nlh, gfp_any());
2677 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * Netdevice notifier callback.
 *
 * When a loopback device is registered (NETDEV_REGISTER with IFF_LOOPBACK),
 * the namespace's special route entries -- the null entry and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, the prohibit and blackhole entries -- are
 * bound to it: dst.dev is pointed at the device and rt6i_idev is set from
 * in6_dev_get(dev), one call per entry.
 */
2680 static int ip6_route_dev_notify(struct notifier_block *this,
2681 unsigned long event, void *data)
2683 struct net_device *dev = (struct net_device *)data;
2684 struct net *net = dev_net(dev);
2686 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2687 net->ipv6.ip6_null_entry->dst.dev = dev;
2688 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2689 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2690 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2691 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2692 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2693 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2704 #ifdef CONFIG_PROC_FS
/*
 * Print one route as a line of the "ipv6_route" proc file.
 *
 * @p_arg is the seq_file.  Fields, in order: destination address and prefix
 * length; source address and prefix length (all zeros unless
 * CONFIG_IPV6_SUBTREES); gateway (all zeros unless RTF_GATEWAY); then
 * metric, reference count, use count and flags as 8-digit hex; and the
 * device name (empty when the route has no device).
 */
2715 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2717 struct seq_file *m = p_arg;
2719 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2721 #ifdef CONFIG_IPV6_SUBTREES
2722 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2724 seq_puts(m, "00000000000000000000000000000000 00 ");
2726 if (rt->rt6i_flags & RTF_GATEWAY) {
2727 seq_printf(m, "%pi6", &rt->rt6i_gateway);
2729 seq_puts(m, "00000000000000000000000000000000");
2731 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2732 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2733 rt->dst.__use, rt->rt6i_flags,
2734 rt->dst.dev ? rt->dst.dev->name : "");
/*
 * seq_file show routine for "ipv6_route": walks all routes of the namespace
 * stored in m->private with fib6_clean_all_ro(), printing each one through
 * rt6_info_route().
 */
2738 static int ipv6_route_show(struct seq_file *m, void *v)
2740 struct net *net = (struct net *)m->private;
2741 fib6_clean_all_ro(net, rt6_info_route, 0, m);
/*
 * open() for the "ipv6_route" proc file: a namespace-aware single_open
 * bound to ipv6_route_show().
 */
2745 static int ipv6_route_open(struct inode *inode, struct file *file)
2747 return single_open_net(inode, file, ipv6_route_show);
/*
 * File operations for the "ipv6_route" proc entry created in
 * ip6_route_net_init_late().  release pairs with the single_open_net() done
 * in ipv6_route_open().
 */
2750 static const struct file_operations ipv6_route_proc_fops = {
2751 .owner = THIS_MODULE,
2752 .open = ipv6_route_open,
2754 .llseek = seq_lseek,
2755 .release = single_release_net,
/*
 * seq_file show routine for "rt6_stats": prints seven 4-digit hex fields on
 * one line -- fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries,
 * fib_rt_cache, the current number of dst entries (dst_entries_get_slow()),
 * and fib_discarded_routes -- for the namespace stored in seq->private.
 */
2758 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2760 struct net *net = (struct net *)seq->private;
2761 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2762 net->ipv6.rt6_stats->fib_nodes,
2763 net->ipv6.rt6_stats->fib_route_nodes,
2764 net->ipv6.rt6_stats->fib_rt_alloc,
2765 net->ipv6.rt6_stats->fib_rt_entries,
2766 net->ipv6.rt6_stats->fib_rt_cache,
2767 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2768 net->ipv6.rt6_stats->fib_discarded_routes);
/*
 * open() for the "rt6_stats" proc file: a namespace-aware single_open bound
 * to rt6_stats_seq_show().
 */
2773 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2775 return single_open_net(inode, file, rt6_stats_seq_show);
/*
 * File operations for the "rt6_stats" proc entry created in
 * ip6_route_net_init_late().  release pairs with the single_open_net() done
 * in rt6_stats_seq_open().
 */
2778 static const struct file_operations rt6_stats_seq_fops = {
2779 .owner = THIS_MODULE,
2780 .open = rt6_stats_seq_open,
2782 .llseek = seq_lseek,
2783 .release = single_release_net,
2785 #endif /* CONFIG_PROC_FS */
2787 #ifdef CONFIG_SYSCTL
/*
 * proc handler for the "flush" route sysctl.
 *
 * The namespace comes from ctl->extra1 (set in ipv6_route_sysctl_init()).
 * The written value is stored through proc_dointvec() and fib6_run_gc() is
 * then run with the delay as timeout; a delay <= 0 is mapped to ~0UL.
 *
 * NOTE(review): "delay" is sampled from flush_delay *before*
 * proc_dointvec() stores the new value, so the GC below appears to run with
 * the previously written delay rather than the one just supplied, and the
 * return value of proc_dointvec() is not checked -- confirm whether this is
 * intended.
 */
2790 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2791 void __user *buffer, size_t *lenp, loff_t *ppos)
2798 net = (struct net *)ctl->extra1;
2799 delay = net->ipv6.sysctl.flush_delay;
2800 proc_dointvec(ctl, write, buffer, lenp, ppos);
2801 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the per-namespace IPv6 route sysctl table.
 *
 * ipv6_route_sysctl_init() duplicates this array and re-points each entry's
 * .data at the namespace's own variable *by array index*, so the order of
 * the entries here must stay in sync with the table[N] assignments in that
 * function.  "gc_min_interval" and "gc_min_interval_ms" expose the same
 * variable (ip6_rt_gc_min_interval) through a jiffies and a milliseconds
 * handler respectively.
 */
2805 ctl_table ipv6_route_table_template[] = {
2807 .procname = "flush",
2808 .data = &init_net.ipv6.sysctl.flush_delay,
2809 .maxlen = sizeof(int),
2811 .proc_handler = ipv6_sysctl_rtcache_flush
2814 .procname = "gc_thresh",
2815 .data = &ip6_dst_ops_template.gc_thresh,
2816 .maxlen = sizeof(int),
2818 .proc_handler = proc_dointvec,
2821 .procname = "max_size",
2822 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2823 .maxlen = sizeof(int),
2825 .proc_handler = proc_dointvec,
2828 .procname = "gc_min_interval",
2829 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2830 .maxlen = sizeof(int),
2832 .proc_handler = proc_dointvec_jiffies,
2835 .procname = "gc_timeout",
2836 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2837 .maxlen = sizeof(int),
2839 .proc_handler = proc_dointvec_jiffies,
2842 .procname = "gc_interval",
2843 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2844 .maxlen = sizeof(int),
2846 .proc_handler = proc_dointvec_jiffies,
2849 .procname = "gc_elasticity",
2850 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2851 .maxlen = sizeof(int),
2853 .proc_handler = proc_dointvec,
2856 .procname = "mtu_expires",
2857 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2858 .maxlen = sizeof(int),
2860 .proc_handler = proc_dointvec_jiffies,
2863 .procname = "min_adv_mss",
2864 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2865 .maxlen = sizeof(int),
2867 .proc_handler = proc_dointvec,
2870 .procname = "gc_min_interval_ms",
2871 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2872 .maxlen = sizeof(int),
2874 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * Create the route sysctl table for namespace @net.
 *
 * The template is duplicated with kmemdup() and every entry's .data is
 * redirected from the init_net/template variable to the corresponding field
 * of @net.  The indices used below mirror the entry order of
 * ipv6_route_table_template.  Entry 0 ("flush") also gets @net in extra1,
 * which ipv6_sysctl_rtcache_flush() reads back.
 */
2879 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2881 struct ctl_table *table;
2883 table = kmemdup(ipv6_route_table_template,
2884 sizeof(ipv6_route_table_template),
2888 table[0].data = &net->ipv6.sysctl.flush_delay;
2889 table[0].extra1 = net;
2890 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2891 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2892 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2893 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2894 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2895 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2896 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2897 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
/* Same variable as table[3]; this entry exposes it in milliseconds. */
2898 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2900 /* Don't export sysctls to unprivileged users */
/* Clearing procname of entry 0 -- presumably this terminates the table at
 * its first slot so nothing is registered; confirm against the sysctl core.
 */
2901 if (net->user_ns != &init_user_ns)
2902 table[0].procname = NULL;
/*
 * Per-namespace initialisation of the IPv6 routing state.
 *
 * Steps: copy ip6_dst_ops_template into the namespace and initialise its
 * entry counter; duplicate the null entry template (and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, the prohibit and blackhole templates), make
 * each copy its own dst.path, attach it to the namespace's dst ops and give
 * it ip6_template_metrics; finally set the default values of the route
 * sysctls and of ip6_rt_gc_expire.
 *
 * Failures unwind through the out_* labels, which free what was set up in
 * reverse order.
 */
2909 static int __net_init ip6_route_net_init(struct net *net)
2913 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2914 sizeof(net->ipv6.ip6_dst_ops));
2916 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2917 goto out_ip6_dst_ops;
2919 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2920 sizeof(*net->ipv6.ip6_null_entry),
2922 if (!net->ipv6.ip6_null_entry)
2923 goto out_ip6_dst_entries;
2924 net->ipv6.ip6_null_entry->dst.path =
2925 (struct dst_entry *)net->ipv6.ip6_null_entry;
2926 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2927 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2928 ip6_template_metrics, true);
2930 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2931 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2932 sizeof(*net->ipv6.ip6_prohibit_entry),
2934 if (!net->ipv6.ip6_prohibit_entry)
2935 goto out_ip6_null_entry;
2936 net->ipv6.ip6_prohibit_entry->dst.path =
2937 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2938 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2939 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2940 ip6_template_metrics, true);
2942 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2943 sizeof(*net->ipv6.ip6_blk_hole_entry),
2945 if (!net->ipv6.ip6_blk_hole_entry)
2946 goto out_ip6_prohibit_entry;
2947 net->ipv6.ip6_blk_hole_entry->dst.path =
2948 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2949 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2950 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2951 ip6_template_metrics, true);
2954 net->ipv6.sysctl.flush_delay = 0;
2955 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2956 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2957 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2958 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2959 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2960 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2961 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2963 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2969 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2970 out_ip6_prohibit_entry:
2971 kfree(net->ipv6.ip6_prohibit_entry);
2973 kfree(net->ipv6.ip6_null_entry);
2975 out_ip6_dst_entries:
2976 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Per-namespace teardown: frees the namespace's copies of the special
 * route entries (null, plus prohibit and blackhole when
 * CONFIG_IPV6_MULTIPLE_TABLES is set) and destroys the dst entries state
 * of the namespace's dst_ops.
 */
2981 static void __net_exit ip6_route_net_exit(struct net *net)
2983 kfree(net->ipv6.ip6_null_entry);
2984 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2985 kfree(net->ipv6.ip6_prohibit_entry);
2986 kfree(net->ipv6.ip6_blk_hole_entry);
2988 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Late per-namespace init: with CONFIG_PROC_FS, creates the "ipv6_route"
 * and "rt6_stats" proc entries under the namespace's proc_net directory.
 * The return values of proc_create() are not checked in the visible lines.
 */
2991 static int __net_init ip6_route_net_init_late(struct net *net)
2993 #ifdef CONFIG_PROC_FS
/* "ipv6_route" is created with mode 0; "rt6_stats" with S_IRUGO. */
2994 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
2995 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/*
 * Late per-namespace teardown: with CONFIG_PROC_FS, removes the
 * "ipv6_route" and "rt6_stats" proc entries from the namespace's
 * proc_net directory.
 */
3000 static void __net_exit ip6_route_net_exit_late(struct net *net)
3002 #ifdef CONFIG_PROC_FS
3003 remove_proc_entry("ipv6_route", net->proc_net);
3004 remove_proc_entry("rt6_stats", net->proc_net);
/* Pernet hooks pairing ip6_route_net_init() with ip6_route_net_exit(). */
3008 static struct pernet_operations ip6_route_net_ops = {
3009 .init = ip6_route_net_init,
3010 .exit = ip6_route_net_exit,
/*
 * Per-namespace init of the IPv6 inet peer base: allocates a
 * struct inet_peer_base with GFP_KERNEL, initialises it and stores it in
 * net->ipv6.peers.
 * NOTE(review): the allocation-failure check is not visible in this
 * listing - confirm it exists before inet_peer_base_init() is reached.
 */
3013 static int __net_init ipv6_inetpeer_init(struct net *net)
3015 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3019 inet_peer_base_init(bp);
3020 net->ipv6.peers = bp;
/*
 * Per-namespace teardown of the IPv6 inet peer base: detaches the base
 * from the namespace and invalidates its tree.
 * NOTE(review): the release of bp itself is not visible in this listing -
 * confirm it is freed after invalidation.
 */
3024 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3026 struct inet_peer_base *bp = net->ipv6.peers;
/* Clear the namespace's pointer before invalidating the tree. */
3028 net->ipv6.peers = NULL;
3029 inetpeer_invalidate_tree(bp);
/* Pernet hooks pairing ipv6_inetpeer_init() with ipv6_inetpeer_exit(). */
3033 static struct pernet_operations ipv6_inetpeer_ops = {
3034 .init = ipv6_inetpeer_init,
3035 .exit = ipv6_inetpeer_exit,
/*
 * Pernet hooks for the late stage (proc entries): pairs
 * ip6_route_net_init_late() with ip6_route_net_exit_late().
 */
3038 static struct pernet_operations ip6_route_net_late_ops = {
3039 .init = ip6_route_net_init_late,
3040 .exit = ip6_route_net_exit_late,
/* Netdevice notifier block that dispatches to ip6_route_dev_notify(). */
3043 static struct notifier_block ip6_route_dev_notifier = {
3044 .notifier_call = ip6_route_dev_notify,
/*
 * One-time initialisation of the IPv6 routing layer: creates the dst slab
 * cache, initialises the blackhole dst_ops, registers the pernet
 * subsystems, wires the init_net special route entries to the loopback
 * device, initialises fib6 rules, registers the rtnetlink route handlers
 * and finally the netdevice notifier. Failures unwind through the label
 * chain at the end.
 *
 * NOTE(review): this listing omits some short lines (several calls,
 * "if (ret)" checks, labels and returns); the comments below describe
 * only what is visible.
 */
3048 int __init ip6_route_init(void)
/* Slab cache for struct rt6_info, owned by the dst_ops template. */
3053 ip6_dst_ops_template.kmem_cachep =
3054 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3055 SLAB_HWCACHE_ALIGN, NULL);
3056 if (!ip6_dst_ops_template.kmem_cachep)
3059 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3061 goto out_kmem_cache;
/* The inetpeer pernet ops are registered before the route pernet ops. */
3063 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3065 goto out_dst_entries;
3067 ret = register_pernet_subsys(&ip6_route_net_ops);
3069 goto out_register_inetpeer;
/* The blackhole ops share the slab cache created above. */
3071 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3073 /* Registering of the loopback is done before this portion of code,
3074 * the loopback reference in rt6_info will not be taken, do it
3075 * manually for init_net */
3076 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3077 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3078 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3079 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3080 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3081 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3082 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/* NOTE(review): the call whose failure leads to this goto is not visible here. */
3086 goto out_register_subsys;
3092 ret = fib6_rules_init();
3096 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3098 goto fib6_rules_init;
/* rtnetlink handlers for route add, delete and get on PF_INET6. */
3101 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3102 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3103 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3104 goto out_register_late_subsys;
3106 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3108 goto out_register_late_subsys;
/*
 * Error unwinding: the visible cleanup calls undo the setup steps above
 * in reverse order.
 */
3113 out_register_late_subsys:
3114 unregister_pernet_subsys(&ip6_route_net_late_ops);
3116 fib6_rules_cleanup();
3121 out_register_subsys:
3122 unregister_pernet_subsys(&ip6_route_net_ops);
3123 out_register_inetpeer:
3124 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3126 dst_entries_destroy(&ip6_dst_blackhole_ops);
3128 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/*
 * Tears down the IPv6 routing layer: unregisters the netdevice notifier
 * and the late pernet ops, cleans up fib6 rules, unregisters the inetpeer
 * and route pernet ops, then destroys the blackhole dst entries state and
 * the dst slab cache.
 *
 * NOTE(review): ipv6_inetpeer_ops is unregistered before
 * ip6_route_net_ops here, whereas the error path of ip6_route_init()
 * unregisters ip6_route_net_ops first - confirm the ordering is
 * intentional.
 */
3132 void ip6_route_cleanup(void)
3134 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3135 unregister_pernet_subsys(&ip6_route_net_late_ops);
3136 fib6_rules_cleanup();
3139 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3140 unregister_pernet_subsys(&ip6_route_net_ops);
3141 dst_entries_destroy(&ip6_dst_blackhole_ops);
3142 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);