2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
52 #include <linux/rtnetlink.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
/*
 * Debug helpers.  Two definitions of RT6_TRACE are visible: one forwarding
 * to printk(KERN_DEBUG ...) and one expanding to an empty statement.  The
 * preprocessor conditionals that choose between them are not visible in
 * this extract (presumably keyed on the debug level mentioned below).
 */
64 /* Set to 3 to get tracing. */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
72 #define RT6_TRACE(x...) do { ; } while (0)
/*
 * Forward declarations of file-local helpers.  Most of them are wired into
 * the dst_ops tables and the route entry templates defined further down.
 */
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void ip6_dst_destroy(struct dst_entry *);
81 static void ip6_dst_ifdown(struct dst_entry *,
82 struct net_device *dev, int how);
83 static int ip6_dst_gc(struct dst_ops *ops);
85 static int ip6_pkt_discard(struct sk_buff *skb);
86 static int ip6_pkt_discard_out(struct sk_buff *skb);
87 static void ip6_link_failure(struct sk_buff *skb);
88 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
/*
 * Route Information helpers, only declared when CONFIG_IPV6_ROUTE_INFO is
 * set.  Both are called from rt6_route_rcv().  The final parameter line of
 * rt6_add_route_info is not visible in this extract.
 */
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 const struct in6_addr *prefix, int prefixlen,
93 const struct in6_addr *gwaddr, int ifindex,
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 const struct in6_addr *prefix, int prefixlen,
97 const struct in6_addr *gwaddr, int ifindex);
/*
 * ipv6_cow_metrics - .cow_metrics hook of ip6_dst_ops_template.
 * @dst: route whose metrics pointer is being replaced
 * @old: value of dst->_metrics the caller observed
 *
 * Attaches an inet_peer to the route, copies the old metrics into the
 * array p when inet_metrics_new() is true for the peer, and then tries to
 * publish the new pointer in dst->_metrics with cmpxchg().  The first
 * assignment of p and the return statement are not visible in this extract.
 */
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
102 struct rt6_info *rt = (struct rt6_info *) dst;
103 struct inet_peer *peer;
/* create flag set: ask rt6_bind_peer() to allocate a peer if needed */
107 rt6_bind_peer(rt, 1);
109 peer = rt->rt6i_peer;
111 u32 *old_p = __DST_METRICS_PTR(old);
112 unsigned long prev, new;
115 if (inet_metrics_new(peer))
116 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
118 new = (unsigned long) p;
/* Swap in the new pointer only if _metrics still holds the old value. */
119 prev = cmpxchg(&dst->_metrics, old, new);
/* NOTE(review): presumably the lost-race path, which falls back to prev */
122 p = __DST_METRICS_PTR(prev);
123 if (prev & DST_METRICS_READ_ONLY)
/*
 * dst_ops callbacks for ordinary IPv6 routes.  ip6_dst_gc() recovers the
 * owning struct net from a dst_ops pointer with container_of(), so the
 * instance actually used lives inside struct net; presumably it is copied
 * from this template, but that code is not visible in this extract.
 */
130 static struct dst_ops ip6_dst_ops_template = {
132 .protocol = cpu_to_be16(ETH_P_IPV6),
135 .check = ip6_dst_check,
136 .default_advmss = ip6_default_advmss,
137 .default_mtu = ip6_default_mtu,
138 .cow_metrics = ipv6_cow_metrics,
139 .destroy = ip6_dst_destroy,
140 .ifdown = ip6_dst_ifdown,
141 .negative_advice = ip6_negative_advice,
142 .link_failure = ip6_link_failure,
143 .update_pmtu = ip6_rt_update_pmtu,
144 .local_out = __ip6_local_out,
/*
 * Callbacks used only by ip6_dst_blackhole_ops below.  Only their
 * signatures survive in this extract; the bodies are not visible.
 */
147 static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
156 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/*
 * dst_ops for entries created by ip6_blackhole_route().  It shares
 * ip6_dst_destroy, ip6_dst_check and ip6_default_advmss with the regular
 * ops and substitutes the blackhole variants for mtu, pmtu and cow_metrics.
 */
162 static struct dst_ops ip6_dst_blackhole_ops = {
164 .protocol = cpu_to_be16(ETH_P_IPV6),
165 .destroy = ip6_dst_destroy,
166 .check = ip6_dst_check,
167 .default_mtu = ip6_blackhole_default_mtu,
168 .default_advmss = ip6_default_advmss,
169 .update_pmtu = ip6_rt_blackhole_update_pmtu,
170 .cow_metrics = ip6_rt_blackhole_cow_metrics,
/*
 * Constant metrics array with the hop limit slot explicitly zero.
 * ip6_dst_hoplimit() treats a zero RTAX_HOPLIMIT metric as a request to
 * fall back to the per-device or per-namespace hop_limit setting.
 */
173 static const u32 ip6_template_metrics[RTAX_MAX] = {
174 [RTAX_HOPLIMIT - 1] = 0,
/*
 * Template for the reject entry reported as -ENETUNREACH: packets are
 * handed to ip6_pkt_discard / ip6_pkt_discard_out.  Lookup code in this
 * file returns net->ipv6.ip6_null_entry when nothing matches; that per-net
 * entry is presumably built from this template (not visible here).
 */
177 static struct rt6_info ip6_null_entry_template = {
179 .__refcnt = ATOMIC_INIT(1),
182 .error = -ENETUNREACH,
183 .input = ip6_pkt_discard,
184 .output = ip6_pkt_discard_out,
186 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
187 .rt6i_protocol = RTPROT_KERNEL,
188 .rt6i_metric = ~(u32) 0,
189 .rt6i_ref = ATOMIC_INIT(1),
192 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/*
 * Two further reject templates, compiled only with multiple routing tables.
 * Both carry the same flags, protocol and metric as the null entry.
 * The prohibit entry uses the ip6_pkt_prohibit handlers and the blackhole
 * entry uses dst_discard.  Their .error values are not visible in this
 * extract.
 */
194 static int ip6_pkt_prohibit(struct sk_buff *skb);
195 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
197 static struct rt6_info ip6_prohibit_entry_template = {
199 .__refcnt = ATOMIC_INIT(1),
203 .input = ip6_pkt_prohibit,
204 .output = ip6_pkt_prohibit_out,
206 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
207 .rt6i_protocol = RTPROT_KERNEL,
208 .rt6i_metric = ~(u32) 0,
209 .rt6i_ref = ATOMIC_INIT(1),
212 static struct rt6_info ip6_blk_hole_entry_template = {
214 .__refcnt = ATOMIC_INIT(1),
218 .input = dst_discard,
219 .output = dst_discard,
221 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
222 .rt6i_protocol = RTPROT_KERNEL,
223 .rt6i_metric = ~(u32) 0,
224 .rt6i_ref = ATOMIC_INIT(1),
229 /* allocate dst with ip6_dst_ops */
/*
 * Thin wrapper around dst_alloc() that also clears the rt6_info specific
 * part of the new entry.  The NULL check around the memset and the return
 * statement are not visible in this extract.
 */
230 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
231 struct net_device *dev,
234 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
/*
 * Zero everything from rt6i_table onward.  The length arithmetic relies on
 * rt6i_table being the first member after the embedded struct dst_entry.
 */
237 memset(&rt->rt6i_table, 0,
238 sizeof(*rt) - sizeof(struct dst_entry));
/*
 * ip6_dst_destroy - .destroy hook shared by both dst_ops tables.
 *
 * Detaches the inet6_dev and inet_peer pointers from the route.  The calls
 * that drop the corresponding references are not visible in this extract
 * (presumably in6_dev_put and inet_putpeer; confirm against the full file).
 */
243 static void ip6_dst_destroy(struct dst_entry *dst)
245 struct rt6_info *rt = (struct rt6_info *)dst;
246 struct inet6_dev *idev = rt->rt6i_idev;
247 struct inet_peer *peer = rt->rt6i_peer;
250 rt->rt6i_idev = NULL;
254 rt->rt6i_peer = NULL;
/*
 * Generation counter for peer bindings.  ip6_dst_check() compares the
 * value cached in a route (rt6i_peer_genid) with the current one and
 * re-binds the peer on mismatch.  Nothing in this extract increments it.
 */
259 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
/* Return the current peer generation number. */
261 static u32 rt6_peer_genid(void)
263 return atomic_read(&__rt6_peer_genid);
/*
 * rt6_bind_peer - attach an inet_peer for the route destination.
 * @rt:     route to bind
 * @create: passed through to inet_getpeer_v6()
 *
 * The peer is looked up by rt->rt6i_dst.addr and installed only if the
 * route has none yet.  The statement executed when the cmpxchg loses is
 * not visible here (presumably it drops the extra peer reference).
 */
266 void rt6_bind_peer(struct rt6_info *rt, int create)
268 struct inet_peer *peer;
270 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
/* A non-NULL cmpxchg result means some other peer was already attached. */
271 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
274 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * ip6_dst_ifdown - .ifdown hook.
 *
 * When the inet6_dev referenced by the route belongs to the device going
 * down, and that device is not the namespace loopback, the route is
 * re-pointed at the loopback inet6_dev.  Release of the old idev reference
 * is not visible in this extract.
 */
277 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
280 struct rt6_info *rt = (struct rt6_info *)dst;
281 struct inet6_dev *idev = rt->rt6i_idev;
282 struct net_device *loopback_dev =
283 dev_net(dev)->loopback_dev;
285 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
286 struct inet6_dev *loopback_idev =
287 in6_dev_get(loopback_dev);
288 if (loopback_idev != NULL) {
289 rt->rt6i_idev = loopback_idev;
/*
 * Return non-zero when the route carries RTF_EXPIRES and jiffies is
 * already past rt6i_expires.  Routes without RTF_EXPIRES never expire here.
 */
295 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
297 return (rt->rt6i_flags & RTF_EXPIRES) &&
298 time_after(jiffies, rt->rt6i_expires);
/*
 * Return non-zero when daddr is multicast, link-local or loopback.
 * Callers in this file use the result to add RT6_LOOKUP_F_IFACE to their
 * lookup flags.
 */
301 static inline int rt6_need_strict(const struct in6_addr *daddr)
303 return ipv6_addr_type(daddr) &
304 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
308 * Route lookup. Any table->tb6_lock is implied.
/*
 * rt6_device_match - pick, among sibling routes, one that fits the
 * requested output interface or source address.
 *
 * Walks the list starting at rt (linked through dst.rt6_next).  Entries
 * are compared by device ifindex against oif; loopback devices are also
 * checked through the ifindex of their rt6i_idev, and ipv6_chk_addr() is
 * consulted for saddr.  If nothing fits and RT6_LOOKUP_F_IFACE is set, the
 * per-net null entry is returned.  The remaining return statements are
 * not visible in this extract.
 */
311 static inline struct rt6_info *rt6_device_match(struct net *net,
313 const struct in6_addr *saddr,
317 struct rt6_info *local = NULL;
318 struct rt6_info *sprt;
/* Nothing to match against: no interface and an unspecified source. */
320 if (!oif && ipv6_addr_any(saddr))
323 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
324 struct net_device *dev = sprt->rt6i_dev;
327 if (dev->ifindex == oif)
329 if (dev->flags & IFF_LOOPBACK) {
330 if (sprt->rt6i_idev == NULL ||
331 sprt->rt6i_idev->dev->ifindex != oif) {
332 if (flags & RT6_LOOKUP_F_IFACE && oif)
334 if (local && (!oif ||
335 local->rt6i_idev->dev->ifindex == oif))
341 if (ipv6_chk_addr(net, saddr, dev,
342 flags & RT6_LOOKUP_F_IFACE))
351 if (flags & RT6_LOOKUP_F_IFACE)
352 return net->ipv6.ip6_null_entry;
358 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - send a Neighbour Solicitation to the neighbour of a route
 * whose state is not NUD_VALID.
 *
 * The probe is sent only when more than cnf.rtr_probe_interval jiffies
 * have passed since neigh->updated.  A second rt6_probe signature further
 * down is the variant used without CONFIG_IPV6_ROUTER_PREF; its body is
 * not visible in this extract.
 *
 * NOTE(review): neigh->updated is written below while only the read side
 * of neigh->lock is held.
 */
359 static void rt6_probe(struct rt6_info *rt)
361 struct neighbour *neigh;
363 * Okay, this does not seem to be appropriate
364 * for now, however, we need to check if it
365 * is really so; aka Router Reachability Probing.
367 * Router Reachability Probe MUST be rate-limited
368 * to no more than one per minute.
371 neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
/* Unlocked early exit: no neighbour, or it is already valid. */
372 if (!neigh || (neigh->nud_state & NUD_VALID))
374 read_lock_bh(&neigh->lock);
/* Re-check the state under the lock and apply the rate limit. */
375 if (!(neigh->nud_state & NUD_VALID) &&
376 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
377 struct in6_addr mcaddr;
378 struct in6_addr *target;
380 neigh->updated = jiffies;
/* The lock is released before the solicitation is sent. */
381 read_unlock_bh(&neigh->lock);
383 target = (struct in6_addr *)&neigh->primary_key;
384 addrconf_addr_solict_mult(target, &mcaddr);
385 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
387 read_unlock_bh(&neigh->lock);
393 static inline void rt6_probe(struct rt6_info *rt)
399 * Default Router Selection (RFC 2461 6.3.6)
/*
 * rt6_check_dev - device part of the route score.
 *
 * Two cases are tested: no oif requested or the route device matches it,
 * and a loopback device whose rt6i_idev belongs to oif.  The values
 * returned for each case are not visible in this extract; rt6_score_route()
 * treats a zero result as no device match.
 */
401 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
403 struct net_device *dev = rt->rt6i_dev;
404 if (!oif || dev->ifindex == oif)
406 if ((dev->flags & IFF_LOOPBACK) &&
407 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/*
 * rt6_check_neigh - neighbour reachability part of the route score.
 *
 * Routes flagged RTF_NONEXTHOP or lacking RTF_GATEWAY are handled without
 * consulting the neighbour.  Otherwise nud_state is inspected under
 * neigh->lock: NUD_VALID, and with CONFIG_IPV6_ROUTER_PREF also NUD_FAILED,
 * are distinguished.  The values assigned in each branch are not visible
 * in this extract; rt6_score_route() treats zero as not reachable.
 */
412 static inline int rt6_check_neigh(struct rt6_info *rt)
414 struct neighbour *neigh;
418 neigh = dst_get_neighbour(&rt->dst);
419 if (rt->rt6i_flags & RTF_NONEXTHOP ||
420 !(rt->rt6i_flags & RTF_GATEWAY))
423 read_lock_bh(&neigh->lock);
424 if (neigh->nud_state & NUD_VALID)
426 #ifdef CONFIG_IPV6_ROUTER_PREF
427 else if (neigh->nud_state & NUD_FAILED)
432 read_unlock_bh(&neigh->lock);
/*
 * rt6_score_route - combine device match, router preference and neighbour
 * state into one score for a candidate route.
 *
 * The device score comes from rt6_check_dev().  With
 * CONFIG_IPV6_ROUTER_PREF the decoded router preference is OR-ed in,
 * shifted left by two bits.  A zero device score under RT6_LOOKUP_F_IFACE,
 * or a zero neighbour score under RT6_LOOKUP_F_REACHABLE, is special-cased;
 * the values returned there are not visible in this extract.
 */
439 static int rt6_score_route(struct rt6_info *rt, int oif,
444 m = rt6_check_dev(rt, oif);
445 if (!m && (strict & RT6_LOOKUP_F_IFACE))
447 #ifdef CONFIG_IPV6_ROUTER_PREF
448 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
450 n = rt6_check_neigh(rt);
451 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/*
 * find_match - consider one route as a candidate best match.
 * @mpri:  in/out pointer to the best score seen so far
 * @match: best route seen so far
 *
 * The route is first tested with rt6_check_expired() and then scored with
 * rt6_score_route().  RT6_LOOKUP_F_REACHABLE is tested on two branches.
 * The comparison against *mpri, the actions taken on those branches and
 * the return statement are not visible in this extract.
 */
456 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
457 int *mpri, struct rt6_info *match)
461 if (rt6_check_expired(rt))
464 m = rt6_score_route(rt, oif, strict);
469 if (strict & RT6_LOOKUP_F_REACHABLE)
473 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/*
 * find_rr_leaf - scan every route of the given metric in a fib6 node,
 * starting at the round-robin head, and keep the best find_match() result.
 * Initialisation of match and mpri and the return statement are not
 * visible in this extract.
 */
481 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
482 struct rt6_info *rr_head,
483 u32 metric, int oif, int strict)
485 struct rt6_info *rt, *match;
/* First pass: from rr_head to the end of the run with this metric. */
489 for (rt = rr_head; rt && rt->rt6i_metric == metric;
490 rt = rt->dst.rt6_next)
491 match = find_match(rt, oif, strict, &mpri, match);
/* Second pass: wrap around from fn->leaf up to, not including, rr_head. */
492 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
493 rt = rt->dst.rt6_next)
494 match = find_match(rt, oif, strict, &mpri, match);
/*
 * rt6_select - choose the route to use from a fib6 node.
 *
 * Uses the round-robin pointer of the node, which is set to fn->leaf on
 * the path visible here, and asks find_rr_leaf() for the best route of
 * that metric.  Under RT6_LOOKUP_F_REACHABLE, when the (partly invisible)
 * condition below holds, the next sibling with the same metric is
 * inspected for round-robin.  Returns the match, or the per-net null entry
 * when there is none.
 */
499 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
501 struct rt6_info *match, *rt0;
504 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
505 __func__, fn->leaf, oif);
509 fn->rr_ptr = rt0 = fn->leaf;
511 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
514 (strict & RT6_LOOKUP_F_REACHABLE)) {
515 struct rt6_info *next = rt0->dst.rt6_next;
517 /* no entries matched; do round-robin */
518 if (!next || next->rt6i_metric != rt0->rt6i_metric)
525 RT6_TRACE("%s() => %p\n",
528 net = dev_net(rt0->rt6i_dev);
529 return match ? match : net->ipv6.ip6_null_entry;
532 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a Route Information option.
 * @dev:    receiving device
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Validates the length and prefix_len fields, derives preference and
 * lifetime, looks up an existing route for the prefix via @gwaddr and adds
 * one when needed, then updates its preference flags and expiry.  The
 * error returns and the handling of a zero lifetime are not visible in
 * this extract.
 *
 * NOTE(review): len is a signed int compared against sizeof(), so a
 * negative len would be converted to a large unsigned value and pass the
 * first check; confirm that callers never pass a negative length.
 */
533 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
534 const struct in6_addr *gwaddr)
536 struct net *net = dev_net(dev);
537 struct route_info *rinfo = (struct route_info *) opt;
538 struct in6_addr prefix_buf, *prefix;
540 unsigned long lifetime;
543 if (len < sizeof(struct route_info)) {
547 /* Sanity check for prefix_len and length */
548 if (rinfo->length > 3) {
550 } else if (rinfo->prefix_len > 128) {
552 } else if (rinfo->prefix_len > 64) {
553 if (rinfo->length < 2) {
556 } else if (rinfo->prefix_len > 0) {
557 if (rinfo->length < 1) {
562 pref = rinfo->route_pref;
563 if (pref == ICMPV6_ROUTER_PREF_INVALID)
566 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* With length 3 the option prefix field is used in place. */
568 if (rinfo->length == 3)
569 prefix = (struct in6_addr *)rinfo->prefix;
571 /* this function is safe */
572 ipv6_addr_prefix(&prefix_buf,
573 (struct in6_addr *)rinfo->prefix,
575 prefix = &prefix_buf;
578 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
581 if (rt && !lifetime) {
587 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
590 rt->rt6i_flags = RTF_ROUTEINFO |
591 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
/* A non-finite lifetime clears RTF_EXPIRES; otherwise the expiry is set. */
594 if (!addrconf_finite_timeout(lifetime)) {
595 rt->rt6i_flags &= ~RTF_EXPIRES;
597 rt->rt6i_expires = jiffies + HZ * lifetime;
598 rt->rt6i_flags |= RTF_EXPIRES;
600 dst_release(&rt->dst);
/*
 * BACKTRACK - retry a lookup higher in the tree when the current result is
 * the null entry.
 *
 * Besides its two parameters the macro uses the local variables rt and fn
 * of the caller by name, so both must be in scope at the expansion site.
 * It stops at a node flagged RTN_TL_ROOT, descends into a source subtree
 * with fib6_lookup() when one exists, and tests RTN_RTINFO on the node it
 * reaches.  The jump targets it uses are not visible in this extract.
 */
606 #define BACKTRACK(__net, saddr) \
608 if (rt == __net->ipv6.ip6_null_entry) { \
609 struct fib6_node *pn; \
611 if (fn->fn_flags & RTN_TL_ROOT) \
614 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
615 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
618 if (fn->fn_flags & RTN_RTINFO) \
/*
 * ip6_pol_route_lookup - per-table lookup callback used by rt6_lookup().
 *
 * Under the table read lock: find the fib6 node for the flow, filter its
 * routes with rt6_device_match(), backtrack when that yields the null
 * entry, and mark the result used with dst_use() before unlocking.  The
 * return statement is not visible in this extract.
 */
624 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
625 struct fib6_table *table,
626 struct flowi6 *fl6, int flags)
628 struct fib6_node *fn;
631 read_lock_bh(&table->tb6_lock);
632 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
635 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
636 BACKTRACK(net, &fl6->saddr);
638 dst_use(&rt->dst, jiffies);
639 read_unlock_bh(&table->tb6_lock);
/*
 * rt6_lookup - exported route lookup by destination.
 * @saddr:  optional source address; ip6_route_add() passes NULL here
 * @oif:    output interface index
 * @strict: non-zero requests RT6_LOOKUP_F_IFACE
 *
 * Builds a flowi6 and runs fib6_rule_lookup() with ip6_pol_route_lookup.
 * The guard around the saddr copy and the handling of a failed lookup are
 * not visible in this extract.
 */
644 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
645 const struct in6_addr *saddr, int oif, int strict)
647 struct flowi6 fl6 = {
651 struct dst_entry *dst;
652 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
655 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
656 flags |= RT6_LOOKUP_F_HAS_SADDR;
659 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
661 return (struct rt6_info *) dst;
668 EXPORT_SYMBOL(rt6_lookup);
670 /* ip6_ins_rt is called with FREE table->tb6_lock.
671 It takes a new route entry; if the addition fails for any reason the
672 route is freed. In any case, if the caller does not hold it, it may
/*
 * __ip6_ins_rt - insert a route into the table it is already assigned to.
 *
 * Takes the write side of the tb6_lock of rt->rt6i_table around fib6_add().
 * The declaration of err and the return statement are not visible in this
 * extract.
 */
676 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
679 struct fib6_table *table;
681 table = rt->rt6i_table;
682 write_lock_bh(&table->tb6_lock);
683 err = fib6_add(&table->tb6_root, rt, info);
684 write_unlock_bh(&table->tb6_lock);
/*
 * ip6_ins_rt - insert a route using a default nl_info whose namespace is
 * taken from the route device.  Returns the result of __ip6_ins_rt().
 */
689 int ip6_ins_rt(struct rt6_info *rt)
691 struct nl_info info = {
692 .nl_net = dev_net(rt->rt6i_dev),
694 return __ip6_ins_rt(rt, &info);
/*
 * rt6_alloc_cow - build a host (/128) RTF_CACHE copy of a route for one
 * destination and attach a neighbour entry to it.
 * @ort:   route to copy
 * @daddr: destination the copy is for
 * @saddr: source address, used only with CONFIG_IPV6_SUBTREES
 *
 * For non-gateway routes the destination itself becomes the gateway, and
 * RTF_ANYCAST is set when daddr equals the non-host prefix address.
 * ndisc_get_neigh() is then called.  On the path that follows, which in
 * this extract appears to be the failure path, one forced garbage
 * collection pass may be run before the overflow message is printed.
 * The retry jump, failure handling and return are not visible here.
 */
697 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr,
698 const struct in6_addr *saddr)
706 rt = ip6_rt_copy(ort);
709 struct neighbour *neigh;
/* The forced GC below is attempted once, and only outside softirq. */
710 int attempts = !in_softirq();
712 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
713 if (rt->rt6i_dst.plen != 128 &&
714 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
715 rt->rt6i_flags |= RTF_ANYCAST;
716 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
719 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
720 rt->rt6i_dst.plen = 128;
721 rt->rt6i_flags |= RTF_CACHE;
722 rt->dst.flags |= DST_HOST;
724 #ifdef CONFIG_IPV6_SUBTREES
725 if (rt->rt6i_src.plen && saddr) {
726 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
727 rt->rt6i_src.plen = 128;
732 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
734 struct net *net = dev_net(rt->rt6i_dev);
735 int saved_rt_min_interval =
736 net->ipv6.sysctl.ip6_rt_gc_min_interval;
737 int saved_rt_elasticity =
738 net->ipv6.sysctl.ip6_rt_gc_elasticity;
/*
 * Temporarily override the GC sysctls, run one GC pass, then restore the
 * saved values.  NOTE(review): the per-net sysctl values are modified in
 * place and no locking is visible around this sequence.
 */
740 if (attempts-- > 0) {
741 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
742 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
744 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
746 net->ipv6.sysctl.ip6_rt_gc_elasticity =
748 net->ipv6.sysctl.ip6_rt_gc_min_interval =
749 saved_rt_min_interval;
755 "ipv6: Neighbour table overflow.\n");
759 dst_set_neighbour(&rt->dst, neigh);
/*
 * rt6_alloc_clone - build a host (/128) RTF_CACHE copy of a route that
 * reuses the neighbour of the original through neigh_clone().  The NULL
 * check on the copy and the return statement are not visible in this
 * extract.
 */
765 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr)
767 struct rt6_info *rt = ip6_rt_copy(ort);
769 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
770 rt->rt6i_dst.plen = 128;
771 rt->rt6i_flags |= RTF_CACHE;
772 rt->dst.flags |= DST_HOST;
773 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
/*
 * ip6_pol_route - common per-table lookup behind the input and output
 * route callbacks.
 * @oif: interface index used for route selection; the input wrapper passes
 *       flowi6_iif and the output wrapper passes flowi6_oif
 *
 * Selects a route with rt6_select() under the table read lock.  A null
 * entry or an RTF_CACHE route is handled without cloning.  Otherwise the
 * lock is dropped, a host cache entry is created with rt6_alloc_cow() or
 * rt6_alloc_clone() and inserted with ip6_ins_rt().  The existing comment
 * below notes that the lookup is redone when another context inserted the
 * same route in the unlocked gap.  Labels, retry jumps and the return
 * statement are not visible in this extract.
 */
778 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
779 struct flowi6 *fl6, int flags)
781 struct fib6_node *fn;
782 struct rt6_info *rt, *nrt;
/* RT6_LOOKUP_F_REACHABLE is requested only when forwarding is disabled. */
786 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
788 strict |= flags & RT6_LOOKUP_F_IFACE;
791 read_lock_bh(&table->tb6_lock);
794 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
797 rt = rt6_select(fn, oif, strict | reachable);
799 BACKTRACK(net, &fl6->saddr);
800 if (rt == net->ipv6.ip6_null_entry ||
801 rt->rt6i_flags & RTF_CACHE)
805 read_unlock_bh(&table->tb6_lock);
/* Routes with no neighbour attached that need a nexthop get a COW copy. */
807 if (!dst_get_neighbour_raw(&rt->dst) &&
808 !(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_LOCAL)))
809 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
810 else if (!(rt->dst.flags & DST_HOST))
811 nrt = rt6_alloc_clone(rt, &fl6->daddr);
815 dst_release(&rt->dst);
816 rt = nrt ? : net->ipv6.ip6_null_entry;
820 err = ip6_ins_rt(nrt);
829 * Race condition! In the gap, when table->tb6_lock was
830 * released someone could insert this route. Relookup.
832 dst_release(&rt->dst);
841 read_unlock_bh(&table->tb6_lock);
843 rt->dst.lastuse = jiffies;
/*
 * Input-path lookup callback: runs ip6_pol_route() with the incoming
 * interface index (flowi6_iif) of the flow as the interface argument.
 */
849 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
850 struct flowi6 *fl6, int flags)
852 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/*
 * ip6_route_input - route a received packet and attach the result to it.
 *
 * Builds a flowi6 from the IPv6 header, the skb mark and the receiving
 * device.  RT6_LOOKUP_F_IFACE is added when rt6_need_strict() is true for
 * the destination, except on ARPHRD_PIMREG devices.  The looked-up dst is
 * stored with skb_dst_set().  The address fields of the flowi6 initialiser
 * are not visible in this extract.
 */
855 void ip6_route_input(struct sk_buff *skb)
857 const struct ipv6hdr *iph = ipv6_hdr(skb);
858 struct net *net = dev_net(skb->dev);
859 int flags = RT6_LOOKUP_F_HAS_SADDR;
860 struct flowi6 fl6 = {
861 .flowi6_iif = skb->dev->ifindex,
864 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
865 .flowi6_mark = skb->mark,
866 .flowi6_proto = iph->nexthdr,
869 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
870 flags |= RT6_LOOKUP_F_IFACE;
872 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
/*
 * Output-path lookup callback: runs ip6_pol_route() with the outgoing
 * interface index (flowi6_oif) of the flow as the interface argument.
 */
875 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
876 struct flowi6 *fl6, int flags)
878 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/*
 * ip6_route_output - exported output route lookup for a flow.
 * @sk: originating socket; the first test below allows it to be NULL
 *
 * Adds RT6_LOOKUP_F_IFACE when the socket is bound to a device or the
 * destination needs strict matching, and RT6_LOOKUP_F_HAS_SADDR when a
 * source address is set.  The statement using inet6_sk(sk)->srcprefs
 * dereferences sk; its guard is not visible in this extract, so confirm
 * that it cannot run with a NULL sk.  The rest of the signature and the
 * declaration of flags are not visible either.
 */
881 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
886 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
887 flags |= RT6_LOOKUP_F_IFACE;
889 if (!ipv6_addr_any(&fl6->saddr))
890 flags |= RT6_LOOKUP_F_HAS_SADDR;
892 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
894 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
897 EXPORT_SYMBOL(ip6_route_output);
/*
 * ip6_blackhole_route - create a discard-only copy of an existing route.
 * @dst_orig: route to copy; dst_release() is called on it before returning
 *
 * The new entry is allocated with ip6_dst_blackhole_ops, uses dst_discard
 * for input and output, and copies metrics, idev, gateway, flags (without
 * RTF_EXPIRES) and the destination key from the original.  The source key
 * is copied too with CONFIG_IPV6_SUBTREES.
 *
 * Returns the new dst, or ERR_PTR(-ENOMEM) when new is still NULL.
 */
899 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
901 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
902 struct dst_entry *new = NULL;
904 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
906 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
911 new->input = dst_discard;
912 new->output = dst_discard;
914 dst_copy_metrics(new, &ort->dst);
915 rt->rt6i_idev = ort->rt6i_idev;
917 in6_dev_hold(rt->rt6i_idev);
918 rt->rt6i_expires = 0;
920 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
921 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
924 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
925 #ifdef CONFIG_IPV6_SUBTREES
926 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
932 dst_release(dst_orig);
933 return new ? new : ERR_PTR(-ENOMEM);
937 * Destination cache support functions
/*
 * ip6_dst_check - .check hook: validate a cached route against a cookie.
 *
 * The route is considered current when it is still linked to a fib6 node
 * whose fn_sernum equals the cookie.  In that case a stale peer generation
 * triggers a re-bind with create == 0.  Clearing of the old peer and both
 * return statements are not visible in this extract.
 */
940 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
944 rt = (struct rt6_info *) dst;
946 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
947 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
949 rt6_bind_peer(rt, 0);
950 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * ip6_negative_advice - .negative_advice hook.
 *
 * Only RTF_CACHE routes are examined further, and among those the expired
 * ones are singled out.  What is done with them and what is returned is
 * not visible in this extract.
 */
957 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
959 struct rt6_info *rt = (struct rt6_info *) dst;
962 if (rt->rt6i_flags & RTF_CACHE) {
963 if (rt6_check_expired(rt)) {
/*
 * ip6_link_failure - .link_failure hook.
 *
 * Sends an ICMPv6 destination unreachable (address unreachable) error for
 * the packet and then invalidates the route attached to it.  A cache entry
 * is expired immediately.  For a default route still linked to a node, the
 * node serial number is set to -1, which changes what the cookie
 * comparison in ip6_dst_check() sees.  The NULL check on rt is not visible
 * in this extract.
 */
975 static void ip6_link_failure(struct sk_buff *skb)
979 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
981 rt = (struct rt6_info *) skb_dst(skb);
983 if (rt->rt6i_flags&RTF_CACHE) {
984 dst_set_expires(&rt->dst, 0);
985 rt->rt6i_flags |= RTF_EXPIRES;
986 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
987 rt->rt6i_node->fn_sernum = -1;
/*
 * ip6_rt_update_pmtu - .update_pmtu hook.
 *
 * Acts only on host routes (plen 128) and only when the new value is lower
 * than the current dst_mtu().  The route is marked RTF_MODIFIED and the
 * RTAX_MTU metric is updated.  For values below IPV6_MIN_MTU the
 * RTAX_FEATURE_ALLFRAG feature bit is set as well; any clamping of mtu in
 * that branch is not visible in this extract.
 */
991 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
993 struct rt6_info *rt6 = (struct rt6_info*)dst;
995 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
996 rt6->rt6i_flags |= RTF_MODIFIED;
997 if (mtu < IPV6_MIN_MTU) {
998 u32 features = dst_metric(dst, RTAX_FEATURES);
1000 features |= RTAX_FEATURE_ALLFRAG;
1001 dst_metric_set(dst, RTAX_FEATURES, features);
1003 dst_metric_set(dst, RTAX_MTU, mtu);
/*
 * ip6_default_advmss - .default_advmss hook.
 *
 * Starts from dst_mtu(), subtracts the IPv6 and TCP header sizes, raises
 * the result to the ip6_rt_min_advmss sysctl when it is lower, and tests
 * it against IPV6_MAXPLEN minus the TCP header size.  The assignment in
 * that last branch and the return statement are not visible in this
 * extract.
 *
 * NOTE(review): mtu is unsigned, so the subtraction would wrap if
 * dst_mtu() ever returned less than the combined header size.
 */
1007 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1009 struct net_device *dev = dst->dev;
1010 unsigned int mtu = dst_mtu(dst);
1011 struct net *net = dev_net(dev);
1013 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1015 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1016 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1019 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1020 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1021 * IPV6_MAXPLEN is also valid and means: "any MSS,
1022 * rely only on pmtu discovery"
1024 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/*
 * ip6_default_mtu - .default_mtu hook.
 *
 * Defaults to IPV6_MIN_MTU and is replaced by the cnf.mtu6 setting of the
 * inet6_dev of the route device.  The NULL check on idev, any locking and
 * the return statement are not visible in this extract.
 */
1029 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1031 unsigned int mtu = IPV6_MIN_MTU;
1032 struct inet6_dev *idev;
1035 idev = __in6_dev_get(dst->dev);
1037 mtu = idev->cnf.mtu6;
/*
 * Singly linked list (through dst.next) of entries created by
 * icmp6_dst_alloc(), protected by icmp6_dst_lock.
 */
1043 static struct dst_entry *icmp6_dst_gc_list;
1044 static DEFINE_SPINLOCK(icmp6_dst_lock);
/*
 * icmp6_dst_alloc - allocate a standalone dst for a device and neighbour.
 * @neigh: neighbour to attach; ndisc_get_neigh() is called for @addr on a
 *         path whose condition is not visible in this extract
 *
 * The entry starts with a reference count of one, a zero hop-limit metric
 * and ip6_output as its output handler.  It is pushed onto
 * icmp6_dst_gc_list and the fib6 garbage collector is started.  Error
 * paths and the return statement are not visible in this extract.
 */
1046 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1047 struct neighbour *neigh,
1048 const struct in6_addr *addr)
1050 struct rt6_info *rt;
1051 struct inet6_dev *idev = in6_dev_get(dev);
1052 struct net *net = dev_net(dev);
1054 if (unlikely(idev == NULL))
1057 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1058 if (unlikely(rt == NULL)) {
1066 neigh = ndisc_get_neigh(dev, addr);
1071 rt->rt6i_idev = idev;
1072 dst_set_neighbour(&rt->dst, neigh);
1073 atomic_set(&rt->dst.__refcnt, 1);
1074 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1075 rt->dst.output = ip6_output;
/* Push onto the head of the GC list. */
1077 spin_lock_bh(&icmp6_dst_lock);
1078 rt->dst.next = icmp6_dst_gc_list;
1079 icmp6_dst_gc_list = &rt->dst;
1080 spin_unlock_bh(&icmp6_dst_lock);
1082 fib6_force_start_gc(net);
/*
 * icmp6_dst_gc - walk icmp6_dst_gc_list under icmp6_dst_lock and single
 * out entries whose reference count has dropped to zero.  What is done
 * with them and the return value are not visible in this extract.
 */
1088 int icmp6_dst_gc(void)
1090 struct dst_entry *dst, **pprev;
1093 spin_lock_bh(&icmp6_dst_lock);
1094 pprev = &icmp6_dst_gc_list;
1096 while ((dst = *pprev) != NULL) {
1097 if (!atomic_read(&dst->__refcnt)) {
1106 spin_unlock_bh(&icmp6_dst_lock);
/*
 * icmp6_clean_all - call func(rt, arg) for every entry on
 * icmp6_dst_gc_list while holding icmp6_dst_lock.  Entries for which func
 * returns non-zero are singled out; the action taken on them is not
 * visible in this extract.
 */
1111 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1114 struct dst_entry *dst, **pprev;
1116 spin_lock_bh(&icmp6_dst_lock);
1117 pprev = &icmp6_dst_gc_list;
1118 while ((dst = *pprev) != NULL) {
1119 struct rt6_info *rt = (struct rt6_info *) dst;
1120 if (func(rt, arg)) {
1127 spin_unlock_bh(&icmp6_dst_lock);
/*
 * ip6_dst_gc - garbage collect IPv6 dst entries; also called directly from
 * rt6_alloc_cow().
 * @ops: dst_ops embedded in struct net; the namespace is recovered from it
 *
 * Returns non-zero when the entry count still exceeds ip6_rt_max_size.
 * The declaration of entries and the labels are not visible in this
 * extract.
 */
1130 static int ip6_dst_gc(struct dst_ops *ops)
1132 unsigned long now = jiffies;
1133 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1134 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1135 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1136 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1137 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1138 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1141 entries = dst_entries_get_fast(ops);
/*
 * Rate limit: the last run was under rt_min_interval ago and the table is
 * not over rt_max_size.  The statement taken here is not visible.
 */
1142 if (time_after(rt_last_gc + rt_min_interval, now) &&
1143 entries <= rt_max_size)
1146 net->ipv6.ip6_rt_gc_expire++;
1147 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1148 net->ipv6.ip6_rt_last_gc = now;
1149 entries = dst_entries_get_slow(ops);
/* Below the threshold: reset the expiry to half of the GC timeout. */
1150 if (entries < ops->gc_thresh)
1151 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
/* Decay the expiry by itself shifted right by rt_elasticity bits. */
1153 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1154 return entries > rt_max_size;
1157 /* Clean host part of a prefix. Not necessary in radix tree,
1158 but results in cleaner routing tables.
1160 Remove it only when all the things will work!
/*
 * ip6_dst_hoplimit - exported helper returning the hop limit for a dst.
 *
 * Uses the raw RTAX_HOPLIMIT metric.  When that is zero it falls back to
 * the cnf.hop_limit of the device inet6_dev, or to the devconf_all
 * hop_limit of the namespace.  The condition choosing between the two
 * fallbacks and the return statement are not visible in this extract.
 */
1163 int ip6_dst_hoplimit(struct dst_entry *dst)
1165 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1166 if (hoplimit == 0) {
1167 struct net_device *dev = dst->dev;
1168 struct inet6_dev *idev;
1171 idev = __in6_dev_get(dev);
1173 hoplimit = idev->cnf.hop_limit;
1175 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1180 EXPORT_SYMBOL(ip6_dst_hoplimit);
/*
 * ip6_route_add - create a route from a fib6_config and insert it.
 * @cfg: route description.  It is modified in place: fc_metric defaults to
 *       IP6_RT_PRIO_USER, fc_protocol defaults to RTPROT_BOOT, and
 *       fc_nlinfo.nl_net is overwritten with the namespace of the chosen
 *       device just before insertion.
 *
 * Visible steps: validate prefix lengths, resolve the device, get the
 * table, allocate the entry, choose input and output handlers from the
 * destination type and flags, turn loopback and RTF_REJECT requests into
 * reject routes, validate the gateway through rt6_lookup() when it is not
 * a link-local unicast address, check the preferred source address, attach
 * a neighbour for gateway and nonexthop routes, apply netlink metrics and
 * insert with __ip6_ins_rt().
 *
 * Returns the result of __ip6_ins_rt() on the success path.  The error
 * codes for most failure branches, the cleanup labels and several
 * conditions are not visible in this extract.
 */
1186 int ip6_route_add(struct fib6_config *cfg)
1189 struct net *net = cfg->fc_nlinfo.nl_net;
1190 struct rt6_info *rt = NULL;
1191 struct net_device *dev = NULL;
1192 struct inet6_dev *idev = NULL;
1193 struct fib6_table *table;
1196 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1198 #ifndef CONFIG_IPV6_SUBTREES
1199 if (cfg->fc_src_len)
1202 if (cfg->fc_ifindex) {
1204 dev = dev_get_by_index(net, cfg->fc_ifindex);
1207 idev = in6_dev_get(dev);
1212 if (cfg->fc_metric == 0)
1213 cfg->fc_metric = IP6_RT_PRIO_USER;
1215 table = fib6_new_table(net, cfg->fc_table);
1216 if (table == NULL) {
1221 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1228 rt->dst.obsolete = -1;
1229 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1230 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1233 if (cfg->fc_protocol == RTPROT_UNSPEC)
1234 cfg->fc_protocol = RTPROT_BOOT;
1235 rt->rt6i_protocol = cfg->fc_protocol;
1237 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Input handler: multicast, local delivery, or forwarding. */
1239 if (addr_type & IPV6_ADDR_MULTICAST)
1240 rt->dst.input = ip6_mc_input;
1241 else if (cfg->fc_flags & RTF_LOCAL)
1242 rt->dst.input = ip6_input;
1244 rt->dst.input = ip6_forward;
1246 rt->dst.output = ip6_output;
1248 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1249 rt->rt6i_dst.plen = cfg->fc_dst_len;
1250 if (rt->rt6i_dst.plen == 128)
1251 rt->dst.flags |= DST_HOST;
1253 #ifdef CONFIG_IPV6_SUBTREES
1254 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1255 rt->rt6i_src.plen = cfg->fc_src_len;
1258 rt->rt6i_metric = cfg->fc_metric;
1260 /* We cannot add true routes via loopback here,
1261 they would result in kernel looping; promote them to reject routes
1263 if ((cfg->fc_flags & RTF_REJECT) ||
1264 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1265 && !(cfg->fc_flags&RTF_LOCAL))) {
1266 /* hold loopback dev/idev if we haven't done so. */
1267 if (dev != net->loopback_dev) {
1272 dev = net->loopback_dev;
1274 idev = in6_dev_get(dev);
1280 rt->dst.output = ip6_pkt_discard_out;
1281 rt->dst.input = ip6_pkt_discard;
1282 rt->dst.error = -ENETUNREACH;
1283 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1287 if (cfg->fc_flags & RTF_GATEWAY) {
1288 const struct in6_addr *gw_addr;
1291 gw_addr = &cfg->fc_gateway;
1292 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1293 gwa_type = ipv6_addr_type(gw_addr);
1295 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1296 struct rt6_info *grt;
1298 /* IPv6 strictly inhibits using not link-local
1299 addresses as nexthop address.
1300 Otherwise, router will not able to send redirects.
1301 It is very good, but in some (rare!) circumstances
1302 (SIT, PtP, NBMA NOARP links) it is handy to allow
1303 some exceptions. --ANK
1306 if (!(gwa_type&IPV6_ADDR_UNICAST))
1309 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1311 err = -EHOSTUNREACH;
1315 if (dev != grt->rt6i_dev) {
1316 dst_release(&grt->dst);
1320 dev = grt->rt6i_dev;
1321 idev = grt->rt6i_idev;
1323 in6_dev_hold(grt->rt6i_idev);
1325 if (!(grt->rt6i_flags&RTF_GATEWAY))
1327 dst_release(&grt->dst);
1333 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1341 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1342 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1346 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1347 rt->rt6i_prefsrc.plen = 128;
1349 rt->rt6i_prefsrc.plen = 0;
1351 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1352 struct neighbour *neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1353 if (IS_ERR(neigh)) {
1354 err = PTR_ERR(neigh);
1357 dst_set_neighbour(&rt->dst, neigh);
1360 rt->rt6i_flags = cfg->fc_flags;
1367 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1368 int type = nla_type(nla);
1371 if (type > RTAX_MAX) {
1376 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1382 rt->rt6i_idev = idev;
1383 rt->rt6i_table = table;
1385 cfg->fc_nlinfo.nl_net = dev_net(dev);
1387 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * __ip6_del_rt - remove a route from its table.
 *
 * The per-net null entry is special-cased and is not passed to fib6_del().
 * For other routes fib6_del() runs under the table write lock.
 * dst_release() is called on the route before returning.  The error code
 * for the null entry and the return statement are not visible in this
 * extract.
 */
1399 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1402 struct fib6_table *table;
1403 struct net *net = dev_net(rt->rt6i_dev);
1405 if (rt == net->ipv6.ip6_null_entry) {
1410 table = rt->rt6i_table;
1411 write_lock_bh(&table->tb6_lock);
1412 err = fib6_del(rt, info);
1413 write_unlock_bh(&table->tb6_lock);
1416 dst_release(&rt->dst);
/*
 * ip6_del_rt - delete a route using a default nl_info whose namespace is
 * taken from the route device.  Returns the result of __ip6_del_rt().
 */
1420 int ip6_del_rt(struct rt6_info *rt)
1422 struct nl_info info = {
1423 .nl_net = dev_net(rt->rt6i_dev),
1425 return __ip6_del_rt(rt, &info);
/*
 * ip6_route_del - delete the first route matching a fib6_config.
 *
 * Locates the exact prefix node with fib6_locate() under the table read
 * lock and scans its routes.  A candidate is rejected when cfg names an
 * interface it does not use, when RTF_GATEWAY is set in cfg and the
 * gateways differ, or when cfg names a metric that differs.  On a match
 * the read lock is dropped and __ip6_del_rt() does the removal.  The
 * reference taken on the match and the error return are not visible in
 * this extract.
 */
1428 static int ip6_route_del(struct fib6_config *cfg)
1430 struct fib6_table *table;
1431 struct fib6_node *fn;
1432 struct rt6_info *rt;
1435 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1439 read_lock_bh(&table->tb6_lock);
1441 fn = fib6_locate(&table->tb6_root,
1442 &cfg->fc_dst, cfg->fc_dst_len,
1443 &cfg->fc_src, cfg->fc_src_len);
1446 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1447 if (cfg->fc_ifindex &&
1448 (rt->rt6i_dev == NULL ||
1449 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1451 if (cfg->fc_flags & RTF_GATEWAY &&
1452 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1454 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1457 read_unlock_bh(&table->tb6_lock);
1459 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1462 read_unlock_bh(&table->tb6_lock);
/*
 * Flow key used for redirect validation: a flowi6 (member fl6, whose
 * declaration line is not visible in this extract) followed by the address
 * of the router that sent the redirect.  __ip6_route_redirect() casts the
 * flowi6 pointer back to this struct.
 */
1470 struct ip6rd_flowi {
1472 struct in6_addr gateway;
/*
 * __ip6_route_redirect - per-table callback that finds the route a
 * redirect applies to.
 *
 * The flowi6 pointer is cast to struct ip6rd_flowi to reach the gateway
 * address.  This relies on fl6 being the first member of that struct; the
 * member order is not fully visible here, so confirm it.  Under the table
 * read lock the routes of the matching node are scanned for an unexpired
 * RTF_GATEWAY route on the same interface whose gateway equals the
 * redirect sender.  Without one, rt becomes the null entry and BACKTRACK
 * retries further up the tree.  Loop exits and the return statement are
 * not visible in this extract.
 */
1475 static struct rt6_info *__ip6_route_redirect(struct net *net,
1476 struct fib6_table *table,
1480 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1481 struct rt6_info *rt;
1482 struct fib6_node *fn;
1485 * Get the "current" route for this destination and
1486 * check if the redirect has come from approriate router.
1488 * RFC 2461 specifies that redirects should only be
1489 * accepted if they come from the nexthop to the target.
1490 * Due to the way the routes are chosen, this notion
1491 * is a bit fuzzy and one might need to check all possible
1495 read_lock_bh(&table->tb6_lock);
1496 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1498 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1500 * Current route is on-link; redirect is always invalid.
1502 * Seems, previous statement is not true. It could
1503 * be node, which looks for us as on-link (f.e. proxy ndisc)
1504 * But then router serving it might decide, that we should
1505 * know truth 8)8) --ANK (980726).
1507 if (rt6_check_expired(rt))
1509 if (!(rt->rt6i_flags & RTF_GATEWAY))
1511 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1513 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1519 rt = net->ipv6.ip6_null_entry;
1520 BACKTRACK(net, &fl6->saddr);
1524 read_unlock_bh(&table->tb6_lock);
/*
 * ip6_route_redirect - look up the route a redirect from @gateway, received
 * on @dev, would replace.
 *
 * Packs the flow and the gateway address into a struct ip6rd_flowi and
 * runs fib6_rule_lookup() with __ip6_route_redirect as the callback.
 * RT6_LOOKUP_F_IFACE is added when rt6_need_strict() is true for @dest.
 * The address fields of the initialiser are not visible in this extract.
 */
1529 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1530 const struct in6_addr *src,
1531 const struct in6_addr *gateway,
1532 struct net_device *dev)
1534 int flags = RT6_LOOKUP_F_HAS_SADDR;
1535 struct net *net = dev_net(dev);
1536 struct ip6rd_flowi rdfl = {
1538 .flowi6_oif = dev->ifindex,
1544 ipv6_addr_copy(&rdfl.gateway, gateway);
1546 if (rt6_need_strict(dest))
1547 flags |= RT6_LOOKUP_F_IFACE;
1549 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1550 flags, __ip6_route_redirect);
/*
 * rt6_redirect - apply a received redirect.
 * @dest:    destination being redirected
 * @src:     passed through as the src argument of ip6_route_redirect()
 * @saddr:   passed to ip6_route_redirect() as its gateway argument
 * @neigh:   neighbour entry for the new next hop
 * @lladdr:  link-layer address passed to neigh_update()
 * @on_link: non-zero clears RTF_GATEWAY on the new route and leaves the
 *           router flags out of the neighbour update
 *
 * When no valid current route is found, a rate-limited debug message is
 * printed.  Otherwise the neighbour is updated to NUD_STALE, the old route
 * is confirmed, and unless the redirect is a duplicate a host RTF_CACHE
 * copy through the new next hop is inserted and NETEVENT_REDIRECT is
 * raised.  An old route that is itself RTF_CACHE is special-cased; that
 * action and the exit labels are not visible in this extract.
 */
1553 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1554 const struct in6_addr *saddr,
1555 struct neighbour *neigh, u8 *lladdr, int on_link)
1557 struct rt6_info *rt, *nrt = NULL;
1558 struct netevent_redirect netevent;
1559 struct net *net = dev_net(neigh->dev);
1561 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1563 if (rt == net->ipv6.ip6_null_entry) {
1564 if (net_ratelimit())
1565 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1566 "for redirect target\n");
1571 * We have finally decided to accept it.
1574 neigh_update(neigh, lladdr, NUD_STALE,
1575 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1576 NEIGH_UPDATE_F_OVERRIDE|
1577 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1578 NEIGH_UPDATE_F_ISROUTER))
1582 * Redirect received -> path was valid.
1583 * Look, redirects are sent only in response to data packets,
1584 * so that this nexthop apparently is reachable. --ANK
1586 dst_confirm(&rt->dst);
1588 /* Duplicate redirect: silently ignore. */
1589 if (neigh == dst_get_neighbour_raw(&rt->dst))
1592 nrt = ip6_rt_copy(rt);
1596 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1598 nrt->rt6i_flags &= ~RTF_GATEWAY;
1600 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1601 nrt->rt6i_dst.plen = 128;
1602 nrt->dst.flags |= DST_HOST;
1604 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1605 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1607 if (ip6_ins_rt(nrt))
1610 netevent.old = &rt->dst;
1611 netevent.new = &nrt->dst;
1612 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1614 if (rt->rt6i_flags&RTF_CACHE) {
1620 dst_release(&rt->dst);
1624 * Handle ICMP "packet too big" messages
1625 * i.e. Path MTU discovery
/*
 * rt6_do_pmtu_disc - apply a reported path MTU to the route for @daddr.
 *
 * The route is looked up with rt6_lookup() using @ifindex.  The visible
 * checks are: route expired (rt6_check_expired), @pmtu not smaller than
 * the current dst_mtu(), and @pmtu below IPV6_MIN_MTU, in which case pmtu
 * is raised to IPV6_MIN_MTU.  The route is then confirmed with
 * dst_confirm().
 *
 * For an RTF_CACHE route the RTAX_MTU metric is updated in place, the
 * expiry is set from sysctl ip6_rt_mtu_expires and the flags gain
 * RTF_MODIFIED|RTF_EXPIRES.  Otherwise a new entry is created with
 * rt6_alloc_cow() when the route has no neighbour and is not
 * RTF_NONEXTHOP, or with rt6_alloc_clone() in the remaining case; the new
 * entry gets the MTU, the same expiry and RTF_DYNAMIC|RTF_EXPIRES.
 * The looked-up route is released with dst_release().
 *
 * NOTE(review): the outcome of the first two checks, the condition under
 * which RTAX_FEATURE_ALLFRAG is OR-ed into RTAX_FEATURES, and the
 * insertion of the new entry are on lines not visible in this listing;
 * presumably ALLFRAG is tied to the pmtu below IPV6_MIN_MTU case -
 * confirm.
 */
1628 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1629 struct net *net, u32 pmtu, int ifindex)
1631 struct rt6_info *rt, *nrt;
1634 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1638 if (rt6_check_expired(rt)) {
1643 if (pmtu >= dst_mtu(&rt->dst))
1646 if (pmtu < IPV6_MIN_MTU) {
1648 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1649 * MTU (1280) and a fragment header should always be included
1650 * after a node receiving Too Big message reporting PMTU is
1651 * less than the IPv6 Minimum Link MTU.
1653 pmtu = IPV6_MIN_MTU;
1657 /* New mtu received -> path was valid.
1658 They are sent only in response to data packets,
1659 so that this nexthop apparently is reachable. --ANK
1661 dst_confirm(&rt->dst);
1663 /* Host route. If it is static, it would be better
1664 not to override it, but add new one, so that
1665 when cache entry will expire old pmtu
1666 would return automatically.
1668 if (rt->rt6i_flags & RTF_CACHE) {
1669 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1671 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1672 features |= RTAX_FEATURE_ALLFRAG;
1673 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1675 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1676 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1681 Two cases are possible:
1682 1. It is connected route. Action: COW
1683 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1685 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1686 nrt = rt6_alloc_cow(rt, daddr, saddr);
1688 nrt = rt6_alloc_clone(rt, daddr);
1691 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1693 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1694 features |= RTAX_FEATURE_ALLFRAG;
1695 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1698 /* According to RFC 1981, detecting PMTU increase shouldn't be
1699 * happened within 5 mins, the recommended timer is 10 mins.
1700 * Here this route expiration time is set to ip6_rt_mtu_expires
1701 * which is 10 mins. After 10 mins the decreased pmtu is expired
1702 * and detecting PMTU increase will be automatically happened.
1704 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1705 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1710 dst_release(&rt->dst);
/*
 * rt6_pmtu_discovery - entry point for a Packet Too Big report.
 *
 * Runs rt6_do_pmtu_disc() twice in the namespace of @dev: first with
 * ifindex 0, then with dev->ifindex (the interface the report arrived
 * on).  The rationale is given in the comment inside the function.
 */
1713 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1714 struct net_device *dev, u32 pmtu)
1716 struct net *net = dev_net(dev);
1719 * RFC 1981 states that a node "MUST reduce the size of the packets it
1720 * is sending along the path" that caused the Packet Too Big message.
1721 * Since it's not possible in the general case to determine which
1722 * interface was used to send the original packet, we update the MTU
1723 * on the interface that will be used to send future packets. We also
1724 * update the MTU on the interface that received the Packet Too Big in
1725 * case the original packet was forced out that interface with
1726 * SO_BINDTODEVICE or similar. This is the next best thing to the
1727 * correct behaviour, which would be to update the MTU on all
1730 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1731 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1735 * Misc support functions
/*
 * ip6_rt_copy - allocate a new rt6_info and copy most fields from @ort.
 *
 * The new entry comes from ip6_dst_alloc() on the per-net ip6_dst_ops.
 * Copied from @ort: input/output handlers, metrics, dst.error, the
 * inet6_dev pointer (a reference is taken with in6_dev_hold), gateway,
 * destination key, source key (CONFIG_IPV6_SUBTREES only), preferred
 * source key and table.  Reset on the copy: lastuse is set to jiffies,
 * rt6i_expires to 0, RTF_EXPIRES is cleared from the flags and
 * rt6i_metric is set to 0.
 *
 * NOTE(review): the allocation-failure check, the guard around
 * in6_dev_hold() and the return statement are on lines not visible in
 * this listing.
 */
1738 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1740 struct net *net = dev_net(ort->rt6i_dev);
1741 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1745 rt->dst.input = ort->dst.input;
1746 rt->dst.output = ort->dst.output;
1748 dst_copy_metrics(&rt->dst, &ort->dst);
1749 rt->dst.error = ort->dst.error;
1750 rt->rt6i_idev = ort->rt6i_idev;
1752 in6_dev_hold(rt->rt6i_idev);
1753 rt->dst.lastuse = jiffies;
1754 rt->rt6i_expires = 0;
1756 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1757 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1758 rt->rt6i_metric = 0;
1760 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1761 #ifdef CONFIG_IPV6_SUBTREES
1762 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1764 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1765 rt->rt6i_table = ort->rt6i_table;
1770 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_get_route_info - search RT6_TABLE_INFO for a route-information route.
 *
 * Under write_lock_bh on the table lock, locates the node for
 * @prefix/@prefixlen with fib6_locate() and walks its leaf list.  Each
 * entry is tested for: device ifindex equal to @ifindex, both
 * RTF_ROUTEINFO and RTF_GATEWAY set, and gateway equal to @gwaddr.
 *
 * NOTE(review): the handling of a missing table or node, what happens to
 * a matching entry (presumably a reference is taken) and the return
 * statement are on lines not visible in this listing.
 */
1771 static struct rt6_info *rt6_get_route_info(struct net *net,
1772 const struct in6_addr *prefix, int prefixlen,
1773 const struct in6_addr *gwaddr, int ifindex)
1775 struct fib6_node *fn;
1776 struct rt6_info *rt = NULL;
1777 struct fib6_table *table;
1779 table = fib6_get_table(net, RT6_TABLE_INFO);
1783 write_lock_bh(&table->tb6_lock);
1784 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1788 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1789 if (rt->rt6i_dev->ifindex != ifindex)
1791 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1793 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1799 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_route_info - install a route-information route and return it.
 *
 * Fills a fib6_config for RT6_TABLE_INFO with metric IP6_RT_PRIO_USER,
 * the given ifindex and prefix length, and flags RTF_GATEWAY |
 * RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref); copies @prefix
 * and @gwaddr into it and calls ip6_route_add().  The return value of
 * ip6_route_add() is not used here; the result is whatever a subsequent
 * rt6_get_route_info() lookup with the same arguments returns.
 *
 * NOTE(review): the trailing parameter (used as pref) and the condition
 * that adds RTF_DEFAULT are on lines not visible in this listing; the
 * in-function comment says the latter applies when the prefix length
 * is 0.
 */
1803 static struct rt6_info *rt6_add_route_info(struct net *net,
1804 const struct in6_addr *prefix, int prefixlen,
1805 const struct in6_addr *gwaddr, int ifindex,
1808 struct fib6_config cfg = {
1809 .fc_table = RT6_TABLE_INFO,
1810 .fc_metric = IP6_RT_PRIO_USER,
1811 .fc_ifindex = ifindex,
1812 .fc_dst_len = prefixlen,
1813 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1814 RTF_UP | RTF_PREF(pref),
1816 .fc_nlinfo.nlh = NULL,
1817 .fc_nlinfo.nl_net = net,
1820 ipv6_addr_copy(&cfg.fc_dst, prefix);
1821 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1823 /* We should treat it as a default route if prefix length is 0. */
1825 cfg.fc_flags |= RTF_DEFAULT;
1827 ip6_route_add(&cfg);
1829 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * rt6_get_dflt_router - find the default-router entry for @addr on @dev.
 *
 * Walks the root leaf list of RT6_TABLE_DFLT under write_lock_bh and
 * matches an entry whose device is @dev, whose flags contain both
 * RTF_ADDRCONF and RTF_DEFAULT, and whose gateway equals @addr.
 *
 * NOTE(review): the missing-table check, the action on a match and the
 * return statement are on lines not visible in this listing.
 */
1833 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1835 struct rt6_info *rt;
1836 struct fib6_table *table;
1838 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1842 write_lock_bh(&table->tb6_lock);
1843 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1844 if (dev == rt->rt6i_dev &&
1845 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1846 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1851 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_dflt_router - install a default route via @gwaddr on @dev.
 *
 * Fills a fib6_config for RT6_TABLE_DFLT with metric IP6_RT_PRIO_USER,
 * dev->ifindex and flags RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
 * RTF_UP | RTF_EXPIRES | RTF_PREF(pref), then calls ip6_route_add().
 * The return value of ip6_route_add() is not used here; the result is
 * whatever rt6_get_dflt_router(gwaddr, dev) returns afterwards.
 *
 * NOTE(review): the trailing parameter (used as pref) is declared on a
 * line not visible in this listing.
 */
1855 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1856 struct net_device *dev,
1859 struct fib6_config cfg = {
1860 .fc_table = RT6_TABLE_DFLT,
1861 .fc_metric = IP6_RT_PRIO_USER,
1862 .fc_ifindex = dev->ifindex,
1863 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1864 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1866 .fc_nlinfo.nlh = NULL,
1867 .fc_nlinfo.nl_net = dev_net(dev),
1870 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1872 ip6_route_add(&cfg);
1874 return rt6_get_dflt_router(gwaddr, dev);
/*
 * rt6_purge_dflt_routers - walk RT6_TABLE_DFLT and act on default routers.
 *
 * Under read_lock_bh, walks the root leaf list; for an entry with
 * RTF_DEFAULT or RTF_ADDRCONF set the read lock is dropped before the
 * entry is processed.
 *
 * NOTE(review): the flag test here is true when EITHER flag is set,
 * whereas rt6_get_dflt_router() requires BOTH; the in-function note asks
 * for the two to be kept consistent - confirm which is intended.
 * NOTE(review): the statements executed after the lock is dropped
 * (presumably route deletion and a restart of the walk) are on lines not
 * visible in this listing.
 */
1877 void rt6_purge_dflt_routers(struct net *net)
1879 struct rt6_info *rt;
1880 struct fib6_table *table;
1882 /* NOTE: Keep consistent with rt6_get_dflt_router */
1883 table = fib6_get_table(net, RT6_TABLE_DFLT);
1888 read_lock_bh(&table->tb6_lock);
1889 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1890 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1892 read_unlock_bh(&table->tb6_lock);
1897 read_unlock_bh(&table->tb6_lock);
/*
 * rtmsg_to_fib6_config - translate an ioctl in6_rtmsg into a fib6_config.
 *
 * Zeroes @cfg, selects RT6_TABLE_MAIN, and copies ifindex, metric,
 * rtmsg_info (as fc_expires), destination and source prefix lengths,
 * flags and the destination, source and gateway addresses from @rtmsg.
 * @net is stored as the netlink-info namespace.
 */
1900 static void rtmsg_to_fib6_config(struct net *net,
1901 struct in6_rtmsg *rtmsg,
1902 struct fib6_config *cfg)
1904 memset(cfg, 0, sizeof(*cfg));
1906 cfg->fc_table = RT6_TABLE_MAIN;
1907 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1908 cfg->fc_metric = rtmsg->rtmsg_metric;
1909 cfg->fc_expires = rtmsg->rtmsg_info;
1910 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1911 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1912 cfg->fc_flags = rtmsg->rtmsg_flags;
1914 cfg->fc_nlinfo.nl_net = net;
1916 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1917 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1918 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
/*
 * ipv6_route_ioctl - handle the SIOCADDRT and SIOCDELRT route ioctls.
 *
 * Both commands test capable(CAP_NET_ADMIN), copy a struct in6_rtmsg
 * from user space at @arg and convert it with rtmsg_to_fib6_config().
 * The route is then added with ip6_route_add() or removed with
 * ip6_route_del().
 *
 * NOTE(review): the error returns, the inner command dispatch, any
 * locking and the handling of other commands are on lines not visible
 * in this listing.
 */
1921 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1923 struct fib6_config cfg;
1924 struct in6_rtmsg rtmsg;
1928 case SIOCADDRT: /* Add a route */
1929 case SIOCDELRT: /* Delete a route */
1930 if (!capable(CAP_NET_ADMIN))
1932 err = copy_from_user(&rtmsg, arg,
1933 sizeof(struct in6_rtmsg));
1937 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1942 err = ip6_route_add(&cfg);
1945 err = ip6_route_del(&cfg);
1959 * Drop the packet on the floor
/*
 * ip6_pkt_drop - account for an unroutable packet and answer with ICMPv6.
 *
 * @code is the ICMPv6 destination-unreachable code to send;
 * @ipstats_mib_noroutes selects the counter.  For IPSTATS_MIB_INNOROUTES
 * the destination address type is examined and an IPV6_ADDR_ANY
 * destination is counted as IPSTATS_MIB_INADDRERRORS.  The
 * IPSTATS_MIB_OUTNOROUTES label increments the counter named by
 * @ipstats_mib_noroutes.  Finally icmpv6_send() emits
 * ICMPV6_DEST_UNREACH with @code.
 *
 * NOTE(review): break or fall-through between the two cases, freeing of
 * the skb and the return value are on lines not visible in this listing.
 */
1962 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1965 struct dst_entry *dst = skb_dst(skb);
1966 switch (ipstats_mib_noroutes) {
1967 case IPSTATS_MIB_INNOROUTES:
1968 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1969 if (type == IPV6_ADDR_ANY) {
1970 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1971 IPSTATS_MIB_INADDRERRORS);
1975 case IPSTATS_MIB_OUTNOROUTES:
1976 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1977 ipstats_mib_noroutes);
1980 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/*
 * ip6_pkt_discard - drop @skb via ip6_pkt_drop() with ICMPV6_NOROUTE,
 * counted as IPSTATS_MIB_INNOROUTES.
 */
1985 static int ip6_pkt_discard(struct sk_buff *skb)
1987 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/*
 * ip6_pkt_discard_out - set skb->dev to the device of the attached dst,
 * then drop @skb via ip6_pkt_drop() with ICMPV6_NOROUTE, counted as
 * IPSTATS_MIB_OUTNOROUTES.
 */
1990 static int ip6_pkt_discard_out(struct sk_buff *skb)
1992 skb->dev = skb_dst(skb)->dev;
1993 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1996 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/*
 * ip6_pkt_prohibit - drop @skb via ip6_pkt_drop() with
 * ICMPV6_ADM_PROHIBITED, counted as IPSTATS_MIB_INNOROUTES.
 */
1998 static int ip6_pkt_prohibit(struct sk_buff *skb)
2000 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/*
 * ip6_pkt_prohibit_out - set skb->dev to the device of the attached dst,
 * then drop @skb via ip6_pkt_drop() with ICMPV6_ADM_PROHIBITED, counted
 * as IPSTATS_MIB_OUTNOROUTES.
 */
2003 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2005 skb->dev = skb_dst(skb)->dev;
2006 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2012 * Allocate a dst for local (unicast / anycast) address.
/*
 * addrconf_dst_alloc - build a host route for a local address.
 *
 * Allocates an rt6_info on the loopback device of the namespace of
 * @idev.  The entry is DST_HOST with ip6_input/ip6_output handlers,
 * rt6i_idev set to @idev, obsolete set to -1 and flags starting as
 * RTF_UP | RTF_NONEXTHOP; RTF_ANYCAST or RTF_LOCAL is then added.  A
 * neighbour obtained from ndisc_get_neigh() is attached, the destination
 * becomes @addr with prefix length 128, the table is RT6_TABLE_LOCAL and
 * the dst reference count is set to 1.
 *
 * Returns ERR_PTR(-ENOMEM) after a rate-limited warning on the
 * allocation-failure path, or ERR_CAST of the neighbour error when
 * ndisc_get_neigh() fails.
 *
 * NOTE(review): the third parameter, the conditions selecting
 * RTF_ANYCAST versus RTF_LOCAL, the cleanup before the neighbour error
 * return and the success return are on lines not visible in this
 * listing.
 */
2015 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2016 const struct in6_addr *addr,
2019 struct net *net = dev_net(idev->dev);
2020 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2021 net->loopback_dev, 0);
2022 struct neighbour *neigh;
2025 if (net_ratelimit())
2026 pr_warning("IPv6: Maximum number of routes reached,"
2027 " consider increasing route/max_size.\n");
2028 return ERR_PTR(-ENOMEM);
2033 rt->dst.flags |= DST_HOST;
2034 rt->dst.input = ip6_input;
2035 rt->dst.output = ip6_output;
2036 rt->rt6i_idev = idev;
2037 rt->dst.obsolete = -1;
2039 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2041 rt->rt6i_flags |= RTF_ANYCAST;
2043 rt->rt6i_flags |= RTF_LOCAL;
2044 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2045 if (IS_ERR(neigh)) {
2048 return ERR_CAST(neigh);
2050 dst_set_neighbour(&rt->dst, neigh);
2052 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2053 rt->rt6i_dst.plen = 128;
2054 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2056 atomic_set(&rt->dst.__refcnt, 1);
/*
 * ip6_route_get_saddr - choose a source address for @daddr over @rt.
 *
 * When the route carries a preferred source (rt6i_prefsrc.plen is
 * non-zero) that address is copied to @saddr.  The other visible path
 * asks ipv6_dev_get_saddr() to select one, passing the device of the
 * route inet6_dev, or NULL when there is none.
 *
 * NOTE(review): the prefs parameter declaration, the else joining the
 * two paths and the return statement are on lines not visible in this
 * listing.
 */
2061 int ip6_route_get_saddr(struct net *net,
2062 struct rt6_info *rt,
2063 const struct in6_addr *daddr,
2065 struct in6_addr *saddr)
2067 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2069 if (rt->rt6i_prefsrc.plen)
2070 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2072 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2073 daddr, prefs, saddr);
2077 /* remove deleted ip from prefsrc entries */
/*
 * Argument bundle handed to fib6_remove_prefsrc() through
 * fib6_clean_all(): the device, the namespace (member on a line not
 * visible in this listing, read as ->net by the callback) and the
 * address being removed.
 */
2078 struct arg_dev_net_ip {
2079 struct net_device *dev;
2081 struct in6_addr *addr;
/*
 * fib6_remove_prefsrc - per-route callback clearing a stale preferred source.
 *
 * @arg is a struct arg_dev_net_ip.  When the route device matches the
 * given device (or the given device is NULL), the route is not the
 * per-net ip6_null_entry, and the route preferred source address equals
 * the given address, rt6i_prefsrc.plen is set to 0.
 *
 * NOTE(review): the return value is on a line not visible in this
 * listing.
 */
2084 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2086 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2087 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2088 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2090 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2091 rt != net->ipv6.ip6_null_entry &&
2092 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2093 /* remove prefsrc entry */
2094 rt->rt6i_prefsrc.plen = 0;
/*
 * rt6_remove_prefsrc - run fib6_remove_prefsrc() over every route in the
 * namespace of the device owning @ifp, via fib6_clean_all().
 *
 * NOTE(review): only the .dev initializer of adni is visible in this
 * listing; the remaining members are presumably filled from @ifp.
 */
2099 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2101 struct net *net = dev_net(ifp->idev->dev);
2102 struct arg_dev_net_ip adni = {
2103 .dev = ifp->idev->dev,
2107 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/*
 * Argument bundle handed to fib6_ifdown(): the device going down and
 * the namespace (member on a line not visible in this listing, read as
 * ->net by the callback).
 */
2110 struct arg_dev_net {
2111 struct net_device *dev;
/*
 * fib6_ifdown - per-route callback selecting routes hit by a device going
 * down.
 *
 * @arg is a struct arg_dev_net.  A route matches when its device equals
 * the given device (or the given device is NULL) and it is not the
 * per-net ip6_null_entry; a trace message is emitted for a match.
 *
 * NOTE(review): the return values are on lines not visible in this
 * listing.
 */
2115 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2117 const struct arg_dev_net *adn = arg;
2118 const struct net_device *dev = adn->dev;
2120 if ((rt->rt6i_dev == dev || dev == NULL) &&
2121 rt != adn->net->ipv6.ip6_null_entry) {
2122 RT6_TRACE("deleted by ifdown %p\n", rt);
/*
 * rt6_ifdown - apply fib6_ifdown() to all FIB routes of @net through
 * fib6_clean_all(), and then through icmp6_clean_all().
 *
 * NOTE(review): the initializers of adn are on lines not visible in
 * this listing.
 */
2128 void rt6_ifdown(struct net *net, struct net_device *dev)
2130 struct arg_dev_net adn = {
2135 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2136 icmp6_clean_all(fib6_ifdown, &adn);
/*
 * Argument bundle handed to rt6_mtu_change_route(): the device whose MTU
 * changed and the new MTU (member on a line not visible in this listing,
 * read as ->mtu by the callback).
 */
2139 struct rt6_mtu_change_arg
2141 struct net_device *dev;
/*
 * rt6_mtu_change_route - per-route callback applying a device MTU change.
 *
 * @p_arg is a struct rt6_mtu_change_arg.  The route RTAX_MTU metric is
 * set to the new MTU when all of the following hold:
 *  - the route uses the device named in the argument;
 *  - the RTAX_MTU metric is not locked;
 *  - either the route MTU is greater than or equal to the new MTU, or
 *    it is smaller than the new MTU and equal to idev->cnf.mtu6.
 *
 * NOTE(review): the check on the idev returned by __in6_dev_get() and
 * the return value are on lines not visible in this listing.
 */
2145 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2147 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2148 struct inet6_dev *idev;
2150 /* In IPv6 pmtu discovery is not optional,
2151 so that RTAX_MTU lock cannot disable it.
2152 We still use this lock to block changes
2153 caused by addrconf/ndisc.
2156 idev = __in6_dev_get(arg->dev);
2160 /* For administrative MTU increase, there is no way to discover
2161 IPv6 PMTU increase, so PMTU increase should be updated here.
2162 Since RFC 1981 doesn't include administrative MTU increase
2163 update PMTU increase is a MUST. (i.e. jumbo frame)
2166 If new MTU is less than route PMTU, this new MTU will be the
2167 lowest MTU in the path, update the route PMTU to reflect PMTU
2168 decreases; if new MTU is greater than route PMTU, and the
2169 old MTU is the lowest MTU in the path, update the route PMTU
2170 to reflect the increase. In this case if the other nodes' MTU
2171 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2174 if (rt->rt6i_dev == arg->dev &&
2175 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2176 (dst_mtu(&rt->dst) >= arg->mtu ||
2177 (dst_mtu(&rt->dst) < arg->mtu &&
2178 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2179 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/*
 * rt6_mtu_change - run rt6_mtu_change_route() over every route in the
 * namespace of @dev via fib6_clean_all().
 *
 * NOTE(review): the initializers of arg are on lines not visible in
 * this listing; presumably they carry @dev and @mtu.
 */
2184 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2186 struct rt6_mtu_change_arg arg = {
2191 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/*
 * Netlink attribute policy used by nlmsg_parse() in rtm_to_fib6_config()
 * and inet6_rtm_getroute(): RTA_GATEWAY has the length of an in6_addr,
 * RTA_OIF / RTA_IIF / RTA_PRIORITY are u32 and RTA_METRICS is nested.
 */
2194 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2195 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2196 [RTA_OIF] = { .type = NLA_U32 },
2197 [RTA_IIF] = { .type = NLA_U32 },
2198 [RTA_PRIORITY] = { .type = NLA_U32 },
2199 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * rtm_to_fib6_config - translate an rtnetlink route request into @cfg.
 *
 * Parses the message against rtm_ipv6_policy, zeroes @cfg and copies the
 * table, prefix lengths and protocol from the rtmsg header.  Flags start
 * as RTF_UP; RTN_UNREACHABLE adds RTF_REJECT and RTN_LOCAL adds
 * RTF_LOCAL.  The sender pid, the header and the socket namespace are
 * recorded in fc_nlinfo.
 *
 * Attributes handled: RTA_GATEWAY (16 bytes copied, sets RTF_GATEWAY),
 * RTA_DST and RTA_SRC (copied for (prefix_len + 7) >> 3 bytes after a
 * length check), RTA_PREFSRC (16 bytes), RTA_OIF, RTA_PRIORITY,
 * RTA_METRICS (pointer and length stored, data not copied) and
 * RTA_TABLE, which overwrites the table taken from the header.
 *
 * NOTE(review): error returns and the presence tests guarding RTA_DST,
 * RTA_SRC, RTA_OIF and RTA_TABLE are on lines not visible in this
 * listing.
 */
2202 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2203 struct fib6_config *cfg)
2206 struct nlattr *tb[RTA_MAX+1];
2209 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2214 rtm = nlmsg_data(nlh);
2215 memset(cfg, 0, sizeof(*cfg));
2217 cfg->fc_table = rtm->rtm_table;
2218 cfg->fc_dst_len = rtm->rtm_dst_len;
2219 cfg->fc_src_len = rtm->rtm_src_len;
2220 cfg->fc_flags = RTF_UP;
2221 cfg->fc_protocol = rtm->rtm_protocol;
2223 if (rtm->rtm_type == RTN_UNREACHABLE)
2224 cfg->fc_flags |= RTF_REJECT;
2226 if (rtm->rtm_type == RTN_LOCAL)
2227 cfg->fc_flags |= RTF_LOCAL;
2229 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2230 cfg->fc_nlinfo.nlh = nlh;
2231 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2233 if (tb[RTA_GATEWAY]) {
2234 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2235 cfg->fc_flags |= RTF_GATEWAY;
2239 int plen = (rtm->rtm_dst_len + 7) >> 3;
2241 if (nla_len(tb[RTA_DST]) < plen)
2244 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2248 int plen = (rtm->rtm_src_len + 7) >> 3;
2250 if (nla_len(tb[RTA_SRC]) < plen)
2253 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2256 if (tb[RTA_PREFSRC])
2257 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2260 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2262 if (tb[RTA_PRIORITY])
2263 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2265 if (tb[RTA_METRICS]) {
2266 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2267 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2271 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/*
 * inet6_rtm_delroute - RTM_DELROUTE handler: convert the request with
 * rtm_to_fib6_config() and pass the result to ip6_route_del().
 *
 * NOTE(review): the check of the conversion error is on a line not
 * visible in this listing.
 */
2278 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2280 struct fib6_config cfg;
2283 err = rtm_to_fib6_config(skb, nlh, &cfg);
2287 return ip6_route_del(&cfg);
/*
 * inet6_rtm_newroute - RTM_NEWROUTE handler: convert the request with
 * rtm_to_fib6_config() and pass the result to ip6_route_add().
 *
 * NOTE(review): the check of the conversion error is on a line not
 * visible in this listing.
 */
2290 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2292 struct fib6_config cfg;
2295 err = rtm_to_fib6_config(skb, nlh, &cfg);
2299 return ip6_route_add(&cfg);
/*
 * rt6_nlmsg_size - size of the buffer requested for one route
 * notification: the aligned rtmsg header plus one attribute of each
 * kind listed below, RTAX_MAX 4-byte metric attributes and an
 * rta_cacheinfo.  inet6_rt_notify() treats an -EMSGSIZE from
 * rt6_fill_node() as a bug in this calculation.
 */
2302 static inline size_t rt6_nlmsg_size(void)
2304 return NLMSG_ALIGN(sizeof(struct rtmsg))
2305 + nla_total_size(16) /* RTA_SRC */
2306 + nla_total_size(16) /* RTA_DST */
2307 + nla_total_size(16) /* RTA_GATEWAY */
2308 + nla_total_size(16) /* RTA_PREFSRC */
2309 + nla_total_size(4) /* RTA_TABLE */
2310 + nla_total_size(4) /* RTA_IIF */
2311 + nla_total_size(4) /* RTA_OIF */
2312 + nla_total_size(4) /* RTA_PRIORITY */
2313 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2314 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * rt6_fill_node - append one rtnetlink route message describing @rt to @skb.
 *
 * When @prefix is set, a route without RTF_PREFIX_RT is treated as
 * success without being reported (see the in-function comment).
 *
 * Header fields: family AF_INET6; prefix lengths from the route; table
 * id from rt6i_table or RT6_TABLE_UNSPEC; scope RT_SCOPE_UNIVERSE.
 * Type is RTN_UNREACHABLE for RTF_REJECT, else RTN_LOCAL for RTF_LOCAL
 * or a loopback device, with RTN_UNICAST as the remaining assignment.
 * Protocol starts as rt6i_protocol and is overridden by the first match
 * of: RTF_DYNAMIC -> RTPROT_REDIRECT, RTF_ADDRCONF -> RTPROT_KERNEL,
 * RTF_DEFAULT -> RTPROT_RA.  Because this is an else-if chain, a route
 * carrying both RTF_ADDRCONF and RTF_DEFAULT is reported as
 * RTPROT_KERNEL.  RTF_CACHE routes get RTM_F_CLONED.
 *
 * Attributes written: RTA_TABLE, RTA_DST, RTA_SRC (CONFIG_IPV6_SUBTREES),
 * RTA_IIF, RTA_PREFSRC, the metrics, RTA_GATEWAY (taken from the
 * primary_key of the route neighbour), RTA_OIF, RTA_PRIORITY and the
 * cache info with the remaining lifetime.  When an explicit address is
 * emitted for RTA_DST or RTA_SRC the corresponding header length is
 * forced to 128.  Multicast destinations are passed to ip6mr_get_route()
 * under CONFIG_IPV6_MROUTE.
 *
 * Returns nlmsg_end() on success; on an attribute failure the partial
 * message is cancelled with nlmsg_cancel().
 *
 * NOTE(review): many guards (tests on @dst, @src, @iif, the neighbour,
 * the device, RCU locking) and the failure return value are on lines
 * not visible in this listing.
 */
2317 static int rt6_fill_node(struct net *net,
2318 struct sk_buff *skb, struct rt6_info *rt,
2319 struct in6_addr *dst, struct in6_addr *src,
2320 int iif, int type, u32 pid, u32 seq,
2321 int prefix, int nowait, unsigned int flags)
2324 struct nlmsghdr *nlh;
2327 struct neighbour *n;
2329 if (prefix) { /* user wants prefix routes only */
2330 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2331 /* success since this is not a prefix route */
2336 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2340 rtm = nlmsg_data(nlh);
2341 rtm->rtm_family = AF_INET6;
2342 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2343 rtm->rtm_src_len = rt->rt6i_src.plen;
2346 table = rt->rt6i_table->tb6_id;
2348 table = RT6_TABLE_UNSPEC;
2349 rtm->rtm_table = table;
2350 NLA_PUT_U32(skb, RTA_TABLE, table);
2351 if (rt->rt6i_flags&RTF_REJECT)
2352 rtm->rtm_type = RTN_UNREACHABLE;
2353 else if (rt->rt6i_flags&RTF_LOCAL)
2354 rtm->rtm_type = RTN_LOCAL;
2355 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2356 rtm->rtm_type = RTN_LOCAL;
2358 rtm->rtm_type = RTN_UNICAST;
2360 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2361 rtm->rtm_protocol = rt->rt6i_protocol;
2362 if (rt->rt6i_flags&RTF_DYNAMIC)
2363 rtm->rtm_protocol = RTPROT_REDIRECT;
2364 else if (rt->rt6i_flags & RTF_ADDRCONF)
2365 rtm->rtm_protocol = RTPROT_KERNEL;
2366 else if (rt->rt6i_flags&RTF_DEFAULT)
2367 rtm->rtm_protocol = RTPROT_RA;
2369 if (rt->rt6i_flags&RTF_CACHE)
2370 rtm->rtm_flags |= RTM_F_CLONED;
2373 NLA_PUT(skb, RTA_DST, 16, dst);
2374 rtm->rtm_dst_len = 128;
2375 } else if (rtm->rtm_dst_len)
2376 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2377 #ifdef CONFIG_IPV6_SUBTREES
2379 NLA_PUT(skb, RTA_SRC, 16, src);
2380 rtm->rtm_src_len = 128;
2381 } else if (rtm->rtm_src_len)
2382 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2385 #ifdef CONFIG_IPV6_MROUTE
2386 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2387 int err = ip6mr_get_route(net, skb, rtm, nowait);
2392 goto nla_put_failure;
2394 if (err == -EMSGSIZE)
2395 goto nla_put_failure;
2400 NLA_PUT_U32(skb, RTA_IIF, iif);
2402 struct in6_addr saddr_buf;
2403 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2404 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2407 if (rt->rt6i_prefsrc.plen) {
2408 struct in6_addr saddr_buf;
2409 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2410 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2413 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2414 goto nla_put_failure;
2417 n = dst_get_neighbour(&rt->dst);
2419 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2421 goto nla_put_failure;
2427 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2429 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2431 if (!(rt->rt6i_flags & RTF_EXPIRES))
2433 else if (rt->rt6i_expires - jiffies < INT_MAX)
2434 expires = rt->rt6i_expires - jiffies;
2438 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2439 expires, rt->dst.error) < 0)
2440 goto nla_put_failure;
2442 return nlmsg_end(skb, nlh);
2445 nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route - emit @rt into an in-progress netlink dump.
 *
 * @p_arg is a struct rt6_rtnl_dump_arg.  When the dump request carries
 * at least a full rtmsg header, the prefix filter is taken from its
 * RTM_F_PREFIX flag.  The route is then written with rt6_fill_node() as
 * an RTM_NEWROUTE message flagged NLM_F_MULTI, using the pid and
 * sequence number of the request.
 *
 * NOTE(review): the declaration of prefix and its value when the
 * request is shorter than an rtmsg are on lines not visible in this
 * listing.
 */
2449 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2451 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2454 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2455 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2456 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2460 return rt6_fill_node(arg->net,
2461 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2462 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2463 prefix, 0, NLM_F_MULTI);
/*
 * inet6_rtm_getroute - RTM_GETROUTE handler: resolve one route and reply.
 *
 * Parses the request against rtm_ipv6_policy and builds a zeroed flow
 * key from RTA_SRC and RTA_DST (each length-checked against
 * sizeof(struct in6_addr)), RTA_IIF and RTA_OIF.  An input interface
 * index is resolved with __dev_get_by_index().  A reply skb of
 * NLMSG_GOODSIZE is allocated, headroom for dummy headers is reserved,
 * the route is obtained from ip6_route_output() and attached to the skb,
 * described with rt6_fill_node() and sent back with rtnl_unicast().
 *
 * NOTE(review): all error exits, the attribute presence tests and the
 * handling of a failed device lookup are on lines not visible in this
 * listing.
 */
2466 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2468 struct net *net = sock_net(in_skb->sk);
2469 struct nlattr *tb[RTA_MAX+1];
2470 struct rt6_info *rt;
2471 struct sk_buff *skb;
2476 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2481 memset(&fl6, 0, sizeof(fl6));
2484 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2487 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2491 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2494 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2498 iif = nla_get_u32(tb[RTA_IIF]);
2501 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2504 struct net_device *dev;
2505 dev = __dev_get_by_index(net, iif);
2512 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2518 /* Reserve room for dummy headers, this skb can pass
2519 through good chunk of routing engine.
2521 skb_reset_mac_header(skb);
2522 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2524 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2525 skb_dst_set(skb, &rt->dst);
2527 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2528 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2529 nlh->nlmsg_seq, 0, 0, 0);
2535 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * inet6_rt_notify - broadcast a route change to RTNLGRP_IPV6_ROUTE.
 *
 * Allocates a message of rt6_nlmsg_size() bytes, fills it with
 * rt6_fill_node() using @event as the message type, the pid from @info
 * and the sequence number of the triggering request (0 when there is
 * none), and hands it to rtnl_notify().  An -EMSGSIZE from the fill
 * step triggers WARN_ON, since it implies the size estimate is wrong.
 * rtnl_set_sk_err() reports err to the group on the error path.
 *
 * NOTE(review): the branches selecting between the notify and error
 * paths are on lines not visible in this listing.
 */
2540 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2542 struct sk_buff *skb;
2543 struct net *net = info->nl_net;
2548 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2550 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2554 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2555 event, info->pid, seq, 0, 0, 0);
2557 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2558 WARN_ON(err == -EMSGSIZE);
2562 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2563 info->nlh, gfp_any());
2567 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * ip6_route_dev_notify - netdevice notifier binding the special route
 * entries to the loopback device.
 *
 * On NETDEV_REGISTER of a device with IFF_LOOPBACK set, the per-net
 * ip6_null_entry (and, with CONFIG_IPV6_MULTIPLE_TABLES, the prohibit
 * and blackhole entries) get that device as dst.dev and a reference to
 * its inet6_dev from in6_dev_get().
 *
 * NOTE(review): the return value is on a line not visible in this
 * listing.
 */
2570 static int ip6_route_dev_notify(struct notifier_block *this,
2571 unsigned long event, void *data)
2573 struct net_device *dev = (struct net_device *)data;
2574 struct net *net = dev_net(dev);
2576 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2577 net->ipv6.ip6_null_entry->dst.dev = dev;
2578 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2579 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2580 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2581 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2582 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2583 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2594 #ifdef CONFIG_PROC_FS
/*
 * rt6_info_route - print one route as a line of the ipv6_route proc file.
 *
 * @p_arg is the seq_file.  Output order: destination address and prefix
 * length; source address and prefix length (zeros without
 * CONFIG_IPV6_SUBTREES); next hop taken from the primary_key of the
 * route neighbour, or zeros; then metric, reference count, use count,
 * flags (all as 8 hex digits) and the device name (empty when the
 * route has no device).
 *
 * NOTE(review): the neighbour presence test, any RCU locking around it
 * and the return value are on lines not visible in this listing.
 */
2605 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2607 struct seq_file *m = p_arg;
2608 struct neighbour *n;
2610 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2612 #ifdef CONFIG_IPV6_SUBTREES
2613 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2615 seq_puts(m, "00000000000000000000000000000000 00 ");
2618 n = dst_get_neighbour(&rt->dst);
2620 seq_printf(m, "%pi6", n->primary_key);
2622 seq_puts(m, "00000000000000000000000000000000");
2625 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2626 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2627 rt->dst.__use, rt->rt6i_flags,
2628 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/*
 * ipv6_route_show - seq_file show callback: print every route of the
 * namespace stored in m->private by running rt6_info_route() through
 * fib6_clean_all().
 */
2632 static int ipv6_route_show(struct seq_file *m, void *v)
2634 struct net *net = (struct net *)m->private;
2635 fib6_clean_all(net, rt6_info_route, 0, m);
/*
 * ipv6_route_open - open callback of the ipv6_route proc file; binds
 * ipv6_route_show() through single_open_net().
 */
2639 static int ipv6_route_open(struct inode *inode, struct file *file)
2641 return single_open_net(inode, file, ipv6_route_show);
/*
 * File operations registered for the ipv6_route proc entry by
 * ip6_route_net_init_late().  The .read member is on a line not visible
 * in this listing.
 */
2644 static const struct file_operations ipv6_route_proc_fops = {
2645 .owner = THIS_MODULE,
2646 .open = ipv6_route_open,
2648 .llseek = seq_lseek,
2649 .release = single_release_net,
/*
 * rt6_stats_seq_show - print the rt6_stats proc line: seven hex fields
 * in the order fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries,
 * fib_rt_cache, the dst entry count from dst_entries_get_slow(), and
 * fib_discarded_routes.
 */
2652 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2654 struct net *net = (struct net *)seq->private;
2655 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2656 net->ipv6.rt6_stats->fib_nodes,
2657 net->ipv6.rt6_stats->fib_route_nodes,
2658 net->ipv6.rt6_stats->fib_rt_alloc,
2659 net->ipv6.rt6_stats->fib_rt_entries,
2660 net->ipv6.rt6_stats->fib_rt_cache,
2661 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2662 net->ipv6.rt6_stats->fib_discarded_routes);
/*
 * rt6_stats_seq_open - open callback of the rt6_stats proc file; binds
 * rt6_stats_seq_show() through single_open_net().
 */
2667 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2669 return single_open_net(inode, file, rt6_stats_seq_show);
/*
 * File operations registered for the rt6_stats proc entry by
 * ip6_route_net_init_late().  The .read member is on a line not visible
 * in this listing.
 */
2672 static const struct file_operations rt6_stats_seq_fops = {
2673 .owner = THIS_MODULE,
2674 .open = rt6_stats_seq_open,
2676 .llseek = seq_lseek,
2677 .release = single_release_net,
2679 #endif /* CONFIG_PROC_FS */
2681 #ifdef CONFIG_SYSCTL
/*
 * ipv6_sysctl_rtcache_flush - handler of the flush sysctl.
 *
 * The namespace comes from ctl->extra1.  The flush_delay value is
 * sampled BEFORE proc_dointvec() processes the access, so the garbage
 * collection that follows uses the previously stored delay, not the one
 * just written.  fib6_run_gc() is called with ~0UL when that delay is
 * zero or negative, otherwise with the delay itself.  The return value
 * of proc_dointvec() is not used.
 *
 * NOTE(review): the check of the write argument and the return
 * statements are on lines not visible in this listing.
 */
2684 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2685 void __user *buffer, size_t *lenp, loff_t *ppos)
2692 net = (struct net *)ctl->extra1;
2693 delay = net->ipv6.sysctl.flush_delay;
2694 proc_dointvec(ctl, write, buffer, lenp, ppos);
2695 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template of the per-namespace route sysctl table.  The .data pointers
 * here reference init_net (or ip6_dst_ops_template for gc_thresh);
 * ipv6_route_sysctl_init() duplicates the table and re-points each
 * entry BY INDEX, so the order of the entries below must stay in step
 * with the table[N] assignments there:
 *   0 flush, 1 gc_thresh, 2 max_size, 3 gc_min_interval, 4 gc_timeout,
 *   5 gc_interval, 6 gc_elasticity, 7 mtu_expires, 8 min_adv_mss,
 *   9 gc_min_interval_ms.
 * Entries 3 and 9 expose the same variable through different handlers
 * (proc_dointvec_jiffies and proc_dointvec_ms_jiffies).
 *
 * NOTE(review): the .mode members and the terminating entry are on
 * lines not visible in this listing.
 */
2699 ctl_table ipv6_route_table_template[] = {
2701 .procname = "flush",
2702 .data = &init_net.ipv6.sysctl.flush_delay,
2703 .maxlen = sizeof(int),
2705 .proc_handler = ipv6_sysctl_rtcache_flush
2708 .procname = "gc_thresh",
2709 .data = &ip6_dst_ops_template.gc_thresh,
2710 .maxlen = sizeof(int),
2712 .proc_handler = proc_dointvec,
2715 .procname = "max_size",
2716 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2717 .maxlen = sizeof(int),
2719 .proc_handler = proc_dointvec,
2722 .procname = "gc_min_interval",
2723 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2724 .maxlen = sizeof(int),
2726 .proc_handler = proc_dointvec_jiffies,
2729 .procname = "gc_timeout",
2730 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2731 .maxlen = sizeof(int),
2733 .proc_handler = proc_dointvec_jiffies,
2736 .procname = "gc_interval",
2737 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2738 .maxlen = sizeof(int),
2740 .proc_handler = proc_dointvec_jiffies,
2743 .procname = "gc_elasticity",
2744 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2745 .maxlen = sizeof(int),
2747 .proc_handler = proc_dointvec,
2750 .procname = "mtu_expires",
2751 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2752 .maxlen = sizeof(int),
2754 .proc_handler = proc_dointvec_jiffies,
2757 .procname = "min_adv_mss",
2758 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2759 .maxlen = sizeof(int),
2761 .proc_handler = proc_dointvec,
2764 .procname = "gc_min_interval_ms",
2765 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2766 .maxlen = sizeof(int),
2768 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * ipv6_route_sysctl_init - build the route sysctl table for @net.
 *
 * Duplicates ipv6_route_table_template with kmemdup() and re-points the
 * .data of entries 0..9 at the fields of @net; entry 0 (flush) also
 * gets @net in extra1, which ipv6_sysctl_rtcache_flush() reads back.
 * The indices are positional and must match the entry order of the
 * template.
 *
 * NOTE(review): the allocation flags, the check of the kmemdup() result
 * and the return statement are on lines not visible in this listing.
 */
2773 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2775 struct ctl_table *table;
2777 table = kmemdup(ipv6_route_table_template,
2778 sizeof(ipv6_route_table_template),
2782 table[0].data = &net->ipv6.sysctl.flush_delay;
2783 table[0].extra1 = net;
2784 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2785 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2786 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2787 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2788 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2789 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2790 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2791 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2792 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * ip6_route_net_init - per-namespace setup of the IPv6 routing state.
 *
 * Steps visible here:
 *  - copy ip6_dst_ops_template into the namespace and initialise its
 *    entry counter with dst_entries_init();
 *  - duplicate the null-entry template, make its dst.path point at
 *    itself, bind it to the per-net dst ops and give it the template
 *    metrics;
 *  - with CONFIG_IPV6_MULTIPLE_TABLES, do the same for the prohibit and
 *    blackhole entries;
 *  - set the sysctl defaults: flush_delay 0, max_size 4096,
 *    gc_min_interval HZ/2, gc_timeout 60*HZ, gc_interval 30*HZ,
 *    gc_elasticity 9, mtu_expires 10*60*HZ, min_advmss
 *    IPV6_MIN_MTU - 20 - 40, and ip6_rt_gc_expire 30*HZ.
 *
 * The labels at the end unwind in reverse order: free the prohibit
 * entry, free the null entry, destroy the dst entry counter.
 *
 * NOTE(review): the allocation flags, the initial value of the return
 * code, the success return and some labels are on lines not visible in
 * this listing.
 */
2799 static int __net_init ip6_route_net_init(struct net *net)
2803 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2804 sizeof(net->ipv6.ip6_dst_ops));
2806 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2807 goto out_ip6_dst_ops;
2809 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2810 sizeof(*net->ipv6.ip6_null_entry),
2812 if (!net->ipv6.ip6_null_entry)
2813 goto out_ip6_dst_entries;
2814 net->ipv6.ip6_null_entry->dst.path =
2815 (struct dst_entry *)net->ipv6.ip6_null_entry;
2816 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2817 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2818 ip6_template_metrics, true);
2820 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2821 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2822 sizeof(*net->ipv6.ip6_prohibit_entry),
2824 if (!net->ipv6.ip6_prohibit_entry)
2825 goto out_ip6_null_entry;
2826 net->ipv6.ip6_prohibit_entry->dst.path =
2827 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2828 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2829 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2830 ip6_template_metrics, true);
2832 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2833 sizeof(*net->ipv6.ip6_blk_hole_entry),
2835 if (!net->ipv6.ip6_blk_hole_entry)
2836 goto out_ip6_prohibit_entry;
2837 net->ipv6.ip6_blk_hole_entry->dst.path =
2838 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2839 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2840 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2841 ip6_template_metrics, true);
2844 net->ipv6.sysctl.flush_delay = 0;
2845 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2846 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2847 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2848 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2849 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2850 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2851 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2853 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2859 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2860 out_ip6_prohibit_entry:
2861 kfree(net->ipv6.ip6_prohibit_entry);
2863 kfree(net->ipv6.ip6_null_entry);
2865 out_ip6_dst_entries:
2866 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_exit - per-namespace teardown: free the null entry
 * (plus the prohibit and blackhole entries with
 * CONFIG_IPV6_MULTIPLE_TABLES) and destroy the dst entry counter set up
 * by ip6_route_net_init().
 */
2871 static void __net_exit ip6_route_net_exit(struct net *net)
2873 kfree(net->ipv6.ip6_null_entry);
2874 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2875 kfree(net->ipv6.ip6_prohibit_entry);
2876 kfree(net->ipv6.ip6_blk_hole_entry);
2878 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_init_late - with CONFIG_PROC_FS, create the ipv6_route
 * (mode 0) and rt6_stats (mode S_IRUGO) proc entries for @net.  The
 * results of proc_net_fops_create() are not checked here.
 *
 * NOTE(review): the return statement is on a line not visible in this
 * listing.
 */
2881 static int __net_init ip6_route_net_init_late(struct net *net)
2883 #ifdef CONFIG_PROC_FS
2884 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2885 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
/*
 * ip6_route_net_exit_late - with CONFIG_PROC_FS, remove the ipv6_route
 * and rt6_stats proc entries created by ip6_route_net_init_late().
 */
2890 static void __net_exit ip6_route_net_exit_late(struct net *net)
2892 #ifdef CONFIG_PROC_FS
2893 proc_net_remove(net, "ipv6_route");
2894 proc_net_remove(net, "rt6_stats");
/*
 * Per-namespace operations for the core routing state; registered first
 * in ip6_route_init().
 */
2898 static struct pernet_operations ip6_route_net_ops = {
2899 .init = ip6_route_net_init,
2900 .exit = ip6_route_net_exit,
/*
 * Per-namespace operations for the proc entries; registered in
 * ip6_route_init() after fib6_rules_init().
 */
2903 static struct pernet_operations ip6_route_net_late_ops = {
2904 .init = ip6_route_net_init_late,
2905 .exit = ip6_route_net_exit_late,
/*
 * Netdevice notifier block invoking ip6_route_dev_notify(); registered
 * in ip6_route_init().  Any further members are on lines not visible in
 * this listing.
 */
2908 static struct notifier_block ip6_route_dev_notifier = {
2909 .notifier_call = ip6_route_dev_notify,
/*
 * ip6_route_init - one-time initialisation of the IPv6 routing layer.
 *
 * Order of the visible steps:
 *  1. create the ip6_dst_cache slab for struct rt6_info;
 *  2. initialise the entry counter of ip6_dst_blackhole_ops;
 *  3. register ip6_route_net_ops;
 *  4. let the blackhole ops share the slab cache;
 *  5. bind the init_net special entries to the loopback device by hand,
 *     because loopback was registered before the notifier exists (see
 *     the in-function comment);
 *  6. fib6_rules_init();
 *  7. register ip6_route_net_late_ops;
 *  8. register the RTM_NEWROUTE / RTM_DELROUTE / RTM_GETROUTE handlers;
 *  9. register the netdevice notifier.
 *
 * The labels at the end undo the completed steps in reverse order.
 *
 * NOTE(review): the error tests after several steps, further init calls
 * between steps 5 and 6, the value of ret on an rtnl registration
 * failure, the success return and some labels are on lines not visible
 * in this listing.
 */
2913 int __init ip6_route_init(void)
2918 ip6_dst_ops_template.kmem_cachep =
2919 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2920 SLAB_HWCACHE_ALIGN, NULL);
2921 if (!ip6_dst_ops_template.kmem_cachep)
2924 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2926 goto out_kmem_cache;
2928 ret = register_pernet_subsys(&ip6_route_net_ops);
2930 goto out_dst_entries;
2932 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2934 /* Registering of the loopback is done before this portion of code,
2935 * the loopback reference in rt6_info will not be taken, do it
2936 * manually for init_net */
2937 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2938 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2939 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2940 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2941 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2942 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2943 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2947 goto out_register_subsys;
2953 ret = fib6_rules_init();
2957 ret = register_pernet_subsys(&ip6_route_net_late_ops);
2959 goto fib6_rules_init;
2962 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2963 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2964 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2965 goto out_register_late_subsys;
2967 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2969 goto out_register_late_subsys;
2974 out_register_late_subsys:
2975 unregister_pernet_subsys(&ip6_route_net_late_ops);
2977 fib6_rules_cleanup();
2982 out_register_subsys:
2983 unregister_pernet_subsys(&ip6_route_net_ops);
2985 dst_entries_destroy(&ip6_dst_blackhole_ops);
2987 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/*
 * Tears down the IPv6 routing layer.
 *
 * The visible calls mirror the error-unwind path of ip6_route_init():
 * unregister the netdevice notifier, unregister the late per-namespace
 * ops, clean up fib6 rules, unregister the per-namespace ops, call
 * dst_entries_destroy() on ip6_dst_blackhole_ops and destroy the shared
 * slab cache.
 *
 * NOTE(review): the embedded numbering skips between the fib6 rules
 * cleanup and the pernet unregistration, so additional teardown calls
 * may exist there that are not visible in this view.
 */
2991 void ip6_route_cleanup(void)
2993 unregister_netdevice_notifier(&ip6_route_dev_notifier);
2994 unregister_pernet_subsys(&ip6_route_net_late_ops);
2995 fib6_rules_cleanup();
2998 unregister_pernet_subsys(&ip6_route_net_ops);
2999 dst_entries_destroy(&ip6_dst_blackhole_ops);
3000 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);