2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
52 #include <linux/rtnetlink.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
/*
 * Debug tracing macros.  NOTE(review): the #if/#else preprocessor lines
 * selecting between the verbose and no-op RT6_TRACE definitions are
 * missing from this extract.
 */
64 /* Set to 3 to get tracing. */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
72 #define RT6_TRACE(x...) do { ; } while (0)
/* Forward declarations for the dst_ops callbacks and helpers defined below. */
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void ip6_dst_destroy(struct dst_entry *);
81 static void ip6_dst_ifdown(struct dst_entry *,
82 struct net_device *dev, int how);
83 static int ip6_dst_gc(struct dst_ops *ops);
85 static int ip6_pkt_discard(struct sk_buff *skb);
86 static int ip6_pkt_discard_out(struct sk_buff *skb);
87 static void ip6_link_failure(struct sk_buff *skb);
88 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
/* Helpers used by rt6_route_rcv() to manage learned route-info entries. */
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 const struct in6_addr *prefix, int prefixlen,
93 const struct in6_addr *gwaddr, int ifindex,
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 const struct in6_addr *prefix, int prefixlen,
97 const struct in6_addr *gwaddr, int ifindex);
/*
 * Copy-on-write the route's metrics into storage attached to the
 * inet_peer cache entry for the destination.  NOTE(review): several
 * lines (braces, early exits, the assignment of 'p') are missing from
 * this extract; comments describe only the visible statements.
 */
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
102 struct rt6_info *rt = (struct rt6_info *) dst;
103 struct inet_peer *peer;
/* Lazily bind an inet_peer for this route (create = 1). */
107 rt6_bind_peer(rt, 1);
109 peer = rt->rt6i_peer;
111 u32 *old_p = __DST_METRICS_PTR(old);
112 unsigned long prev, new;
/* First writer of the peer's metrics: seed them from the old array. */
115 if (inet_metrics_new(peer))
116 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
/* Atomically swing dst->_metrics from 'old' to the peer-backed array. */
118 new = (unsigned long) p;
119 prev = cmpxchg(&dst->_metrics, old, new);
/* Lost the cmpxchg race: adopt whichever pointer actually won. */
122 p = __DST_METRICS_PTR(prev);
123 if (prev & DST_METRICS_READ_ONLY)
/* dst_ops callback table for regular IPv6 routes. */
130 static struct dst_ops ip6_dst_ops_template = {
132 .protocol = cpu_to_be16(ETH_P_IPV6),
135 .check = ip6_dst_check,
136 .default_advmss = ip6_default_advmss,
137 .default_mtu = ip6_default_mtu,
138 .cow_metrics = ipv6_cow_metrics,
139 .destroy = ip6_dst_destroy,
140 .ifdown = ip6_dst_ifdown,
141 .negative_advice = ip6_negative_advice,
142 .link_failure = ip6_link_failure,
143 .update_pmtu = ip6_rt_update_pmtu,
144 .local_out = __ip6_local_out,
/*
 * dst_ops for blackhole routes created by ip6_blackhole_route().
 * The three stub callbacks below replace the regular PMTU/metrics
 * handling; their bodies are missing from this extract (presumably
 * no-ops / fixed values — confirm against the full source).
 */
147 static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
156 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
162 static struct dst_ops ip6_dst_blackhole_ops = {
164 .protocol = cpu_to_be16(ETH_P_IPV6),
165 .destroy = ip6_dst_destroy,
166 .check = ip6_dst_check,
167 .default_mtu = ip6_blackhole_default_mtu,
168 .default_advmss = ip6_default_advmss,
169 .update_pmtu = ip6_rt_blackhole_update_pmtu,
170 .cow_metrics = ip6_rt_blackhole_cow_metrics,
/* Metrics array shared by the special route templates below. */
173 static const u32 ip6_template_metrics[RTAX_MAX] = {
174 [RTAX_HOPLIMIT - 1] = 0,
/*
 * The "null" route: rejects traffic with -ENETUNREACH via the
 * ip6_pkt_discard handlers.  Maximal metric so real routes win.
 */
177 static struct rt6_info ip6_null_entry_template = {
179 .__refcnt = ATOMIC_INIT(1),
182 .error = -ENETUNREACH,
183 .input = ip6_pkt_discard,
184 .output = ip6_pkt_discard_out,
186 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
187 .rt6i_protocol = RTPROT_KERNEL,
188 .rt6i_metric = ~(u32) 0,
189 .rt6i_ref = ATOMIC_INIT(1),
/* Policy routing (multiple tables): "prohibit" and "blackhole" actions. */
192 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
194 static int ip6_pkt_prohibit(struct sk_buff *skb);
195 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
197 static struct rt6_info ip6_prohibit_entry_template = {
199 .__refcnt = ATOMIC_INIT(1),
203 .input = ip6_pkt_prohibit,
204 .output = ip6_pkt_prohibit_out,
206 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
207 .rt6i_protocol = RTPROT_KERNEL,
208 .rt6i_metric = ~(u32) 0,
209 .rt6i_ref = ATOMIC_INIT(1),
/* Blackhole: silently discard in both directions (dst_discard). */
212 static struct rt6_info ip6_blk_hole_entry_template = {
214 .__refcnt = ATOMIC_INIT(1),
218 .input = dst_discard,
219 .output = dst_discard,
221 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
222 .rt6i_protocol = RTPROT_KERNEL,
223 .rt6i_metric = ~(u32) 0,
224 .rt6i_ref = ATOMIC_INIT(1),
229 /* allocate dst with ip6_dst_ops */
230 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
231 struct net_device *dev,
234 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
/* Zero the rt6_info-specific tail while leaving the embedded dst_entry intact. */
237 memset(&rt->rt6i_table, 0,
238 sizeof(*rt) - sizeof(struct dst_entry));
/*
 * dst_ops->destroy: detach the route's inet6_dev and inet_peer
 * references.  NOTE(review): the in6_dev_put/inet_putpeer release
 * calls are missing from this extract.
 */
243 static void ip6_dst_destroy(struct dst_entry *dst)
245 struct rt6_info *rt = (struct rt6_info *)dst;
246 struct inet6_dev *idev = rt->rt6i_idev;
247 struct inet_peer *peer = rt->rt6i_peer;
250 rt->rt6i_idev = NULL;
254 rt->rt6i_peer = NULL;
/*
 * Peer-binding generation counter.  ip6_dst_check() compares a route's
 * cached rt6i_peer_genid against rt6_peer_genid() to decide whether the
 * peer binding must be refreshed.
 */
259 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
261 static u32 rt6_peer_genid(void)
263 return atomic_read(&__rt6_peer_genid);
/* Bind an inet_peer entry for the route's destination address. */
266 void rt6_bind_peer(struct rt6_info *rt, int create)
268 struct inet_peer *peer;
270 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
/* cmpxchg: only the first binder wins; a losing racer must not overwrite. */
271 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
274 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * dst_ops->ifdown: when 'dev' goes away, re-point the route's inet6_dev
 * at the namespace's loopback device so the cached dst stays usable.
 */
277 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
280 struct rt6_info *rt = (struct rt6_info *)dst;
281 struct inet6_dev *idev = rt->rt6i_idev;
282 struct net_device *loopback_dev =
283 dev_net(dev)->loopback_dev;
285 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
286 struct inet6_dev *loopback_idev =
287 in6_dev_get(loopback_dev);
288 if (loopback_idev != NULL) {
289 rt->rt6i_idev = loopback_idev;
/* True if the route carries RTF_EXPIRES and its deadline has passed. */
295 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
297 return (rt->rt6i_flags & RTF_EXPIRES) &&
298 time_after(jiffies, rt->rt6i_expires);
/* Scoped destinations (mcast/link-local/loopback) need an interface-bound lookup. */
301 static inline int rt6_need_strict(const struct in6_addr *daddr)
303 return ipv6_addr_type(daddr) &
304 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
/*
 * Pick, from a fib6 node's route list, the entry that satisfies the
 * requested outgoing interface (oif) and/or source address.  A strict
 * lookup (RT6_LOOKUP_F_IFACE) that matches nothing yields the null route.
 */
308 * Route lookup. Any table->tb6_lock is implied.
311 static inline struct rt6_info *rt6_device_match(struct net *net,
313 const struct in6_addr *saddr,
317 struct rt6_info *local = NULL;
318 struct rt6_info *sprt;
/* No interface and no source constraint: nothing to match on. */
320 if (!oif && ipv6_addr_any(saddr))
323 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
324 struct net_device *dev = sprt->rt6i_dev;
327 if (dev->ifindex == oif)
/* A loopback route may stand in for the requested interface. */
329 if (dev->flags & IFF_LOOKBACK) {
330 if (sprt->rt6i_idev == NULL ||
331 sprt->rt6i_idev->dev->ifindex != oif) {
332 if (flags & RT6_LOOKUP_F_IFACE && oif)
334 if (local && (!oif ||
335 local->rt6i_idev->dev->ifindex == oif))
/* Otherwise accept a route whose device owns the source address. */
341 if (ipv6_chk_addr(net, saddr, dev,
342 flags & RT6_LOOKUP_F_IFACE))
351 if (flags & RT6_LOOKUP_F_IFACE)
352 return net->ipv6.ip6_null_entry;
/*
 * Router Reachability Probing: when a router's neighbour entry is no
 * longer NUD_VALID, send a Neighbour Solicitation for it, rate-limited
 * by rtr_probe_interval via neigh->updated.
 */
358 #ifdef CONFIG_IPV6_ROUTER_PREF
359 static void rt6_probe(struct rt6_info *rt)
361 struct neighbour *neigh;
363 * Okay, this does not seem to be appropriate
364 * for now, however, we need to check if it
365 * is really so; aka Router Reachability Probing.
367 * Router Reachability Probe MUST be rate-limited
368 * to no more than one per minute.
371 neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
372 if (!neigh || (neigh->nud_state & NUD_VALID))
374 read_lock_bh(&neigh->lock);
/* Re-check state under the lock; neigh->updated enforces the rate limit. */
375 if (!(neigh->nud_state & NUD_VALID) &&
376 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
377 struct in6_addr mcaddr;
378 struct in6_addr *target;
380 neigh->updated = jiffies;
381 read_unlock_bh(&neigh->lock);
/* Solicit the router's address via its solicited-node multicast group. */
383 target = (struct in6_addr *)&neigh->primary_key;
384 addrconf_addr_solict_mult(target, &mcaddr);
385 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
387 read_unlock_bh(&neigh->lock);
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
393 static inline void rt6_probe(struct rt6_info *rt)
399 * Default Router Selection (RFC 2461 6.3.6)
/* Scoring helper: does the route's device satisfy the requested oif? */
401 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
403 struct net_device *dev = rt->rt6i_dev;
404 if (!oif || dev->ifindex == oif)
/* Loopback routes also match when their idev points at the oif. */
406 if ((dev->flags & IFF_LOOPBACK) &&
407 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Scoring helper: is the route's next hop (probably) reachable? */
412 static inline int rt6_check_neigh(struct rt6_info *rt)
414 struct neighbour *neigh;
418 neigh = dst_get_neighbour(&rt->dst);
/* Non-gateway / no-next-hop routes need no neighbour check. */
419 if (rt->rt6i_flags & RTF_NONEXTHOP ||
420 !(rt->rt6i_flags & RTF_GATEWAY))
423 read_lock_bh(&neigh->lock);
424 if (neigh->nud_state & NUD_VALID)
426 #ifdef CONFIG_IPV6_ROUTER_PREF
427 else if (neigh->nud_state & NUD_FAILED)
432 read_unlock_bh(&neigh->lock);
/*
 * Combined score for router selection: interface match, optional
 * router-preference bits (RFC 4191), and neighbour reachability.
 * Returns a failure indication when a strict requirement is unmet.
 */
439 static int rt6_score_route(struct rt6_info *rt, int oif,
444 m = rt6_check_dev(rt, oif);
445 if (!m && (strict & RT6_LOOKUP_F_IFACE))
447 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Fold the decoded route preference into bits above the device score. */
448 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
450 n = rt6_check_neigh(rt);
451 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/*
 * Keep the higher-scoring of (rt, current match); expired routes are
 * skipped outright.  Used by find_rr_leaf() while scanning siblings.
 */
456 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
457 int *mpri, struct rt6_info *match)
461 if (rt6_check_expired(rt))
464 m = rt6_score_route(rt, oif, strict);
469 if (strict & RT6_LOOKUP_F_REACHABLE)
473 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/*
 * Scan all routes of equal metric — first from the round-robin head to
 * the end of the run, then wrapping from the node's leaf back to the
 * head — and return the best-scoring one.
 */
481 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
482 struct rt6_info *rr_head,
483 u32 metric, int oif, int strict)
485 struct rt6_info *rt, *match;
489 for (rt = rr_head; rt && rt->rt6i_metric == metric;
490 rt = rt->dst.rt6_next)
491 match = find_match(rt, oif, strict, &mpri, match);
492 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
493 rt = rt->dst.rt6_next)
494 match = find_match(rt, oif, strict, &mpri, match);
/*
 * Default-router selection for a fib6 node: pick the best route among
 * equal-metric siblings, advancing fn->rr_ptr (round-robin) when no
 * entry is preferred.  Falls back to the null route on no match.
 */
499 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
501 struct rt6_info *match, *rt0;
504 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
505 __func__, fn->leaf, oif);
509 fn->rr_ptr = rt0 = fn->leaf;
511 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
514 (strict & RT6_LOOKUP_F_REACHABLE)) {
515 struct rt6_info *next = rt0->dst.rt6_next;
517 /* no entries matched; do round-robin */
518 if (!next || next->rt6i_metric != rt0->rt6i_metric)
525 RT6_TRACE("%s() => %p\n",
528 net = dev_net(rt0->rt6i_dev);
529 return match ? match : net->ipv6.ip6_null_entry;
/*
 * Process a Route Information Option received in a Router Advertisement:
 * validate length/prefix_len, then add, refresh, or (lifetime == 0)
 * remove the corresponding learned route.
 */
532 #ifdef CONFIG_IPV6_ROUTE_INFO
533 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
534 const struct in6_addr *gwaddr)
536 struct net *net = dev_net(dev);
537 struct route_info *rinfo = (struct route_info *) opt;
538 struct in6_addr prefix_buf, *prefix;
540 unsigned long lifetime;
543 if (len < sizeof(struct route_info)) {
547 /* Sanity check for prefix_len and length */
548 if (rinfo->length > 3) {
550 } else if (rinfo->prefix_len > 128) {
552 } else if (rinfo->prefix_len > 64) {
553 if (rinfo->length < 2) {
556 } else if (rinfo->prefix_len > 0) {
557 if (rinfo->length < 1) {
562 pref = rinfo->route_pref;
563 if (pref == ICMPV6_ROUTER_PREF_INVALID)
566 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 carries a full 128-bit prefix; otherwise truncate locally. */
568 if (rinfo->length == 3)
569 prefix = (struct in6_addr *)rinfo->prefix;
571 /* this function is safe */
572 ipv6_addr_prefix(&prefix_buf,
573 (struct in6_addr *)rinfo->prefix,
575 prefix = &prefix_buf;
578 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* Zero lifetime: withdraw the previously learned route. */
581 if (rt && !lifetime) {
587 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
590 rt->rt6i_flags = RTF_ROUTEINFO |
591 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
/* Infinite lifetime clears RTF_EXPIRES; finite sets a jiffies deadline. */
594 if (!addrconf_finite_timeout(lifetime)) {
595 rt->rt6i_flags &= ~RTF_EXPIRES;
597 rt->rt6i_expires = jiffies + HZ * lifetime;
598 rt->rt6i_flags |= RTF_EXPIRES;
600 dst_release(&rt->dst);
/*
 * BACKTRACK: if the lookup produced the null route, climb towards the
 * tree root (descending into source-routed subtrees where present) and
 * retry until a node with real route info (RTN_RTINFO) is found or the
 * root (RTN_TL_ROOT) is reached.  NOTE(review): several macro lines are
 * missing from this extract.
 */
606 #define BACKTRACK(__net, saddr) \
608 if (rt == __net->ipv6.ip6_null_entry) { \
609 struct fib6_node *pn; \
611 if (fn->fn_flags & RTN_TL_ROOT) \
614 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
615 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
618 if (fn->fn_flags & RTN_RTINFO) \
/*
 * Simple (non-caching) policy lookup: find the fib6 node for the flow,
 * filter by device/source via rt6_device_match(), backtrack on the null
 * route, and bump the dst's usage before returning under no lock.
 */
624 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
625 struct fib6_table *table,
626 struct flowi6 *fl6, int flags)
628 struct fib6_node *fn;
631 read_lock_bh(&table->tb6_lock);
632 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
635 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
636 BACKTRACK(net, &fl6->saddr);
637 dst_use(&rt->dst, jiffies);
639 read_unlock_bh(&table->tb6_lock);
/*
 * Public route lookup by destination/source/oif.  'strict' maps to
 * RT6_LOOKUP_F_IFACE; a supplied source address adds
 * RT6_LOOKUP_F_HAS_SADDR.  Returns a held rt6_info (or NULL-route path
 * handled by the policy lookup).
 */
644 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
645 const struct in6_addr *saddr, int oif, int strict)
647 struct flowi6 fl6 = {
651 struct dst_entry *dst;
652 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
655 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
656 flags |= RT6_LOOKUP_F_HAS_SADDR;
659 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
661 return (struct rt6_info *) dst;
668 EXPORT_SYMBOL(rt6_lookup);
670 /* ip6_ins_rt is called with FREE table->tb6_lock.
671 It takes new route entry, the addition fails by any reason the
672 route is freed. In any case, if caller does not hold it, it may
/* Insert 'rt' into its fib6 table under the table write lock. */
676 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
679 struct fib6_table *table;
681 table = rt->rt6i_table;
682 write_lock_bh(&table->tb6_lock);
683 err = fib6_add(&table->tb6_root, rt, info);
684 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper supplying netlink info derived from the route's netns. */
689 int ip6_ins_rt(struct rt6_info *rt)
691 struct nl_info info = {
692 .nl_net = dev_net(rt->rt6i_dev),
694 return __ip6_ins_rt(rt, &info);
/*
 * Clone 'ort' into a host (/128) RTF_CACHE route for 'daddr', binding a
 * neighbour entry for the next hop.  On neighbour-table overflow it
 * temporarily tightens the GC sysctls and retries (bounded by
 * 'attempts', which is 0 in softirq context).
 */
697 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr,
698 const struct in6_addr *saddr)
706 rt = ip6_rt_copy(ort);
709 struct neighbour *neigh;
710 int attempts = !in_softirq();
/* Non-gateway clone: the destination itself becomes the gateway. */
712 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
713 if (rt->rt6i_dst.plen != 128 &&
714 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
715 rt->rt6i_flags |= RTF_ANYCAST;
716 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
/* Narrow the clone to a cached host route. */
719 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
720 rt->rt6i_dst.plen = 128;
721 rt->rt6i_flags |= RTF_CACHE;
722 rt->dst.flags |= DST_HOST;
724 #ifdef CONFIG_IPV6_SUBTREES
725 if (rt->rt6i_src.plen && saddr) {
726 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
727 rt->rt6i_src.plen = 128;
732 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
/* Neighbour allocation failed: force a GC pass with relaxed limits and retry. */
734 struct net *net = dev_net(rt->rt6i_dev);
735 int saved_rt_min_interval =
736 net->ipv6.sysctl.ip6_rt_gc_min_interval;
737 int saved_rt_elasticity =
738 net->ipv6.sysctl.ip6_rt_gc_elasticity;
740 if (attempts-- > 0) {
741 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
742 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
744 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
746 net->ipv6.sysctl.ip6_rt_gc_elasticity =
748 net->ipv6.sysctl.ip6_rt_gc_min_interval =
749 saved_rt_min_interval;
755 "ipv6: Neighbour table overflow.\n");
759 dst_set_neighbour(&rt->dst, neigh);
/*
 * Lighter sibling of rt6_alloc_cow(): clone 'ort' into a cached host
 * route for 'daddr', sharing (cloning) the original's neighbour entry
 * instead of resolving a new one.
 */
765 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr)
767 struct rt6_info *rt = ip6_rt_copy(ort);
769 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
770 rt->rt6i_dst.plen = 128;
771 rt->rt6i_flags |= RTF_CACHE;
772 rt->dst.flags |= DST_HOST;
773 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
/*
 * Core caching policy lookup: select a route via rt6_select(), and if
 * it is not already an RTF_CACHE entry, create a cow/clone host route,
 * insert it, and re-lookup on insert races.  Reachability scoring is
 * only demanded when forwarding is disabled.
 */
778 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
779 struct flowi6 *fl6, int flags)
781 struct fib6_node *fn;
782 struct rt6_info *rt, *nrt;
786 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
788 strict |= flags & RT6_LOOKUP_F_IFACE;
791 read_lock_bh(&table->tb6_lock);
794 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
797 rt = rt6_select(fn, oif, strict | reachable);
799 BACKTRACK(net, &fl6->saddr);
800 if (rt == net->ipv6.ip6_null_entry ||
801 rt->rt6i_flags & RTF_CACHE)
805 read_unlock_bh(&table->tb6_lock);
/* No bound neighbour yet: make a cow clone; non-host routes get a plain clone. */
807 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
808 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
809 else if (!(rt->dst.flags & DST_HOST))
810 nrt = rt6_alloc_clone(rt, &fl6->daddr);
814 dst_release(&rt->dst);
815 rt = nrt ? : net->ipv6.ip6_null_entry;
819 err = ip6_ins_rt(nrt);
828 * Race condition! In the gap, when table->tb6_lock was
829 * released someone could insert this route. Relookup.
831 dst_release(&rt->dst);
840 read_unlock_bh(&table->tb6_lock);
842 rt->dst.lastuse = jiffies;
/* Input-path policy lookup: key the core lookup on the incoming interface. */
848 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
849 struct flowi6 *fl6, int flags)
851 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/*
 * Route an incoming skb: build a flowi6 from the IPv6 header and attach
 * the resulting dst to the skb.  Scoped destinations force a strict
 * lookup (except on PIM register devices).
 */
854 void ip6_route_input(struct sk_buff *skb)
856 const struct ipv6hdr *iph = ipv6_hdr(skb);
857 struct net *net = dev_net(skb->dev);
858 int flags = RT6_LOOKUP_F_HAS_SADDR;
859 struct flowi6 fl6 = {
860 .flowi6_iif = skb->dev->ifindex,
863 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
864 .flowi6_mark = skb->mark,
865 .flowi6_proto = iph->nexthdr,
868 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
869 flags |= RT6_LOOKUP_F_IFACE;
871 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
/* Output-path policy lookup: key the core lookup on the outgoing interface. */
874 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
875 struct flowi6 *fl6, int flags)
877 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/*
 * Route an outgoing flow: strict lookup when the socket is bound to a
 * device or the destination is scoped; source-address preferences come
 * from the socket where available.
 */
880 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
885 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
886 flags |= RT6_LOOKUP_F_IFACE;
888 if (!ipv6_addr_any(&fl6->saddr))
889 flags |= RT6_LOOKUP_F_HAS_SADDR;
891 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
893 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
896 EXPORT_SYMBOL(ip6_route_output);
/*
 * Build a blackhole copy of 'dst_orig' on ip6_dst_blackhole_ops: same
 * keys/metrics/idev but both input and output discard packets.  Always
 * releases dst_orig; returns ERR_PTR(-ENOMEM) on allocation failure.
 */
898 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
900 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
901 struct dst_entry *new = NULL;
903 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
905 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
910 new->input = dst_discard;
911 new->output = dst_discard;
913 dst_copy_metrics(new, &ort->dst);
914 rt->rt6i_idev = ort->rt6i_idev;
916 in6_dev_hold(rt->rt6i_idev);
917 rt->rt6i_expires = 0;
919 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
/* The copy never expires on its own: RTF_EXPIRES is stripped. */
920 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
923 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
924 #ifdef CONFIG_IPV6_SUBTREES
925 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
931 dst_release(dst_orig);
932 return new ? new : ERR_PTR(-ENOMEM);
936 * Destination cache support functions
/*
 * dst_ops->check: the dst is valid while its fib6 node's serial number
 * still matches the cookie; refresh a stale peer binding when the peer
 * generation counter has moved on.
 */
939 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
943 rt = (struct rt6_info *) dst;
945 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
946 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
948 rt6_bind_peer(rt, 0);
949 rt->rt6i_peer_genid = rt6_peer_genid();
/* dst_ops->negative_advice: drop expired RTF_CACHE clones. */
956 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
958 struct rt6_info *rt = (struct rt6_info *) dst;
961 if (rt->rt6i_flags & RTF_CACHE) {
962 if (rt6_check_expired(rt)) {
/*
 * dst_ops->link_failure: report address unreachable to the sender, then
 * expire the cached clone immediately, or invalidate the default
 * route's fib6 node serial so cached dsts are rechecked.
 */
974 static void ip6_link_failure(struct sk_buff *skb)
978 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
980 rt = (struct rt6_info *) skb_dst(skb);
982 if (rt->rt6i_flags&RTF_CACHE) {
983 dst_set_expires(&rt->dst, 0);
984 rt->rt6i_flags |= RTF_EXPIRES;
985 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
986 rt->rt6i_node->fn_sernum = -1;
/*
 * dst_ops->update_pmtu: shrink the host route's MTU metric; below the
 * IPv6 minimum MTU (1280) keep sending with a fragment header instead
 * (RTAX_FEATURE_ALLFRAG).
 */
990 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
992 struct rt6_info *rt6 = (struct rt6_info*)dst;
/* Only host (/128) routes accept PMTU updates, and only downward. */
994 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
995 rt6->rt6i_flags |= RTF_MODIFIED;
996 if (mtu < IPV6_MIN_MTU) {
997 u32 features = dst_metric(dst, RTAX_FEATURES);
999 features |= RTAX_FEATURE_ALLFRAG;
1000 dst_metric_set(dst, RTAX_FEATURES, features);
1002 dst_metric_set(dst, RTAX_MTU, mtu);
/*
 * Default advertised MSS: path MTU minus IPv6 + TCP headers, clamped
 * below by the ip6_rt_min_advmss sysctl and above by the non-jumbogram
 * payload limit.
 */
1006 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1008 struct net_device *dev = dst->dev;
1009 unsigned int mtu = dst_mtu(dst);
1010 struct net *net = dev_net(dev);
1012 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1014 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1015 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1018 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1019 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1020 * IPV6_MAXPLEN is also valid and means: "any MSS,
1021 * rely only on pmtu discovery"
1023 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* Default MTU: the device's configured mtu6, or IPV6_MIN_MTU (1280). */
1028 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1030 unsigned int mtu = IPV6_MIN_MTU;
1031 struct inet6_dev *idev;
1034 idev = __in6_dev_get(dst->dev);
1036 mtu = idev->cnf.mtu6;
/*
 * Transient dst entries for outgoing ICMPv6/ND packets.  They are kept
 * on a private singly-linked list (not in the fib tree) guarded by
 * icmp6_dst_lock, and reaped by icmp6_dst_gc() once unreferenced.
 */
1042 static struct dst_entry *icmp6_dst_gc_list;
1043 static DEFINE_SPINLOCK(icmp6_dst_lock);
1045 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1046 struct neighbour *neigh,
1047 const struct in6_addr *addr)
1049 struct rt6_info *rt;
1050 struct inet6_dev *idev = in6_dev_get(dev);
1051 struct net *net = dev_net(dev);
1053 if (unlikely(idev == NULL))
1056 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1057 if (unlikely(rt == NULL)) {
/* Caller may pass a neighbour; otherwise resolve one for 'addr'. */
1065 neigh = ndisc_get_neigh(dev, addr);
1070 rt->rt6i_idev = idev;
1071 dst_set_neighbour(&rt->dst, neigh);
1072 atomic_set(&rt->dst.__refcnt, 1);
1073 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1074 rt->dst.output = ip6_output;
/* Chain onto the private GC list under the lock. */
1076 spin_lock_bh(&icmp6_dst_lock);
1077 rt->dst.next = icmp6_dst_gc_list;
1078 icmp6_dst_gc_list = &rt->dst;
1079 spin_unlock_bh(&icmp6_dst_lock);
1081 fib6_force_start_gc(net);
/* Reap unreferenced entries from the ICMPv6 dst list. */
1087 int icmp6_dst_gc(void)
1089 struct dst_entry *dst, **pprev;
1092 spin_lock_bh(&icmp6_dst_lock);
1093 pprev = &icmp6_dst_gc_list;
1095 while ((dst = *pprev) != NULL) {
1096 if (!atomic_read(&dst->__refcnt)) {
1105 spin_unlock_bh(&icmp6_dst_lock);
/* Apply 'func' to every entry on the ICMPv6 dst list (used for cleanup). */
1110 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1113 struct dst_entry *dst, **pprev;
1115 spin_lock_bh(&icmp6_dst_lock);
1116 pprev = &icmp6_dst_gc_list;
1117 while ((dst = *pprev) != NULL) {
1118 struct rt6_info *rt = (struct rt6_info *) dst;
1119 if (func(rt, arg)) {
1126 spin_unlock_bh(&icmp6_dst_lock);
/*
 * dst_ops->gc: run fib6 garbage collection, honouring the per-netns
 * min-interval / max-size / elasticity / timeout sysctls.  Returns
 * non-zero ("still over budget") when entries exceed rt_max_size.
 */
1129 static int ip6_dst_gc(struct dst_ops *ops)
1131 unsigned long now = jiffies;
1132 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1133 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1134 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1135 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1136 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1137 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
/* Rate-limit: skip GC if we ran recently and are within budget. */
1140 entries = dst_entries_get_fast(ops);
1141 if (time_after(rt_last_gc + rt_min_interval, now) &&
1142 entries <= rt_max_size)
1145 net->ipv6.ip6_rt_gc_expire++;
1146 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1147 net->ipv6.ip6_rt_last_gc = now;
1148 entries = dst_entries_get_slow(ops);
/* Adapt the expiry pressure: reset when under threshold, decay otherwise. */
1149 if (entries < ops->gc_thresh)
1150 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1152 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1153 return entries > rt_max_size;
1156 /* Clean host part of a prefix. Not necessary in radix tree,
1157 but results in cleaner routing tables.
1159 Remove it only when all the things will work!
/*
 * Effective hop limit for a dst: the RTAX_HOPLIMIT metric if set,
 * otherwise the device's inet6_dev hop_limit, otherwise the
 * namespace-wide devconf_all default.
 */
1162 int ip6_dst_hoplimit(struct dst_entry *dst)
1164 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1165 if (hoplimit == 0) {
1166 struct net_device *dev = dst->dev;
1167 struct inet6_dev *idev;
1170 idev = __in6_dev_get(dev);
1172 hoplimit = idev->cnf.hop_limit;
1174 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1179 EXPORT_SYMBOL(ip6_dst_hoplimit);
/*
 * Build an rt6_info from a fib6_config (netlink/ioctl route request),
 * validate it, and insert it via __ip6_ins_rt().  Handles reject
 * routes, gateway validation, preferred-source addresses, neighbour
 * binding, and per-route metrics.
 */
1185 int ip6_route_add(struct fib6_config *cfg)
1188 struct net *net = cfg->fc_nlinfo.nl_net;
1189 struct rt6_info *rt = NULL;
1190 struct net_device *dev = NULL;
1191 struct inet6_dev *idev = NULL;
1192 struct fib6_table *table;
/* Prefix lengths are at most 128 bits each. */
1195 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1197 #ifndef CONFIG_IPV6_SUBTREES
1198 if (cfg->fc_src_len)
1201 if (cfg->fc_ifindex) {
1203 dev = dev_get_by_index(net, cfg->fc_ifindex);
1206 idev = in6_dev_get(dev);
1211 if (cfg->fc_metric == 0)
1212 cfg->fc_metric = IP6_RT_PRIO_USER;
1214 table = fib6_new_table(net, cfg->fc_table);
1215 if (table == NULL) {
1220 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1227 rt->dst.obsolete = -1;
1228 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1229 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1232 if (cfg->fc_protocol == RTPROT_UNSPEC)
1233 cfg->fc_protocol = RTPROT_BOOT;
1234 rt->rt6i_protocol = cfg->fc_protocol;
/* Pick the input handler from the destination's address type. */
1236 addr_type = ipv6_addr_type(&cfg->fc_dst);
1238 if (addr_type & IPV6_ADDR_MULTICAST)
1239 rt->dst.input = ip6_mc_input;
1240 else if (cfg->fc_flags & RTF_LOCAL)
1241 rt->dst.input = ip6_input;
1243 rt->dst.input = ip6_forward;
1245 rt->dst.output = ip6_output;
1247 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1248 rt->rt6i_dst.plen = cfg->fc_dst_len;
1249 if (rt->rt6i_dst.plen == 128)
1250 rt->dst.flags |= DST_HOST;
1252 #ifdef CONFIG_IPV6_SUBTREES
1253 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1254 rt->rt6i_src.plen = cfg->fc_src_len;
1257 rt->rt6i_metric = cfg->fc_metric;
1259 /* We cannot add true routes via loopback here,
1260 they would result in kernel looping; promote them to reject routes
1262 if ((cfg->fc_flags & RTF_REJECT) ||
1263 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1264 && !(cfg->fc_flags&RTF_LOCAL))) {
1265 /* hold loopback dev/idev if we haven't done so. */
1266 if (dev != net->loopback_dev) {
1271 dev = net->loopback_dev;
1273 idev = in6_dev_get(dev);
1279 rt->dst.output = ip6_pkt_discard_out;
1280 rt->dst.input = ip6_pkt_discard;
1281 rt->dst.error = -ENETUNREACH;
1282 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* Gateway routes: the next hop must normally be link-local unicast. */
1286 if (cfg->fc_flags & RTF_GATEWAY) {
1287 const struct in6_addr *gw_addr;
1290 gw_addr = &cfg->fc_gateway;
1291 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1292 gwa_type = ipv6_addr_type(gw_addr);
1294 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1295 struct rt6_info *grt;
1297 /* IPv6 strictly inhibits using not link-local
1298 addresses as nexthop address.
1299 Otherwise, router will not able to send redirects.
1300 It is very good, but in some (rare!) circumstances
1301 (SIT, PtP, NBMA NOARP links) it is handy to allow
1302 some exceptions. --ANK
1305 if (!(gwa_type&IPV6_ADDR_UNICAST))
/* The gateway itself must be reachable by a non-gateway route. */
1308 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1310 err = -EHOSTUNREACH;
1314 if (dev != grt->rt6i_dev) {
1315 dst_release(&grt->dst);
1319 dev = grt->rt6i_dev;
1320 idev = grt->rt6i_idev;
1322 in6_dev_hold(grt->rt6i_idev);
1324 if (!(grt->rt6i_flags&RTF_GATEWAY))
1326 dst_release(&grt->dst);
1332 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
/* Preferred source address must belong to the chosen device. */
1340 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1341 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1345 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1346 rt->rt6i_prefsrc.plen = 128;
1348 rt->rt6i_prefsrc.plen = 0;
/* Bind a neighbour entry now for gateway / no-next-hop routes. */
1350 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1351 struct neighbour *neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1352 if (IS_ERR(neigh)) {
1353 err = PTR_ERR(neigh);
1356 dst_set_neighbour(&rt->dst, neigh);
1359 rt->rt6i_flags = cfg->fc_flags;
/* Apply caller-supplied per-route metrics (RTAX_*). */
1366 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1367 int type = nla_type(nla);
1370 if (type > RTAX_MAX) {
1375 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1381 rt->rt6i_idev = idev;
1382 rt->rt6i_table = table;
1384 cfg->fc_nlinfo.nl_net = dev_net(dev);
1386 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * Remove 'rt' from its fib6 table under the table write lock; the null
 * route itself can never be deleted.  Drops the caller's reference.
 */
1398 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1401 struct fib6_table *table;
1402 struct net *net = dev_net(rt->rt6i_dev);
1404 if (rt == net->ipv6.ip6_null_entry) {
1409 table = rt->rt6i_table;
1410 write_lock_bh(&table->tb6_lock);
1411 err = fib6_del(rt, info);
1412 write_unlock_bh(&table->tb6_lock);
1415 dst_release(&rt->dst);
/* Convenience wrapper supplying netlink info derived from the route's netns. */
1419 int ip6_del_rt(struct rt6_info *rt)
1421 struct nl_info info = {
1422 .nl_net = dev_net(rt->rt6i_dev),
1424 return __ip6_del_rt(rt, &info);
/*
 * Delete the route matching a fib6_config: locate the exact prefix
 * node, then filter its entries by ifindex, gateway, and metric.
 */
1427 static int ip6_route_del(struct fib6_config *cfg)
1429 struct fib6_table *table;
1430 struct fib6_node *fn;
1431 struct rt6_info *rt;
1434 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1438 read_lock_bh(&table->tb6_lock);
1440 fn = fib6_locate(&table->tb6_root,
1441 &cfg->fc_dst, cfg->fc_dst_len,
1442 &cfg->fc_src, cfg->fc_src_len);
1445 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1446 if (cfg->fc_ifindex &&
1447 (rt->rt6i_dev == NULL ||
1448 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1450 if (cfg->fc_flags & RTF_GATEWAY &&
1451 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1453 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* Matched: drop the read lock before taking the write lock in __ip6_del_rt(). */
1456 read_unlock_bh(&table->tb6_lock);
1458 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1461 read_unlock_bh(&table->tb6_lock);
/* Flow key for redirect validation: the base flow plus the advertised gateway. */
1469 struct ip6rd_flowi {
1471 struct in6_addr gateway;
/*
 * Find the route a redirect applies to: walk the destination's fib6
 * node and accept only a non-expired gateway route whose device and
 * gateway match the redirect's source.
 */
1474 static struct rt6_info *__ip6_route_redirect(struct net *net,
1475 struct fib6_table *table,
1479 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1480 struct rt6_info *rt;
1481 struct fib6_node *fn;
1484 * Get the "current" route for this destination and
1485 * check if the redirect has come from approriate router.
1487 * RFC 2461 specifies that redirects should only be
1488 * accepted if they come from the nexthop to the target.
1489 * Due to the way the routes are chosen, this notion
1490 * is a bit fuzzy and one might need to check all possible
1494 read_lock_bh(&table->tb6_lock);
1495 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1497 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1499 * Current route is on-link; redirect is always invalid.
1501 * Seems, previous statement is not true. It could
1502 * be node, which looks for us as on-link (f.e. proxy ndisc)
1503 * But then router serving it might decide, that we should
1504 * know truth 8)8) --ANK (980726).
1506 if (rt6_check_expired(rt))
1508 if (!(rt->rt6i_flags & RTF_GATEWAY))
1510 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1512 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
/* Nothing matched: return the null route (triggering BACKTRACK). */
1518 rt = net->ipv6.ip6_null_entry;
1519 BACKTRACK(net, &fl6->saddr);
1523 read_unlock_bh(&table->tb6_lock);
/*
 * Build the ip6rd_flowi for a received redirect and run the policy
 * lookup with __ip6_route_redirect() as the resolver.
 */
1528 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1529 const struct in6_addr *src,
1530 const struct in6_addr *gateway,
1531 struct net_device *dev)
1533 int flags = RT6_LOOKUP_F_HAS_SADDR;
1534 struct net *net = dev_net(dev);
1535 struct ip6rd_flowi rdfl = {
1537 .flowi6_oif = dev->ifindex,
1543 ipv6_addr_copy(&rdfl.gateway, gateway);
1545 if (rt6_need_strict(dest))
1546 flags |= RT6_LOOKUP_F_IFACE;
1548 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1549 flags, __ip6_route_redirect);
/*
 * Handle an accepted ICMPv6 redirect: validate the announcing router,
 * update its neighbour entry from the supplied link-layer address, and
 * install a cached host route for 'dest' via the new next hop.
 */
1552 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1553 const struct in6_addr *saddr,
1554 struct neighbour *neigh, u8 *lladdr, int on_link)
1556 struct rt6_info *rt, *nrt = NULL;
1557 struct netevent_redirect netevent;
1558 struct net *net = dev_net(neigh->dev);
1560 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1562 if (rt == net->ipv6.ip6_null_entry) {
1563 if (net_ratelimit())
1564 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1565 "for redirect target\n");
1570 * We have finally decided to accept it.
/* Redirects from a router imply it remains a router unless on_link. */
1573 neigh_update(neigh, lladdr, NUD_STALE,
1574 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1575 NEIGH_UPDATE_F_OVERRIDE|
1576 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1577 NEIGH_UPDATE_F_ISROUTER))
1581 * Redirect received -> path was valid.
1582 * Look, redirects are sent only in response to data packets,
1583 * so that this nexthop apparently is reachable. --ANK
1585 dst_confirm(&rt->dst);
1587 /* Duplicate redirect: silently ignore. */
1588 if (neigh == dst_get_neighbour_raw(&rt->dst))
/* Clone the route towards the new next hop as a dynamic cache entry. */
1591 nrt = ip6_rt_copy(rt);
1595 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1597 nrt->rt6i_flags &= ~RTF_GATEWAY;
1599 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1600 nrt->rt6i_dst.plen = 128;
1601 nrt->dst.flags |= DST_HOST;
1603 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1604 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1606 if (ip6_ins_rt(nrt))
/* Notify listeners (e.g. offload drivers) of the dst replacement. */
1609 netevent.old = &rt->dst;
1610 netevent.new = &nrt->dst;
1611 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* The superseded cached route can be removed immediately. */
1613 if (rt->rt6i_flags&RTF_CACHE) {
1619 dst_release(&rt->dst);
1623 * Handle ICMP "packet too big" messages
1624 * i.e. Path MTU discovery
/*
 * Apply a Packet Too Big report for @daddr: clamp the reported pmtu to
 * IPV6_MIN_MTU, then either update the already-cloned cached route in
 * place or install an expiring clone/COW copy carrying the new MTU.
 * NOTE(review): several guard/return lines are elided in this view.
 */
static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
			     struct net *net, u32 pmtu, int ifindex)
	struct rt6_info *rt, *nrt;

	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);

	if (rt6_check_expired(rt)) {

	/* only a *decrease* of the path MTU is ever recorded */
	if (pmtu >= dst_mtu(&rt->dst))

	if (pmtu < IPV6_MIN_MTU) {
		/*
		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
		 * MTU (1280) and a fragment header should always be included
		 * after a node receiving Too Big message reporting PMTU is
		 * less than the IPv6 Minimum Link MTU.
		 */
		pmtu = IPV6_MIN_MTU;

	/* New mtu received -> path was valid.
	   They are sent only in response to data packets,
	   so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	/* Host route. If it is static, it would be better
	   not to override it, but add new one, so that
	   when cache entry will expire old pmtu
	   would return automatically.
	 */
	if (rt->rt6i_flags & RTF_CACHE) {
		dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
			/* ALLFRAG: always emit a fragment header on this path
			 * (NOTE(review): enclosing condition elided here) */
			u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
			features |= RTAX_FEATURE_ALLFRAG;
			dst_metric_set(&rt->dst, RTAX_FEATURES, features);
		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;

	/*
	   Two cases are possible:
	   1. It is connected route. Action: COW
	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
	 */
	if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, daddr, saddr);
		nrt = rt6_alloc_clone(rt, daddr);

		dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
			u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
			features |= RTAX_FEATURE_ALLFRAG;
			dst_metric_set(&nrt->dst, RTAX_FEATURES, features);

		/* According to RFC 1981, detecting PMTU increase shouldn't
		 * happen within 5 mins, the recommended timer is 10 mins.
		 * Here this route expiration time is set to ip6_rt_mtu_expires
		 * which is 10 mins. After 10 mins the decreased pmtu is expired
		 * and detecting PMTU increase will be automatically happened.
		 */
		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;

	dst_release(&rt->dst);
/*
 * Entry point for ICMPv6 Packet Too Big handling: apply the new pmtu
 * both for an interface-unrestricted lookup and for the interface that
 * received the error.
 */
void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
			struct net_device *dev, u32 pmtu)
	struct net *net = dev_net(dev);

	/*
	 * RFC 1981 states that a node "MUST reduce the size of the packets it
	 * is sending along the path" that caused the Packet Too Big message.
	 * Since it's not possible in the general case to determine which
	 * interface was used to send the original packet, we update the MTU
	 * on the interface that will be used to send future packets. We also
	 * update the MTU on the interface that received the Packet Too Big in
	 * case the original packet was forced out that interface with
	 * SO_BINDTODEVICE or similar. This is the next best thing to the
	 * correct behaviour, which would be to update the MTU on all
	 * interfaces.
	 */
	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1734 * Misc support functions
/*
 * Allocate a fresh rt6_info and copy the routing state of @ort into it.
 * The copy starts unexpiring (RTF_EXPIRES cleared, rt6i_expires = 0)
 * and with metric 0; callers adjust flags/dst afterwards.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
	struct net *net = dev_net(ort->rt6i_dev);
	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,

		rt->dst.input = ort->dst.input;
		rt->dst.output = ort->dst.output;

		dst_copy_metrics(&rt->dst, &ort->dst);
		rt->dst.error = ort->dst.error;
		rt->rt6i_idev = ort->rt6i_idev;
			/* shared inet6_dev: take our own reference */
			in6_dev_hold(rt->rt6i_idev);
		rt->dst.lastuse = jiffies;
		rt->rt6i_expires = 0;

		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
		memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
		rt->rt6i_table = ort->rt6i_table;
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Find a route learnt from an RA Route Information option
 * (@prefix/@prefixlen via gateway @gwaddr on interface @ifindex) in the
 * RT6_TABLE_INFO table; NULL when not present.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex)
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, RT6_TABLE_INFO);

	write_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);

	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_dev->ifindex != ifindex)
		/* must be a RA-learnt (ROUTEINFO) gateway route */
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
	write_unlock_bh(&table->tb6_lock);
/*
 * Install a route from an RA Route Information option and return the
 * resulting table entry.  Prefix length 0 is treated as a default route.
 */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
	struct fib6_config cfg = {
		.fc_table = RT6_TABLE_INFO,
		.fc_metric = IP6_RT_PRIO_USER,
		.fc_ifindex = ifindex,
		.fc_dst_len = prefixlen,
		.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
			    RTF_UP | RTF_PREF(pref),
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,

	ipv6_addr_copy(&cfg.fc_dst, prefix);
	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);

	/* We should treat it as a default route if prefix length is 0. */
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg);

	/* re-lookup so the caller gets the entry actually in the table */
	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * Find the RA-learnt default route via router @addr on @dev in the
 * RT6_TABLE_DFLT table, or NULL when absent.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);

	write_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
		/* must match device, be addrconf+default, and same gateway */
		if (dev == rt->rt6i_dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
	write_unlock_bh(&table->tb6_lock);
/*
 * Install a default route towards router @gwaddr (learnt from a Router
 * Advertisement) and return the entry actually inserted in the table.
 */
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
				     struct net_device *dev,
	struct fib6_config cfg = {
		.fc_table = RT6_TABLE_DFLT,
		.fc_metric = IP6_RT_PRIO_USER,
		.fc_ifindex = dev->ifindex,
		.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
			    RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = dev_net(dev),

	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);

	ip6_route_add(&cfg);

	return rt6_get_dflt_router(gwaddr, dev);
/*
 * Delete every RA-learnt default route from the DFLT table.
 * NOTE(review): the deletion/restart logic inside the loop is elided in
 * this view; the lock is dropped before a matching route is removed.
 */
void rt6_purge_dflt_routers(struct net *net)
	struct rt6_info *rt;
	struct fib6_table *table;

	/* NOTE: Keep consistent with rt6_get_dflt_router */
	table = fib6_get_table(net, RT6_TABLE_DFLT);

	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
			read_unlock_bh(&table->tb6_lock);
	read_unlock_bh(&table->tb6_lock);
1899 static void rtmsg_to_fib6_config(struct net *net,
1900 struct in6_rtmsg *rtmsg,
1901 struct fib6_config *cfg)
1903 memset(cfg, 0, sizeof(*cfg));
1905 cfg->fc_table = RT6_TABLE_MAIN;
1906 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1907 cfg->fc_metric = rtmsg->rtmsg_metric;
1908 cfg->fc_expires = rtmsg->rtmsg_info;
1909 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1910 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1911 cfg->fc_flags = rtmsg->rtmsg_flags;
1913 cfg->fc_nlinfo.nl_net = net;
1915 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1916 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1917 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
/*
 * ioctl handler for SIOCADDRT/SIOCDELRT: requires CAP_NET_ADMIN,
 * copies the legacy in6_rtmsg from userspace, converts it and performs
 * the add/delete.  NOTE(review): switch scaffolding elided in this view.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;

	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!capable(CAP_NET_ADMIN))
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
			err = ip6_route_add(&cfg);
			err = ip6_route_del(&cfg);
1958 * Drop the packet on the floor
/*
 * Common drop path for null/prohibit routes: bump the matching MIB
 * counter, then send a Destination Unreachable with the given @code.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination counts as address error */
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input for the null route: drop and count as input no-route. */
static int ip6_pkt_discard(struct sk_buff *skb)
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output for the null route: attribute to the egress device, drop. */
static int ip6_pkt_discard_out(struct sk_buff *skb)
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1995 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* dst.input for the prohibit route: drop with "administratively prohibited". */
static int ip6_pkt_prohibit(struct sk_buff *skb)
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst.output for the prohibit route: set egress device, drop as prohibited. */
static int ip6_pkt_prohibit_out(struct sk_buff *skb)
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2011 * Allocate a dst for local (unicast / anycast) address.
/*
 * Allocate a host route for a local unicast or anycast address.  The
 * route is bound to the loopback device, given a neighbour entry, and
 * placed in the LOCAL table.  Returns ERR_PTR() on failure.
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
	struct net *net = dev_net(idev->dev);
	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
					    net->loopback_dev, 0);
	struct neighbour *neigh;

		if (net_ratelimit())
			pr_warning("IPv6: Maximum number of routes reached,"
				   " consider increasing route/max_size.\n");
		return ERR_PTR(-ENOMEM);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;
	rt->dst.obsolete = -1;

	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
		rt->rt6i_flags |= RTF_ANYCAST;
		rt->rt6i_flags |= RTF_LOCAL;
	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
	if (IS_ERR(neigh)) {
		/* propagate the neighbour allocation error */
		return ERR_CAST(neigh);
	dst_set_neighbour(&rt->dst, neigh);

	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
	rt->rt6i_dst.plen = 128;	/* host route */
	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);

	atomic_set(&rt->dst.__refcnt, 1);
/*
 * Pick a source address for @daddr: the route's configured prefsrc if
 * set, otherwise regular source-address selection on the route's device.
 * NOTE(review): a "prefs" parameter line is elided in this view.
 */
int ip6_route_get_saddr(struct net *net,
			struct rt6_info *rt,
			const struct in6_addr *daddr,
			struct in6_addr *saddr)
	struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);

	if (rt->rt6i_prefsrc.plen)
		ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
					 daddr, prefs, saddr);
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct in6_addr *addr;	/* the address being removed */
/*
 * fib6_clean_all() callback: clear the prefsrc of any route whose
 * preferred source equals the address being deleted (null entry spared).
 */
static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
	    rt != net->ipv6.ip6_null_entry &&
	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
		/* remove prefsrc entry */
		rt->rt6i_prefsrc.plen = 0;
/* Strip @ifp's address from the prefsrc of every route that uses it. */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,

	fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Argument for fib6_ifdown(): device going down (net member elided here). */
struct arg_dev_net {
	struct net_device *dev;
/*
 * fib6_clean_all()/icmp6_clean_all() callback: select routes on @dev
 * (or on every device when dev == NULL) for deletion, sparing the
 * per-net null entry.
 */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;

	if ((rt->rt6i_dev == dev || dev == NULL) &&
	    rt != adn->net->ipv6.ip6_null_entry) {
		RT6_TRACE("deleted by ifdown %p\n", rt);
/* Purge all routes over @dev (NULL = all devices) from fib and icmp6. */
void rt6_ifdown(struct net *net, struct net_device *dev)
	struct arg_dev_net adn = {

	fib6_clean_all(net, fib6_ifdown, 0, &adn);
	icmp6_clean_all(fib6_ifdown, &adn);
/* Argument for rt6_mtu_change_route(): device whose MTU changed. */
struct rt6_mtu_change_arg
	struct net_device *dev;
/*
 * fib6_clean_all() callback for a device MTU change: update the cached
 * route MTU unless it is administratively locked.
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/
	idev = __in6_dev_get(arg->dev);

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)

	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be leading to
	   PMTU discovery.
	 */
	if (rt->rt6i_dev == arg->dev &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
	    (dst_mtu(&rt->dst) >= arg->mtu ||
	     (dst_mtu(&rt->dst) < arg->mtu &&
	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Propagate a device MTU change to every route over that device. */
void rt6_mtu_change(struct net_device *dev, unsigned mtu)
	struct rt6_mtu_change_arg arg = {

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for RTM_*ROUTE requests. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
/*
 * Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * fib6_config, validating attributes against rtm_ipv6_policy.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
	struct nlattr *tb[RTA_MAX+1];

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);

	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;

	if (rtm->rtm_type == RTN_UNREACHABLE)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
		cfg->fc_flags |= RTF_GATEWAY;

		/* prefix rounded up to whole bytes */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);

		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);

	if (tb[RTA_PREFSRC])
		nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);

		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		/* fc_mx points into the request skb; valid for this call only */
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);

		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* RTM_DELROUTE handler: parse the request and delete the route. */
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
	struct fib6_config cfg;

	err = rtm_to_fib6_config(skb, nlh, &cfg);

	return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: parse the request and add the route. */
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
	struct fib6_config cfg;

	err = rtm_to_fib6_config(skb, nlh, &cfg);

	return ip6_route_add(&cfg);
2301 static inline size_t rt6_nlmsg_size(void)
2303 return NLMSG_ALIGN(sizeof(struct rtmsg))
2304 + nla_total_size(16) /* RTA_SRC */
2305 + nla_total_size(16) /* RTA_DST */
2306 + nla_total_size(16) /* RTA_GATEWAY */
2307 + nla_total_size(16) /* RTA_PREFSRC */
2308 + nla_total_size(4) /* RTA_TABLE */
2309 + nla_total_size(4) /* RTA_IIF */
2310 + nla_total_size(4) /* RTA_OIF */
2311 + nla_total_size(4) /* RTA_PRIORITY */
2312 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2313 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * Serialize one rt6_info into an RTM message on @skb.
 * @prefix: nonzero means the caller only wants RTF_PREFIX_RT routes;
 * other routes are skipped and counted as success.  On attribute
 * overflow the message is cancelled and an error returned.
 * NOTE(review): several brace/label lines are elided in this view.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
	struct nlmsghdr *nlh;
	struct neighbour *n;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
		table = rt->rt6i_table->tb6_id;
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	NLA_PUT_U32(skb, RTA_TABLE, table);
	/* map route flags onto the rtnetlink route type */
	if (rt->rt6i_flags&RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_flags&RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags&RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF)
		rtm->rtm_protocol = RTPROT_KERNEL;
	else if (rt->rt6i_flags&RTF_DEFAULT)
		rtm->rtm_protocol = RTPROT_RA;

	if (rt->rt6i_flags&RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

		/* exact destination requested: report as a host route */
		NLA_PUT(skb, RTA_DST, 16, dst);
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
		NLA_PUT(skb, RTA_SRC, 16, src);
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len)
		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#ifdef CONFIG_IPV6_MROUTE
	if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
		int err = ip6mr_get_route(net, skb, rtm, nowait);
				goto nla_put_failure;
			if (err == -EMSGSIZE)
				goto nla_put_failure;
		NLA_PUT_U32(skb, RTA_IIF, iif);
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
		NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	n = dst_get_neighbour(&rt->dst);
		if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
			goto nla_put_failure;

		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);

	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);

	/* report remaining lifetime, clamped to INT_MAX jiffies */
	if (!(rt->rt6i_flags & RTF_EXPIRES))
	else if (rt->rt6i_expires - jiffies < INT_MAX)
		expires = rt->rt6i_expires - jiffies;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
			       expires, rt->dst.error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

	nlmsg_cancel(skb, nlh);
/*
 * fib6 dump callback: emit one route, honouring an RTM_F_PREFIX filter
 * when the request header carries one.
 */
int rt6_dump_route(struct rt6_info *rt, void *p_arg)
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;

	return rt6_fill_node(arg->net,
		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
		     prefix, 0, NLM_F_MULTI);
/*
 * RTM_GETROUTE handler: look up the route for the requested addresses
 * and unicast the serialized result back to the requesting socket.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	struct rt6_info *rt;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);

	memset(&fl6, 0, sizeof(fl6));

		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))

		ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));

		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))

		ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));

		iif = nla_get_u32(tb[RTA_IIF]);

		fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);

		/* an incoming-interface lookup requires the device to exist */
		struct net_device *dev;
		dev = __dev_get_by_index(net, iif);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));

	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
	skb_dst_set(skb, &rt->dst);	/* skb now owns the route reference */

	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
			    nlh->nlmsg_seq, 0, 0, 0);

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * Broadcast a route change (@event = RTM_NEWROUTE/RTM_DELROUTE) to the
 * RTNLGRP_IPV6_ROUTE multicast group; on failure notify listeners via
 * the socket error mechanism.
 */
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
	struct sk_buff *skb;
	struct net *net = info->nl_net;

	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
			    event, info->pid, seq, 0, 0, 0);
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);

	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());

	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * Netdevice notifier: when a loopback device registers in a namespace,
 * point that namespace's special routes (null and, with multiple
 * tables, prohibit/blackhole) at it.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *data)
	struct net_device *dev = (struct net_device *)data;
	struct net *net = dev_net(dev);

	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2593 #ifdef CONFIG_PROC_FS
/*
 * /proc/net/ipv6_route: print one route as a fixed-width line
 * (dst, src, next hop, metric, refcnt, use, flags, device name).
 */
static int rt6_info_route(struct rt6_info *rt, void *p_arg)
	struct seq_file *m = p_arg;
	struct neighbour *n;

	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);

#ifdef CONFIG_IPV6_SUBTREES
	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
	/* no subtrees: emit an all-zero source column */
	seq_puts(m, "00000000000000000000000000000000 00 ");

	n = dst_get_neighbour(&rt->dst);
		seq_printf(m, "%pi6", n->primary_key);
		seq_puts(m, "00000000000000000000000000000000");

	seq_printf(m, " %08x %08x %08x %08x %8s\n",
		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
		   rt->dst.__use, rt->rt6i_flags,
		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
/* seq_file show: walk the whole fib, printing via rt6_info_route(). */
static int ipv6_route_show(struct seq_file *m, void *v)
	struct net *net = (struct net *)m->private;
	fib6_clean_all(net, rt6_info_route, 0, m);
/* open() for /proc/net/ipv6_route (per-net, single-shot seq file). */
static int ipv6_route_open(struct inode *inode, struct file *file)
	return single_open_net(inode, file, ipv6_route_show);
/* File operations for /proc/net/ipv6_route. */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.llseek		= seq_lseek,
	.release	= single_release_net,
/* /proc/net/rt6_stats: seven space-separated hex fib counters. */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   net->ipv6.rt6_stats->fib_rt_alloc,
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for /proc/net/rt6_stats. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
	return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats. */
static const struct file_operations rt6_stats_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= rt6_stats_seq_open,
	.llseek		= seq_lseek,
	.release	= single_release_net,
2678 #endif /* CONFIG_PROC_FS */
2680 #ifdef CONFIG_SYSCTL
/*
 * sysctl net.ipv6.route.flush handler: a write triggers an immediate
 * garbage-collection run; flush_delay <= 0 forces all entries to expire.
 */
int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the per-net net.ipv6.route.* sysctl table; cloned and
 * re-pointed at per-net storage by ipv6_route_sysctl_init() (whose
 * index assignments must stay in sync with the order here).
 */
ctl_table ipv6_route_table_template[] = {
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.proc_handler	= ipv6_sysctl_rtcache_flush
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_ms_jiffies,
/*
 * Clone the sysctl template for one network namespace and point every
 * entry's .data at that namespace's variables.  The numeric indices
 * must match the order of ipv6_route_table_template[].
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),

		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* flush handler needs the net */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * Per-net init: copy the dst ops template, clone the special route
 * templates (null, plus prohibit/blackhole with multiple tables) and
 * seed the routing sysctl defaults.  Error labels unwind allocations
 * in reverse order.
 */
static int __net_init ip6_route_net_init(struct net *net)
	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);

	/* routing sysctl defaults for this namespace */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-net teardown: free the cloned special routes and dst counters. */
static void __net_exit ip6_route_net_exit(struct net *net)
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-net init: create the /proc/net route entries. */
static int __net_init ip6_route_net_init_late(struct net *net)
#ifdef CONFIG_PROC_FS
	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
/* Late per-net teardown: remove the /proc/net route entries. */
static void __net_exit ip6_route_net_exit_late(struct net *net)
#ifdef CONFIG_PROC_FS
	proc_net_remove(net, "ipv6_route");
	proc_net_remove(net, "rt6_stats");
/* Core per-net operations for the IPv6 routing subsystem. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
/* Late per-net operations (proc entries; registered after fib/rules). */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
/* Netdevice notifier hooking ip6_route_dev_notify(). */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
/*
 * Subsystem init: slab cache, dst-entry counters, pernet subsystems,
 * fib6 rules, rtnetlink handlers and the netdevice notifier.  The
 * error labels unwind in reverse order of setup.
 */
int __init ip6_route_init(void)
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ip6_route_net_ops);
		goto out_dst_entries;

	/* blackhole dsts share the same slab cache */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);

		goto out_register_subsys;

	ret = fib6_rules_init();

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
		goto fib6_rules_init;

	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
		goto out_register_late_subsys;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/*
 * ip6_route_cleanup - module unload counterpart of ip6_route_init().
 *
 * Tears down in reverse order of initialization: the netdevice notifier,
 * the late (procfs) pernet ops, fib rules, the core pernet ops, the
 * blackhole dst accounting, and finally the rt6_info slab cache.
 *
 * NOTE(review): the extraction is missing lines here (original numbering
 * jumps 2990->2992 and 2994->2997): the braces and, presumably, the
 * xfrm6/fib6 teardown calls between fib6_rules_cleanup() and
 * unregister_pernet_subsys() are not visible — confirm against the full
 * file before relying on this. Only comments added; code left as-is.
 */
2990 void ip6_route_cleanup(void)
2992 unregister_netdevice_notifier(&ip6_route_dev_notifier);
2993 unregister_pernet_subsys(&ip6_route_net_late_ops);
2994 fib6_rules_cleanup();
2997 unregister_pernet_subsys(&ip6_route_net_ops);
2998 dst_entries_destroy(&ip6_dst_blackhole_ops);
/* Destroy the slab cache last: all rt6_info objects must be freed first. */
2999 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);