2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
27 #include <linux/capability.h>
28 #include <linux/config.h>
29 #include <linux/errno.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/init.h>
39 #include <linux/netlink.h>
40 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
54 #include <linux/rtnetlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
64 /* Set to 3 to get tracing. */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
72 #define RT6_TRACE(x...) do { ; } while (0)
75 #define CLONE_OFFLINK_ROUTE 0
77 #define RT6_SELECT_F_IFACE 0x1
78 #define RT6_SELECT_F_REACHABLE 0x2
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(void);
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct sk_buff *skb);
98 static void ip6_link_failure(struct sk_buff *skb);
99 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 struct in6_addr *gwaddr, int ifindex,
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 struct in6_addr *gwaddr, int ifindex);
109 static struct dst_ops ip6_dst_ops = {
111 .protocol = __constant_htons(ETH_P_IPV6),
114 .check = ip6_dst_check,
115 .destroy = ip6_dst_destroy,
116 .ifdown = ip6_dst_ifdown,
117 .negative_advice = ip6_negative_advice,
118 .link_failure = ip6_link_failure,
119 .update_pmtu = ip6_rt_update_pmtu,
120 .entry_size = sizeof(struct rt6_info),
123 struct rt6_info ip6_null_entry = {
126 .__refcnt = ATOMIC_INIT(1),
128 .dev = &loopback_dev,
130 .error = -ENETUNREACH,
131 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
132 .input = ip6_pkt_discard,
133 .output = ip6_pkt_discard_out,
135 .path = (struct dst_entry*)&ip6_null_entry,
138 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
139 .rt6i_metric = ~(u32) 0,
140 .rt6i_ref = ATOMIC_INIT(1),
143 struct fib6_node ip6_routing_table = {
144 .leaf = &ip6_null_entry,
145 .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
148 /* Protects all the ip6 fib */
150 DEFINE_RWLOCK(rt6_lock);
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
156 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
159 static void ip6_dst_destroy(struct dst_entry *dst)
161 struct rt6_info *rt = (struct rt6_info *)dst;
162 struct inet6_dev *idev = rt->rt6i_idev;
165 rt->rt6i_idev = NULL;
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
173 struct rt6_info *rt = (struct rt6_info *)dst;
174 struct inet6_dev *idev = rt->rt6i_idev;
176 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178 if (loopback_idev != NULL) {
179 rt->rt6i_idev = loopback_idev;
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
187 return (rt->rt6i_flags & RTF_EXPIRES &&
188 time_after(jiffies, rt->rt6i_expires));
192 * Route lookup. rt6_lock (read or write) is assumed to be held by the caller.
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
199 struct rt6_info *local = NULL;
200 struct rt6_info *sprt;
203 for (sprt = rt; sprt; sprt = sprt->u.next) {
204 struct net_device *dev = sprt->rt6i_dev;
205 if (dev->ifindex == oif)
207 if (dev->flags & IFF_LOOPBACK) {
208 if (sprt->rt6i_idev == NULL ||
209 sprt->rt6i_idev->dev->ifindex != oif) {
212 if (local && (!oif ||
213 local->rt6i_idev->dev->ifindex == oif))
224 return &ip6_null_entry;
229 #ifdef CONFIG_IPV6_ROUTER_PREF
230 static void rt6_probe(struct rt6_info *rt)
232 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
234 * Okay, this does not seem to be appropriate
235 * for now, however, we need to check if it
236 * is really so; aka Router Reachability Probing.
238 * Router Reachability Probe MUST be rate-limited
239 * to no more than one per minute.
241 if (!neigh || (neigh->nud_state & NUD_VALID))
243 read_lock_bh(&neigh->lock);
244 if (!(neigh->nud_state & NUD_VALID) &&
245 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
246 struct in6_addr mcaddr;
247 struct in6_addr *target;
249 neigh->updated = jiffies;
250 read_unlock_bh(&neigh->lock);
252 target = (struct in6_addr *)&neigh->primary_key;
253 addrconf_addr_solict_mult(target, &mcaddr);
254 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
256 read_unlock_bh(&neigh->lock);
259 static inline void rt6_probe(struct rt6_info *rt)
266 * Default Router Selection (RFC 2461 6.3.6)
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
270 struct net_device *dev = rt->rt6i_dev;
271 if (!oif || dev->ifindex == oif)
273 if ((dev->flags & IFF_LOOPBACK) &&
274 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
279 static int inline rt6_check_neigh(struct rt6_info *rt)
281 struct neighbour *neigh = rt->rt6i_nexthop;
284 read_lock_bh(&neigh->lock);
285 if (neigh->nud_state & NUD_VALID)
287 read_unlock_bh(&neigh->lock);
292 static int rt6_score_route(struct rt6_info *rt, int oif,
295 int m = rt6_check_dev(rt, oif);
296 if (!m && (strict & RT6_SELECT_F_IFACE))
298 #ifdef CONFIG_IPV6_ROUTER_PREF
299 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
301 if (rt6_check_neigh(rt))
303 else if (strict & RT6_SELECT_F_REACHABLE)
308 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
311 struct rt6_info *match = NULL, *last = NULL;
312 struct rt6_info *rt, *rt0 = *head;
316 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
317 __FUNCTION__, head, head ? *head : NULL, oif);
319 for (rt = rt0, metric = rt0->rt6i_metric;
320 rt && rt->rt6i_metric == metric;
324 if (rt6_check_expired(rt))
329 m = rt6_score_route(rt, oif, strict);
343 (strict & RT6_SELECT_F_REACHABLE) &&
344 last && last != rt0) {
345 /* no entries matched; do round-robin */
347 rt0->u.next = last->u.next;
351 RT6_TRACE("%s() => %p, score=%d\n",
352 __FUNCTION__, match, mpri);
354 return (match ? match : &ip6_null_entry);
357 #ifdef CONFIG_IPV6_ROUTE_INFO
358 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
359 struct in6_addr *gwaddr)
361 struct route_info *rinfo = (struct route_info *) opt;
362 struct in6_addr prefix_buf, *prefix;
367 if (len < sizeof(struct route_info)) {
371 /* Sanity check for prefix_len and length */
372 if (rinfo->length > 3) {
374 } else if (rinfo->prefix_len > 128) {
376 } else if (rinfo->prefix_len > 64) {
377 if (rinfo->length < 2) {
380 } else if (rinfo->prefix_len > 0) {
381 if (rinfo->length < 1) {
386 pref = rinfo->route_pref;
387 if (pref == ICMPV6_ROUTER_PREF_INVALID)
388 pref = ICMPV6_ROUTER_PREF_MEDIUM;
390 lifetime = htonl(rinfo->lifetime);
391 if (lifetime == 0xffffffff) {
393 } else if (lifetime > 0x7fffffff/HZ) {
394 /* Avoid arithmetic overflow */
395 lifetime = 0x7fffffff/HZ - 1;
398 if (rinfo->length == 3)
399 prefix = (struct in6_addr *)rinfo->prefix;
401 /* this function is safe */
402 ipv6_addr_prefix(&prefix_buf,
403 (struct in6_addr *)rinfo->prefix,
405 prefix = &prefix_buf;
408 rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
410 if (rt && !lifetime) {
411 ip6_del_rt(rt, NULL, NULL, NULL);
416 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
419 rt->rt6i_flags = RTF_ROUTEINFO |
420 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
423 if (lifetime == 0xffffffff) {
424 rt->rt6i_flags &= ~RTF_EXPIRES;
426 rt->rt6i_expires = jiffies + HZ * lifetime;
427 rt->rt6i_flags |= RTF_EXPIRES;
429 dst_release(&rt->u.dst);
435 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
438 struct fib6_node *fn;
441 read_lock_bh(&rt6_lock);
442 fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
443 rt = rt6_device_match(fn->leaf, oif, strict);
444 dst_hold(&rt->u.dst);
446 read_unlock_bh(&rt6_lock);
448 rt->u.dst.lastuse = jiffies;
449 if (rt->u.dst.error == 0)
451 dst_release(&rt->u.dst);
455 /* ip6_ins_rt is called with rt6_lock NOT held.
456 It takes a new route entry; if the addition fails for any reason, the
457 route is freed. In any case, if the caller does not hold it, it may
461 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
462 void *_rtattr, struct netlink_skb_parms *req)
466 write_lock_bh(&rt6_lock);
467 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
468 write_unlock_bh(&rt6_lock);
473 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
474 struct in6_addr *saddr)
482 rt = ip6_rt_copy(ort);
485 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
486 if (rt->rt6i_dst.plen != 128 &&
487 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
488 rt->rt6i_flags |= RTF_ANYCAST;
489 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
492 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
493 rt->rt6i_dst.plen = 128;
494 rt->rt6i_flags |= RTF_CACHE;
495 rt->u.dst.flags |= DST_HOST;
497 #ifdef CONFIG_IPV6_SUBTREES
498 if (rt->rt6i_src.plen && saddr) {
499 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
500 rt->rt6i_src.plen = 128;
504 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
511 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
513 struct rt6_info *rt = ip6_rt_copy(ort);
515 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
516 rt->rt6i_dst.plen = 128;
517 rt->rt6i_flags |= RTF_CACHE;
518 if (rt->rt6i_flags & RTF_REJECT)
519 rt->u.dst.error = ort->u.dst.error;
520 rt->u.dst.flags |= DST_HOST;
521 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
526 #define BACKTRACK() \
527 if (rt == &ip6_null_entry) { \
528 while ((fn = fn->parent) != NULL) { \
529 if (fn->fn_flags & RTN_ROOT) { \
532 if (fn->fn_flags & RTN_RTINFO) \
538 void ip6_route_input(struct sk_buff *skb)
540 struct fib6_node *fn;
541 struct rt6_info *rt, *nrt;
545 int reachable = RT6_SELECT_F_REACHABLE;
547 strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
550 read_lock_bh(&rt6_lock);
553 fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
554 &skb->nh.ipv6h->saddr);
557 rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
559 if (rt == &ip6_null_entry ||
560 rt->rt6i_flags & RTF_CACHE)
563 dst_hold(&rt->u.dst);
564 read_unlock_bh(&rt6_lock);
566 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
567 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
569 #if CLONE_OFFLINK_ROUTE
570 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
576 dst_release(&rt->u.dst);
577 rt = nrt ? : &ip6_null_entry;
579 dst_hold(&rt->u.dst);
581 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
590 * Race condition! In the gap, when rt6_lock was
591 * released someone could insert this route. Relookup.
593 dst_release(&rt->u.dst);
601 dst_hold(&rt->u.dst);
602 read_unlock_bh(&rt6_lock);
604 rt->u.dst.lastuse = jiffies;
606 skb->dst = (struct dst_entry *) rt;
610 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
612 struct fib6_node *fn;
613 struct rt6_info *rt, *nrt;
617 int reachable = RT6_SELECT_F_REACHABLE;
619 strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
622 read_lock_bh(&rt6_lock);
625 fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
628 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
630 if (rt == &ip6_null_entry ||
631 rt->rt6i_flags & RTF_CACHE)
634 dst_hold(&rt->u.dst);
635 read_unlock_bh(&rt6_lock);
637 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
638 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
640 #if CLONE_OFFLINK_ROUTE
641 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
647 dst_release(&rt->u.dst);
648 rt = nrt ? : &ip6_null_entry;
650 dst_hold(&rt->u.dst);
652 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
661 * Race condition! In the gap, when rt6_lock was
662 * released someone could insert this route. Relookup.
664 dst_release(&rt->u.dst);
672 dst_hold(&rt->u.dst);
673 read_unlock_bh(&rt6_lock);
675 rt->u.dst.lastuse = jiffies;
682 * Destination cache support functions
685 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
689 rt = (struct rt6_info *) dst;
691 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
697 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
699 struct rt6_info *rt = (struct rt6_info *) dst;
702 if (rt->rt6i_flags & RTF_CACHE)
703 ip6_del_rt(rt, NULL, NULL, NULL);
710 static void ip6_link_failure(struct sk_buff *skb)
714 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
716 rt = (struct rt6_info *) skb->dst;
718 if (rt->rt6i_flags&RTF_CACHE) {
719 dst_set_expires(&rt->u.dst, 0);
720 rt->rt6i_flags |= RTF_EXPIRES;
721 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
722 rt->rt6i_node->fn_sernum = -1;
726 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
728 struct rt6_info *rt6 = (struct rt6_info*)dst;
730 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
731 rt6->rt6i_flags |= RTF_MODIFIED;
732 if (mtu < IPV6_MIN_MTU) {
734 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
736 dst->metrics[RTAX_MTU-1] = mtu;
740 /* Protected by rt6_lock. */
741 static struct dst_entry *ndisc_dst_gc_list;
742 static int ipv6_get_mtu(struct net_device *dev);
744 static inline unsigned int ipv6_advmss(unsigned int mtu)
746 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
748 if (mtu < ip6_rt_min_advmss)
749 mtu = ip6_rt_min_advmss;
752 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
753 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
754 * IPV6_MAXPLEN is also valid and means: "any MSS,
755 * rely only on pmtu discovery"
757 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
762 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
763 struct neighbour *neigh,
764 struct in6_addr *addr,
765 int (*output)(struct sk_buff *))
768 struct inet6_dev *idev = in6_dev_get(dev);
770 if (unlikely(idev == NULL))
773 rt = ip6_dst_alloc();
774 if (unlikely(rt == NULL)) {
783 neigh = ndisc_get_neigh(dev, addr);
786 rt->rt6i_idev = idev;
787 rt->rt6i_nexthop = neigh;
788 atomic_set(&rt->u.dst.__refcnt, 1);
789 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
790 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
791 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
792 rt->u.dst.output = output;
794 #if 0 /* there's no chance to use these for ndisc */
795 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
798 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
799 rt->rt6i_dst.plen = 128;
802 write_lock_bh(&rt6_lock);
803 rt->u.dst.next = ndisc_dst_gc_list;
804 ndisc_dst_gc_list = &rt->u.dst;
805 write_unlock_bh(&rt6_lock);
807 fib6_force_start_gc();
810 return (struct dst_entry *)rt;
813 int ndisc_dst_gc(int *more)
815 struct dst_entry *dst, *next, **pprev;
819 pprev = &ndisc_dst_gc_list;
821 while ((dst = *pprev) != NULL) {
822 if (!atomic_read(&dst->__refcnt)) {
835 static int ip6_dst_gc(void)
837 static unsigned expire = 30*HZ;
838 static unsigned long last_gc;
839 unsigned long now = jiffies;
841 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
842 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
848 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
849 expire = ip6_rt_gc_timeout>>1;
852 expire -= expire>>ip6_rt_gc_elasticity;
853 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
856 /* Clean the host part of a prefix. Not necessary in a radix tree,
857 but results in cleaner routing tables.
859 Remove it only once everything works!
862 static int ipv6_get_mtu(struct net_device *dev)
864 int mtu = IPV6_MIN_MTU;
865 struct inet6_dev *idev;
867 idev = in6_dev_get(dev);
869 mtu = idev->cnf.mtu6;
875 int ipv6_get_hoplimit(struct net_device *dev)
877 int hoplimit = ipv6_devconf.hop_limit;
878 struct inet6_dev *idev;
880 idev = in6_dev_get(dev);
882 hoplimit = idev->cnf.hop_limit;
892 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
893 void *_rtattr, struct netlink_skb_parms *req)
898 struct rt6_info *rt = NULL;
899 struct net_device *dev = NULL;
900 struct inet6_dev *idev = NULL;
903 rta = (struct rtattr **) _rtattr;
905 if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
907 #ifndef CONFIG_IPV6_SUBTREES
908 if (rtmsg->rtmsg_src_len)
911 if (rtmsg->rtmsg_ifindex) {
913 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
916 idev = in6_dev_get(dev);
921 if (rtmsg->rtmsg_metric == 0)
922 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
924 rt = ip6_dst_alloc();
931 rt->u.dst.obsolete = -1;
932 rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
933 if (nlh && (r = NLMSG_DATA(nlh))) {
934 rt->rt6i_protocol = r->rtm_protocol;
936 rt->rt6i_protocol = RTPROT_BOOT;
939 addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
941 if (addr_type & IPV6_ADDR_MULTICAST)
942 rt->u.dst.input = ip6_mc_input;
944 rt->u.dst.input = ip6_forward;
946 rt->u.dst.output = ip6_output;
948 ipv6_addr_prefix(&rt->rt6i_dst.addr,
949 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
950 rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
951 if (rt->rt6i_dst.plen == 128)
952 rt->u.dst.flags = DST_HOST;
954 #ifdef CONFIG_IPV6_SUBTREES
955 ipv6_addr_prefix(&rt->rt6i_src.addr,
956 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
957 rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
960 rt->rt6i_metric = rtmsg->rtmsg_metric;
962 /* We cannot add true routes via loopback here,
963 they would result in kernel looping; promote them to reject routes
965 if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
966 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
967 /* hold loopback dev/idev if we haven't done so. */
968 if (dev != &loopback_dev) {
975 idev = in6_dev_get(dev);
981 rt->u.dst.output = ip6_pkt_discard_out;
982 rt->u.dst.input = ip6_pkt_discard;
983 rt->u.dst.error = -ENETUNREACH;
984 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
988 if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
989 struct in6_addr *gw_addr;
992 gw_addr = &rtmsg->rtmsg_gateway;
993 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
994 gwa_type = ipv6_addr_type(gw_addr);
996 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
997 struct rt6_info *grt;
999 /* IPv6 strictly prohibits using non-link-local
1000 addresses as nexthop addresses.
1001 Otherwise, the router will not be able to send redirects.
1002 It is very good, but in some (rare!) circumstances
1003 (SIT, PtP, NBMA NOARP links) it is handy to allow
1004 some exceptions. --ANK
1007 if (!(gwa_type&IPV6_ADDR_UNICAST))
1010 grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1012 err = -EHOSTUNREACH;
1016 if (dev != grt->rt6i_dev) {
1017 dst_release(&grt->u.dst);
1021 dev = grt->rt6i_dev;
1022 idev = grt->rt6i_idev;
1024 in6_dev_hold(grt->rt6i_idev);
1026 if (!(grt->rt6i_flags&RTF_GATEWAY))
1028 dst_release(&grt->u.dst);
1034 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1042 if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1043 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1044 if (IS_ERR(rt->rt6i_nexthop)) {
1045 err = PTR_ERR(rt->rt6i_nexthop);
1046 rt->rt6i_nexthop = NULL;
1051 rt->rt6i_flags = rtmsg->rtmsg_flags;
1054 if (rta && rta[RTA_METRICS-1]) {
1055 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1056 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1058 while (RTA_OK(attr, attrlen)) {
1059 unsigned flavor = attr->rta_type;
1061 if (flavor > RTAX_MAX) {
1065 rt->u.dst.metrics[flavor-1] =
1066 *(u32 *)RTA_DATA(attr);
1068 attr = RTA_NEXT(attr, attrlen);
1072 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1073 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1074 if (!rt->u.dst.metrics[RTAX_MTU-1])
1075 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1076 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1077 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1078 rt->u.dst.dev = dev;
1079 rt->rt6i_idev = idev;
1080 return ip6_ins_rt(rt, nlh, _rtattr, req);
1088 dst_free((struct dst_entry *) rt);
1092 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1096 write_lock_bh(&rt6_lock);
1098 err = fib6_del(rt, nlh, _rtattr, req);
1099 dst_release(&rt->u.dst);
1101 write_unlock_bh(&rt6_lock);
1106 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1108 struct fib6_node *fn;
1109 struct rt6_info *rt;
1112 read_lock_bh(&rt6_lock);
1114 fn = fib6_locate(&ip6_routing_table,
1115 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1116 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1119 for (rt = fn->leaf; rt; rt = rt->u.next) {
1120 if (rtmsg->rtmsg_ifindex &&
1121 (rt->rt6i_dev == NULL ||
1122 rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1124 if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1125 !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1127 if (rtmsg->rtmsg_metric &&
1128 rtmsg->rtmsg_metric != rt->rt6i_metric)
1130 dst_hold(&rt->u.dst);
1131 read_unlock_bh(&rt6_lock);
1133 return ip6_del_rt(rt, nlh, _rtattr, req);
1136 read_unlock_bh(&rt6_lock);
1144 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1145 struct neighbour *neigh, u8 *lladdr, int on_link)
1147 struct rt6_info *rt, *nrt;
1149 /* Locate old route to this destination. */
1150 rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1);
1155 if (neigh->dev != rt->rt6i_dev)
1159 * Current route is on-link; redirect is always invalid.
1161 * It seems the previous statement is not true. It could
1162 * be a node that looks on-link to us (e.g. proxy ndisc).
1163 * But then the router serving it might decide that we should
1164 * know the truth 8)8) --ANK (980726).
1166 if (!(rt->rt6i_flags&RTF_GATEWAY))
1170 * RFC 2461 specifies that redirects should only be
1171 * accepted if they come from the nexthop to the target.
1172 * Due to the way default routers are chosen, this notion
1173 * is a bit fuzzy and one might need to check all default
1176 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway)) {
1177 if (rt->rt6i_flags & RTF_DEFAULT) {
1178 struct rt6_info *rt1;
1180 read_lock(&rt6_lock);
1181 for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) {
1182 if (ipv6_addr_equal(saddr, &rt1->rt6i_gateway)) {
1183 dst_hold(&rt1->u.dst);
1184 dst_release(&rt->u.dst);
1185 read_unlock(&rt6_lock);
1190 read_unlock(&rt6_lock);
1192 if (net_ratelimit())
1193 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1194 "for redirect target\n");
1201 * We have finally decided to accept it.
1204 neigh_update(neigh, lladdr, NUD_STALE,
1205 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1206 NEIGH_UPDATE_F_OVERRIDE|
1207 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1208 NEIGH_UPDATE_F_ISROUTER))
1212 * Redirect received -> path was valid.
1213 * Look, redirects are sent only in response to data packets,
1214 * so that this nexthop apparently is reachable. --ANK
1216 dst_confirm(&rt->u.dst);
1218 /* Duplicate redirect: silently ignore. */
1219 if (neigh == rt->u.dst.neighbour)
1222 nrt = ip6_rt_copy(rt);
1226 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1228 nrt->rt6i_flags &= ~RTF_GATEWAY;
1230 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1231 nrt->rt6i_dst.plen = 128;
1232 nrt->u.dst.flags |= DST_HOST;
1234 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1235 nrt->rt6i_nexthop = neigh_clone(neigh);
1236 /* Reset pmtu, it may be better */
1237 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1238 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1240 if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1243 if (rt->rt6i_flags&RTF_CACHE) {
1244 ip6_del_rt(rt, NULL, NULL, NULL);
1249 dst_release(&rt->u.dst);
1254 * Handle ICMP "packet too big" messages
1255 * i.e. Path MTU discovery
1258 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1259 struct net_device *dev, u32 pmtu)
1261 struct rt6_info *rt, *nrt;
1264 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1268 if (pmtu >= dst_mtu(&rt->u.dst))
1271 if (pmtu < IPV6_MIN_MTU) {
1273 * According to RFC 2460, the PMTU is set to the IPv6 Minimum Link
1274 * MTU (1280) and a fragment header should always be included
1275 * once a node receives a Too Big message reporting a PMTU
1276 * less than the IPv6 Minimum Link MTU.
1278 pmtu = IPV6_MIN_MTU;
1282 /* New mtu received -> path was valid.
1283 Packet Too Big messages are sent only in response to data packets,
1284 so this nexthop is apparently reachable. --ANK
1286 dst_confirm(&rt->u.dst);
1288 /* Host route. If it is static, it would be better
1289 not to override it but to add a new one, so that
1290 when the cache entry expires the old pmtu
1291 returns automatically.
1293 if (rt->rt6i_flags & RTF_CACHE) {
1294 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1296 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1297 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1298 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1303 Two cases are possible:
1304 1. It is a connected route. Action: COW
1305 2. It is a gatewayed route or a NONEXTHOP route. Action: clone it.
1307 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1308 nrt = rt6_alloc_cow(rt, daddr, saddr);
1310 nrt = rt6_alloc_clone(rt, daddr);
1313 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1315 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1317 /* According to RFC 1981, detection of a PMTU increase shouldn't be
1318 * attempted within 5 mins; the recommended timer is 10 mins.
1319 * Here this route's expiration time is set to ip6_rt_mtu_expires,
1320 * which is 10 mins by default. After that the decreased pmtu expires
1321 * and detection of a PMTU increase happens automatically.
1323 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1324 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1326 ip6_ins_rt(nrt, NULL, NULL, NULL);
1329 dst_release(&rt->u.dst);
1333 * Misc support functions
1336 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1338 struct rt6_info *rt = ip6_dst_alloc();
1341 rt->u.dst.input = ort->u.dst.input;
1342 rt->u.dst.output = ort->u.dst.output;
1344 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1345 rt->u.dst.dev = ort->u.dst.dev;
1347 dev_hold(rt->u.dst.dev);
1348 rt->rt6i_idev = ort->rt6i_idev;
1350 in6_dev_hold(rt->rt6i_idev);
1351 rt->u.dst.lastuse = jiffies;
1352 rt->rt6i_expires = 0;
1354 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1355 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1356 rt->rt6i_metric = 0;
1358 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1359 #ifdef CONFIG_IPV6_SUBTREES
1360 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1366 #ifdef CONFIG_IPV6_ROUTE_INFO
1367 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1368 struct in6_addr *gwaddr, int ifindex)
1370 struct fib6_node *fn;
1371 struct rt6_info *rt = NULL;
1373 write_lock_bh(&rt6_lock);
1374 fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1378 for (rt = fn->leaf; rt; rt = rt->u.next) {
1379 if (rt->rt6i_dev->ifindex != ifindex)
1381 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1383 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1385 dst_hold(&rt->u.dst);
1389 write_unlock_bh(&rt6_lock);
1393 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1394 struct in6_addr *gwaddr, int ifindex,
1397 struct in6_rtmsg rtmsg;
1399 memset(&rtmsg, 0, sizeof(rtmsg));
1400 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1401 ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1402 rtmsg.rtmsg_dst_len = prefixlen;
1403 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1404 rtmsg.rtmsg_metric = 1024;
1405 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1406 rtmsg.rtmsg_ifindex = ifindex;
1408 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1410 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1414 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1416 struct rt6_info *rt;
1417 struct fib6_node *fn;
1419 fn = &ip6_routing_table;
1421 write_lock_bh(&rt6_lock);
1422 for (rt = fn->leaf; rt; rt=rt->u.next) {
1423 if (dev == rt->rt6i_dev &&
1424 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1425 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1429 dst_hold(&rt->u.dst);
1430 write_unlock_bh(&rt6_lock);
1434 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1435 struct net_device *dev,
1438 struct in6_rtmsg rtmsg;
1440 memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1441 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1442 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1443 rtmsg.rtmsg_metric = 1024;
1444 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1447 rtmsg.rtmsg_ifindex = dev->ifindex;
1449 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1450 return rt6_get_dflt_router(gwaddr, dev);
1453 void rt6_purge_dflt_routers(void)
1455 struct rt6_info *rt;
1458 read_lock_bh(&rt6_lock);
1459 for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1460 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1461 dst_hold(&rt->u.dst);
1463 read_unlock_bh(&rt6_lock);
1465 ip6_del_rt(rt, NULL, NULL, NULL);
1470 read_unlock_bh(&rt6_lock);
1473 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1475 struct in6_rtmsg rtmsg;
1479 case SIOCADDRT: /* Add a route */
1480 case SIOCDELRT: /* Delete a route */
1481 if (!capable(CAP_NET_ADMIN))
1483 err = copy_from_user(&rtmsg, arg,
1484 sizeof(struct in6_rtmsg));
1491 err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1494 err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1508 * Drop the packet on the floor
1511 static int ip6_pkt_discard(struct sk_buff *skb)
1513 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1514 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1519 static int ip6_pkt_discard_out(struct sk_buff *skb)
1521 skb->dev = skb->dst->dev;
1522 return ip6_pkt_discard(skb);
1526 * Allocate a dst for local (unicast / anycast) address.
/*
 * addrconf_dst_alloc - build a host route (/128) for a local address.
 * @idev: inet6 device stored in the new route's rt6i_idev
 * @addr: address copied into the route's destination
 *
 * The route is attached to loopback_dev, uses ip6_input/ip6_output as its
 * handlers and is flagged RTF_UP | RTF_NONEXTHOP plus either RTF_ANYCAST or
 * RTF_LOCAL (the selecting condition is not visible here).
 *
 * Returns ERR_PTR(-ENOMEM) when allocation of the dst or of the neighbour
 * entry fails.
 */
1529 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1530 const struct in6_addr *addr,
1533 struct rt6_info *rt = ip6_dst_alloc();
1536 return ERR_PTR(-ENOMEM);
/* Reference on the loopback device that the route is about to point at. */
1538 dev_hold(&loopback_dev);
1541 rt->u.dst.flags = DST_HOST;
1542 rt->u.dst.input = ip6_input;
1543 rt->u.dst.output = ip6_output;
1544 rt->rt6i_dev = &loopback_dev;
1545 rt->rt6i_idev = idev;
/* MTU comes from the route's device; ADVMSS is derived from that MTU. */
1546 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1547 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
/* NOTE(review): -1 presumably means "no per-route hop limit" - confirm. */
1548 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1549 rt->u.dst.obsolete = -1;
1551 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1553 rt->rt6i_flags |= RTF_ANYCAST;
1555 rt->rt6i_flags |= RTF_LOCAL;
1556 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
/* Without a neighbour entry the half-built route is released again. */
1557 if (rt->rt6i_nexthop == NULL) {
1558 dst_free((struct dst_entry *) rt);
1559 return ERR_PTR(-ENOMEM);
/* Full-length prefix: the route matches exactly one address. */
1562 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1563 rt->rt6i_dst.plen = 128;
/* The route is handed back holding a single reference. */
1565 atomic_set(&rt->u.dst.__refcnt, 1);
/*
 * fib6_ifdown - tree-walk callback selecting routes of a device going down.
 * @rt:  route being visited
 * @arg: device pointer to match, or NULL to match every device
 *
 * A route is selected when its device equals @arg (or @arg is NULL) and it
 * is not the ip6_null_entry placeholder; selected routes are traced as
 * "deleted by ifdown".
 */
1570 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1572 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1573 rt != &ip6_null_entry) {
1574 RT6_TRACE("deleted by ifdown %p\n", rt);
1580 void rt6_ifdown(struct net_device *dev)
1582 write_lock_bh(&rt6_lock);
1583 fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1584 write_unlock_bh(&rt6_lock);
/*
 * Argument bundle handed to rt6_mtu_change_route() through the tree walker.
 * rt6_mtu_change_route() also reads an 'mtu' member from this structure.
 */
1587 struct rt6_mtu_change_arg
/* Device whose MTU changed; only routes on this device are updated. */
1589 struct net_device *dev;
1593 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1595 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1596 struct inet6_dev *idev;
1598 /* In IPv6 pmtu discovery is not optional,
1599 so that RTAX_MTU lock cannot disable it.
1600 We still use this lock to block changes
1601 caused by addrconf/ndisc.
1604 idev = __in6_dev_get(arg->dev);
1608 /* For administrative MTU increase, there is no way to discover
1609 IPv6 PMTU increase, so PMTU increase should be updated here.
1610 Since RFC 1981 doesn't include administrative MTU increase
1611 update PMTU increase is a MUST. (i.e. jumbo frame)
1614 If new MTU is less than route PMTU, this new MTU will be the
1615 lowest MTU in the path, update the route PMTU to reflect PMTU
1616 decreases; if new MTU is greater than route PMTU, and the
1617 old MTU is the lowest MTU in the path, update the route PMTU
1618 to reflect the increase. In this case if the other nodes' MTU
1619 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1622 if (rt->rt6i_dev == arg->dev &&
1623 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1624 (dst_mtu(&rt->u.dst) > arg->mtu ||
1625 (dst_mtu(&rt->u.dst) < arg->mtu &&
1626 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1627 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1628 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
/*
 * rt6_mtu_change - propagate a device MTU change to the routing table.
 * @dev: device whose MTU changed
 * @mtu: new MTU
 *
 * Runs rt6_mtu_change_route() over every route in ip6_routing_table with a
 * struct rt6_mtu_change_arg as the callback argument.
 *
 * NOTE(review): only the read side of rt6_lock is taken although the
 * callback stores to route metrics - confirm this is sufficient.
 */
1632 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1634 struct rt6_mtu_change_arg arg;
1638 read_lock_bh(&rt6_lock);
1639 fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1640 read_unlock_bh(&rt6_lock);
/*
 * inet6_rtm_to_rtmsg - translate an rtnetlink route request to in6_rtmsg.
 * @r:     rtnetlink route header
 * @rta:   attribute table, indexed by attribute type minus one
 * @rtmsg: output; zeroed first, then filled from @r and @rta
 *
 * Each present attribute is length-checked before it is copied:
 * RTA_GATEWAY, RTA_OIF and RTA_PRIORITY must have exactly the expected
 * length, RTA_DST and RTA_SRC must carry at least as many bytes as the
 * corresponding prefix length requires.
 */
1643 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1644 struct in6_rtmsg *rtmsg)
1646 memset(rtmsg, 0, sizeof(*rtmsg));
1648 rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1649 rtmsg->rtmsg_src_len = r->rtm_src_len;
1650 rtmsg->rtmsg_flags = RTF_UP;
/* An "unreachable" request becomes a reject route. */
1651 if (r->rtm_type == RTN_UNREACHABLE)
1652 rtmsg->rtmsg_flags |= RTF_REJECT;
/* Gateway: exactly one 16-byte IPv6 address. */
1654 if (rta[RTA_GATEWAY-1]) {
1655 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1657 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1658 rtmsg->rtmsg_flags |= RTF_GATEWAY;
/* Destination prefix: only the bytes covered by the prefix length,
 * (len + 7) >> 3, are required and copied. */
1660 if (rta[RTA_DST-1]) {
1661 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1663 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
/* Source prefix: same rule as the destination prefix. */
1665 if (rta[RTA_SRC-1]) {
1666 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1668 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
/* Output interface index. */
1670 if (rta[RTA_OIF-1]) {
1671 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1673 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
/* Priority attribute is stored as the route metric. */
1675 if (rta[RTA_PRIORITY-1]) {
1676 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1678 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
/*
 * inet6_rtm_delroute - rtnetlink handler deleting an IPv6 route.
 * @skb: request skb; its netlink control block is passed on
 * @nlh: netlink message header of the request
 * @arg: attribute table, passed to inet6_rtm_to_rtmsg() as its rta argument
 *
 * Converts the request with inet6_rtm_to_rtmsg() and returns the result of
 * ip6_route_del() on the converted message.
 */
1683 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1685 struct rtmsg *r = NLMSG_DATA(nlh);
1686 struct in6_rtmsg rtmsg;
/* Non-zero from the conversion means the request was malformed. */
1688 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1690 return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
/*
 * inet6_rtm_newroute - rtnetlink handler adding an IPv6 route.
 * @skb: request skb; its netlink control block is passed on
 * @nlh: netlink message header of the request
 * @arg: attribute table, passed to inet6_rtm_to_rtmsg() as its rta argument
 *
 * Converts the request with inet6_rtm_to_rtmsg() and returns the result of
 * ip6_route_add() on the converted message.
 */
1693 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1695 struct rtmsg *r = NLMSG_DATA(nlh);
1696 struct in6_rtmsg rtmsg;
/* Non-zero from the conversion means the request was malformed. */
1698 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1700 return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
/*
 * Per-dump context handed to rt6_dump_route() via the walker's args pointer.
 */
1703 struct rt6_rtnl_dump_arg
/* Reply buffer that route records are appended to. */
1705 struct sk_buff *skb;
/* Dump callback state; supplies the request header, pid and sequence. */
1706 struct netlink_callback *cb;
/*
 * rt6_fill_node - append one route as an rtnetlink message to @skb.
 * @skb:    buffer the message is appended to
 * @rt:     route to describe
 * @dst:    optional explicit destination address to report
 * @src:    optional explicit source address to report
 * @iif:    input interface index reported via RTA_IIF
 * @type:   netlink message type
 * @pid:    netlink pid for the message header
 * @seq:    netlink sequence number for the message header
 * @prefix: non-zero restricts output to routes flagged RTF_PREFIX_RT
 * @flags:  netlink message flags
 *
 * The message start (skb->tail on entry) is remembered so the final length
 * can be computed and so the buffer can be trimmed back if an attribute
 * does not fit.
 */
1709 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1710 struct in6_addr *dst, struct in6_addr *src,
1711 int iif, int type, u32 pid, u32 seq,
1712 int prefix, unsigned int flags)
1715 struct nlmsghdr *nlh;
1716 unsigned char *b = skb->tail;
1717 struct rta_cacheinfo ci;
1719 if (prefix) { /* user wants prefix routes only */
1720 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1721 /* success since this is not a prefix route */
1726 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1727 rtm = NLMSG_DATA(nlh);
1728 rtm->rtm_family = AF_INET6;
1729 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1730 rtm->rtm_src_len = rt->rt6i_src.plen;
1732 rtm->rtm_table = RT_TABLE_MAIN;
/* Route type: reject routes are "unreachable", routes on a loopback
 * device are "local", everything else is unicast. */
1733 if (rt->rt6i_flags&RTF_REJECT)
1734 rtm->rtm_type = RTN_UNREACHABLE;
1735 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1736 rtm->rtm_type = RTN_LOCAL;
1738 rtm->rtm_type = RTN_UNICAST;
1740 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
/* Protocol: the stored value is overridden from the route flags.
 * NOTE(review): RTF_ADDRCONF is tested before RTF_DEFAULT, so a route
 * carrying both flags is reported as RTPROT_KERNEL and never as
 * RTPROT_RA - confirm this ordering is intended. */
1741 rtm->rtm_protocol = rt->rt6i_protocol;
1742 if (rt->rt6i_flags&RTF_DYNAMIC)
1743 rtm->rtm_protocol = RTPROT_REDIRECT;
1744 else if (rt->rt6i_flags & RTF_ADDRCONF)
1745 rtm->rtm_protocol = RTPROT_KERNEL;
1746 else if (rt->rt6i_flags&RTF_DEFAULT)
1747 rtm->rtm_protocol = RTPROT_RA;
1749 if (rt->rt6i_flags&RTF_CACHE)
1750 rtm->rtm_flags |= RTM_F_CLONED;
/* Destination: an explicit address is reported as a /128; otherwise the
 * route's own prefix is reported when its length is non-zero. */
1753 RTA_PUT(skb, RTA_DST, 16, dst);
1754 rtm->rtm_dst_len = 128;
1755 } else if (rtm->rtm_dst_len)
1756 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1757 #ifdef CONFIG_IPV6_SUBTREES
1759 RTA_PUT(skb, RTA_SRC, 16, src);
1760 rtm->rtm_src_len = 128;
1761 } else if (rtm->rtm_src_len)
1762 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1765 RTA_PUT(skb, RTA_IIF, 4, &iif);
1767 struct in6_addr saddr_buf;
/* Preferred source is only emitted when source selection succeeds. */
1768 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1769 RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1771 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1772 goto rtattr_failure;
/* Gateway is taken from the neighbour entry's key. */
1773 if (rt->u.dst.neighbour)
1774 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1776 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1777 RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
/* Cache info: times are jiffies deltas converted to clock_t. */
1778 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1779 if (rt->rt6i_expires)
1780 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1783 ci.rta_used = rt->u.dst.__use;
1784 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1785 ci.rta_error = rt->u.dst.error;
1789 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
/* Final message length: everything appended since entry. */
1790 nlh->nlmsg_len = skb->tail - b;
/* Roll the buffer back to where this message started. */
1795 skb_trim(skb, b - skb->data);
/*
 * rt6_dump_route - emit one route into an in-progress netlink dump.
 * @rt:    route to emit
 * @p_arg: struct rt6_rtnl_dump_arg with the reply skb and callback state
 *
 * When the dump request is long enough to contain a struct rtmsg, its
 * RTM_F_PREFIX flag selects prefix-only output. The route is then written
 * with rt6_fill_node() as an RTM_NEWROUTE message flagged NLM_F_MULTI,
 * addressed with the requester's pid and sequence number.
 */
1799 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1801 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
/* Only look at the rtmsg header if the request actually carries one. */
1804 if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1805 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1806 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1810 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1811 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1812 prefix, NLM_F_MULTI);
/*
 * fib6_dump_node - walker callback dumping every route on the current leaf.
 * @w: walker; w->leaf is the first route, w->args the dump context
 *
 * Calls rt6_dump_route() for each route on the leaf's u.next chain.
 */
1815 static int fib6_dump_node(struct fib6_walker_t *w)
1818 struct rt6_info *rt;
1820 for (rt = w->leaf; rt; rt = rt->u.next) {
1821 res = rt6_dump_route(rt, w->args);
1823 /* Frame is full, suspend walking */
/*
 * fib6_dump_end - tear down the walker state attached to a dump callback.
 * @cb: dump callback; args[0] holds the walker, args[1] the saved
 *      original 'done' hook
 *
 * Unlinks the walker and restores the 'done' hook that inet6_dump_fib()
 * stashed in cb->args[1].
 */
1833 static void fib6_dump_end(struct netlink_callback *cb)
1835 struct fib6_walker_t *w = (void*)cb->args[0];
1839 fib6_walker_unlink(w);
1842 cb->done = (void*)cb->args[1];
/*
 * fib6_dump_done - 'done' hook installed by inet6_dump_fib().
 * @cb: dump callback
 *
 * Returns the result of cb->done(cb) when a 'done' hook is set at that
 * point, otherwise 0.
 */
1846 static int fib6_dump_done(struct netlink_callback *cb)
1849 return cb->done ? cb->done(cb) : 0;
/*
 * inet6_dump_fib - netlink dump callback for the IPv6 routing table.
 * @skb: reply buffer being filled
 * @cb:  dump state; args[0] carries the walker between invocations and
 *       args[1] the caller's original 'done' hook
 *
 * On the first call a walker is allocated, pointed at ip6_routing_table
 * with fib6_dump_node() as its function, and stored in cb->args[0]; later
 * calls resume it with fib6_walk_continue(). Tree access happens under the
 * read side of rt6_lock.
 */
1852 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1854 struct rt6_rtnl_dump_arg arg;
1855 struct fib6_walker_t *w;
/* A walker left over from a previous invocation, if any. */
1861 w = (void*)cb->args[0];
1865 * 1. hook callback destructor.
1867 cb->args[1] = (long)cb->done;
1868 cb->done = fib6_dump_done;
1871 * 2. allocate and initialize walker.
1873 w = kmalloc(sizeof(*w), GFP_ATOMIC);
1876 RT6_TRACE("dump<%p", w);
1877 memset(w, 0, sizeof(*w));
1878 w->root = &ip6_routing_table;
1879 w->func = fib6_dump_node;
1881 cb->args[0] = (long)w;
1882 read_lock_bh(&rt6_lock);
1884 read_unlock_bh(&rt6_lock);
/* Resume path: continue a walk that was suspended earlier. */
1887 read_lock_bh(&rt6_lock);
1888 res = fib6_walk_continue(w);
1889 read_unlock_bh(&rt6_lock);
1892 if (res <= 0 && skb->len == 0)
1893 RT6_TRACE("%p>dump end\n", w);
/* Errors are passed through; otherwise report how much was written. */
1895 res = res < 0 ? res : skb->len;
1896 /* res < 0 is an error. (really, impossible)
1897 res == 0 means that dump is complete, but skb still can contain data.
1898 res > 0 dump is not complete, but frame is full.
1900 /* Destroy walker, if dump of this table is complete. */
/*
 * inet6_rtm_getroute - rtnetlink handler answering a route lookup request.
 * @in_skb: request skb; its netlink pid addresses the reply
 * @nlh:    request header; its sequence number is echoed in the reply
 * @arg:    attribute table (struct rtattr **), indexed by type minus one
 *
 * Allocates a reply skb, builds a flow from the RTA_SRC, RTA_DST, RTA_IIF
 * and RTA_OIF attributes, resolves it with ip6_route_output(), attaches the
 * resulting dst to the reply skb, describes the route with rt6_fill_node()
 * as an RTM_NEWROUTE message and unicasts it back to the requester with
 * MSG_DONTWAIT.
 *
 * An input interface index given via RTA_IIF is looked up with
 * __dev_get_by_index().
 */
1906 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1908 struct rtattr **rta = arg;
1911 struct sk_buff *skb;
1913 struct rt6_info *rt;
1915 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1919 /* Reserve room for dummy headers, this skb can pass
1920 through good chunk of routing engine.
1922 skb->mac.raw = skb->data;
1923 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1925 memset(&fl, 0, sizeof(fl));
1927 ipv6_addr_copy(&fl.fl6_src,
1928 (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1930 ipv6_addr_copy(&fl.fl6_dst,
1931 (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1934 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1937 struct net_device *dev;
1938 dev = __dev_get_by_index(iif);
1947 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1949 rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1951 skb->dst = &rt->u.dst;
1953 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1954 err = rt6_fill_node(skb, rt,
1955 &fl.fl6_dst, &fl.fl6_src,
1957 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1958 nlh->nlmsg_seq, 0, 0);
1964 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
/*
 * inet6_rt_notify - broadcast a route change to rtnetlink listeners.
 * @event: netlink message type describing the change
 * @rt:    route the event is about
 * @nlh:   originating netlink request; supplies the sequence number
 * @req:   netlink parameters of the originating request
 *
 * Builds a message with rt6_fill_node() and broadcasts it to the
 * RTNLGRP_IPV6_ROUTE group. Failures are reported to the group through
 * netlink_set_err(): ENOBUFS on the allocation path, EINVAL when the route
 * cannot be encoded.
 */
1974 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1975 struct netlink_skb_parms *req)
1977 struct sk_buff *skb;
/* Room for the rtmsg header plus 256 bytes of attributes. */
1978 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
/* Default pid is the current task's. */
1979 u32 pid = current->pid;
1985 seq = nlh->nlmsg_seq;
/* gfp_any() picks allocation flags suitable for the calling context. */
1987 skb = alloc_skb(size, gfp_any());
1989 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
1992 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
1994 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
1997 NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
1998 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2005 #ifdef CONFIG_PROC_FS
/*
 * Byte length of one row of the route listing produced by rt6_info_route():
 * three 32-digit hex addresses, two 4-byte " xx " prefix-length fields, and
 * the trailing " %08x %08x %08x %08x %8s\n" group (40 + 5 + 1).
 * NOTE(review): the total assumes the device name fits the 8-character
 * field; a longer name would make the row longer - confirm.
 */
2007 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
/*
 * rt6_info_route - tree-walk callback formatting one route as a text row.
 * @rt:    route to format
 * @p_arg: struct rt6_proc_arg with the output buffer and paging state
 *
 * Rows that lie before the requested offset (in units of RT6_INFO_LEN) are
 * handled by the skip test; output is not extended once arg->len has
 * reached arg->length. A row consists of destination address and prefix
 * length, source address and prefix length (zeros without
 * CONFIG_IPV6_SUBTREES), next-hop address (zeros when there is none),
 * then metric, reference count, use count, flags and device name.
 *
 * NOTE(review): formatting uses unbounded sprintf() into arg->buffer; the
 * only limit visible here is the arg->len >= arg->length test made before
 * a row is started - confirm the buffer has slack for a full row.
 */
2018 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2020 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2023 if (arg->skip < arg->offset / RT6_INFO_LEN) {
2028 if (arg->len >= arg->length)
/* Destination address, one byte as two hex digits at a time. */
2031 for (i=0; i<16; i++) {
2032 sprintf(arg->buffer + arg->len, "%02x",
2033 rt->rt6i_dst.addr.s6_addr[i]);
2036 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2039 #ifdef CONFIG_IPV6_SUBTREES
2040 for (i=0; i<16; i++) {
2041 sprintf(arg->buffer + arg->len, "%02x",
2042 rt->rt6i_src.addr.s6_addr[i]);
2045 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2048 sprintf(arg->buffer + arg->len,
2049 "00000000000000000000000000000000 00 ");
/* Next hop: the neighbour's key bytes, or all zeros. */
2053 if (rt->rt6i_nexthop) {
2054 for (i=0; i<16; i++) {
2055 sprintf(arg->buffer + arg->len, "%02x",
2056 rt->rt6i_nexthop->primary_key[i]);
2060 sprintf(arg->buffer + arg->len,
2061 "00000000000000000000000000000000");
/* Trailing numeric columns and the device name (empty if no device). */
2064 arg->len += sprintf(arg->buffer + arg->len,
2065 " %08x %08x %08x %08x %8s\n",
2066 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2067 rt->u.dst.__use, rt->rt6i_flags,
2068 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/*
 * rt6_proc_info - read handler behind the "ipv6_route" proc entry.
 * @buffer: output buffer
 * @start:  out-parameter adjusted to where the caller's data begins
 * @offset: byte offset into the virtual file
 * @length: number of bytes requested
 *
 * Formats routes with rt6_info_route() over the whole table under the read
 * side of rt6_lock. Because rows are emitted whole, the sub-row part of
 * @offset (offset % RT6_INFO_LEN) is added to *start and subtracted from
 * the produced length; the result is compared against @length afterwards.
 */
2072 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2074 struct rt6_proc_arg arg;
2075 arg.buffer = buffer;
2076 arg.offset = offset;
2077 arg.length = length;
2081 read_lock_bh(&rt6_lock);
2082 fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2083 read_unlock_bh(&rt6_lock);
/* Skip the leading fragment of the first row the caller already has. */
2087 *start += offset % RT6_INFO_LEN;
2089 arg.len -= offset % RT6_INFO_LEN;
2091 if (arg.len > length)
/*
 * rt6_stats_seq_show - seq_file show routine for the "rt6_stats" entry.
 * @seq: seq_file to print into
 * @v:   iterator cookie (not used in the visible code)
 *
 * Prints one line of seven hexadecimal fields, in this order: fib_nodes,
 * fib_route_nodes, fib_rt_alloc, fib_rt_entries, fib_rt_cache, the current
 * ip6_dst_ops entry count, and fib_discarded_routes.
 */
2099 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2101 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2102 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2103 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2104 rt6_stats.fib_rt_cache,
2105 atomic_read(&ip6_dst_ops.entries),
2106 rt6_stats.fib_discarded_routes);
2111 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2113 return single_open(file, rt6_stats_seq_show, NULL);
/*
 * File operations for the "rt6_stats" proc entry: a single-record seq_file
 * opened through rt6_stats_seq_open() and released with single_release().
 */
2116 static struct file_operations rt6_stats_seq_fops = {
2117 .owner = THIS_MODULE,
2118 .open = rt6_stats_seq_open,
2120 .llseek = seq_lseek,
2121 .release = single_release,
2123 #endif /* CONFIG_PROC_FS */
2125 #ifdef CONFIG_SYSCTL
/* Backing store for the "flush" sysctl; read by ipv6_sysctl_rtcache_flush()
 * as the argument for fib6_run_gc(). */
2127 static int flush_delay;
/*
 * ipv6_sysctl_rtcache_flush - proc handler for the "flush" sysctl.
 *
 * Parameters are the standard sysctl proc-handler arguments and are passed
 * straight to proc_dointvec(), which stores the written integer in
 * flush_delay (the entry's .data). Garbage collection is then run with
 * that value; a value <= 0 is mapped to ~0UL.
 *
 * NOTE(review): the return value of proc_dointvec() is not checked in the
 * visible code, so a failed parse would still trigger a GC run with the
 * previous flush_delay - confirm this is acceptable.
 */
2130 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2131 void __user *buffer, size_t *lenp, loff_t *ppos)
2134 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2135 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
/*
 * sysctl table for the IPv6 routing tunables. Each entry binds a procname
 * to one of the file-scope variables (or to ip6_dst_ops.gc_thresh) and
 * names the handler used to read/write it. Entries handled by
 * proc_dointvec_jiffies / sysctl_jiffies are converted between user-visible
 * seconds and internal jiffies; proc_dointvec_ms_jiffies / sysctl_ms_jiffies
 * do the same with milliseconds.
 */
2141 ctl_table ipv6_route_table[] = {
/* Writing triggers a GC run via ipv6_sysctl_rtcache_flush(). */
2143 .ctl_name = NET_IPV6_ROUTE_FLUSH,
2144 .procname = "flush",
2145 .data = &flush_delay,
2146 .maxlen = sizeof(int),
2148 .proc_handler = &ipv6_sysctl_rtcache_flush
2151 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2152 .procname = "gc_thresh",
2153 .data = &ip6_dst_ops.gc_thresh,
2154 .maxlen = sizeof(int),
2156 .proc_handler = &proc_dointvec,
2159 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2160 .procname = "max_size",
2161 .data = &ip6_rt_max_size,
2162 .maxlen = sizeof(int),
2164 .proc_handler = &proc_dointvec,
/* Same variable as "gc_min_interval_ms" below, exposed in seconds. */
2167 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2168 .procname = "gc_min_interval",
2169 .data = &ip6_rt_gc_min_interval,
2170 .maxlen = sizeof(int),
2172 .proc_handler = &proc_dointvec_jiffies,
2173 .strategy = &sysctl_jiffies,
2176 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2177 .procname = "gc_timeout",
2178 .data = &ip6_rt_gc_timeout,
2179 .maxlen = sizeof(int),
2181 .proc_handler = &proc_dointvec_jiffies,
2182 .strategy = &sysctl_jiffies,
2185 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2186 .procname = "gc_interval",
2187 .data = &ip6_rt_gc_interval,
2188 .maxlen = sizeof(int),
2190 .proc_handler = &proc_dointvec_jiffies,
2191 .strategy = &sysctl_jiffies,
/* NOTE(review): ip6_rt_gc_elasticity is initialised to the plain number 9,
 * not to a multiple of HZ like the interval variables, yet it is exposed
 * through the jiffies-converting handlers. That would scale values written
 * by user space by HZ. Looks like it should use proc_dointvec with no
 * jiffies strategy - confirm and fix. */
2194 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2195 .procname = "gc_elasticity",
2196 .data = &ip6_rt_gc_elasticity,
2197 .maxlen = sizeof(int),
2199 .proc_handler = &proc_dointvec_jiffies,
2200 .strategy = &sysctl_jiffies,
2203 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2204 .procname = "mtu_expires",
2205 .data = &ip6_rt_mtu_expires,
2206 .maxlen = sizeof(int),
2208 .proc_handler = &proc_dointvec_jiffies,
2209 .strategy = &sysctl_jiffies,
/* NOTE(review): ip6_rt_min_advmss is initialised from IPV6_MIN_MTU - 20 - 40,
 * i.e. a byte count rather than a time, yet it is exposed through the
 * jiffies-converting handlers. Looks like it should use proc_dointvec with
 * no jiffies strategy - confirm and fix. */
2212 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2213 .procname = "min_adv_mss",
2214 .data = &ip6_rt_min_advmss,
2215 .maxlen = sizeof(int),
2217 .proc_handler = &proc_dointvec_jiffies,
2218 .strategy = &sysctl_jiffies,
/* Millisecond view of the same variable as "gc_min_interval". */
2221 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2222 .procname = "gc_min_interval_ms",
2223 .data = &ip6_rt_gc_min_interval,
2224 .maxlen = sizeof(int),
2226 .proc_handler = &proc_dointvec_ms_jiffies,
2227 .strategy = &sysctl_ms_jiffies,
/*
 * ip6_route_init - boot-time setup of the IPv6 routing layer.
 *
 * Creates the "ip6_dst_cache" slab cache sized for struct rt6_info
 * (hardware-cache aligned) and panics if that fails. With CONFIG_PROC_FS
 * it also registers the "ipv6_route" listing, backed by rt6_proc_info(),
 * and the world-readable "rt6_stats" file.
 */
2234 void __init ip6_route_init(void)
2236 struct proc_dir_entry *p;
2238 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2239 sizeof(struct rt6_info),
2240 0, SLAB_HWCACHE_ALIGN,
/* The routing code cannot work without its dst cache. */
2242 if (!ip6_dst_ops.kmem_cachep)
2243 panic("cannot create ip6_dst_cache");
2246 #ifdef CONFIG_PROC_FS
2247 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2249 p->owner = THIS_MODULE;
2251 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
/*
 * ip6_route_cleanup - undo ip6_route_init().
 *
 * Removes the "ipv6_route" and "rt6_stats" proc entries (when
 * CONFIG_PROC_FS is set) and destroys the ip6_dst_ops slab cache.
 */
2258 void ip6_route_cleanup(void)
2260 #ifdef CONFIG_PROC_FS
2261 proc_net_remove("ipv6_route");
2262 proc_net_remove("rt6_stats");
/* Destroyed after the proc entries that could still reference routes are
 * gone (ordering as written). */
2269 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);