[IPV6]: ROUTE: Add experimental support for Route Information Option in RA (RFC4191).
[firefly-linux-kernel-4.4.55.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/config.h>
29 #include <linux/errno.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/init.h>
39 #include <linux/netlink.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG() and
 * RT6_TRACE() expand to printk() calls; at lower levels both expand to
 * nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, ip6_route_input()/ip6_route_output() clone routes that
 * already have a nexthop (or are RTF_NONEXTHOP) via rt6_alloc_clone();
 * when zero they use the selected route directly.
 */
#define CLONE_OFFLINK_ROUTE 0

/* Bits of the "strict" argument of rt6_select() / rt6_score_route(). */
#define RT6_SELECT_F_IFACE	0x1	/* route must match the interface  */
#define RT6_SELECT_F_REACHABLE	0x2	/* nexthop must be NUD_VALID       */
79
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(void);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct sk_buff *skb);
98 static void             ip6_link_failure(struct sk_buff *skb);
99 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103                                            struct in6_addr *gwaddr, int ifindex,
104                                            unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106                                            struct in6_addr *gwaddr, int ifindex);
107 #endif
108
109 static struct dst_ops ip6_dst_ops = {
110         .family                 =       AF_INET6,
111         .protocol               =       __constant_htons(ETH_P_IPV6),
112         .gc                     =       ip6_dst_gc,
113         .gc_thresh              =       1024,
114         .check                  =       ip6_dst_check,
115         .destroy                =       ip6_dst_destroy,
116         .ifdown                 =       ip6_dst_ifdown,
117         .negative_advice        =       ip6_negative_advice,
118         .link_failure           =       ip6_link_failure,
119         .update_pmtu            =       ip6_rt_update_pmtu,
120         .entry_size             =       sizeof(struct rt6_info),
121 };
122
123 struct rt6_info ip6_null_entry = {
124         .u = {
125                 .dst = {
126                         .__refcnt       = ATOMIC_INIT(1),
127                         .__use          = 1,
128                         .dev            = &loopback_dev,
129                         .obsolete       = -1,
130                         .error          = -ENETUNREACH,
131                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
132                         .input          = ip6_pkt_discard,
133                         .output         = ip6_pkt_discard_out,
134                         .ops            = &ip6_dst_ops,
135                         .path           = (struct dst_entry*)&ip6_null_entry,
136                 }
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
143 struct fib6_node ip6_routing_table = {
144         .leaf           = &ip6_null_entry,
145         .fn_flags       = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147
148 /* Protects all the ip6 fib */
149
150 DEFINE_RWLOCK(rt6_lock);
151
152
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161         struct rt6_info *rt = (struct rt6_info *)dst;
162         struct inet6_dev *idev = rt->rt6i_idev;
163
164         if (idev != NULL) {
165                 rt->rt6i_idev = NULL;
166                 in6_dev_put(idev);
167         }       
168 }
169
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171                            int how)
172 {
173         struct rt6_info *rt = (struct rt6_info *)dst;
174         struct inet6_dev *idev = rt->rt6i_idev;
175
176         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178                 if (loopback_idev != NULL) {
179                         rt->rt6i_idev = loopback_idev;
180                         in6_dev_put(idev);
181                 }
182         }
183 }
184
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187         return (rt->rt6i_flags & RTF_EXPIRES &&
188                 time_after(jiffies, rt->rt6i_expires));
189 }
190
191 /*
192  *      Route lookup. Any rt6_lock is implied.
193  */
194
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196                                                     int oif,
197                                                     int strict)
198 {
199         struct rt6_info *local = NULL;
200         struct rt6_info *sprt;
201
202         if (oif) {
203                 for (sprt = rt; sprt; sprt = sprt->u.next) {
204                         struct net_device *dev = sprt->rt6i_dev;
205                         if (dev->ifindex == oif)
206                                 return sprt;
207                         if (dev->flags & IFF_LOOPBACK) {
208                                 if (sprt->rt6i_idev == NULL ||
209                                     sprt->rt6i_idev->dev->ifindex != oif) {
210                                         if (strict && oif)
211                                                 continue;
212                                         if (local && (!oif || 
213                                                       local->rt6i_idev->dev->ifindex == oif))
214                                                 continue;
215                                 }
216                                 local = sprt;
217                         }
218                 }
219
220                 if (local)
221                         return local;
222
223                 if (strict)
224                         return &ip6_null_entry;
225         }
226         return rt;
227 }
228
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If @rt has a nexthop neighbour that is not in a NUD_VALID state, and at
 * least rtr_probe_interval has elapsed since the neighbour entry was last
 * updated, send a Neighbour Solicitation for it to its solicited-node
 * multicast address.  The probe MUST be rate-limited (no more than one
 * per minute); stamping neigh->updated before sending does that.
 *
 * A NULL @rt, or a route without a nexthop, is silently ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int due;

	/* cheap unlocked test first */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/* re-check under the neighbour lock and claim this probe slot */
	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	/* the solicitation itself is sent with the lock dropped */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without router preference support, probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
264
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270         struct net_device *dev = rt->rt6i_dev;
271         if (!oif || dev->ifindex == oif)
272                 return 2;
273         if ((dev->flags & IFF_LOOPBACK) &&
274             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275                 return 1;
276         return 0;
277 }
278
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281         struct neighbour *neigh = rt->rt6i_nexthop;
282         int m = 0;
283         if (neigh) {
284                 read_lock_bh(&neigh->lock);
285                 if (neigh->nud_state & NUD_VALID)
286                         m = 1;
287                 read_unlock_bh(&neigh->lock);
288         }
289         return m;
290 }
291
292 static int rt6_score_route(struct rt6_info *rt, int oif,
293                            int strict)
294 {
295         int m = rt6_check_dev(rt, oif);
296         if (!m && (strict & RT6_SELECT_F_IFACE))
297                 return -1;
298 #ifdef CONFIG_IPV6_ROUTER_PREF
299         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
300 #endif
301         if (rt6_check_neigh(rt))
302                 m |= 16;
303         else if (strict & RT6_SELECT_F_REACHABLE)
304                 return -1;
305         return m;
306 }
307
308 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
309                                    int strict)
310 {
311         struct rt6_info *match = NULL, *last = NULL;
312         struct rt6_info *rt, *rt0 = *head;
313         u32 metric;
314         int mpri = -1;
315
316         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
317                   __FUNCTION__, head, head ? *head : NULL, oif);
318
319         for (rt = rt0, metric = rt0->rt6i_metric;
320              rt && rt->rt6i_metric == metric;
321              rt = rt->u.next) {
322                 int m;
323
324                 if (rt6_check_expired(rt))
325                         continue;
326
327                 last = rt;
328
329                 m = rt6_score_route(rt, oif, strict);
330                 if (m < 0)
331                         continue;
332
333                 if (m > mpri) {
334                         rt6_probe(match);
335                         match = rt;
336                         mpri = m;
337                 } else {
338                         rt6_probe(rt);
339                 }
340         }
341
342         if (!match &&
343             (strict & RT6_SELECT_F_REACHABLE) &&
344             last && last != rt0) {
345                 /* no entries matched; do round-robin */
346                 *head = rt0->u.next;
347                 rt0->u.next = last->u.next;
348                 last->u.next = rt0;
349         }
350
351         RT6_TRACE("%s() => %p, score=%d\n",
352                   __FUNCTION__, match, mpri);
353
354         return (match ? match : &ip6_null_entry);
355 }
356
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information Option (RFC 4191) received in a Router
 * Advertisement.
 *
 * @dev:    device the advertisement arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as gateway
 *
 * A zero lifetime deletes a matching route, a non-zero lifetime creates
 * or refreshes it, and 0xffffffff means the route never expires.
 *
 * Returns 0 on success (including when there was nothing to do) and
 * -EINVAL when the option is malformed.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	/*
	 * Test the sign explicitly: comparing a negative int against
	 * sizeof() would convert it to a huge unsigned value and let it pass.
	 */
	if (len < 0 || (size_t)len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length (RFC 4191, 2.3).
	 * Length is in units of 8 octets and includes the 8-octet fixed
	 * part, so 1, 2 and 3 carry 0, 8 and 16 prefix octets.  The prefix
	 * bits read below must be present in the option:
	 *   prefix_len > 64  requires length 3,
	 *   prefix_len > 0   requires length 2 or 3.
	 * NOTE(review): this assumes the caller has already verified that
	 * @len covers rinfo->length * 8 octets - confirm at the call site.
	 */
	if (rinfo->length < 1 || rinfo->length > 3)
		return -EINVAL;
	if (rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* the lifetime arrives in network byte order */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow in HZ * lifetime below */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/*
		 * Fewer than 16 prefix octets are present: build a full
		 * address in prefix_buf from the first prefix_len bits.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* lifetime 0 withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		/* refresh the preference of an existing route */
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
434
435 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
436                             int oif, int strict)
437 {
438         struct fib6_node *fn;
439         struct rt6_info *rt;
440
441         read_lock_bh(&rt6_lock);
442         fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
443         rt = rt6_device_match(fn->leaf, oif, strict);
444         dst_hold(&rt->u.dst);
445         rt->u.dst.__use++;
446         read_unlock_bh(&rt6_lock);
447
448         rt->u.dst.lastuse = jiffies;
449         if (rt->u.dst.error == 0)
450                 return rt;
451         dst_release(&rt->u.dst);
452         return NULL;
453 }
454
455 /* ip6_ins_rt is called with FREE rt6_lock.
456    It takes new route entry, the addition fails by any reason the
457    route is freed. In any case, if caller does not hold it, it may
458    be destroyed.
459  */
460
461 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
462                 void *_rtattr, struct netlink_skb_parms *req)
463 {
464         int err;
465
466         write_lock_bh(&rt6_lock);
467         err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
468         write_unlock_bh(&rt6_lock);
469
470         return err;
471 }
472
473 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
474                                       struct in6_addr *saddr)
475 {
476         struct rt6_info *rt;
477
478         /*
479          *      Clone the route.
480          */
481
482         rt = ip6_rt_copy(ort);
483
484         if (rt) {
485                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
486                         if (rt->rt6i_dst.plen != 128 &&
487                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
488                                 rt->rt6i_flags |= RTF_ANYCAST;
489                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
490                 }
491
492                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
493                 rt->rt6i_dst.plen = 128;
494                 rt->rt6i_flags |= RTF_CACHE;
495                 rt->u.dst.flags |= DST_HOST;
496
497 #ifdef CONFIG_IPV6_SUBTREES
498                 if (rt->rt6i_src.plen && saddr) {
499                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
500                         rt->rt6i_src.plen = 128;
501                 }
502 #endif
503
504                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
505
506         }
507
508         return rt;
509 }
510
511 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
512 {
513         struct rt6_info *rt = ip6_rt_copy(ort);
514         if (rt) {
515                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
516                 rt->rt6i_dst.plen = 128;
517                 rt->rt6i_flags |= RTF_CACHE;
518                 if (rt->rt6i_flags & RTF_REJECT)
519                         rt->u.dst.error = ort->u.dst.error;
520                 rt->u.dst.flags |= DST_HOST;
521                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
522         }
523         return rt;
524 }
525
/*
 * If the selection at the current node produced the null entry, climb
 * towards the root looking for another node to try.
 *
 * Must be expanded inside a function that has local variables "rt" and
 * "fn" and labels "out" and "restart":
 *   - reaching a root node jumps to "out" (give up);
 *   - reaching a node with RTN_RTINFO jumps to "restart" (select again);
 *   - running out of parents falls through with fn == NULL.
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry) { \
		for (fn = fn->parent; fn != NULL; fn = fn->parent) { \
			if (fn->fn_flags & RTN_ROOT) \
				goto out; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
536
537
538 void ip6_route_input(struct sk_buff *skb)
539 {
540         struct fib6_node *fn;
541         struct rt6_info *rt, *nrt;
542         int strict;
543         int attempts = 3;
544         int err;
545         int reachable = RT6_SELECT_F_REACHABLE;
546
547         strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
548
549 relookup:
550         read_lock_bh(&rt6_lock);
551
552 restart_2:
553         fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
554                          &skb->nh.ipv6h->saddr);
555
556 restart:
557         rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
558         BACKTRACK();
559         if (rt == &ip6_null_entry ||
560             rt->rt6i_flags & RTF_CACHE)
561                 goto out;
562
563         dst_hold(&rt->u.dst);
564         read_unlock_bh(&rt6_lock);
565
566         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
567                 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
568         else {
569 #if CLONE_OFFLINK_ROUTE
570                 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
571 #else
572                 goto out2;
573 #endif
574         }
575
576         dst_release(&rt->u.dst);
577         rt = nrt ? : &ip6_null_entry;
578
579         dst_hold(&rt->u.dst);
580         if (nrt) {
581                 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
582                 if (!err)
583                         goto out2;
584         }
585
586         if (--attempts <= 0)
587                 goto out2;
588
589         /*
590          * Race condition! In the gap, when rt6_lock was
591          * released someone could insert this route.  Relookup.
592          */
593         dst_release(&rt->u.dst);
594         goto relookup;
595
596 out:
597         if (reachable) {
598                 reachable = 0;
599                 goto restart_2;
600         }
601         dst_hold(&rt->u.dst);
602         read_unlock_bh(&rt6_lock);
603 out2:
604         rt->u.dst.lastuse = jiffies;
605         rt->u.dst.__use++;
606         skb->dst = (struct dst_entry *) rt;
607         return;
608 }
609
610 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
611 {
612         struct fib6_node *fn;
613         struct rt6_info *rt, *nrt;
614         int strict;
615         int attempts = 3;
616         int err;
617         int reachable = RT6_SELECT_F_REACHABLE;
618
619         strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
620
621 relookup:
622         read_lock_bh(&rt6_lock);
623
624 restart_2:
625         fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
626
627 restart:
628         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
629         BACKTRACK();
630         if (rt == &ip6_null_entry ||
631             rt->rt6i_flags & RTF_CACHE)
632                 goto out;
633
634         dst_hold(&rt->u.dst);
635         read_unlock_bh(&rt6_lock);
636
637         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
638                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
639         else {
640 #if CLONE_OFFLINK_ROUTE
641                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
642 #else
643                 goto out2;
644 #endif
645         }
646
647         dst_release(&rt->u.dst);
648         rt = nrt ? : &ip6_null_entry;
649
650         dst_hold(&rt->u.dst);
651         if (nrt) {
652                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
653                 if (!err)
654                         goto out2;
655         }
656
657         if (--attempts <= 0)
658                 goto out2;
659
660         /*
661          * Race condition! In the gap, when rt6_lock was
662          * released someone could insert this route.  Relookup.
663          */
664         dst_release(&rt->u.dst);
665         goto relookup;
666
667 out:
668         if (reachable) {
669                 reachable = 0;
670                 goto restart_2;
671         }
672         dst_hold(&rt->u.dst);
673         read_unlock_bh(&rt6_lock);
674 out2:
675         rt->u.dst.lastuse = jiffies;
676         rt->u.dst.__use++;
677         return &rt->u.dst;
678 }
679
680
681 /*
682  *      Destination cache support functions
683  */
684
685 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
686 {
687         struct rt6_info *rt;
688
689         rt = (struct rt6_info *) dst;
690
691         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
692                 return dst;
693
694         return NULL;
695 }
696
697 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
698 {
699         struct rt6_info *rt = (struct rt6_info *) dst;
700
701         if (rt) {
702                 if (rt->rt6i_flags & RTF_CACHE)
703                         ip6_del_rt(rt, NULL, NULL, NULL);
704                 else
705                         dst_release(dst);
706         }
707         return NULL;
708 }
709
710 static void ip6_link_failure(struct sk_buff *skb)
711 {
712         struct rt6_info *rt;
713
714         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
715
716         rt = (struct rt6_info *) skb->dst;
717         if (rt) {
718                 if (rt->rt6i_flags&RTF_CACHE) {
719                         dst_set_expires(&rt->u.dst, 0);
720                         rt->rt6i_flags |= RTF_EXPIRES;
721                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
722                         rt->rt6i_node->fn_sernum = -1;
723         }
724 }
725
726 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
727 {
728         struct rt6_info *rt6 = (struct rt6_info*)dst;
729
730         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
731                 rt6->rt6i_flags |= RTF_MODIFIED;
732                 if (mtu < IPV6_MIN_MTU) {
733                         mtu = IPV6_MIN_MTU;
734                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
735                 }
736                 dst->metrics[RTAX_MTU-1] = mtu;
737         }
738 }
739
/*
 * Singly linked list (through dst->next) of the dst entries created by
 * ndisc_dst_alloc(); ndisc_dst_gc() frees the unreferenced ones.
 * Protected by rt6_lock.
 */
static struct dst_entry *ndisc_dst_gc_list;

/* Defined further down; needed by ndisc_dst_alloc(). */
static int ipv6_get_mtu(struct net_device *dev);
743
744 static inline unsigned int ipv6_advmss(unsigned int mtu)
745 {
746         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
747
748         if (mtu < ip6_rt_min_advmss)
749                 mtu = ip6_rt_min_advmss;
750
751         /*
752          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
753          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
754          * IPV6_MAXPLEN is also valid and means: "any MSS, 
755          * rely only on pmtu discovery"
756          */
757         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
758                 mtu = IPV6_MAXPLEN;
759         return mtu;
760 }
761
762 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
763                                   struct neighbour *neigh,
764                                   struct in6_addr *addr,
765                                   int (*output)(struct sk_buff *))
766 {
767         struct rt6_info *rt;
768         struct inet6_dev *idev = in6_dev_get(dev);
769
770         if (unlikely(idev == NULL))
771                 return NULL;
772
773         rt = ip6_dst_alloc();
774         if (unlikely(rt == NULL)) {
775                 in6_dev_put(idev);
776                 goto out;
777         }
778
779         dev_hold(dev);
780         if (neigh)
781                 neigh_hold(neigh);
782         else
783                 neigh = ndisc_get_neigh(dev, addr);
784
785         rt->rt6i_dev      = dev;
786         rt->rt6i_idev     = idev;
787         rt->rt6i_nexthop  = neigh;
788         atomic_set(&rt->u.dst.__refcnt, 1);
789         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
790         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
791         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
792         rt->u.dst.output  = output;
793
794 #if 0   /* there's no chance to use these for ndisc */
795         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
796                                 ? DST_HOST 
797                                 : 0;
798         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
799         rt->rt6i_dst.plen = 128;
800 #endif
801
802         write_lock_bh(&rt6_lock);
803         rt->u.dst.next = ndisc_dst_gc_list;
804         ndisc_dst_gc_list = &rt->u.dst;
805         write_unlock_bh(&rt6_lock);
806
807         fib6_force_start_gc();
808
809 out:
810         return (struct dst_entry *)rt;
811 }
812
813 int ndisc_dst_gc(int *more)
814 {
815         struct dst_entry *dst, *next, **pprev;
816         int freed;
817
818         next = NULL;
819         pprev = &ndisc_dst_gc_list;
820         freed = 0;
821         while ((dst = *pprev) != NULL) {
822                 if (!atomic_read(&dst->__refcnt)) {
823                         *pprev = dst->next;
824                         dst_free(dst);
825                         freed++;
826                 } else {
827                         pprev = &dst->next;
828                         (*more)++;
829                 }
830         }
831
832         return freed;
833 }
834
835 static int ip6_dst_gc(void)
836 {
837         static unsigned expire = 30*HZ;
838         static unsigned long last_gc;
839         unsigned long now = jiffies;
840
841         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
842             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
843                 goto out;
844
845         expire++;
846         fib6_run_gc(expire);
847         last_gc = now;
848         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
849                 expire = ip6_rt_gc_timeout>>1;
850
851 out:
852         expire -= expire>>ip6_rt_gc_elasticity;
853         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
854 }
855
856 /* Clean host part of a prefix. Not necessary in radix tree,
857    but results in cleaner routing tables.
858
859    Remove it only when all the things will work!
860  */
861
862 static int ipv6_get_mtu(struct net_device *dev)
863 {
864         int mtu = IPV6_MIN_MTU;
865         struct inet6_dev *idev;
866
867         idev = in6_dev_get(dev);
868         if (idev) {
869                 mtu = idev->cnf.mtu6;
870                 in6_dev_put(idev);
871         }
872         return mtu;
873 }
874
875 int ipv6_get_hoplimit(struct net_device *dev)
876 {
877         int hoplimit = ipv6_devconf.hop_limit;
878         struct inet6_dev *idev;
879
880         idev = in6_dev_get(dev);
881         if (idev) {
882                 hoplimit = idev->cnf.hop_limit;
883                 in6_dev_put(idev);
884         }
885         return hoplimit;
886 }
887
888 /*
889  *
890  */
891
892 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
893                 void *_rtattr, struct netlink_skb_parms *req)
894 {
895         int err;
896         struct rtmsg *r;
897         struct rtattr **rta;
898         struct rt6_info *rt = NULL;
899         struct net_device *dev = NULL;
900         struct inet6_dev *idev = NULL;
901         int addr_type;
902
903         rta = (struct rtattr **) _rtattr;
904
905         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
906                 return -EINVAL;
907 #ifndef CONFIG_IPV6_SUBTREES
908         if (rtmsg->rtmsg_src_len)
909                 return -EINVAL;
910 #endif
911         if (rtmsg->rtmsg_ifindex) {
912                 err = -ENODEV;
913                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
914                 if (!dev)
915                         goto out;
916                 idev = in6_dev_get(dev);
917                 if (!idev)
918                         goto out;
919         }
920
921         if (rtmsg->rtmsg_metric == 0)
922                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
923
924         rt = ip6_dst_alloc();
925
926         if (rt == NULL) {
927                 err = -ENOMEM;
928                 goto out;
929         }
930
931         rt->u.dst.obsolete = -1;
932         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
933         if (nlh && (r = NLMSG_DATA(nlh))) {
934                 rt->rt6i_protocol = r->rtm_protocol;
935         } else {
936                 rt->rt6i_protocol = RTPROT_BOOT;
937         }
938
939         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
940
941         if (addr_type & IPV6_ADDR_MULTICAST)
942                 rt->u.dst.input = ip6_mc_input;
943         else
944                 rt->u.dst.input = ip6_forward;
945
946         rt->u.dst.output = ip6_output;
947
948         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
949                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
950         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
951         if (rt->rt6i_dst.plen == 128)
952                rt->u.dst.flags = DST_HOST;
953
954 #ifdef CONFIG_IPV6_SUBTREES
955         ipv6_addr_prefix(&rt->rt6i_src.addr, 
956                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
957         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
958 #endif
959
960         rt->rt6i_metric = rtmsg->rtmsg_metric;
961
962         /* We cannot add true routes via loopback here,
963            they would result in kernel looping; promote them to reject routes
964          */
965         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
966             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
967                 /* hold loopback dev/idev if we haven't done so. */
968                 if (dev != &loopback_dev) {
969                         if (dev) {
970                                 dev_put(dev);
971                                 in6_dev_put(idev);
972                         }
973                         dev = &loopback_dev;
974                         dev_hold(dev);
975                         idev = in6_dev_get(dev);
976                         if (!idev) {
977                                 err = -ENODEV;
978                                 goto out;
979                         }
980                 }
981                 rt->u.dst.output = ip6_pkt_discard_out;
982                 rt->u.dst.input = ip6_pkt_discard;
983                 rt->u.dst.error = -ENETUNREACH;
984                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
985                 goto install_route;
986         }
987
988         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
989                 struct in6_addr *gw_addr;
990                 int gwa_type;
991
992                 gw_addr = &rtmsg->rtmsg_gateway;
993                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
994                 gwa_type = ipv6_addr_type(gw_addr);
995
996                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
997                         struct rt6_info *grt;
998
999                         /* IPv6 strictly inhibits using not link-local
1000                            addresses as nexthop address.
1001                            Otherwise, router will not able to send redirects.
1002                            It is very good, but in some (rare!) circumstances
1003                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1004                            some exceptions. --ANK
1005                          */
1006                         err = -EINVAL;
1007                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1008                                 goto out;
1009
1010                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1011
1012                         err = -EHOSTUNREACH;
1013                         if (grt == NULL)
1014                                 goto out;
1015                         if (dev) {
1016                                 if (dev != grt->rt6i_dev) {
1017                                         dst_release(&grt->u.dst);
1018                                         goto out;
1019                                 }
1020                         } else {
1021                                 dev = grt->rt6i_dev;
1022                                 idev = grt->rt6i_idev;
1023                                 dev_hold(dev);
1024                                 in6_dev_hold(grt->rt6i_idev);
1025                         }
1026                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1027                                 err = 0;
1028                         dst_release(&grt->u.dst);
1029
1030                         if (err)
1031                                 goto out;
1032                 }
1033                 err = -EINVAL;
1034                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1035                         goto out;
1036         }
1037
1038         err = -ENODEV;
1039         if (dev == NULL)
1040                 goto out;
1041
1042         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1043                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1044                 if (IS_ERR(rt->rt6i_nexthop)) {
1045                         err = PTR_ERR(rt->rt6i_nexthop);
1046                         rt->rt6i_nexthop = NULL;
1047                         goto out;
1048                 }
1049         }
1050
1051         rt->rt6i_flags = rtmsg->rtmsg_flags;
1052
1053 install_route:
1054         if (rta && rta[RTA_METRICS-1]) {
1055                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1056                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1057
1058                 while (RTA_OK(attr, attrlen)) {
1059                         unsigned flavor = attr->rta_type;
1060                         if (flavor) {
1061                                 if (flavor > RTAX_MAX) {
1062                                         err = -EINVAL;
1063                                         goto out;
1064                                 }
1065                                 rt->u.dst.metrics[flavor-1] =
1066                                         *(u32 *)RTA_DATA(attr);
1067                         }
1068                         attr = RTA_NEXT(attr, attrlen);
1069                 }
1070         }
1071
1072         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1073                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1074         if (!rt->u.dst.metrics[RTAX_MTU-1])
1075                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1076         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1077                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1078         rt->u.dst.dev = dev;
1079         rt->rt6i_idev = idev;
1080         return ip6_ins_rt(rt, nlh, _rtattr, req);
1081
1082 out:
1083         if (dev)
1084                 dev_put(dev);
1085         if (idev)
1086                 in6_dev_put(idev);
1087         if (rt)
1088                 dst_free((struct dst_entry *) rt);
1089         return err;
1090 }
1091
1092 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1093 {
1094         int err;
1095
1096         write_lock_bh(&rt6_lock);
1097
1098         err = fib6_del(rt, nlh, _rtattr, req);
1099         dst_release(&rt->u.dst);
1100
1101         write_unlock_bh(&rt6_lock);
1102
1103         return err;
1104 }
1105
1106 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1107 {
1108         struct fib6_node *fn;
1109         struct rt6_info *rt;
1110         int err = -ESRCH;
1111
1112         read_lock_bh(&rt6_lock);
1113
1114         fn = fib6_locate(&ip6_routing_table,
1115                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1116                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1117         
1118         if (fn) {
1119                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1120                         if (rtmsg->rtmsg_ifindex &&
1121                             (rt->rt6i_dev == NULL ||
1122                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1123                                 continue;
1124                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1125                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1126                                 continue;
1127                         if (rtmsg->rtmsg_metric &&
1128                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1129                                 continue;
1130                         dst_hold(&rt->u.dst);
1131                         read_unlock_bh(&rt6_lock);
1132
1133                         return ip6_del_rt(rt, nlh, _rtattr, req);
1134                 }
1135         }
1136         read_unlock_bh(&rt6_lock);
1137
1138         return err;
1139 }
1140
1141 /*
1142  *      Handle redirects
1143  */
1144 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1145                   struct neighbour *neigh, u8 *lladdr, int on_link)
1146 {
1147         struct rt6_info *rt, *nrt;
1148
1149         /* Locate old route to this destination. */
1150         rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1);
1151
1152         if (rt == NULL)
1153                 return;
1154
1155         if (neigh->dev != rt->rt6i_dev)
1156                 goto out;
1157
1158         /*
1159          * Current route is on-link; redirect is always invalid.
1160          * 
1161          * Seems, previous statement is not true. It could
1162          * be node, which looks for us as on-link (f.e. proxy ndisc)
1163          * But then router serving it might decide, that we should
1164          * know truth 8)8) --ANK (980726).
1165          */
1166         if (!(rt->rt6i_flags&RTF_GATEWAY))
1167                 goto out;
1168
1169         /*
1170          *      RFC 2461 specifies that redirects should only be
1171          *      accepted if they come from the nexthop to the target.
1172          *      Due to the way default routers are chosen, this notion
1173          *      is a bit fuzzy and one might need to check all default
1174          *      routers.
1175          */
1176         if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway)) {
1177                 if (rt->rt6i_flags & RTF_DEFAULT) {
1178                         struct rt6_info *rt1;
1179
1180                         read_lock(&rt6_lock);
1181                         for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) {
1182                                 if (ipv6_addr_equal(saddr, &rt1->rt6i_gateway)) {
1183                                         dst_hold(&rt1->u.dst);
1184                                         dst_release(&rt->u.dst);
1185                                         read_unlock(&rt6_lock);
1186                                         rt = rt1;
1187                                         goto source_ok;
1188                                 }
1189                         }
1190                         read_unlock(&rt6_lock);
1191                 }
1192                 if (net_ratelimit())
1193                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1194                                "for redirect target\n");
1195                 goto out;
1196         }
1197
1198 source_ok:
1199
1200         /*
1201          *      We have finally decided to accept it.
1202          */
1203
1204         neigh_update(neigh, lladdr, NUD_STALE, 
1205                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1206                      NEIGH_UPDATE_F_OVERRIDE|
1207                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1208                                      NEIGH_UPDATE_F_ISROUTER))
1209                      );
1210
1211         /*
1212          * Redirect received -> path was valid.
1213          * Look, redirects are sent only in response to data packets,
1214          * so that this nexthop apparently is reachable. --ANK
1215          */
1216         dst_confirm(&rt->u.dst);
1217
1218         /* Duplicate redirect: silently ignore. */
1219         if (neigh == rt->u.dst.neighbour)
1220                 goto out;
1221
1222         nrt = ip6_rt_copy(rt);
1223         if (nrt == NULL)
1224                 goto out;
1225
1226         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1227         if (on_link)
1228                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1229
1230         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1231         nrt->rt6i_dst.plen = 128;
1232         nrt->u.dst.flags |= DST_HOST;
1233
1234         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1235         nrt->rt6i_nexthop = neigh_clone(neigh);
1236         /* Reset pmtu, it may be better */
1237         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1238         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1239
1240         if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1241                 goto out;
1242
1243         if (rt->rt6i_flags&RTF_CACHE) {
1244                 ip6_del_rt(rt, NULL, NULL, NULL);
1245                 return;
1246         }
1247
1248 out:
1249         dst_release(&rt->u.dst);
1250         return;
1251 }
1252
1253 /*
1254  *      Handle ICMP "packet too big" messages
1255  *      i.e. Path MTU discovery
1256  */
1257
1258 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1259                         struct net_device *dev, u32 pmtu)
1260 {
1261         struct rt6_info *rt, *nrt;
1262         int allfrag = 0;
1263
1264         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1265         if (rt == NULL)
1266                 return;
1267
1268         if (pmtu >= dst_mtu(&rt->u.dst))
1269                 goto out;
1270
1271         if (pmtu < IPV6_MIN_MTU) {
1272                 /*
1273                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1274                  * MTU (1280) and a fragment header should always be included
1275                  * after a node receiving Too Big message reporting PMTU is
1276                  * less than the IPv6 Minimum Link MTU.
1277                  */
1278                 pmtu = IPV6_MIN_MTU;
1279                 allfrag = 1;
1280         }
1281
1282         /* New mtu received -> path was valid.
1283            They are sent only in response to data packets,
1284            so that this nexthop apparently is reachable. --ANK
1285          */
1286         dst_confirm(&rt->u.dst);
1287
1288         /* Host route. If it is static, it would be better
1289            not to override it, but add new one, so that
1290            when cache entry will expire old pmtu
1291            would return automatically.
1292          */
1293         if (rt->rt6i_flags & RTF_CACHE) {
1294                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1295                 if (allfrag)
1296                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1297                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1298                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1299                 goto out;
1300         }
1301
1302         /* Network route.
1303            Two cases are possible:
1304            1. It is connected route. Action: COW
1305            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1306          */
1307         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1308                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1309         else
1310                 nrt = rt6_alloc_clone(rt, daddr);
1311
1312         if (nrt) {
1313                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1314                 if (allfrag)
1315                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1316
1317                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1318                  * happened within 5 mins, the recommended timer is 10 mins.
1319                  * Here this route expiration time is set to ip6_rt_mtu_expires
1320                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1321                  * and detecting PMTU increase will be automatically happened.
1322                  */
1323                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1324                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1325
1326                 ip6_ins_rt(nrt, NULL, NULL, NULL);
1327         }
1328 out:
1329         dst_release(&rt->u.dst);
1330 }
1331
1332 /*
1333  *      Misc support functions
1334  */
1335
1336 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1337 {
1338         struct rt6_info *rt = ip6_dst_alloc();
1339
1340         if (rt) {
1341                 rt->u.dst.input = ort->u.dst.input;
1342                 rt->u.dst.output = ort->u.dst.output;
1343
1344                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1345                 rt->u.dst.dev = ort->u.dst.dev;
1346                 if (rt->u.dst.dev)
1347                         dev_hold(rt->u.dst.dev);
1348                 rt->rt6i_idev = ort->rt6i_idev;
1349                 if (rt->rt6i_idev)
1350                         in6_dev_hold(rt->rt6i_idev);
1351                 rt->u.dst.lastuse = jiffies;
1352                 rt->rt6i_expires = 0;
1353
1354                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1355                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1356                 rt->rt6i_metric = 0;
1357
1358                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1359 #ifdef CONFIG_IPV6_SUBTREES
1360                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1361 #endif
1362         }
1363         return rt;
1364 }
1365
1366 #ifdef CONFIG_IPV6_ROUTE_INFO
1367 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1368                                            struct in6_addr *gwaddr, int ifindex)
1369 {
1370         struct fib6_node *fn;
1371         struct rt6_info *rt = NULL;
1372
1373         write_lock_bh(&rt6_lock);
1374         fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1375         if (!fn)
1376                 goto out;
1377
1378         for (rt = fn->leaf; rt; rt = rt->u.next) {
1379                 if (rt->rt6i_dev->ifindex != ifindex)
1380                         continue;
1381                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1382                         continue;
1383                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1384                         continue;
1385                 dst_hold(&rt->u.dst);
1386                 break;
1387         }
1388 out:
1389         write_unlock_bh(&rt6_lock);
1390         return rt;
1391 }
1392
1393 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1394                                            struct in6_addr *gwaddr, int ifindex,
1395                                            unsigned pref)
1396 {
1397         struct in6_rtmsg rtmsg;
1398
1399         memset(&rtmsg, 0, sizeof(rtmsg));
1400         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1401         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1402         rtmsg.rtmsg_dst_len = prefixlen;
1403         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1404         rtmsg.rtmsg_metric = 1024;
1405         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1406         rtmsg.rtmsg_ifindex = ifindex;
1407
1408         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1409
1410         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1411 }
1412 #endif
1413
1414 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1415 {       
1416         struct rt6_info *rt;
1417         struct fib6_node *fn;
1418
1419         fn = &ip6_routing_table;
1420
1421         write_lock_bh(&rt6_lock);
1422         for (rt = fn->leaf; rt; rt=rt->u.next) {
1423                 if (dev == rt->rt6i_dev &&
1424                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1425                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1426                         break;
1427         }
1428         if (rt)
1429                 dst_hold(&rt->u.dst);
1430         write_unlock_bh(&rt6_lock);
1431         return rt;
1432 }
1433
1434 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1435                                      struct net_device *dev,
1436                                      unsigned int pref)
1437 {
1438         struct in6_rtmsg rtmsg;
1439
1440         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1441         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1442         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1443         rtmsg.rtmsg_metric = 1024;
1444         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1445                             RTF_PREF(pref);
1446
1447         rtmsg.rtmsg_ifindex = dev->ifindex;
1448
1449         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1450         return rt6_get_dflt_router(gwaddr, dev);
1451 }
1452
1453 void rt6_purge_dflt_routers(void)
1454 {
1455         struct rt6_info *rt;
1456
1457 restart:
1458         read_lock_bh(&rt6_lock);
1459         for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1460                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1461                         dst_hold(&rt->u.dst);
1462
1463                         read_unlock_bh(&rt6_lock);
1464
1465                         ip6_del_rt(rt, NULL, NULL, NULL);
1466
1467                         goto restart;
1468                 }
1469         }
1470         read_unlock_bh(&rt6_lock);
1471 }
1472
1473 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1474 {
1475         struct in6_rtmsg rtmsg;
1476         int err;
1477
1478         switch(cmd) {
1479         case SIOCADDRT:         /* Add a route */
1480         case SIOCDELRT:         /* Delete a route */
1481                 if (!capable(CAP_NET_ADMIN))
1482                         return -EPERM;
1483                 err = copy_from_user(&rtmsg, arg,
1484                                      sizeof(struct in6_rtmsg));
1485                 if (err)
1486                         return -EFAULT;
1487                         
1488                 rtnl_lock();
1489                 switch (cmd) {
1490                 case SIOCADDRT:
1491                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1492                         break;
1493                 case SIOCDELRT:
1494                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1495                         break;
1496                 default:
1497                         err = -EINVAL;
1498                 }
1499                 rtnl_unlock();
1500
1501                 return err;
1502         };
1503
1504         return -EINVAL;
1505 }
1506
1507 /*
1508  *      Drop the packet on the floor
1509  */
1510
1511 static int ip6_pkt_discard(struct sk_buff *skb)
1512 {
1513         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1514         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1515         kfree_skb(skb);
1516         return 0;
1517 }
1518
1519 static int ip6_pkt_discard_out(struct sk_buff *skb)
1520 {
1521         skb->dev = skb->dst->dev;
1522         return ip6_pkt_discard(skb);
1523 }
1524
1525 /*
1526  *      Allocate a dst for local (unicast / anycast) address.
1527  */
1528
1529 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1530                                     const struct in6_addr *addr,
1531                                     int anycast)
1532 {
1533         struct rt6_info *rt = ip6_dst_alloc();
1534
1535         if (rt == NULL)
1536                 return ERR_PTR(-ENOMEM);
1537
1538         dev_hold(&loopback_dev);
1539         in6_dev_hold(idev);
1540
1541         rt->u.dst.flags = DST_HOST;
1542         rt->u.dst.input = ip6_input;
1543         rt->u.dst.output = ip6_output;
1544         rt->rt6i_dev = &loopback_dev;
1545         rt->rt6i_idev = idev;
1546         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1547         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1548         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1549         rt->u.dst.obsolete = -1;
1550
1551         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1552         if (anycast)
1553                 rt->rt6i_flags |= RTF_ANYCAST;
1554         else
1555                 rt->rt6i_flags |= RTF_LOCAL;
1556         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1557         if (rt->rt6i_nexthop == NULL) {
1558                 dst_free((struct dst_entry *) rt);
1559                 return ERR_PTR(-ENOMEM);
1560         }
1561
1562         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1563         rt->rt6i_dst.plen = 128;
1564
1565         atomic_set(&rt->u.dst.__refcnt, 1);
1566
1567         return rt;
1568 }
1569
1570 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1571 {
1572         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1573             rt != &ip6_null_entry) {
1574                 RT6_TRACE("deleted by ifdown %p\n", rt);
1575                 return -1;
1576         }
1577         return 0;
1578 }
1579
1580 void rt6_ifdown(struct net_device *dev)
1581 {
1582         write_lock_bh(&rt6_lock);
1583         fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1584         write_unlock_bh(&rt6_lock);
1585 }
1586
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
1592
1593 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1594 {
1595         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1596         struct inet6_dev *idev;
1597
1598         /* In IPv6 pmtu discovery is not optional,
1599            so that RTAX_MTU lock cannot disable it.
1600            We still use this lock to block changes
1601            caused by addrconf/ndisc.
1602         */
1603
1604         idev = __in6_dev_get(arg->dev);
1605         if (idev == NULL)
1606                 return 0;
1607
1608         /* For administrative MTU increase, there is no way to discover
1609            IPv6 PMTU increase, so PMTU increase should be updated here.
1610            Since RFC 1981 doesn't include administrative MTU increase
1611            update PMTU increase is a MUST. (i.e. jumbo frame)
1612          */
1613         /*
1614            If new MTU is less than route PMTU, this new MTU will be the
1615            lowest MTU in the path, update the route PMTU to reflect PMTU
1616            decreases; if new MTU is greater than route PMTU, and the
1617            old MTU is the lowest MTU in the path, update the route PMTU
1618            to reflect the increase. In this case if the other nodes' MTU
1619            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1620            PMTU discouvery.
1621          */
1622         if (rt->rt6i_dev == arg->dev &&
1623             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1624             (dst_mtu(&rt->u.dst) > arg->mtu ||
1625              (dst_mtu(&rt->u.dst) < arg->mtu &&
1626               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1627                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1628         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1629         return 0;
1630 }
1631
1632 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1633 {
1634         struct rt6_mtu_change_arg arg;
1635
1636         arg.dev = dev;
1637         arg.mtu = mtu;
1638         read_lock_bh(&rt6_lock);
1639         fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1640         read_unlock_bh(&rt6_lock);
1641 }
1642
/*
 * Convert a netlink route request into the in6_rtmsg form consumed by
 * ip6_route_add() / ip6_route_del().
 *
 * @r:     rtmsg header of the netlink request.
 * @rta:   attribute table indexed by (attribute type - 1); absent
 *         attributes are NULL.
 * @rtmsg: output; zeroed first, then filled from @r and @rta.
 *
 * Returns 0 on success, or -EINVAL when a prefix length exceeds 128 bits
 * or an attribute has an unexpected size.
 */
static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
			      struct in6_rtmsg *rtmsg)
{
	memset(rtmsg, 0, sizeof(*rtmsg));

	/*
	 * The prefix lengths size the address copies below; anything above
	 * 128 bits would copy more than the 16 bytes of an in6_addr.
	 */
	if (r->rtm_dst_len > 128 || r->rtm_src_len > 128)
		return -EINVAL;

	rtmsg->rtmsg_dst_len = r->rtm_dst_len;
	rtmsg->rtmsg_src_len = r->rtm_src_len;
	rtmsg->rtmsg_flags = RTF_UP;
	if (r->rtm_type == RTN_UNREACHABLE)
		rtmsg->rtmsg_flags |= RTF_REJECT;

	if (rta[RTA_GATEWAY-1]) {
		if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
			return -EINVAL;
		memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
		rtmsg->rtmsg_flags |= RTF_GATEWAY;
	}
	if (rta[RTA_DST-1]) {
		/* Only the bytes covered by the prefix length are required. */
		if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
			return -EINVAL;
		memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
	}
	if (rta[RTA_SRC-1]) {
		if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
			return -EINVAL;
		memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
	}
	if (rta[RTA_OIF-1]) {
		if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
			return -EINVAL;
		memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
	}
	if (rta[RTA_PRIORITY-1]) {
		if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
			return -EINVAL;
		memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
	}
	return 0;
}
1682
1683 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1684 {
1685         struct rtmsg *r = NLMSG_DATA(nlh);
1686         struct in6_rtmsg rtmsg;
1687
1688         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1689                 return -EINVAL;
1690         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1691 }
1692
1693 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1694 {
1695         struct rtmsg *r = NLMSG_DATA(nlh);
1696         struct in6_rtmsg rtmsg;
1697
1698         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1699                 return -EINVAL;
1700         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1701 }
1702
/* Pairs a netlink skb with its netlink callback for the route dump code. */
struct rt6_rtnl_dump_arg
{
	struct sk_buff *skb;		/* netlink buffer */
	struct netlink_callback *cb;	/* netlink callback */
};
1708
1709 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1710                          struct in6_addr *dst, struct in6_addr *src,
1711                          int iif, int type, u32 pid, u32 seq,
1712                          int prefix, unsigned int flags)
1713 {
1714         struct rtmsg *rtm;
1715         struct nlmsghdr  *nlh;
1716         unsigned char    *b = skb->tail;
1717         struct rta_cacheinfo ci;
1718
1719         if (prefix) {   /* user wants prefix routes only */
1720                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1721                         /* success since this is not a prefix route */
1722                         return 1;
1723                 }
1724         }
1725
1726         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1727         rtm = NLMSG_DATA(nlh);
1728         rtm->rtm_family = AF_INET6;
1729         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1730         rtm->rtm_src_len = rt->rt6i_src.plen;
1731         rtm->rtm_tos = 0;
1732         rtm->rtm_table = RT_TABLE_MAIN;
1733         if (rt->rt6i_flags&RTF_REJECT)
1734                 rtm->rtm_type = RTN_UNREACHABLE;
1735         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1736                 rtm->rtm_type = RTN_LOCAL;
1737         else
1738                 rtm->rtm_type = RTN_UNICAST;
1739         rtm->rtm_flags = 0;
1740         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1741         rtm->rtm_protocol = rt->rt6i_protocol;
1742         if (rt->rt6i_flags&RTF_DYNAMIC)
1743                 rtm->rtm_protocol = RTPROT_REDIRECT;
1744         else if (rt->rt6i_flags & RTF_ADDRCONF)
1745                 rtm->rtm_protocol = RTPROT_KERNEL;
1746         else if (rt->rt6i_flags&RTF_DEFAULT)
1747                 rtm->rtm_protocol = RTPROT_RA;
1748
1749         if (rt->rt6i_flags&RTF_CACHE)
1750                 rtm->rtm_flags |= RTM_F_CLONED;
1751
1752         if (dst) {
1753                 RTA_PUT(skb, RTA_DST, 16, dst);
1754                 rtm->rtm_dst_len = 128;
1755         } else if (rtm->rtm_dst_len)
1756                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1757 #ifdef CONFIG_IPV6_SUBTREES
1758         if (src) {
1759                 RTA_PUT(skb, RTA_SRC, 16, src);
1760                 rtm->rtm_src_len = 128;
1761         } else if (rtm->rtm_src_len)
1762                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1763 #endif
1764         if (iif)
1765                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1766         else if (dst) {
1767                 struct in6_addr saddr_buf;
1768                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1769                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1770         }
1771         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1772                 goto rtattr_failure;
1773         if (rt->u.dst.neighbour)
1774                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1775         if (rt->u.dst.dev)
1776                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1777         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1778         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1779         if (rt->rt6i_expires)
1780                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1781         else
1782                 ci.rta_expires = 0;
1783         ci.rta_used = rt->u.dst.__use;
1784         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1785         ci.rta_error = rt->u.dst.error;
1786         ci.rta_id = 0;
1787         ci.rta_ts = 0;
1788         ci.rta_tsage = 0;
1789         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1790         nlh->nlmsg_len = skb->tail - b;
1791         return skb->len;
1792
1793 nlmsg_failure:
1794 rtattr_failure:
1795         skb_trim(skb, b - skb->data);
1796         return -1;
1797 }
1798
1799 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1800 {
1801         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1802         int prefix;
1803
1804         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1805                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1806                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1807         } else
1808                 prefix = 0;
1809
1810         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1811                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1812                      prefix, NLM_F_MULTI);
1813 }
1814
1815 static int fib6_dump_node(struct fib6_walker_t *w)
1816 {
1817         int res;
1818         struct rt6_info *rt;
1819
1820         for (rt = w->leaf; rt; rt = rt->u.next) {
1821                 res = rt6_dump_route(rt, w->args);
1822                 if (res < 0) {
1823                         /* Frame is full, suspend walking */
1824                         w->leaf = rt;
1825                         return 1;
1826                 }
1827                 BUG_TRAP(res!=0);
1828         }
1829         w->leaf = NULL;
1830         return 0;
1831 }
1832
1833 static void fib6_dump_end(struct netlink_callback *cb)
1834 {
1835         struct fib6_walker_t *w = (void*)cb->args[0];
1836
1837         if (w) {
1838                 cb->args[0] = 0;
1839                 fib6_walker_unlink(w);
1840                 kfree(w);
1841         }
1842         cb->done = (void*)cb->args[1];
1843         cb->args[1] = 0;
1844 }
1845
1846 static int fib6_dump_done(struct netlink_callback *cb)
1847 {
1848         fib6_dump_end(cb);
1849         return cb->done ? cb->done(cb) : 0;
1850 }
1851
1852 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1853 {
1854         struct rt6_rtnl_dump_arg arg;
1855         struct fib6_walker_t *w;
1856         int res;
1857
1858         arg.skb = skb;
1859         arg.cb = cb;
1860
1861         w = (void*)cb->args[0];
1862         if (w == NULL) {
1863                 /* New dump:
1864                  * 
1865                  * 1. hook callback destructor.
1866                  */
1867                 cb->args[1] = (long)cb->done;
1868                 cb->done = fib6_dump_done;
1869
1870                 /*
1871                  * 2. allocate and initialize walker.
1872                  */
1873                 w = kmalloc(sizeof(*w), GFP_ATOMIC);
1874                 if (w == NULL)
1875                         return -ENOMEM;
1876                 RT6_TRACE("dump<%p", w);
1877                 memset(w, 0, sizeof(*w));
1878                 w->root = &ip6_routing_table;
1879                 w->func = fib6_dump_node;
1880                 w->args = &arg;
1881                 cb->args[0] = (long)w;
1882                 read_lock_bh(&rt6_lock);
1883                 res = fib6_walk(w);
1884                 read_unlock_bh(&rt6_lock);
1885         } else {
1886                 w->args = &arg;
1887                 read_lock_bh(&rt6_lock);
1888                 res = fib6_walk_continue(w);
1889                 read_unlock_bh(&rt6_lock);
1890         }
1891 #if RT6_DEBUG >= 3
1892         if (res <= 0 && skb->len == 0)
1893                 RT6_TRACE("%p>dump end\n", w);
1894 #endif
1895         res = res < 0 ? res : skb->len;
1896         /* res < 0 is an error. (really, impossible)
1897            res == 0 means that dump is complete, but skb still can contain data.
1898            res > 0 dump is not complete, but frame is full.
1899          */
1900         /* Destroy walker, if dump of this table is complete. */
1901         if (res <= 0)
1902                 fib6_dump_end(cb);
1903         return res;
1904 }
1905
1906 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1907 {
1908         struct rtattr **rta = arg;
1909         int iif = 0;
1910         int err = -ENOBUFS;
1911         struct sk_buff *skb;
1912         struct flowi fl;
1913         struct rt6_info *rt;
1914
1915         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1916         if (skb == NULL)
1917                 goto out;
1918
1919         /* Reserve room for dummy headers, this skb can pass
1920            through good chunk of routing engine.
1921          */
1922         skb->mac.raw = skb->data;
1923         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1924
1925         memset(&fl, 0, sizeof(fl));
1926         if (rta[RTA_SRC-1])
1927                 ipv6_addr_copy(&fl.fl6_src,
1928                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1929         if (rta[RTA_DST-1])
1930                 ipv6_addr_copy(&fl.fl6_dst,
1931                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1932
1933         if (rta[RTA_IIF-1])
1934                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1935
1936         if (iif) {
1937                 struct net_device *dev;
1938                 dev = __dev_get_by_index(iif);
1939                 if (!dev) {
1940                         err = -ENODEV;
1941                         goto out_free;
1942                 }
1943         }
1944
1945         fl.oif = 0;
1946         if (rta[RTA_OIF-1])
1947                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1948
1949         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1950
1951         skb->dst = &rt->u.dst;
1952
1953         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1954         err = rt6_fill_node(skb, rt, 
1955                             &fl.fl6_dst, &fl.fl6_src,
1956                             iif,
1957                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1958                             nlh->nlmsg_seq, 0, 0);
1959         if (err < 0) {
1960                 err = -EMSGSIZE;
1961                 goto out_free;
1962         }
1963
1964         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1965         if (err > 0)
1966                 err = 0;
1967 out:
1968         return err;
1969 out_free:
1970         kfree_skb(skb);
1971         goto out;       
1972 }
1973
1974 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
1975                         struct netlink_skb_parms *req)
1976 {
1977         struct sk_buff *skb;
1978         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1979         u32 pid = current->pid;
1980         u32 seq = 0;
1981
1982         if (req)
1983                 pid = req->pid;
1984         if (nlh)
1985                 seq = nlh->nlmsg_seq;
1986         
1987         skb = alloc_skb(size, gfp_any());
1988         if (!skb) {
1989                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
1990                 return;
1991         }
1992         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
1993                 kfree_skb(skb);
1994                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
1995                 return;
1996         }
1997         NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
1998         netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
1999 }
2000
2001 /*
2002  *      /proc
2003  */
2004
2005 #ifdef CONFIG_PROC_FS
2006
/*
 * Record size, in bytes, that rt6_info_route() and rt6_proc_info() use to
 * translate a read offset into a number of routes to skip (sums to 150).
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State threaded through the tree walk that formats the route listing. */
struct rt6_proc_arg {
        char *buffer;   /* output buffer being filled */
        int offset;     /* read offset requested by the caller */
        int length;     /* number of bytes requested by the caller */
        int skip;       /* routes skipped so far to reach offset */
        int len;        /* bytes written to buffer so far */
};
2017
2018 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2019 {
2020         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2021         int i;
2022
2023         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2024                 arg->skip++;
2025                 return 0;
2026         }
2027
2028         if (arg->len >= arg->length)
2029                 return 0;
2030
2031         for (i=0; i<16; i++) {
2032                 sprintf(arg->buffer + arg->len, "%02x",
2033                         rt->rt6i_dst.addr.s6_addr[i]);
2034                 arg->len += 2;
2035         }
2036         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2037                             rt->rt6i_dst.plen);
2038
2039 #ifdef CONFIG_IPV6_SUBTREES
2040         for (i=0; i<16; i++) {
2041                 sprintf(arg->buffer + arg->len, "%02x",
2042                         rt->rt6i_src.addr.s6_addr[i]);
2043                 arg->len += 2;
2044         }
2045         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2046                             rt->rt6i_src.plen);
2047 #else
2048         sprintf(arg->buffer + arg->len,
2049                 "00000000000000000000000000000000 00 ");
2050         arg->len += 36;
2051 #endif
2052
2053         if (rt->rt6i_nexthop) {
2054                 for (i=0; i<16; i++) {
2055                         sprintf(arg->buffer + arg->len, "%02x",
2056                                 rt->rt6i_nexthop->primary_key[i]);
2057                         arg->len += 2;
2058                 }
2059         } else {
2060                 sprintf(arg->buffer + arg->len,
2061                         "00000000000000000000000000000000");
2062                 arg->len += 32;
2063         }
2064         arg->len += sprintf(arg->buffer + arg->len,
2065                             " %08x %08x %08x %08x %8s\n",
2066                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2067                             rt->u.dst.__use, rt->rt6i_flags, 
2068                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2069         return 0;
2070 }
2071
2072 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2073 {
2074         struct rt6_proc_arg arg;
2075         arg.buffer = buffer;
2076         arg.offset = offset;
2077         arg.length = length;
2078         arg.skip = 0;
2079         arg.len = 0;
2080
2081         read_lock_bh(&rt6_lock);
2082         fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2083         read_unlock_bh(&rt6_lock);
2084
2085         *start = buffer;
2086         if (offset)
2087                 *start += offset % RT6_INFO_LEN;
2088
2089         arg.len -= offset % RT6_INFO_LEN;
2090
2091         if (arg.len > length)
2092                 arg.len = length;
2093         if (arg.len < 0)
2094                 arg.len = 0;
2095
2096         return arg.len;
2097 }
2098
2099 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2100 {
2101         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2102                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2103                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2104                       rt6_stats.fib_rt_cache,
2105                       atomic_read(&ip6_dst_ops.entries),
2106                       rt6_stats.fib_discarded_routes);
2107
2108         return 0;
2109 }
2110
2111 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2112 {
2113         return single_open(file, rt6_stats_seq_show, NULL);
2114 }
2115
2116 static struct file_operations rt6_stats_seq_fops = {
2117         .owner   = THIS_MODULE,
2118         .open    = rt6_stats_seq_open,
2119         .read    = seq_read,
2120         .llseek  = seq_lseek,
2121         .release = single_release,
2122 };
2123 #endif  /* CONFIG_PROC_FS */
2124
2125 #ifdef CONFIG_SYSCTL
2126
2127 static int flush_delay;
2128
2129 static
2130 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2131                               void __user *buffer, size_t *lenp, loff_t *ppos)
2132 {
2133         if (write) {
2134                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2135                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2136                 return 0;
2137         } else
2138                 return -EINVAL;
2139 }
2140
2141 ctl_table ipv6_route_table[] = {
2142         {
2143                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2144                 .procname       =       "flush",
2145                 .data           =       &flush_delay,
2146                 .maxlen         =       sizeof(int),
2147                 .mode           =       0200,
2148                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2149         },
2150         {
2151                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2152                 .procname       =       "gc_thresh",
2153                 .data           =       &ip6_dst_ops.gc_thresh,
2154                 .maxlen         =       sizeof(int),
2155                 .mode           =       0644,
2156                 .proc_handler   =       &proc_dointvec,
2157         },
2158         {
2159                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2160                 .procname       =       "max_size",
2161                 .data           =       &ip6_rt_max_size,
2162                 .maxlen         =       sizeof(int),
2163                 .mode           =       0644,
2164                 .proc_handler   =       &proc_dointvec,
2165         },
2166         {
2167                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2168                 .procname       =       "gc_min_interval",
2169                 .data           =       &ip6_rt_gc_min_interval,
2170                 .maxlen         =       sizeof(int),
2171                 .mode           =       0644,
2172                 .proc_handler   =       &proc_dointvec_jiffies,
2173                 .strategy       =       &sysctl_jiffies,
2174         },
2175         {
2176                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2177                 .procname       =       "gc_timeout",
2178                 .data           =       &ip6_rt_gc_timeout,
2179                 .maxlen         =       sizeof(int),
2180                 .mode           =       0644,
2181                 .proc_handler   =       &proc_dointvec_jiffies,
2182                 .strategy       =       &sysctl_jiffies,
2183         },
2184         {
2185                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2186                 .procname       =       "gc_interval",
2187                 .data           =       &ip6_rt_gc_interval,
2188                 .maxlen         =       sizeof(int),
2189                 .mode           =       0644,
2190                 .proc_handler   =       &proc_dointvec_jiffies,
2191                 .strategy       =       &sysctl_jiffies,
2192         },
2193         {
2194                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2195                 .procname       =       "gc_elasticity",
2196                 .data           =       &ip6_rt_gc_elasticity,
2197                 .maxlen         =       sizeof(int),
2198                 .mode           =       0644,
2199                 .proc_handler   =       &proc_dointvec_jiffies,
2200                 .strategy       =       &sysctl_jiffies,
2201         },
2202         {
2203                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2204                 .procname       =       "mtu_expires",
2205                 .data           =       &ip6_rt_mtu_expires,
2206                 .maxlen         =       sizeof(int),
2207                 .mode           =       0644,
2208                 .proc_handler   =       &proc_dointvec_jiffies,
2209                 .strategy       =       &sysctl_jiffies,
2210         },
2211         {
2212                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2213                 .procname       =       "min_adv_mss",
2214                 .data           =       &ip6_rt_min_advmss,
2215                 .maxlen         =       sizeof(int),
2216                 .mode           =       0644,
2217                 .proc_handler   =       &proc_dointvec_jiffies,
2218                 .strategy       =       &sysctl_jiffies,
2219         },
2220         {
2221                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2222                 .procname       =       "gc_min_interval_ms",
2223                 .data           =       &ip6_rt_gc_min_interval,
2224                 .maxlen         =       sizeof(int),
2225                 .mode           =       0644,
2226                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2227                 .strategy       =       &sysctl_ms_jiffies,
2228         },
2229         { .ctl_name = 0 }
2230 };
2231
2232 #endif
2233
2234 void __init ip6_route_init(void)
2235 {
2236         struct proc_dir_entry *p;
2237
2238         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2239                                                      sizeof(struct rt6_info),
2240                                                      0, SLAB_HWCACHE_ALIGN,
2241                                                      NULL, NULL);
2242         if (!ip6_dst_ops.kmem_cachep)
2243                 panic("cannot create ip6_dst_cache");
2244
2245         fib6_init();
2246 #ifdef  CONFIG_PROC_FS
2247         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2248         if (p)
2249                 p->owner = THIS_MODULE;
2250
2251         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2252 #endif
2253 #ifdef CONFIG_XFRM
2254         xfrm6_init();
2255 #endif
2256 }
2257
2258 void ip6_route_cleanup(void)
2259 {
2260 #ifdef CONFIG_PROC_FS
2261         proc_net_remove("ipv6_route");
2262         proc_net_remove("rt6_stats");
2263 #endif
2264 #ifdef CONFIG_XFRM
2265         xfrm6_fini();
2266 #endif
2267         rt6_ifdown(NULL);
2268         fib6_gc_cleanup();
2269         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2270 }