Merge remote-tracking branch 'stable/linux-3.0.y' into develop-3.0
[firefly-linux-kernel-4.4.55.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug tracing switches for this file.
 * With RT6_DEBUG below 3 (it is set to 2 here) RDBG() expands to nothing and
 * RT6_TRACE() to an empty statement; at 3 or above both forward their
 * arguments to printk(), RT6_TRACE() prefixing KERN_DEBUG.
 */
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74
/*
 * Forward declarations of file-local functions that are referenced (for
 * example from the dst_ops tables and entry templates below) before the
 * point where they are defined.
 */
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
/* Handlers used by the reject-route templates and the dst_ops table. */
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
/* Helpers only needed by rt6_route_rcv(), built under the same option. */
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            const struct in6_addr *prefix, int prefixlen,
93                                            const struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            const struct in6_addr *prefix, int prefixlen,
97                                            const struct in6_addr *gwaddr, int ifindex);
98 #endif
99
/*
 * cow_metrics handler of ip6_dst_ops_template.
 *
 * Attaches an inet_peer to the route if it has none yet, then tries to
 * switch dst->_metrics from @old to the peer's metrics array with
 * cmpxchg().  When inet_metrics_new() reports the peer's metrics as new,
 * they are first filled from the array that @old points to.
 *
 * @dst: dst entry embedded in a struct rt6_info
 * @old: the _metrics value the caller observed
 *
 * Returns the metrics array to use, or NULL when no peer is available or
 * when a concurrent update won the cmpxchg() and installed a value flagged
 * DST_METRICS_READ_ONLY.
 */
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
101 {
102         struct rt6_info *rt = (struct rt6_info *) dst;
103         struct inet_peer *peer;
104         u32 *p = NULL;
105
106         if (!rt->rt6i_peer)
107                 rt6_bind_peer(rt, 1);
108
109         peer = rt->rt6i_peer;
110         if (peer) {
111                 u32 *old_p = __DST_METRICS_PTR(old);
112                 unsigned long prev, new;
113
114                 p = peer->metrics;
115                 if (inet_metrics_new(peer))
116                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
117
118                 new = (unsigned long) p;
119                 prev = cmpxchg(&dst->_metrics, old, new);
120
121                 if (prev != old) { /* lost the race: _metrics was no longer @old */
122                         p = __DST_METRICS_PTR(prev);
123                         if (prev & DST_METRICS_READ_ONLY)
124                                 p = NULL;
125                 }
126         }
127         return p;
128 }
129
/*
 * dst_ops callback table for IPv6 routes: garbage collection (threshold
 * 1024), validity check, default advmss/mtu, metrics copy-on-write,
 * destroy/ifdown, negative advice, link failure, PMTU update and local
 * output.
 * NOTE(review): the "_template" name and the use of
 * net->ipv6.ip6_dst_ops in rt6_alloc_cow() suggest this is copied into a
 * per-namespace instance; the copy is not in this section -- confirm.
 */
130 static struct dst_ops ip6_dst_ops_template = {
131         .family                 =       AF_INET6,
132         .protocol               =       cpu_to_be16(ETH_P_IPV6),
133         .gc                     =       ip6_dst_gc,
134         .gc_thresh              =       1024,
135         .check                  =       ip6_dst_check,
136         .default_advmss         =       ip6_default_advmss,
137         .default_mtu            =       ip6_default_mtu,
138         .cow_metrics            =       ipv6_cow_metrics,
139         .destroy                =       ip6_dst_destroy,
140         .ifdown                 =       ip6_dst_ifdown,
141         .negative_advice        =       ip6_negative_advice,
142         .link_failure           =       ip6_link_failure,
143         .update_pmtu            =       ip6_rt_update_pmtu,
144         .local_out              =       __ip6_local_out,
145 };
146
/* default_mtu handler of ip6_dst_blackhole_ops: always reports 0. */
147 static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
148 {
149         return 0;
150 }
151
/* update_pmtu handler of ip6_dst_blackhole_ops: intentionally does nothing. */
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
153 {
154 }
155
/*
 * cow_metrics handler of ip6_dst_blackhole_ops: never hands out a metrics
 * array, it always returns NULL.
 */
156 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
157                                          unsigned long old)
158 {
159         return NULL;
160 }
161
/*
 * dst_ops table for the entries allocated by ip6_blackhole_route().
 * It reuses ip6_dst_destroy, ip6_dst_check and ip6_default_advmss and
 * plugs in the blackhole stubs above for mtu, PMTU update and metrics COW.
 */
162 static struct dst_ops ip6_dst_blackhole_ops = {
163         .family                 =       AF_INET6,
164         .protocol               =       cpu_to_be16(ETH_P_IPV6),
165         .destroy                =       ip6_dst_destroy,
166         .check                  =       ip6_dst_check,
167         .default_mtu            =       ip6_blackhole_default_mtu,
168         .default_advmss         =       ip6_default_advmss,
169         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
170         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
171 };
172
/*
 * Constant metrics array of RTAX_MAX entries, all zero; the hop-limit slot
 * is spelled out explicitly.  Its users are not visible in this section.
 */
173 static const u32 ip6_template_metrics[RTAX_MAX] = {
174         [RTAX_HOPLIMIT - 1] = 0,
175 };
176
/*
 * Template for the "null" route: a reject entry (RTF_REJECT |
 * RTF_NONEXTHOP) carrying error -ENETUNREACH whose input/output handlers
 * are ip6_pkt_discard / ip6_pkt_discard_out.  It has the largest possible
 * metric and starts with a reference count of 1.
 * NOTE(review): lookups in this file fall back to
 * net->ipv6.ip6_null_entry; presumably that per-namespace entry is
 * initialised from this template -- confirm.
 */
177 static struct rt6_info ip6_null_entry_template = {
178         .dst = {
179                 .__refcnt       = ATOMIC_INIT(1),
180                 .__use          = 1,
181                 .obsolete       = -1,
182                 .error          = -ENETUNREACH,
183                 .input          = ip6_pkt_discard,
184                 .output         = ip6_pkt_discard_out,
185         },
186         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
187         .rt6i_protocol  = RTPROT_KERNEL,
188         .rt6i_metric    = ~(u32) 0,
189         .rt6i_ref       = ATOMIC_INIT(1),
190 };
191
/*
 * Two further reject-route templates, only built with
 * CONFIG_IPV6_MULTIPLE_TABLES.  Both share the flags, protocol, metric and
 * reference counts of ip6_null_entry_template and differ in the error code
 * and the packet handlers.
 */
192 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
193
194 static int ip6_pkt_prohibit(struct sk_buff *skb);
195 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
196
/* "Prohibit" entry: error -EACCES, handled by ip6_pkt_prohibit{,_out}. */
197 static struct rt6_info ip6_prohibit_entry_template = {
198         .dst = {
199                 .__refcnt       = ATOMIC_INIT(1),
200                 .__use          = 1,
201                 .obsolete       = -1,
202                 .error          = -EACCES,
203                 .input          = ip6_pkt_prohibit,
204                 .output         = ip6_pkt_prohibit_out,
205         },
206         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
207         .rt6i_protocol  = RTPROT_KERNEL,
208         .rt6i_metric    = ~(u32) 0,
209         .rt6i_ref       = ATOMIC_INIT(1),
210 };
211
/* "Blackhole" entry: error -EINVAL, both directions go to dst_discard. */
212 static struct rt6_info ip6_blk_hole_entry_template = {
213         .dst = {
214                 .__refcnt       = ATOMIC_INIT(1),
215                 .__use          = 1,
216                 .obsolete       = -1,
217                 .error          = -EINVAL,
218                 .input          = dst_discard,
219                 .output         = dst_discard,
220         },
221         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
222         .rt6i_protocol  = RTPROT_KERNEL,
223         .rt6i_metric    = ~(u32) 0,
224         .rt6i_ref       = ATOMIC_INIT(1),
225 };
226
227 #endif
228
229 /* allocate dst with ip6_dst_ops */
/*
 * Allocates an rt6_info through dst_alloc(@ops, @dev, 0, 0, @flags) and, on
 * success, zeroes sizeof(*rt) - sizeof(struct dst_entry) bytes starting at
 * rt6i_table, i.e. the rt6_info-specific part (assumes rt6i_table is the
 * first member after the embedded dst_entry -- TODO confirm against the
 * struct definition).
 *
 * Returns the new entry, or NULL if dst_alloc() failed.
 */
230 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
231                                              struct net_device *dev,
232                                              int flags)
233 {
234         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
235
236         if (rt != NULL)
237                 memset(&rt->rt6i_table, 0,
238                         sizeof(*rt) - sizeof(struct dst_entry));
239
240         return rt;
241 }
242
/*
 * destroy handler of both dst_ops tables in this file.
 * Drops the references the route holds on its inet6_dev and on its
 * inet_peer; each pointer is cleared in the route before the put.
 */
243 static void ip6_dst_destroy(struct dst_entry *dst)
244 {
245         struct rt6_info *rt = (struct rt6_info *)dst;
246         struct inet6_dev *idev = rt->rt6i_idev;
247         struct inet_peer *peer = rt->rt6i_peer;
248
249         if (idev != NULL) {
250                 rt->rt6i_idev = NULL;
251                 in6_dev_put(idev);
252         }
253         if (peer) {
254                 rt->rt6i_peer = NULL;
255                 inet_putpeer(peer);
256         }
257 }
258
/*
 * File-wide generation counter for peer information, starting at 0.
 * Nothing in this section modifies it.
 */
259 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
260
/* Returns the current value of the peer generation counter. */
261 static u32 rt6_peer_genid(void)
262 {
263         return atomic_read(&__rt6_peer_genid);
264 }
265
/*
 * Attach an inet_peer for the route's destination address to @rt.
 *
 * @rt:     route to bind
 * @create: forwarded unchanged to inet_getpeer_v6()
 *
 * The peer is installed with cmpxchg() only if rt6i_peer is still NULL; if
 * another peer was installed in the meantime, the reference just obtained
 * is dropped again.  In every other case -- including when no peer could
 * be obtained -- rt6i_peer_genid is set to the current generation.
 */
266 void rt6_bind_peer(struct rt6_info *rt, int create)
267 {
268         struct inet_peer *peer;
269
270         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
271         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
272                 inet_putpeer(peer);
273         else
274                 rt->rt6i_peer_genid = rt6_peer_genid();
275 }
276
/*
 * ifdown handler of ip6_dst_ops_template.
 *
 * If @dev is not the namespace's loopback device and the route's inet6_dev
 * belongs to @dev, re-point rt6i_idev at the loopback device's inet6_dev
 * and release the reference on the old one.  Nothing changes when the
 * loopback inet6_dev cannot be obtained.  @how is not used.
 */
277 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
278                            int how)
279 {
280         struct rt6_info *rt = (struct rt6_info *)dst;
281         struct inet6_dev *idev = rt->rt6i_idev;
282         struct net_device *loopback_dev =
283                 dev_net(dev)->loopback_dev;
284
285         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
286                 struct inet6_dev *loopback_idev =
287                         in6_dev_get(loopback_dev);
288                 if (loopback_idev != NULL) {
289                         rt->rt6i_idev = loopback_idev;
290                         in6_dev_put(idev);
291                 }
292         }
293 }
294
/*
 * Returns nonzero when @rt carries RTF_EXPIRES and the current jiffies
 * value is past rt6i_expires; routes without RTF_EXPIRES never expire here.
 */
295 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
296 {
297         return (rt->rt6i_flags & RTF_EXPIRES) &&
298                 time_after(jiffies, rt->rt6i_expires);
299 }
300
/*
 * Returns nonzero when @daddr is a multicast, link-local or loopback
 * address.  Callers in this file use the result to decide whether to set
 * RT6_LOOKUP_F_IFACE.
 */
301 static inline int rt6_need_strict(const struct in6_addr *daddr)
302 {
303         return ipv6_addr_type(daddr) &
304                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
305 }
306
307 /*
308  *      Route lookup. Any table->tb6_lock is implied.
309  */
310
/*
 * Choose, from the list of routes starting at @rt (linked through
 * dst.rt6_next), the one matching the requested interface or source
 * address.
 *
 * - @oif == 0 and @saddr unspecified: @rt is returned unchanged.
 * - @oif != 0: the first route whose device has ifindex @oif wins.  Routes
 *   on a loopback device are remembered in `local` as a fallback; one whose
 *   rt6i_idev is missing or on a different interface is skipped when
 *   RT6_LOOKUP_F_IFACE is set, or when `local` already refers to a route
 *   whose idev is on @oif.
 * - @oif == 0: the first route whose device passes
 *   ipv6_chk_addr(net, saddr, dev, flags & RT6_LOOKUP_F_IFACE) wins.
 *
 * If the scan finds nothing and @oif is set, the remembered `local` route
 * is returned if any, otherwise the namespace's null entry when
 * RT6_LOOKUP_F_IFACE is set.  In all remaining cases @rt is returned.
 */
311 static inline struct rt6_info *rt6_device_match(struct net *net,
312                                                     struct rt6_info *rt,
313                                                     const struct in6_addr *saddr,
314                                                     int oif,
315                                                     int flags)
316 {
317         struct rt6_info *local = NULL;
318         struct rt6_info *sprt;
319
320         if (!oif && ipv6_addr_any(saddr))
321                 goto out;
322
323         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
324                 struct net_device *dev = sprt->rt6i_dev;
325
326                 if (oif) {
327                         if (dev->ifindex == oif)
328                                 return sprt;
329                         if (dev->flags & IFF_LOOPBACK) {
330                                 if (sprt->rt6i_idev == NULL ||
331                                     sprt->rt6i_idev->dev->ifindex != oif) {
332                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
333                                                 continue;
334                                         if (local && (!oif ||
335                                                       local->rt6i_idev->dev->ifindex == oif))
336                                                 continue;
337                                 }
338                                 local = sprt;
339                         }
340                 } else {
341                         if (ipv6_chk_addr(net, saddr, dev,
342                                           flags & RT6_LOOKUP_F_IFACE))
343                                 return sprt;
344                 }
345         }
346
347         if (oif) {
348                 if (local)
349                         return local;
350
351                 if (flags & RT6_LOOKUP_F_IFACE)
352                         return net->ipv6.ip6_null_entry;
353         }
354 out:
355         return rt;
356 }
357
358 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router reachability probing (CONFIG_IPV6_ROUTER_PREF only).
 *
 * If @rt has a neighbour entry that is not in a NUD_VALID state and
 * neigh->updated + rtr_probe_interval (taken from the route's inet6_dev
 * configuration) lies in the past, stamp neigh->updated with the current
 * jiffies and send a neighbour solicitation for the neighbour's key to the
 * multicast address computed by addrconf_addr_solict_mult().
 *
 * A NULL @rt is accepted and treated as "no neighbour".  The neighbour is
 * looked up and used inside an RCU read-side section.
 * NOTE(review): neigh->updated is written while only
 * read_lock_bh(&neigh->lock) is held -- confirm this is intended.
 */
359 static void rt6_probe(struct rt6_info *rt)
360 {
361         struct neighbour *neigh;
362         /*
363          * Okay, this does not seem to be appropriate
364          * for now, however, we need to check if it
365          * is really so; aka Router Reachability Probing.
366          *
367          * Router Reachability Probe MUST be rate-limited
368          * to no more than one per minute.
369          */
370         rcu_read_lock();
371         neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
372         if (!neigh || (neigh->nud_state & NUD_VALID))
373                 goto out;
374         read_lock_bh(&neigh->lock);
375         if (!(neigh->nud_state & NUD_VALID) &&
376             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
377                 struct in6_addr mcaddr;
378                 struct in6_addr *target;
379
380                 neigh->updated = jiffies;
381                 read_unlock_bh(&neigh->lock);
382
383                 target = (struct in6_addr *)&neigh->primary_key;
384                 addrconf_addr_solict_mult(target, &mcaddr);
385                 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
386         } else {
387                 read_unlock_bh(&neigh->lock);
388         }
389 out:
390         rcu_read_unlock();
391 }
392 #else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
393 static inline void rt6_probe(struct rt6_info *rt)
394 {
395 }
396 #endif
397
398 /*
399  * Default Router Selection (RFC 2461 6.3.6)
400  */
/*
 * Score how well the device of @rt matches the requested interface @oif:
 *   2 - no interface requested, or the route's device has ifindex @oif
 *   1 - the route is on a loopback device but its inet6_dev belongs to @oif
 *   0 - no match
 */
401 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
402 {
403         struct net_device *dev = rt->rt6i_dev;
404         if (!oif || dev->ifindex == oif)
405                 return 2;
406         if ((dev->flags & IFF_LOOPBACK) &&
407             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
408                 return 1;
409         return 0;
410 }
411
/*
 * Score the next hop of @rt by the state of its neighbour entry:
 *   1 - route is RTF_NONEXTHOP or not a gateway route (neighbour state is
 *       not consulted)
 *   2 - neighbour exists and is in a NUD_VALID state
 *   0 - neighbour is NUD_FAILED (only distinguished with
 *       CONFIG_IPV6_ROUTER_PREF), or there is no neighbour entry at all
 *   1 - any other neighbour state
 * The neighbour is read under rcu_read_lock() and its state under
 * neigh->lock.
 */
412 static inline int rt6_check_neigh(struct rt6_info *rt)
413 {
414         struct neighbour *neigh;
415         int m;
416
417         rcu_read_lock();
418         neigh = dst_get_neighbour(&rt->dst);
419         if (rt->rt6i_flags & RTF_NONEXTHOP ||
420             !(rt->rt6i_flags & RTF_GATEWAY))
421                 m = 1;
422         else if (neigh) {
423                 read_lock_bh(&neigh->lock);
424                 if (neigh->nud_state & NUD_VALID)
425                         m = 2;
426 #ifdef CONFIG_IPV6_ROUTER_PREF
427                 else if (neigh->nud_state & NUD_FAILED)
428                         m = 0;
429 #endif
430                 else
431                         m = 1;
432                 read_unlock_bh(&neigh->lock);
433         } else
434                 m = 0;
435         rcu_read_unlock();
436         return m;
437 }
438
/*
 * Compute the selection score of @rt for output interface @oif.
 *
 * Returns -1 (route unusable) when the device does not match at all and
 * @strict contains RT6_LOOKUP_F_IFACE, or when rt6_check_neigh() yields 0
 * and @strict contains RT6_LOOKUP_F_REACHABLE.  Otherwise returns the
 * device score from rt6_check_dev(); with CONFIG_IPV6_ROUTER_PREF the
 * decoded router preference is OR-ed in, shifted left by two bits so that
 * it outweighs the device score.  The neighbour score itself only acts as
 * a filter and is not part of the returned value.
 */
439 static int rt6_score_route(struct rt6_info *rt, int oif,
440                            int strict)
441 {
442         int m, n;
443
444         m = rt6_check_dev(rt, oif);
445         if (!m && (strict & RT6_LOOKUP_F_IFACE))
446                 return -1;
447 #ifdef CONFIG_IPV6_ROUTER_PREF
448         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
449 #endif
450         n = rt6_check_neigh(rt);
451         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
452                 return -1;
453         return m;
454 }
455
/*
 * Compare candidate @rt against the best route found so far.
 *
 * @mpri:  in/out, score of the current best match
 * @match: current best match (may be NULL)
 *
 * Expired routes and routes scoring below zero are ignored.  If @rt scores
 * strictly higher than *@mpri it becomes the new match and *@mpri is
 * updated.  With RT6_LOOKUP_F_REACHABLE in @strict, whichever route loses
 * the comparison -- the previous match or @rt -- is handed to rt6_probe().
 *
 * Returns the (possibly unchanged) best match.
 */
456 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
457                                    int *mpri, struct rt6_info *match)
458 {
459         int m;
460
461         if (rt6_check_expired(rt))
462                 goto out;
463
464         m = rt6_score_route(rt, oif, strict);
465         if (m < 0)
466                 goto out;
467
468         if (m > *mpri) {
469                 if (strict & RT6_LOOKUP_F_REACHABLE)
470                         rt6_probe(match);
471                 *mpri = m;
472                 match = rt;
473         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
474                 rt6_probe(rt);
475         }
476
477 out:
478         return match;
479 }
480
/*
 * Find the best route among the entries of @fn that have metric @metric,
 * visiting them in round-robin order: first from @rr_head to the end of
 * the run of equal-metric routes, then from fn->leaf up to (but not
 * including) @rr_head.
 *
 * Returns the best match according to find_match(), or NULL if no route
 * qualifies.
 */
481 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
482                                      struct rt6_info *rr_head,
483                                      u32 metric, int oif, int strict)
484 {
485         struct rt6_info *rt, *match;
486         int mpri = -1;
487
488         match = NULL;
489         for (rt = rr_head; rt && rt->rt6i_metric == metric;
490              rt = rt->dst.rt6_next)
491                 match = find_match(rt, oif, strict, &mpri, match);
492         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
493              rt = rt->dst.rt6_next)
494                 match = find_match(rt, oif, strict, &mpri, match);
495
496         return match;
497 }
498
/*
 * Select the route to use from fib node @fn.
 *
 * The search starts at fn->rr_ptr (initialised to fn->leaf when unset) and
 * covers the routes sharing that entry's metric, see find_rr_leaf().  When
 * nothing matches and RT6_LOOKUP_F_REACHABLE is requested, fn->rr_ptr is
 * advanced to the next route of the same metric (wrapping to fn->leaf) so
 * that the next lookup starts from a different router.
 *
 * Returns the match, or the namespace's null entry if there is none.
 * NOTE(review): rt0 is dereferenced without a NULL check, so this relies
 * on fn->leaf being non-NULL -- confirm with the callers.
 */
499 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
500 {
501         struct rt6_info *match, *rt0;
502         struct net *net;
503
504         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
505                   __func__, fn->leaf, oif);
506
507         rt0 = fn->rr_ptr;
508         if (!rt0)
509                 fn->rr_ptr = rt0 = fn->leaf;
510
511         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
512
513         if (!match &&
514             (strict & RT6_LOOKUP_F_REACHABLE)) {
515                 struct rt6_info *next = rt0->dst.rt6_next;
516
517                 /* no entries matched; do round-robin */
518                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
519                         next = fn->leaf;
520
521                 if (next != rt0)
522                         fn->rr_ptr = next;
523         }
524
525         RT6_TRACE("%s() => %p\n",
526                   __func__, match);
527
528         net = dev_net(rt0->rt6i_dev);
529         return match ? match : net->ipv6.ip6_null_entry;
530 }
531
532 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option (CONFIG_IPV6_ROUTE_INFO).
 *
 * @dev:    device the option arrived on
 * @opt:    raw option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as gateway
 *
 * The option is rejected with -EINVAL when it is shorter than struct
 * route_info, when its length field exceeds 3 or is too small for the
 * announced prefix length (presumably counted in 8-octet units as in
 * RFC 4191 -- confirm), when the prefix length exceeds 128, or when the
 * preference equals ICMPV6_ROUTER_PREF_INVALID.
 *
 * Otherwise the matching route is looked up; it is deleted when the
 * lifetime is zero, created when it does not exist and the lifetime is
 * nonzero, or has its RTF_ROUTEINFO and preference flags refreshed.  A
 * finite lifetime sets rt6i_expires and RTF_EXPIRES, a non-finite one
 * clears RTF_EXPIRES.  The reference on the route is dropped before
 * returning 0.
 */
533 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
534                   const struct in6_addr *gwaddr)
535 {
536         struct net *net = dev_net(dev);
537         struct route_info *rinfo = (struct route_info *) opt;
538         struct in6_addr prefix_buf, *prefix;
539         unsigned int pref;
540         unsigned long lifetime;
541         struct rt6_info *rt;
542
543         if (len < sizeof(struct route_info)) {
544                 return -EINVAL;
545         }
546
547         /* Sanity check for prefix_len and length */
548         if (rinfo->length > 3) {
549                 return -EINVAL;
550         } else if (rinfo->prefix_len > 128) {
551                 return -EINVAL;
552         } else if (rinfo->prefix_len > 64) {
553                 if (rinfo->length < 2) {
554                         return -EINVAL;
555                 }
556         } else if (rinfo->prefix_len > 0) {
557                 if (rinfo->length < 1) {
558                         return -EINVAL;
559                 }
560         }
561
562         pref = rinfo->route_pref;
563         if (pref == ICMPV6_ROUTER_PREF_INVALID)
564                 return -EINVAL;
565
566         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
567
        /*
         * With length 3 the prefix field is used in place; otherwise only
         * prefix_len bits are copied into a local buffer.
         */
568         if (rinfo->length == 3)
569                 prefix = (struct in6_addr *)rinfo->prefix;
570         else {
571                 /* this function is safe */
572                 ipv6_addr_prefix(&prefix_buf,
573                                  (struct in6_addr *)rinfo->prefix,
574                                  rinfo->prefix_len);
575                 prefix = &prefix_buf;
576         }
577
578         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
579                                 dev->ifindex);
580
581         if (rt && !lifetime) { /* zero lifetime withdraws the route */
582                 ip6_del_rt(rt);
583                 rt = NULL;
584         }
585
586         if (!rt && lifetime)
587                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
588                                         pref);
589         else if (rt)
590                 rt->rt6i_flags = RTF_ROUTEINFO |
591                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
592
593         if (rt) {
594                 if (!addrconf_finite_timeout(lifetime)) {
595                         rt->rt6i_flags &= ~RTF_EXPIRES;
596                 } else {
597                         rt->rt6i_expires = jiffies + HZ * lifetime;
598                         rt->rt6i_flags |= RTF_EXPIRES;
599                 }
600                 dst_release(&rt->dst);
601         }
602         return 0;
603 }
604 #endif
605
/*
 * BACKTRACK(net, saddr) - retry a failed lookup further up the fib tree.
 *
 * This macro is not self-contained: it uses the local variables `rt` and
 * `fn` and the labels `out` and `restart` of the function it is expanded
 * in.
 *
 * When `rt` is the namespace's null entry, it walks from `fn` towards the
 * root.  At each step it moves to the parent node or, if the parent has a
 * subtree that is not the node just left, to the result of looking up
 * @saddr in that subtree.  It jumps to `restart` as soon as the new node
 * carries RTN_RTINFO, and to `out` when the current node is flagged
 * RTN_TL_ROOT.  When `rt` is any other route the macro does nothing.
 */
606 #define BACKTRACK(__net, saddr)                 \
607 do { \
608         if (rt == __net->ipv6.ip6_null_entry) { \
609                 struct fib6_node *pn; \
610                 while (1) { \
611                         if (fn->fn_flags & RTN_TL_ROOT) \
612                                 goto out; \
613                         pn = fn->parent; \
614                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
615                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
616                         else \
617                                 fn = pn; \
618                         if (fn->fn_flags & RTN_RTINFO) \
619                                 goto restart; \
620                 } \
621         } \
622 } while(0)
623
/*
 * Lookup callback passed to fib6_rule_lookup() by rt6_lookup().
 *
 * With table->tb6_lock held for reading, finds the fib node for the flow's
 * destination/source, lets rt6_device_match() pick a route from the node's
 * leaf list, and backtracks towards the root while the result is the null
 * entry.  The chosen route is passed to dst_use() with the current jiffies
 * before the lock is dropped.  No cached copy is created here.
 */
624 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
625                                              struct fib6_table *table,
626                                              struct flowi6 *fl6, int flags)
627 {
628         struct fib6_node *fn;
629         struct rt6_info *rt;
630
631         read_lock_bh(&table->tb6_lock);
632         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
633 restart:
634         rt = fn->leaf;
635         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
636         BACKTRACK(net, &fl6->saddr);
637 out:
638         dst_use(&rt->dst, jiffies);
639         read_unlock_bh(&table->tb6_lock);
640         return rt;
641
642 }
643
/*
 * Look up the route for @daddr (exported).
 *
 * @saddr:  optional source address; when non-NULL it is copied into the
 *          flow and RT6_LOOKUP_F_HAS_SADDR is set
 * @oif:    output interface index placed in the flow
 * @strict: nonzero requests RT6_LOOKUP_F_IFACE
 *
 * Returns the route when the resulting dst carries no error; otherwise the
 * dst is released and NULL is returned.
 */
644 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
645                             const struct in6_addr *saddr, int oif, int strict)
646 {
647         struct flowi6 fl6 = {
648                 .flowi6_oif = oif,
649                 .daddr = *daddr,
650         };
651         struct dst_entry *dst;
652         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
653
654         if (saddr) {
655                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
656                 flags |= RT6_LOOKUP_F_HAS_SADDR;
657         }
658
659         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
660         if (dst->error == 0)
661                 return (struct rt6_info *) dst;
662
663         dst_release(dst);
664
665         return NULL;
666 }
667
668 EXPORT_SYMBOL(rt6_lookup);
669
670 /* ip6_ins_rt is called with FREE table->tb6_lock.
671    It takes new route entry, the addition fails by any reason the
672    route is freed. In any case, if caller does not hold it, it may
673    be destroyed.
674  */
675
/*
 * Insert @rt into the table it points to (rt->rt6i_table) by calling
 * fib6_add() with that table's tb6_lock held for writing.
 *
 * @info: netlink context handed through to fib6_add()
 *
 * Returns the result of fib6_add().
 */
676 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
677 {
678         int err;
679         struct fib6_table *table;
680
681         table = rt->rt6i_table;
682         write_lock_bh(&table->tb6_lock);
683         err = fib6_add(&table->tb6_root, rt, info);
684         write_unlock_bh(&table->tb6_lock);
685
686         return err;
687 }
688
/*
 * Insert @rt using a default nl_info whose only populated field is the
 * network namespace of the route's device.  Returns what __ip6_ins_rt()
 * returns.
 */
689 int ip6_ins_rt(struct rt6_info *rt)
690 {
691         struct nl_info info = {
692                 .nl_net = dev_net(rt->rt6i_dev),
693         };
694         return __ip6_ins_rt(rt, &info);
695 }
696
/*
 * Create a host-specific (/128, RTF_CACHE, DST_HOST) copy of @ort for
 * destination @daddr and bind a neighbour entry to it.
 *
 * For non-gateway routes the copy's gateway is set to @daddr, and
 * RTF_ANYCAST is set when the original prefix is shorter than 128 bits and
 * its address equals @daddr.  With CONFIG_IPV6_SUBTREES the source key is
 * narrowed to @saddr/128 when the route has a source prefix and @saddr is
 * given.
 *
 * If ndisc_get_neigh() fails and we are not in softirq context, one retry
 * is made after forcing a garbage-collection run with the gc elasticity
 * and minimum-interval sysctls temporarily set to 1 and 0.  If it still
 * fails, a rate-limited "Neighbour table overflow" warning is printed and
 * the copy is freed.
 *
 * Returns the new route, or NULL when ip6_rt_copy() fails or no neighbour
 * could be obtained.
 */
697 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr,
698                                       const struct in6_addr *saddr)
699 {
700         struct rt6_info *rt;
701
702         /*
703          *      Clone the route.
704          */
705
706         rt = ip6_rt_copy(ort);
707
708         if (rt) {
709                 struct neighbour *neigh;
710                 int attempts = !in_softirq(); /* 1 retry outside softirq, none inside */
711
712                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
713                         if (rt->rt6i_dst.plen != 128 &&
714                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
715                                 rt->rt6i_flags |= RTF_ANYCAST;
716                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
717                 }
718
719                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
720                 rt->rt6i_dst.plen = 128;
721                 rt->rt6i_flags |= RTF_CACHE;
722                 rt->dst.flags |= DST_HOST;
723
724 #ifdef CONFIG_IPV6_SUBTREES
725                 if (rt->rt6i_src.plen && saddr) {
726                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
727                         rt->rt6i_src.plen = 128;
728                 }
729 #endif
730
731         retry:
732                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
733                 if (IS_ERR(neigh)) {
734                         struct net *net = dev_net(rt->rt6i_dev);
735                         int saved_rt_min_interval =
736                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
737                         int saved_rt_elasticity =
738                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
739
740                         if (attempts-- > 0) {
                                /* override the sysctls just for this GC run */
741                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
742                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
743
744                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
745
746                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
747                                         saved_rt_elasticity;
748                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
749                                         saved_rt_min_interval;
750                                 goto retry;
751                         }
752
753                         if (net_ratelimit())
754                                 printk(KERN_WARNING
755                                        "ipv6: Neighbour table overflow.\n");
756                         dst_free(&rt->dst);
757                         return NULL;
758                 }
759                 dst_set_neighbour(&rt->dst, neigh);
760         }
761
762         return rt;
763 }
764
/*
 * Create a host-specific (/128, RTF_CACHE, DST_HOST) copy of @ort for
 * @daddr that shares the original's neighbour entry: the neighbour is
 * taken from @ort via neigh_clone() instead of being looked up again.
 *
 * Returns the new route, or NULL if ip6_rt_copy() fails.
 */
765 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr)
766 {
767         struct rt6_info *rt = ip6_rt_copy(ort);
768         if (rt) {
769                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
770                 rt->rt6i_dst.plen = 128;
771                 rt->rt6i_flags |= RTF_CACHE;
772                 rt->dst.flags |= DST_HOST;
773                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
774         }
775         return rt;
776 }
777
/*
 * Core route resolution shared by the input and output lookup callbacks.
 *
 * @oif: interface index to match (the callers pass the flow's iif or oif)
 *
 * Under table->tb6_lock (read) the fib node for the flow is located and
 * rt6_select() picks a route.  Unless forwarding is enabled in
 * devconf_all, the first pass requires a reachable router
 * (RT6_LOOKUP_F_REACHABLE); whenever the `out` label is reached with that
 * requirement still set, it is dropped and the lookup restarted once.
 *
 * If the selected route is neither the null entry nor already an
 * RTF_CACHE entry, the lock is released and a host-specific copy is made:
 * rt6_alloc_cow() when the route has no neighbour and is not
 * RTF_NONEXTHOP, rt6_alloc_clone() when it is not a DST_HOST route; a
 * DST_HOST route with a neighbour is returned as is.  The copy is inserted
 * with ip6_ins_rt(); if allocation or insertion fails, the whole lookup is
 * repeated, up to 3 attempts in total.
 *
 * Returns a route with a reference held and with lastuse/__use updated;
 * this may be the null entry.
 */
778 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
779                                       struct flowi6 *fl6, int flags)
780 {
781         struct fib6_node *fn;
782         struct rt6_info *rt, *nrt;
783         int strict = 0;
784         int attempts = 3;
785         int err;
786         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
787
788         strict |= flags & RT6_LOOKUP_F_IFACE;
789
790 relookup:
791         read_lock_bh(&table->tb6_lock);
792
793 restart_2:
794         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
795
796 restart:
797         rt = rt6_select(fn, oif, strict | reachable);
798
799         BACKTRACK(net, &fl6->saddr);
800         if (rt == net->ipv6.ip6_null_entry ||
801             rt->rt6i_flags & RTF_CACHE)
802                 goto out;
803
        /* pin the route before dropping the table lock */
804         dst_hold(&rt->dst);
805         read_unlock_bh(&table->tb6_lock);
806
807         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
808                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
809         else if (!(rt->dst.flags & DST_HOST))
810                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
811         else
812                 goto out2;
813
814         dst_release(&rt->dst);
815         rt = nrt ? : net->ipv6.ip6_null_entry;
816
817         dst_hold(&rt->dst);
818         if (nrt) {
819                 err = ip6_ins_rt(nrt);
820                 if (!err)
821                         goto out2;
822         }
823
824         if (--attempts <= 0)
825                 goto out2;
826
827         /*
828          * Race condition! In the gap, when table->tb6_lock was
829          * released someone could insert this route.  Relookup.
830          */
831         dst_release(&rt->dst);
832         goto relookup;
833
834 out:
835         if (reachable) {
836                 reachable = 0;
837                 goto restart_2;
838         }
839         dst_hold(&rt->dst);
840         read_unlock_bh(&table->tb6_lock);
841 out2:
842         rt->dst.lastuse = jiffies;
843         rt->dst.__use++;
844
845         return rt;
846 }
847
/*
 * fib6_rule_lookup() callback for received packets: runs ip6_pol_route()
 * with the flow's incoming interface index as the interface to match.
 */
848 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
849                                             struct flowi6 *fl6, int flags)
850 {
851         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
852 }
853
/*
 * Route a received packet and attach the resulting dst to @skb.
 *
 * The flow is built from the packet's IPv6 header (addresses, flow label
 * masked with IPV6_FLOWINFO_MASK, next header), the receiving device's
 * ifindex and the skb mark.  RT6_LOOKUP_F_HAS_SADDR is always set;
 * RT6_LOOKUP_F_IFACE is added when the destination needs a strict
 * interface match (see rt6_need_strict()) and the device type is not
 * ARPHRD_PIMREG.
 */
854 void ip6_route_input(struct sk_buff *skb)
855 {
856         const struct ipv6hdr *iph = ipv6_hdr(skb);
857         struct net *net = dev_net(skb->dev);
858         int flags = RT6_LOOKUP_F_HAS_SADDR;
859         struct flowi6 fl6 = {
860                 .flowi6_iif = skb->dev->ifindex,
861                 .daddr = iph->daddr,
862                 .saddr = iph->saddr,
863                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
864                 .flowi6_mark = skb->mark,
865                 .flowi6_proto = iph->nexthdr,
866         };
867
868         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
869                 flags |= RT6_LOOKUP_F_IFACE;
870
871         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
872 }
873
/*
 * fib6_rule_lookup() callback for locally generated packets: runs
 * ip6_pol_route() with the flow's outgoing interface index as the
 * interface to match.
 */
874 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
875                                              struct flowi6 *fl6, int flags)
876 {
877         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
878 }
879
/*
 * Resolve the output route for flow @fl6 (exported).
 *
 * @sk: optional originating socket
 *
 * RT6_LOOKUP_F_IFACE is requested when the socket is bound to a device or
 * the destination needs a strict interface match.  If the flow already has
 * a source address RT6_LOOKUP_F_HAS_SADDR is set; otherwise, when a socket
 * is given, its source-address preferences are translated into lookup
 * flags with rt6_srcprefs2flags().
 *
 * Returns the dst produced by fib6_rule_lookup().
 */
880 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
881                                     struct flowi6 *fl6)
882 {
883         int flags = 0;
884
885         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
886                 flags |= RT6_LOOKUP_F_IFACE;
887
888         if (!ipv6_addr_any(&fl6->saddr))
889                 flags |= RT6_LOOKUP_F_HAS_SADDR;
890         else if (sk)
891                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
892
893         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
894 }
895
896 EXPORT_SYMBOL(ip6_route_output);
897
898 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
899 {
900         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
901         struct dst_entry *new = NULL;
902
903         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
904         if (rt) {
905                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
906
907                 new = &rt->dst;
908
909                 new->__use = 1;
910                 new->input = dst_discard;
911                 new->output = dst_discard;
912
913                 dst_copy_metrics(new, &ort->dst);
914                 rt->rt6i_idev = ort->rt6i_idev;
915                 if (rt->rt6i_idev)
916                         in6_dev_hold(rt->rt6i_idev);
917                 rt->rt6i_expires = 0;
918
919                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
920                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
921                 rt->rt6i_metric = 0;
922
923                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
924 #ifdef CONFIG_IPV6_SUBTREES
925                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
926 #endif
927
928                 dst_free(new);
929         }
930
931         dst_release(dst_orig);
932         return new ? new : ERR_PTR(-ENOMEM);
933 }
934
935 /*
936  *      Destination cache support functions
937  */
938
939 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
940 {
941         struct rt6_info *rt;
942
943         rt = (struct rt6_info *) dst;
944
945         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
946                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
947                         if (!rt->rt6i_peer)
948                                 rt6_bind_peer(rt, 0);
949                         rt->rt6i_peer_genid = rt6_peer_genid();
950                 }
951                 return dst;
952         }
953         return NULL;
954 }
955
956 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
957 {
958         struct rt6_info *rt = (struct rt6_info *) dst;
959
960         if (rt) {
961                 if (rt->rt6i_flags & RTF_CACHE) {
962                         if (rt6_check_expired(rt)) {
963                                 ip6_del_rt(rt);
964                                 dst = NULL;
965                         }
966                 } else {
967                         dst_release(dst);
968                         dst = NULL;
969                 }
970         }
971         return dst;
972 }
973
974 static void ip6_link_failure(struct sk_buff *skb)
975 {
976         struct rt6_info *rt;
977
978         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
979
980         rt = (struct rt6_info *) skb_dst(skb);
981         if (rt) {
982                 if (rt->rt6i_flags&RTF_CACHE) {
983                         dst_set_expires(&rt->dst, 0);
984                         rt->rt6i_flags |= RTF_EXPIRES;
985                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
986                         rt->rt6i_node->fn_sernum = -1;
987         }
988 }
989
990 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
991 {
992         struct rt6_info *rt6 = (struct rt6_info*)dst;
993
994         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
995                 rt6->rt6i_flags |= RTF_MODIFIED;
996                 if (mtu < IPV6_MIN_MTU) {
997                         u32 features = dst_metric(dst, RTAX_FEATURES);
998                         mtu = IPV6_MIN_MTU;
999                         features |= RTAX_FEATURE_ALLFRAG;
1000                         dst_metric_set(dst, RTAX_FEATURES, features);
1001                 }
1002                 dst_metric_set(dst, RTAX_MTU, mtu);
1003         }
1004 }
1005
1006 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1007 {
1008         struct net_device *dev = dst->dev;
1009         unsigned int mtu = dst_mtu(dst);
1010         struct net *net = dev_net(dev);
1011
1012         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1013
1014         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1015                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1016
1017         /*
1018          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1019          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1020          * IPV6_MAXPLEN is also valid and means: "any MSS,
1021          * rely only on pmtu discovery"
1022          */
1023         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1024                 mtu = IPV6_MAXPLEN;
1025         return mtu;
1026 }
1027
1028 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1029 {
1030         unsigned int mtu = IPV6_MIN_MTU;
1031         struct inet6_dev *idev;
1032
1033         rcu_read_lock();
1034         idev = __in6_dev_get(dst->dev);
1035         if (idev)
1036                 mtu = idev->cnf.mtu6;
1037         rcu_read_unlock();
1038
1039         return mtu;
1040 }
1041
1042 static struct dst_entry *icmp6_dst_gc_list;
1043 static DEFINE_SPINLOCK(icmp6_dst_lock);
1044
1045 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1046                                   struct neighbour *neigh,
1047                                   const struct in6_addr *addr)
1048 {
1049         struct rt6_info *rt;
1050         struct inet6_dev *idev = in6_dev_get(dev);
1051         struct net *net = dev_net(dev);
1052
1053         if (unlikely(idev == NULL))
1054                 return NULL;
1055
1056         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1057         if (unlikely(rt == NULL)) {
1058                 in6_dev_put(idev);
1059                 goto out;
1060         }
1061
1062         if (neigh)
1063                 neigh_hold(neigh);
1064         else {
1065                 neigh = ndisc_get_neigh(dev, addr);
1066                 if (IS_ERR(neigh))
1067                         neigh = NULL;
1068         }
1069
1070         rt->rt6i_idev     = idev;
1071         dst_set_neighbour(&rt->dst, neigh);
1072         atomic_set(&rt->dst.__refcnt, 1);
1073         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1074         rt->dst.output  = ip6_output;
1075
1076         spin_lock_bh(&icmp6_dst_lock);
1077         rt->dst.next = icmp6_dst_gc_list;
1078         icmp6_dst_gc_list = &rt->dst;
1079         spin_unlock_bh(&icmp6_dst_lock);
1080
1081         fib6_force_start_gc(net);
1082
1083 out:
1084         return &rt->dst;
1085 }
1086
1087 int icmp6_dst_gc(void)
1088 {
1089         struct dst_entry *dst, **pprev;
1090         int more = 0;
1091
1092         spin_lock_bh(&icmp6_dst_lock);
1093         pprev = &icmp6_dst_gc_list;
1094
1095         while ((dst = *pprev) != NULL) {
1096                 if (!atomic_read(&dst->__refcnt)) {
1097                         *pprev = dst->next;
1098                         dst_free(dst);
1099                 } else {
1100                         pprev = &dst->next;
1101                         ++more;
1102                 }
1103         }
1104
1105         spin_unlock_bh(&icmp6_dst_lock);
1106
1107         return more;
1108 }
1109
1110 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1111                             void *arg)
1112 {
1113         struct dst_entry *dst, **pprev;
1114
1115         spin_lock_bh(&icmp6_dst_lock);
1116         pprev = &icmp6_dst_gc_list;
1117         while ((dst = *pprev) != NULL) {
1118                 struct rt6_info *rt = (struct rt6_info *) dst;
1119                 if (func(rt, arg)) {
1120                         *pprev = dst->next;
1121                         dst_free(dst);
1122                 } else {
1123                         pprev = &dst->next;
1124                 }
1125         }
1126         spin_unlock_bh(&icmp6_dst_lock);
1127 }
1128
1129 static int ip6_dst_gc(struct dst_ops *ops)
1130 {
1131         unsigned long now = jiffies;
1132         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1133         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1134         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1135         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1136         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1137         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1138         int entries;
1139
1140         entries = dst_entries_get_fast(ops);
1141         if (time_after(rt_last_gc + rt_min_interval, now) &&
1142             entries <= rt_max_size)
1143                 goto out;
1144
1145         net->ipv6.ip6_rt_gc_expire++;
1146         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1147         net->ipv6.ip6_rt_last_gc = now;
1148         entries = dst_entries_get_slow(ops);
1149         if (entries < ops->gc_thresh)
1150                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1151 out:
1152         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1153         return entries > rt_max_size;
1154 }
1155
1156 /* Clean host part of a prefix. Not necessary in radix tree,
1157    but results in cleaner routing tables.
1158
1159    Remove it only when all the things will work!
1160  */
1161
1162 int ip6_dst_hoplimit(struct dst_entry *dst)
1163 {
1164         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1165         if (hoplimit == 0) {
1166                 struct net_device *dev = dst->dev;
1167                 struct inet6_dev *idev;
1168
1169                 rcu_read_lock();
1170                 idev = __in6_dev_get(dev);
1171                 if (idev)
1172                         hoplimit = idev->cnf.hop_limit;
1173                 else
1174                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1175                 rcu_read_unlock();
1176         }
1177         return hoplimit;
1178 }
1179 EXPORT_SYMBOL(ip6_dst_hoplimit);
1180
1181 /*
1182  *
1183  */
1184
1185 int ip6_route_add(struct fib6_config *cfg)
1186 {
1187         int err;
1188         struct net *net = cfg->fc_nlinfo.nl_net;
1189         struct rt6_info *rt = NULL;
1190         struct net_device *dev = NULL;
1191         struct inet6_dev *idev = NULL;
1192         struct fib6_table *table;
1193         int addr_type;
1194
1195         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1196                 return -EINVAL;
1197 #ifndef CONFIG_IPV6_SUBTREES
1198         if (cfg->fc_src_len)
1199                 return -EINVAL;
1200 #endif
1201         if (cfg->fc_ifindex) {
1202                 err = -ENODEV;
1203                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1204                 if (!dev)
1205                         goto out;
1206                 idev = in6_dev_get(dev);
1207                 if (!idev)
1208                         goto out;
1209         }
1210
1211         if (cfg->fc_metric == 0)
1212                 cfg->fc_metric = IP6_RT_PRIO_USER;
1213
1214         table = fib6_new_table(net, cfg->fc_table);
1215         if (table == NULL) {
1216                 err = -ENOBUFS;
1217                 goto out;
1218         }
1219
1220         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1221
1222         if (rt == NULL) {
1223                 err = -ENOMEM;
1224                 goto out;
1225         }
1226
1227         rt->dst.obsolete = -1;
1228         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1229                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1230                                 0;
1231
1232         if (cfg->fc_protocol == RTPROT_UNSPEC)
1233                 cfg->fc_protocol = RTPROT_BOOT;
1234         rt->rt6i_protocol = cfg->fc_protocol;
1235
1236         addr_type = ipv6_addr_type(&cfg->fc_dst);
1237
1238         if (addr_type & IPV6_ADDR_MULTICAST)
1239                 rt->dst.input = ip6_mc_input;
1240         else if (cfg->fc_flags & RTF_LOCAL)
1241                 rt->dst.input = ip6_input;
1242         else
1243                 rt->dst.input = ip6_forward;
1244
1245         rt->dst.output = ip6_output;
1246
1247         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1248         rt->rt6i_dst.plen = cfg->fc_dst_len;
1249         if (rt->rt6i_dst.plen == 128)
1250                rt->dst.flags |= DST_HOST;
1251
1252 #ifdef CONFIG_IPV6_SUBTREES
1253         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1254         rt->rt6i_src.plen = cfg->fc_src_len;
1255 #endif
1256
1257         rt->rt6i_metric = cfg->fc_metric;
1258
1259         /* We cannot add true routes via loopback here,
1260            they would result in kernel looping; promote them to reject routes
1261          */
1262         if ((cfg->fc_flags & RTF_REJECT) ||
1263             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1264                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1265                 /* hold loopback dev/idev if we haven't done so. */
1266                 if (dev != net->loopback_dev) {
1267                         if (dev) {
1268                                 dev_put(dev);
1269                                 in6_dev_put(idev);
1270                         }
1271                         dev = net->loopback_dev;
1272                         dev_hold(dev);
1273                         idev = in6_dev_get(dev);
1274                         if (!idev) {
1275                                 err = -ENODEV;
1276                                 goto out;
1277                         }
1278                 }
1279                 rt->dst.output = ip6_pkt_discard_out;
1280                 rt->dst.input = ip6_pkt_discard;
1281                 rt->dst.error = -ENETUNREACH;
1282                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1283                 goto install_route;
1284         }
1285
1286         if (cfg->fc_flags & RTF_GATEWAY) {
1287                 const struct in6_addr *gw_addr;
1288                 int gwa_type;
1289
1290                 gw_addr = &cfg->fc_gateway;
1291                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1292                 gwa_type = ipv6_addr_type(gw_addr);
1293
1294                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1295                         struct rt6_info *grt;
1296
1297                         /* IPv6 strictly inhibits using not link-local
1298                            addresses as nexthop address.
1299                            Otherwise, router will not able to send redirects.
1300                            It is very good, but in some (rare!) circumstances
1301                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1302                            some exceptions. --ANK
1303                          */
1304                         err = -EINVAL;
1305                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1306                                 goto out;
1307
1308                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1309
1310                         err = -EHOSTUNREACH;
1311                         if (grt == NULL)
1312                                 goto out;
1313                         if (dev) {
1314                                 if (dev != grt->rt6i_dev) {
1315                                         dst_release(&grt->dst);
1316                                         goto out;
1317                                 }
1318                         } else {
1319                                 dev = grt->rt6i_dev;
1320                                 idev = grt->rt6i_idev;
1321                                 dev_hold(dev);
1322                                 in6_dev_hold(grt->rt6i_idev);
1323                         }
1324                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1325                                 err = 0;
1326                         dst_release(&grt->dst);
1327
1328                         if (err)
1329                                 goto out;
1330                 }
1331                 err = -EINVAL;
1332                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1333                         goto out;
1334         }
1335
1336         err = -ENODEV;
1337         if (dev == NULL)
1338                 goto out;
1339
1340         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1341                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1342                         err = -EINVAL;
1343                         goto out;
1344                 }
1345                 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1346                 rt->rt6i_prefsrc.plen = 128;
1347         } else
1348                 rt->rt6i_prefsrc.plen = 0;
1349
1350         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1351                 struct neighbour *neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1352                 if (IS_ERR(neigh)) {
1353                         err = PTR_ERR(neigh);
1354                         goto out;
1355                 }
1356                 dst_set_neighbour(&rt->dst, neigh);
1357         }
1358
1359         rt->rt6i_flags = cfg->fc_flags;
1360
1361 install_route:
1362         if (cfg->fc_mx) {
1363                 struct nlattr *nla;
1364                 int remaining;
1365
1366                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1367                         int type = nla_type(nla);
1368
1369                         if (type) {
1370                                 if (type > RTAX_MAX) {
1371                                         err = -EINVAL;
1372                                         goto out;
1373                                 }
1374
1375                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1376                         }
1377                 }
1378         }
1379
1380         rt->dst.dev = dev;
1381         rt->rt6i_idev = idev;
1382         rt->rt6i_table = table;
1383
1384         cfg->fc_nlinfo.nl_net = dev_net(dev);
1385
1386         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1387
1388 out:
1389         if (dev)
1390                 dev_put(dev);
1391         if (idev)
1392                 in6_dev_put(idev);
1393         if (rt)
1394                 dst_free(&rt->dst);
1395         return err;
1396 }
1397
1398 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1399 {
1400         int err;
1401         struct fib6_table *table;
1402         struct net *net = dev_net(rt->rt6i_dev);
1403
1404         if (rt == net->ipv6.ip6_null_entry) {
1405                 err = -ENOENT;
1406                 goto out;
1407         }
1408
1409         table = rt->rt6i_table;
1410         write_lock_bh(&table->tb6_lock);
1411         err = fib6_del(rt, info);
1412         write_unlock_bh(&table->tb6_lock);
1413
1414 out:
1415         dst_release(&rt->dst);
1416         return err;
1417 }
1418
1419 int ip6_del_rt(struct rt6_info *rt)
1420 {
1421         struct nl_info info = {
1422                 .nl_net = dev_net(rt->rt6i_dev),
1423         };
1424         return __ip6_del_rt(rt, &info);
1425 }
1426
1427 static int ip6_route_del(struct fib6_config *cfg)
1428 {
1429         struct fib6_table *table;
1430         struct fib6_node *fn;
1431         struct rt6_info *rt;
1432         int err = -ESRCH;
1433
1434         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1435         if (table == NULL)
1436                 return err;
1437
1438         read_lock_bh(&table->tb6_lock);
1439
1440         fn = fib6_locate(&table->tb6_root,
1441                          &cfg->fc_dst, cfg->fc_dst_len,
1442                          &cfg->fc_src, cfg->fc_src_len);
1443
1444         if (fn) {
1445                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1446                         if (cfg->fc_ifindex &&
1447                             (rt->rt6i_dev == NULL ||
1448                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1449                                 continue;
1450                         if (cfg->fc_flags & RTF_GATEWAY &&
1451                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1452                                 continue;
1453                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1454                                 continue;
1455                         dst_hold(&rt->dst);
1456                         read_unlock_bh(&table->tb6_lock);
1457
1458                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1459                 }
1460         }
1461         read_unlock_bh(&table->tb6_lock);
1462
1463         return err;
1464 }
1465
1466 /*
1467  *      Handle redirects
1468  */
1469 struct ip6rd_flowi {
1470         struct flowi6 fl6;
1471         struct in6_addr gateway;
1472 };
1473
1474 static struct rt6_info *__ip6_route_redirect(struct net *net,
1475                                              struct fib6_table *table,
1476                                              struct flowi6 *fl6,
1477                                              int flags)
1478 {
1479         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1480         struct rt6_info *rt;
1481         struct fib6_node *fn;
1482
1483         /*
1484          * Get the "current" route for this destination and
1485          * check if the redirect has come from approriate router.
1486          *
1487          * RFC 2461 specifies that redirects should only be
1488          * accepted if they come from the nexthop to the target.
1489          * Due to the way the routes are chosen, this notion
1490          * is a bit fuzzy and one might need to check all possible
1491          * routes.
1492          */
1493
1494         read_lock_bh(&table->tb6_lock);
1495         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1496 restart:
1497         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1498                 /*
1499                  * Current route is on-link; redirect is always invalid.
1500                  *
1501                  * Seems, previous statement is not true. It could
1502                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1503                  * But then router serving it might decide, that we should
1504                  * know truth 8)8) --ANK (980726).
1505                  */
1506                 if (rt6_check_expired(rt))
1507                         continue;
1508                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1509                         continue;
1510                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1511                         continue;
1512                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1513                         continue;
1514                 break;
1515         }
1516
1517         if (!rt)
1518                 rt = net->ipv6.ip6_null_entry;
1519         BACKTRACK(net, &fl6->saddr);
1520 out:
1521         dst_hold(&rt->dst);
1522
1523         read_unlock_bh(&table->tb6_lock);
1524
1525         return rt;
1526 };
1527
1528 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1529                                            const struct in6_addr *src,
1530                                            const struct in6_addr *gateway,
1531                                            struct net_device *dev)
1532 {
1533         int flags = RT6_LOOKUP_F_HAS_SADDR;
1534         struct net *net = dev_net(dev);
1535         struct ip6rd_flowi rdfl = {
1536                 .fl6 = {
1537                         .flowi6_oif = dev->ifindex,
1538                         .daddr = *dest,
1539                         .saddr = *src,
1540                 },
1541         };
1542
1543         ipv6_addr_copy(&rdfl.gateway, gateway);
1544
1545         if (rt6_need_strict(dest))
1546                 flags |= RT6_LOOKUP_F_IFACE;
1547
1548         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1549                                                    flags, __ip6_route_redirect);
1550 }
1551
1552 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1553                   const struct in6_addr *saddr,
1554                   struct neighbour *neigh, u8 *lladdr, int on_link)
1555 {
1556         struct rt6_info *rt, *nrt = NULL;
1557         struct netevent_redirect netevent;
1558         struct net *net = dev_net(neigh->dev);
1559
1560         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1561
1562         if (rt == net->ipv6.ip6_null_entry) {
1563                 if (net_ratelimit())
1564                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1565                                "for redirect target\n");
1566                 goto out;
1567         }
1568
1569         /*
1570          *      We have finally decided to accept it.
1571          */
1572
1573         neigh_update(neigh, lladdr, NUD_STALE,
1574                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1575                      NEIGH_UPDATE_F_OVERRIDE|
1576                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1577                                      NEIGH_UPDATE_F_ISROUTER))
1578                      );
1579
1580         /*
1581          * Redirect received -> path was valid.
1582          * Look, redirects are sent only in response to data packets,
1583          * so that this nexthop apparently is reachable. --ANK
1584          */
1585         dst_confirm(&rt->dst);
1586
1587         /* Duplicate redirect: silently ignore. */
1588         if (neigh == dst_get_neighbour_raw(&rt->dst))
1589                 goto out;
1590
1591         nrt = ip6_rt_copy(rt);
1592         if (nrt == NULL)
1593                 goto out;
1594
1595         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1596         if (on_link)
1597                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1598
1599         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1600         nrt->rt6i_dst.plen = 128;
1601         nrt->dst.flags |= DST_HOST;
1602
1603         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1604         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1605
1606         if (ip6_ins_rt(nrt))
1607                 goto out;
1608
1609         netevent.old = &rt->dst;
1610         netevent.new = &nrt->dst;
1611         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1612
1613         if (rt->rt6i_flags&RTF_CACHE) {
1614                 ip6_del_rt(rt);
1615                 return;
1616         }
1617
1618 out:
1619         dst_release(&rt->dst);
1620 }
1621
1622 /*
1623  *      Handle ICMP "packet too big" messages
1624  *      i.e. Path MTU discovery
1625  */
1626
1627 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1628                              struct net *net, u32 pmtu, int ifindex)
1629 {
1630         struct rt6_info *rt, *nrt;
1631         int allfrag = 0;
1632 again:
1633         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1634         if (rt == NULL)
1635                 return;
1636
1637         if (rt6_check_expired(rt)) {
1638                 ip6_del_rt(rt);
1639                 goto again;
1640         }
1641
1642         if (pmtu >= dst_mtu(&rt->dst))
1643                 goto out;
1644
1645         if (pmtu < IPV6_MIN_MTU) {
1646                 /*
1647                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1648                  * MTU (1280) and a fragment header should always be included
1649                  * after a node receiving Too Big message reporting PMTU is
1650                  * less than the IPv6 Minimum Link MTU.
1651                  */
1652                 pmtu = IPV6_MIN_MTU;
1653                 allfrag = 1;
1654         }
1655
1656         /* New mtu received -> path was valid.
1657            They are sent only in response to data packets,
1658            so that this nexthop apparently is reachable. --ANK
1659          */
1660         dst_confirm(&rt->dst);
1661
1662         /* Host route. If it is static, it would be better
1663            not to override it, but add new one, so that
1664            when cache entry will expire old pmtu
1665            would return automatically.
1666          */
1667         if (rt->rt6i_flags & RTF_CACHE) {
1668                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1669                 if (allfrag) {
1670                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1671                         features |= RTAX_FEATURE_ALLFRAG;
1672                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1673                 }
1674                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1675                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1676                 goto out;
1677         }
1678
1679         /* Network route.
1680            Two cases are possible:
1681            1. It is connected route. Action: COW
1682            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1683          */
1684         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1685                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1686         else
1687                 nrt = rt6_alloc_clone(rt, daddr);
1688
1689         if (nrt) {
1690                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1691                 if (allfrag) {
1692                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1693                         features |= RTAX_FEATURE_ALLFRAG;
1694                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1695                 }
1696
1697                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1698                  * happened within 5 mins, the recommended timer is 10 mins.
1699                  * Here this route expiration time is set to ip6_rt_mtu_expires
1700                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1701                  * and detecting PMTU increase will be automatically happened.
1702                  */
1703                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1704                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1705
1706                 ip6_ins_rt(nrt);
1707         }
1708 out:
1709         dst_release(&rt->dst);
1710 }
1711
1712 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1713                         struct net_device *dev, u32 pmtu)
1714 {
1715         struct net *net = dev_net(dev);
1716
1717         /*
1718          * RFC 1981 states that a node "MUST reduce the size of the packets it
1719          * is sending along the path" that caused the Packet Too Big message.
1720          * Since it's not possible in the general case to determine which
1721          * interface was used to send the original packet, we update the MTU
1722          * on the interface that will be used to send future packets. We also
1723          * update the MTU on the interface that received the Packet Too Big in
1724          * case the original packet was forced out that interface with
1725          * SO_BINDTODEVICE or similar. This is the next best thing to the
1726          * correct behaviour, which would be to update the MTU on all
1727          * interfaces.
1728          */
1729         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1730         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1731 }
1732
1733 /*
1734  *      Misc support functions
1735  */
1736
1737 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1738 {
1739         struct net *net = dev_net(ort->rt6i_dev);
1740         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1741                                             ort->dst.dev, 0);
1742
1743         if (rt) {
1744                 rt->dst.input = ort->dst.input;
1745                 rt->dst.output = ort->dst.output;
1746
1747                 dst_copy_metrics(&rt->dst, &ort->dst);
1748                 rt->dst.error = ort->dst.error;
1749                 rt->rt6i_idev = ort->rt6i_idev;
1750                 if (rt->rt6i_idev)
1751                         in6_dev_hold(rt->rt6i_idev);
1752                 rt->dst.lastuse = jiffies;
1753                 rt->rt6i_expires = 0;
1754
1755                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1756                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1757                 rt->rt6i_metric = 0;
1758
1759                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1760 #ifdef CONFIG_IPV6_SUBTREES
1761                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1762 #endif
1763                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1764                 rt->rt6i_table = ort->rt6i_table;
1765         }
1766         return rt;
1767 }
1768
1769 #ifdef CONFIG_IPV6_ROUTE_INFO
1770 static struct rt6_info *rt6_get_route_info(struct net *net,
1771                                            const struct in6_addr *prefix, int prefixlen,
1772                                            const struct in6_addr *gwaddr, int ifindex)
1773 {
1774         struct fib6_node *fn;
1775         struct rt6_info *rt = NULL;
1776         struct fib6_table *table;
1777
1778         table = fib6_get_table(net, RT6_TABLE_INFO);
1779         if (table == NULL)
1780                 return NULL;
1781
1782         write_lock_bh(&table->tb6_lock);
1783         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1784         if (!fn)
1785                 goto out;
1786
1787         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1788                 if (rt->rt6i_dev->ifindex != ifindex)
1789                         continue;
1790                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1791                         continue;
1792                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1793                         continue;
1794                 dst_hold(&rt->dst);
1795                 break;
1796         }
1797 out:
1798         write_unlock_bh(&table->tb6_lock);
1799         return rt;
1800 }
1801
1802 static struct rt6_info *rt6_add_route_info(struct net *net,
1803                                            const struct in6_addr *prefix, int prefixlen,
1804                                            const struct in6_addr *gwaddr, int ifindex,
1805                                            unsigned pref)
1806 {
1807         struct fib6_config cfg = {
1808                 .fc_table       = RT6_TABLE_INFO,
1809                 .fc_metric      = IP6_RT_PRIO_USER,
1810                 .fc_ifindex     = ifindex,
1811                 .fc_dst_len     = prefixlen,
1812                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1813                                   RTF_UP | RTF_PREF(pref),
1814                 .fc_nlinfo.pid = 0,
1815                 .fc_nlinfo.nlh = NULL,
1816                 .fc_nlinfo.nl_net = net,
1817         };
1818
1819         ipv6_addr_copy(&cfg.fc_dst, prefix);
1820         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1821
1822         /* We should treat it as a default route if prefix length is 0. */
1823         if (!prefixlen)
1824                 cfg.fc_flags |= RTF_DEFAULT;
1825
1826         ip6_route_add(&cfg);
1827
1828         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1829 }
1830 #endif
1831
1832 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1833 {
1834         struct rt6_info *rt;
1835         struct fib6_table *table;
1836
1837         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1838         if (table == NULL)
1839                 return NULL;
1840
1841         write_lock_bh(&table->tb6_lock);
1842         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1843                 if (dev == rt->rt6i_dev &&
1844                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1845                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1846                         break;
1847         }
1848         if (rt)
1849                 dst_hold(&rt->dst);
1850         write_unlock_bh(&table->tb6_lock);
1851         return rt;
1852 }
1853
1854 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1855                                      struct net_device *dev,
1856                                      unsigned int pref)
1857 {
1858         struct fib6_config cfg = {
1859                 .fc_table       = RT6_TABLE_DFLT,
1860                 .fc_metric      = IP6_RT_PRIO_USER,
1861                 .fc_ifindex     = dev->ifindex,
1862                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1863                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1864                 .fc_nlinfo.pid = 0,
1865                 .fc_nlinfo.nlh = NULL,
1866                 .fc_nlinfo.nl_net = dev_net(dev),
1867         };
1868
1869         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1870
1871         ip6_route_add(&cfg);
1872
1873         return rt6_get_dflt_router(gwaddr, dev);
1874 }
1875
1876 void rt6_purge_dflt_routers(struct net *net)
1877 {
1878         struct rt6_info *rt;
1879         struct fib6_table *table;
1880
1881         /* NOTE: Keep consistent with rt6_get_dflt_router */
1882         table = fib6_get_table(net, RT6_TABLE_DFLT);
1883         if (table == NULL)
1884                 return;
1885
1886 restart:
1887         read_lock_bh(&table->tb6_lock);
1888         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1889                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1890                         dst_hold(&rt->dst);
1891                         read_unlock_bh(&table->tb6_lock);
1892                         ip6_del_rt(rt);
1893                         goto restart;
1894                 }
1895         }
1896         read_unlock_bh(&table->tb6_lock);
1897 }
1898
1899 static void rtmsg_to_fib6_config(struct net *net,
1900                                  struct in6_rtmsg *rtmsg,
1901                                  struct fib6_config *cfg)
1902 {
1903         memset(cfg, 0, sizeof(*cfg));
1904
1905         cfg->fc_table = RT6_TABLE_MAIN;
1906         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1907         cfg->fc_metric = rtmsg->rtmsg_metric;
1908         cfg->fc_expires = rtmsg->rtmsg_info;
1909         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1910         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1911         cfg->fc_flags = rtmsg->rtmsg_flags;
1912
1913         cfg->fc_nlinfo.nl_net = net;
1914
1915         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1916         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1917         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1918 }
1919
1920 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1921 {
1922         struct fib6_config cfg;
1923         struct in6_rtmsg rtmsg;
1924         int err;
1925
1926         switch(cmd) {
1927         case SIOCADDRT:         /* Add a route */
1928         case SIOCDELRT:         /* Delete a route */
1929                 if (!capable(CAP_NET_ADMIN))
1930                         return -EPERM;
1931                 err = copy_from_user(&rtmsg, arg,
1932                                      sizeof(struct in6_rtmsg));
1933                 if (err)
1934                         return -EFAULT;
1935
1936                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1937
1938                 rtnl_lock();
1939                 switch (cmd) {
1940                 case SIOCADDRT:
1941                         err = ip6_route_add(&cfg);
1942                         break;
1943                 case SIOCDELRT:
1944                         err = ip6_route_del(&cfg);
1945                         break;
1946                 default:
1947                         err = -EINVAL;
1948                 }
1949                 rtnl_unlock();
1950
1951                 return err;
1952         }
1953
1954         return -EINVAL;
1955 }
1956
1957 /*
1958  *      Drop the packet on the floor
1959  */
1960
1961 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1962 {
1963         int type;
1964         struct dst_entry *dst = skb_dst(skb);
1965         switch (ipstats_mib_noroutes) {
1966         case IPSTATS_MIB_INNOROUTES:
1967                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1968                 if (type == IPV6_ADDR_ANY) {
1969                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1970                                       IPSTATS_MIB_INADDRERRORS);
1971                         break;
1972                 }
1973                 /* FALLTHROUGH */
1974         case IPSTATS_MIB_OUTNOROUTES:
1975                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1976                               ipstats_mib_noroutes);
1977                 break;
1978         }
1979         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1980         kfree_skb(skb);
1981         return 0;
1982 }
1983
1984 static int ip6_pkt_discard(struct sk_buff *skb)
1985 {
1986         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1987 }
1988
1989 static int ip6_pkt_discard_out(struct sk_buff *skb)
1990 {
1991         skb->dev = skb_dst(skb)->dev;
1992         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1993 }
1994
1995 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1996
1997 static int ip6_pkt_prohibit(struct sk_buff *skb)
1998 {
1999         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2000 }
2001
2002 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2003 {
2004         skb->dev = skb_dst(skb)->dev;
2005         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2006 }
2007
2008 #endif
2009
2010 /*
2011  *      Allocate a dst for local (unicast / anycast) address.
2012  */
2013
2014 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2015                                     const struct in6_addr *addr,
2016                                     int anycast)
2017 {
2018         struct net *net = dev_net(idev->dev);
2019         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2020                                             net->loopback_dev, 0);
2021         struct neighbour *neigh;
2022
2023         if (rt == NULL) {
2024                 if (net_ratelimit())
2025                         pr_warning("IPv6:  Maximum number of routes reached,"
2026                                    " consider increasing route/max_size.\n");
2027                 return ERR_PTR(-ENOMEM);
2028         }
2029
2030         in6_dev_hold(idev);
2031
2032         rt->dst.flags |= DST_HOST;
2033         rt->dst.input = ip6_input;
2034         rt->dst.output = ip6_output;
2035         rt->rt6i_idev = idev;
2036         rt->dst.obsolete = -1;
2037
2038         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2039         if (anycast)
2040                 rt->rt6i_flags |= RTF_ANYCAST;
2041         else
2042                 rt->rt6i_flags |= RTF_LOCAL;
2043         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2044         if (IS_ERR(neigh)) {
2045                 dst_free(&rt->dst);
2046
2047                 return ERR_CAST(neigh);
2048         }
2049         dst_set_neighbour(&rt->dst, neigh);
2050
2051         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2052         rt->rt6i_dst.plen = 128;
2053         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2054
2055         atomic_set(&rt->dst.__refcnt, 1);
2056
2057         return rt;
2058 }
2059
2060 int ip6_route_get_saddr(struct net *net,
2061                         struct rt6_info *rt,
2062                         const struct in6_addr *daddr,
2063                         unsigned int prefs,
2064                         struct in6_addr *saddr)
2065 {
2066         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2067         int err = 0;
2068         if (rt->rt6i_prefsrc.plen)
2069                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2070         else
2071                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2072                                          daddr, prefs, saddr);
2073         return err;
2074 }
2075
/*
 * Remove a deleted IP from prefsrc entries: argument handed to
 * fib6_remove_prefsrc() for every route visited.
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* restrict to this device; NULL matches any */
	struct net *net;		/* namespace whose null entry is left alone */
	struct in6_addr *addr;		/* address to drop as preferred source */
};
2082
2083 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2084 {
2085         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2086         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2087         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2088
2089         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2090             rt != net->ipv6.ip6_null_entry &&
2091             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2092                 /* remove prefsrc entry */
2093                 rt->rt6i_prefsrc.plen = 0;
2094         }
2095         return 0;
2096 }
2097
2098 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2099 {
2100         struct net *net = dev_net(ifp->idev->dev);
2101         struct arg_dev_net_ip adni = {
2102                 .dev = ifp->idev->dev,
2103                 .net = net,
2104                 .addr = &ifp->addr,
2105         };
2106         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2107 }
2108
/* Argument handed to fib6_ifdown() for every route visited. */
struct arg_dev_net {
	struct net_device *dev;		/* device going down; NULL matches any */
	struct net *net;		/* namespace whose null entry is kept */
};
2113
2114 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2115 {
2116         const struct arg_dev_net *adn = arg;
2117         const struct net_device *dev = adn->dev;
2118
2119         if ((rt->rt6i_dev == dev || dev == NULL) &&
2120             rt != adn->net->ipv6.ip6_null_entry) {
2121                 RT6_TRACE("deleted by ifdown %p\n", rt);
2122                 return -1;
2123         }
2124         return 0;
2125 }
2126
2127 void rt6_ifdown(struct net *net, struct net_device *dev)
2128 {
2129         struct arg_dev_net adn = {
2130                 .dev = dev,
2131                 .net = net,
2132         };
2133
2134         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2135         icmp6_clean_all(fib6_ifdown, &adn);
2136 }
2137
/* Argument handed to rt6_mtu_change_route() for every route visited. */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned mtu;			/* its new MTU */
};
2143
2144 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2145 {
2146         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2147         struct inet6_dev *idev;
2148
2149         /* In IPv6 pmtu discovery is not optional,
2150            so that RTAX_MTU lock cannot disable it.
2151            We still use this lock to block changes
2152            caused by addrconf/ndisc.
2153         */
2154
2155         idev = __in6_dev_get(arg->dev);
2156         if (idev == NULL)
2157                 return 0;
2158
2159         /* For administrative MTU increase, there is no way to discover
2160            IPv6 PMTU increase, so PMTU increase should be updated here.
2161            Since RFC 1981 doesn't include administrative MTU increase
2162            update PMTU increase is a MUST. (i.e. jumbo frame)
2163          */
2164         /*
2165            If new MTU is less than route PMTU, this new MTU will be the
2166            lowest MTU in the path, update the route PMTU to reflect PMTU
2167            decreases; if new MTU is greater than route PMTU, and the
2168            old MTU is the lowest MTU in the path, update the route PMTU
2169            to reflect the increase. In this case if the other nodes' MTU
2170            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2171            PMTU discouvery.
2172          */
2173         if (rt->rt6i_dev == arg->dev &&
2174             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2175             (dst_mtu(&rt->dst) >= arg->mtu ||
2176              (dst_mtu(&rt->dst) < arg->mtu &&
2177               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2178                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2179         }
2180         return 0;
2181 }
2182
2183 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2184 {
2185         struct rt6_mtu_change_arg arg = {
2186                 .dev = dev,
2187                 .mtu = mtu,
2188         };
2189
2190         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2191 }
2192
2193 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2194         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2195         [RTA_OIF]               = { .type = NLA_U32 },
2196         [RTA_IIF]               = { .type = NLA_U32 },
2197         [RTA_PRIORITY]          = { .type = NLA_U32 },
2198         [RTA_METRICS]           = { .type = NLA_NESTED },
2199 };
2200
2201 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2202                               struct fib6_config *cfg)
2203 {
2204         struct rtmsg *rtm;
2205         struct nlattr *tb[RTA_MAX+1];
2206         int err;
2207
2208         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2209         if (err < 0)
2210                 goto errout;
2211
2212         err = -EINVAL;
2213         rtm = nlmsg_data(nlh);
2214         memset(cfg, 0, sizeof(*cfg));
2215
2216         cfg->fc_table = rtm->rtm_table;
2217         cfg->fc_dst_len = rtm->rtm_dst_len;
2218         cfg->fc_src_len = rtm->rtm_src_len;
2219         cfg->fc_flags = RTF_UP;
2220         cfg->fc_protocol = rtm->rtm_protocol;
2221
2222         if (rtm->rtm_type == RTN_UNREACHABLE)
2223                 cfg->fc_flags |= RTF_REJECT;
2224
2225         if (rtm->rtm_type == RTN_LOCAL)
2226                 cfg->fc_flags |= RTF_LOCAL;
2227
2228         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2229         cfg->fc_nlinfo.nlh = nlh;
2230         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2231
2232         if (tb[RTA_GATEWAY]) {
2233                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2234                 cfg->fc_flags |= RTF_GATEWAY;
2235         }
2236
2237         if (tb[RTA_DST]) {
2238                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2239
2240                 if (nla_len(tb[RTA_DST]) < plen)
2241                         goto errout;
2242
2243                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2244         }
2245
2246         if (tb[RTA_SRC]) {
2247                 int plen = (rtm->rtm_src_len + 7) >> 3;
2248
2249                 if (nla_len(tb[RTA_SRC]) < plen)
2250                         goto errout;
2251
2252                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2253         }
2254
2255         if (tb[RTA_PREFSRC])
2256                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2257
2258         if (tb[RTA_OIF])
2259                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2260
2261         if (tb[RTA_PRIORITY])
2262                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2263
2264         if (tb[RTA_METRICS]) {
2265                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2266                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2267         }
2268
2269         if (tb[RTA_TABLE])
2270                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2271
2272         err = 0;
2273 errout:
2274         return err;
2275 }
2276
2277 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2278 {
2279         struct fib6_config cfg;
2280         int err;
2281
2282         err = rtm_to_fib6_config(skb, nlh, &cfg);
2283         if (err < 0)
2284                 return err;
2285
2286         return ip6_route_del(&cfg);
2287 }
2288
2289 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2290 {
2291         struct fib6_config cfg;
2292         int err;
2293
2294         err = rtm_to_fib6_config(skb, nlh, &cfg);
2295         if (err < 0)
2296                 return err;
2297
2298         return ip6_route_add(&cfg);
2299 }
2300
2301 static inline size_t rt6_nlmsg_size(void)
2302 {
2303         return NLMSG_ALIGN(sizeof(struct rtmsg))
2304                + nla_total_size(16) /* RTA_SRC */
2305                + nla_total_size(16) /* RTA_DST */
2306                + nla_total_size(16) /* RTA_GATEWAY */
2307                + nla_total_size(16) /* RTA_PREFSRC */
2308                + nla_total_size(4) /* RTA_TABLE */
2309                + nla_total_size(4) /* RTA_IIF */
2310                + nla_total_size(4) /* RTA_OIF */
2311                + nla_total_size(4) /* RTA_PRIORITY */
2312                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2313                + nla_total_size(sizeof(struct rta_cacheinfo));
2314 }
2315
2316 static int rt6_fill_node(struct net *net,
2317                          struct sk_buff *skb, struct rt6_info *rt,
2318                          struct in6_addr *dst, struct in6_addr *src,
2319                          int iif, int type, u32 pid, u32 seq,
2320                          int prefix, int nowait, unsigned int flags)
2321 {
2322         struct rtmsg *rtm;
2323         struct nlmsghdr *nlh;
2324         long expires;
2325         u32 table;
2326         struct neighbour *n;
2327
2328         if (prefix) {   /* user wants prefix routes only */
2329                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2330                         /* success since this is not a prefix route */
2331                         return 1;
2332                 }
2333         }
2334
2335         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2336         if (nlh == NULL)
2337                 return -EMSGSIZE;
2338
2339         rtm = nlmsg_data(nlh);
2340         rtm->rtm_family = AF_INET6;
2341         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2342         rtm->rtm_src_len = rt->rt6i_src.plen;
2343         rtm->rtm_tos = 0;
2344         if (rt->rt6i_table)
2345                 table = rt->rt6i_table->tb6_id;
2346         else
2347                 table = RT6_TABLE_UNSPEC;
2348         rtm->rtm_table = table;
2349         NLA_PUT_U32(skb, RTA_TABLE, table);
2350         if (rt->rt6i_flags&RTF_REJECT)
2351                 rtm->rtm_type = RTN_UNREACHABLE;
2352         else if (rt->rt6i_flags&RTF_LOCAL)
2353                 rtm->rtm_type = RTN_LOCAL;
2354         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2355                 rtm->rtm_type = RTN_LOCAL;
2356         else
2357                 rtm->rtm_type = RTN_UNICAST;
2358         rtm->rtm_flags = 0;
2359         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2360         rtm->rtm_protocol = rt->rt6i_protocol;
2361         if (rt->rt6i_flags&RTF_DYNAMIC)
2362                 rtm->rtm_protocol = RTPROT_REDIRECT;
2363         else if (rt->rt6i_flags & RTF_ADDRCONF)
2364                 rtm->rtm_protocol = RTPROT_KERNEL;
2365         else if (rt->rt6i_flags&RTF_DEFAULT)
2366                 rtm->rtm_protocol = RTPROT_RA;
2367
2368         if (rt->rt6i_flags&RTF_CACHE)
2369                 rtm->rtm_flags |= RTM_F_CLONED;
2370
2371         if (dst) {
2372                 NLA_PUT(skb, RTA_DST, 16, dst);
2373                 rtm->rtm_dst_len = 128;
2374         } else if (rtm->rtm_dst_len)
2375                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2376 #ifdef CONFIG_IPV6_SUBTREES
2377         if (src) {
2378                 NLA_PUT(skb, RTA_SRC, 16, src);
2379                 rtm->rtm_src_len = 128;
2380         } else if (rtm->rtm_src_len)
2381                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2382 #endif
2383         if (iif) {
2384 #ifdef CONFIG_IPV6_MROUTE
2385                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2386                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2387                         if (err <= 0) {
2388                                 if (!nowait) {
2389                                         if (err == 0)
2390                                                 return 0;
2391                                         goto nla_put_failure;
2392                                 } else {
2393                                         if (err == -EMSGSIZE)
2394                                                 goto nla_put_failure;
2395                                 }
2396                         }
2397                 } else
2398 #endif
2399                         NLA_PUT_U32(skb, RTA_IIF, iif);
2400         } else if (dst) {
2401                 struct in6_addr saddr_buf;
2402                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2403                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2404         }
2405
2406         if (rt->rt6i_prefsrc.plen) {
2407                 struct in6_addr saddr_buf;
2408                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2409                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2410         }
2411
2412         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2413                 goto nla_put_failure;
2414
2415         rcu_read_lock();
2416         n = dst_get_neighbour(&rt->dst);
2417         if (n) {
2418                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2419                         rcu_read_unlock();
2420                         goto nla_put_failure;
2421                 }
2422         }
2423         rcu_read_unlock();
2424
2425         if (rt->dst.dev)
2426                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2427
2428         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2429
2430         if (!(rt->rt6i_flags & RTF_EXPIRES))
2431                 expires = 0;
2432         else if (rt->rt6i_expires - jiffies < INT_MAX)
2433                 expires = rt->rt6i_expires - jiffies;
2434         else
2435                 expires = INT_MAX;
2436
2437         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2438                                expires, rt->dst.error) < 0)
2439                 goto nla_put_failure;
2440
2441         return nlmsg_end(skb, nlh);
2442
2443 nla_put_failure:
2444         nlmsg_cancel(skb, nlh);
2445         return -EMSGSIZE;
2446 }
2447
2448 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2449 {
2450         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2451         int prefix;
2452
2453         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2454                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2455                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2456         } else
2457                 prefix = 0;
2458
2459         return rt6_fill_node(arg->net,
2460                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2461                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2462                      prefix, 0, NLM_F_MULTI);
2463 }
2464
2465 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2466 {
2467         struct net *net = sock_net(in_skb->sk);
2468         struct nlattr *tb[RTA_MAX+1];
2469         struct rt6_info *rt;
2470         struct sk_buff *skb;
2471         struct rtmsg *rtm;
2472         struct flowi6 fl6;
2473         int err, iif = 0;
2474
2475         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2476         if (err < 0)
2477                 goto errout;
2478
2479         err = -EINVAL;
2480         memset(&fl6, 0, sizeof(fl6));
2481
2482         if (tb[RTA_SRC]) {
2483                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2484                         goto errout;
2485
2486                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2487         }
2488
2489         if (tb[RTA_DST]) {
2490                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2491                         goto errout;
2492
2493                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2494         }
2495
2496         if (tb[RTA_IIF])
2497                 iif = nla_get_u32(tb[RTA_IIF]);
2498
2499         if (tb[RTA_OIF])
2500                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2501
2502         if (iif) {
2503                 struct net_device *dev;
2504                 dev = __dev_get_by_index(net, iif);
2505                 if (!dev) {
2506                         err = -ENODEV;
2507                         goto errout;
2508                 }
2509         }
2510
2511         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2512         if (skb == NULL) {
2513                 err = -ENOBUFS;
2514                 goto errout;
2515         }
2516
2517         /* Reserve room for dummy headers, this skb can pass
2518            through good chunk of routing engine.
2519          */
2520         skb_reset_mac_header(skb);
2521         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2522
2523         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2524         skb_dst_set(skb, &rt->dst);
2525
2526         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2527                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2528                             nlh->nlmsg_seq, 0, 0, 0);
2529         if (err < 0) {
2530                 kfree_skb(skb);
2531                 goto errout;
2532         }
2533
2534         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2535 errout:
2536         return err;
2537 }
2538
2539 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2540 {
2541         struct sk_buff *skb;
2542         struct net *net = info->nl_net;
2543         u32 seq;
2544         int err;
2545
2546         err = -ENOBUFS;
2547         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2548
2549         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2550         if (skb == NULL)
2551                 goto errout;
2552
2553         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2554                                 event, info->pid, seq, 0, 0, 0);
2555         if (err < 0) {
2556                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2557                 WARN_ON(err == -EMSGSIZE);
2558                 kfree_skb(skb);
2559                 goto errout;
2560         }
2561         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2562                     info->nlh, gfp_any());
2563         return;
2564 errout:
2565         if (err < 0)
2566                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2567 }
2568
2569 static int ip6_route_dev_notify(struct notifier_block *this,
2570                                 unsigned long event, void *data)
2571 {
2572         struct net_device *dev = (struct net_device *)data;
2573         struct net *net = dev_net(dev);
2574
2575         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2576                 net->ipv6.ip6_null_entry->dst.dev = dev;
2577                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2578 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2579                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2580                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2581                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2582                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2583 #endif
2584         }
2585
2586         return NOTIFY_OK;
2587 }
2588
2589 /*
2590  *      /proc
2591  */
2592
2593 #ifdef CONFIG_PROC_FS
2594
/*
 * Buffer bookkeeping record.
 * NOTE(review): nothing in the visible /proc code below references this
 * structure (the handlers use seq_file) -- confirm whether it is still
 * needed before relying on it.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2603
2604 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2605 {
2606         struct seq_file *m = p_arg;
2607         struct neighbour *n;
2608
2609         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2610
2611 #ifdef CONFIG_IPV6_SUBTREES
2612         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2613 #else
2614         seq_puts(m, "00000000000000000000000000000000 00 ");
2615 #endif
2616         rcu_read_lock();
2617         n = dst_get_neighbour(&rt->dst);
2618         if (n) {
2619                 seq_printf(m, "%pi6", n->primary_key);
2620         } else {
2621                 seq_puts(m, "00000000000000000000000000000000");
2622         }
2623         rcu_read_unlock();
2624         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2625                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2626                    rt->dst.__use, rt->rt6i_flags,
2627                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2628         return 0;
2629 }
2630
2631 static int ipv6_route_show(struct seq_file *m, void *v)
2632 {
2633         struct net *net = (struct net *)m->private;
2634         fib6_clean_all(net, rt6_info_route, 0, m);
2635         return 0;
2636 }
2637
2638 static int ipv6_route_open(struct inode *inode, struct file *file)
2639 {
2640         return single_open_net(inode, file, ipv6_route_show);
2641 }
2642
2643 static const struct file_operations ipv6_route_proc_fops = {
2644         .owner          = THIS_MODULE,
2645         .open           = ipv6_route_open,
2646         .read           = seq_read,
2647         .llseek         = seq_lseek,
2648         .release        = single_release_net,
2649 };
2650
2651 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2652 {
2653         struct net *net = (struct net *)seq->private;
2654         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2655                    net->ipv6.rt6_stats->fib_nodes,
2656                    net->ipv6.rt6_stats->fib_route_nodes,
2657                    net->ipv6.rt6_stats->fib_rt_alloc,
2658                    net->ipv6.rt6_stats->fib_rt_entries,
2659                    net->ipv6.rt6_stats->fib_rt_cache,
2660                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2661                    net->ipv6.rt6_stats->fib_discarded_routes);
2662
2663         return 0;
2664 }
2665
2666 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2667 {
2668         return single_open_net(inode, file, rt6_stats_seq_show);
2669 }
2670
2671 static const struct file_operations rt6_stats_seq_fops = {
2672         .owner   = THIS_MODULE,
2673         .open    = rt6_stats_seq_open,
2674         .read    = seq_read,
2675         .llseek  = seq_lseek,
2676         .release = single_release_net,
2677 };
2678 #endif  /* CONFIG_PROC_FS */
2679
2680 #ifdef CONFIG_SYSCTL
2681
2682 static
2683 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2684                               void __user *buffer, size_t *lenp, loff_t *ppos)
2685 {
2686         struct net *net;
2687         int delay;
2688         if (!write)
2689                 return -EINVAL;
2690
2691         net = (struct net *)ctl->extra1;
2692         delay = net->ipv6.sysctl.flush_delay;
2693         proc_dointvec(ctl, write, buffer, lenp, ppos);
2694         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2695         return 0;
2696 }
2697
2698 ctl_table ipv6_route_table_template[] = {
2699         {
2700                 .procname       =       "flush",
2701                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2702                 .maxlen         =       sizeof(int),
2703                 .mode           =       0200,
2704                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2705         },
2706         {
2707                 .procname       =       "gc_thresh",
2708                 .data           =       &ip6_dst_ops_template.gc_thresh,
2709                 .maxlen         =       sizeof(int),
2710                 .mode           =       0644,
2711                 .proc_handler   =       proc_dointvec,
2712         },
2713         {
2714                 .procname       =       "max_size",
2715                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2716                 .maxlen         =       sizeof(int),
2717                 .mode           =       0644,
2718                 .proc_handler   =       proc_dointvec,
2719         },
2720         {
2721                 .procname       =       "gc_min_interval",
2722                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2723                 .maxlen         =       sizeof(int),
2724                 .mode           =       0644,
2725                 .proc_handler   =       proc_dointvec_jiffies,
2726         },
2727         {
2728                 .procname       =       "gc_timeout",
2729                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2730                 .maxlen         =       sizeof(int),
2731                 .mode           =       0644,
2732                 .proc_handler   =       proc_dointvec_jiffies,
2733         },
2734         {
2735                 .procname       =       "gc_interval",
2736                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2737                 .maxlen         =       sizeof(int),
2738                 .mode           =       0644,
2739                 .proc_handler   =       proc_dointvec_jiffies,
2740         },
2741         {
2742                 .procname       =       "gc_elasticity",
2743                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2744                 .maxlen         =       sizeof(int),
2745                 .mode           =       0644,
2746                 .proc_handler   =       proc_dointvec,
2747         },
2748         {
2749                 .procname       =       "mtu_expires",
2750                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2751                 .maxlen         =       sizeof(int),
2752                 .mode           =       0644,
2753                 .proc_handler   =       proc_dointvec_jiffies,
2754         },
2755         {
2756                 .procname       =       "min_adv_mss",
2757                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2758                 .maxlen         =       sizeof(int),
2759                 .mode           =       0644,
2760                 .proc_handler   =       proc_dointvec,
2761         },
2762         {
2763                 .procname       =       "gc_min_interval_ms",
2764                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2765                 .maxlen         =       sizeof(int),
2766                 .mode           =       0644,
2767                 .proc_handler   =       proc_dointvec_ms_jiffies,
2768         },
2769         { }
2770 };
2771
2772 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2773 {
2774         struct ctl_table *table;
2775
2776         table = kmemdup(ipv6_route_table_template,
2777                         sizeof(ipv6_route_table_template),
2778                         GFP_KERNEL);
2779
2780         if (table) {
2781                 table[0].data = &net->ipv6.sysctl.flush_delay;
2782                 table[0].extra1 = net;
2783                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2784                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2785                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2786                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2787                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2788                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2789                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2790                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2791                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2792         }
2793
2794         return table;
2795 }
2796 #endif
2797
2798 static int __net_init ip6_route_net_init(struct net *net)
2799 {
2800         int ret = -ENOMEM;
2801
2802         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2803                sizeof(net->ipv6.ip6_dst_ops));
2804
2805         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2806                 goto out_ip6_dst_ops;
2807
2808         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2809                                            sizeof(*net->ipv6.ip6_null_entry),
2810                                            GFP_KERNEL);
2811         if (!net->ipv6.ip6_null_entry)
2812                 goto out_ip6_dst_entries;
2813         net->ipv6.ip6_null_entry->dst.path =
2814                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2815         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2816         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2817                          ip6_template_metrics, true);
2818
2819 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2820         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2821                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2822                                                GFP_KERNEL);
2823         if (!net->ipv6.ip6_prohibit_entry)
2824                 goto out_ip6_null_entry;
2825         net->ipv6.ip6_prohibit_entry->dst.path =
2826                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2827         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2828         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2829                          ip6_template_metrics, true);
2830
2831         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2832                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2833                                                GFP_KERNEL);
2834         if (!net->ipv6.ip6_blk_hole_entry)
2835                 goto out_ip6_prohibit_entry;
2836         net->ipv6.ip6_blk_hole_entry->dst.path =
2837                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2838         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2839         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2840                          ip6_template_metrics, true);
2841 #endif
2842
2843         net->ipv6.sysctl.flush_delay = 0;
2844         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2845         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2846         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2847         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2848         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2849         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2850         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2851
2852         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2853
2854         ret = 0;
2855 out:
2856         return ret;
2857
2858 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2859 out_ip6_prohibit_entry:
2860         kfree(net->ipv6.ip6_prohibit_entry);
2861 out_ip6_null_entry:
2862         kfree(net->ipv6.ip6_null_entry);
2863 #endif
2864 out_ip6_dst_entries:
2865         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2866 out_ip6_dst_ops:
2867         goto out;
2868 }
2869
2870 static void __net_exit ip6_route_net_exit(struct net *net)
2871 {
2872         kfree(net->ipv6.ip6_null_entry);
2873 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2874         kfree(net->ipv6.ip6_prohibit_entry);
2875         kfree(net->ipv6.ip6_blk_hole_entry);
2876 #endif
2877         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2878 }
2879
2880 static int __net_init ip6_route_net_init_late(struct net *net)
2881 {
2882 #ifdef CONFIG_PROC_FS
2883         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2884         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2885 #endif
2886         return 0;
2887 }
2888
2889 static void __net_exit ip6_route_net_exit_late(struct net *net)
2890 {
2891 #ifdef CONFIG_PROC_FS
2892         proc_net_remove(net, "ipv6_route");
2893         proc_net_remove(net, "rt6_stats");
2894 #endif
2895 }
2896
2897 static struct pernet_operations ip6_route_net_ops = {
2898         .init = ip6_route_net_init,
2899         .exit = ip6_route_net_exit,
2900 };
2901
2902 static struct pernet_operations ip6_route_net_late_ops = {
2903         .init = ip6_route_net_init_late,
2904         .exit = ip6_route_net_exit_late,
2905 };
2906
2907 static struct notifier_block ip6_route_dev_notifier = {
2908         .notifier_call = ip6_route_dev_notify,
2909         .priority = 0,
2910 };
2911
2912 int __init ip6_route_init(void)
2913 {
2914         int ret;
2915
2916         ret = -ENOMEM;
2917         ip6_dst_ops_template.kmem_cachep =
2918                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2919                                   SLAB_HWCACHE_ALIGN, NULL);
2920         if (!ip6_dst_ops_template.kmem_cachep)
2921                 goto out;
2922
2923         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2924         if (ret)
2925                 goto out_kmem_cache;
2926
2927         ret = register_pernet_subsys(&ip6_route_net_ops);
2928         if (ret)
2929                 goto out_dst_entries;
2930
2931         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2932
2933         /* Registering of the loopback is done before this portion of code,
2934          * the loopback reference in rt6_info will not be taken, do it
2935          * manually for init_net */
2936         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2937         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2938   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2939         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2940         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2941         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2942         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2943   #endif
2944         ret = fib6_init();
2945         if (ret)
2946                 goto out_register_subsys;
2947
2948         ret = xfrm6_init();
2949         if (ret)
2950                 goto out_fib6_init;
2951
2952         ret = fib6_rules_init();
2953         if (ret)
2954                 goto xfrm6_init;
2955
2956         ret = register_pernet_subsys(&ip6_route_net_late_ops);
2957         if (ret)
2958                 goto fib6_rules_init;
2959
2960         ret = -ENOBUFS;
2961         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2962             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2963             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2964                 goto out_register_late_subsys;
2965
2966         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2967         if (ret)
2968                 goto out_register_late_subsys;
2969
2970 out:
2971         return ret;
2972
2973 out_register_late_subsys:
2974         unregister_pernet_subsys(&ip6_route_net_late_ops);
2975 fib6_rules_init:
2976         fib6_rules_cleanup();
2977 xfrm6_init:
2978         xfrm6_fini();
2979 out_fib6_init:
2980         fib6_gc_cleanup();
2981 out_register_subsys:
2982         unregister_pernet_subsys(&ip6_route_net_ops);
2983 out_dst_entries:
2984         dst_entries_destroy(&ip6_dst_blackhole_ops);
2985 out_kmem_cache:
2986         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2987         goto out;
2988 }
2989
2990 void ip6_route_cleanup(void)
2991 {
2992         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2993         unregister_pernet_subsys(&ip6_route_net_late_ops);
2994         fib6_rules_cleanup();
2995         xfrm6_fini();
2996         fib6_gc_cleanup();
2997         unregister_pernet_subsys(&ip6_route_net_ops);
2998         dst_entries_destroy(&ip6_dst_blackhole_ops);
2999         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3000 }