ipv6: rt6_info output redirect to tunnel output
[firefly-linux-kernel-4.4.55.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61 #include <net/lwtunnel.h>
62
63 #include <asm/uaccess.h>
64
65 #ifdef CONFIG_SYSCTL
66 #include <linux/sysctl.h>
67 #endif
68
/*
 * Result codes produced while scoring a route (see rt6_check_neigh() and
 * rt6_score_route()).  Negative values are failures; find_match() handles
 * them as follows:
 *
 *   RT6_NUD_FAIL_DO_RR - treated as the lowest valid score (0) and asks
 *                        the caller to round-robin.
 *   RT6_NUD_FAIL_PROBE - kept as a (negative) score.
 *   RT6_NUD_FAIL_HARD  - the route is skipped.
 */
enum rt6_nud_state {
	RT6_NUD_SUCCEED		=  1,
	RT6_NUD_FAIL_DO_RR	= -1,
	RT6_NUD_FAIL_PROBE	= -2,
	RT6_NUD_FAIL_HARD	= -3,
};
75
76 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int      ip6_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void             ip6_dst_destroy(struct dst_entry *);
82 static void             ip6_dst_ifdown(struct dst_entry *,
83                                        struct net_device *dev, int how);
84 static int               ip6_dst_gc(struct dst_ops *ops);
85
86 static int              ip6_pkt_discard(struct sk_buff *skb);
87 static int              ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
88 static int              ip6_pkt_prohibit(struct sk_buff *skb);
89 static int              ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
90 static void             ip6_link_failure(struct sk_buff *skb);
91 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
92                                            struct sk_buff *skb, u32 mtu);
93 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
94                                         struct sk_buff *skb);
95 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
96 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
97
98 #ifdef CONFIG_IPV6_ROUTE_INFO
99 static struct rt6_info *rt6_add_route_info(struct net *net,
100                                            const struct in6_addr *prefix, int prefixlen,
101                                            const struct in6_addr *gwaddr, int ifindex,
102                                            unsigned int pref);
103 static struct rt6_info *rt6_get_route_info(struct net *net,
104                                            const struct in6_addr *prefix, int prefixlen,
105                                            const struct in6_addr *gwaddr, int ifindex);
106 #endif
107
/* A spinlock-protected list of rt6_info entries linked via rt6i_uncached. */
struct uncached_list {
        spinlock_t              lock;   /* taken around every change to @head */
        struct list_head        head;
};

/* One uncached list per CPU. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
114
115 static void rt6_uncached_list_add(struct rt6_info *rt)
116 {
117         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
118
119         rt->dst.flags |= DST_NOCACHE;
120         rt->rt6i_uncached_list = ul;
121
122         spin_lock_bh(&ul->lock);
123         list_add_tail(&rt->rt6i_uncached, &ul->head);
124         spin_unlock_bh(&ul->lock);
125 }
126
127 static void rt6_uncached_list_del(struct rt6_info *rt)
128 {
129         if (!list_empty(&rt->rt6i_uncached)) {
130                 struct uncached_list *ul = rt->rt6i_uncached_list;
131
132                 spin_lock_bh(&ul->lock);
133                 list_del(&rt->rt6i_uncached);
134                 spin_unlock_bh(&ul->lock);
135         }
136 }
137
138 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
139 {
140         struct net_device *loopback_dev = net->loopback_dev;
141         int cpu;
142
143         for_each_possible_cpu(cpu) {
144                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
145                 struct rt6_info *rt;
146
147                 spin_lock_bh(&ul->lock);
148                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
149                         struct inet6_dev *rt_idev = rt->rt6i_idev;
150                         struct net_device *rt_dev = rt->dst.dev;
151
152                         if (rt_idev && (rt_idev->dev == dev || !dev) &&
153                             rt_idev->dev != loopback_dev) {
154                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
155                                 in6_dev_put(rt_idev);
156                         }
157
158                         if (rt_dev && (rt_dev == dev || !dev) &&
159                             rt_dev != loopback_dev) {
160                                 rt->dst.dev = loopback_dev;
161                                 dev_hold(rt->dst.dev);
162                                 dev_put(rt_dev);
163                         }
164                 }
165                 spin_unlock_bh(&ul->lock);
166         }
167 }
168
169 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
170 {
171         return dst_metrics_write_ptr(rt->dst.from);
172 }
173
174 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
175 {
176         struct rt6_info *rt = (struct rt6_info *)dst;
177
178         if (rt->rt6i_flags & RTF_PCPU)
179                 return rt6_pcpu_cow_metrics(rt);
180         else if (rt->rt6i_flags & RTF_CACHE)
181                 return NULL;
182         else
183                 return dst_cow_metrics_generic(dst, old);
184 }
185
186 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
187                                              struct sk_buff *skb,
188                                              const void *daddr)
189 {
190         struct in6_addr *p = &rt->rt6i_gateway;
191
192         if (!ipv6_addr_any(p))
193                 return (const void *) p;
194         else if (skb)
195                 return &ipv6_hdr(skb)->daddr;
196         return daddr;
197 }
198
199 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
200                                           struct sk_buff *skb,
201                                           const void *daddr)
202 {
203         struct rt6_info *rt = (struct rt6_info *) dst;
204         struct neighbour *n;
205
206         daddr = choose_neigh_daddr(rt, skb, daddr);
207         n = __ipv6_neigh_lookup(dst->dev, daddr);
208         if (n)
209                 return n;
210         return neigh_create(&nd_tbl, daddr, dst->dev);
211 }
212
213 static struct dst_ops ip6_dst_ops_template = {
214         .family                 =       AF_INET6,
215         .gc                     =       ip6_dst_gc,
216         .gc_thresh              =       1024,
217         .check                  =       ip6_dst_check,
218         .default_advmss         =       ip6_default_advmss,
219         .mtu                    =       ip6_mtu,
220         .cow_metrics            =       ipv6_cow_metrics,
221         .destroy                =       ip6_dst_destroy,
222         .ifdown                 =       ip6_dst_ifdown,
223         .negative_advice        =       ip6_negative_advice,
224         .link_failure           =       ip6_link_failure,
225         .update_pmtu            =       ip6_rt_update_pmtu,
226         .redirect               =       rt6_do_redirect,
227         .local_out              =       __ip6_local_out,
228         .neigh_lookup           =       ip6_neigh_lookup,
229 };
230
231 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
232 {
233         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
234
235         return mtu ? : dst->dev->mtu;
236 }
237
/* update_pmtu handler of ip6_dst_blackhole_ops: intentionally does nothing. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                         struct sk_buff *skb, u32 mtu)
{
}
242
/* redirect handler of ip6_dst_blackhole_ops: intentionally does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                      struct sk_buff *skb)
{
}
247
/* cow_metrics handler of ip6_dst_blackhole_ops: never hands out writable metrics. */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
                                         unsigned long old)
{
        return NULL;
}
253
254 static struct dst_ops ip6_dst_blackhole_ops = {
255         .family                 =       AF_INET6,
256         .destroy                =       ip6_dst_destroy,
257         .check                  =       ip6_dst_check,
258         .mtu                    =       ip6_blackhole_mtu,
259         .default_advmss         =       ip6_default_advmss,
260         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
261         .redirect               =       ip6_rt_blackhole_redirect,
262         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
263         .neigh_lookup           =       ip6_neigh_lookup,
264 };
265
/* Read-only metrics array with every entry zero; the hop limit slot is spelled out. */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 0,
};
269
270 static const struct rt6_info ip6_null_entry_template = {
271         .dst = {
272                 .__refcnt       = ATOMIC_INIT(1),
273                 .__use          = 1,
274                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
275                 .error          = -ENETUNREACH,
276                 .input          = ip6_pkt_discard,
277                 .output         = ip6_pkt_discard_out,
278         },
279         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
280         .rt6i_protocol  = RTPROT_KERNEL,
281         .rt6i_metric    = ~(u32) 0,
282         .rt6i_ref       = ATOMIC_INIT(1),
283 };
284
285 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
286
287 static const struct rt6_info ip6_prohibit_entry_template = {
288         .dst = {
289                 .__refcnt       = ATOMIC_INIT(1),
290                 .__use          = 1,
291                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
292                 .error          = -EACCES,
293                 .input          = ip6_pkt_prohibit,
294                 .output         = ip6_pkt_prohibit_out,
295         },
296         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
297         .rt6i_protocol  = RTPROT_KERNEL,
298         .rt6i_metric    = ~(u32) 0,
299         .rt6i_ref       = ATOMIC_INIT(1),
300 };
301
302 static const struct rt6_info ip6_blk_hole_entry_template = {
303         .dst = {
304                 .__refcnt       = ATOMIC_INIT(1),
305                 .__use          = 1,
306                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
307                 .error          = -EINVAL,
308                 .input          = dst_discard,
309                 .output         = dst_discard_sk,
310         },
311         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
312         .rt6i_protocol  = RTPROT_KERNEL,
313         .rt6i_metric    = ~(u32) 0,
314         .rt6i_ref       = ATOMIC_INIT(1),
315 };
316
317 #endif
318
319 /* allocate dst with ip6_dst_ops */
320 static struct rt6_info *__ip6_dst_alloc(struct net *net,
321                                         struct net_device *dev,
322                                         int flags,
323                                         struct fib6_table *table)
324 {
325         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
326                                         0, DST_OBSOLETE_FORCE_CHK, flags);
327
328         if (rt) {
329                 struct dst_entry *dst = &rt->dst;
330
331                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
332                 INIT_LIST_HEAD(&rt->rt6i_siblings);
333                 INIT_LIST_HEAD(&rt->rt6i_uncached);
334         }
335         return rt;
336 }
337
338 static struct rt6_info *ip6_dst_alloc(struct net *net,
339                                       struct net_device *dev,
340                                       int flags,
341                                       struct fib6_table *table)
342 {
343         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
344
345         if (rt) {
346                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
347                 if (rt->rt6i_pcpu) {
348                         int cpu;
349
350                         for_each_possible_cpu(cpu) {
351                                 struct rt6_info **p;
352
353                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
354                                 /* no one shares rt */
355                                 *p =  NULL;
356                         }
357                 } else {
358                         dst_destroy((struct dst_entry *)rt);
359                         return NULL;
360                 }
361         }
362
363         return rt;
364 }
365
366 static void ip6_dst_destroy(struct dst_entry *dst)
367 {
368         struct rt6_info *rt = (struct rt6_info *)dst;
369         struct dst_entry *from = dst->from;
370         struct inet6_dev *idev;
371
372         dst_destroy_metrics_generic(dst);
373         free_percpu(rt->rt6i_pcpu);
374         rt6_uncached_list_del(rt);
375
376         idev = rt->rt6i_idev;
377         if (idev) {
378                 rt->rt6i_idev = NULL;
379                 in6_dev_put(idev);
380         }
381
382         dst->from = NULL;
383         dst_release(from);
384 }
385
386 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
387                            int how)
388 {
389         struct rt6_info *rt = (struct rt6_info *)dst;
390         struct inet6_dev *idev = rt->rt6i_idev;
391         struct net_device *loopback_dev =
392                 dev_net(dev)->loopback_dev;
393
394         if (dev != loopback_dev) {
395                 if (idev && idev->dev == dev) {
396                         struct inet6_dev *loopback_idev =
397                                 in6_dev_get(loopback_dev);
398                         if (loopback_idev) {
399                                 rt->rt6i_idev = loopback_idev;
400                                 in6_dev_put(idev);
401                         }
402                 }
403         }
404 }
405
406 static bool rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES) {
409                 if (time_after(jiffies, rt->dst.expires))
410                         return true;
411         } else if (rt->dst.from) {
412                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
413         }
414         return false;
415 }
416
417 /* Multipath route selection:
418  *   Hash based function using packet header and flowlabel.
419  * Adapted from fib_info_hashfn()
420  */
421 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
422                                const struct flowi6 *fl6)
423 {
424         unsigned int val = fl6->flowi6_proto;
425
426         val ^= ipv6_addr_hash(&fl6->daddr);
427         val ^= ipv6_addr_hash(&fl6->saddr);
428
429         /* Work only if this not encapsulated */
430         switch (fl6->flowi6_proto) {
431         case IPPROTO_UDP:
432         case IPPROTO_TCP:
433         case IPPROTO_SCTP:
434                 val ^= (__force u16)fl6->fl6_sport;
435                 val ^= (__force u16)fl6->fl6_dport;
436                 break;
437
438         case IPPROTO_ICMPV6:
439                 val ^= (__force u16)fl6->fl6_icmp_type;
440                 val ^= (__force u16)fl6->fl6_icmp_code;
441                 break;
442         }
443         /* RFC6438 recommands to use flowlabel */
444         val ^= (__force u32)fl6->flowlabel;
445
446         /* Perhaps, we need to tune, this function? */
447         val = val ^ (val >> 7) ^ (val >> 12);
448         return val % candidate_count;
449 }
450
451 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
452                                              struct flowi6 *fl6, int oif,
453                                              int strict)
454 {
455         struct rt6_info *sibling, *next_sibling;
456         int route_choosen;
457
458         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
459         /* Don't change the route, if route_choosen == 0
460          * (siblings does not include ourself)
461          */
462         if (route_choosen)
463                 list_for_each_entry_safe(sibling, next_sibling,
464                                 &match->rt6i_siblings, rt6i_siblings) {
465                         route_choosen--;
466                         if (route_choosen == 0) {
467                                 if (rt6_score_route(sibling, oif, strict) < 0)
468                                         break;
469                                 match = sibling;
470                                 break;
471                         }
472                 }
473         return match;
474 }
475
476 /*
477  *      Route lookup. Any table->tb6_lock is implied.
478  */
479
480 static inline struct rt6_info *rt6_device_match(struct net *net,
481                                                     struct rt6_info *rt,
482                                                     const struct in6_addr *saddr,
483                                                     int oif,
484                                                     int flags)
485 {
486         struct rt6_info *local = NULL;
487         struct rt6_info *sprt;
488
489         if (!oif && ipv6_addr_any(saddr))
490                 goto out;
491
492         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
493                 struct net_device *dev = sprt->dst.dev;
494
495                 if (oif) {
496                         if (dev->ifindex == oif)
497                                 return sprt;
498                         if (dev->flags & IFF_LOOPBACK) {
499                                 if (!sprt->rt6i_idev ||
500                                     sprt->rt6i_idev->dev->ifindex != oif) {
501                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
502                                                 continue;
503                                         if (local && (!oif ||
504                                                       local->rt6i_idev->dev->ifindex == oif))
505                                                 continue;
506                                 }
507                                 local = sprt;
508                         }
509                 } else {
510                         if (ipv6_chk_addr(net, saddr, dev,
511                                           flags & RT6_LOOKUP_F_IFACE))
512                                 return sprt;
513                 }
514         }
515
516         if (oif) {
517                 if (local)
518                         return local;
519
520                 if (flags & RT6_LOOKUP_F_IFACE)
521                         return net->ipv6.ip6_null_entry;
522         }
523 out:
524         return rt;
525 }
526
527 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour solicitation request, queued by rt6_probe(). */
struct __rt6_probe_work {
        struct work_struct work;
        struct in6_addr target;         /* address to solicit (the route's gateway) */
        struct net_device *dev;         /* held until the work item has run */
};
533
534 static void rt6_probe_deferred(struct work_struct *w)
535 {
536         struct in6_addr mcaddr;
537         struct __rt6_probe_work *work =
538                 container_of(w, struct __rt6_probe_work, work);
539
540         addrconf_addr_solict_mult(&work->target, &mcaddr);
541         ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
542         dev_put(work->dev);
543         kfree(work);
544 }
545
546 static void rt6_probe(struct rt6_info *rt)
547 {
548         struct neighbour *neigh;
549         /*
550          * Okay, this does not seem to be appropriate
551          * for now, however, we need to check if it
552          * is really so; aka Router Reachability Probing.
553          *
554          * Router Reachability Probe MUST be rate-limited
555          * to no more than one per minute.
556          */
557         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
558                 return;
559         rcu_read_lock_bh();
560         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
561         if (neigh) {
562                 write_lock(&neigh->lock);
563                 if (neigh->nud_state & NUD_VALID)
564                         goto out;
565         }
566
567         if (!neigh ||
568             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
569                 struct __rt6_probe_work *work;
570
571                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
572
573                 if (neigh && work)
574                         __neigh_set_probe_once(neigh);
575
576                 if (neigh)
577                         write_unlock(&neigh->lock);
578
579                 if (work) {
580                         INIT_WORK(&work->work, rt6_probe_deferred);
581                         work->target = rt->rt6i_gateway;
582                         dev_hold(rt->dst.dev);
583                         work->dev = rt->dst.dev;
584                         schedule_work(&work->work);
585                 }
586         } else {
587 out:
588                 write_unlock(&neigh->lock);
589         }
590         rcu_read_unlock_bh();
591 }
592 #else
/* No-op variant used when CONFIG_IPV6_ROUTER_PREF is not set. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
596 #endif
597
598 /*
599  * Default Router Selection (RFC 2461 6.3.6)
600  */
601 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
602 {
603         struct net_device *dev = rt->dst.dev;
604         if (!oif || dev->ifindex == oif)
605                 return 2;
606         if ((dev->flags & IFF_LOOPBACK) &&
607             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
608                 return 1;
609         return 0;
610 }
611
612 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
613 {
614         struct neighbour *neigh;
615         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
616
617         if (rt->rt6i_flags & RTF_NONEXTHOP ||
618             !(rt->rt6i_flags & RTF_GATEWAY))
619                 return RT6_NUD_SUCCEED;
620
621         rcu_read_lock_bh();
622         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
623         if (neigh) {
624                 read_lock(&neigh->lock);
625                 if (neigh->nud_state & NUD_VALID)
626                         ret = RT6_NUD_SUCCEED;
627 #ifdef CONFIG_IPV6_ROUTER_PREF
628                 else if (!(neigh->nud_state & NUD_FAILED))
629                         ret = RT6_NUD_SUCCEED;
630                 else
631                         ret = RT6_NUD_FAIL_PROBE;
632 #endif
633                 read_unlock(&neigh->lock);
634         } else {
635                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
636                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
637         }
638         rcu_read_unlock_bh();
639
640         return ret;
641 }
642
643 static int rt6_score_route(struct rt6_info *rt, int oif,
644                            int strict)
645 {
646         int m;
647
648         m = rt6_check_dev(rt, oif);
649         if (!m && (strict & RT6_LOOKUP_F_IFACE))
650                 return RT6_NUD_FAIL_HARD;
651 #ifdef CONFIG_IPV6_ROUTER_PREF
652         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
653 #endif
654         if (strict & RT6_LOOKUP_F_REACHABLE) {
655                 int n = rt6_check_neigh(rt);
656                 if (n < 0)
657                         return n;
658         }
659         return m;
660 }
661
662 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
663                                    int *mpri, struct rt6_info *match,
664                                    bool *do_rr)
665 {
666         int m;
667         bool match_do_rr = false;
668
669         if (rt6_check_expired(rt))
670                 goto out;
671
672         m = rt6_score_route(rt, oif, strict);
673         if (m == RT6_NUD_FAIL_DO_RR) {
674                 match_do_rr = true;
675                 m = 0; /* lowest valid score */
676         } else if (m == RT6_NUD_FAIL_HARD) {
677                 goto out;
678         }
679
680         if (strict & RT6_LOOKUP_F_REACHABLE)
681                 rt6_probe(rt);
682
683         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
684         if (m > *mpri) {
685                 *do_rr = match_do_rr;
686                 *mpri = m;
687                 match = rt;
688         }
689 out:
690         return match;
691 }
692
693 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
694                                      struct rt6_info *rr_head,
695                                      u32 metric, int oif, int strict,
696                                      bool *do_rr)
697 {
698         struct rt6_info *rt, *match, *cont;
699         int mpri = -1;
700
701         match = NULL;
702         cont = NULL;
703         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
704                 if (rt->rt6i_metric != metric) {
705                         cont = rt;
706                         break;
707                 }
708
709                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
710         }
711
712         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
713                 if (rt->rt6i_metric != metric) {
714                         cont = rt;
715                         break;
716                 }
717
718                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
719         }
720
721         if (match || !cont)
722                 return match;
723
724         for (rt = cont; rt; rt = rt->dst.rt6_next)
725                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
726
727         return match;
728 }
729
730 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
731 {
732         struct rt6_info *match, *rt0;
733         struct net *net;
734         bool do_rr = false;
735
736         rt0 = fn->rr_ptr;
737         if (!rt0)
738                 fn->rr_ptr = rt0 = fn->leaf;
739
740         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
741                              &do_rr);
742
743         if (do_rr) {
744                 struct rt6_info *next = rt0->dst.rt6_next;
745
746                 /* no entries matched; do round-robin */
747                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
748                         next = fn->leaf;
749
750                 if (next != rt0)
751                         fn->rr_ptr = next;
752         }
753
754         net = dev_net(rt0->dst.dev);
755         return match ? match : net->ipv6.ip6_null_entry;
756 }
757
758 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
759 {
760         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
761 }
762
763 #ifdef CONFIG_IPV6_ROUTE_INFO
764 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
765                   const struct in6_addr *gwaddr)
766 {
767         struct net *net = dev_net(dev);
768         struct route_info *rinfo = (struct route_info *) opt;
769         struct in6_addr prefix_buf, *prefix;
770         unsigned int pref;
771         unsigned long lifetime;
772         struct rt6_info *rt;
773
774         if (len < sizeof(struct route_info)) {
775                 return -EINVAL;
776         }
777
778         /* Sanity check for prefix_len and length */
779         if (rinfo->length > 3) {
780                 return -EINVAL;
781         } else if (rinfo->prefix_len > 128) {
782                 return -EINVAL;
783         } else if (rinfo->prefix_len > 64) {
784                 if (rinfo->length < 2) {
785                         return -EINVAL;
786                 }
787         } else if (rinfo->prefix_len > 0) {
788                 if (rinfo->length < 1) {
789                         return -EINVAL;
790                 }
791         }
792
793         pref = rinfo->route_pref;
794         if (pref == ICMPV6_ROUTER_PREF_INVALID)
795                 return -EINVAL;
796
797         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
798
799         if (rinfo->length == 3)
800                 prefix = (struct in6_addr *)rinfo->prefix;
801         else {
802                 /* this function is safe */
803                 ipv6_addr_prefix(&prefix_buf,
804                                  (struct in6_addr *)rinfo->prefix,
805                                  rinfo->prefix_len);
806                 prefix = &prefix_buf;
807         }
808
809         if (rinfo->prefix_len == 0)
810                 rt = rt6_get_dflt_router(gwaddr, dev);
811         else
812                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
813                                         gwaddr, dev->ifindex);
814
815         if (rt && !lifetime) {
816                 ip6_del_rt(rt);
817                 rt = NULL;
818         }
819
820         if (!rt && lifetime)
821                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
822                                         pref);
823         else if (rt)
824                 rt->rt6i_flags = RTF_ROUTEINFO |
825                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
826
827         if (rt) {
828                 if (!addrconf_finite_timeout(lifetime))
829                         rt6_clean_expires(rt);
830                 else
831                         rt6_set_expires(rt, jiffies + HZ * lifetime);
832
833                 ip6_rt_put(rt);
834         }
835         return 0;
836 }
837 #endif
838
839 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
840                                         struct in6_addr *saddr)
841 {
842         struct fib6_node *pn;
843         while (1) {
844                 if (fn->fn_flags & RTN_TL_ROOT)
845                         return NULL;
846                 pn = fn->parent;
847                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
848                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
849                 else
850                         fn = pn;
851                 if (fn->fn_flags & RTN_RTINFO)
852                         return fn;
853         }
854 }
855
856 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
857                                              struct fib6_table *table,
858                                              struct flowi6 *fl6, int flags)
859 {
860         struct fib6_node *fn;
861         struct rt6_info *rt;
862
863         read_lock_bh(&table->tb6_lock);
864         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
865 restart:
866         rt = fn->leaf;
867         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
868         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
869                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
870         if (rt == net->ipv6.ip6_null_entry) {
871                 fn = fib6_backtrack(fn, &fl6->saddr);
872                 if (fn)
873                         goto restart;
874         }
875         dst_use(&rt->dst, jiffies);
876         read_unlock_bh(&table->tb6_lock);
877         return rt;
878
879 }
880
881 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
882                                     int flags)
883 {
884         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
885 }
886 EXPORT_SYMBOL_GPL(ip6_route_lookup);
887
888 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
889                             const struct in6_addr *saddr, int oif, int strict)
890 {
891         struct flowi6 fl6 = {
892                 .flowi6_oif = oif,
893                 .daddr = *daddr,
894         };
895         struct dst_entry *dst;
896         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
897
898         if (saddr) {
899                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
900                 flags |= RT6_LOOKUP_F_HAS_SADDR;
901         }
902
903         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
904         if (dst->error == 0)
905                 return (struct rt6_info *) dst;
906
907         dst_release(dst);
908
909         return NULL;
910 }
911 EXPORT_SYMBOL(rt6_lookup);
912
913 /* ip6_ins_rt is called with FREE table->tb6_lock.
914    It takes new route entry, the addition fails by any reason the
915    route is freed. In any case, if caller does not hold it, it may
916    be destroyed.
917  */
918
919 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
920                         struct mx6_config *mxc)
921 {
922         int err;
923         struct fib6_table *table;
924
925         table = rt->rt6i_table;
926         write_lock_bh(&table->tb6_lock);
927         err = fib6_add(&table->tb6_root, rt, info, mxc);
928         write_unlock_bh(&table->tb6_lock);
929
930         return err;
931 }
932
933 int ip6_ins_rt(struct rt6_info *rt)
934 {
935         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
936         struct mx6_config mxc = { .mx = NULL, };
937
938         return __ip6_ins_rt(rt, &info, &mxc);
939 }
940
941 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
942                                            const struct in6_addr *daddr,
943                                            const struct in6_addr *saddr)
944 {
945         struct rt6_info *rt;
946
947         /*
948          *      Clone the route.
949          */
950
951         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
952                 ort = (struct rt6_info *)ort->dst.from;
953
954         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
955                              0, ort->rt6i_table);
956
957         if (!rt)
958                 return NULL;
959
960         ip6_rt_copy_init(rt, ort);
961         rt->rt6i_flags |= RTF_CACHE;
962         rt->rt6i_metric = 0;
963         rt->dst.flags |= DST_HOST;
964         rt->rt6i_dst.addr = *daddr;
965         rt->rt6i_dst.plen = 128;
966
967         if (!rt6_is_gw_or_nonexthop(ort)) {
968                 if (ort->rt6i_dst.plen != 128 &&
969                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
970                         rt->rt6i_flags |= RTF_ANYCAST;
971 #ifdef CONFIG_IPV6_SUBTREES
972                 if (rt->rt6i_src.plen && saddr) {
973                         rt->rt6i_src.addr = *saddr;
974                         rt->rt6i_src.plen = 128;
975                 }
976 #endif
977         }
978
979         return rt;
980 }
981
982 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
983 {
984         struct rt6_info *pcpu_rt;
985
986         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
987                                   rt->dst.dev, rt->dst.flags,
988                                   rt->rt6i_table);
989
990         if (!pcpu_rt)
991                 return NULL;
992         ip6_rt_copy_init(pcpu_rt, rt);
993         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
994         pcpu_rt->rt6i_flags |= RTF_PCPU;
995         return pcpu_rt;
996 }
997
998 /* It should be called with read_lock_bh(&tb6_lock) acquired */
999 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1000 {
1001         struct rt6_info *pcpu_rt, *prev, **p;
1002
1003         p = this_cpu_ptr(rt->rt6i_pcpu);
1004         pcpu_rt = *p;
1005
1006         if (pcpu_rt)
1007                 goto done;
1008
1009         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1010         if (!pcpu_rt) {
1011                 struct net *net = dev_net(rt->dst.dev);
1012
1013                 pcpu_rt = net->ipv6.ip6_null_entry;
1014                 goto done;
1015         }
1016
1017         prev = cmpxchg(p, NULL, pcpu_rt);
1018         if (prev) {
1019                 /* If someone did it before us, return prev instead */
1020                 dst_destroy(&pcpu_rt->dst);
1021                 pcpu_rt = prev;
1022         }
1023
1024 done:
1025         dst_hold(&pcpu_rt->dst);
1026         rt6_dst_from_metrics_check(pcpu_rt);
1027         return pcpu_rt;
1028 }
1029
1030 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1031                                       struct flowi6 *fl6, int flags)
1032 {
1033         struct fib6_node *fn, *saved_fn;
1034         struct rt6_info *rt;
1035         int strict = 0;
1036
1037         strict |= flags & RT6_LOOKUP_F_IFACE;
1038         if (net->ipv6.devconf_all->forwarding == 0)
1039                 strict |= RT6_LOOKUP_F_REACHABLE;
1040
1041         read_lock_bh(&table->tb6_lock);
1042
1043         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1044         saved_fn = fn;
1045
1046 redo_rt6_select:
1047         rt = rt6_select(fn, oif, strict);
1048         if (rt->rt6i_nsiblings)
1049                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1050         if (rt == net->ipv6.ip6_null_entry) {
1051                 fn = fib6_backtrack(fn, &fl6->saddr);
1052                 if (fn)
1053                         goto redo_rt6_select;
1054                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1055                         /* also consider unreachable route */
1056                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1057                         fn = saved_fn;
1058                         goto redo_rt6_select;
1059                 }
1060         }
1061
1062
1063         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1064                 dst_use(&rt->dst, jiffies);
1065                 read_unlock_bh(&table->tb6_lock);
1066
1067                 rt6_dst_from_metrics_check(rt);
1068                 return rt;
1069         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1070                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1071                 /* Create a RTF_CACHE clone which will not be
1072                  * owned by the fib6 tree.  It is for the special case where
1073                  * the daddr in the skb during the neighbor look-up is different
1074                  * from the fl6->daddr used to look-up route here.
1075                  */
1076
1077                 struct rt6_info *uncached_rt;
1078
1079                 dst_use(&rt->dst, jiffies);
1080                 read_unlock_bh(&table->tb6_lock);
1081
1082                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1083                 dst_release(&rt->dst);
1084
1085                 if (uncached_rt)
1086                         rt6_uncached_list_add(uncached_rt);
1087                 else
1088                         uncached_rt = net->ipv6.ip6_null_entry;
1089
1090                 dst_hold(&uncached_rt->dst);
1091                 return uncached_rt;
1092
1093         } else {
1094                 /* Get a percpu copy */
1095
1096                 struct rt6_info *pcpu_rt;
1097
1098                 rt->dst.lastuse = jiffies;
1099                 rt->dst.__use++;
1100                 pcpu_rt = rt6_get_pcpu_route(rt);
1101                 read_unlock_bh(&table->tb6_lock);
1102
1103                 return pcpu_rt;
1104         }
1105 }
1106
1107 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1108                                             struct flowi6 *fl6, int flags)
1109 {
1110         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1111 }
1112
1113 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1114                                                 struct net_device *dev,
1115                                                 struct flowi6 *fl6, int flags)
1116 {
1117         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1118                 flags |= RT6_LOOKUP_F_IFACE;
1119
1120         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1121 }
1122
1123 void ip6_route_input(struct sk_buff *skb)
1124 {
1125         const struct ipv6hdr *iph = ipv6_hdr(skb);
1126         struct net *net = dev_net(skb->dev);
1127         int flags = RT6_LOOKUP_F_HAS_SADDR;
1128         struct flowi6 fl6 = {
1129                 .flowi6_iif = skb->dev->ifindex,
1130                 .daddr = iph->daddr,
1131                 .saddr = iph->saddr,
1132                 .flowlabel = ip6_flowinfo(iph),
1133                 .flowi6_mark = skb->mark,
1134                 .flowi6_proto = iph->nexthdr,
1135         };
1136
1137         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1138 }
1139
1140 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1141                                              struct flowi6 *fl6, int flags)
1142 {
1143         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1144 }
1145
1146 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1147                                     struct flowi6 *fl6)
1148 {
1149         int flags = 0;
1150
1151         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1152
1153         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1154                 flags |= RT6_LOOKUP_F_IFACE;
1155
1156         if (!ipv6_addr_any(&fl6->saddr))
1157                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1158         else if (sk)
1159                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1160
1161         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1162 }
1163 EXPORT_SYMBOL(ip6_route_output);
1164
1165 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1166 {
1167         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1168         struct dst_entry *new = NULL;
1169
1170         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1171         if (rt) {
1172                 new = &rt->dst;
1173
1174                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1175
1176                 new->__use = 1;
1177                 new->input = dst_discard;
1178                 new->output = dst_discard_sk;
1179
1180                 if (dst_metrics_read_only(&ort->dst))
1181                         new->_metrics = ort->dst._metrics;
1182                 else
1183                         dst_copy_metrics(new, &ort->dst);
1184                 rt->rt6i_idev = ort->rt6i_idev;
1185                 if (rt->rt6i_idev)
1186                         in6_dev_hold(rt->rt6i_idev);
1187
1188                 rt->rt6i_gateway = ort->rt6i_gateway;
1189                 rt->rt6i_flags = ort->rt6i_flags;
1190                 rt->rt6i_metric = 0;
1191
1192                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1193 #ifdef CONFIG_IPV6_SUBTREES
1194                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1195 #endif
1196
1197                 dst_free(new);
1198         }
1199
1200         dst_release(dst_orig);
1201         return new ? new : ERR_PTR(-ENOMEM);
1202 }
1203
1204 /*
1205  *      Destination cache support functions
1206  */
1207
1208 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1209 {
1210         if (rt->dst.from &&
1211             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1212                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1213 }
1214
1215 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1216 {
1217         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1218                 return NULL;
1219
1220         if (rt6_check_expired(rt))
1221                 return NULL;
1222
1223         return &rt->dst;
1224 }
1225
1226 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1227 {
1228         if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1229             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1230                 return &rt->dst;
1231         else
1232                 return NULL;
1233 }
1234
1235 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1236 {
1237         struct rt6_info *rt;
1238
1239         rt = (struct rt6_info *) dst;
1240
1241         /* All IPV6 dsts are created with ->obsolete set to the value
1242          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1243          * into this function always.
1244          */
1245
1246         rt6_dst_from_metrics_check(rt);
1247
1248         if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1249                 return rt6_dst_from_check(rt, cookie);
1250         else
1251                 return rt6_check(rt, cookie);
1252 }
1253
1254 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1255 {
1256         struct rt6_info *rt = (struct rt6_info *) dst;
1257
1258         if (rt) {
1259                 if (rt->rt6i_flags & RTF_CACHE) {
1260                         if (rt6_check_expired(rt)) {
1261                                 ip6_del_rt(rt);
1262                                 dst = NULL;
1263                         }
1264                 } else {
1265                         dst_release(dst);
1266                         dst = NULL;
1267                 }
1268         }
1269         return dst;
1270 }
1271
1272 static void ip6_link_failure(struct sk_buff *skb)
1273 {
1274         struct rt6_info *rt;
1275
1276         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1277
1278         rt = (struct rt6_info *) skb_dst(skb);
1279         if (rt) {
1280                 if (rt->rt6i_flags & RTF_CACHE) {
1281                         dst_hold(&rt->dst);
1282                         if (ip6_del_rt(rt))
1283                                 dst_free(&rt->dst);
1284                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1285                         rt->rt6i_node->fn_sernum = -1;
1286                 }
1287         }
1288 }
1289
1290 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1291 {
1292         struct net *net = dev_net(rt->dst.dev);
1293
1294         rt->rt6i_flags |= RTF_MODIFIED;
1295         rt->rt6i_pmtu = mtu;
1296         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1297 }
1298
1299 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1300                                  const struct ipv6hdr *iph, u32 mtu)
1301 {
1302         struct rt6_info *rt6 = (struct rt6_info *)dst;
1303
1304         if (rt6->rt6i_flags & RTF_LOCAL)
1305                 return;
1306
1307         dst_confirm(dst);
1308         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1309         if (mtu >= dst_mtu(dst))
1310                 return;
1311
1312         if (rt6->rt6i_flags & RTF_CACHE) {
1313                 rt6_do_update_pmtu(rt6, mtu);
1314         } else {
1315                 const struct in6_addr *daddr, *saddr;
1316                 struct rt6_info *nrt6;
1317
1318                 if (iph) {
1319                         daddr = &iph->daddr;
1320                         saddr = &iph->saddr;
1321                 } else if (sk) {
1322                         daddr = &sk->sk_v6_daddr;
1323                         saddr = &inet6_sk(sk)->saddr;
1324                 } else {
1325                         return;
1326                 }
1327                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1328                 if (nrt6) {
1329                         rt6_do_update_pmtu(nrt6, mtu);
1330
1331                         /* ip6_ins_rt(nrt6) will bump the
1332                          * rt6->rt6i_node->fn_sernum
1333                          * which will fail the next rt6_check() and
1334                          * invalidate the sk->sk_dst_cache.
1335                          */
1336                         ip6_ins_rt(nrt6);
1337                 }
1338         }
1339 }
1340
1341 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1342                                struct sk_buff *skb, u32 mtu)
1343 {
1344         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1345 }
1346
1347 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1348                      int oif, u32 mark)
1349 {
1350         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1351         struct dst_entry *dst;
1352         struct flowi6 fl6;
1353
1354         memset(&fl6, 0, sizeof(fl6));
1355         fl6.flowi6_oif = oif;
1356         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1357         fl6.daddr = iph->daddr;
1358         fl6.saddr = iph->saddr;
1359         fl6.flowlabel = ip6_flowinfo(iph);
1360
1361         dst = ip6_route_output(net, NULL, &fl6);
1362         if (!dst->error)
1363                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1364         dst_release(dst);
1365 }
1366 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1367
1368 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1369 {
1370         ip6_update_pmtu(skb, sock_net(sk), mtu,
1371                         sk->sk_bound_dev_if, sk->sk_mark);
1372 }
1373 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1374
1375 /* Handle redirects */
/*
 * Lookup key used while validating a redirect: the flow plus the address
 * of the gateway the redirect came from.
 *
 * ip6_route_redirect() passes &rdfl.fl6 to the rule lookup and
 * __ip6_route_redirect() casts that flowi6 pointer back to this struct,
 * so fl6 must remain the first member.
 */
1376 struct ip6rd_flowi {
1377         struct flowi6 fl6;
1378         struct in6_addr gateway;
1379 };
1380
1381 static struct rt6_info *__ip6_route_redirect(struct net *net,
1382                                              struct fib6_table *table,
1383                                              struct flowi6 *fl6,
1384                                              int flags)
1385 {
1386         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1387         struct rt6_info *rt;
1388         struct fib6_node *fn;
1389
1390         /* Get the "current" route for this destination and
1391          * check if the redirect has come from approriate router.
1392          *
1393          * RFC 4861 specifies that redirects should only be
1394          * accepted if they come from the nexthop to the target.
1395          * Due to the way the routes are chosen, this notion
1396          * is a bit fuzzy and one might need to check all possible
1397          * routes.
1398          */
1399
1400         read_lock_bh(&table->tb6_lock);
1401         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1402 restart:
1403         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1404                 if (rt6_check_expired(rt))
1405                         continue;
1406                 if (rt->dst.error)
1407                         break;
1408                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1409                         continue;
1410                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1411                         continue;
1412                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1413                         continue;
1414                 break;
1415         }
1416
1417         if (!rt)
1418                 rt = net->ipv6.ip6_null_entry;
1419         else if (rt->dst.error) {
1420                 rt = net->ipv6.ip6_null_entry;
1421                 goto out;
1422         }
1423
1424         if (rt == net->ipv6.ip6_null_entry) {
1425                 fn = fib6_backtrack(fn, &fl6->saddr);
1426                 if (fn)
1427                         goto restart;
1428         }
1429
1430 out:
1431         dst_hold(&rt->dst);
1432
1433         read_unlock_bh(&table->tb6_lock);
1434
1435         return rt;
1436 };
1437
1438 static struct dst_entry *ip6_route_redirect(struct net *net,
1439                                         const struct flowi6 *fl6,
1440                                         const struct in6_addr *gateway)
1441 {
1442         int flags = RT6_LOOKUP_F_HAS_SADDR;
1443         struct ip6rd_flowi rdfl;
1444
1445         rdfl.fl6 = *fl6;
1446         rdfl.gateway = *gateway;
1447
1448         return fib6_rule_lookup(net, &rdfl.fl6,
1449                                 flags, __ip6_route_redirect);
1450 }
1451
1452 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1453 {
1454         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1455         struct dst_entry *dst;
1456         struct flowi6 fl6;
1457
1458         memset(&fl6, 0, sizeof(fl6));
1459         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1460         fl6.flowi6_oif = oif;
1461         fl6.flowi6_mark = mark;
1462         fl6.daddr = iph->daddr;
1463         fl6.saddr = iph->saddr;
1464         fl6.flowlabel = ip6_flowinfo(iph);
1465
1466         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1467         rt6_do_redirect(dst, NULL, skb);
1468         dst_release(dst);
1469 }
1470 EXPORT_SYMBOL_GPL(ip6_redirect);
1471
1472 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1473                             u32 mark)
1474 {
1475         const struct ipv6hdr *iph = ipv6_hdr(skb);
1476         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1477         struct dst_entry *dst;
1478         struct flowi6 fl6;
1479
1480         memset(&fl6, 0, sizeof(fl6));
1481         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1482         fl6.flowi6_oif = oif;
1483         fl6.flowi6_mark = mark;
1484         fl6.daddr = msg->dest;
1485         fl6.saddr = iph->daddr;
1486
1487         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1488         rt6_do_redirect(dst, NULL, skb);
1489         dst_release(dst);
1490 }
1491
1492 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1493 {
1494         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1495 }
1496 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1497
1498 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1499 {
1500         struct net_device *dev = dst->dev;
1501         unsigned int mtu = dst_mtu(dst);
1502         struct net *net = dev_net(dev);
1503
1504         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1505
1506         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1507                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1508
1509         /*
1510          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1511          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1512          * IPV6_MAXPLEN is also valid and means: "any MSS,
1513          * rely only on pmtu discovery"
1514          */
1515         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1516                 mtu = IPV6_MAXPLEN;
1517         return mtu;
1518 }
1519
1520 static unsigned int ip6_mtu(const struct dst_entry *dst)
1521 {
1522         const struct rt6_info *rt = (const struct rt6_info *)dst;
1523         unsigned int mtu = rt->rt6i_pmtu;
1524         struct inet6_dev *idev;
1525
1526         if (mtu)
1527                 goto out;
1528
1529         mtu = dst_metric_raw(dst, RTAX_MTU);
1530         if (mtu)
1531                 goto out;
1532
1533         mtu = IPV6_MIN_MTU;
1534
1535         rcu_read_lock();
1536         idev = __in6_dev_get(dst->dev);
1537         if (idev)
1538                 mtu = idev->cnf.mtu6;
1539         rcu_read_unlock();
1540
1541 out:
1542         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1543 }
1544
/*
 * Singly linked list (chained through dst->next) of the dst entries
 * created by icmp6_dst_alloc().  Entries are unlinked and freed by
 * icmp6_dst_gc() and icmp6_clean_all().  Every access to the list in
 * those functions is made under icmp6_dst_lock.
 */
1545 static struct dst_entry *icmp6_dst_gc_list;
1546 static DEFINE_SPINLOCK(icmp6_dst_lock);
1547
1548 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1549                                   struct flowi6 *fl6)
1550 {
1551         struct dst_entry *dst;
1552         struct rt6_info *rt;
1553         struct inet6_dev *idev = in6_dev_get(dev);
1554         struct net *net = dev_net(dev);
1555
1556         if (unlikely(!idev))
1557                 return ERR_PTR(-ENODEV);
1558
1559         rt = ip6_dst_alloc(net, dev, 0, NULL);
1560         if (unlikely(!rt)) {
1561                 in6_dev_put(idev);
1562                 dst = ERR_PTR(-ENOMEM);
1563                 goto out;
1564         }
1565
1566         rt->dst.flags |= DST_HOST;
1567         rt->dst.output  = ip6_output;
1568         atomic_set(&rt->dst.__refcnt, 1);
1569         rt->rt6i_gateway  = fl6->daddr;
1570         rt->rt6i_dst.addr = fl6->daddr;
1571         rt->rt6i_dst.plen = 128;
1572         rt->rt6i_idev     = idev;
1573         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1574
1575         spin_lock_bh(&icmp6_dst_lock);
1576         rt->dst.next = icmp6_dst_gc_list;
1577         icmp6_dst_gc_list = &rt->dst;
1578         spin_unlock_bh(&icmp6_dst_lock);
1579
1580         fib6_force_start_gc(net);
1581
1582         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1583
1584 out:
1585         return dst;
1586 }
1587
1588 int icmp6_dst_gc(void)
1589 {
1590         struct dst_entry *dst, **pprev;
1591         int more = 0;
1592
1593         spin_lock_bh(&icmp6_dst_lock);
1594         pprev = &icmp6_dst_gc_list;
1595
1596         while ((dst = *pprev) != NULL) {
1597                 if (!atomic_read(&dst->__refcnt)) {
1598                         *pprev = dst->next;
1599                         dst_free(dst);
1600                 } else {
1601                         pprev = &dst->next;
1602                         ++more;
1603                 }
1604         }
1605
1606         spin_unlock_bh(&icmp6_dst_lock);
1607
1608         return more;
1609 }
1610
1611 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1612                             void *arg)
1613 {
1614         struct dst_entry *dst, **pprev;
1615
1616         spin_lock_bh(&icmp6_dst_lock);
1617         pprev = &icmp6_dst_gc_list;
1618         while ((dst = *pprev) != NULL) {
1619                 struct rt6_info *rt = (struct rt6_info *) dst;
1620                 if (func(rt, arg)) {
1621                         *pprev = dst->next;
1622                         dst_free(dst);
1623                 } else {
1624                         pprev = &dst->next;
1625                 }
1626         }
1627         spin_unlock_bh(&icmp6_dst_lock);
1628 }
1629
1630 static int ip6_dst_gc(struct dst_ops *ops)
1631 {
1632         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1633         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1634         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1635         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1636         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1637         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1638         int entries;
1639
1640         entries = dst_entries_get_fast(ops);
1641         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1642             entries <= rt_max_size)
1643                 goto out;
1644
1645         net->ipv6.ip6_rt_gc_expire++;
1646         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1647         entries = dst_entries_get_slow(ops);
1648         if (entries < ops->gc_thresh)
1649                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1650 out:
1651         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1652         return entries > rt_max_size;
1653 }
1654
1655 static int ip6_convert_metrics(struct mx6_config *mxc,
1656                                const struct fib6_config *cfg)
1657 {
1658         struct nlattr *nla;
1659         int remaining;
1660         u32 *mp;
1661
1662         if (!cfg->fc_mx)
1663                 return 0;
1664
1665         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1666         if (unlikely(!mp))
1667                 return -ENOMEM;
1668
1669         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1670                 int type = nla_type(nla);
1671
1672                 if (type) {
1673                         u32 val;
1674
1675                         if (unlikely(type > RTAX_MAX))
1676                                 goto err;
1677                         if (type == RTAX_CC_ALGO) {
1678                                 char tmp[TCP_CA_NAME_MAX];
1679
1680                                 nla_strlcpy(tmp, nla, sizeof(tmp));
1681                                 val = tcp_ca_get_key_by_name(tmp);
1682                                 if (val == TCP_CA_UNSPEC)
1683                                         goto err;
1684                         } else {
1685                                 val = nla_get_u32(nla);
1686                         }
1687
1688                         mp[type - 1] = val;
1689                         __set_bit(type - 1, mxc->mx_valid);
1690                 }
1691         }
1692
1693         mxc->mx = mp;
1694
1695         return 0;
1696  err:
1697         kfree(mp);
1698         return -EINVAL;
1699 }
1700
1701 int ip6_route_add(struct fib6_config *cfg)
1702 {
1703         int err;
1704         struct net *net = cfg->fc_nlinfo.nl_net;
1705         struct rt6_info *rt = NULL;
1706         struct net_device *dev = NULL;
1707         struct inet6_dev *idev = NULL;
1708         struct fib6_table *table;
1709         struct mx6_config mxc = { .mx = NULL, };
1710         int addr_type;
1711
1712         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1713                 return -EINVAL;
1714 #ifndef CONFIG_IPV6_SUBTREES
1715         if (cfg->fc_src_len)
1716                 return -EINVAL;
1717 #endif
1718         if (cfg->fc_ifindex) {
1719                 err = -ENODEV;
1720                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1721                 if (!dev)
1722                         goto out;
1723                 idev = in6_dev_get(dev);
1724                 if (!idev)
1725                         goto out;
1726         }
1727
1728         if (cfg->fc_metric == 0)
1729                 cfg->fc_metric = IP6_RT_PRIO_USER;
1730
1731         err = -ENOBUFS;
1732         if (cfg->fc_nlinfo.nlh &&
1733             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1734                 table = fib6_get_table(net, cfg->fc_table);
1735                 if (!table) {
1736                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1737                         table = fib6_new_table(net, cfg->fc_table);
1738                 }
1739         } else {
1740                 table = fib6_new_table(net, cfg->fc_table);
1741         }
1742
1743         if (!table)
1744                 goto out;
1745
1746         rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
1747
1748         if (!rt) {
1749                 err = -ENOMEM;
1750                 goto out;
1751         }
1752
1753         if (cfg->fc_flags & RTF_EXPIRES)
1754                 rt6_set_expires(rt, jiffies +
1755                                 clock_t_to_jiffies(cfg->fc_expires));
1756         else
1757                 rt6_clean_expires(rt);
1758
1759         if (cfg->fc_protocol == RTPROT_UNSPEC)
1760                 cfg->fc_protocol = RTPROT_BOOT;
1761         rt->rt6i_protocol = cfg->fc_protocol;
1762
1763         addr_type = ipv6_addr_type(&cfg->fc_dst);
1764
1765         if (addr_type & IPV6_ADDR_MULTICAST)
1766                 rt->dst.input = ip6_mc_input;
1767         else if (cfg->fc_flags & RTF_LOCAL)
1768                 rt->dst.input = ip6_input;
1769         else
1770                 rt->dst.input = ip6_forward;
1771
1772         rt->dst.output = ip6_output;
1773
1774         if (cfg->fc_encap) {
1775                 struct lwtunnel_state *lwtstate;
1776
1777                 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1778                                            cfg->fc_encap, &lwtstate);
1779                 if (err)
1780                         goto out;
1781                 lwtunnel_state_get(lwtstate);
1782                 rt->rt6i_lwtstate = lwtstate;
1783                 rt->dst.output = lwtunnel_output6;
1784         }
1785
1786         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1787         rt->rt6i_dst.plen = cfg->fc_dst_len;
1788         if (rt->rt6i_dst.plen == 128)
1789                 rt->dst.flags |= DST_HOST;
1790
1791 #ifdef CONFIG_IPV6_SUBTREES
1792         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1793         rt->rt6i_src.plen = cfg->fc_src_len;
1794 #endif
1795
1796         rt->rt6i_metric = cfg->fc_metric;
1797
1798         /* We cannot add true routes via loopback here,
1799            they would result in kernel looping; promote them to reject routes
1800          */
1801         if ((cfg->fc_flags & RTF_REJECT) ||
1802             (dev && (dev->flags & IFF_LOOPBACK) &&
1803              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1804              !(cfg->fc_flags & RTF_LOCAL))) {
1805                 /* hold loopback dev/idev if we haven't done so. */
1806                 if (dev != net->loopback_dev) {
1807                         if (dev) {
1808                                 dev_put(dev);
1809                                 in6_dev_put(idev);
1810                         }
1811                         dev = net->loopback_dev;
1812                         dev_hold(dev);
1813                         idev = in6_dev_get(dev);
1814                         if (!idev) {
1815                                 err = -ENODEV;
1816                                 goto out;
1817                         }
1818                 }
1819                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1820                 switch (cfg->fc_type) {
1821                 case RTN_BLACKHOLE:
1822                         rt->dst.error = -EINVAL;
1823                         rt->dst.output = dst_discard_sk;
1824                         rt->dst.input = dst_discard;
1825                         break;
1826                 case RTN_PROHIBIT:
1827                         rt->dst.error = -EACCES;
1828                         rt->dst.output = ip6_pkt_prohibit_out;
1829                         rt->dst.input = ip6_pkt_prohibit;
1830                         break;
1831                 case RTN_THROW:
1832                 default:
1833                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1834                                         : -ENETUNREACH;
1835                         rt->dst.output = ip6_pkt_discard_out;
1836                         rt->dst.input = ip6_pkt_discard;
1837                         break;
1838                 }
1839                 goto install_route;
1840         }
1841
1842         if (cfg->fc_flags & RTF_GATEWAY) {
1843                 const struct in6_addr *gw_addr;
1844                 int gwa_type;
1845
1846                 gw_addr = &cfg->fc_gateway;
1847
1848                 /* if gw_addr is local we will fail to detect this in case
1849                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
1850                  * will return already-added prefix route via interface that
1851                  * prefix route was assigned to, which might be non-loopback.
1852                  */
1853                 err = -EINVAL;
1854                 if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0))
1855                         goto out;
1856
1857                 rt->rt6i_gateway = *gw_addr;
1858                 gwa_type = ipv6_addr_type(gw_addr);
1859
1860                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1861                         struct rt6_info *grt;
1862
1863                         /* IPv6 strictly inhibits using not link-local
1864                            addresses as nexthop address.
1865                            Otherwise, router will not able to send redirects.
1866                            It is very good, but in some (rare!) circumstances
1867                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1868                            some exceptions. --ANK
1869                          */
1870                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1871                                 goto out;
1872
1873                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1874
1875                         err = -EHOSTUNREACH;
1876                         if (!grt)
1877                                 goto out;
1878                         if (dev) {
1879                                 if (dev != grt->dst.dev) {
1880                                         ip6_rt_put(grt);
1881                                         goto out;
1882                                 }
1883                         } else {
1884                                 dev = grt->dst.dev;
1885                                 idev = grt->rt6i_idev;
1886                                 dev_hold(dev);
1887                                 in6_dev_hold(grt->rt6i_idev);
1888                         }
1889                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1890                                 err = 0;
1891                         ip6_rt_put(grt);
1892
1893                         if (err)
1894                                 goto out;
1895                 }
1896                 err = -EINVAL;
1897                 if (!dev || (dev->flags & IFF_LOOPBACK))
1898                         goto out;
1899         }
1900
1901         err = -ENODEV;
1902         if (!dev)
1903                 goto out;
1904
1905         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1906                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1907                         err = -EINVAL;
1908                         goto out;
1909                 }
1910                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1911                 rt->rt6i_prefsrc.plen = 128;
1912         } else
1913                 rt->rt6i_prefsrc.plen = 0;
1914
1915         rt->rt6i_flags = cfg->fc_flags;
1916
1917 install_route:
1918         rt->dst.dev = dev;
1919         rt->rt6i_idev = idev;
1920         rt->rt6i_table = table;
1921
1922         cfg->fc_nlinfo.nl_net = dev_net(dev);
1923
1924         err = ip6_convert_metrics(&mxc, cfg);
1925         if (err)
1926                 goto out;
1927
1928         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1929
1930         kfree(mxc.mx);
1931         return err;
1932 out:
1933         if (dev)
1934                 dev_put(dev);
1935         if (idev)
1936                 in6_dev_put(idev);
1937         if (rt)
1938                 dst_free(&rt->dst);
1939         return err;
1940 }
1941
1942 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1943 {
1944         int err;
1945         struct fib6_table *table;
1946         struct net *net = dev_net(rt->dst.dev);
1947
1948         if (rt == net->ipv6.ip6_null_entry) {
1949                 err = -ENOENT;
1950                 goto out;
1951         }
1952
1953         table = rt->rt6i_table;
1954         write_lock_bh(&table->tb6_lock);
1955         err = fib6_del(rt, info);
1956         write_unlock_bh(&table->tb6_lock);
1957
1958 out:
1959         ip6_rt_put(rt);
1960         return err;
1961 }
1962
1963 int ip6_del_rt(struct rt6_info *rt)
1964 {
1965         struct nl_info info = {
1966                 .nl_net = dev_net(rt->dst.dev),
1967         };
1968         return __ip6_del_rt(rt, &info);
1969 }
1970
1971 static int ip6_route_del(struct fib6_config *cfg)
1972 {
1973         struct fib6_table *table;
1974         struct fib6_node *fn;
1975         struct rt6_info *rt;
1976         int err = -ESRCH;
1977
1978         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1979         if (!table)
1980                 return err;
1981
1982         read_lock_bh(&table->tb6_lock);
1983
1984         fn = fib6_locate(&table->tb6_root,
1985                          &cfg->fc_dst, cfg->fc_dst_len,
1986                          &cfg->fc_src, cfg->fc_src_len);
1987
1988         if (fn) {
1989                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1990                         if ((rt->rt6i_flags & RTF_CACHE) &&
1991                             !(cfg->fc_flags & RTF_CACHE))
1992                                 continue;
1993                         if (cfg->fc_ifindex &&
1994                             (!rt->dst.dev ||
1995                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1996                                 continue;
1997                         if (cfg->fc_flags & RTF_GATEWAY &&
1998                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1999                                 continue;
2000                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2001                                 continue;
2002                         dst_hold(&rt->dst);
2003                         read_unlock_bh(&table->tb6_lock);
2004
2005                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2006                 }
2007         }
2008         read_unlock_bh(&table->tb6_lock);
2009
2010         return err;
2011 }
2012
2013 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2014 {
2015         struct net *net = dev_net(skb->dev);
2016         struct netevent_redirect netevent;
2017         struct rt6_info *rt, *nrt = NULL;
2018         struct ndisc_options ndopts;
2019         struct inet6_dev *in6_dev;
2020         struct neighbour *neigh;
2021         struct rd_msg *msg;
2022         int optlen, on_link;
2023         u8 *lladdr;
2024
2025         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2026         optlen -= sizeof(*msg);
2027
2028         if (optlen < 0) {
2029                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2030                 return;
2031         }
2032
2033         msg = (struct rd_msg *)icmp6_hdr(skb);
2034
2035         if (ipv6_addr_is_multicast(&msg->dest)) {
2036                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2037                 return;
2038         }
2039
2040         on_link = 0;
2041         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2042                 on_link = 1;
2043         } else if (ipv6_addr_type(&msg->target) !=
2044                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2045                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2046                 return;
2047         }
2048
2049         in6_dev = __in6_dev_get(skb->dev);
2050         if (!in6_dev)
2051                 return;
2052         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2053                 return;
2054
2055         /* RFC2461 8.1:
2056          *      The IP source address of the Redirect MUST be the same as the current
2057          *      first-hop router for the specified ICMP Destination Address.
2058          */
2059
2060         if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2061                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2062                 return;
2063         }
2064
2065         lladdr = NULL;
2066         if (ndopts.nd_opts_tgt_lladdr) {
2067                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2068                                              skb->dev);
2069                 if (!lladdr) {
2070                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2071                         return;
2072                 }
2073         }
2074
2075         rt = (struct rt6_info *) dst;
2076         if (rt == net->ipv6.ip6_null_entry) {
2077                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2078                 return;
2079         }
2080
2081         /* Redirect received -> path was valid.
2082          * Look, redirects are sent only in response to data packets,
2083          * so that this nexthop apparently is reachable. --ANK
2084          */
2085         dst_confirm(&rt->dst);
2086
2087         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2088         if (!neigh)
2089                 return;
2090
2091         /*
2092          *      We have finally decided to accept it.
2093          */
2094
2095         neigh_update(neigh, lladdr, NUD_STALE,
2096                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2097                      NEIGH_UPDATE_F_OVERRIDE|
2098                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2099                                      NEIGH_UPDATE_F_ISROUTER))
2100                      );
2101
2102         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2103         if (!nrt)
2104                 goto out;
2105
2106         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2107         if (on_link)
2108                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2109
2110         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2111
2112         if (ip6_ins_rt(nrt))
2113                 goto out;
2114
2115         netevent.old = &rt->dst;
2116         netevent.new = &nrt->dst;
2117         netevent.daddr = &msg->dest;
2118         netevent.neigh = neigh;
2119         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2120
2121         if (rt->rt6i_flags & RTF_CACHE) {
2122                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2123                 ip6_del_rt(rt);
2124         }
2125
2126 out:
2127         neigh_release(neigh);
2128 }
2129
2130 /*
2131  *      Misc support functions
2132  */
2133
2134 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2135 {
2136         BUG_ON(from->dst.from);
2137
2138         rt->rt6i_flags &= ~RTF_EXPIRES;
2139         dst_hold(&from->dst);
2140         rt->dst.from = &from->dst;
2141         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2142 }
2143
2144 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2145 {
2146         rt->dst.input = ort->dst.input;
2147         rt->dst.output = ort->dst.output;
2148         rt->rt6i_dst = ort->rt6i_dst;
2149         rt->dst.error = ort->dst.error;
2150         rt->rt6i_idev = ort->rt6i_idev;
2151         if (rt->rt6i_idev)
2152                 in6_dev_hold(rt->rt6i_idev);
2153         rt->dst.lastuse = jiffies;
2154         rt->rt6i_gateway = ort->rt6i_gateway;
2155         rt->rt6i_flags = ort->rt6i_flags;
2156         rt6_set_from(rt, ort);
2157         rt->rt6i_metric = ort->rt6i_metric;
2158 #ifdef CONFIG_IPV6_SUBTREES
2159         rt->rt6i_src = ort->rt6i_src;
2160 #endif
2161         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2162         rt->rt6i_table = ort->rt6i_table;
2163 }
2164
2165 #ifdef CONFIG_IPV6_ROUTE_INFO
2166 static struct rt6_info *rt6_get_route_info(struct net *net,
2167                                            const struct in6_addr *prefix, int prefixlen,
2168                                            const struct in6_addr *gwaddr, int ifindex)
2169 {
2170         struct fib6_node *fn;
2171         struct rt6_info *rt = NULL;
2172         struct fib6_table *table;
2173
2174         table = fib6_get_table(net, RT6_TABLE_INFO);
2175         if (!table)
2176                 return NULL;
2177
2178         read_lock_bh(&table->tb6_lock);
2179         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2180         if (!fn)
2181                 goto out;
2182
2183         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2184                 if (rt->dst.dev->ifindex != ifindex)
2185                         continue;
2186                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2187                         continue;
2188                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2189                         continue;
2190                 dst_hold(&rt->dst);
2191                 break;
2192         }
2193 out:
2194         read_unlock_bh(&table->tb6_lock);
2195         return rt;
2196 }
2197
2198 static struct rt6_info *rt6_add_route_info(struct net *net,
2199                                            const struct in6_addr *prefix, int prefixlen,
2200                                            const struct in6_addr *gwaddr, int ifindex,
2201                                            unsigned int pref)
2202 {
2203         struct fib6_config cfg = {
2204                 .fc_table       = RT6_TABLE_INFO,
2205                 .fc_metric      = IP6_RT_PRIO_USER,
2206                 .fc_ifindex     = ifindex,
2207                 .fc_dst_len     = prefixlen,
2208                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2209                                   RTF_UP | RTF_PREF(pref),
2210                 .fc_nlinfo.portid = 0,
2211                 .fc_nlinfo.nlh = NULL,
2212                 .fc_nlinfo.nl_net = net,
2213         };
2214
2215         cfg.fc_dst = *prefix;
2216         cfg.fc_gateway = *gwaddr;
2217
2218         /* We should treat it as a default route if prefix length is 0. */
2219         if (!prefixlen)
2220                 cfg.fc_flags |= RTF_DEFAULT;
2221
2222         ip6_route_add(&cfg);
2223
2224         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2225 }
2226 #endif
2227
2228 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2229 {
2230         struct rt6_info *rt;
2231         struct fib6_table *table;
2232
2233         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2234         if (!table)
2235                 return NULL;
2236
2237         read_lock_bh(&table->tb6_lock);
2238         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2239                 if (dev == rt->dst.dev &&
2240                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2241                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2242                         break;
2243         }
2244         if (rt)
2245                 dst_hold(&rt->dst);
2246         read_unlock_bh(&table->tb6_lock);
2247         return rt;
2248 }
2249
2250 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2251                                      struct net_device *dev,
2252                                      unsigned int pref)
2253 {
2254         struct fib6_config cfg = {
2255                 .fc_table       = RT6_TABLE_DFLT,
2256                 .fc_metric      = IP6_RT_PRIO_USER,
2257                 .fc_ifindex     = dev->ifindex,
2258                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2259                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2260                 .fc_nlinfo.portid = 0,
2261                 .fc_nlinfo.nlh = NULL,
2262                 .fc_nlinfo.nl_net = dev_net(dev),
2263         };
2264
2265         cfg.fc_gateway = *gwaddr;
2266
2267         ip6_route_add(&cfg);
2268
2269         return rt6_get_dflt_router(gwaddr, dev);
2270 }
2271
2272 void rt6_purge_dflt_routers(struct net *net)
2273 {
2274         struct rt6_info *rt;
2275         struct fib6_table *table;
2276
2277         /* NOTE: Keep consistent with rt6_get_dflt_router */
2278         table = fib6_get_table(net, RT6_TABLE_DFLT);
2279         if (!table)
2280                 return;
2281
2282 restart:
2283         read_lock_bh(&table->tb6_lock);
2284         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2285                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2286                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2287                         dst_hold(&rt->dst);
2288                         read_unlock_bh(&table->tb6_lock);
2289                         ip6_del_rt(rt);
2290                         goto restart;
2291                 }
2292         }
2293         read_unlock_bh(&table->tb6_lock);
2294 }
2295
2296 static void rtmsg_to_fib6_config(struct net *net,
2297                                  struct in6_rtmsg *rtmsg,
2298                                  struct fib6_config *cfg)
2299 {
2300         memset(cfg, 0, sizeof(*cfg));
2301
2302         cfg->fc_table = RT6_TABLE_MAIN;
2303         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2304         cfg->fc_metric = rtmsg->rtmsg_metric;
2305         cfg->fc_expires = rtmsg->rtmsg_info;
2306         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2307         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2308         cfg->fc_flags = rtmsg->rtmsg_flags;
2309
2310         cfg->fc_nlinfo.nl_net = net;
2311
2312         cfg->fc_dst = rtmsg->rtmsg_dst;
2313         cfg->fc_src = rtmsg->rtmsg_src;
2314         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2315 }
2316
2317 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2318 {
2319         struct fib6_config cfg;
2320         struct in6_rtmsg rtmsg;
2321         int err;
2322
2323         switch (cmd) {
2324         case SIOCADDRT:         /* Add a route */
2325         case SIOCDELRT:         /* Delete a route */
2326                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2327                         return -EPERM;
2328                 err = copy_from_user(&rtmsg, arg,
2329                                      sizeof(struct in6_rtmsg));
2330                 if (err)
2331                         return -EFAULT;
2332
2333                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2334
2335                 rtnl_lock();
2336                 switch (cmd) {
2337                 case SIOCADDRT:
2338                         err = ip6_route_add(&cfg);
2339                         break;
2340                 case SIOCDELRT:
2341                         err = ip6_route_del(&cfg);
2342                         break;
2343                 default:
2344                         err = -EINVAL;
2345                 }
2346                 rtnl_unlock();
2347
2348                 return err;
2349         }
2350
2351         return -EINVAL;
2352 }
2353
2354 /*
2355  *      Drop the packet on the floor
2356  */
2357
2358 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2359 {
2360         int type;
2361         struct dst_entry *dst = skb_dst(skb);
2362         switch (ipstats_mib_noroutes) {
2363         case IPSTATS_MIB_INNOROUTES:
2364                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2365                 if (type == IPV6_ADDR_ANY) {
2366                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2367                                       IPSTATS_MIB_INADDRERRORS);
2368                         break;
2369                 }
2370                 /* FALLTHROUGH */
2371         case IPSTATS_MIB_OUTNOROUTES:
2372                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2373                               ipstats_mib_noroutes);
2374                 break;
2375         }
2376         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2377         kfree_skb(skb);
2378         return 0;
2379 }
2380
2381 static int ip6_pkt_discard(struct sk_buff *skb)
2382 {
2383         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2384 }
2385
2386 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2387 {
2388         skb->dev = skb_dst(skb)->dev;
2389         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2390 }
2391
2392 static int ip6_pkt_prohibit(struct sk_buff *skb)
2393 {
2394         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2395 }
2396
2397 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2398 {
2399         skb->dev = skb_dst(skb)->dev;
2400         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2401 }
2402
2403 /*
2404  *      Allocate a dst for local (unicast / anycast) address.
2405  */
2406
2407 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2408                                     const struct in6_addr *addr,
2409                                     bool anycast)
2410 {
2411         struct net *net = dev_net(idev->dev);
2412         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2413                                             DST_NOCOUNT, NULL);
2414         if (!rt)
2415                 return ERR_PTR(-ENOMEM);
2416
2417         in6_dev_hold(idev);
2418
2419         rt->dst.flags |= DST_HOST;
2420         rt->dst.input = ip6_input;
2421         rt->dst.output = ip6_output;
2422         rt->rt6i_idev = idev;
2423
2424         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2425         if (anycast)
2426                 rt->rt6i_flags |= RTF_ANYCAST;
2427         else
2428                 rt->rt6i_flags |= RTF_LOCAL;
2429
2430         rt->rt6i_gateway  = *addr;
2431         rt->rt6i_dst.addr = *addr;
2432         rt->rt6i_dst.plen = 128;
2433         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2434
2435         atomic_set(&rt->dst.__refcnt, 1);
2436
2437         return rt;
2438 }
2439
2440 int ip6_route_get_saddr(struct net *net,
2441                         struct rt6_info *rt,
2442                         const struct in6_addr *daddr,
2443                         unsigned int prefs,
2444                         struct in6_addr *saddr)
2445 {
2446         struct inet6_dev *idev =
2447                 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2448         int err = 0;
2449         if (rt && rt->rt6i_prefsrc.plen)
2450                 *saddr = rt->rt6i_prefsrc.addr;
2451         else
2452                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2453                                          daddr, prefs, saddr);
2454         return err;
2455 }
2456
2457 /* remove deleted ip from prefsrc entries */
2458 struct arg_dev_net_ip {
2459         struct net_device *dev;
2460         struct net *net;
2461         struct in6_addr *addr;
2462 };
2463
2464 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2465 {
2466         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2467         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2468         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2469
2470         if (((void *)rt->dst.dev == dev || !dev) &&
2471             rt != net->ipv6.ip6_null_entry &&
2472             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2473                 /* remove prefsrc entry */
2474                 rt->rt6i_prefsrc.plen = 0;
2475         }
2476         return 0;
2477 }
2478
2479 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2480 {
2481         struct net *net = dev_net(ifp->idev->dev);
2482         struct arg_dev_net_ip adni = {
2483                 .dev = ifp->idev->dev,
2484                 .net = net,
2485                 .addr = &ifp->addr,
2486         };
2487         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2488 }
2489
2490 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2491 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2492
2493 /* Remove routers and update dst entries when gateway turn into host. */
2494 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2495 {
2496         struct in6_addr *gateway = (struct in6_addr *)arg;
2497
2498         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2499              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2500              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2501                 return -1;
2502         }
2503         return 0;
2504 }
2505
2506 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2507 {
2508         fib6_clean_all(net, fib6_clean_tohost, gateway);
2509 }
2510
2511 struct arg_dev_net {
2512         struct net_device *dev;
2513         struct net *net;
2514 };
2515
2516 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2517 {
2518         const struct arg_dev_net *adn = arg;
2519         const struct net_device *dev = adn->dev;
2520
2521         if ((rt->dst.dev == dev || !dev) &&
2522             rt != adn->net->ipv6.ip6_null_entry)
2523                 return -1;
2524
2525         return 0;
2526 }
2527
2528 void rt6_ifdown(struct net *net, struct net_device *dev)
2529 {
2530         struct arg_dev_net adn = {
2531                 .dev = dev,
2532                 .net = net,
2533         };
2534
2535         fib6_clean_all(net, fib6_ifdown, &adn);
2536         icmp6_clean_all(fib6_ifdown, &adn);
2537         rt6_uncached_list_flush_dev(net, dev);
2538 }
2539
2540 struct rt6_mtu_change_arg {
2541         struct net_device *dev;
2542         unsigned int mtu;
2543 };
2544
2545 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2546 {
2547         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2548         struct inet6_dev *idev;
2549
2550         /* In IPv6 pmtu discovery is not optional,
2551            so that RTAX_MTU lock cannot disable it.
2552            We still use this lock to block changes
2553            caused by addrconf/ndisc.
2554         */
2555
2556         idev = __in6_dev_get(arg->dev);
2557         if (!idev)
2558                 return 0;
2559
2560         /* For administrative MTU increase, there is no way to discover
2561            IPv6 PMTU increase, so PMTU increase should be updated here.
2562            Since RFC 1981 doesn't include administrative MTU increase
2563            update PMTU increase is a MUST. (i.e. jumbo frame)
2564          */
2565         /*
2566            If new MTU is less than route PMTU, this new MTU will be the
2567            lowest MTU in the path, update the route PMTU to reflect PMTU
2568            decreases; if new MTU is greater than route PMTU, and the
2569            old MTU is the lowest MTU in the path, update the route PMTU
2570            to reflect the increase. In this case if the other nodes' MTU
2571            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2572            PMTU discouvery.
2573          */
2574         if (rt->dst.dev == arg->dev &&
2575             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2576                 if (rt->rt6i_flags & RTF_CACHE) {
2577                         /* For RTF_CACHE with rt6i_pmtu == 0
2578                          * (i.e. a redirected route),
2579                          * the metrics of its rt->dst.from has already
2580                          * been updated.
2581                          */
2582                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2583                                 rt->rt6i_pmtu = arg->mtu;
2584                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2585                            (dst_mtu(&rt->dst) < arg->mtu &&
2586                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2587                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2588                 }
2589         }
2590         return 0;
2591 }
2592
2593 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2594 {
2595         struct rt6_mtu_change_arg arg = {
2596                 .dev = dev,
2597                 .mtu = mtu,
2598         };
2599
2600         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2601 }
2602
2603 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2604         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2605         [RTA_OIF]               = { .type = NLA_U32 },
2606         [RTA_IIF]               = { .type = NLA_U32 },
2607         [RTA_PRIORITY]          = { .type = NLA_U32 },
2608         [RTA_METRICS]           = { .type = NLA_NESTED },
2609         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2610         [RTA_PREF]              = { .type = NLA_U8 },
2611         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2612         [RTA_ENCAP]             = { .type = NLA_NESTED },
2613 };
2614
2615 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2616                               struct fib6_config *cfg)
2617 {
2618         struct rtmsg *rtm;
2619         struct nlattr *tb[RTA_MAX+1];
2620         unsigned int pref;
2621         int err;
2622
2623         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2624         if (err < 0)
2625                 goto errout;
2626
2627         err = -EINVAL;
2628         rtm = nlmsg_data(nlh);
2629         memset(cfg, 0, sizeof(*cfg));
2630
2631         cfg->fc_table = rtm->rtm_table;
2632         cfg->fc_dst_len = rtm->rtm_dst_len;
2633         cfg->fc_src_len = rtm->rtm_src_len;
2634         cfg->fc_flags = RTF_UP;
2635         cfg->fc_protocol = rtm->rtm_protocol;
2636         cfg->fc_type = rtm->rtm_type;
2637
2638         if (rtm->rtm_type == RTN_UNREACHABLE ||
2639             rtm->rtm_type == RTN_BLACKHOLE ||
2640             rtm->rtm_type == RTN_PROHIBIT ||
2641             rtm->rtm_type == RTN_THROW)
2642                 cfg->fc_flags |= RTF_REJECT;
2643
2644         if (rtm->rtm_type == RTN_LOCAL)
2645                 cfg->fc_flags |= RTF_LOCAL;
2646
2647         if (rtm->rtm_flags & RTM_F_CLONED)
2648                 cfg->fc_flags |= RTF_CACHE;
2649
2650         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2651         cfg->fc_nlinfo.nlh = nlh;
2652         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2653
2654         if (tb[RTA_GATEWAY]) {
2655                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2656                 cfg->fc_flags |= RTF_GATEWAY;
2657         }
2658
2659         if (tb[RTA_DST]) {
2660                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2661
2662                 if (nla_len(tb[RTA_DST]) < plen)
2663                         goto errout;
2664
2665                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2666         }
2667
2668         if (tb[RTA_SRC]) {
2669                 int plen = (rtm->rtm_src_len + 7) >> 3;
2670
2671                 if (nla_len(tb[RTA_SRC]) < plen)
2672                         goto errout;
2673
2674                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2675         }
2676
2677         if (tb[RTA_PREFSRC])
2678                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2679
2680         if (tb[RTA_OIF])
2681                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2682
2683         if (tb[RTA_PRIORITY])
2684                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2685
2686         if (tb[RTA_METRICS]) {
2687                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2688                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2689         }
2690
2691         if (tb[RTA_TABLE])
2692                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2693
2694         if (tb[RTA_MULTIPATH]) {
2695                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2696                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2697         }
2698
2699         if (tb[RTA_PREF]) {
2700                 pref = nla_get_u8(tb[RTA_PREF]);
2701                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2702                     pref != ICMPV6_ROUTER_PREF_HIGH)
2703                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2704                 cfg->fc_flags |= RTF_PREF(pref);
2705         }
2706
2707         if (tb[RTA_ENCAP])
2708                 cfg->fc_encap = tb[RTA_ENCAP];
2709
2710         if (tb[RTA_ENCAP_TYPE])
2711                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2712
2713         err = 0;
2714 errout:
2715         return err;
2716 }
2717
2718 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2719 {
2720         struct fib6_config r_cfg;
2721         struct rtnexthop *rtnh;
2722         int remaining;
2723         int attrlen;
2724         int err = 0, last_err = 0;
2725
2726         remaining = cfg->fc_mp_len;
2727 beginning:
2728         rtnh = (struct rtnexthop *)cfg->fc_mp;
2729
2730         /* Parse a Multipath Entry */
2731         while (rtnh_ok(rtnh, remaining)) {
2732                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2733                 if (rtnh->rtnh_ifindex)
2734                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2735
2736                 attrlen = rtnh_attrlen(rtnh);
2737                 if (attrlen > 0) {
2738                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2739
2740                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2741                         if (nla) {
2742                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2743                                 r_cfg.fc_flags |= RTF_GATEWAY;
2744                         }
2745                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2746                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2747                         if (nla)
2748                                 r_cfg.fc_encap_type = nla_get_u16(nla);
2749                 }
2750                 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2751                 if (err) {
2752                         last_err = err;
2753                         /* If we are trying to remove a route, do not stop the
2754                          * loop when ip6_route_del() fails (because next hop is
2755                          * already gone), we should try to remove all next hops.
2756                          */
2757                         if (add) {
2758                                 /* If add fails, we should try to delete all
2759                                  * next hops that have been already added.
2760                                  */
2761                                 add = 0;
2762                                 remaining = cfg->fc_mp_len - remaining;
2763                                 goto beginning;
2764                         }
2765                 }
2766                 /* Because each route is added like a single route we remove
2767                  * these flags after the first nexthop: if there is a collision,
2768                  * we have already failed to add the first nexthop:
2769                  * fib6_add_rt2node() has rejected it; when replacing, old
2770                  * nexthops have been replaced by first new, the rest should
2771                  * be added to it.
2772                  */
2773                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2774                                                      NLM_F_REPLACE);
2775                 rtnh = rtnh_next(rtnh, &remaining);
2776         }
2777
2778         return last_err;
2779 }
2780
2781 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2782 {
2783         struct fib6_config cfg;
2784         int err;
2785
2786         err = rtm_to_fib6_config(skb, nlh, &cfg);
2787         if (err < 0)
2788                 return err;
2789
2790         if (cfg.fc_mp)
2791                 return ip6_route_multipath(&cfg, 0);
2792         else
2793                 return ip6_route_del(&cfg);
2794 }
2795
2796 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2797 {
2798         struct fib6_config cfg;
2799         int err;
2800
2801         err = rtm_to_fib6_config(skb, nlh, &cfg);
2802         if (err < 0)
2803                 return err;
2804
2805         if (cfg.fc_mp)
2806                 return ip6_route_multipath(&cfg, 1);
2807         else
2808                 return ip6_route_add(&cfg);
2809 }
2810
2811 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
2812 {
2813         return NLMSG_ALIGN(sizeof(struct rtmsg))
2814                + nla_total_size(16) /* RTA_SRC */
2815                + nla_total_size(16) /* RTA_DST */
2816                + nla_total_size(16) /* RTA_GATEWAY */
2817                + nla_total_size(16) /* RTA_PREFSRC */
2818                + nla_total_size(4) /* RTA_TABLE */
2819                + nla_total_size(4) /* RTA_IIF */
2820                + nla_total_size(4) /* RTA_OIF */
2821                + nla_total_size(4) /* RTA_PRIORITY */
2822                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2823                + nla_total_size(sizeof(struct rta_cacheinfo))
2824                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2825                + nla_total_size(1) /* RTA_PREF */
2826                + lwtunnel_get_encap_size(rt->rt6i_lwtstate);
2827 }
2828
2829 static int rt6_fill_node(struct net *net,
2830                          struct sk_buff *skb, struct rt6_info *rt,
2831                          struct in6_addr *dst, struct in6_addr *src,
2832                          int iif, int type, u32 portid, u32 seq,
2833                          int prefix, int nowait, unsigned int flags)
2834 {
2835         u32 metrics[RTAX_MAX];
2836         struct rtmsg *rtm;
2837         struct nlmsghdr *nlh;
2838         long expires;
2839         u32 table;
2840
2841         if (prefix) {   /* user wants prefix routes only */
2842                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2843                         /* success since this is not a prefix route */
2844                         return 1;
2845                 }
2846         }
2847
2848         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2849         if (!nlh)
2850                 return -EMSGSIZE;
2851
2852         rtm = nlmsg_data(nlh);
2853         rtm->rtm_family = AF_INET6;
2854         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2855         rtm->rtm_src_len = rt->rt6i_src.plen;
2856         rtm->rtm_tos = 0;
2857         if (rt->rt6i_table)
2858                 table = rt->rt6i_table->tb6_id;
2859         else
2860                 table = RT6_TABLE_UNSPEC;
2861         rtm->rtm_table = table;
2862         if (nla_put_u32(skb, RTA_TABLE, table))
2863                 goto nla_put_failure;
2864         if (rt->rt6i_flags & RTF_REJECT) {
2865                 switch (rt->dst.error) {
2866                 case -EINVAL:
2867                         rtm->rtm_type = RTN_BLACKHOLE;
2868                         break;
2869                 case -EACCES:
2870                         rtm->rtm_type = RTN_PROHIBIT;
2871                         break;
2872                 case -EAGAIN:
2873                         rtm->rtm_type = RTN_THROW;
2874                         break;
2875                 default:
2876                         rtm->rtm_type = RTN_UNREACHABLE;
2877                         break;
2878                 }
2879         }
2880         else if (rt->rt6i_flags & RTF_LOCAL)
2881                 rtm->rtm_type = RTN_LOCAL;
2882         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2883                 rtm->rtm_type = RTN_LOCAL;
2884         else
2885                 rtm->rtm_type = RTN_UNICAST;
2886         rtm->rtm_flags = 0;
2887         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2888         rtm->rtm_protocol = rt->rt6i_protocol;
2889         if (rt->rt6i_flags & RTF_DYNAMIC)
2890                 rtm->rtm_protocol = RTPROT_REDIRECT;
2891         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2892                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2893                         rtm->rtm_protocol = RTPROT_RA;
2894                 else
2895                         rtm->rtm_protocol = RTPROT_KERNEL;
2896         }
2897
2898         if (rt->rt6i_flags & RTF_CACHE)
2899                 rtm->rtm_flags |= RTM_F_CLONED;
2900
2901         if (dst) {
2902                 if (nla_put_in6_addr(skb, RTA_DST, dst))
2903                         goto nla_put_failure;
2904                 rtm->rtm_dst_len = 128;
2905         } else if (rtm->rtm_dst_len)
2906                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2907                         goto nla_put_failure;
2908 #ifdef CONFIG_IPV6_SUBTREES
2909         if (src) {
2910                 if (nla_put_in6_addr(skb, RTA_SRC, src))
2911                         goto nla_put_failure;
2912                 rtm->rtm_src_len = 128;
2913         } else if (rtm->rtm_src_len &&
2914                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2915                 goto nla_put_failure;
2916 #endif
2917         if (iif) {
2918 #ifdef CONFIG_IPV6_MROUTE
2919                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2920                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2921                         if (err <= 0) {
2922                                 if (!nowait) {
2923                                         if (err == 0)
2924                                                 return 0;
2925                                         goto nla_put_failure;
2926                                 } else {
2927                                         if (err == -EMSGSIZE)
2928                                                 goto nla_put_failure;
2929                                 }
2930                         }
2931                 } else
2932 #endif
2933                         if (nla_put_u32(skb, RTA_IIF, iif))
2934                                 goto nla_put_failure;
2935         } else if (dst) {
2936                 struct in6_addr saddr_buf;
2937                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2938                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2939                         goto nla_put_failure;
2940         }
2941
2942         if (rt->rt6i_prefsrc.plen) {
2943                 struct in6_addr saddr_buf;
2944                 saddr_buf = rt->rt6i_prefsrc.addr;
2945                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2946                         goto nla_put_failure;
2947         }
2948
2949         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2950         if (rt->rt6i_pmtu)
2951                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2952         if (rtnetlink_put_metrics(skb, metrics) < 0)
2953                 goto nla_put_failure;
2954
2955         if (rt->rt6i_flags & RTF_GATEWAY) {
2956                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2957                         goto nla_put_failure;
2958         }
2959
2960         if (rt->dst.dev &&
2961             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2962                 goto nla_put_failure;
2963         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2964                 goto nla_put_failure;
2965
2966         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2967
2968         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2969                 goto nla_put_failure;
2970
2971         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2972                 goto nla_put_failure;
2973
2974         lwtunnel_fill_encap(skb, rt->rt6i_lwtstate);
2975
2976         nlmsg_end(skb, nlh);
2977         return 0;
2978
2979 nla_put_failure:
2980         nlmsg_cancel(skb, nlh);
2981         return -EMSGSIZE;
2982 }
2983
2984 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2985 {
2986         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2987         int prefix;
2988
2989         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2990                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2991                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2992         } else
2993                 prefix = 0;
2994
2995         return rt6_fill_node(arg->net,
2996                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2997                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2998                      prefix, 0, NLM_F_MULTI);
2999 }
3000
3001 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3002 {
3003         struct net *net = sock_net(in_skb->sk);
3004         struct nlattr *tb[RTA_MAX+1];
3005         struct rt6_info *rt;
3006         struct sk_buff *skb;
3007         struct rtmsg *rtm;
3008         struct flowi6 fl6;
3009         int err, iif = 0, oif = 0;
3010
3011         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3012         if (err < 0)
3013                 goto errout;
3014
3015         err = -EINVAL;
3016         memset(&fl6, 0, sizeof(fl6));
3017
3018         if (tb[RTA_SRC]) {
3019                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3020                         goto errout;
3021
3022                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3023         }
3024
3025         if (tb[RTA_DST]) {
3026                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3027                         goto errout;
3028
3029                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3030         }
3031
3032         if (tb[RTA_IIF])
3033                 iif = nla_get_u32(tb[RTA_IIF]);
3034
3035         if (tb[RTA_OIF])
3036                 oif = nla_get_u32(tb[RTA_OIF]);
3037
3038         if (tb[RTA_MARK])
3039                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3040
3041         if (iif) {
3042                 struct net_device *dev;
3043                 int flags = 0;
3044
3045                 dev = __dev_get_by_index(net, iif);
3046                 if (!dev) {
3047                         err = -ENODEV;
3048                         goto errout;
3049                 }
3050
3051                 fl6.flowi6_iif = iif;
3052
3053                 if (!ipv6_addr_any(&fl6.saddr))
3054                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3055
3056                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3057                                                                flags);
3058         } else {
3059                 fl6.flowi6_oif = oif;
3060
3061                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3062         }
3063
3064         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3065         if (!skb) {
3066                 ip6_rt_put(rt);
3067                 err = -ENOBUFS;
3068                 goto errout;
3069         }
3070
3071         /* Reserve room for dummy headers, this skb can pass
3072            through good chunk of routing engine.
3073          */
3074         skb_reset_mac_header(skb);
3075         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3076
3077         skb_dst_set(skb, &rt->dst);
3078
3079         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3080                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3081                             nlh->nlmsg_seq, 0, 0, 0);
3082         if (err < 0) {
3083                 kfree_skb(skb);
3084                 goto errout;
3085         }
3086
3087         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3088 errout:
3089         return err;
3090 }
3091
3092 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3093 {
3094         struct sk_buff *skb;
3095         struct net *net = info->nl_net;
3096         u32 seq;
3097         int err;
3098
3099         err = -ENOBUFS;
3100         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3101
3102         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3103         if (!skb)
3104                 goto errout;
3105
3106         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3107                                 event, info->portid, seq, 0, 0, 0);
3108         if (err < 0) {
3109                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3110                 WARN_ON(err == -EMSGSIZE);
3111                 kfree_skb(skb);
3112                 goto errout;
3113         }
3114         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3115                     info->nlh, gfp_any());
3116         return;
3117 errout:
3118         if (err < 0)
3119                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3120 }
3121
3122 static int ip6_route_dev_notify(struct notifier_block *this,
3123                                 unsigned long event, void *ptr)
3124 {
3125         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3126         struct net *net = dev_net(dev);
3127
3128         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3129                 net->ipv6.ip6_null_entry->dst.dev = dev;
3130                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3131 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3132                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3133                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3134                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3135                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3136 #endif
3137         }
3138
3139         return NOTIFY_OK;
3140 }
3141
3142 /*
3143  *      /proc
3144  */
3145
3146 #ifdef CONFIG_PROC_FS
3147
3148 static const struct file_operations ipv6_route_proc_fops = {
3149         .owner          = THIS_MODULE,
3150         .open           = ipv6_route_open,
3151         .read           = seq_read,
3152         .llseek         = seq_lseek,
3153         .release        = seq_release_net,
3154 };
3155
3156 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3157 {
3158         struct net *net = (struct net *)seq->private;
3159         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3160                    net->ipv6.rt6_stats->fib_nodes,
3161                    net->ipv6.rt6_stats->fib_route_nodes,
3162                    net->ipv6.rt6_stats->fib_rt_alloc,
3163                    net->ipv6.rt6_stats->fib_rt_entries,
3164                    net->ipv6.rt6_stats->fib_rt_cache,
3165                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3166                    net->ipv6.rt6_stats->fib_discarded_routes);
3167
3168         return 0;
3169 }
3170
3171 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3172 {
3173         return single_open_net(inode, file, rt6_stats_seq_show);
3174 }
3175
3176 static const struct file_operations rt6_stats_seq_fops = {
3177         .owner   = THIS_MODULE,
3178         .open    = rt6_stats_seq_open,
3179         .read    = seq_read,
3180         .llseek  = seq_lseek,
3181         .release = single_release_net,
3182 };
3183 #endif  /* CONFIG_PROC_FS */
3184
3185 #ifdef CONFIG_SYSCTL
3186
3187 static
3188 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3189                               void __user *buffer, size_t *lenp, loff_t *ppos)
3190 {
3191         struct net *net;
3192         int delay;
3193         if (!write)
3194                 return -EINVAL;
3195
3196         net = (struct net *)ctl->extra1;
3197         delay = net->ipv6.sysctl.flush_delay;
3198         proc_dointvec(ctl, write, buffer, lenp, ppos);
3199         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3200         return 0;
3201 }
3202
3203 struct ctl_table ipv6_route_table_template[] = {
3204         {
3205                 .procname       =       "flush",
3206                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3207                 .maxlen         =       sizeof(int),
3208                 .mode           =       0200,
3209                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3210         },
3211         {
3212                 .procname       =       "gc_thresh",
3213                 .data           =       &ip6_dst_ops_template.gc_thresh,
3214                 .maxlen         =       sizeof(int),
3215                 .mode           =       0644,
3216                 .proc_handler   =       proc_dointvec,
3217         },
3218         {
3219                 .procname       =       "max_size",
3220                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3221                 .maxlen         =       sizeof(int),
3222                 .mode           =       0644,
3223                 .proc_handler   =       proc_dointvec,
3224         },
3225         {
3226                 .procname       =       "gc_min_interval",
3227                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3228                 .maxlen         =       sizeof(int),
3229                 .mode           =       0644,
3230                 .proc_handler   =       proc_dointvec_jiffies,
3231         },
3232         {
3233                 .procname       =       "gc_timeout",
3234                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3235                 .maxlen         =       sizeof(int),
3236                 .mode           =       0644,
3237                 .proc_handler   =       proc_dointvec_jiffies,
3238         },
3239         {
3240                 .procname       =       "gc_interval",
3241                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3242                 .maxlen         =       sizeof(int),
3243                 .mode           =       0644,
3244                 .proc_handler   =       proc_dointvec_jiffies,
3245         },
3246         {
3247                 .procname       =       "gc_elasticity",
3248                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3249                 .maxlen         =       sizeof(int),
3250                 .mode           =       0644,
3251                 .proc_handler   =       proc_dointvec,
3252         },
3253         {
3254                 .procname       =       "mtu_expires",
3255                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3256                 .maxlen         =       sizeof(int),
3257                 .mode           =       0644,
3258                 .proc_handler   =       proc_dointvec_jiffies,
3259         },
3260         {
3261                 .procname       =       "min_adv_mss",
3262                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3263                 .maxlen         =       sizeof(int),
3264                 .mode           =       0644,
3265                 .proc_handler   =       proc_dointvec,
3266         },
3267         {
3268                 .procname       =       "gc_min_interval_ms",
3269                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3270                 .maxlen         =       sizeof(int),
3271                 .mode           =       0644,
3272                 .proc_handler   =       proc_dointvec_ms_jiffies,
3273         },
3274         { }
3275 };
3276
3277 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3278 {
3279         struct ctl_table *table;
3280
3281         table = kmemdup(ipv6_route_table_template,
3282                         sizeof(ipv6_route_table_template),
3283                         GFP_KERNEL);
3284
3285         if (table) {
3286                 table[0].data = &net->ipv6.sysctl.flush_delay;
3287                 table[0].extra1 = net;
3288                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3289                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3290                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3291                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3292                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3293                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3294                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3295                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3296                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3297
3298                 /* Don't export sysctls to unprivileged users */
3299                 if (net->user_ns != &init_user_ns)
3300                         table[0].procname = NULL;
3301         }
3302
3303         return table;
3304 }
3305 #endif
3306
3307 static int __net_init ip6_route_net_init(struct net *net)
3308 {
3309         int ret = -ENOMEM;
3310
3311         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3312                sizeof(net->ipv6.ip6_dst_ops));
3313
3314         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3315                 goto out_ip6_dst_ops;
3316
3317         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3318                                            sizeof(*net->ipv6.ip6_null_entry),
3319                                            GFP_KERNEL);
3320         if (!net->ipv6.ip6_null_entry)
3321                 goto out_ip6_dst_entries;
3322         net->ipv6.ip6_null_entry->dst.path =
3323                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3324         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3325         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3326                          ip6_template_metrics, true);
3327
3328 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3329         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3330                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3331                                                GFP_KERNEL);
3332         if (!net->ipv6.ip6_prohibit_entry)
3333                 goto out_ip6_null_entry;
3334         net->ipv6.ip6_prohibit_entry->dst.path =
3335                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3336         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3337         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3338                          ip6_template_metrics, true);
3339
3340         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3341                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3342                                                GFP_KERNEL);
3343         if (!net->ipv6.ip6_blk_hole_entry)
3344                 goto out_ip6_prohibit_entry;
3345         net->ipv6.ip6_blk_hole_entry->dst.path =
3346                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3347         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3348         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3349                          ip6_template_metrics, true);
3350 #endif
3351
3352         net->ipv6.sysctl.flush_delay = 0;
3353         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3354         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3355         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3356         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3357         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3358         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3359         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3360
3361         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3362
3363         ret = 0;
3364 out:
3365         return ret;
3366
3367 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3368 out_ip6_prohibit_entry:
3369         kfree(net->ipv6.ip6_prohibit_entry);
3370 out_ip6_null_entry:
3371         kfree(net->ipv6.ip6_null_entry);
3372 #endif
3373 out_ip6_dst_entries:
3374         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3375 out_ip6_dst_ops:
3376         goto out;
3377 }
3378
3379 static void __net_exit ip6_route_net_exit(struct net *net)
3380 {
3381         kfree(net->ipv6.ip6_null_entry);
3382 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3383         kfree(net->ipv6.ip6_prohibit_entry);
3384         kfree(net->ipv6.ip6_blk_hole_entry);
3385 #endif
3386         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3387 }
3388
3389 static int __net_init ip6_route_net_init_late(struct net *net)
3390 {
3391 #ifdef CONFIG_PROC_FS
3392         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3393         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3394 #endif
3395         return 0;
3396 }
3397
3398 static void __net_exit ip6_route_net_exit_late(struct net *net)
3399 {
3400 #ifdef CONFIG_PROC_FS
3401         remove_proc_entry("ipv6_route", net->proc_net);
3402         remove_proc_entry("rt6_stats", net->proc_net);
3403 #endif
3404 }
3405
3406 static struct pernet_operations ip6_route_net_ops = {
3407         .init = ip6_route_net_init,
3408         .exit = ip6_route_net_exit,
3409 };
3410
3411 static int __net_init ipv6_inetpeer_init(struct net *net)
3412 {
3413         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3414
3415         if (!bp)
3416                 return -ENOMEM;
3417         inet_peer_base_init(bp);
3418         net->ipv6.peers = bp;
3419         return 0;
3420 }
3421
3422 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3423 {
3424         struct inet_peer_base *bp = net->ipv6.peers;
3425
3426         net->ipv6.peers = NULL;
3427         inetpeer_invalidate_tree(bp);
3428         kfree(bp);
3429 }
3430
3431 static struct pernet_operations ipv6_inetpeer_ops = {
3432         .init   =       ipv6_inetpeer_init,
3433         .exit   =       ipv6_inetpeer_exit,
3434 };
3435
3436 static struct pernet_operations ip6_route_net_late_ops = {
3437         .init = ip6_route_net_init_late,
3438         .exit = ip6_route_net_exit_late,
3439 };
3440
3441 static struct notifier_block ip6_route_dev_notifier = {
3442         .notifier_call = ip6_route_dev_notify,
3443         .priority = 0,
3444 };
3445
3446 int __init ip6_route_init(void)
3447 {
3448         int ret;
3449         int cpu;
3450
3451         ret = -ENOMEM;
3452         ip6_dst_ops_template.kmem_cachep =
3453                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3454                                   SLAB_HWCACHE_ALIGN, NULL);
3455         if (!ip6_dst_ops_template.kmem_cachep)
3456                 goto out;
3457
3458         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3459         if (ret)
3460                 goto out_kmem_cache;
3461
3462         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3463         if (ret)
3464                 goto out_dst_entries;
3465
3466         ret = register_pernet_subsys(&ip6_route_net_ops);
3467         if (ret)
3468                 goto out_register_inetpeer;
3469
3470         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3471
3472         /* Registering of the loopback is done before this portion of code,
3473          * the loopback reference in rt6_info will not be taken, do it
3474          * manually for init_net */
3475         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3476         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3477   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3478         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3479         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3480         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3481         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3482   #endif
3483         ret = fib6_init();
3484         if (ret)
3485                 goto out_register_subsys;
3486
3487         ret = xfrm6_init();
3488         if (ret)
3489                 goto out_fib6_init;
3490
3491         ret = fib6_rules_init();
3492         if (ret)
3493                 goto xfrm6_init;
3494
3495         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3496         if (ret)
3497                 goto fib6_rules_init;
3498
3499         ret = -ENOBUFS;
3500         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3501             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3502             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3503                 goto out_register_late_subsys;
3504
3505         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3506         if (ret)
3507                 goto out_register_late_subsys;
3508
3509         for_each_possible_cpu(cpu) {
3510                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3511
3512                 INIT_LIST_HEAD(&ul->head);
3513                 spin_lock_init(&ul->lock);
3514         }
3515
3516 out:
3517         return ret;
3518
3519 out_register_late_subsys:
3520         unregister_pernet_subsys(&ip6_route_net_late_ops);
3521 fib6_rules_init:
3522         fib6_rules_cleanup();
3523 xfrm6_init:
3524         xfrm6_fini();
3525 out_fib6_init:
3526         fib6_gc_cleanup();
3527 out_register_subsys:
3528         unregister_pernet_subsys(&ip6_route_net_ops);
3529 out_register_inetpeer:
3530         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3531 out_dst_entries:
3532         dst_entries_destroy(&ip6_dst_blackhole_ops);
3533 out_kmem_cache:
3534         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3535         goto out;
3536 }
3537
3538 void ip6_route_cleanup(void)
3539 {
3540         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3541         unregister_pernet_subsys(&ip6_route_net_late_ops);
3542         fib6_rules_cleanup();
3543         xfrm6_fini();
3544         fib6_gc_cleanup();
3545         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3546         unregister_pernet_subsys(&ip6_route_net_ops);
3547         dst_entries_destroy(&ip6_dst_blackhole_ops);
3548         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3549 }