ipv6: Remove un-used argument from ip6_dst_alloc()
net/ipv6/route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61
62 #include <asm/uaccess.h>
63
64 #ifdef CONFIG_SYSCTL
65 #include <linux/sysctl.h>
66 #endif
67
68 enum rt6_nud_state {
69         RT6_NUD_FAIL_HARD = -3,
70         RT6_NUD_FAIL_PROBE = -2,
71         RT6_NUD_FAIL_DO_RR = -1,
72         RT6_NUD_SUCCEED = 1
73 };
74
75 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
87 static int              ip6_pkt_prohibit(struct sk_buff *skb);
88 static int              ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
91                                            struct sk_buff *skb, u32 mtu);
92 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
93                                         struct sk_buff *skb);
94 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
95 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
96
97 #ifdef CONFIG_IPV6_ROUTE_INFO
98 static struct rt6_info *rt6_add_route_info(struct net *net,
99                                            const struct in6_addr *prefix, int prefixlen,
100                                            const struct in6_addr *gwaddr, int ifindex,
101                                            unsigned int pref);
102 static struct rt6_info *rt6_get_route_info(struct net *net,
103                                            const struct in6_addr *prefix, int prefixlen,
104                                            const struct in6_addr *gwaddr, int ifindex);
105 #endif
106
107 struct uncached_list {
108         spinlock_t              lock;
109         struct list_head        head;
110 };
111
112 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
113
114 static void rt6_uncached_list_add(struct rt6_info *rt)
115 {
116         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
117
118         rt->dst.flags |= DST_NOCACHE;
119         rt->rt6i_uncached_list = ul;
120
121         spin_lock_bh(&ul->lock);
122         list_add_tail(&rt->rt6i_uncached, &ul->head);
123         spin_unlock_bh(&ul->lock);
124 }
125
126 static void rt6_uncached_list_del(struct rt6_info *rt)
127 {
128         if (!list_empty(&rt->rt6i_uncached)) {
129                 struct uncached_list *ul = rt->rt6i_uncached_list;
130
131                 spin_lock_bh(&ul->lock);
132                 list_del(&rt->rt6i_uncached);
133                 spin_unlock_bh(&ul->lock);
134         }
135 }
136
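/* Move any uncached routes that reference @dev (or all of them when
 * @dev is NULL) over to the loopback device, so their dst entries stay
 * valid after the device disappears.
 */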
137 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
138 {
139         struct net_device *loopback_dev = net->loopback_dev;
140         int cpu;
141
142         for_each_possible_cpu(cpu) {
143                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
144                 struct rt6_info *rt;
145
146                 spin_lock_bh(&ul->lock);
147                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
148                         struct inet6_dev *rt_idev = rt->rt6i_idev;
149                         struct net_device *rt_dev = rt->dst.dev;
150
151                         if (rt_idev && (rt_idev->dev == dev || !dev) &&
152                             rt_idev->dev != loopback_dev) {
153                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
154                                 in6_dev_put(rt_idev);
155                         }
156
157                         if (rt_dev && (rt_dev == dev || !dev) &&
158                             rt_dev != loopback_dev) {
159                                 rt->dst.dev = loopback_dev;
160                                 dev_hold(rt->dst.dev);
161                                 dev_put(rt_dev);
162                         }
163                 }
164                 spin_unlock_bh(&ul->lock);
165         }
166 }
167
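/* Metrics of RTF_PCPU clones are shared with the route they were copied
 * from (dst.from); copy-on-write therefore writes to the parent route's
 * metrics instead of allocating a private copy.
 */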
168 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
169 {
170         return dst_metrics_write_ptr(rt->dst.from);
171 }
172
173 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
174 {
175         struct rt6_info *rt = (struct rt6_info *)dst;
176
177         if (rt->rt6i_flags & RTF_PCPU)
178                 return rt6_pcpu_cow_metrics(rt);
179         else if (rt->rt6i_flags & RTF_CACHE)
180                 return NULL;
181         else
182                 return dst_cow_metrics_generic(dst, old);
183 }
184
185 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         struct in6_addr *p = &rt->rt6i_gateway;
190
191         if (!ipv6_addr_any(p))
192                 return (const void *) p;
193         else if (skb)
194                 return &ipv6_hdr(skb)->daddr;
195         return daddr;
196 }
197
198 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
199                                           struct sk_buff *skb,
200                                           const void *daddr)
201 {
202         struct rt6_info *rt = (struct rt6_info *) dst;
203         struct neighbour *n;
204
205         daddr = choose_neigh_daddr(rt, skb, daddr);
206         n = __ipv6_neigh_lookup(dst->dev, daddr);
207         if (n)
208                 return n;
209         return neigh_create(&nd_tbl, daddr, dst->dev);
210 }
211
212 static struct dst_ops ip6_dst_ops_template = {
213         .family                 =       AF_INET6,
214         .gc                     =       ip6_dst_gc,
215         .gc_thresh              =       1024,
216         .check                  =       ip6_dst_check,
217         .default_advmss         =       ip6_default_advmss,
218         .mtu                    =       ip6_mtu,
219         .cow_metrics            =       ipv6_cow_metrics,
220         .destroy                =       ip6_dst_destroy,
221         .ifdown                 =       ip6_dst_ifdown,
222         .negative_advice        =       ip6_negative_advice,
223         .link_failure           =       ip6_link_failure,
224         .update_pmtu            =       ip6_rt_update_pmtu,
225         .redirect               =       rt6_do_redirect,
226         .local_out              =       __ip6_local_out,
227         .neigh_lookup           =       ip6_neigh_lookup,
228 };
229
230 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
231 {
232         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
233
234         return mtu ? : dst->dev->mtu;
235 }
236
237 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
238                                          struct sk_buff *skb, u32 mtu)
239 {
240 }
241
242 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
243                                       struct sk_buff *skb)
244 {
245 }
246
247 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
248                                          unsigned long old)
249 {
250         return NULL;
251 }
252
253 static struct dst_ops ip6_dst_blackhole_ops = {
254         .family                 =       AF_INET6,
255         .destroy                =       ip6_dst_destroy,
256         .check                  =       ip6_dst_check,
257         .mtu                    =       ip6_blackhole_mtu,
258         .default_advmss         =       ip6_default_advmss,
259         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
260         .redirect               =       ip6_rt_blackhole_redirect,
261         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
262         .neigh_lookup           =       ip6_neigh_lookup,
263 };
264
265 static const u32 ip6_template_metrics[RTAX_MAX] = {
266         [RTAX_HOPLIMIT - 1] = 0,
267 };
268
269 static const struct rt6_info ip6_null_entry_template = {
270         .dst = {
271                 .__refcnt       = ATOMIC_INIT(1),
272                 .__use          = 1,
273                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
274                 .error          = -ENETUNREACH,
275                 .input          = ip6_pkt_discard,
276                 .output         = ip6_pkt_discard_out,
277         },
278         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
279         .rt6i_protocol  = RTPROT_KERNEL,
280         .rt6i_metric    = ~(u32) 0,
281         .rt6i_ref       = ATOMIC_INIT(1),
282 };
283
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285
286 static const struct rt6_info ip6_prohibit_entry_template = {
287         .dst = {
288                 .__refcnt       = ATOMIC_INIT(1),
289                 .__use          = 1,
290                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
291                 .error          = -EACCES,
292                 .input          = ip6_pkt_prohibit,
293                 .output         = ip6_pkt_prohibit_out,
294         },
295         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
296         .rt6i_protocol  = RTPROT_KERNEL,
297         .rt6i_metric    = ~(u32) 0,
298         .rt6i_ref       = ATOMIC_INIT(1),
299 };
300
301 static const struct rt6_info ip6_blk_hole_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -EINVAL,
307                 .input          = dst_discard,
308                 .output         = dst_discard_sk,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311         .rt6i_protocol  = RTPROT_KERNEL,
312         .rt6i_metric    = ~(u32) 0,
313         .rt6i_ref       = ATOMIC_INIT(1),
314 };
315
316 #endif
317
318 /* allocate dst with ip6_dst_ops */
319 static struct rt6_info *__ip6_dst_alloc(struct net *net,
320                                         struct net_device *dev,
321                                         int flags)
322 {
323         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
324                                         0, DST_OBSOLETE_FORCE_CHK, flags);
325
326         if (rt) {
327                 struct dst_entry *dst = &rt->dst;
328
329                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
330                 INIT_LIST_HEAD(&rt->rt6i_siblings);
331                 INIT_LIST_HEAD(&rt->rt6i_uncached);
332         }
333         return rt;
334 }
335
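/* Like __ip6_dst_alloc(), but also sets up the rt6i_pcpu array that backs
 * the per-cpu route copies handed out by rt6_get_pcpu_route().  If the
 * percpu allocation fails, the dst is destroyed and NULL is returned.
 */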
336 static struct rt6_info *ip6_dst_alloc(struct net *net,
337                                       struct net_device *dev,
338                                       int flags)
339 {
340         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
341
342         if (rt) {
343                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
344                 if (rt->rt6i_pcpu) {
345                         int cpu;
346
347                         for_each_possible_cpu(cpu) {
348                                 struct rt6_info **p;
349
350                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
351                                 /* no one shares rt */
352                                 *p =  NULL;
353                         }
354                 } else {
355                         dst_destroy((struct dst_entry *)rt);
356                         return NULL;
357                 }
358         }
359
360         return rt;
361 }
362
363 static void ip6_dst_destroy(struct dst_entry *dst)
364 {
365         struct rt6_info *rt = (struct rt6_info *)dst;
366         struct dst_entry *from = dst->from;
367         struct inet6_dev *idev;
368
369         dst_destroy_metrics_generic(dst);
370         free_percpu(rt->rt6i_pcpu);
371         rt6_uncached_list_del(rt);
372
373         idev = rt->rt6i_idev;
374         if (idev) {
375                 rt->rt6i_idev = NULL;
376                 in6_dev_put(idev);
377         }
378
379         dst->from = NULL;
380         dst_release(from);
381 }
382
383 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
384                            int how)
385 {
386         struct rt6_info *rt = (struct rt6_info *)dst;
387         struct inet6_dev *idev = rt->rt6i_idev;
388         struct net_device *loopback_dev =
389                 dev_net(dev)->loopback_dev;
390
391         if (dev != loopback_dev) {
392                 if (idev && idev->dev == dev) {
393                         struct inet6_dev *loopback_idev =
394                                 in6_dev_get(loopback_dev);
395                         if (loopback_idev) {
396                                 rt->rt6i_idev = loopback_idev;
397                                 in6_dev_put(idev);
398                         }
399                 }
400         }
401 }
402
403 static bool rt6_check_expired(const struct rt6_info *rt)
404 {
405         if (rt->rt6i_flags & RTF_EXPIRES) {
406                 if (time_after(jiffies, rt->dst.expires))
407                         return true;
408         } else if (rt->dst.from) {
409                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
410         }
411         return false;
412 }
413
414 /* Multipath route selection:
415  *   Hash-based function using packet header fields and the flow label.
416  * Adapted from fib_info_hashfn().
417  */
418 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
419                                const struct flowi6 *fl6)
420 {
421         unsigned int val = fl6->flowi6_proto;
422
423         val ^= ipv6_addr_hash(&fl6->daddr);
424         val ^= ipv6_addr_hash(&fl6->saddr);
425
426         /* Works only if this is not encapsulated */
427         switch (fl6->flowi6_proto) {
428         case IPPROTO_UDP:
429         case IPPROTO_TCP:
430         case IPPROTO_SCTP:
431                 val ^= (__force u16)fl6->fl6_sport;
432                 val ^= (__force u16)fl6->fl6_dport;
433                 break;
434
435         case IPPROTO_ICMPV6:
436                 val ^= (__force u16)fl6->fl6_icmp_type;
437                 val ^= (__force u16)fl6->fl6_icmp_code;
438                 break;
439         }
440         /* RFC 6438 recommends using the flow label */
441         val ^= (__force u32)fl6->flowlabel;
442
443         /* Perhaps we need to tune this function? */
444         val = val ^ (val >> 7) ^ (val >> 12);
445         return val % candidate_count;
446 }
447
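/* Spread flows across the ECMP siblings of @match using the hash above.
 * A choice of 0 keeps @match itself; a sibling that scores negative
 * (e.g. an unreachable nexthop) also leaves @match unchanged.
 */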
448 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
449                                              struct flowi6 *fl6, int oif,
450                                              int strict)
451 {
452         struct rt6_info *sibling, *next_sibling;
453         int route_choosen;
454
455         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
456         /* Don't change the route if route_choosen == 0
457          * (the sibling list does not include ourself)
458          */
459         if (route_choosen)
460                 list_for_each_entry_safe(sibling, next_sibling,
461                                 &match->rt6i_siblings, rt6i_siblings) {
462                         route_choosen--;
463                         if (route_choosen == 0) {
464                                 if (rt6_score_route(sibling, oif, strict) < 0)
465                                         break;
466                                 match = sibling;
467                                 break;
468                         }
469                 }
470         return match;
471 }
472
473 /*
474  *      Route lookup. The caller is assumed to hold table->tb6_lock.
475  */
476
477 static inline struct rt6_info *rt6_device_match(struct net *net,
478                                                     struct rt6_info *rt,
479                                                     const struct in6_addr *saddr,
480                                                     int oif,
481                                                     int flags)
482 {
483         struct rt6_info *local = NULL;
484         struct rt6_info *sprt;
485
486         if (!oif && ipv6_addr_any(saddr))
487                 goto out;
488
489         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
490                 struct net_device *dev = sprt->dst.dev;
491
492                 if (oif) {
493                         if (dev->ifindex == oif)
494                                 return sprt;
495                         if (dev->flags & IFF_LOOPBACK) {
496                                 if (!sprt->rt6i_idev ||
497                                     sprt->rt6i_idev->dev->ifindex != oif) {
498                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
499                                                 continue;
500                                         if (local && (!oif ||
501                                                       local->rt6i_idev->dev->ifindex == oif))
502                                                 continue;
503                                 }
504                                 local = sprt;
505                         }
506                 } else {
507                         if (ipv6_chk_addr(net, saddr, dev,
508                                           flags & RT6_LOOKUP_F_IFACE))
509                                 return sprt;
510                 }
511         }
512
513         if (oif) {
514                 if (local)
515                         return local;
516
517                 if (flags & RT6_LOOKUP_F_IFACE)
518                         return net->ipv6.ip6_null_entry;
519         }
520 out:
521         return rt;
522 }
523
524 #ifdef CONFIG_IPV6_ROUTER_PREF
525 struct __rt6_probe_work {
526         struct work_struct work;
527         struct in6_addr target;
528         struct net_device *dev;
529 };
530
531 static void rt6_probe_deferred(struct work_struct *w)
532 {
533         struct in6_addr mcaddr;
534         struct __rt6_probe_work *work =
535                 container_of(w, struct __rt6_probe_work, work);
536
537         addrconf_addr_solict_mult(&work->target, &mcaddr);
538         ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
539         dev_put(work->dev);
540         kfree(work);
541 }
542
543 static void rt6_probe(struct rt6_info *rt)
544 {
545         struct neighbour *neigh;
546         /*
547          * Okay, this does not seem to be appropriate for now;
548          * however, we need to check whether it really is,
549          * aka Router Reachability Probing.
550          *
551          * Router Reachability Probe MUST be rate-limited
552          * to no more than one per minute.
553          */
554         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
555                 return;
556         rcu_read_lock_bh();
557         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
558         if (neigh) {
559                 write_lock(&neigh->lock);
560                 if (neigh->nud_state & NUD_VALID)
561                         goto out;
562         }
563
564         if (!neigh ||
565             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
566                 struct __rt6_probe_work *work;
567
568                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
569
570                 if (neigh && work)
571                         __neigh_set_probe_once(neigh);
572
573                 if (neigh)
574                         write_unlock(&neigh->lock);
575
576                 if (work) {
577                         INIT_WORK(&work->work, rt6_probe_deferred);
578                         work->target = rt->rt6i_gateway;
579                         dev_hold(rt->dst.dev);
580                         work->dev = rt->dst.dev;
581                         schedule_work(&work->work);
582                 }
583         } else {
584 out:
585                 write_unlock(&neigh->lock);
586         }
587         rcu_read_unlock_bh();
588 }
589 #else
590 static inline void rt6_probe(struct rt6_info *rt)
591 {
592 }
593 #endif
594
595 /*
596  * Default Router Selection (RFC 2461 6.3.6)
597  */
598 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
599 {
600         struct net_device *dev = rt->dst.dev;
601         if (!oif || dev->ifindex == oif)
602                 return 2;
603         if ((dev->flags & IFF_LOOPBACK) &&
604             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
605                 return 1;
606         return 0;
607 }
608
609 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
610 {
611         struct neighbour *neigh;
612         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
613
614         if (rt->rt6i_flags & RTF_NONEXTHOP ||
615             !(rt->rt6i_flags & RTF_GATEWAY))
616                 return RT6_NUD_SUCCEED;
617
618         rcu_read_lock_bh();
619         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
620         if (neigh) {
621                 read_lock(&neigh->lock);
622                 if (neigh->nud_state & NUD_VALID)
623                         ret = RT6_NUD_SUCCEED;
624 #ifdef CONFIG_IPV6_ROUTER_PREF
625                 else if (!(neigh->nud_state & NUD_FAILED))
626                         ret = RT6_NUD_SUCCEED;
627                 else
628                         ret = RT6_NUD_FAIL_PROBE;
629 #endif
630                 read_unlock(&neigh->lock);
631         } else {
632                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
633                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
634         }
635         rcu_read_unlock_bh();
636
637         return ret;
638 }
639
640 static int rt6_score_route(struct rt6_info *rt, int oif,
641                            int strict)
642 {
643         int m;
644
645         m = rt6_check_dev(rt, oif);
646         if (!m && (strict & RT6_LOOKUP_F_IFACE))
647                 return RT6_NUD_FAIL_HARD;
648 #ifdef CONFIG_IPV6_ROUTER_PREF
649         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
650 #endif
651         if (strict & RT6_LOOKUP_F_REACHABLE) {
652                 int n = rt6_check_neigh(rt);
653                 if (n < 0)
654                         return n;
655         }
656         return m;
657 }
658
659 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
660                                    int *mpri, struct rt6_info *match,
661                                    bool *do_rr)
662 {
663         int m;
664         bool match_do_rr = false;
665
666         if (rt6_check_expired(rt))
667                 goto out;
668
669         m = rt6_score_route(rt, oif, strict);
670         if (m == RT6_NUD_FAIL_DO_RR) {
671                 match_do_rr = true;
672                 m = 0; /* lowest valid score */
673         } else if (m == RT6_NUD_FAIL_HARD) {
674                 goto out;
675         }
676
677         if (strict & RT6_LOOKUP_F_REACHABLE)
678                 rt6_probe(rt);
679
680         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
681         if (m > *mpri) {
682                 *do_rr = match_do_rr;
683                 *mpri = m;
684                 match = rt;
685         }
686 out:
687         return match;
688 }
689
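/* Score the routes that share @metric, starting at @rr_head and wrapping
 * around to the head of the leaf list.  Routes with a different metric
 * (saved in 'cont') are only considered if nothing matched at @metric.
 */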
690 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
691                                      struct rt6_info *rr_head,
692                                      u32 metric, int oif, int strict,
693                                      bool *do_rr)
694 {
695         struct rt6_info *rt, *match, *cont;
696         int mpri = -1;
697
698         match = NULL;
699         cont = NULL;
700         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
701                 if (rt->rt6i_metric != metric) {
702                         cont = rt;
703                         break;
704                 }
705
706                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
707         }
708
709         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
710                 if (rt->rt6i_metric != metric) {
711                         cont = rt;
712                         break;
713                 }
714
715                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
716         }
717
718         if (match || !cont)
719                 return match;
720
721         for (rt = cont; rt; rt = rt->dst.rt6_next)
722                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
723
724         return match;
725 }
726
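/* Round-robin selection among equal-metric routes at this fib6 node:
 * scanning starts at fn->rr_ptr, and when find_rr_leaf() asks for it
 * (do_rr) the pointer is advanced so the next lookup starts at the
 * following sibling.
 */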
727 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
728 {
729         struct rt6_info *match, *rt0;
730         struct net *net;
731         bool do_rr = false;
732
733         rt0 = fn->rr_ptr;
734         if (!rt0)
735                 fn->rr_ptr = rt0 = fn->leaf;
736
737         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
738                              &do_rr);
739
740         if (do_rr) {
741                 struct rt6_info *next = rt0->dst.rt6_next;
742
743                 /* no entries matched; do round-robin */
744                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
745                         next = fn->leaf;
746
747                 if (next != rt0)
748                         fn->rr_ptr = next;
749         }
750
751         net = dev_net(rt0->dst.dev);
752         return match ? match : net->ipv6.ip6_null_entry;
753 }
754
755 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
756 {
757         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
758 }
759
760 #ifdef CONFIG_IPV6_ROUTE_INFO
761 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
762                   const struct in6_addr *gwaddr)
763 {
764         struct net *net = dev_net(dev);
765         struct route_info *rinfo = (struct route_info *) opt;
766         struct in6_addr prefix_buf, *prefix;
767         unsigned int pref;
768         unsigned long lifetime;
769         struct rt6_info *rt;
770
771         if (len < sizeof(struct route_info)) {
772                 return -EINVAL;
773         }
774
775         /* Sanity check for prefix_len and length */
776         if (rinfo->length > 3) {
777                 return -EINVAL;
778         } else if (rinfo->prefix_len > 128) {
779                 return -EINVAL;
780         } else if (rinfo->prefix_len > 64) {
781                 if (rinfo->length < 2) {
782                         return -EINVAL;
783                 }
784         } else if (rinfo->prefix_len > 0) {
785                 if (rinfo->length < 1) {
786                         return -EINVAL;
787                 }
788         }
789
790         pref = rinfo->route_pref;
791         if (pref == ICMPV6_ROUTER_PREF_INVALID)
792                 return -EINVAL;
793
794         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
795
796         if (rinfo->length == 3)
797                 prefix = (struct in6_addr *)rinfo->prefix;
798         else {
799                 /* this function is safe */
800                 ipv6_addr_prefix(&prefix_buf,
801                                  (struct in6_addr *)rinfo->prefix,
802                                  rinfo->prefix_len);
803                 prefix = &prefix_buf;
804         }
805
806         if (rinfo->prefix_len == 0)
807                 rt = rt6_get_dflt_router(gwaddr, dev);
808         else
809                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
810                                         gwaddr, dev->ifindex);
811
812         if (rt && !lifetime) {
813                 ip6_del_rt(rt);
814                 rt = NULL;
815         }
816
817         if (!rt && lifetime)
818                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
819                                         pref);
820         else if (rt)
821                 rt->rt6i_flags = RTF_ROUTEINFO |
822                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
823
824         if (rt) {
825                 if (!addrconf_finite_timeout(lifetime))
826                         rt6_clean_expires(rt);
827                 else
828                         rt6_set_expires(rt, jiffies + HZ * lifetime);
829
830                 ip6_rt_put(rt);
831         }
832         return 0;
833 }
834 #endif
835
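/* Walk back up the fib6 tree from @fn, descending into a source-address
 * subtree where the parent has one, until a node that carries routes
 * (RTN_RTINFO) is found.  Returns NULL once the tree root is reached.
 */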
836 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
837                                         struct in6_addr *saddr)
838 {
839         struct fib6_node *pn;
840         while (1) {
841                 if (fn->fn_flags & RTN_TL_ROOT)
842                         return NULL;
843                 pn = fn->parent;
844                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
845                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
846                 else
847                         fn = pn;
848                 if (fn->fn_flags & RTN_RTINFO)
849                         return fn;
850         }
851 }
852
853 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
854                                              struct fib6_table *table,
855                                              struct flowi6 *fl6, int flags)
856 {
857         struct fib6_node *fn;
858         struct rt6_info *rt;
859
860         read_lock_bh(&table->tb6_lock);
861         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
862 restart:
863         rt = fn->leaf;
864         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
865         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
866                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
867         if (rt == net->ipv6.ip6_null_entry) {
868                 fn = fib6_backtrack(fn, &fl6->saddr);
869                 if (fn)
870                         goto restart;
871         }
872         dst_use(&rt->dst, jiffies);
873         read_unlock_bh(&table->tb6_lock);
874         return rt;
875
876 }
877
878 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
879                                     int flags)
880 {
881         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
882 }
883 EXPORT_SYMBOL_GPL(ip6_route_lookup);
884
885 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
886                             const struct in6_addr *saddr, int oif, int strict)
887 {
888         struct flowi6 fl6 = {
889                 .flowi6_oif = oif,
890                 .daddr = *daddr,
891         };
892         struct dst_entry *dst;
893         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
894
895         if (saddr) {
896                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
897                 flags |= RT6_LOOKUP_F_HAS_SADDR;
898         }
899
900         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
901         if (dst->error == 0)
902                 return (struct rt6_info *) dst;
903
904         dst_release(dst);
905
906         return NULL;
907 }
908 EXPORT_SYMBOL(rt6_lookup);
909
910 /* ip6_ins_rt is called with table->tb6_lock NOT held.
911    It takes a new route entry; if the addition fails for any reason,
912    the route is freed. In any case, if the caller does not hold a
913    reference, it may be destroyed.
914  */
915
916 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
917                         struct mx6_config *mxc)
918 {
919         int err;
920         struct fib6_table *table;
921
922         table = rt->rt6i_table;
923         write_lock_bh(&table->tb6_lock);
924         err = fib6_add(&table->tb6_root, rt, info, mxc);
925         write_unlock_bh(&table->tb6_lock);
926
927         return err;
928 }
929
930 int ip6_ins_rt(struct rt6_info *rt)
931 {
932         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
933         struct mx6_config mxc = { .mx = NULL, };
934
935         return __ip6_ins_rt(rt, &info, &mxc);
936 }
937
938 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
939                                            const struct in6_addr *daddr,
940                                            const struct in6_addr *saddr)
941 {
942         struct rt6_info *rt;
943
944         /*
945          *      Clone the route.
946          */
947
948         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
949                 ort = (struct rt6_info *)ort->dst.from;
950
951         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
952
953         if (!rt)
954                 return NULL;
955
956         ip6_rt_copy_init(rt, ort);
957         rt->rt6i_flags |= RTF_CACHE;
958         rt->rt6i_metric = 0;
959         rt->dst.flags |= DST_HOST;
960         rt->rt6i_dst.addr = *daddr;
961         rt->rt6i_dst.plen = 128;
962
963         if (!rt6_is_gw_or_nonexthop(ort)) {
964                 if (ort->rt6i_dst.plen != 128 &&
965                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
966                         rt->rt6i_flags |= RTF_ANYCAST;
967 #ifdef CONFIG_IPV6_SUBTREES
968                 if (rt->rt6i_src.plen && saddr) {
969                         rt->rt6i_src.addr = *saddr;
970                         rt->rt6i_src.plen = 128;
971                 }
972 #endif
973         }
974
975         return rt;
976 }
977
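/* Allocate a per-cpu copy of @rt.  The copy is flagged RTF_PCPU and is
 * cached in rt->rt6i_pcpu by rt6_get_pcpu_route() below.
 */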
978 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
979 {
980         struct rt6_info *pcpu_rt;
981
982         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
983                                   rt->dst.dev, rt->dst.flags);
984
985         if (!pcpu_rt)
986                 return NULL;
987         ip6_rt_copy_init(pcpu_rt, rt);
988         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
989         pcpu_rt->rt6i_flags |= RTF_PCPU;
990         return pcpu_rt;
991 }
992
993 /* It should be called with read_lock_bh(&tb6_lock) acquired */
994 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
995 {
996         struct rt6_info *pcpu_rt, *prev, **p;
997
998         p = this_cpu_ptr(rt->rt6i_pcpu);
999         pcpu_rt = *p;
1000
1001         if (pcpu_rt)
1002                 goto done;
1003
1004         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1005         if (!pcpu_rt) {
1006                 struct net *net = dev_net(rt->dst.dev);
1007
1008                 pcpu_rt = net->ipv6.ip6_null_entry;
1009                 goto done;
1010         }
1011
1012         prev = cmpxchg(p, NULL, pcpu_rt);
1013         if (prev) {
1014                 /* If someone did it before us, return prev instead */
1015                 dst_destroy(&pcpu_rt->dst);
1016                 pcpu_rt = prev;
1017         }
1018
1019 done:
1020         dst_hold(&pcpu_rt->dst);
1021         rt6_dst_from_metrics_check(pcpu_rt);
1022         return pcpu_rt;
1023 }
1024
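/* Core policy-routing lookup.  Three outcomes are possible:
 *  - the null entry or an RTF_CACHE clone is returned directly;
 *  - for FLOWI_FLAG_KNOWN_NH lookups on a route without a gateway, an
 *    uncached RTF_CACHE clone keyed on fl6->daddr is created;
 *  - otherwise a per-cpu copy of the matched route is handed out.
 */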
1025 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1026                                       struct flowi6 *fl6, int flags)
1027 {
1028         struct fib6_node *fn, *saved_fn;
1029         struct rt6_info *rt;
1030         int strict = 0;
1031
1032         strict |= flags & RT6_LOOKUP_F_IFACE;
1033         if (net->ipv6.devconf_all->forwarding == 0)
1034                 strict |= RT6_LOOKUP_F_REACHABLE;
1035
1036         read_lock_bh(&table->tb6_lock);
1037
1038         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1039         saved_fn = fn;
1040
1041 redo_rt6_select:
1042         rt = rt6_select(fn, oif, strict);
1043         if (rt->rt6i_nsiblings)
1044                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1045         if (rt == net->ipv6.ip6_null_entry) {
1046                 fn = fib6_backtrack(fn, &fl6->saddr);
1047                 if (fn)
1048                         goto redo_rt6_select;
1049                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1050                         /* also consider unreachable route */
1051                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1052                         fn = saved_fn;
1053                         goto redo_rt6_select;
1054                 }
1055         }
1056
1057
1058         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1059                 dst_use(&rt->dst, jiffies);
1060                 read_unlock_bh(&table->tb6_lock);
1061
1062                 rt6_dst_from_metrics_check(rt);
1063                 return rt;
1064         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1065                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1066                 /* Create a RTF_CACHE clone which will not be
1067                  * owned by the fib6 tree.  It is for the special case where
1068                  * the daddr in the skb during the neighbor look-up is different
1069                  * from the fl6->daddr used to look-up route here.
1070                  * from the fl6->daddr used to look up the route here.
1071
1072                 struct rt6_info *uncached_rt;
1073
1074                 dst_use(&rt->dst, jiffies);
1075                 read_unlock_bh(&table->tb6_lock);
1076
1077                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1078                 dst_release(&rt->dst);
1079
1080                 if (uncached_rt)
1081                         rt6_uncached_list_add(uncached_rt);
1082                 else
1083                         uncached_rt = net->ipv6.ip6_null_entry;
1084
1085                 dst_hold(&uncached_rt->dst);
1086                 return uncached_rt;
1087
1088         } else {
1089                 /* Get a percpu copy */
1090
1091                 struct rt6_info *pcpu_rt;
1092
1093                 rt->dst.lastuse = jiffies;
1094                 rt->dst.__use++;
1095                 pcpu_rt = rt6_get_pcpu_route(rt);
1096                 read_unlock_bh(&table->tb6_lock);
1097
1098                 return pcpu_rt;
1099         }
1100 }
1101
1102 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1103                                             struct flowi6 *fl6, int flags)
1104 {
1105         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1106 }
1107
1108 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1109                                                 struct net_device *dev,
1110                                                 struct flowi6 *fl6, int flags)
1111 {
1112         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1113                 flags |= RT6_LOOKUP_F_IFACE;
1114
1115         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1116 }
1117
1118 void ip6_route_input(struct sk_buff *skb)
1119 {
1120         const struct ipv6hdr *iph = ipv6_hdr(skb);
1121         struct net *net = dev_net(skb->dev);
1122         int flags = RT6_LOOKUP_F_HAS_SADDR;
1123         struct flowi6 fl6 = {
1124                 .flowi6_iif = skb->dev->ifindex,
1125                 .daddr = iph->daddr,
1126                 .saddr = iph->saddr,
1127                 .flowlabel = ip6_flowinfo(iph),
1128                 .flowi6_mark = skb->mark,
1129                 .flowi6_proto = iph->nexthdr,
1130         };
1131
1132         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1133 }
1134
1135 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1136                                              struct flowi6 *fl6, int flags)
1137 {
1138         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1139 }
1140
1141 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1142                                     struct flowi6 *fl6)
1143 {
1144         int flags = 0;
1145
1146         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1147
1148         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1149                 flags |= RT6_LOOKUP_F_IFACE;
1150
1151         if (!ipv6_addr_any(&fl6->saddr))
1152                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1153         else if (sk)
1154                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1155
1156         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1157 }
1158 EXPORT_SYMBOL(ip6_route_output);
1159
1160 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1161 {
1162         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1163         struct dst_entry *new = NULL;
1164
1165         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1166         if (rt) {
1167                 new = &rt->dst;
1168
1169                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1170
1171                 new->__use = 1;
1172                 new->input = dst_discard;
1173                 new->output = dst_discard_sk;
1174
1175                 if (dst_metrics_read_only(&ort->dst))
1176                         new->_metrics = ort->dst._metrics;
1177                 else
1178                         dst_copy_metrics(new, &ort->dst);
1179                 rt->rt6i_idev = ort->rt6i_idev;
1180                 if (rt->rt6i_idev)
1181                         in6_dev_hold(rt->rt6i_idev);
1182
1183                 rt->rt6i_gateway = ort->rt6i_gateway;
1184                 rt->rt6i_flags = ort->rt6i_flags;
1185                 rt->rt6i_metric = 0;
1186
1187                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1188 #ifdef CONFIG_IPV6_SUBTREES
1189                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1190 #endif
1191
1192                 dst_free(new);
1193         }
1194
1195         dst_release(dst_orig);
1196         return new ? new : ERR_PTR(-ENOMEM);
1197 }
1198
1199 /*
1200  *      Destination cache support functions
1201  */
1202
1203 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1204 {
1205         if (rt->dst.from &&
1206             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1207                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1208 }
1209
1210 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1211 {
1212         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1213                 return NULL;
1214
1215         if (rt6_check_expired(rt))
1216                 return NULL;
1217
1218         return &rt->dst;
1219 }
1220
1221 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1222 {
1223         if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1224             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1225                 return &rt->dst;
1226         else
1227                 return NULL;
1228 }
1229
1230 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1231 {
1232         struct rt6_info *rt;
1233
1234         rt = (struct rt6_info *) dst;
1235
1236         /* All IPv6 dsts are created with ->obsolete set to the value
1237          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1238          * into this function always.
1239          */
1240
1241         rt6_dst_from_metrics_check(rt);
1242
1243         if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1244                 return rt6_dst_from_check(rt, cookie);
1245         else
1246                 return rt6_check(rt, cookie);
1247 }
1248
1249 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1250 {
1251         struct rt6_info *rt = (struct rt6_info *) dst;
1252
1253         if (rt) {
1254                 if (rt->rt6i_flags & RTF_CACHE) {
1255                         if (rt6_check_expired(rt)) {
1256                                 ip6_del_rt(rt);
1257                                 dst = NULL;
1258                         }
1259                 } else {
1260                         dst_release(dst);
1261                         dst = NULL;
1262                 }
1263         }
1264         return dst;
1265 }
1266
1267 static void ip6_link_failure(struct sk_buff *skb)
1268 {
1269         struct rt6_info *rt;
1270
1271         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1272
1273         rt = (struct rt6_info *) skb_dst(skb);
1274         if (rt) {
1275                 if (rt->rt6i_flags & RTF_CACHE) {
1276                         dst_hold(&rt->dst);
1277                         if (ip6_del_rt(rt))
1278                                 dst_free(&rt->dst);
1279                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1280                         rt->rt6i_node->fn_sernum = -1;
1281                 }
1282         }
1283 }
1284
1285 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1286 {
1287         struct net *net = dev_net(rt->dst.dev);
1288
1289         rt->rt6i_flags |= RTF_MODIFIED;
1290         rt->rt6i_pmtu = mtu;
1291         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1292 }
1293
1294 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1295                                  const struct ipv6hdr *iph, u32 mtu)
1296 {
1297         struct rt6_info *rt6 = (struct rt6_info *)dst;
1298
1299         if (rt6->rt6i_flags & RTF_LOCAL)
1300                 return;
1301
1302         dst_confirm(dst);
1303         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1304         if (mtu >= dst_mtu(dst))
1305                 return;
1306
1307         if (rt6->rt6i_flags & RTF_CACHE) {
1308                 rt6_do_update_pmtu(rt6, mtu);
1309         } else {
1310                 const struct in6_addr *daddr, *saddr;
1311                 struct rt6_info *nrt6;
1312
1313                 if (iph) {
1314                         daddr = &iph->daddr;
1315                         saddr = &iph->saddr;
1316                 } else if (sk) {
1317                         daddr = &sk->sk_v6_daddr;
1318                         saddr = &inet6_sk(sk)->saddr;
1319                 } else {
1320                         return;
1321                 }
1322                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1323                 if (nrt6) {
1324                         rt6_do_update_pmtu(nrt6, mtu);
1325
1326                         /* ip6_ins_rt(nrt6) will bump
1327                          * rt6->rt6i_node->fn_sernum,
1328                          * which will cause the next rt6_check() to
1329                          * fail and invalidate the sk->sk_dst_cache.
1330                          */
1331                         ip6_ins_rt(nrt6);
1332                 }
1333         }
1334 }
1335
1336 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1337                                struct sk_buff *skb, u32 mtu)
1338 {
1339         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1340 }
1341
1342 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1343                      int oif, u32 mark)
1344 {
1345         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1346         struct dst_entry *dst;
1347         struct flowi6 fl6;
1348
1349         memset(&fl6, 0, sizeof(fl6));
1350         fl6.flowi6_oif = oif;
1351         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1352         fl6.daddr = iph->daddr;
1353         fl6.saddr = iph->saddr;
1354         fl6.flowlabel = ip6_flowinfo(iph);
1355
1356         dst = ip6_route_output(net, NULL, &fl6);
1357         if (!dst->error)
1358                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1359         dst_release(dst);
1360 }
1361 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1362
1363 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1364 {
1365         ip6_update_pmtu(skb, sock_net(sk), mtu,
1366                         sk->sk_bound_dev_if, sk->sk_mark);
1367 }
1368 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1369
1370 /* Handle redirects */
1371 struct ip6rd_flowi {
1372         struct flowi6 fl6;
1373         struct in6_addr gateway;
1374 };
1375
1376 static struct rt6_info *__ip6_route_redirect(struct net *net,
1377                                              struct fib6_table *table,
1378                                              struct flowi6 *fl6,
1379                                              int flags)
1380 {
1381         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1382         struct rt6_info *rt;
1383         struct fib6_node *fn;
1384
1385         /* Get the "current" route for this destination and
1386          * check if the redirect has come from an appropriate router.
1387          *
1388          * RFC 4861 specifies that redirects should only be
1389          * accepted if they come from the nexthop to the target.
1390          * Due to the way the routes are chosen, this notion
1391          * is a bit fuzzy and one might need to check all possible
1392          * routes.
1393          */
1394
1395         read_lock_bh(&table->tb6_lock);
1396         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1397 restart:
1398         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1399                 if (rt6_check_expired(rt))
1400                         continue;
1401                 if (rt->dst.error)
1402                         break;
1403                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1404                         continue;
1405                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1406                         continue;
1407                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1408                         continue;
1409                 break;
1410         }
1411
1412         if (!rt)
1413                 rt = net->ipv6.ip6_null_entry;
1414         else if (rt->dst.error) {
1415                 rt = net->ipv6.ip6_null_entry;
1416                 goto out;
1417         }
1418
1419         if (rt == net->ipv6.ip6_null_entry) {
1420                 fn = fib6_backtrack(fn, &fl6->saddr);
1421                 if (fn)
1422                         goto restart;
1423         }
1424
1425 out:
1426         dst_hold(&rt->dst);
1427
1428         read_unlock_bh(&table->tb6_lock);
1429
1430         return rt;
1431 }
1432
1433 static struct dst_entry *ip6_route_redirect(struct net *net,
1434                                         const struct flowi6 *fl6,
1435                                         const struct in6_addr *gateway)
1436 {
1437         int flags = RT6_LOOKUP_F_HAS_SADDR;
1438         struct ip6rd_flowi rdfl;
1439
1440         rdfl.fl6 = *fl6;
1441         rdfl.gateway = *gateway;
1442
1443         return fib6_rule_lookup(net, &rdfl.fl6,
1444                                 flags, __ip6_route_redirect);
1445 }
1446
1447 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1448 {
1449         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1450         struct dst_entry *dst;
1451         struct flowi6 fl6;
1452
1453         memset(&fl6, 0, sizeof(fl6));
1454         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1455         fl6.flowi6_oif = oif;
1456         fl6.flowi6_mark = mark;
1457         fl6.daddr = iph->daddr;
1458         fl6.saddr = iph->saddr;
1459         fl6.flowlabel = ip6_flowinfo(iph);
1460
1461         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1462         rt6_do_redirect(dst, NULL, skb);
1463         dst_release(dst);
1464 }
1465 EXPORT_SYMBOL_GPL(ip6_redirect);
1466
1467 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1468                             u32 mark)
1469 {
1470         const struct ipv6hdr *iph = ipv6_hdr(skb);
1471         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1472         struct dst_entry *dst;
1473         struct flowi6 fl6;
1474
1475         memset(&fl6, 0, sizeof(fl6));
1476         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1477         fl6.flowi6_oif = oif;
1478         fl6.flowi6_mark = mark;
1479         fl6.daddr = msg->dest;
1480         fl6.saddr = iph->daddr;
1481
1482         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1483         rt6_do_redirect(dst, NULL, skb);
1484         dst_release(dst);
1485 }
1486
1487 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1488 {
1489         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1490 }
1491 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1492
1493 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1494 {
1495         struct net_device *dev = dst->dev;
1496         unsigned int mtu = dst_mtu(dst);
1497         struct net *net = dev_net(dev);
1498
1499         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1500
1501         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1502                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1503
1504         /*
1505          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1506          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1507          * IPV6_MAXPLEN is also valid and means: "any MSS,
1508          * rely only on pmtu discovery"
1509          */
1510         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1511                 mtu = IPV6_MAXPLEN;
1512         return mtu;
1513 }
1514
1515 static unsigned int ip6_mtu(const struct dst_entry *dst)
1516 {
1517         const struct rt6_info *rt = (const struct rt6_info *)dst;
1518         unsigned int mtu = rt->rt6i_pmtu;
1519         struct inet6_dev *idev;
1520
1521         if (mtu)
1522                 goto out;
1523
1524         mtu = dst_metric_raw(dst, RTAX_MTU);
1525         if (mtu)
1526                 goto out;
1527
1528         mtu = IPV6_MIN_MTU;
1529
1530         rcu_read_lock();
1531         idev = __in6_dev_get(dst->dev);
1532         if (idev)
1533                 mtu = idev->cnf.mtu6;
1534         rcu_read_unlock();
1535
1536 out:
1537         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1538 }
1539
1540 static struct dst_entry *icmp6_dst_gc_list;
1541 static DEFINE_SPINLOCK(icmp6_dst_lock);
1542
1543 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1544                                   struct flowi6 *fl6)
1545 {
1546         struct dst_entry *dst;
1547         struct rt6_info *rt;
1548         struct inet6_dev *idev = in6_dev_get(dev);
1549         struct net *net = dev_net(dev);
1550
1551         if (unlikely(!idev))
1552                 return ERR_PTR(-ENODEV);
1553
1554         rt = ip6_dst_alloc(net, dev, 0);
1555         if (unlikely(!rt)) {
1556                 in6_dev_put(idev);
1557                 dst = ERR_PTR(-ENOMEM);
1558                 goto out;
1559         }
1560
1561         rt->dst.flags |= DST_HOST;
1562         rt->dst.output  = ip6_output;
1563         atomic_set(&rt->dst.__refcnt, 1);
1564         rt->rt6i_gateway  = fl6->daddr;
1565         rt->rt6i_dst.addr = fl6->daddr;
1566         rt->rt6i_dst.plen = 128;
1567         rt->rt6i_idev     = idev;
1568         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1569
1570         spin_lock_bh(&icmp6_dst_lock);
1571         rt->dst.next = icmp6_dst_gc_list;
1572         icmp6_dst_gc_list = &rt->dst;
1573         spin_unlock_bh(&icmp6_dst_lock);
1574
1575         fib6_force_start_gc(net);
1576
1577         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1578
1579 out:
1580         return dst;
1581 }
1582
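/* Reap unreferenced entries on icmp6_dst_gc_list.  Returns the number of
 * entries still in use that could not be freed yet.
 */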
1583 int icmp6_dst_gc(void)
1584 {
1585         struct dst_entry *dst, **pprev;
1586         int more = 0;
1587
1588         spin_lock_bh(&icmp6_dst_lock);
1589         pprev = &icmp6_dst_gc_list;
1590
1591         while ((dst = *pprev) != NULL) {
1592                 if (!atomic_read(&dst->__refcnt)) {
1593                         *pprev = dst->next;
1594                         dst_free(dst);
1595                 } else {
1596                         pprev = &dst->next;
1597                         ++more;
1598                 }
1599         }
1600
1601         spin_unlock_bh(&icmp6_dst_lock);
1602
1603         return more;
1604 }
1605
1606 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1607                             void *arg)
1608 {
1609         struct dst_entry *dst, **pprev;
1610
1611         spin_lock_bh(&icmp6_dst_lock);
1612         pprev = &icmp6_dst_gc_list;
1613         while ((dst = *pprev) != NULL) {
1614                 struct rt6_info *rt = (struct rt6_info *) dst;
1615                 if (func(rt, arg)) {
1616                         *pprev = dst->next;
1617                         dst_free(dst);
1618                 } else {
1619                         pprev = &dst->next;
1620                 }
1621         }
1622         spin_unlock_bh(&icmp6_dst_lock);
1623 }
1624
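/* dst_ops->gc hook: run fib6 garbage collection once the minimum interval
 * has elapsed or the cache exceeds ip6_rt_max_size, with the aggressiveness
 * damped by ip6_rt_gc_expire/elasticity.  Returns nonzero while the cache
 * is still over the limit.
 */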
1625 static int ip6_dst_gc(struct dst_ops *ops)
1626 {
1627         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1628         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1629         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1630         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1631         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1632         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1633         int entries;
1634
1635         entries = dst_entries_get_fast(ops);
1636         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1637             entries <= rt_max_size)
1638                 goto out;
1639
1640         net->ipv6.ip6_rt_gc_expire++;
1641         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1642         entries = dst_entries_get_slow(ops);
1643         if (entries < ops->gc_thresh)
1644                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1645 out:
1646         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1647         return entries > rt_max_size;
1648 }
1649
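/* Convert the RTA_METRICS attributes in @cfg into the mx6_config metric
 * array, translating RTAX_CC_ALGO names into congestion-control keys.
 * Returns 0 on success (mxc->mx stays NULL when no metrics were supplied).
 */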
1650 static int ip6_convert_metrics(struct mx6_config *mxc,
1651                                const struct fib6_config *cfg)
1652 {
1653         struct nlattr *nla;
1654         int remaining;
1655         u32 *mp;
1656
1657         if (!cfg->fc_mx)
1658                 return 0;
1659
1660         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1661         if (unlikely(!mp))
1662                 return -ENOMEM;
1663
1664         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1665                 int type = nla_type(nla);
1666
1667                 if (type) {
1668                         u32 val;
1669
1670                         if (unlikely(type > RTAX_MAX))
1671                                 goto err;
1672                         if (type == RTAX_CC_ALGO) {
1673                                 char tmp[TCP_CA_NAME_MAX];
1674
1675                                 nla_strlcpy(tmp, nla, sizeof(tmp));
1676                                 val = tcp_ca_get_key_by_name(tmp);
1677                                 if (val == TCP_CA_UNSPEC)
1678                                         goto err;
1679                         } else {
1680                                 val = nla_get_u32(nla);
1681                         }
1682
1683                         mp[type - 1] = val;
1684                         __set_bit(type - 1, mxc->mx_valid);
1685                 }
1686         }
1687
1688         mxc->mx = mp;
1689
1690         return 0;
1691  err:
1692         kfree(mp);
1693         return -EINVAL;
1694 }
1695
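/* Core route insertion path shared by netlink, ioctl and addrconf: validate
 * the fib6_config, allocate and fill an rt6_info, resolve the output device
 * and gateway, then insert the route into the chosen table via
 * __ip6_ins_rt().
 *
 * Minimal usage sketch (illustrative only; 'prefix' and 'dev' are
 * hypothetical locals of the caller):
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT6_TABLE_MAIN,
 *		.fc_metric	= IP6_RT_PRIO_USER,
 *		.fc_ifindex	= dev->ifindex,
 *		.fc_dst_len	= 64,
 *		.fc_flags	= RTF_UP,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *	cfg.fc_dst = prefix;
 *	err = ip6_route_add(&cfg);
 */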
1696 int ip6_route_add(struct fib6_config *cfg)
1697 {
1698         int err;
1699         struct net *net = cfg->fc_nlinfo.nl_net;
1700         struct rt6_info *rt = NULL;
1701         struct net_device *dev = NULL;
1702         struct inet6_dev *idev = NULL;
1703         struct fib6_table *table;
1704         struct mx6_config mxc = { .mx = NULL, };
1705         int addr_type;
1706
1707         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1708                 return -EINVAL;
1709 #ifndef CONFIG_IPV6_SUBTREES
1710         if (cfg->fc_src_len)
1711                 return -EINVAL;
1712 #endif
1713         if (cfg->fc_ifindex) {
1714                 err = -ENODEV;
1715                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1716                 if (!dev)
1717                         goto out;
1718                 idev = in6_dev_get(dev);
1719                 if (!idev)
1720                         goto out;
1721         }
1722
1723         if (cfg->fc_metric == 0)
1724                 cfg->fc_metric = IP6_RT_PRIO_USER;
1725
1726         err = -ENOBUFS;
1727         if (cfg->fc_nlinfo.nlh &&
1728             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1729                 table = fib6_get_table(net, cfg->fc_table);
1730                 if (!table) {
1731                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1732                         table = fib6_new_table(net, cfg->fc_table);
1733                 }
1734         } else {
1735                 table = fib6_new_table(net, cfg->fc_table);
1736         }
1737
1738         if (!table)
1739                 goto out;
1740
1741         rt = ip6_dst_alloc(net, NULL,
1742                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1743
1744         if (!rt) {
1745                 err = -ENOMEM;
1746                 goto out;
1747         }
1748
1749         if (cfg->fc_flags & RTF_EXPIRES)
1750                 rt6_set_expires(rt, jiffies +
1751                                 clock_t_to_jiffies(cfg->fc_expires));
1752         else
1753                 rt6_clean_expires(rt);
1754
1755         if (cfg->fc_protocol == RTPROT_UNSPEC)
1756                 cfg->fc_protocol = RTPROT_BOOT;
1757         rt->rt6i_protocol = cfg->fc_protocol;
1758
1759         addr_type = ipv6_addr_type(&cfg->fc_dst);
1760
1761         if (addr_type & IPV6_ADDR_MULTICAST)
1762                 rt->dst.input = ip6_mc_input;
1763         else if (cfg->fc_flags & RTF_LOCAL)
1764                 rt->dst.input = ip6_input;
1765         else
1766                 rt->dst.input = ip6_forward;
1767
1768         rt->dst.output = ip6_output;
1769
1770         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1771         rt->rt6i_dst.plen = cfg->fc_dst_len;
1772         if (rt->rt6i_dst.plen == 128)
1773                 rt->dst.flags |= DST_HOST;
1774
1775 #ifdef CONFIG_IPV6_SUBTREES
1776         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1777         rt->rt6i_src.plen = cfg->fc_src_len;
1778 #endif
1779
1780         rt->rt6i_metric = cfg->fc_metric;
1781
1782         /* We cannot add true routes via loopback here;
1783            they would result in kernel looping.  Promote them to reject routes.
1784          */
1785         if ((cfg->fc_flags & RTF_REJECT) ||
1786             (dev && (dev->flags & IFF_LOOPBACK) &&
1787              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1788              !(cfg->fc_flags & RTF_LOCAL))) {
1789                 /* hold loopback dev/idev if we haven't done so. */
1790                 if (dev != net->loopback_dev) {
1791                         if (dev) {
1792                                 dev_put(dev);
1793                                 in6_dev_put(idev);
1794                         }
1795                         dev = net->loopback_dev;
1796                         dev_hold(dev);
1797                         idev = in6_dev_get(dev);
1798                         if (!idev) {
1799                                 err = -ENODEV;
1800                                 goto out;
1801                         }
1802                 }
1803                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1804                 switch (cfg->fc_type) {
1805                 case RTN_BLACKHOLE:
1806                         rt->dst.error = -EINVAL;
1807                         rt->dst.output = dst_discard_sk;
1808                         rt->dst.input = dst_discard;
1809                         break;
1810                 case RTN_PROHIBIT:
1811                         rt->dst.error = -EACCES;
1812                         rt->dst.output = ip6_pkt_prohibit_out;
1813                         rt->dst.input = ip6_pkt_prohibit;
1814                         break;
1815                 case RTN_THROW:
1816                 default:
1817                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1818                                         : -ENETUNREACH;
1819                         rt->dst.output = ip6_pkt_discard_out;
1820                         rt->dst.input = ip6_pkt_discard;
1821                         break;
1822                 }
1823                 goto install_route;
1824         }
1825
1826         if (cfg->fc_flags & RTF_GATEWAY) {
1827                 const struct in6_addr *gw_addr;
1828                 int gwa_type;
1829
1830                 gw_addr = &cfg->fc_gateway;
1831                 gwa_type = ipv6_addr_type(gw_addr);
1832
1833         /* If gw_addr is local we will fail to detect this while the
1834          * address is still TENTATIVE (DAD in progress): rt6_lookup()
1835          * will return the already-added prefix route via the interface
1836          * the prefix route was assigned to, which might be non-loopback.
1837          */
1838                 err = -EINVAL;
1839                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1840                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1841                                             dev : NULL, 0, 0))
1842                         goto out;
1843
1844                 rt->rt6i_gateway = *gw_addr;
1845
1846                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1847                         struct rt6_info *grt;
1848
1849                         /* IPv6 strictly inhibits using non-link-local
1850                            addresses as nexthop addresses.
1851                            Otherwise, the router will not be able to send
1852                            redirects.  That is usually right, but in some
1853                            (rare!) circumstances (SIT, PtP, NBMA NOARP links)
1854                            it is handy to allow some exceptions. --ANK
1855                          */
1856                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1857                                 goto out;
1858
1859                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1860
1861                         err = -EHOSTUNREACH;
1862                         if (!grt)
1863                                 goto out;
1864                         if (dev) {
1865                                 if (dev != grt->dst.dev) {
1866                                         ip6_rt_put(grt);
1867                                         goto out;
1868                                 }
1869                         } else {
1870                                 dev = grt->dst.dev;
1871                                 idev = grt->rt6i_idev;
1872                                 dev_hold(dev);
1873                                 in6_dev_hold(grt->rt6i_idev);
1874                         }
1875                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1876                                 err = 0;
1877                         ip6_rt_put(grt);
1878
1879                         if (err)
1880                                 goto out;
1881                 }
1882                 err = -EINVAL;
1883                 if (!dev || (dev->flags & IFF_LOOPBACK))
1884                         goto out;
1885         }
1886
1887         err = -ENODEV;
1888         if (!dev)
1889                 goto out;
1890
1891         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1892                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1893                         err = -EINVAL;
1894                         goto out;
1895                 }
1896                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1897                 rt->rt6i_prefsrc.plen = 128;
1898         } else
1899                 rt->rt6i_prefsrc.plen = 0;
1900
1901         rt->rt6i_flags = cfg->fc_flags;
1902
1903 install_route:
1904         rt->dst.dev = dev;
1905         rt->rt6i_idev = idev;
1906         rt->rt6i_table = table;
1907
1908         cfg->fc_nlinfo.nl_net = dev_net(dev);
1909
1910         err = ip6_convert_metrics(&mxc, cfg);
1911         if (err)
1912                 goto out;
1913
1914         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1915
1916         kfree(mxc.mx);
1917         return err;
1918 out:
1919         if (dev)
1920                 dev_put(dev);
1921         if (idev)
1922                 in6_dev_put(idev);
1923         if (rt)
1924                 dst_free(&rt->dst);
1925         return err;
1926 }
1927
1928 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1929 {
1930         int err;
1931         struct fib6_table *table;
1932         struct net *net = dev_net(rt->dst.dev);
1933
1934         if (rt == net->ipv6.ip6_null_entry) {
1935                 err = -ENOENT;
1936                 goto out;
1937         }
1938
1939         table = rt->rt6i_table;
1940         write_lock_bh(&table->tb6_lock);
1941         err = fib6_del(rt, info);
1942         write_unlock_bh(&table->tb6_lock);
1943
1944 out:
1945         ip6_rt_put(rt);
1946         return err;
1947 }
1948
1949 int ip6_del_rt(struct rt6_info *rt)
1950 {
1951         struct nl_info info = {
1952                 .nl_net = dev_net(rt->dst.dev),
1953         };
1954         return __ip6_del_rt(rt, &info);
1955 }
1956
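/* Delete the route described by @cfg: locate the matching fib6 node, find
 * the first leaf whose device, gateway and metric all match, and remove it
 * via __ip6_del_rt().  Returns -ESRCH when nothing matches.
 */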
1957 static int ip6_route_del(struct fib6_config *cfg)
1958 {
1959         struct fib6_table *table;
1960         struct fib6_node *fn;
1961         struct rt6_info *rt;
1962         int err = -ESRCH;
1963
1964         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1965         if (!table)
1966                 return err;
1967
1968         read_lock_bh(&table->tb6_lock);
1969
1970         fn = fib6_locate(&table->tb6_root,
1971                          &cfg->fc_dst, cfg->fc_dst_len,
1972                          &cfg->fc_src, cfg->fc_src_len);
1973
1974         if (fn) {
1975                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1976                         if ((rt->rt6i_flags & RTF_CACHE) &&
1977                             !(cfg->fc_flags & RTF_CACHE))
1978                                 continue;
1979                         if (cfg->fc_ifindex &&
1980                             (!rt->dst.dev ||
1981                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1982                                 continue;
1983                         if (cfg->fc_flags & RTF_GATEWAY &&
1984                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1985                                 continue;
1986                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1987                                 continue;
1988                         dst_hold(&rt->dst);
1989                         read_unlock_bh(&table->tb6_lock);
1990
1991                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1992                 }
1993         }
1994         read_unlock_bh(&table->tb6_lock);
1995
1996         return err;
1997 }
1998
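/* Handle a received ICMPv6 Redirect for @dst: validate the message and its
 * ND options, update the neighbour cache entry for the new target, and
 * install an RTF_CACHE clone pointing at the new first hop.
 */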
1999 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2000 {
2001         struct net *net = dev_net(skb->dev);
2002         struct netevent_redirect netevent;
2003         struct rt6_info *rt, *nrt = NULL;
2004         struct ndisc_options ndopts;
2005         struct inet6_dev *in6_dev;
2006         struct neighbour *neigh;
2007         struct rd_msg *msg;
2008         int optlen, on_link;
2009         u8 *lladdr;
2010
2011         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2012         optlen -= sizeof(*msg);
2013
2014         if (optlen < 0) {
2015                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2016                 return;
2017         }
2018
2019         msg = (struct rd_msg *)icmp6_hdr(skb);
2020
2021         if (ipv6_addr_is_multicast(&msg->dest)) {
2022                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2023                 return;
2024         }
2025
2026         on_link = 0;
2027         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2028                 on_link = 1;
2029         } else if (ipv6_addr_type(&msg->target) !=
2030                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2031                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2032                 return;
2033         }
2034
2035         in6_dev = __in6_dev_get(skb->dev);
2036         if (!in6_dev)
2037                 return;
2038         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2039                 return;
2040
2041         /* RFC2461 8.1:
2042          *      The IP source address of the Redirect MUST be the same as the current
2043          *      first-hop router for the specified ICMP Destination Address.
2044          */
2045
2046         if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2047                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2048                 return;
2049         }
2050
2051         lladdr = NULL;
2052         if (ndopts.nd_opts_tgt_lladdr) {
2053                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2054                                              skb->dev);
2055                 if (!lladdr) {
2056                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2057                         return;
2058                 }
2059         }
2060
2061         rt = (struct rt6_info *) dst;
2062         if (rt == net->ipv6.ip6_null_entry) {
2063                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2064                 return;
2065         }
2066
2067         /* Redirect received -> path was valid.
2068          * Redirects are sent only in response to data packets,
2069          * so this nexthop is apparently reachable. --ANK
2070          */
2071         dst_confirm(&rt->dst);
2072
2073         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2074         if (!neigh)
2075                 return;
2076
2077         /*
2078          *      We have finally decided to accept it.
2079          */
2080
2081         neigh_update(neigh, lladdr, NUD_STALE,
2082                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2083                      NEIGH_UPDATE_F_OVERRIDE|
2084                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2085                                      NEIGH_UPDATE_F_ISROUTER))
2086                      );
2087
2088         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2089         if (!nrt)
2090                 goto out;
2091
2092         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2093         if (on_link)
2094                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2095
2096         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2097
2098         if (ip6_ins_rt(nrt))
2099                 goto out;
2100
2101         netevent.old = &rt->dst;
2102         netevent.new = &nrt->dst;
2103         netevent.daddr = &msg->dest;
2104         netevent.neigh = neigh;
2105         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2106
2107         if (rt->rt6i_flags & RTF_CACHE) {
2108                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2109                 ip6_del_rt(rt);
2110         }
2111
2112 out:
2113         neigh_release(neigh);
2114 }
2115
2116 /*
2117  *      Misc support functions
2118  */
2119
2120 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2121 {
2122         BUG_ON(from->dst.from);
2123
2124         rt->rt6i_flags &= ~RTF_EXPIRES;
2125         dst_hold(&from->dst);
2126         rt->dst.from = &from->dst;
2127         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2128 }
2129
2130 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2131 {
2132         rt->dst.input = ort->dst.input;
2133         rt->dst.output = ort->dst.output;
2134         rt->rt6i_dst = ort->rt6i_dst;
2135         rt->dst.error = ort->dst.error;
2136         rt->rt6i_idev = ort->rt6i_idev;
2137         if (rt->rt6i_idev)
2138                 in6_dev_hold(rt->rt6i_idev);
2139         rt->dst.lastuse = jiffies;
2140         rt->rt6i_gateway = ort->rt6i_gateway;
2141         rt->rt6i_flags = ort->rt6i_flags;
2142         rt6_set_from(rt, ort);
2143         rt->rt6i_metric = ort->rt6i_metric;
2144 #ifdef CONFIG_IPV6_SUBTREES
2145         rt->rt6i_src = ort->rt6i_src;
2146 #endif
2147         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2148         rt->rt6i_table = ort->rt6i_table;
2149 }
2150
2151 #ifdef CONFIG_IPV6_ROUTE_INFO
2152 static struct rt6_info *rt6_get_route_info(struct net *net,
2153                                            const struct in6_addr *prefix, int prefixlen,
2154                                            const struct in6_addr *gwaddr, int ifindex)
2155 {
2156         struct fib6_node *fn;
2157         struct rt6_info *rt = NULL;
2158         struct fib6_table *table;
2159
2160         table = fib6_get_table(net, RT6_TABLE_INFO);
2161         if (!table)
2162                 return NULL;
2163
2164         read_lock_bh(&table->tb6_lock);
2165         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2166         if (!fn)
2167                 goto out;
2168
2169         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2170                 if (rt->dst.dev->ifindex != ifindex)
2171                         continue;
2172                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2173                         continue;
2174                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2175                         continue;
2176                 dst_hold(&rt->dst);
2177                 break;
2178         }
2179 out:
2180         read_unlock_bh(&table->tb6_lock);
2181         return rt;
2182 }
2183
2184 static struct rt6_info *rt6_add_route_info(struct net *net,
2185                                            const struct in6_addr *prefix, int prefixlen,
2186                                            const struct in6_addr *gwaddr, int ifindex,
2187                                            unsigned int pref)
2188 {
2189         struct fib6_config cfg = {
2190                 .fc_table       = RT6_TABLE_INFO,
2191                 .fc_metric      = IP6_RT_PRIO_USER,
2192                 .fc_ifindex     = ifindex,
2193                 .fc_dst_len     = prefixlen,
2194                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2195                                   RTF_UP | RTF_PREF(pref),
2196                 .fc_nlinfo.portid = 0,
2197                 .fc_nlinfo.nlh = NULL,
2198                 .fc_nlinfo.nl_net = net,
2199         };
2200
2201         cfg.fc_dst = *prefix;
2202         cfg.fc_gateway = *gwaddr;
2203
2204         /* We should treat it as a default route if prefix length is 0. */
2205         if (!prefixlen)
2206                 cfg.fc_flags |= RTF_DEFAULT;
2207
2208         ip6_route_add(&cfg);
2209
2210         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2211 }
2212 #endif
2213
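/* Look up the RA-learned (RTF_ADDRCONF | RTF_DEFAULT) default route through
 * @dev whose gateway equals @addr, taking a reference on it if found.
 */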
2214 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2215 {
2216         struct rt6_info *rt;
2217         struct fib6_table *table;
2218
2219         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2220         if (!table)
2221                 return NULL;
2222
2223         read_lock_bh(&table->tb6_lock);
2224         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2225                 if (dev == rt->dst.dev &&
2226                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2227                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2228                         break;
2229         }
2230         if (rt)
2231                 dst_hold(&rt->dst);
2232         read_unlock_bh(&table->tb6_lock);
2233         return rt;
2234 }
2235
2236 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2237                                      struct net_device *dev,
2238                                      unsigned int pref)
2239 {
2240         struct fib6_config cfg = {
2241                 .fc_table       = RT6_TABLE_DFLT,
2242                 .fc_metric      = IP6_RT_PRIO_USER,
2243                 .fc_ifindex     = dev->ifindex,
2244                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2245                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2246                 .fc_nlinfo.portid = 0,
2247                 .fc_nlinfo.nlh = NULL,
2248                 .fc_nlinfo.nl_net = dev_net(dev),
2249         };
2250
2251         cfg.fc_gateway = *gwaddr;
2252
2253         ip6_route_add(&cfg);
2254
2255         return rt6_get_dflt_router(gwaddr, dev);
2256 }
2257
2258 void rt6_purge_dflt_routers(struct net *net)
2259 {
2260         struct rt6_info *rt;
2261         struct fib6_table *table;
2262
2263         /* NOTE: Keep consistent with rt6_get_dflt_router */
2264         table = fib6_get_table(net, RT6_TABLE_DFLT);
2265         if (!table)
2266                 return;
2267
2268 restart:
2269         read_lock_bh(&table->tb6_lock);
2270         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2271                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2272                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2273                         dst_hold(&rt->dst);
2274                         read_unlock_bh(&table->tb6_lock);
2275                         ip6_del_rt(rt);
2276                         goto restart;
2277                 }
2278         }
2279         read_unlock_bh(&table->tb6_lock);
2280 }
2281
2282 static void rtmsg_to_fib6_config(struct net *net,
2283                                  struct in6_rtmsg *rtmsg,
2284                                  struct fib6_config *cfg)
2285 {
2286         memset(cfg, 0, sizeof(*cfg));
2287
2288         cfg->fc_table = RT6_TABLE_MAIN;
2289         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2290         cfg->fc_metric = rtmsg->rtmsg_metric;
2291         cfg->fc_expires = rtmsg->rtmsg_info;
2292         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2293         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2294         cfg->fc_flags = rtmsg->rtmsg_flags;
2295
2296         cfg->fc_nlinfo.nl_net = net;
2297
2298         cfg->fc_dst = rtmsg->rtmsg_dst;
2299         cfg->fc_src = rtmsg->rtmsg_src;
2300         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2301 }
2302
2303 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2304 {
2305         struct fib6_config cfg;
2306         struct in6_rtmsg rtmsg;
2307         int err;
2308
2309         switch (cmd) {
2310         case SIOCADDRT:         /* Add a route */
2311         case SIOCDELRT:         /* Delete a route */
2312                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2313                         return -EPERM;
2314                 err = copy_from_user(&rtmsg, arg,
2315                                      sizeof(struct in6_rtmsg));
2316                 if (err)
2317                         return -EFAULT;
2318
2319                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2320
2321                 rtnl_lock();
2322                 switch (cmd) {
2323                 case SIOCADDRT:
2324                         err = ip6_route_add(&cfg);
2325                         break;
2326                 case SIOCDELRT:
2327                         err = ip6_route_del(&cfg);
2328                         break;
2329                 default:
2330                         err = -EINVAL;
2331                 }
2332                 rtnl_unlock();
2333
2334                 return err;
2335         }
2336
2337         return -EINVAL;
2338 }
2339
2340 /*
2341  *      Drop the packet on the floor
2342  */
2343
2344 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2345 {
2346         int type;
2347         struct dst_entry *dst = skb_dst(skb);
2348         switch (ipstats_mib_noroutes) {
2349         case IPSTATS_MIB_INNOROUTES:
2350                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2351                 if (type == IPV6_ADDR_ANY) {
2352                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2353                                       IPSTATS_MIB_INADDRERRORS);
2354                         break;
2355                 }
2356                 /* FALLTHROUGH */
2357         case IPSTATS_MIB_OUTNOROUTES:
2358                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2359                               ipstats_mib_noroutes);
2360                 break;
2361         }
2362         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2363         kfree_skb(skb);
2364         return 0;
2365 }
2366
2367 static int ip6_pkt_discard(struct sk_buff *skb)
2368 {
2369         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2370 }
2371
2372 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2373 {
2374         skb->dev = skb_dst(skb)->dev;
2375         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2376 }
2377
2378 static int ip6_pkt_prohibit(struct sk_buff *skb)
2379 {
2380         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2381 }
2382
2383 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2384 {
2385         skb->dev = skb_dst(skb)->dev;
2386         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2387 }
2388
2389 /*
2390  *      Allocate a dst for local (unicast / anycast) address.
2391  */
2392
2393 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2394                                     const struct in6_addr *addr,
2395                                     bool anycast)
2396 {
2397         struct net *net = dev_net(idev->dev);
2398         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2399                                             DST_NOCOUNT);
2400         if (!rt)
2401                 return ERR_PTR(-ENOMEM);
2402
2403         in6_dev_hold(idev);
2404
2405         rt->dst.flags |= DST_HOST;
2406         rt->dst.input = ip6_input;
2407         rt->dst.output = ip6_output;
2408         rt->rt6i_idev = idev;
2409
2410         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2411         if (anycast)
2412                 rt->rt6i_flags |= RTF_ANYCAST;
2413         else
2414                 rt->rt6i_flags |= RTF_LOCAL;
2415
2416         rt->rt6i_gateway  = *addr;
2417         rt->rt6i_dst.addr = *addr;
2418         rt->rt6i_dst.plen = 128;
2419         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2420
2421         atomic_set(&rt->dst.__refcnt, 1);
2422
2423         return rt;
2424 }
2425
2426 int ip6_route_get_saddr(struct net *net,
2427                         struct rt6_info *rt,
2428                         const struct in6_addr *daddr,
2429                         unsigned int prefs,
2430                         struct in6_addr *saddr)
2431 {
2432         struct inet6_dev *idev =
2433                 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2434         int err = 0;
2435         if (rt && rt->rt6i_prefsrc.plen)
2436                 *saddr = rt->rt6i_prefsrc.addr;
2437         else
2438                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2439                                          daddr, prefs, saddr);
2440         return err;
2441 }
2442
2443 /* Remove a deleted IP address from prefsrc entries. */
2444 struct arg_dev_net_ip {
2445         struct net_device *dev;
2446         struct net *net;
2447         struct in6_addr *addr;
2448 };
2449
2450 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2451 {
2452         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2453         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2454         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2455
2456         if (((void *)rt->dst.dev == dev || !dev) &&
2457             rt != net->ipv6.ip6_null_entry &&
2458             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2459                 /* remove prefsrc entry */
2460                 rt->rt6i_prefsrc.plen = 0;
2461         }
2462         return 0;
2463 }
2464
2465 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2466 {
2467         struct net *net = dev_net(ifp->idev->dev);
2468         struct arg_dev_net_ip adni = {
2469                 .dev = ifp->idev->dev,
2470                 .net = net,
2471                 .addr = &ifp->addr,
2472         };
2473         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2474 }
2475
2476 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2477 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2478
2479 /* Remove routers and update dst entries when a gateway turns into a host. */
2480 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2481 {
2482         struct in6_addr *gateway = (struct in6_addr *)arg;
2483
2484         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2485              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2486              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2487                 return -1;
2488         }
2489         return 0;
2490 }
2491
2492 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2493 {
2494         fib6_clean_all(net, fib6_clean_tohost, gateway);
2495 }
2496
2497 struct arg_dev_net {
2498         struct net_device *dev;
2499         struct net *net;
2500 };
2501
2502 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2503 {
2504         const struct arg_dev_net *adn = arg;
2505         const struct net_device *dev = adn->dev;
2506
2507         if ((rt->dst.dev == dev || !dev) &&
2508             rt != adn->net->ipv6.ip6_null_entry)
2509                 return -1;
2510
2511         return 0;
2512 }
2513
2514 void rt6_ifdown(struct net *net, struct net_device *dev)
2515 {
2516         struct arg_dev_net adn = {
2517                 .dev = dev,
2518                 .net = net,
2519         };
2520
2521         fib6_clean_all(net, fib6_ifdown, &adn);
2522         icmp6_clean_all(fib6_ifdown, &adn);
2523         rt6_uncached_list_flush_dev(net, dev);
2524 }
2525
2526 struct rt6_mtu_change_arg {
2527         struct net_device *dev;
2528         unsigned int mtu;
2529 };
2530
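/* fib6_clean_all() callback for rt6_mtu_change(): adjust the cached PMTU or
 * the RTAX_MTU metric of routes that leave through the device whose MTU
 * changed.
 */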
2531 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2532 {
2533         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2534         struct inet6_dev *idev;
2535
2536         /* In IPv6, PMTU discovery is not optional,
2537            so an RTAX_MTU lock cannot disable it.
2538            We still use this lock to block changes
2539            caused by addrconf/ndisc.
2540         */
2541
2542         idev = __in6_dev_get(arg->dev);
2543         if (!idev)
2544                 return 0;
2545
2546         /* After an administrative MTU increase, there is no way to
2547            discover the IPv6 PMTU increase, so the increase must be
2548            applied here. Since RFC 1981 doesn't cover administrative MTU
2549            increases, updating the PMTU here is a MUST (e.g. jumbo frames).
2550          */
2551         /*
2552            If the new MTU is less than the route PMTU, the new MTU will be
2553            the lowest MTU in the path; update the route PMTU to reflect the
2554            decrease. If the new MTU is greater than the route PMTU and the
2555            old MTU was the lowest MTU in the path, update the route PMTU to
2556            reflect the increase. If another node's MTU then becomes the
2557            lowest in the path, a Packet Too Big message will trigger PMTU
2558            discovery again.
2559          */
2560         if (rt->dst.dev == arg->dev &&
2561             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2562                 if (rt->rt6i_flags & RTF_CACHE) {
2563                         /* For RTF_CACHE with rt6i_pmtu == 0
2564                          * (i.e. a redirected route),
2565                          * the metrics of its rt->dst.from has already
2566                          * been updated.
2567                          */
2568                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2569                                 rt->rt6i_pmtu = arg->mtu;
2570                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2571                            (dst_mtu(&rt->dst) < arg->mtu &&
2572                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2573                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2574                 }
2575         }
2576         return 0;
2577 }
2578
2579 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2580 {
2581         struct rt6_mtu_change_arg arg = {
2582                 .dev = dev,
2583                 .mtu = mtu,
2584         };
2585
2586         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2587 }
2588
2589 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2590         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2591         [RTA_OIF]               = { .type = NLA_U32 },
2592         [RTA_IIF]               = { .type = NLA_U32 },
2593         [RTA_PRIORITY]          = { .type = NLA_U32 },
2594         [RTA_METRICS]           = { .type = NLA_NESTED },
2595         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2596         [RTA_PREF]              = { .type = NLA_U8 },
2597 };
2598
2599 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2600                               struct fib6_config *cfg)
2601 {
2602         struct rtmsg *rtm;
2603         struct nlattr *tb[RTA_MAX+1];
2604         unsigned int pref;
2605         int err;
2606
2607         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2608         if (err < 0)
2609                 goto errout;
2610
2611         err = -EINVAL;
2612         rtm = nlmsg_data(nlh);
2613         memset(cfg, 0, sizeof(*cfg));
2614
2615         cfg->fc_table = rtm->rtm_table;
2616         cfg->fc_dst_len = rtm->rtm_dst_len;
2617         cfg->fc_src_len = rtm->rtm_src_len;
2618         cfg->fc_flags = RTF_UP;
2619         cfg->fc_protocol = rtm->rtm_protocol;
2620         cfg->fc_type = rtm->rtm_type;
2621
2622         if (rtm->rtm_type == RTN_UNREACHABLE ||
2623             rtm->rtm_type == RTN_BLACKHOLE ||
2624             rtm->rtm_type == RTN_PROHIBIT ||
2625             rtm->rtm_type == RTN_THROW)
2626                 cfg->fc_flags |= RTF_REJECT;
2627
2628         if (rtm->rtm_type == RTN_LOCAL)
2629                 cfg->fc_flags |= RTF_LOCAL;
2630
2631         if (rtm->rtm_flags & RTM_F_CLONED)
2632                 cfg->fc_flags |= RTF_CACHE;
2633
2634         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2635         cfg->fc_nlinfo.nlh = nlh;
2636         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2637
2638         if (tb[RTA_GATEWAY]) {
2639                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2640                 cfg->fc_flags |= RTF_GATEWAY;
2641         }
2642
2643         if (tb[RTA_DST]) {
2644                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2645
2646                 if (nla_len(tb[RTA_DST]) < plen)
2647                         goto errout;
2648
2649                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2650         }
2651
2652         if (tb[RTA_SRC]) {
2653                 int plen = (rtm->rtm_src_len + 7) >> 3;
2654
2655                 if (nla_len(tb[RTA_SRC]) < plen)
2656                         goto errout;
2657
2658                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2659         }
2660
2661         if (tb[RTA_PREFSRC])
2662                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2663
2664         if (tb[RTA_OIF])
2665                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2666
2667         if (tb[RTA_PRIORITY])
2668                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2669
2670         if (tb[RTA_METRICS]) {
2671                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2672                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2673         }
2674
2675         if (tb[RTA_TABLE])
2676                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2677
2678         if (tb[RTA_MULTIPATH]) {
2679                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2680                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2681         }
2682
2683         if (tb[RTA_PREF]) {
2684                 pref = nla_get_u8(tb[RTA_PREF]);
2685                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2686                     pref != ICMPV6_ROUTER_PREF_HIGH)
2687                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2688                 cfg->fc_flags |= RTF_PREF(pref);
2689         }
2690
2691         err = 0;
2692 errout:
2693         return err;
2694 }
2695
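/* Walk the RTA_MULTIPATH nexthop list and add or delete one route per
 * nexthop.  If an add fails part-way through, the nexthops added so far
 * are rolled back by switching to delete mode and rewinding to the start
 * of the list.
 */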
2696 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2697 {
2698         struct fib6_config r_cfg;
2699         struct rtnexthop *rtnh;
2700         int remaining;
2701         int attrlen;
2702         int err = 0, last_err = 0;
2703
2704         remaining = cfg->fc_mp_len;
2705 beginning:
2706         rtnh = (struct rtnexthop *)cfg->fc_mp;
2707
2708         /* Parse a Multipath Entry */
2709         while (rtnh_ok(rtnh, remaining)) {
2710                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2711                 if (rtnh->rtnh_ifindex)
2712                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2713
2714                 attrlen = rtnh_attrlen(rtnh);
2715                 if (attrlen > 0) {
2716                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2717
2718                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2719                         if (nla) {
2720                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2721                                 r_cfg.fc_flags |= RTF_GATEWAY;
2722                         }
2723                 }
2724                 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2725                 if (err) {
2726                         last_err = err;
2727                         /* If we are trying to remove a route, do not stop the
2728                          * loop when ip6_route_del() fails (because the nexthop
2729                          * is already gone); we should try to remove all nexthops.
2730                          */
2731                         if (add) {
2732                                 /* If add fails, we should try to delete all
2733                                  * next hops that have been already added.
2734                                  */
2735                                 add = 0;
2736                                 remaining = cfg->fc_mp_len - remaining;
2737                                 goto beginning;
2738                         }
2739                 }
2740                 /* Because each route is added as a single route, we clear
2741                  * these flags after the first nexthop: if there is a collision,
2742                  * we have already failed to add the first nexthop because
2743                  * fib6_add_rt2node() rejected it; when replacing, the old
2744                  * nexthops have been replaced by the first new one, and the
2745                  * rest should be added to it.
2746                  */
2747                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2748                                                      NLM_F_REPLACE);
2749                 rtnh = rtnh_next(rtnh, &remaining);
2750         }
2751
2752         return last_err;
2753 }
2754
2755 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2756 {
2757         struct fib6_config cfg;
2758         int err;
2759
2760         err = rtm_to_fib6_config(skb, nlh, &cfg);
2761         if (err < 0)
2762                 return err;
2763
2764         if (cfg.fc_mp)
2765                 return ip6_route_multipath(&cfg, 0);
2766         else
2767                 return ip6_route_del(&cfg);
2768 }
2769
2770 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2771 {
2772         struct fib6_config cfg;
2773         int err;
2774
2775         err = rtm_to_fib6_config(skb, nlh, &cfg);
2776         if (err < 0)
2777                 return err;
2778
2779         if (cfg.fc_mp)
2780                 return ip6_route_multipath(&cfg, 1);
2781         else
2782                 return ip6_route_add(&cfg);
2783 }
2784
2785 static inline size_t rt6_nlmsg_size(void)
2786 {
2787         return NLMSG_ALIGN(sizeof(struct rtmsg))
2788                + nla_total_size(16) /* RTA_SRC */
2789                + nla_total_size(16) /* RTA_DST */
2790                + nla_total_size(16) /* RTA_GATEWAY */
2791                + nla_total_size(16) /* RTA_PREFSRC */
2792                + nla_total_size(4) /* RTA_TABLE */
2793                + nla_total_size(4) /* RTA_IIF */
2794                + nla_total_size(4) /* RTA_OIF */
2795                + nla_total_size(4) /* RTA_PRIORITY */
2796                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2797                + nla_total_size(sizeof(struct rta_cacheinfo))
2798                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2799                + nla_total_size(1); /* RTA_PREF */
2800 }
2801
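/* Build an RTM_NEWROUTE/RTM_DELROUTE message for @rt into @skb.  With
 * @prefix set, non-RTF_PREFIX_RT routes are silently skipped (return 1);
 * otherwise returns 0 on success or -EMSGSIZE when the skb runs out of
 * room.
 */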
2802 static int rt6_fill_node(struct net *net,
2803                          struct sk_buff *skb, struct rt6_info *rt,
2804                          struct in6_addr *dst, struct in6_addr *src,
2805                          int iif, int type, u32 portid, u32 seq,
2806                          int prefix, int nowait, unsigned int flags)
2807 {
2808         u32 metrics[RTAX_MAX];
2809         struct rtmsg *rtm;
2810         struct nlmsghdr *nlh;
2811         long expires;
2812         u32 table;
2813
2814         if (prefix) {   /* user wants prefix routes only */
2815                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2816                         /* success since this is not a prefix route */
2817                         return 1;
2818                 }
2819         }
2820
2821         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2822         if (!nlh)
2823                 return -EMSGSIZE;
2824
2825         rtm = nlmsg_data(nlh);
2826         rtm->rtm_family = AF_INET6;
2827         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2828         rtm->rtm_src_len = rt->rt6i_src.plen;
2829         rtm->rtm_tos = 0;
2830         if (rt->rt6i_table)
2831                 table = rt->rt6i_table->tb6_id;
2832         else
2833                 table = RT6_TABLE_UNSPEC;
2834         rtm->rtm_table = table;
2835         if (nla_put_u32(skb, RTA_TABLE, table))
2836                 goto nla_put_failure;
2837         if (rt->rt6i_flags & RTF_REJECT) {
2838                 switch (rt->dst.error) {
2839                 case -EINVAL:
2840                         rtm->rtm_type = RTN_BLACKHOLE;
2841                         break;
2842                 case -EACCES:
2843                         rtm->rtm_type = RTN_PROHIBIT;
2844                         break;
2845                 case -EAGAIN:
2846                         rtm->rtm_type = RTN_THROW;
2847                         break;
2848                 default:
2849                         rtm->rtm_type = RTN_UNREACHABLE;
2850                         break;
2851                 }
2852         }
2853         else if (rt->rt6i_flags & RTF_LOCAL)
2854                 rtm->rtm_type = RTN_LOCAL;
2855         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2856                 rtm->rtm_type = RTN_LOCAL;
2857         else
2858                 rtm->rtm_type = RTN_UNICAST;
2859         rtm->rtm_flags = 0;
2860         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2861         rtm->rtm_protocol = rt->rt6i_protocol;
2862         if (rt->rt6i_flags & RTF_DYNAMIC)
2863                 rtm->rtm_protocol = RTPROT_REDIRECT;
2864         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2865                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2866                         rtm->rtm_protocol = RTPROT_RA;
2867                 else
2868                         rtm->rtm_protocol = RTPROT_KERNEL;
2869         }
2870
2871         if (rt->rt6i_flags & RTF_CACHE)
2872                 rtm->rtm_flags |= RTM_F_CLONED;
2873
2874         if (dst) {
2875                 if (nla_put_in6_addr(skb, RTA_DST, dst))
2876                         goto nla_put_failure;
2877                 rtm->rtm_dst_len = 128;
2878         } else if (rtm->rtm_dst_len)
2879                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2880                         goto nla_put_failure;
2881 #ifdef CONFIG_IPV6_SUBTREES
2882         if (src) {
2883                 if (nla_put_in6_addr(skb, RTA_SRC, src))
2884                         goto nla_put_failure;
2885                 rtm->rtm_src_len = 128;
2886         } else if (rtm->rtm_src_len &&
2887                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2888                 goto nla_put_failure;
2889 #endif
2890         if (iif) {
2891 #ifdef CONFIG_IPV6_MROUTE
2892                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2893                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2894                         if (err <= 0) {
2895                                 if (!nowait) {
2896                                         if (err == 0)
2897                                                 return 0;
2898                                         goto nla_put_failure;
2899                                 } else {
2900                                         if (err == -EMSGSIZE)
2901                                                 goto nla_put_failure;
2902                                 }
2903                         }
2904                 } else
2905 #endif
2906                         if (nla_put_u32(skb, RTA_IIF, iif))
2907                                 goto nla_put_failure;
2908         } else if (dst) {
2909                 struct in6_addr saddr_buf;
2910                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2911                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2912                         goto nla_put_failure;
2913         }
2914
2915         if (rt->rt6i_prefsrc.plen) {
2916                 struct in6_addr saddr_buf;
2917                 saddr_buf = rt->rt6i_prefsrc.addr;
2918                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2919                         goto nla_put_failure;
2920         }
2921
2922         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2923         if (rt->rt6i_pmtu)
2924                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2925         if (rtnetlink_put_metrics(skb, metrics) < 0)
2926                 goto nla_put_failure;
2927
2928         if (rt->rt6i_flags & RTF_GATEWAY) {
2929                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2930                         goto nla_put_failure;
2931         }
2932
2933         if (rt->dst.dev &&
2934             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2935                 goto nla_put_failure;
2936         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2937                 goto nla_put_failure;
2938
2939         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2940
2941         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2942                 goto nla_put_failure;
2943
2944         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2945                 goto nla_put_failure;
2946
2947         nlmsg_end(skb, nlh);
2948         return 0;
2949
2950 nla_put_failure:
2951         nlmsg_cancel(skb, nlh);
2952         return -EMSGSIZE;
2953 }
2954
2955 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2956 {
2957         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2958         int prefix;
2959
2960         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2961                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2962                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2963         } else
2964                 prefix = 0;
2965
2966         return rt6_fill_node(arg->net,
2967                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2968                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2969                      prefix, 0, NLM_F_MULTI);
2970 }
2971
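/* RTM_GETROUTE handler: build a flow from the request attributes, resolve
 * it through the input or output lookup path, and reply to the requester
 * with a single rt6_fill_node()-formatted route.
 */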
2972 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2973 {
2974         struct net *net = sock_net(in_skb->sk);
2975         struct nlattr *tb[RTA_MAX+1];
2976         struct rt6_info *rt;
2977         struct sk_buff *skb;
2978         struct rtmsg *rtm;
2979         struct flowi6 fl6;
2980         int err, iif = 0, oif = 0;
2981
2982         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2983         if (err < 0)
2984                 goto errout;
2985
2986         err = -EINVAL;
2987         memset(&fl6, 0, sizeof(fl6));
2988
2989         if (tb[RTA_SRC]) {
2990                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2991                         goto errout;
2992
2993                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2994         }
2995
2996         if (tb[RTA_DST]) {
2997                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2998                         goto errout;
2999
3000                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3001         }
3002
3003         if (tb[RTA_IIF])
3004                 iif = nla_get_u32(tb[RTA_IIF]);
3005
3006         if (tb[RTA_OIF])
3007                 oif = nla_get_u32(tb[RTA_OIF]);
3008
3009         if (tb[RTA_MARK])
3010                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3011
3012         if (iif) {
3013                 struct net_device *dev;
3014                 int flags = 0;
3015
3016                 dev = __dev_get_by_index(net, iif);
3017                 if (!dev) {
3018                         err = -ENODEV;
3019                         goto errout;
3020                 }
3021
3022                 fl6.flowi6_iif = iif;
3023
3024                 if (!ipv6_addr_any(&fl6.saddr))
3025                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3026
3027                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3028                                                                flags);
3029         } else {
3030                 fl6.flowi6_oif = oif;
3031
3032                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3033         }
3034
3035         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3036         if (!skb) {
3037                 ip6_rt_put(rt);
3038                 err = -ENOBUFS;
3039                 goto errout;
3040         }
3041
3042         /* Reserve room for dummy headers; this skb can pass
3043          * through a good chunk of the routing engine.
3044          */
3045         skb_reset_mac_header(skb);
3046         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3047
3048         skb_dst_set(skb, &rt->dst);
3049
3050         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3051                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3052                             nlh->nlmsg_seq, 0, 0, 0);
3053         if (err < 0) {
3054                 kfree_skb(skb);
3055                 goto errout;
3056         }
3057
3058         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3059 errout:
3060         return err;
3061 }
3062
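/* Notify RTNLGRP_IPV6_ROUTE listeners about a route change.  The skb is
 * sized with rt6_nlmsg_size(), which must be an upper bound; getting
 * -EMSGSIZE back from rt6_fill_node() therefore triggers the WARN_ON()
 * below.
 */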
3063 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3064 {
3065         struct sk_buff *skb;
3066         struct net *net = info->nl_net;
3067         u32 seq;
3068         int err;
3069
3070         err = -ENOBUFS;
3071         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3072
3073         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
3074         if (!skb)
3075                 goto errout;
3076
3077         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3078                                 event, info->portid, seq, 0, 0, 0);
3079         if (err < 0) {
3080                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3081                 WARN_ON(err == -EMSGSIZE);
3082                 kfree_skb(skb);
3083                 goto errout;
3084         }
3085         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3086                     info->nlh, gfp_any());
3087         return;
3088 errout:
3089         if (err < 0)
3090                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3091 }
3092
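/* Netdevice notifier: when a namespace's loopback device registers, attach
 * it (and its inet6_dev) to the namespace's special null entry and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, to the prohibit and blackhole entries too.
 */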
3093 static int ip6_route_dev_notify(struct notifier_block *this,
3094                                 unsigned long event, void *ptr)
3095 {
3096         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3097         struct net *net = dev_net(dev);
3098
3099         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3100                 net->ipv6.ip6_null_entry->dst.dev = dev;
3101                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3102 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3103                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3104                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3105                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3106                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3107 #endif
3108         }
3109
3110         return NOTIFY_OK;
3111 }
3112
3113 /*
3114  *      /proc
3115  */
3116
3117 #ifdef CONFIG_PROC_FS
3118
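/* /proc/net/ipv6_route: seq_file view of the IPv6 routing table, opened
 * through ipv6_route_open().
 */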
3119 static const struct file_operations ipv6_route_proc_fops = {
3120         .owner          = THIS_MODULE,
3121         .open           = ipv6_route_open,
3122         .read           = seq_read,
3123         .llseek         = seq_lseek,
3124         .release        = seq_release_net,
3125 };
3126
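/* /proc/net/rt6_stats: seven hex fields (FIB nodes, route nodes, route
 * allocations, route entries, cached routes, active dst entries and
 * discarded routes), in that order.
 */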
3127 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3128 {
3129         struct net *net = (struct net *)seq->private;
3130         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3131                    net->ipv6.rt6_stats->fib_nodes,
3132                    net->ipv6.rt6_stats->fib_route_nodes,
3133                    net->ipv6.rt6_stats->fib_rt_alloc,
3134                    net->ipv6.rt6_stats->fib_rt_entries,
3135                    net->ipv6.rt6_stats->fib_rt_cache,
3136                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3137                    net->ipv6.rt6_stats->fib_discarded_routes);
3138
3139         return 0;
3140 }
3141
3142 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3143 {
3144         return single_open_net(inode, file, rt6_stats_seq_show);
3145 }
3146
3147 static const struct file_operations rt6_stats_seq_fops = {
3148         .owner   = THIS_MODULE,
3149         .open    = rt6_stats_seq_open,
3150         .read    = seq_read,
3151         .llseek  = seq_lseek,
3152         .release = single_release_net,
3153 };
3154 #endif  /* CONFIG_PROC_FS */
3155
3156 #ifdef CONFIG_SYSCTL
3157
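/* Write-only handler for the "flush" route sysctl: any write triggers an
 * IPv6 FIB garbage-collection pass via fib6_run_gc(), using the
 * flush_delay value read before proc_dointvec() stores the new one.
 */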
3158 static
3159 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3160                               void __user *buffer, size_t *lenp, loff_t *ppos)
3161 {
3162         struct net *net;
3163         int delay;
3164         if (!write)
3165                 return -EINVAL;
3166
3167         net = (struct net *)ctl->extra1;
3168         delay = net->ipv6.sysctl.flush_delay;
3169         proc_dointvec(ctl, write, buffer, lenp, ppos);
3170         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3171         return 0;
3172 }
3173
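/* Template for the per-namespace route sysctls ("flush", "gc_thresh",
 * "max_size", ...).  ipv6_route_sysctl_init() below duplicates it and
 * re-points every .data field at the namespace's own storage.
 */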
3174 struct ctl_table ipv6_route_table_template[] = {
3175         {
3176                 .procname       =       "flush",
3177                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3178                 .maxlen         =       sizeof(int),
3179                 .mode           =       0200,
3180                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3181         },
3182         {
3183                 .procname       =       "gc_thresh",
3184                 .data           =       &ip6_dst_ops_template.gc_thresh,
3185                 .maxlen         =       sizeof(int),
3186                 .mode           =       0644,
3187                 .proc_handler   =       proc_dointvec,
3188         },
3189         {
3190                 .procname       =       "max_size",
3191                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3192                 .maxlen         =       sizeof(int),
3193                 .mode           =       0644,
3194                 .proc_handler   =       proc_dointvec,
3195         },
3196         {
3197                 .procname       =       "gc_min_interval",
3198                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3199                 .maxlen         =       sizeof(int),
3200                 .mode           =       0644,
3201                 .proc_handler   =       proc_dointvec_jiffies,
3202         },
3203         {
3204                 .procname       =       "gc_timeout",
3205                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3206                 .maxlen         =       sizeof(int),
3207                 .mode           =       0644,
3208                 .proc_handler   =       proc_dointvec_jiffies,
3209         },
3210         {
3211                 .procname       =       "gc_interval",
3212                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3213                 .maxlen         =       sizeof(int),
3214                 .mode           =       0644,
3215                 .proc_handler   =       proc_dointvec_jiffies,
3216         },
3217         {
3218                 .procname       =       "gc_elasticity",
3219                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3220                 .maxlen         =       sizeof(int),
3221                 .mode           =       0644,
3222                 .proc_handler   =       proc_dointvec,
3223         },
3224         {
3225                 .procname       =       "mtu_expires",
3226                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3227                 .maxlen         =       sizeof(int),
3228                 .mode           =       0644,
3229                 .proc_handler   =       proc_dointvec_jiffies,
3230         },
3231         {
3232                 .procname       =       "min_adv_mss",
3233                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3234                 .maxlen         =       sizeof(int),
3235                 .mode           =       0644,
3236                 .proc_handler   =       proc_dointvec,
3237         },
3238         {
3239                 .procname       =       "gc_min_interval_ms",
3240                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3241                 .maxlen         =       sizeof(int),
3242                 .mode           =       0644,
3243                 .proc_handler   =       proc_dointvec_ms_jiffies,
3244         },
3245         { }
3246 };
3247
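/* Allocate and fix up this namespace's copy of the sysctl table.  For
 * namespaces not owned by init_user_ns, clearing table[0].procname
 * terminates the table at its first slot, effectively hiding all of these
 * sysctls from that namespace.
 */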
3248 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3249 {
3250         struct ctl_table *table;
3251
3252         table = kmemdup(ipv6_route_table_template,
3253                         sizeof(ipv6_route_table_template),
3254                         GFP_KERNEL);
3255
3256         if (table) {
3257                 table[0].data = &net->ipv6.sysctl.flush_delay;
3258                 table[0].extra1 = net;
3259                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3260                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3261                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3262                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3263                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3264                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3265                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3266                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3267                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3268
3269                 /* Don't export sysctls to unprivileged users */
3270                 if (net->user_ns != &init_user_ns)
3271                         table[0].procname = NULL;
3272         }
3273
3274         return table;
3275 }
3276 #endif
3277
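/* Per-namespace route setup: clone the dst_ops template, allocate the
 * special null (plus prohibit/blackhole with CONFIG_IPV6_MULTIPLE_TABLES)
 * route entries, and seed the default GC and MTU sysctl values.  On error
 * the labels at the bottom unwind whatever was already allocated.
 */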
3278 static int __net_init ip6_route_net_init(struct net *net)
3279 {
3280         int ret = -ENOMEM;
3281
3282         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3283                sizeof(net->ipv6.ip6_dst_ops));
3284
3285         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3286                 goto out_ip6_dst_ops;
3287
3288         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3289                                            sizeof(*net->ipv6.ip6_null_entry),
3290                                            GFP_KERNEL);
3291         if (!net->ipv6.ip6_null_entry)
3292                 goto out_ip6_dst_entries;
3293         net->ipv6.ip6_null_entry->dst.path =
3294                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3295         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3296         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3297                          ip6_template_metrics, true);
3298
3299 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3300         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3301                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3302                                                GFP_KERNEL);
3303         if (!net->ipv6.ip6_prohibit_entry)
3304                 goto out_ip6_null_entry;
3305         net->ipv6.ip6_prohibit_entry->dst.path =
3306                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3307         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3308         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3309                          ip6_template_metrics, true);
3310
3311         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3312                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3313                                                GFP_KERNEL);
3314         if (!net->ipv6.ip6_blk_hole_entry)
3315                 goto out_ip6_prohibit_entry;
3316         net->ipv6.ip6_blk_hole_entry->dst.path =
3317                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3318         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3319         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3320                          ip6_template_metrics, true);
3321 #endif
3322
3323         net->ipv6.sysctl.flush_delay = 0;
3324         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3325         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3326         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3327         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3328         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3329         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3330         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3331
3332         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3333
3334         ret = 0;
3335 out:
3336         return ret;
3337
3338 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3339 out_ip6_prohibit_entry:
3340         kfree(net->ipv6.ip6_prohibit_entry);
3341 out_ip6_null_entry:
3342         kfree(net->ipv6.ip6_null_entry);
3343 #endif
3344 out_ip6_dst_entries:
3345         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3346 out_ip6_dst_ops:
3347         goto out;
3348 }
3349
3350 static void __net_exit ip6_route_net_exit(struct net *net)
3351 {
3352         kfree(net->ipv6.ip6_null_entry);
3353 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3354         kfree(net->ipv6.ip6_prohibit_entry);
3355         kfree(net->ipv6.ip6_blk_hole_entry);
3356 #endif
3357         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3358 }
3359
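/* Late per-namespace init/exit: create and remove the /proc/net/ipv6_route
 * and /proc/net/rt6_stats entries once the routing state set up above is
 * in place.
 */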
3360 static int __net_init ip6_route_net_init_late(struct net *net)
3361 {
3362 #ifdef CONFIG_PROC_FS
3363         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3364         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3365 #endif
3366         return 0;
3367 }
3368
3369 static void __net_exit ip6_route_net_exit_late(struct net *net)
3370 {
3371 #ifdef CONFIG_PROC_FS
3372         remove_proc_entry("ipv6_route", net->proc_net);
3373         remove_proc_entry("rt6_stats", net->proc_net);
3374 #endif
3375 }
3376
3377 static struct pernet_operations ip6_route_net_ops = {
3378         .init = ip6_route_net_init,
3379         .exit = ip6_route_net_exit,
3380 };
3381
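/* Per-namespace inet_peer base for IPv6: allocated and initialized here,
 * invalidated and freed again when the namespace goes away.
 */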
3382 static int __net_init ipv6_inetpeer_init(struct net *net)
3383 {
3384         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3385
3386         if (!bp)
3387                 return -ENOMEM;
3388         inet_peer_base_init(bp);
3389         net->ipv6.peers = bp;
3390         return 0;
3391 }
3392
3393 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3394 {
3395         struct inet_peer_base *bp = net->ipv6.peers;
3396
3397         net->ipv6.peers = NULL;
3398         inetpeer_invalidate_tree(bp);
3399         kfree(bp);
3400 }
3401
3402 static struct pernet_operations ipv6_inetpeer_ops = {
3403         .init   =       ipv6_inetpeer_init,
3404         .exit   =       ipv6_inetpeer_exit,
3405 };
3406
3407 static struct pernet_operations ip6_route_net_late_ops = {
3408         .init = ip6_route_net_init_late,
3409         .exit = ip6_route_net_exit_late,
3410 };
3411
3412 static struct notifier_block ip6_route_dev_notifier = {
3413         .notifier_call = ip6_route_dev_notify,
3414         .priority = 0,
3415 };
3416
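/* Subsystem initialization.  Ordering matters: the rt6_info kmem cache,
 * dst accounting and pernet operations must be in place before fib6,
 * xfrm6, the policy rules, the rtnetlink handlers and the netdevice
 * notifier are registered.  Failures unwind through the labels at the
 * bottom.
 */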
3417 int __init ip6_route_init(void)
3418 {
3419         int ret;
3420         int cpu;
3421
3422         ret = -ENOMEM;
3423         ip6_dst_ops_template.kmem_cachep =
3424                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3425                                   SLAB_HWCACHE_ALIGN, NULL);
3426         if (!ip6_dst_ops_template.kmem_cachep)
3427                 goto out;
3428
3429         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3430         if (ret)
3431                 goto out_kmem_cache;
3432
3433         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3434         if (ret)
3435                 goto out_dst_entries;
3436
3437         ret = register_pernet_subsys(&ip6_route_net_ops);
3438         if (ret)
3439                 goto out_register_inetpeer;
3440
3441         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3442
3443         /* The loopback device is registered before this code runs, so the
3444          * loopback reference in rt6_info is not taken automatically; take it
3445          * manually for init_net. */
3446         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3447         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3448 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3449         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3450         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3451         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3452         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3453 #endif
3454         ret = fib6_init();
3455         if (ret)
3456                 goto out_register_subsys;
3457
3458         ret = xfrm6_init();
3459         if (ret)
3460                 goto out_fib6_init;
3461
3462         ret = fib6_rules_init();
3463         if (ret)
3464                 goto xfrm6_init;
3465
3466         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3467         if (ret)
3468                 goto fib6_rules_init;
3469
3470         ret = -ENOBUFS;
3471         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3472             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3473             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3474                 goto out_register_late_subsys;
3475
3476         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3477         if (ret)
3478                 goto out_register_late_subsys;
3479
3480         for_each_possible_cpu(cpu) {
3481                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3482
3483                 INIT_LIST_HEAD(&ul->head);
3484                 spin_lock_init(&ul->lock);
3485         }
3486
3487 out:
3488         return ret;
3489
3490 out_register_late_subsys:
3491         unregister_pernet_subsys(&ip6_route_net_late_ops);
3492 fib6_rules_init:
3493         fib6_rules_cleanup();
3494 xfrm6_init:
3495         xfrm6_fini();
3496 out_fib6_init:
3497         fib6_gc_cleanup();
3498 out_register_subsys:
3499         unregister_pernet_subsys(&ip6_route_net_ops);
3500 out_register_inetpeer:
3501         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3502 out_dst_entries:
3503         dst_entries_destroy(&ip6_dst_blackhole_ops);
3504 out_kmem_cache:
3505         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3506         goto out;
3507 }
3508
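/* Subsystem teardown: undo the registrations made in ip6_route_init() and
 * free the dst accounting and the rt6_info kmem cache.
 */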
3509 void ip6_route_cleanup(void)
3510 {
3511         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3512         unregister_pernet_subsys(&ip6_route_net_late_ops);
3513         fib6_rules_cleanup();
3514         xfrm6_fini();
3515         fib6_gc_cleanup();
3516         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3517         unregister_pernet_subsys(&ip6_route_net_ops);
3518         dst_entries_destroy(&ip6_dst_blackhole_ops);
3519         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3520 }