Merge tag 'v3.5-rc7' into late/soc
[firefly-linux-kernel-4.4.55.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60
61 #include <asm/uaccess.h>
62
63 #ifdef CONFIG_SYSCTL
64 #include <linux/sysctl.h>
65 #endif
66
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68                                     const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int      ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void             ip6_dst_destroy(struct dst_entry *);
74 static void             ip6_dst_ifdown(struct dst_entry *,
75                                        struct net_device *dev, int how);
76 static int               ip6_dst_gc(struct dst_ops *ops);
77
78 static int              ip6_pkt_discard(struct sk_buff *skb);
79 static int              ip6_pkt_discard_out(struct sk_buff *skb);
80 static void             ip6_link_failure(struct sk_buff *skb);
81 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
82
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85                                            const struct in6_addr *prefix, int prefixlen,
86                                            const struct in6_addr *gwaddr, int ifindex,
87                                            unsigned int pref);
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89                                            const struct in6_addr *prefix, int prefixlen,
90                                            const struct in6_addr *gwaddr, int ifindex);
91 #endif
92
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
94 {
95         struct rt6_info *rt = (struct rt6_info *) dst;
96         struct inet_peer *peer;
97         u32 *p = NULL;
98
99         if (!(rt->dst.flags & DST_HOST))
100                 return NULL;
101
102         if (!rt->rt6i_peer)
103                 rt6_bind_peer(rt, 1);
104
105         peer = rt->rt6i_peer;
106         if (peer) {
107                 u32 *old_p = __DST_METRICS_PTR(old);
108                 unsigned long prev, new;
109
110                 p = peer->metrics;
111                 if (inet_metrics_new(peer))
112                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
113
114                 new = (unsigned long) p;
115                 prev = cmpxchg(&dst->_metrics, old, new);
116
117                 if (prev != old) {
118                         p = __DST_METRICS_PTR(prev);
119                         if (prev & DST_METRICS_READ_ONLY)
120                                 p = NULL;
121                 }
122         }
123         return p;
124 }
125
126 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
127 {
128         struct in6_addr *p = &rt->rt6i_gateway;
129
130         if (!ipv6_addr_any(p))
131                 return (const void *) p;
132         return daddr;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         struct rt6_info *rt = (struct rt6_info *) dst;
138         struct neighbour *n;
139
140         daddr = choose_neigh_daddr(rt, daddr);
141         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
142         if (n)
143                 return n;
144         return neigh_create(&nd_tbl, daddr, dst->dev);
145 }
146
147 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
148 {
149         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
150         if (!n) {
151                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
152                 if (IS_ERR(n))
153                         return PTR_ERR(n);
154         }
155         dst_set_neighbour(&rt->dst, n);
156
157         return 0;
158 }
159
160 static struct dst_ops ip6_dst_ops_template = {
161         .family                 =       AF_INET6,
162         .protocol               =       cpu_to_be16(ETH_P_IPV6),
163         .gc                     =       ip6_dst_gc,
164         .gc_thresh              =       1024,
165         .check                  =       ip6_dst_check,
166         .default_advmss         =       ip6_default_advmss,
167         .mtu                    =       ip6_mtu,
168         .cow_metrics            =       ipv6_cow_metrics,
169         .destroy                =       ip6_dst_destroy,
170         .ifdown                 =       ip6_dst_ifdown,
171         .negative_advice        =       ip6_negative_advice,
172         .link_failure           =       ip6_link_failure,
173         .update_pmtu            =       ip6_rt_update_pmtu,
174         .local_out              =       __ip6_local_out,
175         .neigh_lookup           =       ip6_neigh_lookup,
176 };
177
178 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
179 {
180         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
181
182         return mtu ? : dst->dev->mtu;
183 }
184
185 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
186 {
187 }
188
189 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
190                                          unsigned long old)
191 {
192         return NULL;
193 }
194
195 static struct dst_ops ip6_dst_blackhole_ops = {
196         .family                 =       AF_INET6,
197         .protocol               =       cpu_to_be16(ETH_P_IPV6),
198         .destroy                =       ip6_dst_destroy,
199         .check                  =       ip6_dst_check,
200         .mtu                    =       ip6_blackhole_mtu,
201         .default_advmss         =       ip6_default_advmss,
202         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
203         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
204         .neigh_lookup           =       ip6_neigh_lookup,
205 };
206
207 static const u32 ip6_template_metrics[RTAX_MAX] = {
208         [RTAX_HOPLIMIT - 1] = 255,
209 };
210
211 static struct rt6_info ip6_null_entry_template = {
212         .dst = {
213                 .__refcnt       = ATOMIC_INIT(1),
214                 .__use          = 1,
215                 .obsolete       = -1,
216                 .error          = -ENETUNREACH,
217                 .input          = ip6_pkt_discard,
218                 .output         = ip6_pkt_discard_out,
219         },
220         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
221         .rt6i_protocol  = RTPROT_KERNEL,
222         .rt6i_metric    = ~(u32) 0,
223         .rt6i_ref       = ATOMIC_INIT(1),
224 };
225
226 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
227
228 static int ip6_pkt_prohibit(struct sk_buff *skb);
229 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
230
231 static struct rt6_info ip6_prohibit_entry_template = {
232         .dst = {
233                 .__refcnt       = ATOMIC_INIT(1),
234                 .__use          = 1,
235                 .obsolete       = -1,
236                 .error          = -EACCES,
237                 .input          = ip6_pkt_prohibit,
238                 .output         = ip6_pkt_prohibit_out,
239         },
240         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
241         .rt6i_protocol  = RTPROT_KERNEL,
242         .rt6i_metric    = ~(u32) 0,
243         .rt6i_ref       = ATOMIC_INIT(1),
244 };
245
246 static struct rt6_info ip6_blk_hole_entry_template = {
247         .dst = {
248                 .__refcnt       = ATOMIC_INIT(1),
249                 .__use          = 1,
250                 .obsolete       = -1,
251                 .error          = -EINVAL,
252                 .input          = dst_discard,
253                 .output         = dst_discard,
254         },
255         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
256         .rt6i_protocol  = RTPROT_KERNEL,
257         .rt6i_metric    = ~(u32) 0,
258         .rt6i_ref       = ATOMIC_INIT(1),
259 };
260
261 #endif
262
263 /* allocate dst with ip6_dst_ops */
264 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
265                                              struct net_device *dev,
266                                              int flags)
267 {
268         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
269
270         if (rt)
271                 memset(&rt->rt6i_table, 0,
272                        sizeof(*rt) - sizeof(struct dst_entry));
273
274         return rt;
275 }
276
277 static void ip6_dst_destroy(struct dst_entry *dst)
278 {
279         struct rt6_info *rt = (struct rt6_info *)dst;
280         struct inet6_dev *idev = rt->rt6i_idev;
281         struct inet_peer *peer = rt->rt6i_peer;
282
283         if (!(rt->dst.flags & DST_HOST))
284                 dst_destroy_metrics_generic(dst);
285
286         if (idev) {
287                 rt->rt6i_idev = NULL;
288                 in6_dev_put(idev);
289         }
290
291         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
292                 dst_release(dst->from);
293
294         if (peer) {
295                 rt->rt6i_peer = NULL;
296                 inet_putpeer(peer);
297         }
298 }
299
300 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
301
302 static u32 rt6_peer_genid(void)
303 {
304         return atomic_read(&__rt6_peer_genid);
305 }
306
307 void rt6_bind_peer(struct rt6_info *rt, int create)
308 {
309         struct inet_peer *peer;
310
311         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
312         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
313                 inet_putpeer(peer);
314         else
315                 rt->rt6i_peer_genid = rt6_peer_genid();
316 }
317
318 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
319                            int how)
320 {
321         struct rt6_info *rt = (struct rt6_info *)dst;
322         struct inet6_dev *idev = rt->rt6i_idev;
323         struct net_device *loopback_dev =
324                 dev_net(dev)->loopback_dev;
325
326         if (dev != loopback_dev && idev && idev->dev == dev) {
327                 struct inet6_dev *loopback_idev =
328                         in6_dev_get(loopback_dev);
329                 if (loopback_idev) {
330                         rt->rt6i_idev = loopback_idev;
331                         in6_dev_put(idev);
332                 }
333         }
334 }
335
336 static bool rt6_check_expired(const struct rt6_info *rt)
337 {
338         struct rt6_info *ort = NULL;
339
340         if (rt->rt6i_flags & RTF_EXPIRES) {
341                 if (time_after(jiffies, rt->dst.expires))
342                         return true;
343         } else if (rt->dst.from) {
344                 ort = (struct rt6_info *) rt->dst.from;
345                 return (ort->rt6i_flags & RTF_EXPIRES) &&
346                         time_after(jiffies, ort->dst.expires);
347         }
348         return false;
349 }
350
351 static bool rt6_need_strict(const struct in6_addr *daddr)
352 {
353         return ipv6_addr_type(daddr) &
354                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
355 }
356
357 /*
358  *      Route lookup. Any table->tb6_lock is implied.
359  */
360
361 static inline struct rt6_info *rt6_device_match(struct net *net,
362                                                     struct rt6_info *rt,
363                                                     const struct in6_addr *saddr,
364                                                     int oif,
365                                                     int flags)
366 {
367         struct rt6_info *local = NULL;
368         struct rt6_info *sprt;
369
370         if (!oif && ipv6_addr_any(saddr))
371                 goto out;
372
373         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
374                 struct net_device *dev = sprt->dst.dev;
375
376                 if (oif) {
377                         if (dev->ifindex == oif)
378                                 return sprt;
379                         if (dev->flags & IFF_LOOPBACK) {
380                                 if (!sprt->rt6i_idev ||
381                                     sprt->rt6i_idev->dev->ifindex != oif) {
382                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
383                                                 continue;
384                                         if (local && (!oif ||
385                                                       local->rt6i_idev->dev->ifindex == oif))
386                                                 continue;
387                                 }
388                                 local = sprt;
389                         }
390                 } else {
391                         if (ipv6_chk_addr(net, saddr, dev,
392                                           flags & RT6_LOOKUP_F_IFACE))
393                                 return sprt;
394                 }
395         }
396
397         if (oif) {
398                 if (local)
399                         return local;
400
401                 if (flags & RT6_LOOKUP_F_IFACE)
402                         return net->ipv6.ip6_null_entry;
403         }
404 out:
405         return rt;
406 }
407
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation towards
 * the neighbour bound to @rt when that neighbour is not in a NUD_VALID
 * state and the last update is older than the interface's
 * rtr_probe_interval.
 *
 * @rt may be NULL, in which case nothing happens.
 *
 * Router Reachability Probes MUST be rate-limited.  The rate limit is
 * enforced through neigh->updated, which is re-checked and rewritten
 * under the neighbour's write lock so that two concurrent lookups
 * cannot both pass the check and both send a probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;

        rcu_read_lock();
        neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
        if (!neigh || (neigh->nud_state & NUD_VALID))
                goto out;

        /* neigh->updated is modified below, so the write side is needed */
        write_lock_bh(&neigh->lock);
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                neigh->updated = jiffies;
                write_unlock_bh(&neigh->lock);

                /* solicit the neighbour's address via its solicited-node
                 * multicast address; sent with the lock dropped */
                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
        } else {
                write_unlock_bh(&neigh->lock);
        }
out:
        rcu_read_unlock();
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
447
448 /*
449  * Default Router Selection (RFC 2461 6.3.6)
450  */
451 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
452 {
453         struct net_device *dev = rt->dst.dev;
454         if (!oif || dev->ifindex == oif)
455                 return 2;
456         if ((dev->flags & IFF_LOOPBACK) &&
457             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
458                 return 1;
459         return 0;
460 }
461
462 static inline int rt6_check_neigh(struct rt6_info *rt)
463 {
464         struct neighbour *neigh;
465         int m;
466
467         rcu_read_lock();
468         neigh = dst_get_neighbour_noref(&rt->dst);
469         if (rt->rt6i_flags & RTF_NONEXTHOP ||
470             !(rt->rt6i_flags & RTF_GATEWAY))
471                 m = 1;
472         else if (neigh) {
473                 read_lock_bh(&neigh->lock);
474                 if (neigh->nud_state & NUD_VALID)
475                         m = 2;
476 #ifdef CONFIG_IPV6_ROUTER_PREF
477                 else if (neigh->nud_state & NUD_FAILED)
478                         m = 0;
479 #endif
480                 else
481                         m = 1;
482                 read_unlock_bh(&neigh->lock);
483         } else
484                 m = 0;
485         rcu_read_unlock();
486         return m;
487 }
488
489 static int rt6_score_route(struct rt6_info *rt, int oif,
490                            int strict)
491 {
492         int m, n;
493
494         m = rt6_check_dev(rt, oif);
495         if (!m && (strict & RT6_LOOKUP_F_IFACE))
496                 return -1;
497 #ifdef CONFIG_IPV6_ROUTER_PREF
498         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
499 #endif
500         n = rt6_check_neigh(rt);
501         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
502                 return -1;
503         return m;
504 }
505
506 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
507                                    int *mpri, struct rt6_info *match)
508 {
509         int m;
510
511         if (rt6_check_expired(rt))
512                 goto out;
513
514         m = rt6_score_route(rt, oif, strict);
515         if (m < 0)
516                 goto out;
517
518         if (m > *mpri) {
519                 if (strict & RT6_LOOKUP_F_REACHABLE)
520                         rt6_probe(match);
521                 *mpri = m;
522                 match = rt;
523         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
524                 rt6_probe(rt);
525         }
526
527 out:
528         return match;
529 }
530
531 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
532                                      struct rt6_info *rr_head,
533                                      u32 metric, int oif, int strict)
534 {
535         struct rt6_info *rt, *match;
536         int mpri = -1;
537
538         match = NULL;
539         for (rt = rr_head; rt && rt->rt6i_metric == metric;
540              rt = rt->dst.rt6_next)
541                 match = find_match(rt, oif, strict, &mpri, match);
542         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
543              rt = rt->dst.rt6_next)
544                 match = find_match(rt, oif, strict, &mpri, match);
545
546         return match;
547 }
548
549 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
550 {
551         struct rt6_info *match, *rt0;
552         struct net *net;
553
554         rt0 = fn->rr_ptr;
555         if (!rt0)
556                 fn->rr_ptr = rt0 = fn->leaf;
557
558         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
559
560         if (!match &&
561             (strict & RT6_LOOKUP_F_REACHABLE)) {
562                 struct rt6_info *next = rt0->dst.rt6_next;
563
564                 /* no entries matched; do round-robin */
565                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
566                         next = fn->leaf;
567
568                 if (next != rt0)
569                         fn->rr_ptr = next;
570         }
571
572         net = dev_net(rt0->dst.dev);
573         return match ? match : net->ipv6.ip6_null_entry;
574 }
575
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option (RFC 4191) received on @dev.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option (struct route_info layout)
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as gateway
 *
 * Depending on the advertised lifetime, the matching route-info route
 * is deleted (lifetime 0), created (no route yet) or has its preference
 * and expiry refreshed.
 *
 * Returns 0 on success, -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /*
         * Sanity check for prefix_len and length.
         *
         * rinfo->length counts 8-octet units including the 8-octet
         * option header, so the prefix field holds (length - 1) * 8
         * bytes.  RFC 4191 2.3: a prefix longer than 64 bits needs
         * length 3, any non-empty prefix needs at least length 2.
         * Accepting shorter options would make the prefix copy below
         * read past the end of the option.
         */
        if (rinfo->length > 3)
                return -EINVAL;
        if (rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64) {
                if (rinfo->length < 3)
                        return -EINVAL;
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 2)
                        return -EINVAL;
        }

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3) {
                /* full 16-byte prefix present: use it in place */
                prefix = (struct in6_addr *)rinfo->prefix;
        } else {
                /* copies only prefix_len bits, which the checks above
                 * guarantee are present in the option */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* a zero lifetime withdraws an existing route */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (!addrconf_finite_timeout(lifetime))
                        rt6_clean_expires(rt);
                else
                        rt6_set_expires(rt, jiffies + HZ * lifetime);

                dst_release(&rt->dst);
        }
        return 0;
}
#endif
648
/*
 * BACKTRACK - climb the fib6 tree when a lookup ended in the null entry.
 *
 * Not self-contained: it reads and writes the caller's local variables
 * `rt` and `fn`, and jumps to the caller's labels `out` (top-level root
 * reached) and `restart` (a node carrying route info was found).
 *
 * When moving up from a node, a parent that has a source subtree other
 * than the current node is searched by @saddr; otherwise the parent
 * itself becomes the current node.
 *
 * Both arguments are parenthesised in the expansion so that arbitrary
 * expressions can be passed safely.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == (__net)->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                for (;;) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
666
667 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
668                                              struct fib6_table *table,
669                                              struct flowi6 *fl6, int flags)
670 {
671         struct fib6_node *fn;
672         struct rt6_info *rt;
673
674         read_lock_bh(&table->tb6_lock);
675         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
676 restart:
677         rt = fn->leaf;
678         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
679         BACKTRACK(net, &fl6->saddr);
680 out:
681         dst_use(&rt->dst, jiffies);
682         read_unlock_bh(&table->tb6_lock);
683         return rt;
684
685 }
686
687 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
688                                     int flags)
689 {
690         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
691 }
692 EXPORT_SYMBOL_GPL(ip6_route_lookup);
693
694 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
695                             const struct in6_addr *saddr, int oif, int strict)
696 {
697         struct flowi6 fl6 = {
698                 .flowi6_oif = oif,
699                 .daddr = *daddr,
700         };
701         struct dst_entry *dst;
702         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
703
704         if (saddr) {
705                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
706                 flags |= RT6_LOOKUP_F_HAS_SADDR;
707         }
708
709         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
710         if (dst->error == 0)
711                 return (struct rt6_info *) dst;
712
713         dst_release(dst);
714
715         return NULL;
716 }
717
718 EXPORT_SYMBOL(rt6_lookup);
719
720 /* ip6_ins_rt is called with FREE table->tb6_lock.
721    It takes new route entry, the addition fails by any reason the
722    route is freed. In any case, if caller does not hold it, it may
723    be destroyed.
724  */
725
726 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
727 {
728         int err;
729         struct fib6_table *table;
730
731         table = rt->rt6i_table;
732         write_lock_bh(&table->tb6_lock);
733         err = fib6_add(&table->tb6_root, rt, info);
734         write_unlock_bh(&table->tb6_lock);
735
736         return err;
737 }
738
739 int ip6_ins_rt(struct rt6_info *rt)
740 {
741         struct nl_info info = {
742                 .nl_net = dev_net(rt->dst.dev),
743         };
744         return __ip6_ins_rt(rt, &info);
745 }
746
747 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
748                                       const struct in6_addr *daddr,
749                                       const struct in6_addr *saddr)
750 {
751         struct rt6_info *rt;
752
753         /*
754          *      Clone the route.
755          */
756
757         rt = ip6_rt_copy(ort, daddr);
758
759         if (rt) {
760                 int attempts = !in_softirq();
761
762                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
763                         if (ort->rt6i_dst.plen != 128 &&
764                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
765                                 rt->rt6i_flags |= RTF_ANYCAST;
766                         rt->rt6i_gateway = *daddr;
767                 }
768
769                 rt->rt6i_flags |= RTF_CACHE;
770
771 #ifdef CONFIG_IPV6_SUBTREES
772                 if (rt->rt6i_src.plen && saddr) {
773                         rt->rt6i_src.addr = *saddr;
774                         rt->rt6i_src.plen = 128;
775                 }
776 #endif
777
778         retry:
779                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
780                         struct net *net = dev_net(rt->dst.dev);
781                         int saved_rt_min_interval =
782                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
783                         int saved_rt_elasticity =
784                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
785
786                         if (attempts-- > 0) {
787                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
788                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
789
790                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
791
792                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
793                                         saved_rt_elasticity;
794                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
795                                         saved_rt_min_interval;
796                                 goto retry;
797                         }
798
799                         net_warn_ratelimited("Neighbour table overflow\n");
800                         dst_free(&rt->dst);
801                         return NULL;
802                 }
803         }
804
805         return rt;
806 }
807
808 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
809                                         const struct in6_addr *daddr)
810 {
811         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
812
813         if (rt) {
814                 rt->rt6i_flags |= RTF_CACHE;
815                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
816         }
817         return rt;
818 }
819
820 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
821                                       struct flowi6 *fl6, int flags)
822 {
823         struct fib6_node *fn;
824         struct rt6_info *rt, *nrt;
825         int strict = 0;
826         int attempts = 3;
827         int err;
828         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
829
830         strict |= flags & RT6_LOOKUP_F_IFACE;
831
832 relookup:
833         read_lock_bh(&table->tb6_lock);
834
835 restart_2:
836         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
837
838 restart:
839         rt = rt6_select(fn, oif, strict | reachable);
840
841         BACKTRACK(net, &fl6->saddr);
842         if (rt == net->ipv6.ip6_null_entry ||
843             rt->rt6i_flags & RTF_CACHE)
844                 goto out;
845
846         dst_hold(&rt->dst);
847         read_unlock_bh(&table->tb6_lock);
848
849         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
850                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
851         else if (!(rt->dst.flags & DST_HOST))
852                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
853         else
854                 goto out2;
855
856         dst_release(&rt->dst);
857         rt = nrt ? : net->ipv6.ip6_null_entry;
858
859         dst_hold(&rt->dst);
860         if (nrt) {
861                 err = ip6_ins_rt(nrt);
862                 if (!err)
863                         goto out2;
864         }
865
866         if (--attempts <= 0)
867                 goto out2;
868
869         /*
870          * Race condition! In the gap, when table->tb6_lock was
871          * released someone could insert this route.  Relookup.
872          */
873         dst_release(&rt->dst);
874         goto relookup;
875
876 out:
877         if (reachable) {
878                 reachable = 0;
879                 goto restart_2;
880         }
881         dst_hold(&rt->dst);
882         read_unlock_bh(&table->tb6_lock);
883 out2:
884         rt->dst.lastuse = jiffies;
885         rt->dst.__use++;
886
887         return rt;
888 }
889
890 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
891                                             struct flowi6 *fl6, int flags)
892 {
893         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
894 }
895
896 static struct dst_entry *ip6_route_input_lookup(struct net *net,
897                                                 struct net_device *dev,
898                                                 struct flowi6 *fl6, int flags)
899 {
900         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
901                 flags |= RT6_LOOKUP_F_IFACE;
902
903         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
904 }
905
906 void ip6_route_input(struct sk_buff *skb)
907 {
908         const struct ipv6hdr *iph = ipv6_hdr(skb);
909         struct net *net = dev_net(skb->dev);
910         int flags = RT6_LOOKUP_F_HAS_SADDR;
911         struct flowi6 fl6 = {
912                 .flowi6_iif = skb->dev->ifindex,
913                 .daddr = iph->daddr,
914                 .saddr = iph->saddr,
915                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
916                 .flowi6_mark = skb->mark,
917                 .flowi6_proto = iph->nexthdr,
918         };
919
920         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
921 }
922
923 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
924                                              struct flowi6 *fl6, int flags)
925 {
926         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
927 }
928
929 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
930                                     struct flowi6 *fl6)
931 {
932         int flags = 0;
933
934         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
935                 flags |= RT6_LOOKUP_F_IFACE;
936
937         if (!ipv6_addr_any(&fl6->saddr))
938                 flags |= RT6_LOOKUP_F_HAS_SADDR;
939         else if (sk)
940                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
941
942         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
943 }
944
945 EXPORT_SYMBOL(ip6_route_output);
946
947 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
948 {
949         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
950         struct dst_entry *new = NULL;
951
952         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
953         if (rt) {
954                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
955
956                 new = &rt->dst;
957
958                 new->__use = 1;
959                 new->input = dst_discard;
960                 new->output = dst_discard;
961
962                 if (dst_metrics_read_only(&ort->dst))
963                         new->_metrics = ort->dst._metrics;
964                 else
965                         dst_copy_metrics(new, &ort->dst);
966                 rt->rt6i_idev = ort->rt6i_idev;
967                 if (rt->rt6i_idev)
968                         in6_dev_hold(rt->rt6i_idev);
969
970                 rt->rt6i_gateway = ort->rt6i_gateway;
971                 rt->rt6i_flags = ort->rt6i_flags;
972                 rt6_clean_expires(rt);
973                 rt->rt6i_metric = 0;
974
975                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
976 #ifdef CONFIG_IPV6_SUBTREES
977                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
978 #endif
979
980                 dst_free(new);
981         }
982
983         dst_release(dst_orig);
984         return new ? new : ERR_PTR(-ENOMEM);
985 }
986
987 /*
988  *      Destination cache support functions
989  */
990
991 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
992 {
993         struct rt6_info *rt;
994
995         rt = (struct rt6_info *) dst;
996
997         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
998                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
999                         if (!rt->rt6i_peer)
1000                                 rt6_bind_peer(rt, 0);
1001                         rt->rt6i_peer_genid = rt6_peer_genid();
1002                 }
1003                 return dst;
1004         }
1005         return NULL;
1006 }
1007
1008 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1009 {
1010         struct rt6_info *rt = (struct rt6_info *) dst;
1011
1012         if (rt) {
1013                 if (rt->rt6i_flags & RTF_CACHE) {
1014                         if (rt6_check_expired(rt)) {
1015                                 ip6_del_rt(rt);
1016                                 dst = NULL;
1017                         }
1018                 } else {
1019                         dst_release(dst);
1020                         dst = NULL;
1021                 }
1022         }
1023         return dst;
1024 }
1025
1026 static void ip6_link_failure(struct sk_buff *skb)
1027 {
1028         struct rt6_info *rt;
1029
1030         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1031
1032         rt = (struct rt6_info *) skb_dst(skb);
1033         if (rt) {
1034                 if (rt->rt6i_flags & RTF_CACHE)
1035                         rt6_update_expires(rt, 0);
1036                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1037                         rt->rt6i_node->fn_sernum = -1;
1038         }
1039 }
1040
1041 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1042 {
1043         struct rt6_info *rt6 = (struct rt6_info*)dst;
1044
1045         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1046                 rt6->rt6i_flags |= RTF_MODIFIED;
1047                 if (mtu < IPV6_MIN_MTU) {
1048                         u32 features = dst_metric(dst, RTAX_FEATURES);
1049                         mtu = IPV6_MIN_MTU;
1050                         features |= RTAX_FEATURE_ALLFRAG;
1051                         dst_metric_set(dst, RTAX_FEATURES, features);
1052                 }
1053                 dst_metric_set(dst, RTAX_MTU, mtu);
1054         }
1055 }
1056
1057 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1058 {
1059         struct net_device *dev = dst->dev;
1060         unsigned int mtu = dst_mtu(dst);
1061         struct net *net = dev_net(dev);
1062
1063         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1064
1065         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1066                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1067
1068         /*
1069          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1070          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1071          * IPV6_MAXPLEN is also valid and means: "any MSS,
1072          * rely only on pmtu discovery"
1073          */
1074         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1075                 mtu = IPV6_MAXPLEN;
1076         return mtu;
1077 }
1078
1079 static unsigned int ip6_mtu(const struct dst_entry *dst)
1080 {
1081         struct inet6_dev *idev;
1082         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1083
1084         if (mtu)
1085                 return mtu;
1086
1087         mtu = IPV6_MIN_MTU;
1088
1089         rcu_read_lock();
1090         idev = __in6_dev_get(dst->dev);
1091         if (idev)
1092                 mtu = idev->cnf.mtu6;
1093         rcu_read_unlock();
1094
1095         return mtu;
1096 }
1097
/*
 * Singly linked list (chained through dst->next) of the dst entries created
 * by icmp6_dst_alloc().  icmp6_dst_gc() and icmp6_clean_all() unlink and free
 * entries from it.  Every access to the list in this file is made with
 * icmp6_dst_lock held (taken with spin_lock_bh()).
 */
1098 static struct dst_entry *icmp6_dst_gc_list;
1099 static DEFINE_SPINLOCK(icmp6_dst_lock);
1100
1101 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1102                                   struct neighbour *neigh,
1103                                   struct flowi6 *fl6)
1104 {
1105         struct dst_entry *dst;
1106         struct rt6_info *rt;
1107         struct inet6_dev *idev = in6_dev_get(dev);
1108         struct net *net = dev_net(dev);
1109
1110         if (unlikely(!idev))
1111                 return ERR_PTR(-ENODEV);
1112
1113         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1114         if (unlikely(!rt)) {
1115                 in6_dev_put(idev);
1116                 dst = ERR_PTR(-ENOMEM);
1117                 goto out;
1118         }
1119
1120         if (neigh)
1121                 neigh_hold(neigh);
1122         else {
1123                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1124                 if (IS_ERR(neigh)) {
1125                         in6_dev_put(idev);
1126                         dst_free(&rt->dst);
1127                         return ERR_CAST(neigh);
1128                 }
1129         }
1130
1131         rt->dst.flags |= DST_HOST;
1132         rt->dst.output  = ip6_output;
1133         dst_set_neighbour(&rt->dst, neigh);
1134         atomic_set(&rt->dst.__refcnt, 1);
1135         rt->rt6i_dst.addr = fl6->daddr;
1136         rt->rt6i_dst.plen = 128;
1137         rt->rt6i_idev     = idev;
1138         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1139
1140         spin_lock_bh(&icmp6_dst_lock);
1141         rt->dst.next = icmp6_dst_gc_list;
1142         icmp6_dst_gc_list = &rt->dst;
1143         spin_unlock_bh(&icmp6_dst_lock);
1144
1145         fib6_force_start_gc(net);
1146
1147         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1148
1149 out:
1150         return dst;
1151 }
1152
1153 int icmp6_dst_gc(void)
1154 {
1155         struct dst_entry *dst, **pprev;
1156         int more = 0;
1157
1158         spin_lock_bh(&icmp6_dst_lock);
1159         pprev = &icmp6_dst_gc_list;
1160
1161         while ((dst = *pprev) != NULL) {
1162                 if (!atomic_read(&dst->__refcnt)) {
1163                         *pprev = dst->next;
1164                         dst_free(dst);
1165                 } else {
1166                         pprev = &dst->next;
1167                         ++more;
1168                 }
1169         }
1170
1171         spin_unlock_bh(&icmp6_dst_lock);
1172
1173         return more;
1174 }
1175
1176 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1177                             void *arg)
1178 {
1179         struct dst_entry *dst, **pprev;
1180
1181         spin_lock_bh(&icmp6_dst_lock);
1182         pprev = &icmp6_dst_gc_list;
1183         while ((dst = *pprev) != NULL) {
1184                 struct rt6_info *rt = (struct rt6_info *) dst;
1185                 if (func(rt, arg)) {
1186                         *pprev = dst->next;
1187                         dst_free(dst);
1188                 } else {
1189                         pprev = &dst->next;
1190                 }
1191         }
1192         spin_unlock_bh(&icmp6_dst_lock);
1193 }
1194
1195 static int ip6_dst_gc(struct dst_ops *ops)
1196 {
1197         unsigned long now = jiffies;
1198         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1199         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1200         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1201         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1202         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1203         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1204         int entries;
1205
1206         entries = dst_entries_get_fast(ops);
1207         if (time_after(rt_last_gc + rt_min_interval, now) &&
1208             entries <= rt_max_size)
1209                 goto out;
1210
1211         net->ipv6.ip6_rt_gc_expire++;
1212         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1213         net->ipv6.ip6_rt_last_gc = now;
1214         entries = dst_entries_get_slow(ops);
1215         if (entries < ops->gc_thresh)
1216                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1217 out:
1218         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1219         return entries > rt_max_size;
1220 }
1221
1222 /* Clean host part of a prefix. Not necessary in radix tree,
1223    but results in cleaner routing tables.
1224
1225    Remove it only when all the things will work!
1226  */
1227
1228 int ip6_dst_hoplimit(struct dst_entry *dst)
1229 {
1230         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1231         if (hoplimit == 0) {
1232                 struct net_device *dev = dst->dev;
1233                 struct inet6_dev *idev;
1234
1235                 rcu_read_lock();
1236                 idev = __in6_dev_get(dev);
1237                 if (idev)
1238                         hoplimit = idev->cnf.hop_limit;
1239                 else
1240                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1241                 rcu_read_unlock();
1242         }
1243         return hoplimit;
1244 }
1245 EXPORT_SYMBOL(ip6_dst_hoplimit);
1246
1247 /*
1248  *
1249  */
1250
/*
 * Create a route from the configuration in @cfg and insert it into the
 * FIB table selected by cfg->fc_table.
 *
 * Steps, in order: validate the prefix lengths, take references on the
 * device/inet6_dev named by fc_ifindex (if any), apply defaults for metric
 * and protocol, get or create the table, allocate the rt6_info, then fill
 * it in.  Reject routes (and non-local routes via a loopback device) are
 * bound to the namespace loopback device and skip gateway/prefsrc/neighbour
 * setup.  Gateway routes have their next hop validated.
 *
 * Note that @cfg is modified: fc_metric and fc_protocol may be defaulted and
 * fc_nlinfo.nl_net is overwritten before insertion.
 *
 * Returns the result of __ip6_ins_rt() on the insertion path, or a negative
 * errno; on the error path the dev/idev references are dropped and the
 * allocated route is freed.
 */
1251 int ip6_route_add(struct fib6_config *cfg)
1252 {
1253         int err;
1254         struct net *net = cfg->fc_nlinfo.nl_net;
1255         struct rt6_info *rt = NULL;
1256         struct net_device *dev = NULL;
1257         struct inet6_dev *idev = NULL;
1258         struct fib6_table *table;
1259         int addr_type;
1260
1261         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1262                 return -EINVAL;
1263 #ifndef CONFIG_IPV6_SUBTREES
        /* a source prefix is only accepted when subtrees are compiled in */
1264         if (cfg->fc_src_len)
1265                 return -EINVAL;
1266 #endif
1267         if (cfg->fc_ifindex) {
1268                 err = -ENODEV;
1269                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1270                 if (!dev)
1271                         goto out;
1272                 idev = in6_dev_get(dev);
1273                 if (!idev)
1274                         goto out;
1275         }
1276
1277         if (cfg->fc_metric == 0)
1278                 cfg->fc_metric = IP6_RT_PRIO_USER;
1279
1280         err = -ENOBUFS;
        /*
         * A netlink request without NLM_F_CREATE first tries an existing
         * table; if there is none, a warning is logged and the table is
         * created anyway.
         */
1281         if (cfg->fc_nlinfo.nlh &&
1282             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1283                 table = fib6_get_table(net, cfg->fc_table);
1284                 if (!table) {
1285                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1286                         table = fib6_new_table(net, cfg->fc_table);
1287                 }
1288         } else {
1289                 table = fib6_new_table(net, cfg->fc_table);
1290         }
1291
1292         if (!table)
1293                 goto out;
1294
1295         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1296
1297         if (!rt) {
1298                 err = -ENOMEM;
1299                 goto out;
1300         }
1301
1302         rt->dst.obsolete = -1;
1303
1304         if (cfg->fc_flags & RTF_EXPIRES)
1305                 rt6_set_expires(rt, jiffies +
1306                                 clock_t_to_jiffies(cfg->fc_expires));
1307         else
1308                 rt6_clean_expires(rt);
1309
1310         if (cfg->fc_protocol == RTPROT_UNSPEC)
1311                 cfg->fc_protocol = RTPROT_BOOT;
1312         rt->rt6i_protocol = cfg->fc_protocol;
1313
1314         addr_type = ipv6_addr_type(&cfg->fc_dst);
1315
        /* input handler: multicast, local delivery, or forwarding */
1316         if (addr_type & IPV6_ADDR_MULTICAST)
1317                 rt->dst.input = ip6_mc_input;
1318         else if (cfg->fc_flags & RTF_LOCAL)
1319                 rt->dst.input = ip6_input;
1320         else
1321                 rt->dst.input = ip6_forward;
1322
1323         rt->dst.output = ip6_output;
1324
1325         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1326         rt->rt6i_dst.plen = cfg->fc_dst_len;
1327         if (rt->rt6i_dst.plen == 128)
1328                rt->dst.flags |= DST_HOST;
1329
        /*
         * Non-host routes that carry metric attributes get a freshly
         * zeroed metrics array, handed to dst_init_metrics().
         */
1330         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1331                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1332                 if (!metrics) {
1333                         err = -ENOMEM;
1334                         goto out;
1335                 }
1336                 dst_init_metrics(&rt->dst, metrics, 0);
1337         }
1338 #ifdef CONFIG_IPV6_SUBTREES
1339         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1340         rt->rt6i_src.plen = cfg->fc_src_len;
1341 #endif
1342
1343         rt->rt6i_metric = cfg->fc_metric;
1344
1345         /* We cannot add true routes via loopback here,
1346            they would result in kernel looping; promote them to reject routes
1347          */
1348         if ((cfg->fc_flags & RTF_REJECT) ||
1349             (dev && (dev->flags & IFF_LOOPBACK) &&
1350              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1351              !(cfg->fc_flags & RTF_LOCAL))) {
1352                 /* hold loopback dev/idev if we haven't done so. */
1353                 if (dev != net->loopback_dev) {
1354                         if (dev) {
1355                                 dev_put(dev);
1356                                 in6_dev_put(idev);
1357                         }
1358                         dev = net->loopback_dev;
1359                         dev_hold(dev);
1360                         idev = in6_dev_get(dev);
1361                         if (!idev) {
1362                                 err = -ENODEV;
1363                                 goto out;
1364                         }
1365                 }
1366                 rt->dst.output = ip6_pkt_discard_out;
1367                 rt->dst.input = ip6_pkt_discard;
1368                 rt->dst.error = -ENETUNREACH;
1369                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1370                 goto install_route;
1371         }
1372
1373         if (cfg->fc_flags & RTF_GATEWAY) {
1374                 const struct in6_addr *gw_addr;
1375                 int gwa_type;
1376
1377                 gw_addr = &cfg->fc_gateway;
1378                 rt->rt6i_gateway = *gw_addr;
1379                 gwa_type = ipv6_addr_type(gw_addr);
1380
1381                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1382                         struct rt6_info *grt;
1383
1384                         /* IPv6 strictly inhibits using not link-local
1385                            addresses as nexthop address.
1386                            Otherwise, router will not able to send redirects.
1387                            It is very good, but in some (rare!) circumstances
1388                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1389                            some exceptions. --ANK
1390                          */
1391                         err = -EINVAL;
1392                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1393                                 goto out;
1394
1395                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1396
1397                         err = -EHOSTUNREACH;
1398                         if (!grt)
1399                                 goto out;
1400                         if (dev) {
1401                                 if (dev != grt->dst.dev) {
1402                                         dst_release(&grt->dst);
1403                                         goto out;
1404                                 }
1405                         } else {
                                /* no device given: adopt the one the gateway route uses */
1406                                 dev = grt->dst.dev;
1407                                 idev = grt->rt6i_idev;
1408                                 dev_hold(dev);
1409                                 in6_dev_hold(grt->rt6i_idev);
1410                         }
                        /*
                         * err stays -EHOSTUNREACH when the route to the
                         * gateway is itself a gateway route.
                         */
1411                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1412                                 err = 0;
1413                         dst_release(&grt->dst);
1414
1415                         if (err)
1416                                 goto out;
1417                 }
1418                 err = -EINVAL;
1419                 if (!dev || (dev->flags & IFF_LOOPBACK))
1420                         goto out;
1421         }
1422
1423         err = -ENODEV;
1424         if (!dev)
1425                 goto out;
1426
        /* a preferred source address must pass ipv6_chk_addr() for dev */
1427         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1428                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1429                         err = -EINVAL;
1430                         goto out;
1431                 }
1432                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1433                 rt->rt6i_prefsrc.plen = 128;
1434         } else
1435                 rt->rt6i_prefsrc.plen = 0;
1436
1437         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1438                 err = rt6_bind_neighbour(rt, dev);
1439                 if (err)
1440                         goto out;
1441         }
1442
1443         rt->rt6i_flags = cfg->fc_flags;
1444
1445 install_route:
        /* apply metric attributes; type 0 is skipped, types above RTAX_MAX are rejected */
1446         if (cfg->fc_mx) {
1447                 struct nlattr *nla;
1448                 int remaining;
1449
1450                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1451                         int type = nla_type(nla);
1452
1453                         if (type) {
1454                                 if (type > RTAX_MAX) {
1455                                         err = -EINVAL;
1456                                         goto out;
1457                                 }
1458
1459                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1460                         }
1461                 }
1462         }
1463
        /*
         * The dev/idev pointers, with the references taken above, are
         * stored in the route; they are not dropped on this path.
         */
1464         rt->dst.dev = dev;
1465         rt->rt6i_idev = idev;
1466         rt->rt6i_table = table;
1467
1468         cfg->fc_nlinfo.nl_net = dev_net(dev);
1469
1470         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1471
1472 out:
1473         if (dev)
1474                 dev_put(dev);
1475         if (idev)
1476                 in6_dev_put(idev);
1477         if (rt)
1478                 dst_free(&rt->dst);
1479         return err;
1480 }
1481
1482 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1483 {
1484         int err;
1485         struct fib6_table *table;
1486         struct net *net = dev_net(rt->dst.dev);
1487
1488         if (rt == net->ipv6.ip6_null_entry)
1489                 return -ENOENT;
1490
1491         table = rt->rt6i_table;
1492         write_lock_bh(&table->tb6_lock);
1493
1494         err = fib6_del(rt, info);
1495         dst_release(&rt->dst);
1496
1497         write_unlock_bh(&table->tb6_lock);
1498
1499         return err;
1500 }
1501
1502 int ip6_del_rt(struct rt6_info *rt)
1503 {
1504         struct nl_info info = {
1505                 .nl_net = dev_net(rt->dst.dev),
1506         };
1507         return __ip6_del_rt(rt, &info);
1508 }
1509
1510 static int ip6_route_del(struct fib6_config *cfg)
1511 {
1512         struct fib6_table *table;
1513         struct fib6_node *fn;
1514         struct rt6_info *rt;
1515         int err = -ESRCH;
1516
1517         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1518         if (!table)
1519                 return err;
1520
1521         read_lock_bh(&table->tb6_lock);
1522
1523         fn = fib6_locate(&table->tb6_root,
1524                          &cfg->fc_dst, cfg->fc_dst_len,
1525                          &cfg->fc_src, cfg->fc_src_len);
1526
1527         if (fn) {
1528                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1529                         if (cfg->fc_ifindex &&
1530                             (!rt->dst.dev ||
1531                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1532                                 continue;
1533                         if (cfg->fc_flags & RTF_GATEWAY &&
1534                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1535                                 continue;
1536                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1537                                 continue;
1538                         dst_hold(&rt->dst);
1539                         read_unlock_bh(&table->tb6_lock);
1540
1541                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1542                 }
1543         }
1544         read_unlock_bh(&table->tb6_lock);
1545
1546         return err;
1547 }
1548
1549 /*
1550  *      Handle redirects
1551  */
/*
 * Flow key for redirect lookups: a flowi6 followed by the address of the
 * gateway the redirect is checked against.  __ip6_route_redirect() casts
 * the struct flowi6 pointer it receives back to this type, so fl6 has to
 * remain the first member.
 */
1552 struct ip6rd_flowi {
1553         struct flowi6 fl6;
1554         struct in6_addr gateway;
1555 };
1556
1557 static struct rt6_info *__ip6_route_redirect(struct net *net,
1558                                              struct fib6_table *table,
1559                                              struct flowi6 *fl6,
1560                                              int flags)
1561 {
1562         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1563         struct rt6_info *rt;
1564         struct fib6_node *fn;
1565
1566         /*
1567          * Get the "current" route for this destination and
1568          * check if the redirect has come from approriate router.
1569          *
1570          * RFC 2461 specifies that redirects should only be
1571          * accepted if they come from the nexthop to the target.
1572          * Due to the way the routes are chosen, this notion
1573          * is a bit fuzzy and one might need to check all possible
1574          * routes.
1575          */
1576
1577         read_lock_bh(&table->tb6_lock);
1578         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1579 restart:
1580         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1581                 /*
1582                  * Current route is on-link; redirect is always invalid.
1583                  *
1584                  * Seems, previous statement is not true. It could
1585                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1586                  * But then router serving it might decide, that we should
1587                  * know truth 8)8) --ANK (980726).
1588                  */
1589                 if (rt6_check_expired(rt))
1590                         continue;
1591                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1592                         continue;
1593                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1594                         continue;
1595                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1596                         continue;
1597                 break;
1598         }
1599
1600         if (!rt)
1601                 rt = net->ipv6.ip6_null_entry;
1602         BACKTRACK(net, &fl6->saddr);
1603 out:
1604         dst_hold(&rt->dst);
1605
1606         read_unlock_bh(&table->tb6_lock);
1607
1608         return rt;
1609 };
1610
1611 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1612                                            const struct in6_addr *src,
1613                                            const struct in6_addr *gateway,
1614                                            struct net_device *dev)
1615 {
1616         int flags = RT6_LOOKUP_F_HAS_SADDR;
1617         struct net *net = dev_net(dev);
1618         struct ip6rd_flowi rdfl = {
1619                 .fl6 = {
1620                         .flowi6_oif = dev->ifindex,
1621                         .daddr = *dest,
1622                         .saddr = *src,
1623                 },
1624         };
1625
1626         rdfl.gateway = *gateway;
1627
1628         if (rt6_need_strict(dest))
1629                 flags |= RT6_LOOKUP_F_IFACE;
1630
1631         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1632                                                    flags, __ip6_route_redirect);
1633 }
1634
1635 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1636                   const struct in6_addr *saddr,
1637                   struct neighbour *neigh, u8 *lladdr, int on_link)
1638 {
1639         struct rt6_info *rt, *nrt = NULL;
1640         struct netevent_redirect netevent;
1641         struct net *net = dev_net(neigh->dev);
1642
1643         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1644
1645         if (rt == net->ipv6.ip6_null_entry) {
1646                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1647                 goto out;
1648         }
1649
1650         /*
1651          *      We have finally decided to accept it.
1652          */
1653
1654         neigh_update(neigh, lladdr, NUD_STALE,
1655                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1656                      NEIGH_UPDATE_F_OVERRIDE|
1657                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1658                                      NEIGH_UPDATE_F_ISROUTER))
1659                      );
1660
1661         /*
1662          * Redirect received -> path was valid.
1663          * Look, redirects are sent only in response to data packets,
1664          * so that this nexthop apparently is reachable. --ANK
1665          */
1666         dst_confirm(&rt->dst);
1667
1668         /* Duplicate redirect: silently ignore. */
1669         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1670                 goto out;
1671
1672         nrt = ip6_rt_copy(rt, dest);
1673         if (!nrt)
1674                 goto out;
1675
1676         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1677         if (on_link)
1678                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1679
1680         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1681         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1682
1683         if (ip6_ins_rt(nrt))
1684                 goto out;
1685
1686         netevent.old = &rt->dst;
1687         netevent.new = &nrt->dst;
1688         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1689
1690         if (rt->rt6i_flags & RTF_CACHE) {
1691                 ip6_del_rt(rt);
1692                 return;
1693         }
1694
1695 out:
1696         dst_release(&rt->dst);
1697 }
1698
1699 /*
1700  *      Handle ICMP "packet too big" messages
1701  *      i.e. Path MTU discovery
1702  */
1703
1704 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1705                              struct net *net, u32 pmtu, int ifindex)
1706 {
1707         struct rt6_info *rt, *nrt;
1708         int allfrag = 0;
1709 again:
1710         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1711         if (!rt)
1712                 return;
1713
1714         if (rt6_check_expired(rt)) {
1715                 ip6_del_rt(rt);
1716                 goto again;
1717         }
1718
1719         if (pmtu >= dst_mtu(&rt->dst))
1720                 goto out;
1721
1722         if (pmtu < IPV6_MIN_MTU) {
1723                 /*
1724                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1725                  * MTU (1280) and a fragment header should always be included
1726                  * after a node receiving Too Big message reporting PMTU is
1727                  * less than the IPv6 Minimum Link MTU.
1728                  */
1729                 pmtu = IPV6_MIN_MTU;
1730                 allfrag = 1;
1731         }
1732
1733         /* New mtu received -> path was valid.
1734            They are sent only in response to data packets,
1735            so that this nexthop apparently is reachable. --ANK
1736          */
1737         dst_confirm(&rt->dst);
1738
1739         /* Host route. If it is static, it would be better
1740            not to override it, but add new one, so that
1741            when cache entry will expire old pmtu
1742            would return automatically.
1743          */
1744         if (rt->rt6i_flags & RTF_CACHE) {
1745                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1746                 if (allfrag) {
1747                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1748                         features |= RTAX_FEATURE_ALLFRAG;
1749                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1750                 }
1751                 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1752                 rt->rt6i_flags |= RTF_MODIFIED;
1753                 goto out;
1754         }
1755
1756         /* Network route.
1757            Two cases are possible:
1758            1. It is connected route. Action: COW
1759            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1760          */
1761         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1762                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1763         else
1764                 nrt = rt6_alloc_clone(rt, daddr);
1765
1766         if (nrt) {
1767                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1768                 if (allfrag) {
1769                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1770                         features |= RTAX_FEATURE_ALLFRAG;
1771                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1772                 }
1773
1774                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1775                  * happened within 5 mins, the recommended timer is 10 mins.
1776                  * Here this route expiration time is set to ip6_rt_mtu_expires
1777                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1778                  * and detecting PMTU increase will be automatically happened.
1779                  */
1780                 rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1781                 nrt->rt6i_flags |= RTF_DYNAMIC;
1782                 ip6_ins_rt(nrt);
1783         }
1784 out:
1785         dst_release(&rt->dst);
1786 }
1787
1788 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1789                         struct net_device *dev, u32 pmtu)
1790 {
1791         struct net *net = dev_net(dev);
1792
1793         /*
1794          * RFC 1981 states that a node "MUST reduce the size of the packets it
1795          * is sending along the path" that caused the Packet Too Big message.
1796          * Since it's not possible in the general case to determine which
1797          * interface was used to send the original packet, we update the MTU
1798          * on the interface that will be used to send future packets. We also
1799          * update the MTU on the interface that received the Packet Too Big in
1800          * case the original packet was forced out that interface with
1801          * SO_BINDTODEVICE or similar. This is the next best thing to the
1802          * correct behaviour, which would be to update the MTU on all
1803          * interfaces.
1804          */
1805         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1806         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1807 }
1808
1809 /*
1810  *      Misc support functions
1811  */
1812
1813 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1814                                     const struct in6_addr *dest)
1815 {
1816         struct net *net = dev_net(ort->dst.dev);
1817         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1818                                             ort->dst.dev, 0);
1819
1820         if (rt) {
1821                 rt->dst.input = ort->dst.input;
1822                 rt->dst.output = ort->dst.output;
1823                 rt->dst.flags |= DST_HOST;
1824
1825                 rt->rt6i_dst.addr = *dest;
1826                 rt->rt6i_dst.plen = 128;
1827                 dst_copy_metrics(&rt->dst, &ort->dst);
1828                 rt->dst.error = ort->dst.error;
1829                 rt->rt6i_idev = ort->rt6i_idev;
1830                 if (rt->rt6i_idev)
1831                         in6_dev_hold(rt->rt6i_idev);
1832                 rt->dst.lastuse = jiffies;
1833
1834                 rt->rt6i_gateway = ort->rt6i_gateway;
1835                 rt->rt6i_flags = ort->rt6i_flags;
1836                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1837                     (RTF_DEFAULT | RTF_ADDRCONF))
1838                         rt6_set_from(rt, ort);
1839                 else
1840                         rt6_clean_expires(rt);
1841                 rt->rt6i_metric = 0;
1842
1843 #ifdef CONFIG_IPV6_SUBTREES
1844                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1845 #endif
1846                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1847                 rt->rt6i_table = ort->rt6i_table;
1848         }
1849         return rt;
1850 }
1851
1852 #ifdef CONFIG_IPV6_ROUTE_INFO
1853 static struct rt6_info *rt6_get_route_info(struct net *net,
1854                                            const struct in6_addr *prefix, int prefixlen,
1855                                            const struct in6_addr *gwaddr, int ifindex)
1856 {
1857         struct fib6_node *fn;
1858         struct rt6_info *rt = NULL;
1859         struct fib6_table *table;
1860
1861         table = fib6_get_table(net, RT6_TABLE_INFO);
1862         if (!table)
1863                 return NULL;
1864
1865         write_lock_bh(&table->tb6_lock);
1866         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1867         if (!fn)
1868                 goto out;
1869
1870         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1871                 if (rt->dst.dev->ifindex != ifindex)
1872                         continue;
1873                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1874                         continue;
1875                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1876                         continue;
1877                 dst_hold(&rt->dst);
1878                 break;
1879         }
1880 out:
1881         write_unlock_bh(&table->tb6_lock);
1882         return rt;
1883 }
1884
1885 static struct rt6_info *rt6_add_route_info(struct net *net,
1886                                            const struct in6_addr *prefix, int prefixlen,
1887                                            const struct in6_addr *gwaddr, int ifindex,
1888                                            unsigned int pref)
1889 {
1890         struct fib6_config cfg = {
1891                 .fc_table       = RT6_TABLE_INFO,
1892                 .fc_metric      = IP6_RT_PRIO_USER,
1893                 .fc_ifindex     = ifindex,
1894                 .fc_dst_len     = prefixlen,
1895                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1896                                   RTF_UP | RTF_PREF(pref),
1897                 .fc_nlinfo.pid = 0,
1898                 .fc_nlinfo.nlh = NULL,
1899                 .fc_nlinfo.nl_net = net,
1900         };
1901
1902         cfg.fc_dst = *prefix;
1903         cfg.fc_gateway = *gwaddr;
1904
1905         /* We should treat it as a default route if prefix length is 0. */
1906         if (!prefixlen)
1907                 cfg.fc_flags |= RTF_DEFAULT;
1908
1909         ip6_route_add(&cfg);
1910
1911         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1912 }
1913 #endif
1914
1915 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1916 {
1917         struct rt6_info *rt;
1918         struct fib6_table *table;
1919
1920         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1921         if (!table)
1922                 return NULL;
1923
1924         write_lock_bh(&table->tb6_lock);
1925         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1926                 if (dev == rt->dst.dev &&
1927                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1928                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1929                         break;
1930         }
1931         if (rt)
1932                 dst_hold(&rt->dst);
1933         write_unlock_bh(&table->tb6_lock);
1934         return rt;
1935 }
1936
1937 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1938                                      struct net_device *dev,
1939                                      unsigned int pref)
1940 {
1941         struct fib6_config cfg = {
1942                 .fc_table       = RT6_TABLE_DFLT,
1943                 .fc_metric      = IP6_RT_PRIO_USER,
1944                 .fc_ifindex     = dev->ifindex,
1945                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1946                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1947                 .fc_nlinfo.pid = 0,
1948                 .fc_nlinfo.nlh = NULL,
1949                 .fc_nlinfo.nl_net = dev_net(dev),
1950         };
1951
1952         cfg.fc_gateway = *gwaddr;
1953
1954         ip6_route_add(&cfg);
1955
1956         return rt6_get_dflt_router(gwaddr, dev);
1957 }
1958
1959 void rt6_purge_dflt_routers(struct net *net)
1960 {
1961         struct rt6_info *rt;
1962         struct fib6_table *table;
1963
1964         /* NOTE: Keep consistent with rt6_get_dflt_router */
1965         table = fib6_get_table(net, RT6_TABLE_DFLT);
1966         if (!table)
1967                 return;
1968
1969 restart:
1970         read_lock_bh(&table->tb6_lock);
1971         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1972                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1973                         dst_hold(&rt->dst);
1974                         read_unlock_bh(&table->tb6_lock);
1975                         ip6_del_rt(rt);
1976                         goto restart;
1977                 }
1978         }
1979         read_unlock_bh(&table->tb6_lock);
1980 }
1981
1982 static void rtmsg_to_fib6_config(struct net *net,
1983                                  struct in6_rtmsg *rtmsg,
1984                                  struct fib6_config *cfg)
1985 {
1986         memset(cfg, 0, sizeof(*cfg));
1987
1988         cfg->fc_table = RT6_TABLE_MAIN;
1989         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1990         cfg->fc_metric = rtmsg->rtmsg_metric;
1991         cfg->fc_expires = rtmsg->rtmsg_info;
1992         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1993         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1994         cfg->fc_flags = rtmsg->rtmsg_flags;
1995
1996         cfg->fc_nlinfo.nl_net = net;
1997
1998         cfg->fc_dst = rtmsg->rtmsg_dst;
1999         cfg->fc_src = rtmsg->rtmsg_src;
2000         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2001 }
2002
2003 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2004 {
2005         struct fib6_config cfg;
2006         struct in6_rtmsg rtmsg;
2007         int err;
2008
2009         switch(cmd) {
2010         case SIOCADDRT:         /* Add a route */
2011         case SIOCDELRT:         /* Delete a route */
2012                 if (!capable(CAP_NET_ADMIN))
2013                         return -EPERM;
2014                 err = copy_from_user(&rtmsg, arg,
2015                                      sizeof(struct in6_rtmsg));
2016                 if (err)
2017                         return -EFAULT;
2018
2019                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2020
2021                 rtnl_lock();
2022                 switch (cmd) {
2023                 case SIOCADDRT:
2024                         err = ip6_route_add(&cfg);
2025                         break;
2026                 case SIOCDELRT:
2027                         err = ip6_route_del(&cfg);
2028                         break;
2029                 default:
2030                         err = -EINVAL;
2031                 }
2032                 rtnl_unlock();
2033
2034                 return err;
2035         }
2036
2037         return -EINVAL;
2038 }
2039
2040 /*
2041  *      Drop the packet on the floor
2042  */
2043
2044 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2045 {
2046         int type;
2047         struct dst_entry *dst = skb_dst(skb);
2048         switch (ipstats_mib_noroutes) {
2049         case IPSTATS_MIB_INNOROUTES:
2050                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2051                 if (type == IPV6_ADDR_ANY) {
2052                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2053                                       IPSTATS_MIB_INADDRERRORS);
2054                         break;
2055                 }
2056                 /* FALLTHROUGH */
2057         case IPSTATS_MIB_OUTNOROUTES:
2058                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2059                               ipstats_mib_noroutes);
2060                 break;
2061         }
2062         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2063         kfree_skb(skb);
2064         return 0;
2065 }
2066
2067 static int ip6_pkt_discard(struct sk_buff *skb)
2068 {
2069         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2070 }
2071
2072 static int ip6_pkt_discard_out(struct sk_buff *skb)
2073 {
2074         skb->dev = skb_dst(skb)->dev;
2075         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2076 }
2077
2078 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2079
2080 static int ip6_pkt_prohibit(struct sk_buff *skb)
2081 {
2082         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2083 }
2084
2085 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2086 {
2087         skb->dev = skb_dst(skb)->dev;
2088         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2089 }
2090
2091 #endif
2092
2093 /*
2094  *      Allocate a dst for local (unicast / anycast) address.
2095  */
2096
2097 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2098                                     const struct in6_addr *addr,
2099                                     bool anycast)
2100 {
2101         struct net *net = dev_net(idev->dev);
2102         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2103                                             net->loopback_dev, 0);
2104         int err;
2105
2106         if (!rt) {
2107                 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2108                 return ERR_PTR(-ENOMEM);
2109         }
2110
2111         in6_dev_hold(idev);
2112
2113         rt->dst.flags |= DST_HOST;
2114         rt->dst.input = ip6_input;
2115         rt->dst.output = ip6_output;
2116         rt->rt6i_idev = idev;
2117         rt->dst.obsolete = -1;
2118
2119         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2120         if (anycast)
2121                 rt->rt6i_flags |= RTF_ANYCAST;
2122         else
2123                 rt->rt6i_flags |= RTF_LOCAL;
2124         err = rt6_bind_neighbour(rt, rt->dst.dev);
2125         if (err) {
2126                 dst_free(&rt->dst);
2127                 return ERR_PTR(err);
2128         }
2129
2130         rt->rt6i_dst.addr = *addr;
2131         rt->rt6i_dst.plen = 128;
2132         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2133
2134         atomic_set(&rt->dst.__refcnt, 1);
2135
2136         return rt;
2137 }
2138
2139 int ip6_route_get_saddr(struct net *net,
2140                         struct rt6_info *rt,
2141                         const struct in6_addr *daddr,
2142                         unsigned int prefs,
2143                         struct in6_addr *saddr)
2144 {
2145         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2146         int err = 0;
2147         if (rt->rt6i_prefsrc.plen)
2148                 *saddr = rt->rt6i_prefsrc.addr;
2149         else
2150                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2151                                          daddr, prefs, saddr);
2152         return err;
2153 }
2154
2155 /* remove deleted ip from prefsrc entries */
/*
 * Argument bundle handed to fib6_remove_prefsrc() through
 * fib6_clean_all() by rt6_remove_prefsrc().
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device match; NULL matches any */
	struct net *net;	/* namespace whose ip6_null_entry is skipped */
	struct in6_addr *addr;	/* address compared against rt6i_prefsrc.addr */
};
2161
2162 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2163 {
2164         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2165         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2166         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2167
2168         if (((void *)rt->dst.dev == dev || !dev) &&
2169             rt != net->ipv6.ip6_null_entry &&
2170             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2171                 /* remove prefsrc entry */
2172                 rt->rt6i_prefsrc.plen = 0;
2173         }
2174         return 0;
2175 }
2176
2177 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2178 {
2179         struct net *net = dev_net(ifp->idev->dev);
2180         struct arg_dev_net_ip adni = {
2181                 .dev = ifp->idev->dev,
2182                 .net = net,
2183                 .addr = &ifp->addr,
2184         };
2185         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2186 }
2187
/*
 * Argument bundle handed to fib6_ifdown() by rt6_ifdown().
 */
struct arg_dev_net {
	struct net_device *dev;	/* device whose routes are selected; NULL selects all */
	struct net *net;	/* namespace whose ip6_null_entry is never selected */
};
2192
2193 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2194 {
2195         const struct arg_dev_net *adn = arg;
2196         const struct net_device *dev = adn->dev;
2197
2198         if ((rt->dst.dev == dev || !dev) &&
2199             rt != adn->net->ipv6.ip6_null_entry)
2200                 return -1;
2201
2202         return 0;
2203 }
2204
2205 void rt6_ifdown(struct net *net, struct net_device *dev)
2206 {
2207         struct arg_dev_net adn = {
2208                 .dev = dev,
2209                 .net = net,
2210         };
2211
2212         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2213         icmp6_clean_all(fib6_ifdown, &adn);
2214 }
2215
/*
 * Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change().
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose routes are examined */
	unsigned int mtu;	/* MTU value written into matching routes */
};
2220
2221 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2222 {
2223         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2224         struct inet6_dev *idev;
2225
2226         /* In IPv6 pmtu discovery is not optional,
2227            so that RTAX_MTU lock cannot disable it.
2228            We still use this lock to block changes
2229            caused by addrconf/ndisc.
2230         */
2231
2232         idev = __in6_dev_get(arg->dev);
2233         if (!idev)
2234                 return 0;
2235
2236         /* For administrative MTU increase, there is no way to discover
2237            IPv6 PMTU increase, so PMTU increase should be updated here.
2238            Since RFC 1981 doesn't include administrative MTU increase
2239            update PMTU increase is a MUST. (i.e. jumbo frame)
2240          */
2241         /*
2242            If new MTU is less than route PMTU, this new MTU will be the
2243            lowest MTU in the path, update the route PMTU to reflect PMTU
2244            decreases; if new MTU is greater than route PMTU, and the
2245            old MTU is the lowest MTU in the path, update the route PMTU
2246            to reflect the increase. In this case if the other nodes' MTU
2247            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2248            PMTU discouvery.
2249          */
2250         if (rt->dst.dev == arg->dev &&
2251             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2252             (dst_mtu(&rt->dst) >= arg->mtu ||
2253              (dst_mtu(&rt->dst) < arg->mtu &&
2254               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2255                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2256         }
2257         return 0;
2258 }
2259
2260 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2261 {
2262         struct rt6_mtu_change_arg arg = {
2263                 .dev = dev,
2264                 .mtu = mtu,
2265         };
2266
2267         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2268 }
2269
2270 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2271         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2272         [RTA_OIF]               = { .type = NLA_U32 },
2273         [RTA_IIF]               = { .type = NLA_U32 },
2274         [RTA_PRIORITY]          = { .type = NLA_U32 },
2275         [RTA_METRICS]           = { .type = NLA_NESTED },
2276 };
2277
2278 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2279                               struct fib6_config *cfg)
2280 {
2281         struct rtmsg *rtm;
2282         struct nlattr *tb[RTA_MAX+1];
2283         int err;
2284
2285         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2286         if (err < 0)
2287                 goto errout;
2288
2289         err = -EINVAL;
2290         rtm = nlmsg_data(nlh);
2291         memset(cfg, 0, sizeof(*cfg));
2292
2293         cfg->fc_table = rtm->rtm_table;
2294         cfg->fc_dst_len = rtm->rtm_dst_len;
2295         cfg->fc_src_len = rtm->rtm_src_len;
2296         cfg->fc_flags = RTF_UP;
2297         cfg->fc_protocol = rtm->rtm_protocol;
2298
2299         if (rtm->rtm_type == RTN_UNREACHABLE)
2300                 cfg->fc_flags |= RTF_REJECT;
2301
2302         if (rtm->rtm_type == RTN_LOCAL)
2303                 cfg->fc_flags |= RTF_LOCAL;
2304
2305         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2306         cfg->fc_nlinfo.nlh = nlh;
2307         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2308
2309         if (tb[RTA_GATEWAY]) {
2310                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2311                 cfg->fc_flags |= RTF_GATEWAY;
2312         }
2313
2314         if (tb[RTA_DST]) {
2315                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2316
2317                 if (nla_len(tb[RTA_DST]) < plen)
2318                         goto errout;
2319
2320                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2321         }
2322
2323         if (tb[RTA_SRC]) {
2324                 int plen = (rtm->rtm_src_len + 7) >> 3;
2325
2326                 if (nla_len(tb[RTA_SRC]) < plen)
2327                         goto errout;
2328
2329                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2330         }
2331
2332         if (tb[RTA_PREFSRC])
2333                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2334
2335         if (tb[RTA_OIF])
2336                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2337
2338         if (tb[RTA_PRIORITY])
2339                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2340
2341         if (tb[RTA_METRICS]) {
2342                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2343                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2344         }
2345
2346         if (tb[RTA_TABLE])
2347                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2348
2349         err = 0;
2350 errout:
2351         return err;
2352 }
2353
2354 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2355 {
2356         struct fib6_config cfg;
2357         int err;
2358
2359         err = rtm_to_fib6_config(skb, nlh, &cfg);
2360         if (err < 0)
2361                 return err;
2362
2363         return ip6_route_del(&cfg);
2364 }
2365
2366 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2367 {
2368         struct fib6_config cfg;
2369         int err;
2370
2371         err = rtm_to_fib6_config(skb, nlh, &cfg);
2372         if (err < 0)
2373                 return err;
2374
2375         return ip6_route_add(&cfg);
2376 }
2377
2378 static inline size_t rt6_nlmsg_size(void)
2379 {
2380         return NLMSG_ALIGN(sizeof(struct rtmsg))
2381                + nla_total_size(16) /* RTA_SRC */
2382                + nla_total_size(16) /* RTA_DST */
2383                + nla_total_size(16) /* RTA_GATEWAY */
2384                + nla_total_size(16) /* RTA_PREFSRC */
2385                + nla_total_size(4) /* RTA_TABLE */
2386                + nla_total_size(4) /* RTA_IIF */
2387                + nla_total_size(4) /* RTA_OIF */
2388                + nla_total_size(4) /* RTA_PRIORITY */
2389                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2390                + nla_total_size(sizeof(struct rta_cacheinfo));
2391 }
2392
/*
 * Append one rtnetlink route message describing @rt to @skb.
 *
 * @net:    namespace, passed on to ip6mr_get_route() and
 *          ip6_route_get_saddr()
 * @dst:    if non-NULL, reported as a /128 RTA_DST instead of the
 *          route's own destination prefix
 * @src:    same for RTA_SRC (only with CONFIG_IPV6_SUBTREES)
 * @iif:    if non-zero, reported as RTA_IIF, or handed to the multicast
 *          code for multicast destinations (CONFIG_IPV6_MROUTE)
 * @type, @pid, @seq, @flags: forwarded to nlmsg_put() for the header
 * @prefix: if non-zero, routes without RTF_PREFIX_RT are skipped
 * @nowait: forwarded to ip6mr_get_route(); also decides how its
 *          errors are treated
 *
 * Returns 1 without touching @skb when the route is skipped by the
 * @prefix filter, -EMSGSIZE when the header or an attribute does not
 * fit (after a successful nlmsg_put() the partial message is cancelled),
 * 0 when ip6mr_get_route() returned 0 with @nowait clear, and otherwise
 * the result of nlmsg_end().
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	const struct inet_peer *peer;
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;
	struct neighbour *n;
	u32 ts, tsage;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* Fixed rtmsg header. */
	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* The table id goes into the header and into an RTA_TABLE attribute. */
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* Route type: reject, local (by flag or loopback device), else unicast. */
	if (rt->rt6i_flags & RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	/* The stored protocol is overridden when the route flags say more. */
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags & RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF)
		rtm->rtm_protocol = RTPROT_KERNEL;
	else if (rt->rt6i_flags & RTF_DEFAULT)
		rtm->rtm_protocol = RTPROT_RA;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* Destination: the caller's address as /128, else the route prefix. */
	if (dst) {
		if (nla_put(skb, RTA_DST, 16, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	/* Source: same scheme as for the destination. */
	if (src) {
		if (nla_put(skb, RTA_SRC, 16, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, nowait);
			/*
			 * err <= 0: without @nowait, 0 ends the function
			 * with 0 and any error fails the message; with
			 * @nowait only -EMSGSIZE fails it and everything
			 * else falls through.
			 */
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
				}
			}
		} else
#endif
			/* This statement is the else-branch when MROUTE is enabled. */
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		/* Output lookup: report the source address that would be used. */
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
			goto nla_put_failure;
	}

	/*
	 * NOTE(review): when @dst is set, @iif is 0 and the route has a
	 * preferred source, the branch above already emitted RTA_PREFSRC
	 * (ip6_route_get_saddr() returns the same address), so the
	 * attribute appears twice -- confirm whether that is intended.
	 */
	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
			goto nla_put_failure;
	}

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	/* The gateway is taken from the neighbour's key, read under RCU. */
	rcu_read_lock();
	n = dst_get_neighbour_noref(&rt->dst);
	if (n) {
		if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
			rcu_read_unlock();
			goto nla_put_failure;
		}
	}
	rcu_read_unlock();

	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;
	/* Remaining lifetime in jiffies: 0 if not expiring, capped at INT_MAX. */
	if (!(rt->rt6i_flags & RTF_EXPIRES))
		expires = 0;
	else if (rt->dst.expires - jiffies < INT_MAX)
		expires = rt->dst.expires - jiffies;
	else
		expires = INT_MAX;

	/* TCP timestamp and its age come from the inet_peer, if one is attached. */
	peer = rt->rt6i_peer;
	ts = tsage = 0;
	if (peer && peer->tcp_ts_stamp) {
		ts = peer->tcp_ts;
		tsage = get_seconds() - peer->tcp_ts_stamp;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
			       expires, rt->dst.error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	/* Undo the partially built message. */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2541
2542 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2543 {
2544         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2545         int prefix;
2546
2547         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2548                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2549                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2550         } else
2551                 prefix = 0;
2552
2553         return rt6_fill_node(arg->net,
2554                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2555                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2556                      prefix, 0, NLM_F_MULTI);
2557 }
2558
2559 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2560 {
2561         struct net *net = sock_net(in_skb->sk);
2562         struct nlattr *tb[RTA_MAX+1];
2563         struct rt6_info *rt;
2564         struct sk_buff *skb;
2565         struct rtmsg *rtm;
2566         struct flowi6 fl6;
2567         int err, iif = 0, oif = 0;
2568
2569         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2570         if (err < 0)
2571                 goto errout;
2572
2573         err = -EINVAL;
2574         memset(&fl6, 0, sizeof(fl6));
2575
2576         if (tb[RTA_SRC]) {
2577                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2578                         goto errout;
2579
2580                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2581         }
2582
2583         if (tb[RTA_DST]) {
2584                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2585                         goto errout;
2586
2587                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2588         }
2589
2590         if (tb[RTA_IIF])
2591                 iif = nla_get_u32(tb[RTA_IIF]);
2592
2593         if (tb[RTA_OIF])
2594                 oif = nla_get_u32(tb[RTA_OIF]);
2595
2596         if (iif) {
2597                 struct net_device *dev;
2598                 int flags = 0;
2599
2600                 dev = __dev_get_by_index(net, iif);
2601                 if (!dev) {
2602                         err = -ENODEV;
2603                         goto errout;
2604                 }
2605
2606                 fl6.flowi6_iif = iif;
2607
2608                 if (!ipv6_addr_any(&fl6.saddr))
2609                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2610
2611                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2612                                                                flags);
2613         } else {
2614                 fl6.flowi6_oif = oif;
2615
2616                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2617         }
2618
2619         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2620         if (!skb) {
2621                 dst_release(&rt->dst);
2622                 err = -ENOBUFS;
2623                 goto errout;
2624         }
2625
2626         /* Reserve room for dummy headers, this skb can pass
2627            through good chunk of routing engine.
2628          */
2629         skb_reset_mac_header(skb);
2630         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2631
2632         skb_dst_set(skb, &rt->dst);
2633
2634         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2635                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2636                             nlh->nlmsg_seq, 0, 0, 0);
2637         if (err < 0) {
2638                 kfree_skb(skb);
2639                 goto errout;
2640         }
2641
2642         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2643 errout:
2644         return err;
2645 }
2646
2647 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2648 {
2649         struct sk_buff *skb;
2650         struct net *net = info->nl_net;
2651         u32 seq;
2652         int err;
2653
2654         err = -ENOBUFS;
2655         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2656
2657         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2658         if (!skb)
2659                 goto errout;
2660
2661         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2662                                 event, info->pid, seq, 0, 0, 0);
2663         if (err < 0) {
2664                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2665                 WARN_ON(err == -EMSGSIZE);
2666                 kfree_skb(skb);
2667                 goto errout;
2668         }
2669         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2670                     info->nlh, gfp_any());
2671         return;
2672 errout:
2673         if (err < 0)
2674                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2675 }
2676
2677 static int ip6_route_dev_notify(struct notifier_block *this,
2678                                 unsigned long event, void *data)
2679 {
2680         struct net_device *dev = (struct net_device *)data;
2681         struct net *net = dev_net(dev);
2682
2683         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2684                 net->ipv6.ip6_null_entry->dst.dev = dev;
2685                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2686 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2687                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2688                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2689                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2690                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2691 #endif
2692         }
2693
2694         return NOTIFY_OK;
2695 }
2696
2697 /*
2698  *      /proc
2699  */
2700
2701 #ifdef CONFIG_PROC_FS
2702
/*
 * Presumably bookkeeping for a buffer-based /proc read (inferred from the
 * field names only).
 * NOTE(review): nothing in the visible part of this file references this
 * structure; the seq_file based handlers below do not use it.  Confirm
 * whether it is still needed.
 */
struct rt6_proc_arg
{
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2711
2712 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2713 {
2714         struct seq_file *m = p_arg;
2715         struct neighbour *n;
2716
2717         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2718
2719 #ifdef CONFIG_IPV6_SUBTREES
2720         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2721 #else
2722         seq_puts(m, "00000000000000000000000000000000 00 ");
2723 #endif
2724         rcu_read_lock();
2725         n = dst_get_neighbour_noref(&rt->dst);
2726         if (n) {
2727                 seq_printf(m, "%pi6", n->primary_key);
2728         } else {
2729                 seq_puts(m, "00000000000000000000000000000000");
2730         }
2731         rcu_read_unlock();
2732         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2733                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2734                    rt->dst.__use, rt->rt6i_flags,
2735                    rt->dst.dev ? rt->dst.dev->name : "");
2736         return 0;
2737 }
2738
2739 static int ipv6_route_show(struct seq_file *m, void *v)
2740 {
2741         struct net *net = (struct net *)m->private;
2742         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2743         return 0;
2744 }
2745
/*
 * open() handler of ipv6_route_proc_fops: sets up a single-record,
 * namespace-aware seq_file whose content is produced by ipv6_route_show().
 * Returns the result of single_open_net().
 */
static int ipv6_route_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, ipv6_route_show);
}
2750
/*
 * File operations for the per-namespace "ipv6_route" proc entry (created
 * in ip6_route_net_init_late()).  Opened via single_open_net(), so it is
 * released with the matching single_release_net().
 */
static const struct file_operations ipv6_route_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = ipv6_route_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release_net,
};
2758
2759 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2760 {
2761         struct net *net = (struct net *)seq->private;
2762         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2763                    net->ipv6.rt6_stats->fib_nodes,
2764                    net->ipv6.rt6_stats->fib_route_nodes,
2765                    net->ipv6.rt6_stats->fib_rt_alloc,
2766                    net->ipv6.rt6_stats->fib_rt_entries,
2767                    net->ipv6.rt6_stats->fib_rt_cache,
2768                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2769                    net->ipv6.rt6_stats->fib_discarded_routes);
2770
2771         return 0;
2772 }
2773
/*
 * open() handler of rt6_stats_seq_fops: sets up a single-record,
 * namespace-aware seq_file whose content is produced by
 * rt6_stats_seq_show().  Returns the result of single_open_net().
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, rt6_stats_seq_show);
}
2778
/*
 * File operations for the per-namespace "rt6_stats" proc entry (created
 * in ip6_route_net_init_late()).  Opened via single_open_net(), so it is
 * released with the matching single_release_net().
 */
static const struct file_operations rt6_stats_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt6_stats_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release_net,
};
2786 #endif  /* CONFIG_PROC_FS */
2787
2788 #ifdef CONFIG_SYSCTL
2789
2790 static
2791 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2792                               void __user *buffer, size_t *lenp, loff_t *ppos)
2793 {
2794         struct net *net;
2795         int delay;
2796         if (!write)
2797                 return -EINVAL;
2798
2799         net = (struct net *)ctl->extra1;
2800         delay = net->ipv6.sysctl.flush_delay;
2801         proc_dointvec(ctl, write, buffer, lenp, ppos);
2802         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2803         return 0;
2804 }
2805
/*
 * Template for the per-namespace IPv6 route sysctl table.
 *
 * ipv6_route_sysctl_init() duplicates this array and re-points each
 * entry's .data at the corresponding field of the target namespace BY
 * ARRAY INDEX, so the order of the entries here must stay in sync with
 * the table[N] assignments in that function.  The .data pointers below
 * refer to init_net (or to ip6_dst_ops_template) and only serve as
 * placeholders in the template.
 */
ctl_table ipv6_route_table_template[] = {
        {
                /* Write-only (mode 0200); handled by ipv6_sysctl_rtcache_flush() */
                .procname       =       "flush",
                .data           =       &init_net.ipv6.sysctl.flush_delay,
                .maxlen         =       sizeof(int),
                .mode           =       0200,
                .proc_handler   =       ipv6_sysctl_rtcache_flush
        },
        {
                .procname       =       "gc_thresh",
                .data           =       &ip6_dst_ops_template.gc_thresh,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "max_size",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "gc_min_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_timeout",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "gc_elasticity",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                .procname       =       "mtu_expires",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                .procname       =       "min_adv_mss",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /*
                 * Same variable as "gc_min_interval" above, exposed through
                 * the proc_dointvec_ms_jiffies handler instead.
                 */
                .procname       =       "gc_min_interval_ms",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
        { }     /* terminating empty entry */
};
2879
2880 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2881 {
2882         struct ctl_table *table;
2883
2884         table = kmemdup(ipv6_route_table_template,
2885                         sizeof(ipv6_route_table_template),
2886                         GFP_KERNEL);
2887
2888         if (table) {
2889                 table[0].data = &net->ipv6.sysctl.flush_delay;
2890                 table[0].extra1 = net;
2891                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2892                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2893                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2894                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2895                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2896                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2897                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2898                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2899                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2900         }
2901
2902         return table;
2903 }
2904 #endif
2905
2906 static int __net_init ip6_route_net_init(struct net *net)
2907 {
2908         int ret = -ENOMEM;
2909
2910         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2911                sizeof(net->ipv6.ip6_dst_ops));
2912
2913         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2914                 goto out_ip6_dst_ops;
2915
2916         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2917                                            sizeof(*net->ipv6.ip6_null_entry),
2918                                            GFP_KERNEL);
2919         if (!net->ipv6.ip6_null_entry)
2920                 goto out_ip6_dst_entries;
2921         net->ipv6.ip6_null_entry->dst.path =
2922                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2923         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2924         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2925                          ip6_template_metrics, true);
2926
2927 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2928         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2929                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2930                                                GFP_KERNEL);
2931         if (!net->ipv6.ip6_prohibit_entry)
2932                 goto out_ip6_null_entry;
2933         net->ipv6.ip6_prohibit_entry->dst.path =
2934                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2935         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2936         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2937                          ip6_template_metrics, true);
2938
2939         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2940                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2941                                                GFP_KERNEL);
2942         if (!net->ipv6.ip6_blk_hole_entry)
2943                 goto out_ip6_prohibit_entry;
2944         net->ipv6.ip6_blk_hole_entry->dst.path =
2945                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2946         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2947         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2948                          ip6_template_metrics, true);
2949 #endif
2950
2951         net->ipv6.sysctl.flush_delay = 0;
2952         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2953         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2954         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2955         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2956         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2957         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2958         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2959
2960         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2961
2962         ret = 0;
2963 out:
2964         return ret;
2965
2966 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2967 out_ip6_prohibit_entry:
2968         kfree(net->ipv6.ip6_prohibit_entry);
2969 out_ip6_null_entry:
2970         kfree(net->ipv6.ip6_null_entry);
2971 #endif
2972 out_ip6_dst_entries:
2973         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2974 out_ip6_dst_ops:
2975         goto out;
2976 }
2977
/*
 * Per-namespace teardown, counterpart of ip6_route_net_init(): frees the
 * namespace's special route entries and destroys the dst entry counter of
 * its ip6_dst_ops.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
        kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        kfree(net->ipv6.ip6_prohibit_entry);
        kfree(net->ipv6.ip6_blk_hole_entry);
#endif
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
2987
/*
 * Late per-namespace init: with CONFIG_PROC_FS, create the "ipv6_route"
 * and "rt6_stats" proc entries of the namespace.
 *
 * NOTE(review): the results of proc_net_fops_create() are not checked, so
 * this always returns 0 even if an entry could not be created.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
        proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif
        return 0;
}
2996
/*
 * Late per-namespace exit, counterpart of ip6_route_net_init_late():
 * with CONFIG_PROC_FS, remove the "ipv6_route" and "rt6_stats" proc
 * entries of the namespace.
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        proc_net_remove(net, "ipv6_route");
        proc_net_remove(net, "rt6_stats");
#endif
}
3004
/*
 * Per-namespace hooks for the routing core (dst ops, special route
 * entries, sysctl defaults).  Registered first in ip6_route_init().
 */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};
3009
/*
 * Per-namespace hooks for the proc entries.  Registered in
 * ip6_route_init() only after fib6, xfrm6 and the fib6 rules have been
 * initialised.
 */
static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};
3014
/*
 * Netdevice notifier that attaches the special route entries to a
 * namespace's loopback device when it is registered (see
 * ip6_route_dev_notify()).
 */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = 0,
};
3019
3020 int __init ip6_route_init(void)
3021 {
3022         int ret;
3023
3024         ret = -ENOMEM;
3025         ip6_dst_ops_template.kmem_cachep =
3026                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3027                                   SLAB_HWCACHE_ALIGN, NULL);
3028         if (!ip6_dst_ops_template.kmem_cachep)
3029                 goto out;
3030
3031         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3032         if (ret)
3033                 goto out_kmem_cache;
3034
3035         ret = register_pernet_subsys(&ip6_route_net_ops);
3036         if (ret)
3037                 goto out_dst_entries;
3038
3039         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3040
3041         /* Registering of the loopback is done before this portion of code,
3042          * the loopback reference in rt6_info will not be taken, do it
3043          * manually for init_net */
3044         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3045         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3046   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3047         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3048         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3049         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3050         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3051   #endif
3052         ret = fib6_init();
3053         if (ret)
3054                 goto out_register_subsys;
3055
3056         ret = xfrm6_init();
3057         if (ret)
3058                 goto out_fib6_init;
3059
3060         ret = fib6_rules_init();
3061         if (ret)
3062                 goto xfrm6_init;
3063
3064         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3065         if (ret)
3066                 goto fib6_rules_init;
3067
3068         ret = -ENOBUFS;
3069         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3070             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3071             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3072                 goto out_register_late_subsys;
3073
3074         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3075         if (ret)
3076                 goto out_register_late_subsys;
3077
3078 out:
3079         return ret;
3080
3081 out_register_late_subsys:
3082         unregister_pernet_subsys(&ip6_route_net_late_ops);
3083 fib6_rules_init:
3084         fib6_rules_cleanup();
3085 xfrm6_init:
3086         xfrm6_fini();
3087 out_fib6_init:
3088         fib6_gc_cleanup();
3089 out_register_subsys:
3090         unregister_pernet_subsys(&ip6_route_net_ops);
3091 out_dst_entries:
3092         dst_entries_destroy(&ip6_dst_blackhole_ops);
3093 out_kmem_cache:
3094         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3095         goto out;
3096 }
3097
/*
 * Tear down the IPv6 routing layer.
 *
 * Undoes the steps of ip6_route_init() in reverse order, matching the
 * error-unwinding sequence at the end of that function: netdevice
 * notifier, late per-namespace ops, fib6 rules, xfrm6, fib6, the
 * per-namespace routing ops, the blackhole dst counter and finally the
 * rt6_info slab cache.  The order matters; do not rearrange.
 */
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}