/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

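/* Defaults for the tunables exported elsewhere in this file as sysctls
 * under net.ipv4.route.  All time values are in jiffies; for
 * illustration, assuming HZ=1000, ip_rt_redirect_load = HZ/50 = 20
 * jiffies (20 ms) and ip_rt_redirect_silence = (HZ/50) << 10 = 20480
 * jiffies (~20.5 s).
 */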
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

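/* Map the IPv4 TOS nibble to a packet scheduler priority band.  The
 * table is indexed by rt_tos2priority() as (tos & IPTOS_TOS_MASK) >> 1,
 * so each adjacent pair covers a TOS value with and without the bit
 * that was "minimize monetary cost" in RFC 1349 and now overlaps the
 * ECN field (hence the ECN_OR_COST name).
 */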
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
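/* The old IPv4 routing cache is gone; /proc/net/rt_cache remains for
 * compatibility but now prints only the header line, never any entries.
 */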
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

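/* /proc/net/stat/rt_cache: one line of counters per possible CPU.
 * *pos is offset by one (0 yields the header), so CPU n is visited at
 * pos n+1.
 */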
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

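/* Build a flow key from an IPv4 header.  When a socket is supplied, its
 * bound device, mark, connection TOS and protocol take precedence over
 * the values derived from the packet.
 */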
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;
        struct rtable *orig;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        orig = rcu_dereference(oldest->fnhe_rth);
        if (orig) {
                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
                rt_free(orig);
        }
        return oldest;
}

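/* Hash a destination address into an FNHE bucket index.  Folding the
 * high bits down lets every bit of the address influence the low-order
 * bits kept by the mask (FNHE_HASH_SIZE is a power of two).
 */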
static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}

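/* Record or refresh a per-nexthop exception learned from ICMP: a new
 * gateway from a redirect, or a reduced PMTU with its expiry time.
 * The hash table is allocated lazily, and once a chain grows past
 * FNHE_RECLAIM_DEPTH the oldest entry in it is recycled.
 */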
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = expires;
                }
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
        return;
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

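/* A worked example of the backoff, assuming HZ=1000 and the defaults
 * above: the first redirect to a peer is sent immediately
 * (rate_tokens == 0); each following one only after
 * ip_rt_redirect_load << rate_tokens jiffies have passed since
 * rate_last, i.e. 40 ms, 80 ms, ... up to ~5.1 s.  Once
 * ip_rt_redirect_number (9) redirects have been ignored we stay silent
 * until ip_rt_redirect_silence (20480 jiffies, ~20.5 s) passes with no
 * redirect-worthy traffic from that peer.
 */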
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set dst.rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (!rt->rt_pmtu) {
                dst->obsolete = DST_OBSOLETE_KILL;
        } else {
                rt->rt_pmtu = mtu;
                dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
        }

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

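/* Socket-aware PMTU update: operate on the socket's cached route while
 * holding the socket lock, and re-resolve and re-install the route if
 * the PMTU change invalidated the cached dst.
 */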
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);
        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a
         * route, this is indicated by setting obsolete to
         * DST_OBSOLETE_KILL.
         */
        if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

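/* Effective MTU for this route: a live (unexpired) learned PMTU wins,
 * then the RTAX_MTU metric, then the device MTU, clamped to 576 for
 * locked-MTU routes via a gateway and to IP_MAX_MTU overall.
 */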
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        if (mtu > IP_MAX_MTU)
                mtu = IP_MAX_MTU;

        return mtu;
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

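/* Bind @rt to a matching nexthop exception so later lookups for @daddr
 * reuse it, copying the exception's learned gateway and PMTU into the
 * route.  Returns false if the exception no longer matches @daddr.
 */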
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
                if (orig && rt_is_expired(orig)) {
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                }
                if (fnhe->fnhe_pmtu) {
                        unsigned long expires = fnhe->fnhe_expires;
                        unsigned long diff = expires - jiffies;

                        if (time_before(jiffies, expires)) {
                                rt->rt_pmtu = fnhe->fnhe_pmtu;
                                dst_set_expires(&rt->dst, diff);
                        }
                }
                if (fnhe->fnhe_gw) {
                        rt->rt_flags |= RTCF_REDIRECTED;
                        rt->rt_gateway = fnhe->fnhe_gw;
                        rt->rt_uses_gateway = 1;
                } else if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                rcu_assign_pointer(fnhe->fnhe_rth, rt);
                if (orig)
                        rt_free(orig);

                fnhe->fnhe_stamp = jiffies;
                ret = true;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

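/* Publish @rt as the cached route for this nexthop: a per-CPU slot for
 * output routes, a single slot for input routes.  cmpxchg() makes the
 * publish lockless; if another CPU won the race we return false and the
 * caller falls back to the uncached list.
 */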
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

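/* All IPv4 routes start life with DST_OBSOLETE_FORCE_CHK so that every
 * dst_check() funnels through ipv4_dst_check(); DST_NOCACHE marks
 * routes we do not intend to store in a FIB nexthop or exception slot.
 */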
static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      Per the RFC 1812 recommendation: if the source is
                 *      martian, the only hint is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag = 0;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy ARP. DNAT routes are always valid.
                 *
                 * The proxy ARP feature has been extended to allow ARP
                 * replies back out the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
1516                 if (out_dev == in_dev &&
1517                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1518                         err = -EINVAL;
1519                         goto cleanup;
1520                 }
1521         }
1522
1523         if (do_cache) {
1524                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1525                 if (rt_cache_valid(rth)) {
1526                         skb_dst_set_noref(skb, &rth->dst);
1527                         goto out;
1528                 }
1529         }
1530
1531         rth = rt_dst_alloc(out_dev->dev,
1532                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1533                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1534         if (!rth) {
1535                 err = -ENOBUFS;
1536                 goto cleanup;
1537         }
1538
1539         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1540         rth->rt_flags = flags;
1541         rth->rt_type = res->type;
1542         rth->rt_is_input = 1;
1543         rth->rt_iif     = 0;
1544         rth->rt_pmtu    = 0;
1545         rth->rt_gateway = 0;
1546         rth->rt_uses_gateway = 0;
1547         INIT_LIST_HEAD(&rth->rt_uncached);
1548         RT_CACHE_STAT_INC(in_slow_tot);
1549
1550         rth->dst.input = ip_forward;
1551         rth->dst.output = ip_output;
1552
1553         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1554         skb_dst_set(skb, &rth->dst);
1555 out:
1556         err = 0;
1557  cleanup:
1558         return err;
1559 }
1560
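/*
 * Editorial summary (added annotation, not in the original source): pick
 * a nexthop when CONFIG_IP_ROUTE_MULTIPATH is enabled and the route has
 * several, then build the forwarding dst via __mkroute_input().
 */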
1561 static int ip_mkroute_input(struct sk_buff *skb,
1562                             struct fib_result *res,
1563                             const struct flowi4 *fl4,
1564                             struct in_device *in_dev,
1565                             __be32 daddr, __be32 saddr, u32 tos)
1566 {
1567 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1568         if (res->fi && res->fi->fib_nhs > 1)
1569                 fib_select_multipath(res);
1570 #endif
1571
1572         /* create a routing cache entry */
1573         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1574 }
1575
1576 /*
1577  *      NOTE. We drop all packets that have a local source
1578  *      address, because every properly looped-back packet
1579  *      must already have the correct destination attached by the output routine.
1580  *
1581  *      This approach solves two big problems:
1582  *      1. Non-simplex devices are handled properly.
1583  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1584  *      called with rcu_read_lock()
1585  */
1586
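/*
 * Editorial orientation note: on a route-cache miss the usual 3.x
 * receive path is ip_rcv_finish() -> ip_route_input_noref() (below)
 * -> this function.
 */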
1587 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1588                                u8 tos, struct net_device *dev)
1589 {
1590         struct fib_result res;
1591         struct in_device *in_dev = __in_dev_get_rcu(dev);
1592         struct flowi4   fl4;
1593         unsigned int    flags = 0;
1594         u32             itag = 0;
1595         struct rtable   *rth;
1596         int             err = -EINVAL;
1597         struct net    *net = dev_net(dev);
1598         bool do_cache;
1599
1600         /* IP on this device is disabled. */
1601
1602         if (!in_dev)
1603                 goto out;
1604
1605         /* Check for the weirdest martians, which cannot be detected
1606            by fib_lookup.
1607          */
1608
1609         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1610                 goto martian_source;
1611
1612         res.fi = NULL;
1613         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1614                 goto brd_input;
1615
1616         /* Accept zero addresses only for the limited broadcast;
1617          * I do not even know whether to fix this or not. Waiting for complaints :-)
1618          */
1619         if (ipv4_is_zeronet(saddr))
1620                 goto martian_source;
1621
1622         if (ipv4_is_zeronet(daddr))
1623                 goto martian_destination;
1624
1625         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1626          * calling it at most once when daddr and/or saddr is a loopback address.
1627          */
1628         if (ipv4_is_loopback(daddr)) {
1629                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1630                         goto martian_destination;
1631         } else if (ipv4_is_loopback(saddr)) {
1632                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1633                         goto martian_source;
1634         }
1635
1636         /*
1637          *      Now we are ready to route the packet.
1638          */
1639         fl4.flowi4_oif = 0;
1640         fl4.flowi4_iif = dev->ifindex;
1641         fl4.flowi4_mark = skb->mark;
1642         fl4.flowi4_tos = tos;
1643         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1644         fl4.daddr = daddr;
1645         fl4.saddr = saddr;
1646         err = fib_lookup(net, &fl4, &res);
1647         if (err != 0)
1648                 goto no_route;
1649
1650         if (res.type == RTN_BROADCAST)
1651                 goto brd_input;
1652
1653         if (res.type == RTN_LOCAL) {
1654                 err = fib_validate_source(skb, saddr, daddr, tos,
1655                                           LOOPBACK_IFINDEX,
1656                                           dev, in_dev, &itag);
1657                 if (err < 0)
1658                         goto martian_source_keep_err;
1659                 goto local_input;
1660         }
1661
1662         if (!IN_DEV_FORWARD(in_dev))
1663                 goto no_route;
1664         if (res.type != RTN_UNICAST)
1665                 goto martian_destination;
1666
1667         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1668 out:    return err;
1669
1670 brd_input:
1671         if (skb->protocol != htons(ETH_P_IP))
1672                 goto e_inval;
1673
1674         if (!ipv4_is_zeronet(saddr)) {
1675                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1676                                           in_dev, &itag);
1677                 if (err < 0)
1678                         goto martian_source_keep_err;
1679         }
1680         flags |= RTCF_BROADCAST;
1681         res.type = RTN_BROADCAST;
1682         RT_CACHE_STAT_INC(in_brd);
1683
1684 local_input:
1685         do_cache = false;
1686         if (res.fi) {
1687                 if (!itag) {
1688                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1689                         if (rt_cache_valid(rth)) {
1690                                 skb_dst_set_noref(skb, &rth->dst);
1691                                 err = 0;
1692                                 goto out;
1693                         }
1694                         do_cache = true;
1695                 }
1696         }
1697
1698         rth = rt_dst_alloc(net->loopback_dev,
1699                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1700         if (!rth)
1701                 goto e_nobufs;
1702
1703         rth->dst.input = ip_local_deliver;
1704         rth->dst.output = ip_rt_bug;
1705 #ifdef CONFIG_IP_ROUTE_CLASSID
1706         rth->dst.tclassid = itag;
1707 #endif
1708
1709         rth->rt_genid = rt_genid(net);
1710         rth->rt_flags   = flags|RTCF_LOCAL;
1711         rth->rt_type    = res.type;
1712         rth->rt_is_input = 1;
1713         rth->rt_iif     = 0;
1714         rth->rt_pmtu    = 0;
1715         rth->rt_gateway = 0;
1716         rth->rt_uses_gateway = 0;
1717         INIT_LIST_HEAD(&rth->rt_uncached);
1718         RT_CACHE_STAT_INC(in_slow_tot);
1719         if (res.type == RTN_UNREACHABLE) {
1720                 rth->dst.input = ip_error;
1721                 rth->dst.error = -err;
1722                 rth->rt_flags   &= ~RTCF_LOCAL;
1723         }
1724         if (do_cache) {
1725                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1726                         rth->dst.flags |= DST_NOCACHE;
1727                         rt_add_uncached_list(rth);
1728                 }
1729         }
1730         skb_dst_set(skb, &rth->dst);
1731         err = 0;
1732         goto out;
1733
1734 no_route:
1735         RT_CACHE_STAT_INC(in_no_route);
1736         res.type = RTN_UNREACHABLE;
1737         if (err == -ESRCH)
1738                 err = -ENETUNREACH;
1739         goto local_input;
1740
1741         /*
1742          *      Do not cache martian addresses: they should be logged (RFC1812)
1743          */
1744 martian_destination:
1745         RT_CACHE_STAT_INC(in_martian_dst);
1746 #ifdef CONFIG_IP_ROUTE_VERBOSE
1747         if (IN_DEV_LOG_MARTIANS(in_dev))
1748                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1749                                      &daddr, &saddr, dev->name);
1750 #endif
1751
1752 e_inval:
1753         err = -EINVAL;
1754         goto out;
1755
1756 e_nobufs:
1757         err = -ENOBUFS;
1758         goto out;
1759
1760 martian_source:
1761         err = -EINVAL;
1762 martian_source_keep_err:
1763         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1764         goto out;
1765 }
1766
1767 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1768                          u8 tos, struct net_device *dev)
1769 {
1770         int res;
1771
1772         rcu_read_lock();
1773
1774         /* Multicast recognition logic was moved from the route cache to here.
1775            The problem was that too many Ethernet cards have broken/missing
1776            hardware multicast filters :-( As a result, a host on a multicast
1777            network acquires a lot of useless route cache entries, e.g. for
1778            SDR messages from all over the world. Now we try to get rid of them.
1779            Really, provided the software IP multicast filter is organized
1780            reasonably (at least, hashed), it does not cause a slowdown
1781            compared with route cache reject entries.
1782            Note that multicast routers are not affected, because a
1783            route cache entry is created eventually.
1784          */
1785         if (ipv4_is_multicast(daddr)) {
1786                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1787
1788                 if (in_dev) {
1789                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1790                                                   ip_hdr(skb)->protocol);
1791                         if (our
1792 #ifdef CONFIG_IP_MROUTE
1793                                 ||
1794                             (!ipv4_is_local_multicast(daddr) &&
1795                              IN_DEV_MFORWARD(in_dev))
1796 #endif
1797                            ) {
1798                                 int res = ip_route_input_mc(skb, daddr, saddr,
1799                                                             tos, dev, our);
1800                                 rcu_read_unlock();
1801                                 return res;
1802                         }
1803                 }
1804                 rcu_read_unlock();
1805                 return -EINVAL;
1806         }
1807         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1808         rcu_read_unlock();
1809         return res;
1810 }
1811 EXPORT_SYMBOL(ip_route_input_noref);
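
/*
 * Editorial usage sketch, not part of the original file; the "example_"
 * name is hypothetical. A receive-path caller hands the packet's
 * addresses to ip_route_input_noref(); the dst it attaches is not
 * refcounted, so this is only safe in contexts (e.g. softirq input
 * processing) where the route cannot be freed under the caller.
 */
static int example_route_incoming(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
				   iph->tos, dev);
	if (err)
		kfree_skb(skb);	/* martian, no route, or out of memory */
	return err;
}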
1812
1813 /* called with rcu_read_lock() */
1814 static struct rtable *__mkroute_output(const struct fib_result *res,
1815                                        const struct flowi4 *fl4, int orig_oif,
1816                                        struct net_device *dev_out,
1817                                        unsigned int flags)
1818 {
1819         struct fib_info *fi = res->fi;
1820         struct fib_nh_exception *fnhe;
1821         struct in_device *in_dev;
1822         u16 type = res->type;
1823         struct rtable *rth;
1824         bool do_cache;
1825
1826         in_dev = __in_dev_get_rcu(dev_out);
1827         if (!in_dev)
1828                 return ERR_PTR(-EINVAL);
1829
1830         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1831                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1832                         return ERR_PTR(-EINVAL);
1833
1834         if (ipv4_is_lbcast(fl4->daddr))
1835                 type = RTN_BROADCAST;
1836         else if (ipv4_is_multicast(fl4->daddr))
1837                 type = RTN_MULTICAST;
1838         else if (ipv4_is_zeronet(fl4->daddr))
1839                 return ERR_PTR(-EINVAL);
1840
1841         if (dev_out->flags & IFF_LOOPBACK)
1842                 flags |= RTCF_LOCAL;
1843
1844         do_cache = true;
1845         if (type == RTN_BROADCAST) {
1846                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1847                 fi = NULL;
1848         } else if (type == RTN_MULTICAST) {
1849                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1850                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1851                                      fl4->flowi4_proto))
1852                         flags &= ~RTCF_LOCAL;
1853                 else
1854                         do_cache = false;
1855                 /* If a multicast route does not exist, use the
1856                  * default one, but do not use a gateway in this case.
1857                  * Yes, it is a hack.
1858                  */
1859                 if (fi && res->prefixlen < 4)
1860                         fi = NULL;
1861         }
1862
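	/*
	 * Editorial note: cached output dsts live either in the nexthop
	 * exception entry (fnhe_rth, used for destinations affected by
	 * PMTU/redirect state) or in the per-cpu nh_pcpu_rth_output slot;
	 * a valid cached entry short-circuits the allocation below.
	 */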
1863         fnhe = NULL;
1864         do_cache &= fi != NULL;
1865         if (do_cache) {
1866                 struct rtable __rcu **prth;
1867                 struct fib_nh *nh = &FIB_RES_NH(*res);
1868
1869                 fnhe = find_exception(nh, fl4->daddr);
1870                 if (fnhe)
1871                         prth = &fnhe->fnhe_rth;
1872                 else {
1873                         if (unlikely(fl4->flowi4_flags &
1874                                      FLOWI_FLAG_KNOWN_NH &&
1875                                      !(nh->nh_gw &&
1876                                        nh->nh_scope == RT_SCOPE_LINK))) {
1877                                 do_cache = false;
1878                                 goto add;
1879                         }
1880                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1881                 }
1882                 rth = rcu_dereference(*prth);
1883                 if (rt_cache_valid(rth)) {
1884                         dst_hold(&rth->dst);
1885                         return rth;
1886                 }
1887         }
1888
1889 add:
1890         rth = rt_dst_alloc(dev_out,
1891                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1892                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1893                            do_cache);
1894         if (!rth)
1895                 return ERR_PTR(-ENOBUFS);
1896
1897         rth->dst.output = ip_output;
1898
1899         rth->rt_genid = rt_genid(dev_net(dev_out));
1900         rth->rt_flags   = flags;
1901         rth->rt_type    = type;
1902         rth->rt_is_input = 0;
1903         rth->rt_iif     = orig_oif ? : 0;
1904         rth->rt_pmtu    = 0;
1905         rth->rt_gateway = 0;
1906         rth->rt_uses_gateway = 0;
1907         INIT_LIST_HEAD(&rth->rt_uncached);
1908
1909         RT_CACHE_STAT_INC(out_slow_tot);
1910
1911         if (flags & RTCF_LOCAL)
1912                 rth->dst.input = ip_local_deliver;
1913         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1914                 if (flags & RTCF_LOCAL &&
1915                     !(dev_out->flags & IFF_LOOPBACK)) {
1916                         rth->dst.output = ip_mc_output;
1917                         RT_CACHE_STAT_INC(out_slow_mc);
1918                 }
1919 #ifdef CONFIG_IP_MROUTE
1920                 if (type == RTN_MULTICAST) {
1921                         if (IN_DEV_MFORWARD(in_dev) &&
1922                             !ipv4_is_local_multicast(fl4->daddr)) {
1923                                 rth->dst.input = ip_mr_input;
1924                                 rth->dst.output = ip_mc_output;
1925                         }
1926                 }
1927 #endif
1928         }
1929
1930         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1931
1932         return rth;
1933 }
1934
1935 /*
1936  * Major route resolver routine.
1937  */
1938
1939 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1940 {
1941         struct net_device *dev_out = NULL;
1942         __u8 tos = RT_FL_TOS(fl4);
1943         unsigned int flags = 0;
1944         struct fib_result res;
1945         struct rtable *rth;
1946         int orig_oif;
1947
1948         res.tclassid    = 0;
1949         res.fi          = NULL;
1950         res.table       = NULL;
1951
1952         orig_oif = fl4->flowi4_oif;
1953
1954         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1955         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1956         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1957                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1958
1959         rcu_read_lock();
1960         if (fl4->saddr) {
1961                 rth = ERR_PTR(-EINVAL);
1962                 if (ipv4_is_multicast(fl4->saddr) ||
1963                     ipv4_is_lbcast(fl4->saddr) ||
1964                     ipv4_is_zeronet(fl4->saddr))
1965                         goto out;
1966
1967                 /* I removed the check for oif == dev_out->oif here.
1968                    It was wrong for two reasons:
1969                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
1970                       is assigned to multiple interfaces.
1971                    2. Moreover, we are allowed to send packets with the saddr
1972                       of another iface. --ANK
1973                  */
1974
1975                 if (fl4->flowi4_oif == 0 &&
1976                     (ipv4_is_multicast(fl4->daddr) ||
1977                      ipv4_is_lbcast(fl4->daddr))) {
1978                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1979                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1980                         if (dev_out == NULL)
1981                                 goto out;
1982
1983                         /* Special hack: the user can direct multicasts
1984                            and limited broadcast via the necessary interface
1985                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1986                            This hack is not just for fun, it allows
1987                            vic, vat and friends to work.
1988                            They bind a socket to loopback, set the ttl to zero
1989                            and expect that it will work.
1990                            From the viewpoint of the routing cache they are broken,
1991                            because we are not allowed to build a multicast path
1992                            with a loopback source addr (look, the routing cache
1993                            cannot know that the ttl is zero, so that the packet
1994                            will never leave this host and the route is valid).
1995                            Luckily, this hack is a good workaround.
1996                          */
1997
1998                         fl4->flowi4_oif = dev_out->ifindex;
1999                         goto make_route;
2000                 }
2001
2002                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2003                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2004                         if (!__ip_dev_find(net, fl4->saddr, false))
2005                                 goto out;
2006                 }
2007         }
2008
2009
2010         if (fl4->flowi4_oif) {
2011                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2012                 rth = ERR_PTR(-ENODEV);
2013                 if (dev_out == NULL)
2014                         goto out;
2015
2016                 /* RACE: Check return value of inet_select_addr instead. */
2017                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2018                         rth = ERR_PTR(-ENETUNREACH);
2019                         goto out;
2020                 }
2021                 if (ipv4_is_local_multicast(fl4->daddr) ||
2022                     ipv4_is_lbcast(fl4->daddr)) {
2023                         if (!fl4->saddr)
2024                                 fl4->saddr = inet_select_addr(dev_out, 0,
2025                                                               RT_SCOPE_LINK);
2026                         goto make_route;
2027                 }
2028                 if (!fl4->saddr) {
2029                         if (ipv4_is_multicast(fl4->daddr))
2030                                 fl4->saddr = inet_select_addr(dev_out, 0,
2031                                                               fl4->flowi4_scope);
2032                         else if (!fl4->daddr)
2033                                 fl4->saddr = inet_select_addr(dev_out, 0,
2034                                                               RT_SCOPE_HOST);
2035                 }
2036         }
2037
2038         if (!fl4->daddr) {
2039                 fl4->daddr = fl4->saddr;
2040                 if (!fl4->daddr)
2041                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2042                 dev_out = net->loopback_dev;
2043                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2044                 res.type = RTN_LOCAL;
2045                 flags |= RTCF_LOCAL;
2046                 goto make_route;
2047         }
2048
2049         if (fib_lookup(net, fl4, &res)) {
2050                 res.fi = NULL;
2051                 res.table = NULL;
2052                 if (fl4->flowi4_oif) {
2053                         /* Apparently, the routing tables are wrong. Assume
2054                            that the destination is on-link.
2055
2056                            WHY? DW.
2057                            Because we are allowed to send to an iface
2058                            even if it has NO routes and NO assigned
2059                            addresses. When oif is specified, the routing
2060                            tables are looked up with only one purpose:
2061                            to check whether the destination is gatewayed
2062                            rather than direct. Moreover, if MSG_DONTROUTE
2063                            is set, we send the packet, ignoring both the
2064                            routing tables and the ifaddr state. --ANK
2065
2066
2067                            We could do this even if oif is unknown,
2068                            as is likely with IPv6, but we do not.
2069                          */
2070
2071                         if (fl4->saddr == 0)
2072                                 fl4->saddr = inet_select_addr(dev_out, 0,
2073                                                               RT_SCOPE_LINK);
2074                         res.type = RTN_UNICAST;
2075                         goto make_route;
2076                 }
2077                 rth = ERR_PTR(-ENETUNREACH);
2078                 goto out;
2079         }
2080
2081         if (res.type == RTN_LOCAL) {
2082                 if (!fl4->saddr) {
2083                         if (res.fi->fib_prefsrc)
2084                                 fl4->saddr = res.fi->fib_prefsrc;
2085                         else
2086                                 fl4->saddr = fl4->daddr;
2087                 }
2088                 dev_out = net->loopback_dev;
2089                 fl4->flowi4_oif = dev_out->ifindex;
2090                 flags |= RTCF_LOCAL;
2091                 goto make_route;
2092         }
2093
2094 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2095         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2096                 fib_select_multipath(&res);
2097         else
2098 #endif
2099         if (!res.prefixlen &&
2100             res.table->tb_num_default > 1 &&
2101             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2102                 fib_select_default(&res);
2103
2104         if (!fl4->saddr)
2105                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2106
2107         dev_out = FIB_RES_DEV(res);
2108         fl4->flowi4_oif = dev_out->ifindex;
2109
2110
2111 make_route:
2112         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2113
2114 out:
2115         rcu_read_unlock();
2116         return rth;
2117 }
2118 EXPORT_SYMBOL_GPL(__ip_route_output_key);
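
/*
 * Editorial usage sketch (hypothetical helper, not in the original
 * source): a minimal output lookup fills a flowi4 key and must check
 * for an ERR_PTR() result; __ip_route_output_key() never returns NULL.
 */
static struct rtable *example_route_outgoing(struct net *net, __be32 daddr)
{
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	/* fl4.saddr left as 0: the resolver picks a source address via
	 * inet_select_addr()/FIB_RES_PREFSRC in the slow path above.
	 */
	return __ip_route_output_key(net, &fl4);
}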
2119
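/*
 * Editorial note: the blackhole dst_ops below back ipv4_blackhole_route().
 * Every method is a no-op or returns NULL, and input/output are wired to
 * dst_discard, so such a dst can be held safely (e.g. by xfrm callers)
 * while silently dropping anything sent through it.
 */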
2120 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2121 {
2122         return NULL;
2123 }
2124
2125 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2126 {
2127         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2128
2129         return mtu ? : dst->dev->mtu;
2130 }
2131
2132 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2133                                           struct sk_buff *skb, u32 mtu)
2134 {
2135 }
2136
2137 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2138                                        struct sk_buff *skb)
2139 {
2140 }
2141
2142 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2143                                           unsigned long old)
2144 {
2145         return NULL;
2146 }
2147
2148 static struct dst_ops ipv4_dst_blackhole_ops = {
2149         .family                 =       AF_INET,
2150         .protocol               =       cpu_to_be16(ETH_P_IP),
2151         .check                  =       ipv4_blackhole_dst_check,
2152         .mtu                    =       ipv4_blackhole_mtu,
2153         .default_advmss         =       ipv4_default_advmss,
2154         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2155         .redirect               =       ipv4_rt_blackhole_redirect,
2156         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2157         .neigh_lookup           =       ipv4_neigh_lookup,
2158 };
2159
2160 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2161 {
2162         struct rtable *ort = (struct rtable *) dst_orig;
2163         struct rtable *rt;
2164
2165         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2166         if (rt) {
2167                 struct dst_entry *new = &rt->dst;
2168
2169                 new->__use = 1;
2170                 new->input = dst_discard;
2171                 new->output = dst_discard;
2172
2173                 new->dev = ort->dst.dev;
2174                 if (new->dev)
2175                         dev_hold(new->dev);
2176
2177                 rt->rt_is_input = ort->rt_is_input;
2178                 rt->rt_iif = ort->rt_iif;
2179                 rt->rt_pmtu = ort->rt_pmtu;
2180
2181                 rt->rt_genid = rt_genid(net);
2182                 rt->rt_flags = ort->rt_flags;
2183                 rt->rt_type = ort->rt_type;
2184                 rt->rt_gateway = ort->rt_gateway;
2185                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2186
2187                 INIT_LIST_HEAD(&rt->rt_uncached);
2188
2189                 dst_free(new);
2190         }
2191
2192         dst_release(dst_orig);
2193
2194         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2195 }
2196
2197 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2198                                     struct sock *sk)
2199 {
2200         struct rtable *rt = __ip_route_output_key(net, flp4);
2201
2202         if (IS_ERR(rt))
2203                 return rt;
2204
2205         if (flp4->flowi4_proto)
2206                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2207                                                    flowi4_to_flowi(flp4),
2208                                                    sk, 0);
2209
2210         return rt;
2211 }
2212 EXPORT_SYMBOL_GPL(ip_route_output_flow);
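
/*
 * Editorial note: callers that may hit IPsec policy should use
 * ip_route_output_flow() with flowi4_proto set, so xfrm_lookup() above
 * gets a chance to transform the route. A hedged sketch:
 *
 *	fl4.flowi4_proto = IPPROTO_UDP;
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */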
2213
2214 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2215                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2216                         u32 seq, int event, int nowait, unsigned int flags)
2217 {
2218         struct rtable *rt = skb_rtable(skb);
2219         struct rtmsg *r;
2220         struct nlmsghdr *nlh;
2221         unsigned long expires = 0;
2222         u32 error;
2223         u32 metrics[RTAX_MAX];
2224
2225         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2226         if (nlh == NULL)
2227                 return -EMSGSIZE;
2228
2229         r = nlmsg_data(nlh);
2230         r->rtm_family    = AF_INET;
2231         r->rtm_dst_len  = 32;
2232         r->rtm_src_len  = 0;
2233         r->rtm_tos      = fl4->flowi4_tos;
2234         r->rtm_table    = RT_TABLE_MAIN;
2235         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2236                 goto nla_put_failure;
2237         r->rtm_type     = rt->rt_type;
2238         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2239         r->rtm_protocol = RTPROT_UNSPEC;
2240         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2241         if (rt->rt_flags & RTCF_NOTIFY)
2242                 r->rtm_flags |= RTM_F_NOTIFY;
2243
2244         if (nla_put_be32(skb, RTA_DST, dst))
2245                 goto nla_put_failure;
2246         if (src) {
2247                 r->rtm_src_len = 32;
2248                 if (nla_put_be32(skb, RTA_SRC, src))
2249                         goto nla_put_failure;
2250         }
2251         if (rt->dst.dev &&
2252             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2253                 goto nla_put_failure;
2254 #ifdef CONFIG_IP_ROUTE_CLASSID
2255         if (rt->dst.tclassid &&
2256             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2257                 goto nla_put_failure;
2258 #endif
2259         if (!rt_is_input_route(rt) &&
2260             fl4->saddr != src) {
2261                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2262                         goto nla_put_failure;
2263         }
2264         if (rt->rt_uses_gateway &&
2265             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2266                 goto nla_put_failure;
2267
2268         expires = rt->dst.expires;
2269         if (expires) {
2270                 unsigned long now = jiffies;
2271
2272                 if (time_before(now, expires))
2273                         expires -= now;
2274                 else
2275                         expires = 0;
2276         }
2277
2278         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2279         if (rt->rt_pmtu && expires)
2280                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2281         if (rtnetlink_put_metrics(skb, metrics) < 0)
2282                 goto nla_put_failure;
2283
2284         if (fl4->flowi4_mark &&
2285             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2286                 goto nla_put_failure;
2287
2288         error = rt->dst.error;
2289
2290         if (rt_is_input_route(rt)) {
2291 #ifdef CONFIG_IP_MROUTE
2292                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2293                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2294                         int err = ipmr_get_route(net, skb,
2295                                                  fl4->saddr, fl4->daddr,
2296                                                  r, nowait);
2297                         if (err <= 0) {
2298                                 if (!nowait) {
2299                                         if (err == 0)
2300                                                 return 0;
2301                                         goto nla_put_failure;
2302                                 } else {
2303                                         if (err == -EMSGSIZE)
2304                                                 goto nla_put_failure;
2305                                         error = err;
2306                                 }
2307                         }
2308                 } else
2309 #endif
2310                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2311                                 goto nla_put_failure;
2312         }
2313
2314         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2315                 goto nla_put_failure;
2316
2317         return nlmsg_end(skb, nlh);
2318
2319 nla_put_failure:
2320         nlmsg_cancel(skb, nlh);
2321         return -EMSGSIZE;
2322 }
2323
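/*
 * Editorial note: inet_rtm_getroute() below services RTM_GETROUTE
 * requests (what "ip route get <addr>" sends), resolving the route just
 * as the stack would and serializing it with rt_fill_info() above.
 */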
2324 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2325 {
2326         struct net *net = sock_net(in_skb->sk);
2327         struct rtmsg *rtm;
2328         struct nlattr *tb[RTA_MAX+1];
2329         struct rtable *rt = NULL;
2330         struct flowi4 fl4;
2331         __be32 dst = 0;
2332         __be32 src = 0;
2333         u32 iif;
2334         int err;
2335         int mark;
2336         struct sk_buff *skb;
2337
2338         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2339         if (err < 0)
2340                 goto errout;
2341
2342         rtm = nlmsg_data(nlh);
2343
2344         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2345         if (skb == NULL) {
2346                 err = -ENOBUFS;
2347                 goto errout;
2348         }
2349
2350         /* Reserve room for dummy headers; this skb can pass
2351            through a good chunk of the routing engine.
2352          */
2353         skb_reset_mac_header(skb);
2354         skb_reset_network_header(skb);
2355
2356         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2357         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2358         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2359
2360         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2361         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2362         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2363         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2364
2365         memset(&fl4, 0, sizeof(fl4));
2366         fl4.daddr = dst;
2367         fl4.saddr = src;
2368         fl4.flowi4_tos = rtm->rtm_tos;
2369         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2370         fl4.flowi4_mark = mark;
2371
2372         if (iif) {
2373                 struct net_device *dev;
2374
2375                 dev = __dev_get_by_index(net, iif);
2376                 if (dev == NULL) {
2377                         err = -ENODEV;
2378                         goto errout_free;
2379                 }
2380
2381                 skb->protocol   = htons(ETH_P_IP);
2382                 skb->dev        = dev;
2383                 skb->mark       = mark;
2384                 local_bh_disable();
2385                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2386                 local_bh_enable();
2387
2388                 rt = skb_rtable(skb);
2389                 if (err == 0 && rt->dst.error)
2390                         err = -rt->dst.error;
2391         } else {
2392                 rt = ip_route_output_key(net, &fl4);
2393
2394                 err = 0;
2395                 if (IS_ERR(rt))
2396                         err = PTR_ERR(rt);
2397         }
2398
2399         if (err)
2400                 goto errout_free;
2401
2402         skb_dst_set(skb, &rt->dst);
2403         if (rtm->rtm_flags & RTM_F_NOTIFY)
2404                 rt->rt_flags |= RTCF_NOTIFY;
2405
2406         err = rt_fill_info(net, dst, src, &fl4, skb,
2407                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2408                            RTM_NEWROUTE, 0, 0);
2409         if (err <= 0)
2410                 goto errout_free;
2411
2412         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2413 errout:
2414         return err;
2415
2416 errout_free:
2417         kfree_skb(skb);
2418         goto errout;
2419 }
2420
2421 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2422 {
2423         return skb->len;
2424 }
2425
2426 void ip_rt_multicast_event(struct in_device *in_dev)
2427 {
2428         rt_cache_flush(dev_net(in_dev->dev));
2429 }
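
/*
 * Editorial note: rt_cache_flush() simply bumps net->rt_genid; any cached
 * rtable whose rt_genid no longer matches fails rt_cache_valid() and is
 * rebuilt lazily on the next lookup.
 */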
2430
2431 #ifdef CONFIG_SYSCTL
2432 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2433 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
2434 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2435 static int ip_rt_gc_elasticity __read_mostly    = 8;
2436
2437 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2438                                         void __user *buffer,
2439                                         size_t *lenp, loff_t *ppos)
2440 {
2441         if (write) {
2442                 rt_cache_flush((struct net *)__ctl->extra1);
2443                 return 0;
2444         }
2445
2446         return -EINVAL;
2447 }
2448
2449 static ctl_table ipv4_route_table[] = {
2450         {
2451                 .procname       = "gc_thresh",
2452                 .data           = &ipv4_dst_ops.gc_thresh,
2453                 .maxlen         = sizeof(int),
2454                 .mode           = 0644,
2455                 .proc_handler   = proc_dointvec,
2456         },
2457         {
2458                 .procname       = "max_size",
2459                 .data           = &ip_rt_max_size,
2460                 .maxlen         = sizeof(int),
2461                 .mode           = 0644,
2462                 .proc_handler   = proc_dointvec,
2463         },
2464         {
2465                 /*  Deprecated. Use gc_min_interval_ms */
2466
2467                 .procname       = "gc_min_interval",
2468                 .data           = &ip_rt_gc_min_interval,
2469                 .maxlen         = sizeof(int),
2470                 .mode           = 0644,
2471                 .proc_handler   = proc_dointvec_jiffies,
2472         },
2473         {
2474                 .procname       = "gc_min_interval_ms",
2475                 .data           = &ip_rt_gc_min_interval,
2476                 .maxlen         = sizeof(int),
2477                 .mode           = 0644,
2478                 .proc_handler   = proc_dointvec_ms_jiffies,
2479         },
2480         {
2481                 .procname       = "gc_timeout",
2482                 .data           = &ip_rt_gc_timeout,
2483                 .maxlen         = sizeof(int),
2484                 .mode           = 0644,
2485                 .proc_handler   = proc_dointvec_jiffies,
2486         },
2487         {
2488                 .procname       = "gc_interval",
2489                 .data           = &ip_rt_gc_interval,
2490                 .maxlen         = sizeof(int),
2491                 .mode           = 0644,
2492                 .proc_handler   = proc_dointvec_jiffies,
2493         },
2494         {
2495                 .procname       = "redirect_load",
2496                 .data           = &ip_rt_redirect_load,
2497                 .maxlen         = sizeof(int),
2498                 .mode           = 0644,
2499                 .proc_handler   = proc_dointvec,
2500         },
2501         {
2502                 .procname       = "redirect_number",
2503                 .data           = &ip_rt_redirect_number,
2504                 .maxlen         = sizeof(int),
2505                 .mode           = 0644,
2506                 .proc_handler   = proc_dointvec,
2507         },
2508         {
2509                 .procname       = "redirect_silence",
2510                 .data           = &ip_rt_redirect_silence,
2511                 .maxlen         = sizeof(int),
2512                 .mode           = 0644,
2513                 .proc_handler   = proc_dointvec,
2514         },
2515         {
2516                 .procname       = "error_cost",
2517                 .data           = &ip_rt_error_cost,
2518                 .maxlen         = sizeof(int),
2519                 .mode           = 0644,
2520                 .proc_handler   = proc_dointvec,
2521         },
2522         {
2523                 .procname       = "error_burst",
2524                 .data           = &ip_rt_error_burst,
2525                 .maxlen         = sizeof(int),
2526                 .mode           = 0644,
2527                 .proc_handler   = proc_dointvec,
2528         },
2529         {
2530                 .procname       = "gc_elasticity",
2531                 .data           = &ip_rt_gc_elasticity,
2532                 .maxlen         = sizeof(int),
2533                 .mode           = 0644,
2534                 .proc_handler   = proc_dointvec,
2535         },
2536         {
2537                 .procname       = "mtu_expires",
2538                 .data           = &ip_rt_mtu_expires,
2539                 .maxlen         = sizeof(int),
2540                 .mode           = 0644,
2541                 .proc_handler   = proc_dointvec_jiffies,
2542         },
2543         {
2544                 .procname       = "min_pmtu",
2545                 .data           = &ip_rt_min_pmtu,
2546                 .maxlen         = sizeof(int),
2547                 .mode           = 0644,
2548                 .proc_handler   = proc_dointvec,
2549         },
2550         {
2551                 .procname       = "min_adv_mss",
2552                 .data           = &ip_rt_min_advmss,
2553                 .maxlen         = sizeof(int),
2554                 .mode           = 0644,
2555                 .proc_handler   = proc_dointvec,
2556         },
2557         { }
2558 };
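
/*
 * Editorial note: the tunables above appear under
 * /proc/sys/net/ipv4/route/, e.g. (illustrative shell):
 *
 *	cat /proc/sys/net/ipv4/route/min_pmtu
 *	echo 512 > /proc/sys/net/ipv4/route/min_adv_mss
 */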
2559
2560 static struct ctl_table ipv4_route_flush_table[] = {
2561         {
2562                 .procname       = "flush",
2563                 .maxlen         = sizeof(int),
2564                 .mode           = 0200,
2565                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2566         },
2567         { },
2568 };
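
/*
 * Editorial note: the write-only "flush" entry maps to
 * ipv4_sysctl_rtcache_flush() above; e.g. (illustrative shell):
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 */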
2569
2570 static __net_init int sysctl_route_net_init(struct net *net)
2571 {
2572         struct ctl_table *tbl;
2573
2574         tbl = ipv4_route_flush_table;
2575         if (!net_eq(net, &init_net)) {
2576                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2577                 if (tbl == NULL)
2578                         goto err_dup;
2579
2580                 /* Don't export sysctls to unprivileged users */
2581                 if (net->user_ns != &init_user_ns)
2582                         tbl[0].procname = NULL;
2583         }
2584         tbl[0].extra1 = net;
2585
2586         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2587         if (net->ipv4.route_hdr == NULL)
2588                 goto err_reg;
2589         return 0;
2590
2591 err_reg:
2592         if (tbl != ipv4_route_flush_table)
2593                 kfree(tbl);
2594 err_dup:
2595         return -ENOMEM;
2596 }
2597
2598 static __net_exit void sysctl_route_net_exit(struct net *net)
2599 {
2600         struct ctl_table *tbl;
2601
2602         tbl = net->ipv4.route_hdr->ctl_table_arg;
2603         unregister_net_sysctl_table(net->ipv4.route_hdr);
2604         BUG_ON(tbl == ipv4_route_flush_table);
2605         kfree(tbl);
2606 }
2607
2608 static __net_initdata struct pernet_operations sysctl_route_ops = {
2609         .init = sysctl_route_net_init,
2610         .exit = sysctl_route_net_exit,
2611 };
2612 #endif
2613
2614 static __net_init int rt_genid_init(struct net *net)
2615 {
2616         atomic_set(&net->rt_genid, 0);
2617         get_random_bytes(&net->ipv4.dev_addr_genid,
2618                          sizeof(net->ipv4.dev_addr_genid));
2619         return 0;
2620 }
2621
2622 static __net_initdata struct pernet_operations rt_genid_ops = {
2623         .init = rt_genid_init,
2624 };
2625
2626 static int __net_init ipv4_inetpeer_init(struct net *net)
2627 {
2628         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2629
2630         if (!bp)
2631                 return -ENOMEM;
2632         inet_peer_base_init(bp);
2633         net->ipv4.peers = bp;
2634         return 0;
2635 }
2636
2637 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2638 {
2639         struct inet_peer_base *bp = net->ipv4.peers;
2640
2641         net->ipv4.peers = NULL;
2642         inetpeer_invalidate_tree(bp);
2643         kfree(bp);
2644 }
2645
2646 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2647         .init   =       ipv4_inetpeer_init,
2648         .exit   =       ipv4_inetpeer_exit,
2649 };
2650
2651 #ifdef CONFIG_IP_ROUTE_CLASSID
2652 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2653 #endif /* CONFIG_IP_ROUTE_CLASSID */
2654
2655 int __init ip_rt_init(void)
2656 {
2657         int rc = 0;
2658
2659 #ifdef CONFIG_IP_ROUTE_CLASSID
2660         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2661         if (!ip_rt_acct)
2662                 panic("IP: failed to allocate ip_rt_acct\n");
2663 #endif
2664
2665         ipv4_dst_ops.kmem_cachep =
2666                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2667                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2668
2669         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2670
2671         if (dst_entries_init(&ipv4_dst_ops) < 0)
2672                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2673
2674         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2675                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2676
2677         ipv4_dst_ops.gc_thresh = ~0;
2678         ip_rt_max_size = INT_MAX;
2679
2680         devinet_init();
2681         ip_fib_init();
2682
2683         if (ip_rt_proc_init())
2684                 pr_err("Unable to create route proc files\n");
2685 #ifdef CONFIG_XFRM
2686         xfrm_init();
2687         xfrm4_init();
2688 #endif
2689         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2690
2691 #ifdef CONFIG_SYSCTL
2692         register_pernet_subsys(&sysctl_route_ops);
2693 #endif
2694         register_pernet_subsys(&rt_genid_ops);
2695         register_pernet_subsys(&ipv4_inetpeer_ops);
2696         return rc;
2697 }
2698
2699 #ifdef CONFIG_SYSCTL
2700 /*
2701  * We really need to sanitize the damn ipv4 init order, then all
2702  * this nonsense will go away.
2703  */
2704 void __init ip_static_sysctl_init(void)
2705 {
2706         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2707 }
2708 #endif