Merge commit 'ed30f24e8d07d30aa3e69d1f508f4d7bd2e8ea14' of git://git.linaro.org/landi...
[firefly-linux-kernel-4.4.55.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111
/* Mask the flow TOS down to the routing-relevant bits plus RTO_ONLINK. */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Cap cached route MTUs just below 64 KiB. */
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

/* Rate-limiting tunables (exposed via net.ipv4.route.* sysctls). */
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
128
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		ipv4_dst_destroy(struct dst_entry *dst);

/* Nothing to do when a device underlying a dst goes down. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

/* Metrics of IPv4 dsts are never COW'd here; getting called is a bug. */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
154
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);

/* IPv4's hooks into the generic destination-cache layer. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
175
#define ECN_OR_COST(class)	TC_PRIO_##class

/*
 * Lookup table mapping IP TOS values to packet-scheduler priority
 * bands; odd entries are the ECN variants of the preceding even ones
 * (here identical, via ECN_OR_COST).
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
197
/* Per-CPU counters shown in /proc/net/stat/rt_cache (see rt_cpu_seq_*). */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
200
201 #ifdef CONFIG_PROC_FS
/*
 * /proc/net/rt_cache: no cache entries are ever listed here any more;
 * only the legacy header line is emitted, for tool compatibility.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

/* No entries to iterate: bump the position and end the walk. */
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

/* Print only the historical column header. */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt_cache_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
248
249
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252         int cpu;
253
254         if (*pos == 0)
255                 return SEQ_START_TOKEN;
256
257         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258                 if (!cpu_possible(cpu))
259                         continue;
260                 *pos = cpu+1;
261                 return &per_cpu(rt_cache_stat, cpu);
262         }
263         return NULL;
264 }
265
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268         int cpu;
269
270         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271                 if (!cpu_possible(cpu))
272                         continue;
273                 *pos = cpu+1;
274                 return &per_cpu(rt_cache_stat, cpu);
275         }
276         return NULL;
277
278 }
279
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

/* Emit the header row, or one row of counters for a single CPU. */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	/* "entries" is the global dst count; the rest are this CPU's. */
	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt_cpu_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
339
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * /proc/net/rt_acct: sum the per-CPU ip_rt_acct[256] counters into one
 * temporary table and dump it as raw binary.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif
378
/* Create this namespace's proc entries; unwind on partial failure. */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
409
/* Remove everything ip_rt_do_proc_init() created for this namespace. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* CONFIG_PROC_FS=n: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
435
/* A route is stale once its namespace generation counter has moved on. */
static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/* Invalidate all cached routes in @net by bumping the generation id. */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}
445
/* Find (or create) the ARP neighbour for @dst: prefer the route's
 * gateway, then the packet's destination, then the supplied @daddr.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	/* Not in the neighbour table yet: create a fresh entry. */
	return neigh_create(&arp_tbl, pkey, dev);
}
466
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	/* NOTE(review): IDs generated this way are fairly predictable per
	 * destination; later kernels hardened IP id generation.
	 */
	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

/* Choose the IP identification field: use the per-destination inetpeer
 * counter when available, else the global fallback generator above.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct net *net = dev_net(dst->dev);
	struct inet_peer *peer;

	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
	if (peer) {
		iph->id = htons(inet_getid(peer, more));
		inet_putpeer(peer);
		return;
	}

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
502
/* Initialize @fl4 for an output route lookup.  When @sk is given, its
 * bound device, mark, TOS and protocol override the packet-derived
 * values passed in.
 */
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
}
521
522 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
523                                const struct sock *sk)
524 {
525         const struct iphdr *iph = ip_hdr(skb);
526         int oif = skb->dev->ifindex;
527         u8 tos = RT_TOS(iph->tos);
528         u8 prot = iph->protocol;
529         u32 mark = skb->mark;
530
531         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
532 }
533
/* Derive a flow key purely from connected-socket state (no packet). */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	/* With a source-route option the first hop (faddr) replaces the
	 * final destination for routing purposes.
	 */
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0);
	rcu_read_unlock();
}
551
/* Build a flow key from the packet when one is available, otherwise
 * fall back to the socket's connection state.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb)
		build_sk_flow_key(fl4, sk);
	else
		build_skb_flow_key(fl4, skb, sk);
}
560
/* Free a route after the current RCU grace period expires. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

/* Serializes all updates to the per-nexthop exception hash tables. */
static DEFINE_SPINLOCK(fnhe_lock);
567
/* Select the least-recently-stamped exception in @hash's chain for
 * reuse, releasing any cached route it holds.  Called with fnhe_lock
 * held (see update_or_create_fnhe()); the chain must be non-empty.
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;
	struct rtable *orig;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	orig = rcu_dereference(oldest->fnhe_rth);
	if (orig) {
		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
		rt_free(orig);
	}
	return oldest;
}
586
587 static inline u32 fnhe_hashfun(__be32 daddr)
588 {
589         u32 hval;
590
591         hval = (__force u32) daddr;
592         hval ^= (hval >> 11) ^ (hval >> 22);
593
594         return hval & (FNHE_HASH_SIZE - 1);
595 }
596
/* Record a next-hop exception (redirect gateway and/or learned PMTU)
 * for @daddr on @nh.  The hash table is allocated lazily; if a chain
 * grows past FNHE_RECLAIM_DEPTH the oldest entry is recycled instead
 * of allocating a new one.  All updates run under fnhe_lock.
 */
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = nh->nh_exceptions;
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		nh->nh_exceptions = hash;
	}

	hash += hval;

	/* Look for an existing exception for this destination. */
	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		/* Existing entry: overwrite only the fields supplied. */
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = expires;
		}
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
	return;
}
655
/* Validate an ICMP redirect from @old_gw to @new_gw and, if acceptable,
 * record the new gateway as a next-hop exception for the destination.
 * Bogus redirects are optionally logged and otherwise ignored.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* Only the gateway we are currently using may redirect us. */
	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Reject no-op redirects, redirects to multicast/broadcast/zero
	 * addresses, and any redirect when acceptance is disabled.
	 */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* The new gateway must be directly reachable on-link. */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			/* Gateway not resolved yet: kick off resolution
			 * and act on a subsequent redirect instead.
			 */
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, 0);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
735
736 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
737 {
738         struct rtable *rt;
739         struct flowi4 fl4;
740         const struct iphdr *iph = (const struct iphdr *) skb->data;
741         int oif = skb->dev->ifindex;
742         u8 tos = RT_TOS(iph->tos);
743         u8 prot = iph->protocol;
744         u32 mark = skb->mark;
745
746         rt = (struct rtable *) dst;
747
748         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
749         __ip_do_redirect(rt, skb, &fl4, true);
750 }
751
752 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
753 {
754         struct rtable *rt = (struct rtable *)dst;
755         struct dst_entry *ret = dst;
756
757         if (rt) {
758                 if (dst->obsolete > 0) {
759                         ip_rt_put(rt);
760                         ret = NULL;
761                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
762                            rt->dst.expires) {
763                         ip_rt_put(rt);
764                         ret = NULL;
765                 }
766         }
767         return ret;
768 }
769
770 /*
771  * Algorithm:
772  *      1. The first ip_rt_redirect_number redirects are sent
773  *         with exponential backoff, then we stop sending them at all,
774  *         assuming that the host ignores our redirects.
775  *      2. If we did not see packets requiring redirects
776  *         during ip_rt_redirect_silence, we assume that the host
777  *         forgot redirected route and start to send redirects again.
778  *
779  * This algorithm is much cheaper and more intelligent than dumb load limiting
780  * in icmp.c.
781  *
782  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
783  * and "frag. need" (breaks PMTU discovery) in icmp.c.
784  */
785
/* Send an ICMP redirect back to the sender of @skb, subject to the
 * exponential-backoff rate limiting described in the comment above.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		/* No peer state available: send unthrottled. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 *
	 * NOTE(review): peer->rate_tokens is also mutated by ip_error()'s
	 * token bucket, which can skew this backoff; later kernels use a
	 * dedicated redirect counter — confirm before relying on timing.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
848
849 static int ip_error(struct sk_buff *skb)
850 {
851         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
852         struct rtable *rt = skb_rtable(skb);
853         struct inet_peer *peer;
854         unsigned long now;
855         struct net *net;
856         bool send;
857         int code;
858
859         net = dev_net(rt->dst.dev);
860         if (!IN_DEV_FORWARD(in_dev)) {
861                 switch (rt->dst.error) {
862                 case EHOSTUNREACH:
863                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
864                         break;
865
866                 case ENETUNREACH:
867                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
868                         break;
869                 }
870                 goto out;
871         }
872
873         switch (rt->dst.error) {
874         case EINVAL:
875         default:
876                 goto out;
877         case EHOSTUNREACH:
878                 code = ICMP_HOST_UNREACH;
879                 break;
880         case ENETUNREACH:
881                 code = ICMP_NET_UNREACH;
882                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
883                 break;
884         case EACCES:
885                 code = ICMP_PKT_FILTERED;
886                 break;
887         }
888
889         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
890
891         send = true;
892         if (peer) {
893                 now = jiffies;
894                 peer->rate_tokens += now - peer->rate_last;
895                 if (peer->rate_tokens > ip_rt_error_burst)
896                         peer->rate_tokens = ip_rt_error_burst;
897                 peer->rate_last = now;
898                 if (peer->rate_tokens >= ip_rt_error_cost)
899                         peer->rate_tokens -= ip_rt_error_cost;
900                 else
901                         send = false;
902                 inet_putpeer(peer);
903         }
904         if (send)
905                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
906
907 out:    kfree_skb(skb);
908         return 0;
909 }
910
/* Apply a learned path MTU @mtu to route @rt (clamped up to
 * ip_rt_min_pmtu) and record it in a per-nexthop exception so later
 * lookups for the same destination pick it up.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	/* An administratively locked MTU metric must not be overridden. */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Ignore updates that exceed the device MTU. */
	if (dst->dev->mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (!rt->rt_pmtu) {
		/* No cached PMTU on this dst: invalidate it so the next
		 * lookup rebuilds from the exception created below.
		 */
		dst->obsolete = DST_OBSOLETE_KILL;
	} else {
		rt->rt_pmtu = mtu;
		/* max(1UL, ...) presumably avoids an expiry value of 0,
		 * which reads as "no expiry" elsewhere — TODO confirm.
		 */
		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
	}

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		/* Persist the PMTU in the nexthop exception table. */
		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
941
942 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
943                               struct sk_buff *skb, u32 mtu)
944 {
945         struct rtable *rt = (struct rtable *) dst;
946         struct flowi4 fl4;
947
948         ip_rt_build_flow_key(&fl4, sk, skb);
949         __ip_rt_update_pmtu(rt, &fl4, mtu);
950 }
951
952 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
953                       int oif, u32 mark, u8 protocol, int flow_flags)
954 {
955         const struct iphdr *iph = (const struct iphdr *) skb->data;
956         struct flowi4 fl4;
957         struct rtable *rt;
958
959         __build_flow_key(&fl4, NULL, iph, oif,
960                          RT_TOS(iph->tos), protocol, mark, flow_flags);
961         rt = __ip_route_output_key(net, &fl4);
962         if (!IS_ERR(rt)) {
963                 __ip_rt_update_pmtu(rt, &fl4, mtu);
964                 ip_rt_put(rt);
965         }
966 }
967 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
968
969 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
970 {
971         const struct iphdr *iph = (const struct iphdr *) skb->data;
972         struct flowi4 fl4;
973         struct rtable *rt;
974
975         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
976         rt = __ip_route_output_key(sock_net(sk), &fl4);
977         if (!IS_ERR(rt)) {
978                 __ip_rt_update_pmtu(rt, &fl4, mtu);
979                 ip_rt_put(rt);
980         }
981 }
982
/* PMTU update for a packet tied to a socket.  Prefers updating the
 * socket's cached route in place; if the socket is owned by user
 * context or has no cached route, falls back to a standalone lookup.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *dst;
	bool new = false;	/* true once we hold a freshly looked-up route */

	bh_lock_sock(sk);
	rt = (struct rtable *) __sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !rt) {
		/* Can't safely touch the socket's dst here. */
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	/* Cached dst is stale: look up a fresh route first. */
	if (!__sk_dst_check(sk, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	/* dst.path presumably reaches the bottom (plain IPv4) dst of any
	 * stacked (e.g. xfrm) chain — NOTE(review): confirm.
	 */
	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	/* The update may have invalidated the dst; re-check and re-route
	 * if so, releasing any route we ourselves obtained above.
	 */
	dst = dst_check(&rt->dst, 0);
	if (!dst) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		__sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1030
1031 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1032                    int oif, u32 mark, u8 protocol, int flow_flags)
1033 {
1034         const struct iphdr *iph = (const struct iphdr *) skb->data;
1035         struct flowi4 fl4;
1036         struct rtable *rt;
1037
1038         __build_flow_key(&fl4, NULL, iph, oif,
1039                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1040         rt = __ip_route_output_key(net, &fl4);
1041         if (!IS_ERR(rt)) {
1042                 __ip_do_redirect(rt, skb, &fl4, false);
1043                 ip_rt_put(rt);
1044         }
1045 }
1046 EXPORT_SYMBOL_GPL(ipv4_redirect);
1047
1048 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1049 {
1050         const struct iphdr *iph = (const struct iphdr *) skb->data;
1051         struct flowi4 fl4;
1052         struct rtable *rt;
1053
1054         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1055         rt = __ip_route_output_key(sock_net(sk), &fl4);
1056         if (!IS_ERR(rt)) {
1057                 __ip_do_redirect(rt, skb, &fl4, false);
1058                 ip_rt_put(rt);
1059         }
1060 }
1061 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1062
1063 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1064 {
1065         struct rtable *rt = (struct rtable *) dst;
1066
1067         /* All IPV4 dsts are created with ->obsolete set to the value
1068          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1069          * into this function always.
1070          *
1071          * When a PMTU/redirect information update invalidates a
1072          * route, this is indicated by setting obsolete to
1073          * DST_OBSOLETE_KILL.
1074          */
1075         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1076                 return NULL;
1077         return dst;
1078 }
1079
1080 static void ipv4_link_failure(struct sk_buff *skb)
1081 {
1082         struct rtable *rt;
1083
1084         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1085
1086         rt = skb_rtable(skb);
1087         if (rt)
1088                 dst_set_expires(&rt->dst, 0);
1089 }
1090
1091 static int ip_rt_bug(struct sk_buff *skb)
1092 {
1093         pr_debug("%s: %pI4 -> %pI4, %s\n",
1094                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1095                  skb->dev ? skb->dev->name : "?");
1096         kfree_skb(skb);
1097         WARN_ON(1);
1098         return 0;
1099 }
1100
1101 /*
1102    We do not cache source address of outgoing interface,
1103    because it is used only by IP RR, TS and SRR options,
1104    so that it out of fast path.
1105
1106    BTW remember: "addr" is allowed to be not aligned
1107    in IP options!
1108  */
1109
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	/* Output routes already carry the chosen source in the header. */
	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Rebuild the received packet's flow key and redo the
		 * FIB lookup to find the preferred source address.
		 */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			/* No matching route: pick any universe-scope
			 * address on the output device.
			 */
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	/* memcpy because addr may be unaligned inside the options area. */
	memcpy(addr, &src, 4);
}
1142
1143 #ifdef CONFIG_IP_ROUTE_CLASSID
1144 static void set_class_tag(struct rtable *rt, u32 tag)
1145 {
1146         if (!(rt->dst.tclassid & 0xFFFF))
1147                 rt->dst.tclassid |= tag & 0xFFFF;
1148         if (!(rt->dst.tclassid & 0xFFFF0000))
1149                 rt->dst.tclassid |= tag & 0xFFFF0000;
1150 }
1151 #endif
1152
1153 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1154 {
1155         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1156
1157         if (advmss == 0) {
1158                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1159                                ip_rt_min_advmss);
1160                 if (advmss > 65535 - 40)
1161                         advmss = 65535 - 40;
1162         }
1163         return advmss;
1164 }
1165
1166 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1167 {
1168         const struct rtable *rt = (const struct rtable *) dst;
1169         unsigned int mtu = rt->rt_pmtu;
1170
1171         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1172                 mtu = dst_metric_raw(dst, RTAX_MTU);
1173
1174         if (mtu)
1175                 return mtu;
1176
1177         mtu = dst->dev->mtu;
1178
1179         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1180                 if (rt->rt_uses_gateway && mtu > 576)
1181                         mtu = 576;
1182         }
1183
1184         if (mtu > IP_MAX_MTU)
1185                 mtu = IP_MAX_MTU;
1186
1187         return mtu;
1188 }
1189
1190 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1191 {
1192         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1193         struct fib_nh_exception *fnhe;
1194         u32 hval;
1195
1196         if (!hash)
1197                 return NULL;
1198
1199         hval = fnhe_hashfun(daddr);
1200
1201         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1202              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1203                 if (fnhe->fnhe_daddr == daddr)
1204                         return fnhe;
1205         }
1206         return NULL;
1207 }
1208
/* Bind route @rt to the nexthop exception @fnhe for destination
 * @daddr: apply any PMTU/redirect data recorded in the exception to
 * the route and publish @rt as the exception's cached route.  Returns
 * true if the exception matched @daddr and the route was bound.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
		/* A cached route from an old generation invalidates the
		 * data that was learned along with it.
		 */
		if (orig && rt_is_expired(orig)) {
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
		}
		/* Propagate a still-valid learned PMTU onto the route. */
		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			unsigned long diff = expires - jiffies;

			if (time_before(jiffies, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				dst_set_expires(&rt->dst, diff);
			}
		}
		/* A recorded redirect overrides the gateway. */
		if (fnhe->fnhe_gw) {
			rt->rt_flags |= RTCF_REDIRECTED;
			rt->rt_gateway = fnhe->fnhe_gw;
			rt->rt_uses_gateway = 1;
		} else if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		/* Publish rt as the cached route; free the previous one
		 * after the pointer swap.
		 */
		rcu_assign_pointer(fnhe->fnhe_rth, rt);
		if (orig)
			rt_free(orig);

		fnhe->fnhe_stamp = jiffies;
		ret = true;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1250
/* Try to store @rt in the nexthop's cache slot: nh_rth_input for input
 * routes, this CPU's nh_pcpu_rth_output slot otherwise.  cmpxchg makes
 * a concurrent update fail here instead of being overwritten.  Returns
 * true if the route was cached (the previous occupant is freed).
 */
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		/* We won the race: drop the route we displaced. */
		if (orig)
			rt_free(orig);
	} else
		ret = false;

	return ret;
}
1272
1273 static DEFINE_SPINLOCK(rt_uncached_lock);
1274 static LIST_HEAD(rt_uncached_list);
1275
/* Link @rt onto the global list of routes that are not held in any
 * nexthop cache, so rt_flush_dev() can still find and fix them up when
 * their device goes away.
 */
static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}
1282
1283 static void ipv4_dst_destroy(struct dst_entry *dst)
1284 {
1285         struct rtable *rt = (struct rtable *) dst;
1286
1287         if (!list_empty(&rt->rt_uncached)) {
1288                 spin_lock_bh(&rt_uncached_lock);
1289                 list_del(&rt->rt_uncached);
1290                 spin_unlock_bh(&rt_uncached_lock);
1291         }
1292 }
1293
1294 void rt_flush_dev(struct net_device *dev)
1295 {
1296         if (!list_empty(&rt_uncached_list)) {
1297                 struct net *net = dev_net(dev);
1298                 struct rtable *rt;
1299
1300                 spin_lock_bh(&rt_uncached_lock);
1301                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1302                         if (rt->dst.dev != dev)
1303                                 continue;
1304                         rt->dst.dev = net->loopback_dev;
1305                         dev_hold(rt->dst.dev);
1306                         dev_put(dev);
1307                 }
1308                 spin_unlock_bh(&rt_uncached_lock);
1309         }
1310 }
1311
1312 static bool rt_cache_valid(const struct rtable *rt)
1313 {
1314         return  rt &&
1315                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1316                 !rt_is_expired(rt);
1317 }
1318
/* Finish constructing route @rt from the FIB lookup result: set the
 * gateway, attach metrics/classid, and cache the route either in the
 * given nexthop exception or in the nexthop itself; routes that end up
 * uncached go on the global uncached list instead.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Only a directly reachable (link-scope) gateway is used. */
		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		/* Prefer caching in the exception entry when one exists. */
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1362
1363 static struct rtable *rt_dst_alloc(struct net_device *dev,
1364                                    bool nopolicy, bool noxfrm, bool will_cache)
1365 {
1366         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1367                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1368                          (nopolicy ? DST_NOPOLICY : 0) |
1369                          (noxfrm ? DST_NOXFRM : 0));
1370 }
1371
/* called in rcu_read_lock() section */
/* Build the input route for a multicast packet: sanity-check the
 * addresses, validate the source against the FIB, and attach a
 * multicast route (locally delivered and/or multicast-forwarded) to
 * the skb.  Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Source must not be multicast/broadcast; only IP frames. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	/* Loopback sources are martian unless route_localnet is on. */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 sources only for link-local multicast. */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast input routes must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_is_input= 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	/* Deliver locally when we are a member of the group. */
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups may also be multicast-forwarded. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
1443
1444
/* Account a packet whose source address failed validation ("martian
 * source") and, when verbose route logging is enabled, log it together
 * with a dump of its link-layer header.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
1469
/* called in rcu_read_lock() section */
/* Build (or reuse from the nexthop cache) the forwarding route for an
 * input packet matching FIB result @res, and attach it to the skb.
 * Returns 0 on success or a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* Only routes with a fib_info and no source classid tag are
	 * candidates for the nexthop cache.
	 */
	do_cache = res->fi && !itag;
	/* Packet leaves on the interface it arrived on: mark it so a
	 * redirect is sent back, and don't cache (the flag is per
	 * packet, not per route).
	 */
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
		flags |= RTCF_DOREDIRECT;
		do_cache = false;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	/* Reuse a still-valid cached input route when possible. */
	if (do_cache) {
		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1558
/* Pick one nexthop when the matched route has several (multipath),
 * then build the input route for it.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
1573
1574 /*
1575  *      NOTE. We drop all the packets that has local source
1576  *      addresses, because every properly looped back packet
1577  *      must have correct destination already attached by output routine.
1578  *
1579  *      Such approach solves two big problems:
1580  *      1. Not simplex devices are handled properly.
1581  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1582  *      called with rcu_read_lock()
1583  */
1584
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and call it once if daddr or/and saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	/* Locally destined: validate the source, then build a local
	 * delivery route.
	 */
	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  LOOPBACK_IFINDEX,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Forwarding case. */
	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	/* Zero sources (e.g. DHCP discover) skip source validation. */
	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Reuse a still-valid cached input route when possible; only
	 * untagged routes with a fib_info are cache candidates.
	 */
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	/* Local input routes must never be used for output. */
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	/* Unreachable destinations are delivered to ip_error() so the
	 * proper ICMP error is generated.
	 */
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	if (do_cache)
		rt_cache_route(&FIB_RES_NH(res), rth);
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
1761
1762 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1763                          u8 tos, struct net_device *dev)
1764 {
1765         int res;
1766
1767         rcu_read_lock();
1768
1769         /* Multicast recognition logic is moved from route cache to here.
1770            The problem was that too many Ethernet cards have broken/missing
1771            hardware multicast filters :-( As result the host on multicasting
1772            network acquires a lot of useless route cache entries, sort of
1773            SDR messages from all the world. Now we try to get rid of them.
1774            Really, provided software IP multicast filter is organized
1775            reasonably (at least, hashed), it does not result in a slowdown
1776            comparing with route cache reject entries.
1777            Note, that multicast routers are not affected, because
1778            route cache entry is created eventually.
1779          */
1780         if (ipv4_is_multicast(daddr)) {
1781                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1782
1783                 if (in_dev) {
1784                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1785                                                   ip_hdr(skb)->protocol);
1786                         if (our
1787 #ifdef CONFIG_IP_MROUTE
1788                                 ||
1789                             (!ipv4_is_local_multicast(daddr) &&
1790                              IN_DEV_MFORWARD(in_dev))
1791 #endif
1792                            ) {
1793                                 int res = ip_route_input_mc(skb, daddr, saddr,
1794                                                             tos, dev, our);
1795                                 rcu_read_unlock();
1796                                 return res;
1797                         }
1798                 }
1799                 rcu_read_unlock();
1800                 return -EINVAL;
1801         }
1802         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1803         rcu_read_unlock();
1804         return res;
1805 }
1806 EXPORT_SYMBOL(ip_route_input_noref);
1807
/* called with rcu_read_lock() */
/*
 * Build (or fetch from the per-nexthop cache) the rtable for an output
 * route lookup result.  Classifies the destination (broadcast/multicast/
 * local), decides whether the result may be cached on the FIB nexthop,
 * and wires up the dst input/output handlers.  Returns the rtable or an
 * ERR_PTR on failure.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* Loopback source addresses may only leave via loopback devices,
	 * unless route_localnet is enabled on the output device. */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* A PMTU/redirect exception gets its own cached rtable;
		 * otherwise use the per-cpu nexthop output cache. */
		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth;
		else {
			/* FLOWI_FLAG_KNOWN_NH without an on-link gateway:
			 * the nexthop is caller-supplied, so a shared
			 * cached entry would be wrong — build a fresh,
			 * uncached rtable instead. */
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		/* Locally-delivered copies of bcast/mcast sent on a real
		 * device must be looped back via ip_mc_output(). */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}
1929
1930 /*
1931  * Major route resolver routine.
1932  */
1933
/*
 * Resolve an output route for the flow described by @fl4.  May rewrite
 * fl4->saddr, fl4->daddr, fl4->flowi4_oif etc. as the lookup narrows the
 * choices.  Returns an rtable (reference held) or an ERR_PTR.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	int orig_oif;

	res.tclassid	= 0;
	res.fi		= NULL;
	res.table	= NULL;

	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		/* Source must be a valid unicast address. */
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		/* Link-local multicast / limited broadcast: route directly
		 * out of the requested device, no FIB lookup needed. */
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	/* No destination at all: loop the packet back to ourselves. */
	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	/* Several default routes and no more-specific match: pick one. */
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);

out:
	rcu_read_unlock();
	return rth;
}
2113 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2114
/*
 * ->check() for blackhole dsts: always report the entry as invalid
 * (NULL) so callers never keep reusing a cached blackhole route.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2119
2120 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2121 {
2122         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2123
2124         return mtu ? : dst->dev->mtu;
2125 }
2126
/* ->update_pmtu() for blackhole dsts: PMTU updates are deliberately ignored. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}
2131
/* ->redirect() for blackhole dsts: ICMP redirects are deliberately ignored. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2136
/*
 * ->cow_metrics() for blackhole dsts: never hand out writable metrics;
 * returning NULL refuses the copy-on-write request.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2142
/*
 * dst_ops for "blackhole" IPv4 routes built by ipv4_blackhole_route()
 * below: PMTU updates, redirects and metric writes are all no-ops, and
 * ->check always invalidates the entry.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2154
/*
 * Clone @dst_orig into a "blackhole" route: an rtable that carries the
 * original's routing metadata but silently discards every packet sent
 * through it (input/output are dst_discard).  Consumes the caller's
 * reference on @dst_orig.  Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Both directions drop packets. */
		new->input = dst_discard;
		new->output = dst_discard;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the original route's identity so userspace lookups
		 * still see sensible attributes. */
		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);

		/* NOTE(review): dst_free() here appears to arrange deferred
		 * freeing while the reference taken by dst_alloc() keeps the
		 * entry usable by the caller — confirm against dst core. */
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2191
2192 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2193                                     struct sock *sk)
2194 {
2195         struct rtable *rt = __ip_route_output_key(net, flp4);
2196
2197         if (IS_ERR(rt))
2198                 return rt;
2199
2200         if (flp4->flowi4_proto)
2201                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2202                                                    flowi4_to_flowi(flp4),
2203                                                    sk, 0);
2204
2205         return rt;
2206 }
2207 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2208
/*
 * Fill @skb with an RTM_NEWROUTE netlink message describing the route
 * attached to @skb (skb_rtable).  Returns the result of nlmsg_end() on
 * success, 0 when an asynchronous ipmr lookup was queued, or -EMSGSIZE
 * when the message does not fit.
 */
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	/* Report the preferred source only for output routes where it
	 * differs from the packet source. */
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	/* Convert absolute expiry (jiffies) to a remaining delta. */
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	/* A learned PMTU overrides the MTU metric while it is valid. */
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		/* Forwarded multicast: ask the mroute code for the IIF;
		 * it may defer (err == 0 with !nowait means "queued"). */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2318
/*
 * RTM_GETROUTE handler: perform a one-shot route lookup on behalf of
 * userspace.  With RTA_IIF the request is treated as an input-route
 * query using a dummy skb; otherwise an ordinary output lookup is done.
 * The result is encoded via rt_fill_info() and unicast back.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		/* Input-path lookups expect softirq context. */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* A successful lookup can still attach an error dst
		 * (e.g. unreachable); surface that as the result. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2415
/*
 * Route-cache dump handler: there are no cached entries to report, so
 * return skb->len, which tells the netlink dump machinery the dump is
 * complete.
 */
int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
	return skb->len;
}
2420
/*
 * Multicast configuration on @in_dev changed: flush cached routes in
 * the device's network namespace so stale decisions are dropped.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
2425
2426 #ifdef CONFIG_SYSCTL
/* Tunables exported through the sysctl table below
 * (/proc/sys/net/ipv4/route/...). */
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
2431
2432 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2433                                         void __user *buffer,
2434                                         size_t *lenp, loff_t *ppos)
2435 {
2436         if (write) {
2437                 rt_cache_flush((struct net *)__ctl->extra1);
2438                 return 0;
2439         }
2440
2441         return -EINVAL;
2442 }
2443
/*
 * Global (non per-net) route sysctls under /proc/sys/net/ipv4/route/.
 * Interval/timeout entries use the jiffies proc handlers; plain integer
 * tunables use proc_dointvec.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
2554
/*
 * Per-netns "flush" sysctl: writing to /proc/sys/net/ipv4/route/flush
 * invalidates the routing cache.  Write-only (mode 0200); the handler
 * locates the owning netns via ->extra1, which is filled in at
 * registration time by sysctl_route_net_init().
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,		/* write-only */
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },	/* sentinel terminating the table */
};
2564
2565 static __net_init int sysctl_route_net_init(struct net *net)
2566 {
2567         struct ctl_table *tbl;
2568
2569         tbl = ipv4_route_flush_table;
2570         if (!net_eq(net, &init_net)) {
2571                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2572                 if (tbl == NULL)
2573                         goto err_dup;
2574
2575                 /* Don't export sysctls to unprivileged users */
2576                 if (net->user_ns != &init_user_ns)
2577                         tbl[0].procname = NULL;
2578         }
2579         tbl[0].extra1 = net;
2580
2581         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2582         if (net->ipv4.route_hdr == NULL)
2583                 goto err_reg;
2584         return 0;
2585
2586 err_reg:
2587         if (tbl != ipv4_route_flush_table)
2588                 kfree(tbl);
2589 err_dup:
2590         return -ENOMEM;
2591 }
2592
2593 static __net_exit void sysctl_route_net_exit(struct net *net)
2594 {
2595         struct ctl_table *tbl;
2596
2597         tbl = net->ipv4.route_hdr->ctl_table_arg;
2598         unregister_net_sysctl_table(net->ipv4.route_hdr);
2599         BUG_ON(tbl == ipv4_route_flush_table);
2600         kfree(tbl);
2601 }
2602
/* Hook route-sysctl setup/teardown into the netns lifecycle. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
2607 #endif
2608
2609 static __net_init int rt_genid_init(struct net *net)
2610 {
2611         atomic_set(&net->rt_genid, 0);
2612         get_random_bytes(&net->ipv4.dev_addr_genid,
2613                          sizeof(net->ipv4.dev_addr_genid));
2614         return 0;
2615 }
2616
/* Genid initialization per namespace; no teardown needed. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
2620
2621 static int __net_init ipv4_inetpeer_init(struct net *net)
2622 {
2623         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2624
2625         if (!bp)
2626                 return -ENOMEM;
2627         inet_peer_base_init(bp);
2628         net->ipv4.peers = bp;
2629         return 0;
2630 }
2631
2632 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2633 {
2634         struct inet_peer_base *bp = net->ipv4.peers;
2635
2636         net->ipv4.peers = NULL;
2637         inetpeer_invalidate_tree(bp);
2638         kfree(bp);
2639 }
2640
/* Per-netns inetpeer base allocation and teardown. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};
2645
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route classid accounting; allocated in ip_rt_init() (256 entries per cpu). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2649
/*
 * Boot-time initialization of the IPv4 routing subsystem: dst slab
 * cache, dst-entry counters, classid accounting, device/FIB init,
 * /proc files, XFRM hooks, the RTM_GETROUTE netlink handler and the
 * per-netns sysctl/genid/inetpeer subsystems.  Call order matters:
 * devinet/fib must be up before proc and netlink are exposed.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* 256 accounting slots per cpu; boot cannot continue without them. */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the regular rtable slab. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Push gc threshold and size limit out of reach. */
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	/* Missing /proc files are non-fatal; just report it. */
	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
2693
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	/* Register the static (init_net) route sysctl table early in boot. */
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif