1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD;
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111
112 #define RT_FL_TOS(oldflp4) \
113         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_redirect_number __read_mostly  = 9;
121 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
122 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
123 static int ip_rt_error_cost __read_mostly       = HZ;
124 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
125 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
126 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
127 static int ip_rt_min_advmss __read_mostly       = 256;
128
129 /*
130  *      Interface to generic destination cache.
131  */
132
133 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
134 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
135 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
136 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
137 static void              ipv4_link_failure(struct sk_buff *skb);
138 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
139                                            struct sk_buff *skb, u32 mtu);
140 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
141                                         struct sk_buff *skb);
142 static void             ipv4_dst_destroy(struct dst_entry *dst);
143
144 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
145                             int how)
146 {
147 }
148
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151         WARN_ON(1);
152         return NULL;
153 }
154
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156                                            struct sk_buff *skb,
157                                            const void *daddr);
158
159 static struct dst_ops ipv4_dst_ops = {
160         .family =               AF_INET,
161         .protocol =             cpu_to_be16(ETH_P_IP),
162         .check =                ipv4_dst_check,
163         .default_advmss =       ipv4_default_advmss,
164         .mtu =                  ipv4_mtu,
165         .cow_metrics =          ipv4_cow_metrics,
166         .destroy =              ipv4_dst_destroy,
167         .ifdown =               ipv4_dst_ifdown,
168         .negative_advice =      ipv4_negative_advice,
169         .link_failure =         ipv4_link_failure,
170         .update_pmtu =          ip_rt_update_pmtu,
171         .redirect =             ip_do_redirect,
172         .local_out =            __ip_local_out,
173         .neigh_lookup =         ipv4_neigh_lookup,
174 };
175
176 #define ECN_OR_COST(class)      TC_PRIO_##class
177
178 const __u8 ip_tos2prio[16] = {
179         TC_PRIO_BESTEFFORT,
180         ECN_OR_COST(BESTEFFORT),
181         TC_PRIO_BESTEFFORT,
182         ECN_OR_COST(BESTEFFORT),
183         TC_PRIO_BULK,
184         ECN_OR_COST(BULK),
185         TC_PRIO_BULK,
186         ECN_OR_COST(BULK),
187         TC_PRIO_INTERACTIVE,
188         ECN_OR_COST(INTERACTIVE),
189         TC_PRIO_INTERACTIVE,
190         ECN_OR_COST(INTERACTIVE),
191         TC_PRIO_INTERACTIVE_BULK,
192         ECN_OR_COST(INTERACTIVE_BULK),
193         TC_PRIO_INTERACTIVE_BULK,
194         ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
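/*
 * A quick worked example of how this table is indexed (callers normally
 * go through the rt_tos2priority() helper in include/net/route.h, which
 * computes ip_tos2prio[IPTOS_TOS(tos) >> 1]):
 *
 *      tos = 0x10 (IPTOS_LOWDELAY)
 *      IPTOS_TOS(tos) = tos & 0x1e = 0x10
 *      index = 0x10 >> 1 = 8  ->  TC_PRIO_INTERACTIVE
 *
 * The odd entries (ECN_OR_COST) cover TOS values whose low masked bit is
 * set, i.e. the old "minimize monetary cost" bit.
 */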
197
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
200
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204         if (*pos)
205                 return NULL;
206         return SEQ_START_TOKEN;
207 }
208
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211         ++*pos;
212         return NULL;
213 }
214
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221         if (v == SEQ_START_TOKEN)
222                 seq_printf(seq, "%-127s\n",
223                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225                            "HHUptod\tSpecDst");
226         return 0;
227 }
228
229 static const struct seq_operations rt_cache_seq_ops = {
230         .start  = rt_cache_seq_start,
231         .next   = rt_cache_seq_next,
232         .stop   = rt_cache_seq_stop,
233         .show   = rt_cache_seq_show,
234 };
235
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238         return seq_open(file, &rt_cache_seq_ops);
239 }
240
241 static const struct file_operations rt_cache_seq_fops = {
242         .owner   = THIS_MODULE,
243         .open    = rt_cache_seq_open,
244         .read    = seq_read,
245         .llseek  = seq_lseek,
246         .release = seq_release,
247 };
248
249
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252         int cpu;
253
254         if (*pos == 0)
255                 return SEQ_START_TOKEN;
256
257         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258                 if (!cpu_possible(cpu))
259                         continue;
260                 *pos = cpu+1;
261                 return &per_cpu(rt_cache_stat, cpu);
262         }
263         return NULL;
264 }
265
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268         int cpu;
269
270         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271                 if (!cpu_possible(cpu))
272                         continue;
273                 *pos = cpu+1;
274                 return &per_cpu(rt_cache_stat, cpu);
275         }
276         return NULL;
277
278 }
279
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282
283 }
284
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287         struct rt_cache_stat *st = v;
288
289         if (v == SEQ_START_TOKEN) {
290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291                 return 0;
292         }
293
294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296                    dst_entries_get_slow(&ipv4_dst_ops),
297                    st->in_hit,
298                    st->in_slow_tot,
299                    st->in_slow_mc,
300                    st->in_no_route,
301                    st->in_brd,
302                    st->in_martian_dst,
303                    st->in_martian_src,
304
305                    st->out_hit,
306                    st->out_slow_tot,
307                    st->out_slow_mc,
308
309                    st->gc_total,
310                    st->gc_ignored,
311                    st->gc_goal_miss,
312                    st->gc_dst_overflow,
313                    st->in_hlist_search,
314                    st->out_hlist_search
315                 );
316         return 0;
317 }
318
319 static const struct seq_operations rt_cpu_seq_ops = {
320         .start  = rt_cpu_seq_start,
321         .next   = rt_cpu_seq_next,
322         .stop   = rt_cpu_seq_stop,
323         .show   = rt_cpu_seq_show,
324 };
325
326
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329         return seq_open(file, &rt_cpu_seq_ops);
330 }
331
332 static const struct file_operations rt_cpu_seq_fops = {
333         .owner   = THIS_MODULE,
334         .open    = rt_cpu_seq_open,
335         .read    = seq_read,
336         .llseek  = seq_lseek,
337         .release = seq_release,
338 };
339
340 #ifdef CONFIG_IP_ROUTE_CLASSID
341 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 {
343         struct ip_rt_acct *dst, *src;
344         unsigned int i, j;
345
346         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347         if (!dst)
348                 return -ENOMEM;
349
350         for_each_possible_cpu(i) {
351                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352                 for (j = 0; j < 256; j++) {
353                         dst[j].o_bytes   += src[j].o_bytes;
354                         dst[j].o_packets += src[j].o_packets;
355                         dst[j].i_bytes   += src[j].i_bytes;
356                         dst[j].i_packets += src[j].i_packets;
357                 }
358         }
359
360         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361         kfree(dst);
362         return 0;
363 }
364
365 static int rt_acct_proc_open(struct inode *inode, struct file *file)
366 {
367         return single_open(file, rt_acct_proc_show, NULL);
368 }
369
370 static const struct file_operations rt_acct_proc_fops = {
371         .owner          = THIS_MODULE,
372         .open           = rt_acct_proc_open,
373         .read           = seq_read,
374         .llseek         = seq_lseek,
375         .release        = single_release,
376 };
377 #endif
378
379 static int __net_init ip_rt_do_proc_init(struct net *net)
380 {
381         struct proc_dir_entry *pde;
382
383         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
384                           &rt_cache_seq_fops);
385         if (!pde)
386                 goto err1;
387
388         pde = proc_create("rt_cache", S_IRUGO,
389                           net->proc_net_stat, &rt_cpu_seq_fops);
390         if (!pde)
391                 goto err2;
392
393 #ifdef CONFIG_IP_ROUTE_CLASSID
394         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
395         if (!pde)
396                 goto err3;
397 #endif
398         return 0;
399
400 #ifdef CONFIG_IP_ROUTE_CLASSID
401 err3:
402         remove_proc_entry("rt_cache", net->proc_net_stat);
403 #endif
404 err2:
405         remove_proc_entry("rt_cache", net->proc_net);
406 err1:
407         return -ENOMEM;
408 }
409
410 static void __net_exit ip_rt_do_proc_exit(struct net *net)
411 {
412         remove_proc_entry("rt_cache", net->proc_net_stat);
413         remove_proc_entry("rt_cache", net->proc_net);
414 #ifdef CONFIG_IP_ROUTE_CLASSID
415         remove_proc_entry("rt_acct", net->proc_net);
416 #endif
417 }
418
419 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
420         .init = ip_rt_do_proc_init,
421         .exit = ip_rt_do_proc_exit,
422 };
423
424 static int __init ip_rt_proc_init(void)
425 {
426         return register_pernet_subsys(&ip_rt_proc_ops);
427 }
428
429 #else
430 static inline int ip_rt_proc_init(void)
431 {
432         return 0;
433 }
434 #endif /* CONFIG_PROC_FS */
435
436 static inline bool rt_is_expired(const struct rtable *rth)
437 {
438         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
439 }
440
441 void rt_cache_flush(struct net *net)
442 {
443         rt_genid_bump(net);
444 }
445
446 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
447                                            struct sk_buff *skb,
448                                            const void *daddr)
449 {
450         struct net_device *dev = dst->dev;
451         const __be32 *pkey = daddr;
452         const struct rtable *rt;
453         struct neighbour *n;
454
455         rt = (const struct rtable *) dst;
456         if (rt->rt_gateway)
457                 pkey = (const __be32 *) &rt->rt_gateway;
458         else if (skb)
459                 pkey = &ip_hdr(skb)->daddr;
460
461         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
462         if (n)
463                 return n;
464         return neigh_create(&arp_tbl, pkey, dev);
465 }
466
467 /*
468  * Peer allocation may fail only in serious out-of-memory conditions.  However,
469  * we can still generate some output.
470  * Random ID selection looks a bit dangerous because we have no chance of
471  * selecting an ID that stays unique for a reasonable period of time.
472  * But a broken packet identifier may be better than no packet at all.
473  */
474 static void ip_select_fb_ident(struct iphdr *iph)
475 {
476         static DEFINE_SPINLOCK(ip_fb_id_lock);
477         static u32 ip_fallback_id;
478         u32 salt;
479
480         spin_lock_bh(&ip_fb_id_lock);
481         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
482         iph->id = htons(salt & 0xFFFF);
483         ip_fallback_id = salt;
484         spin_unlock_bh(&ip_fb_id_lock);
485 }
486
487 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
488 {
489         struct net *net = dev_net(dst->dev);
490         struct inet_peer *peer;
491
492         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
493         if (peer) {
494                 iph->id = htons(inet_getid(peer, more));
495                 inet_putpeer(peer);
496                 return;
497         }
498
499         ip_select_fb_ident(iph);
500 }
501 EXPORT_SYMBOL(__ip_select_ident);
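/*
 * Rough usage sketch: transmit paths normally reach this through the
 * ip_select_ident() helpers in include/net/ip.h rather than calling it
 * directly.  With a usable inetpeer, the ID is a simple per-destination
 * counter (inet_getid() advances the peer's ip_id_count, by more than one
 * when "more" fragments or segments follow); only when peer lookup fails
 * do we fall back to the hashed global counter above, trading ID
 * uniqueness for the ability to keep sending packets.
 */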
502
503 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
504                              const struct iphdr *iph,
505                              int oif, u8 tos,
506                              u8 prot, u32 mark, int flow_flags)
507 {
508         if (sk) {
509                 const struct inet_sock *inet = inet_sk(sk);
510
511                 oif = sk->sk_bound_dev_if;
512                 mark = sk->sk_mark;
513                 tos = RT_CONN_FLAGS(sk);
514                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
515         }
516         flowi4_init_output(fl4, oif, mark, tos,
517                            RT_SCOPE_UNIVERSE, prot,
518                            flow_flags,
519                            iph->daddr, iph->saddr, 0, 0);
520 }
521
522 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
523                                const struct sock *sk)
524 {
525         const struct iphdr *iph = ip_hdr(skb);
526         int oif = skb->dev->ifindex;
527         u8 tos = RT_TOS(iph->tos);
528         u8 prot = iph->protocol;
529         u32 mark = skb->mark;
530
531         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
532 }
533
534 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
535 {
536         const struct inet_sock *inet = inet_sk(sk);
537         const struct ip_options_rcu *inet_opt;
538         __be32 daddr = inet->inet_daddr;
539
540         rcu_read_lock();
541         inet_opt = rcu_dereference(inet->inet_opt);
542         if (inet_opt && inet_opt->opt.srr)
543                 daddr = inet_opt->opt.faddr;
544         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
545                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
546                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
547                            inet_sk_flowi_flags(sk),
548                            daddr, inet->inet_saddr, 0, 0);
549         rcu_read_unlock();
550 }
551
552 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
553                                  const struct sk_buff *skb)
554 {
555         if (skb)
556                 build_skb_flow_key(fl4, skb, sk);
557         else
558                 build_sk_flow_key(fl4, sk);
559 }
560
561 static inline void rt_free(struct rtable *rt)
562 {
563         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
564 }
565
566 static DEFINE_SPINLOCK(fnhe_lock);
567
568 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
569 {
570         struct fib_nh_exception *fnhe, *oldest;
571         struct rtable *orig;
572
573         oldest = rcu_dereference(hash->chain);
574         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
575              fnhe = rcu_dereference(fnhe->fnhe_next)) {
576                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
577                         oldest = fnhe;
578         }
579         orig = rcu_dereference(oldest->fnhe_rth);
580         if (orig) {
581                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
582                 rt_free(orig);
583         }
584         return oldest;
585 }
586
587 static inline u32 fnhe_hashfun(__be32 daddr)
588 {
589         u32 hval;
590
591         hval = (__force u32) daddr;
592         hval ^= (hval >> 11) ^ (hval >> 22);
593
594         return hval & (FNHE_HASH_SIZE - 1);
595 }
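/*
 * Example fold, assuming FNHE_HASH_SIZE == 2048 (so the mask is 0x7ff):
 *
 *      daddr = 192.0.2.1 -> hval = 0xc0000201 (byte order ignored here;
 *      the __force cast just reuses the address bits as a seed)
 *      hval ^= (hval >> 11) ^ (hval >> 22)
 *            = 0xc0000201 ^ 0x00180000 ^ 0x00000300 = 0xc0180501
 *      bucket = 0xc0180501 & 0x7ff = 0x501
 *
 * Mixing the high bits down keeps addresses that differ only in their
 * network part from all landing in the same bucket.
 */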
596
597 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
598                                   u32 pmtu, unsigned long expires)
599 {
600         struct fnhe_hash_bucket *hash;
601         struct fib_nh_exception *fnhe;
602         int depth;
603         u32 hval = fnhe_hashfun(daddr);
604
605         spin_lock_bh(&fnhe_lock);
606
607         hash = nh->nh_exceptions;
608         if (!hash) {
609                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
610                 if (!hash)
611                         goto out_unlock;
612                 nh->nh_exceptions = hash;
613         }
614
615         hash += hval;
616
617         depth = 0;
618         for (fnhe = rcu_dereference(hash->chain); fnhe;
619              fnhe = rcu_dereference(fnhe->fnhe_next)) {
620                 if (fnhe->fnhe_daddr == daddr)
621                         break;
622                 depth++;
623         }
624
625         if (fnhe) {
626                 if (gw)
627                         fnhe->fnhe_gw = gw;
628                 if (pmtu) {
629                         fnhe->fnhe_pmtu = pmtu;
630                         fnhe->fnhe_expires = expires;
631                 }
632         } else {
633                 if (depth > FNHE_RECLAIM_DEPTH)
634                         fnhe = fnhe_oldest(hash);
635                 else {
636                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
637                         if (!fnhe)
638                                 goto out_unlock;
639
640                         fnhe->fnhe_next = hash->chain;
641                         rcu_assign_pointer(hash->chain, fnhe);
642                 }
643                 fnhe->fnhe_daddr = daddr;
644                 fnhe->fnhe_gw = gw;
645                 fnhe->fnhe_pmtu = pmtu;
646                 fnhe->fnhe_expires = expires;
647         }
648
649         fnhe->fnhe_stamp = jiffies;
650
651 out_unlock:
652         spin_unlock_bh(&fnhe_lock);
653         return;
654 }
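/*
 * Sizing note, assuming FNHE_RECLAIM_DEPTH of 5 as in kernels of this
 * vintage: once a bucket's chain grows past that depth, fnhe_oldest()
 * recycles the stalest entry instead of allocating a new one, so a flood
 * of redirects or PMTU updates for many destinations reuses exceptions
 * rather than growing memory without bound.
 */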
655
656 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
657                              bool kill_route)
658 {
659         __be32 new_gw = icmp_hdr(skb)->un.gateway;
660         __be32 old_gw = ip_hdr(skb)->saddr;
661         struct net_device *dev = skb->dev;
662         struct in_device *in_dev;
663         struct fib_result res;
664         struct neighbour *n;
665         struct net *net;
666
667         switch (icmp_hdr(skb)->code & 7) {
668         case ICMP_REDIR_NET:
669         case ICMP_REDIR_NETTOS:
670         case ICMP_REDIR_HOST:
671         case ICMP_REDIR_HOSTTOS:
672                 break;
673
674         default:
675                 return;
676         }
677
678         if (rt->rt_gateway != old_gw)
679                 return;
680
681         in_dev = __in_dev_get_rcu(dev);
682         if (!in_dev)
683                 return;
684
685         net = dev_net(dev);
686         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
687             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
688             ipv4_is_zeronet(new_gw))
689                 goto reject_redirect;
690
691         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
692                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
693                         goto reject_redirect;
694                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
695                         goto reject_redirect;
696         } else {
697                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
698                         goto reject_redirect;
699         }
700
701         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
702         if (n) {
703                 if (!(n->nud_state & NUD_VALID)) {
704                         neigh_event_send(n, NULL);
705                 } else {
706                         if (fib_lookup(net, fl4, &res) == 0) {
707                                 struct fib_nh *nh = &FIB_RES_NH(res);
708
709                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
710                                                       0, 0);
711                         }
712                         if (kill_route)
713                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
714                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
715                 }
716                 neigh_release(n);
717         }
718         return;
719
720 reject_redirect:
721 #ifdef CONFIG_IP_ROUTE_VERBOSE
722         if (IN_DEV_LOG_MARTIANS(in_dev)) {
723                 const struct iphdr *iph = (const struct iphdr *) skb->data;
724                 __be32 daddr = iph->daddr;
725                 __be32 saddr = iph->saddr;
726
727                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
728                                      "  Advised path = %pI4 -> %pI4\n",
729                                      &old_gw, dev->name, &new_gw,
730                                      &saddr, &daddr);
731         }
732 #endif
733         ;
734 }
735
736 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
737 {
738         struct rtable *rt;
739         struct flowi4 fl4;
740         const struct iphdr *iph = (const struct iphdr *) skb->data;
741         int oif = skb->dev->ifindex;
742         u8 tos = RT_TOS(iph->tos);
743         u8 prot = iph->protocol;
744         u32 mark = skb->mark;
745
746         rt = (struct rtable *) dst;
747
748         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
749         __ip_do_redirect(rt, skb, &fl4, true);
750 }
751
752 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
753 {
754         struct rtable *rt = (struct rtable *)dst;
755         struct dst_entry *ret = dst;
756
757         if (rt) {
758                 if (dst->obsolete > 0) {
759                         ip_rt_put(rt);
760                         ret = NULL;
761                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
762                            rt->dst.expires) {
763                         ip_rt_put(rt);
764                         ret = NULL;
765                 }
766         }
767         return ret;
768 }
769
770 /*
771  * Algorithm:
772  *      1. The first ip_rt_redirect_number redirects are sent
773  *         with exponential backoff, then we stop sending them at all,
774  *         assuming that the host ignores our redirects.
775  *      2. If we did not see packets requiring redirects
776  *         during ip_rt_redirect_silence, we assume that the host
777  *         forgot the redirected route, and we start sending redirects again.
778  *
779  * This algorithm is much cheaper and more intelligent than dumb load limiting
780  * in icmp.c.
781  *
782  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
783  * and "frag. need" (breaks PMTU discovery) in icmp.c.
784  */
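/*
 * Worked example, assuming HZ == 1000 (all of these constants scale with
 * HZ): ip_rt_redirect_load is HZ/50 = 20 jiffies.  The first redirect is
 * sent immediately (rate_tokens == 0); after k redirects the next one
 * waits at least 20 << k jiffies, i.e. 40ms, 80ms, ..., ~5.1s before the
 * 9th.  Once ip_rt_redirect_number (9) redirects have been ignored we
 * stop entirely, and only a quiet period of ip_rt_redirect_silence =
 * (HZ/50) << 10 = 20480 jiffies (~20.5s) resets rate_tokens and re-arms
 * the backoff.
 */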
785
786 void ip_rt_send_redirect(struct sk_buff *skb)
787 {
788         struct rtable *rt = skb_rtable(skb);
789         struct in_device *in_dev;
790         struct inet_peer *peer;
791         struct net *net;
792         int log_martians;
793
794         rcu_read_lock();
795         in_dev = __in_dev_get_rcu(rt->dst.dev);
796         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
797                 rcu_read_unlock();
798                 return;
799         }
800         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
801         rcu_read_unlock();
802
803         net = dev_net(rt->dst.dev);
804         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
805         if (!peer) {
806                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
807                           rt_nexthop(rt, ip_hdr(skb)->daddr));
808                 return;
809         }
810
811         /* No redirected packets during ip_rt_redirect_silence;
812          * reset the algorithm.
813          */
814         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
815                 peer->rate_tokens = 0;
816
817         /* Too many ignored redirects; do not send anything.
818          * Set peer->rate_last to the time of the last redirect-worthy packet.
819          */
820         if (peer->rate_tokens >= ip_rt_redirect_number) {
821                 peer->rate_last = jiffies;
822                 goto out_put_peer;
823         }
824
825         /* Check for load limit; set rate_last to the latest sent
826          * redirect.
827          */
828         if (peer->rate_tokens == 0 ||
829             time_after(jiffies,
830                        (peer->rate_last +
831                         (ip_rt_redirect_load << peer->rate_tokens)))) {
832                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
833
834                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
835                 peer->rate_last = jiffies;
836                 ++peer->rate_tokens;
837 #ifdef CONFIG_IP_ROUTE_VERBOSE
838                 if (log_martians &&
839                     peer->rate_tokens == ip_rt_redirect_number)
840                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
841                                              &ip_hdr(skb)->saddr, inet_iif(skb),
842                                              &ip_hdr(skb)->daddr, &gw);
843 #endif
844         }
845 out_put_peer:
846         inet_putpeer(peer);
847 }
848
849 static int ip_error(struct sk_buff *skb)
850 {
851         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
852         struct rtable *rt = skb_rtable(skb);
853         struct inet_peer *peer;
854         unsigned long now;
855         struct net *net;
856         bool send;
857         int code;
858
859         net = dev_net(rt->dst.dev);
860         if (!IN_DEV_FORWARD(in_dev)) {
861                 switch (rt->dst.error) {
862                 case EHOSTUNREACH:
863                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
864                         break;
865
866                 case ENETUNREACH:
867                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
868                         break;
869                 }
870                 goto out;
871         }
872
873         switch (rt->dst.error) {
874         case EINVAL:
875         default:
876                 goto out;
877         case EHOSTUNREACH:
878                 code = ICMP_HOST_UNREACH;
879                 break;
880         case ENETUNREACH:
881                 code = ICMP_NET_UNREACH;
882                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
883                 break;
884         case EACCES:
885                 code = ICMP_PKT_FILTERED;
886                 break;
887         }
888
889         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
890
891         send = true;
892         if (peer) {
893                 now = jiffies;
894                 peer->rate_tokens += now - peer->rate_last;
895                 if (peer->rate_tokens > ip_rt_error_burst)
896                         peer->rate_tokens = ip_rt_error_burst;
897                 peer->rate_last = now;
898                 if (peer->rate_tokens >= ip_rt_error_cost)
899                         peer->rate_tokens -= ip_rt_error_cost;
900                 else
901                         send = false;
902                 inet_putpeer(peer);
903         }
904         if (send)
905                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
906
907 out:    kfree_skb(skb);
908         return 0;
909 }
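/*
 * The rate limit above is a classic token bucket: tokens accrue at one
 * per jiffy since rate_last, capped at ip_rt_error_burst (5 * HZ), and
 * each ICMP error spends ip_rt_error_cost (HZ) tokens.  For example,
 * with HZ == 100 the bucket holds at most 500 tokens and an ICMP costs
 * 100, so a full bucket allows a burst of five errors, after which the
 * sustained rate is at most one error per second per source address.
 */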
910
911 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
912 {
913         struct dst_entry *dst = &rt->dst;
914         struct fib_result res;
915
916         if (dst_metric_locked(dst, RTAX_MTU))
917                 return;
918
919         if (dst->dev->mtu < mtu)
920                 return;
921
922         if (mtu < ip_rt_min_pmtu)
923                 mtu = ip_rt_min_pmtu;
924
925         if (!rt->rt_pmtu) {
926                 dst->obsolete = DST_OBSOLETE_KILL;
927         } else {
928                 rt->rt_pmtu = mtu;
929                 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
930         }
931
932         rcu_read_lock();
933         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
934                 struct fib_nh *nh = &FIB_RES_NH(res);
935
936                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
937                                       jiffies + ip_rt_mtu_expires);
938         }
939         rcu_read_unlock();
940 }
941
942 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
943                               struct sk_buff *skb, u32 mtu)
944 {
945         struct rtable *rt = (struct rtable *) dst;
946         struct flowi4 fl4;
947
948         ip_rt_build_flow_key(&fl4, sk, skb);
949         __ip_rt_update_pmtu(rt, &fl4, mtu);
950 }
951
952 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
953                       int oif, u32 mark, u8 protocol, int flow_flags)
954 {
955         const struct iphdr *iph = (const struct iphdr *) skb->data;
956         struct flowi4 fl4;
957         struct rtable *rt;
958
959         __build_flow_key(&fl4, NULL, iph, oif,
960                          RT_TOS(iph->tos), protocol, mark, flow_flags);
961         rt = __ip_route_output_key(net, &fl4);
962         if (!IS_ERR(rt)) {
963                 __ip_rt_update_pmtu(rt, &fl4, mtu);
964                 ip_rt_put(rt);
965         }
966 }
967 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
968
969 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
970 {
971         const struct iphdr *iph = (const struct iphdr *) skb->data;
972         struct flowi4 fl4;
973         struct rtable *rt;
974
975         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
976         rt = __ip_route_output_key(sock_net(sk), &fl4);
977         if (!IS_ERR(rt)) {
978                 __ip_rt_update_pmtu(rt, &fl4, mtu);
979                 ip_rt_put(rt);
980         }
981 }
982
983 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
984 {
985         const struct iphdr *iph = (const struct iphdr *) skb->data;
986         struct flowi4 fl4;
987         struct rtable *rt;
988         struct dst_entry *dst;
989         bool new = false;
990
991         bh_lock_sock(sk);
992         rt = (struct rtable *) __sk_dst_get(sk);
993
994         if (sock_owned_by_user(sk) || !rt) {
995                 __ipv4_sk_update_pmtu(skb, sk, mtu);
996                 goto out;
997         }
998
999         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1000
1001         if (!__sk_dst_check(sk, 0)) {
1002                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1003                 if (IS_ERR(rt))
1004                         goto out;
1005
1006                 new = true;
1007         }
1008
1009         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1010
1011         dst = dst_check(&rt->dst, 0);
1012         if (!dst) {
1013                 if (new)
1014                         dst_release(&rt->dst);
1015
1016                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1017                 if (IS_ERR(rt))
1018                         goto out;
1019
1020                 new = true;
1021         }
1022
1023         if (new)
1024                 __sk_dst_set(sk, &rt->dst);
1025
1026 out:
1027         bh_unlock_sock(sk);
1028 }
1029 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1030
1031 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1032                    int oif, u32 mark, u8 protocol, int flow_flags)
1033 {
1034         const struct iphdr *iph = (const struct iphdr *) skb->data;
1035         struct flowi4 fl4;
1036         struct rtable *rt;
1037
1038         __build_flow_key(&fl4, NULL, iph, oif,
1039                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1040         rt = __ip_route_output_key(net, &fl4);
1041         if (!IS_ERR(rt)) {
1042                 __ip_do_redirect(rt, skb, &fl4, false);
1043                 ip_rt_put(rt);
1044         }
1045 }
1046 EXPORT_SYMBOL_GPL(ipv4_redirect);
1047
1048 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1049 {
1050         const struct iphdr *iph = (const struct iphdr *) skb->data;
1051         struct flowi4 fl4;
1052         struct rtable *rt;
1053
1054         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1055         rt = __ip_route_output_key(sock_net(sk), &fl4);
1056         if (!IS_ERR(rt)) {
1057                 __ip_do_redirect(rt, skb, &fl4, false);
1058                 ip_rt_put(rt);
1059         }
1060 }
1061 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1062
1063 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1064 {
1065         struct rtable *rt = (struct rtable *) dst;
1066
1067         /* All IPV4 dsts are created with ->obsolete set to the value
1068          * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1069          * down into this function.
1070          *
1071          * When a PMTU/redirect information update invalidates a
1072          * route, this is indicated by setting obsolete to
1073          * DST_OBSOLETE_KILL.
1074          */
1075         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1076                 return NULL;
1077         return dst;
1078 }
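/*
 * The "cookie" argument is effectively unused for IPv4: invalidation
 * state lives in rt_genid (bumped by rt_cache_flush()) and in
 * dst->obsolete, so there is nothing per-caller to compare against.
 */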
1079
1080 static void ipv4_link_failure(struct sk_buff *skb)
1081 {
1082         struct rtable *rt;
1083
1084         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1085
1086         rt = skb_rtable(skb);
1087         if (rt)
1088                 dst_set_expires(&rt->dst, 0);
1089 }
1090
1091 static int ip_rt_bug(struct sk_buff *skb)
1092 {
1093         pr_debug("%s: %pI4 -> %pI4, %s\n",
1094                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1095                  skb->dev ? skb->dev->name : "?");
1096         kfree_skb(skb);
1097         WARN_ON(1);
1098         return 0;
1099 }
1100
1101 /*
1102    We do not cache the source address of the outgoing interface,
1103    because it is used only by the IP RR, TS and SRR options,
1104    so it is out of the fast path.
1105
1106    BTW remember: "addr" is allowed to be unaligned
1107    in IP options!
1108  */
1109
1110 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1111 {
1112         __be32 src;
1113
1114         if (rt_is_output_route(rt))
1115                 src = ip_hdr(skb)->saddr;
1116         else {
1117                 struct fib_result res;
1118                 struct flowi4 fl4;
1119                 struct iphdr *iph;
1120
1121                 iph = ip_hdr(skb);
1122
1123                 memset(&fl4, 0, sizeof(fl4));
1124                 fl4.daddr = iph->daddr;
1125                 fl4.saddr = iph->saddr;
1126                 fl4.flowi4_tos = RT_TOS(iph->tos);
1127                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1128                 fl4.flowi4_iif = skb->dev->ifindex;
1129                 fl4.flowi4_mark = skb->mark;
1130
1131                 rcu_read_lock();
1132                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1133                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1134                 else
1135                         src = inet_select_addr(rt->dst.dev,
1136                                                rt_nexthop(rt, iph->daddr),
1137                                                RT_SCOPE_UNIVERSE);
1138                 rcu_read_unlock();
1139         }
1140         memcpy(addr, &src, 4);
1141 }
1142
1143 #ifdef CONFIG_IP_ROUTE_CLASSID
1144 static void set_class_tag(struct rtable *rt, u32 tag)
1145 {
1146         if (!(rt->dst.tclassid & 0xFFFF))
1147                 rt->dst.tclassid |= tag & 0xFFFF;
1148         if (!(rt->dst.tclassid & 0xFFFF0000))
1149                 rt->dst.tclassid |= tag & 0xFFFF0000;
1150 }
1151 #endif
1152
1153 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1154 {
1155         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1156
1157         if (advmss == 0) {
1158                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1159                                ip_rt_min_advmss);
1160                 if (advmss > 65535 - 40)
1161                         advmss = 65535 - 40;
1162         }
1163         return advmss;
1164 }
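/*
 * In other words: when no RTAX_ADVMSS metric is set, advertise
 * dev->mtu - 40 (one bare IPv4 header plus one bare TCP header),
 * floored at ip_rt_min_advmss (256 by default) and capped so the MSS
 * never exceeds 65535 - 40.
 */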
1165
1166 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1167 {
1168         const struct rtable *rt = (const struct rtable *) dst;
1169         unsigned int mtu = rt->rt_pmtu;
1170
1171         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1172                 mtu = dst_metric_raw(dst, RTAX_MTU);
1173
1174         if (mtu)
1175                 return mtu;
1176
1177         mtu = dst->dev->mtu;
1178
1179         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1180                 if (rt->rt_uses_gateway && mtu > 576)
1181                         mtu = 576;
1182         }
1183
1184         if (mtu > IP_MAX_MTU)
1185                 mtu = IP_MAX_MTU;
1186
1187         return mtu;
1188 }
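/*
 * To summarize the precedence above: a live (unexpired) learned PMTU
 * wins; otherwise an explicit RTAX_MTU metric is returned as-is; only
 * the final device-MTU fallback applies the 576-byte clamp (for
 * locked-metric routes via a gateway, the classic conservative IPv4
 * default) and the IP_MAX_MTU cap.
 */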
1189
1190 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1191 {
1192         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1193         struct fib_nh_exception *fnhe;
1194         u32 hval;
1195
1196         if (!hash)
1197                 return NULL;
1198
1199         hval = fnhe_hashfun(daddr);
1200
1201         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1202              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1203                 if (fnhe->fnhe_daddr == daddr)
1204                         return fnhe;
1205         }
1206         return NULL;
1207 }
1208
1209 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1210                               __be32 daddr)
1211 {
1212         bool ret = false;
1213
1214         spin_lock_bh(&fnhe_lock);
1215
1216         if (daddr == fnhe->fnhe_daddr) {
1217                 struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
1218                 if (orig && rt_is_expired(orig)) {
1219                         fnhe->fnhe_gw = 0;
1220                         fnhe->fnhe_pmtu = 0;
1221                         fnhe->fnhe_expires = 0;
1222                 }
1223                 if (fnhe->fnhe_pmtu) {
1224                         unsigned long expires = fnhe->fnhe_expires;
1225                         unsigned long diff = expires - jiffies;
1226
1227                         if (time_before(jiffies, expires)) {
1228                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1229                                 dst_set_expires(&rt->dst, diff);
1230                         }
1231                 }
1232                 if (fnhe->fnhe_gw) {
1233                         rt->rt_flags |= RTCF_REDIRECTED;
1234                         rt->rt_gateway = fnhe->fnhe_gw;
1235                         rt->rt_uses_gateway = 1;
1236                 } else if (!rt->rt_gateway)
1237                         rt->rt_gateway = daddr;
1238
1239                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1240                 if (orig)
1241                         rt_free(orig);
1242
1243                 fnhe->fnhe_stamp = jiffies;
1244                 ret = true;
1245         }
1246         spin_unlock_bh(&fnhe_lock);
1247
1248         return ret;
1249 }
1250
1251 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1252 {
1253         struct rtable *orig, *prev, **p;
1254         bool ret = true;
1255
1256         if (rt_is_input_route(rt)) {
1257                 p = (struct rtable **)&nh->nh_rth_input;
1258         } else {
1259                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1260         }
1261         orig = *p;
1262
1263         prev = cmpxchg(p, orig, rt);
1264         if (prev == orig) {
1265                 if (orig)
1266                         rt_free(orig);
1267         } else
1268                 ret = false;
1269
1270         return ret;
1271 }
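/*
 * The cmpxchg() above is a lockless publish: the new route is installed
 * only if the slot still holds the value we read.  Roughly:
 *
 *      orig = *p;
 *      if (cmpxchg(p, orig, rt) == orig)
 *              rt_free(orig);          (we won; RCU-free the displaced route)
 *      else
 *              ret = false;            (a concurrent writer beat us)
 *
 * On failure the caller simply marks the route DST_NOCACHE and puts it
 * on the uncached list instead of retrying.
 */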
1272
1273 static DEFINE_SPINLOCK(rt_uncached_lock);
1274 static LIST_HEAD(rt_uncached_list);
1275
1276 static void rt_add_uncached_list(struct rtable *rt)
1277 {
1278         spin_lock_bh(&rt_uncached_lock);
1279         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1280         spin_unlock_bh(&rt_uncached_lock);
1281 }
1282
1283 static void ipv4_dst_destroy(struct dst_entry *dst)
1284 {
1285         struct rtable *rt = (struct rtable *) dst;
1286
1287         if (!list_empty(&rt->rt_uncached)) {
1288                 spin_lock_bh(&rt_uncached_lock);
1289                 list_del(&rt->rt_uncached);
1290                 spin_unlock_bh(&rt_uncached_lock);
1291         }
1292 }
1293
1294 void rt_flush_dev(struct net_device *dev)
1295 {
1296         if (!list_empty(&rt_uncached_list)) {
1297                 struct net *net = dev_net(dev);
1298                 struct rtable *rt;
1299
1300                 spin_lock_bh(&rt_uncached_lock);
1301                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1302                         if (rt->dst.dev != dev)
1303                                 continue;
1304                         rt->dst.dev = net->loopback_dev;
1305                         dev_hold(rt->dst.dev);
1306                         dev_put(dev);
1307                 }
1308                 spin_unlock_bh(&rt_uncached_lock);
1309         }
1310 }
1311
1312 static bool rt_cache_valid(const struct rtable *rt)
1313 {
1314         return  rt &&
1315                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1316                 !rt_is_expired(rt);
1317 }
1318
1319 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1320                            const struct fib_result *res,
1321                            struct fib_nh_exception *fnhe,
1322                            struct fib_info *fi, u16 type, u32 itag)
1323 {
1324         bool cached = false;
1325
1326         if (fi) {
1327                 struct fib_nh *nh = &FIB_RES_NH(*res);
1328
1329                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1330                         rt->rt_gateway = nh->nh_gw;
1331                         rt->rt_uses_gateway = 1;
1332                 }
1333                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1334 #ifdef CONFIG_IP_ROUTE_CLASSID
1335                 rt->dst.tclassid = nh->nh_tclassid;
1336 #endif
1337                 if (unlikely(fnhe))
1338                         cached = rt_bind_exception(rt, fnhe, daddr);
1339                 else if (!(rt->dst.flags & DST_NOCACHE))
1340                         cached = rt_cache_route(nh, rt);
1341                 if (unlikely(!cached)) {
1342                         /* Routes we intend to cache in the nexthop exception or
1343                          * FIB nexthop have the DST_NOCACHE bit clear.
1344                          * However, if we are unsuccessful at storing this
1345                          * route in the cache, we really need to set it.
1346                          */
1347                         rt->dst.flags |= DST_NOCACHE;
1348                         if (!rt->rt_gateway)
1349                                 rt->rt_gateway = daddr;
1350                         rt_add_uncached_list(rt);
1351                 }
1352         } else
1353                 rt_add_uncached_list(rt);
1354
1355 #ifdef CONFIG_IP_ROUTE_CLASSID
1356 #ifdef CONFIG_IP_MULTIPLE_TABLES
1357         set_class_tag(rt, res->tclassid);
1358 #endif
1359         set_class_tag(rt, itag);
1360 #endif
1361 }
1362
1363 static struct rtable *rt_dst_alloc(struct net_device *dev,
1364                                    bool nopolicy, bool noxfrm, bool will_cache)
1365 {
1366         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1367                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1368                          (nopolicy ? DST_NOPOLICY : 0) |
1369                          (noxfrm ? DST_NOXFRM : 0));
1370 }
1371
1372 /* called in rcu_read_lock() section */
1373 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1374                                 u8 tos, struct net_device *dev, int our)
1375 {
1376         struct rtable *rth;
1377         struct in_device *in_dev = __in_dev_get_rcu(dev);
1378         u32 itag = 0;
1379         int err;
1380
1381         /* Primary sanity checks. */
1382
1383         if (in_dev == NULL)
1384                 return -EINVAL;
1385
1386         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1387             skb->protocol != htons(ETH_P_IP))
1388                 goto e_inval;
1389
1390         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1391                 if (ipv4_is_loopback(saddr))
1392                         goto e_inval;
1393
1394         if (ipv4_is_zeronet(saddr)) {
1395                 if (!ipv4_is_local_multicast(daddr))
1396                         goto e_inval;
1397         } else {
1398                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1399                                           in_dev, &itag);
1400                 if (err < 0)
1401                         goto e_err;
1402         }
1403         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1404                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1405         if (!rth)
1406                 goto e_nobufs;
1407
1408 #ifdef CONFIG_IP_ROUTE_CLASSID
1409         rth->dst.tclassid = itag;
1410 #endif
1411         rth->dst.output = ip_rt_bug;
1412
1413         rth->rt_genid   = rt_genid(dev_net(dev));
1414         rth->rt_flags   = RTCF_MULTICAST;
1415         rth->rt_type    = RTN_MULTICAST;
1416         rth->rt_is_input= 1;
1417         rth->rt_iif     = 0;
1418         rth->rt_pmtu    = 0;
1419         rth->rt_gateway = 0;
1420         rth->rt_uses_gateway = 0;
1421         INIT_LIST_HEAD(&rth->rt_uncached);
1422         if (our) {
1423                 rth->dst.input= ip_local_deliver;
1424                 rth->rt_flags |= RTCF_LOCAL;
1425         }
1426
1427 #ifdef CONFIG_IP_MROUTE
1428         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1429                 rth->dst.input = ip_mr_input;
1430 #endif
1431         RT_CACHE_STAT_INC(in_slow_mc);
1432
1433         skb_dst_set(skb, &rth->dst);
1434         return 0;
1435
1436 e_nobufs:
1437         return -ENOBUFS;
1438 e_inval:
1439         return -EINVAL;
1440 e_err:
1441         return err;
1442 }
1443
1444
1445 static void ip_handle_martian_source(struct net_device *dev,
1446                                      struct in_device *in_dev,
1447                                      struct sk_buff *skb,
1448                                      __be32 daddr,
1449                                      __be32 saddr)
1450 {
1451         RT_CACHE_STAT_INC(in_martian_src);
1452 #ifdef CONFIG_IP_ROUTE_VERBOSE
1453         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1454                 /*
1455  *      RFC 1812 recommendation: if the source is martian,
1456  *      the only hint is the MAC header.
1457                  */
1458                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1459                         &daddr, &saddr, dev->name);
1460                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1461                         print_hex_dump(KERN_WARNING, "ll header: ",
1462                                        DUMP_PREFIX_OFFSET, 16, 1,
1463                                        skb_mac_header(skb),
1464                                        dev->hard_header_len, true);
1465                 }
1466         }
1467 #endif
1468 }
1469
1470 /* called in rcu_read_lock() section */
1471 static int __mkroute_input(struct sk_buff *skb,
1472                            const struct fib_result *res,
1473                            struct in_device *in_dev,
1474                            __be32 daddr, __be32 saddr, u32 tos)
1475 {
1476         struct rtable *rth;
1477         int err;
1478         struct in_device *out_dev;
1479         unsigned int flags = 0;
1480         bool do_cache;
1481         u32 itag = 0;
1482
1483         /* get a working reference to the output device */
1484         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1485         if (out_dev == NULL) {
1486                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1487                 return -EINVAL;
1488         }
1489
1490         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1491                                   in_dev->dev, in_dev, &itag);
1492         if (err < 0) {
1493                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1494                                          saddr);
1495
1496                 goto cleanup;
1497         }
1498
1499         do_cache = res->fi && !itag;
1500         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1501             (IN_DEV_SHARED_MEDIA(out_dev) ||
1502              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1503                 flags |= RTCF_DOREDIRECT;
1504                 do_cache = false;
1505         }
1506
1507         if (skb->protocol != htons(ETH_P_IP)) {
1508                 /* Not IP (i.e. ARP). Do not create a route if it is
1509                  * invalid for proxy ARP. DNAT routes are always valid.
1510                  *
1511                  * The proxy ARP feature has been extended to allow ARP
1512                  * replies back out the same interface, to support
1513                  * Private VLAN switch technologies. See arp.c.
1514                  */
1515                 if (out_dev == in_dev &&
1516                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1517                         err = -EINVAL;
1518                         goto cleanup;
1519                 }
1520         }

	if (do_cache) {
		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}

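/* Resolve the nexthop (selecting among multipath nexthops when the fib
 * entry has several) and then build the forwarding route proper.
 */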
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net	*net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	 * by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for the limited broadcast;
	 * it is not clear whether this should be fixed. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
	 * more than once when daddr and/or saddr are loopback addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  LOOPBACK_IFINDEX,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	RT_CACHE_STAT_INC(in_slow_tot);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	if (do_cache) {
		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
			rth->dst.flags |= DST_NOCACHE;
			rt_add_uncached_list(rth);
		}
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

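/* Entry point for routing a received packet when the caller does not
 * keep its own reference on the resulting dst (hence "_noref"): the
 * dst is attached to the skb under rcu_read_lock().  Multicast
 * destinations are recognised here; everything else goes through
 * ip_route_input_slow().
 *
 * Illustrative call, roughly as the receive path issues it (a sketch,
 * not copied from this file):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 */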
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic was moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network can acquire a lot of useless route cache entries, e.g. for
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);

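/* Build (or fetch from the per-nexthop cache) the rtable for an
 * outgoing flow once the fib result and the output device are known:
 * classify broadcast/multicast destinations, honour any PMTU/redirect
 * exception recorded for the nexthop, and wire up the output handler
 * (ip_output, or ip_mc_output for non-loopback multicast/broadcast).
 */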
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use
		 * the default one, but do not use a gateway in
		 * this case. Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth;
		else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}

/*
 * Major route resolver routine.
 */

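/* Resolve an output route for the flow described by @fl4, filling in
 * any unspecified saddr/oif fields along the way.  Runs under its own
 * rcu_read_lock() and returns either a valid rtable (with a reference
 * held for the caller) or an ERR_PTR().
 */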
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	int orig_oif;

	res.tclassid	= 0;
	res.fi		= NULL;
	res.table	= NULL;

	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source address
			   (the routing cache cannot know that ttl is zero,
			   so the packet will not leave this host and the
			   route is valid). Luckily, this hack is a good
			   workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch whether the destination is gatewayed,
			   rather than direct. Moreover, if MSG_DONTROUTE
			   is set, we send a packet, ignoring both routing
			   tables and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);

out:
	rcu_read_unlock();
	return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);

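/* "Blackhole" dst_ops: a stripped-down dst whose methods report nothing
 * and update nothing.  ipv4_blackhole_route() below clones an existing
 * route into this shape so that a caller (e.g. the xfrm code, which
 * registers it as its blackhole_route hook) keeps a valid,
 * reference-counted dst even though packets sent through it are
 * discarded.
 */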
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};

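/* Clone @dst_orig into a blackhole route: copy the identifying fields
 * of the original rtable, point input/output at dst_discard, and drop
 * the reference on the original.  Returns the new dst, or an ERR_PTR
 * on allocation failure.
 */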
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

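/* Like __ip_route_output_key(), but when the flow carries a protocol
 * the result is additionally passed through xfrm_lookup() so that any
 * matching IPsec policy is applied.
 *
 * Illustrative call, roughly as a datagram sender would issue it (a
 * sketch; the flowi4 setup is abbreviated):
 *
 *	struct flowi4 fl4;
 *
 *	flowi4_init_output(&fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
 *			   sk->sk_protocol, flow_flags,
 *			   daddr, saddr, dport, sport);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */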
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

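/* Fill a netlink message describing the route currently attached to
 * @skb: type, scope, addresses, oif/iif, gateway, metrics and cache
 * info.  Returns the length from nlmsg_end(), or -EMSGSIZE if the
 * message does not fit.
 */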
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

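/* RTM_GETROUTE handler: resolve a route on behalf of userspace (this
 * is what e.g. "ip route get <addr>" ends up exercising).  A dummy skb
 * is built, the lookup runs through the normal input or output path
 * depending on whether RTA_IIF was supplied, and the result is sent
 * back as an RTM_NEWROUTE message.
 */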
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

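/* The old route cache is gone, so a cache dump has nothing to report;
 * just return the current message length.
 */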
int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
	return skb->len;
}

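/* A device's multicast list changed: flush cached routes in its netns. */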
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;

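/* Write-only handler behind /proc/sys/net/ipv4/route/flush: any write
 * flushes the route cache of the owning netns; reads are rejected.
 * E.g. (illustrative):
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 */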
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		rt_cache_flush((struct net *)__ctl->extra1);
		return 0;
	}

	return -EINVAL;
}

static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

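/* Per-netns setup of the route generation counter; bumping rt_genid
 * later invalidates every cached route in that namespace at once.
 */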
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->rt_genid, 0);
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

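/* Per-netns inetpeer base: allocated at namespace creation, torn down
 * (together with its whole peer tree) at namespace exit.
 */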
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

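/* Boot-time initialization: allocate the dst caches, hook up the
 * devinet and fib layers, create the /proc entries, and register the
 * RTM_GETROUTE handler plus the per-netns operations defined above.
 */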
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif