net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely
 *                                      with BSD; our system is still very
 *                                      different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based
 *                                      routing, routing caches and better
 *                                      behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

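/* Tunable defaults for redirect and ICMP error rate limiting and for
 * PMTU handling.  Most of these are exported as sysctls under
 * /proc/sys/net/ipv4/route/ by the sysctl table further down in this
 * file.
 */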
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

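/* Every cached route records the generation id of its namespace at
 * creation time.  rt_cache_flush() simply bumps that id: all cached
 * routes then fail the rt_is_expired() check the next time they are
 * consulted, instead of being torn down eagerly.
 */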
static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u
struct ip_ident_bucket {
        atomic_t        id;
        u32             stamp32;
};

static struct ip_ident_bucket *ip_idents __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(bucket->stamp32);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
                u64 x = prandom_u32();

                x *= (now - old);
                delta = (u32)(x >> 32);
        }

        return atomic_add_return(segs + delta, &bucket->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

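/* Pick an IP ID for a locally generated datagram.  Flows are spread
 * over IP_IDENTS_SZ buckets by a keyed jhash of (daddr, saddr,
 * protocol), so unrelated flows do not share (or leak) an ID counter.
 */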
void __ip_select_ident(struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        static bool hashrnd_initialized = false;
        u32 hash, id;

        if (unlikely(!hashrnd_initialized)) {
                hashrnd_initialized = true;
                get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
        }

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

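/* Fill in a flowi4 key from a packet header.  When a socket is given,
 * its bound device, mark, TOS and protocol take precedence over the
 * values derived from the packet itself.
 */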
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

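/* Next-hop exceptions (fnhe) record per-destination state learned from
 * ICMP (redirected gateways, path MTU values) in a small hash table
 * hanging off the FIB nexthop.  Updates are serialized by fnhe_lock;
 * lookups run under RCU.
 */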
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;
        struct rtable *orig;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        orig = rcu_dereference(oldest->fnhe_rth);
        if (orig) {
                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
                rt_free(orig);
        }
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}

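/* Update the exception entry for daddr, or create one if none exists.
 * Once a chain is more than FNHE_RECLAIM_DEPTH deep, the entry with
 * the oldest stamp is recycled instead of growing the chain further.
 */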
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = expires;
                }
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
        return;
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them altogether,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* in_dev can be NULL while the device is being unregistered;
         * bail out early rather than dereference it below.
         */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

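/* Record a learned path MTU on the route and in a nexthop exception.
 * The value is clamped upward to ip_rt_min_pmtu and expires after
 * ip_rt_mtu_expires; a route that carried no PMTU yet is instead
 * marked DST_OBSOLETE_KILL so that callers re-look it up.
 */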
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (!rt->rt_pmtu) {
                dst->obsolete = DST_OBSOLETE_KILL;
        } else {
                rt->rt_pmtu = mtu;
                dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
        }

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

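/* Entry point for protocol handlers (e.g. ICMP) that learned a new MTU
 * for a destination without holding a route.  A hypothetical caller
 * sketch (not taken from this file): after parsing a FRAG_NEEDED ICMP
 * message for a GRE packet, one could propagate the MTU with
 *
 *      ipv4_update_pmtu(skb, dev_net(skb->dev), new_mtu,
 *                       0, 0, IPPROTO_GRE, 0);
 *
 * which routes the embedded header and applies __ip_rt_update_pmtu()
 * to the result.
 */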
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);
        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a
         * route, this is indicated by setting obsolete to
         * DST_OBSOLETE_KILL.
         */
        if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   Remember: "addr" is allowed to be unaligned when it comes
   from IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

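/* Effective MTU of a route, in decreasing order of precedence: a
 * still-valid learned PMTU, an explicit RTAX_MTU metric, then the
 * device MTU (capped at 576 for locked-MTU gatewayed routes and at
 * IP_MAX_MTU overall).
 */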
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        if (mtu > IP_MAX_MTU)
                mtu = IP_MAX_MTU;

        return mtu;
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

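/* Attach a route to a nexthop exception, pulling the exception's
 * learned gateway and PMTU into the route.  Returns false when daddr
 * no longer matches the entry (e.g. it raced with a reclaim); the
 * caller then falls back to an uncached route.
 */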
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
                if (orig && rt_is_expired(orig)) {
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                }
                if (fnhe->fnhe_pmtu) {
                        unsigned long expires = fnhe->fnhe_expires;
                        unsigned long diff = expires - jiffies;

                        if (time_before(jiffies, expires)) {
                                rt->rt_pmtu = fnhe->fnhe_pmtu;
                                dst_set_expires(&rt->dst, diff);
                        }
                }
                if (fnhe->fnhe_gw) {
                        rt->rt_flags |= RTCF_REDIRECTED;
                        rt->rt_gateway = fnhe->fnhe_gw;
                        rt->rt_uses_gateway = 1;
                } else if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                rcu_assign_pointer(fnhe->fnhe_rth, rt);
                if (orig)
                        rt_free(orig);

                fnhe->fnhe_stamp = jiffies;
                ret = true;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

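/* Publish rt as the cached route for this nexthop: a per-CPU slot for
 * output routes, a single slot for input routes.  The slot is claimed
 * with a lock-free cmpxchg(); on failure the caller keeps the route
 * uncached rather than retrying.
 */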
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

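/* Routes that could not be cached (DST_NOCACHE) are kept on this list
 * so that rt_flush_dev() can still find them and retarget them at the
 * loopback device when their output device goes away.
 */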
static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

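/* Finish initializing a route from a FIB lookup result: inherit the
 * gateway and metrics from the nexthop, then try to cache the route in
 * a nexthop exception or in the nexthop itself, falling back to the
 * uncached list when neither works.
 */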
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC 1812 recommendation: if the source is martian,
                 *      the only hint we can give is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag = 0;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy arp. DNAT routes are always valid.
                 *
                 * The proxy arp feature has been extended to allow ARP
                 * replies back out the same interface, to support
                 * private VLAN switch technologies. See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        err = -EINVAL;
                        goto cleanup;
                }
        }

        if (do_cache) {
                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
                }
        }

        rth = rt_dst_alloc(out_dev->dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }

        rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
        rth->rt_flags = flags;
        rth->rt_type = res->type;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        RT_CACHE_STAT_INC(in_slow_tot);

        rth->dst.input = ip_forward;
        rth->dst.output = ip_output;

        rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
        skb_dst_set(skb, &rth->dst);
out:
        err = 0;
cleanup:
        return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
                            struct fib_result *res,
                            const struct flowi4 *fl4,
                            struct in_device *in_dev,
                            __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi && res->fi->fib_nhs > 1)
                fib_select_multipath(res);
#endif

        /* create a routing cache entry */
        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

1591 /*
1592  *      NOTE. We drop all packets that have a local source
1593  *      address, because every properly looped-back packet
1594  *      must already have the correct destination attached by the output routine.
1595  *
1596  *      This approach solves two big problems:
1597  *      1. Non-simplex devices are handled properly.
1598  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1599  *      Called with rcu_read_lock().
1600  */
1601
1602 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1603                                u8 tos, struct net_device *dev)
1604 {
1605         struct fib_result res;
1606         struct in_device *in_dev = __in_dev_get_rcu(dev);
1607         struct flowi4   fl4;
1608         unsigned int    flags = 0;
1609         u32             itag = 0;
1610         struct rtable   *rth;
1611         int             err = -EINVAL;
1612         struct net    *net = dev_net(dev);
1613         bool do_cache;
1614
1615         /* IP on this device is disabled. */
1616
1617         if (!in_dev)
1618                 goto out;
1619
1620         /* Check for the weirdest martians, which cannot be detected
1621            by fib_lookup.
1622          */
1623
1624         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1625                 goto martian_source;
1626
1627         res.fi = NULL;
1628         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1629                 goto brd_input;
1630
1631         /* Accept zero addresses only for limited broadcast;
1632          * I do not even know whether to fix it or not. Waiting for complaints :-)
1633          */
1634         if (ipv4_is_zeronet(saddr))
1635                 goto martian_source;
1636
1637         if (ipv4_is_zeronet(daddr))
1638                 goto martian_destination;
1639
1640         /* The following code avoids calling IN_DEV_NET_ROUTE_LOCALNET()
1641          * unnecessarily: at most once, and only when daddr and/or saddr is loopback.
1642          */
1643         if (ipv4_is_loopback(daddr)) {
1644                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1645                         goto martian_destination;
1646         } else if (ipv4_is_loopback(saddr)) {
1647                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1648                         goto martian_source;
1649         }
1650
1651         /*
1652          *      Now we are ready to route the packet.
1653          */
1654         fl4.flowi4_oif = 0;
1655         fl4.flowi4_iif = dev->ifindex;
1656         fl4.flowi4_mark = skb->mark;
1657         fl4.flowi4_tos = tos;
1658         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1659         fl4.daddr = daddr;
1660         fl4.saddr = saddr;
1661         err = fib_lookup(net, &fl4, &res);
1662         if (err != 0)
1663                 goto no_route;
1664
1665         if (res.type == RTN_BROADCAST)
1666                 goto brd_input;
1667
1668         if (res.type == RTN_LOCAL) {
1669                 err = fib_validate_source(skb, saddr, daddr, tos,
1670                                           LOOPBACK_IFINDEX,
1671                                           dev, in_dev, &itag);
1672                 if (err < 0)
1673                         goto martian_source_keep_err;
1674                 goto local_input;
1675         }
1676
1677         if (!IN_DEV_FORWARD(in_dev))
1678                 goto no_route;
1679         if (res.type != RTN_UNICAST)
1680                 goto martian_destination;
1681
1682         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1683 out:    return err;
1684
1685 brd_input:
1686         if (skb->protocol != htons(ETH_P_IP))
1687                 goto e_inval;
1688
1689         if (!ipv4_is_zeronet(saddr)) {
1690                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1691                                           in_dev, &itag);
1692                 if (err < 0)
1693                         goto martian_source_keep_err;
1694         }
1695         flags |= RTCF_BROADCAST;
1696         res.type = RTN_BROADCAST;
1697         RT_CACHE_STAT_INC(in_brd);
1698
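/* Deliver locally. If the source validated cleanly (no tag), try to
 * reuse the input dst cached on the nexthop; otherwise allocate one
 * and, when possible, cache it there.
 */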
1699 local_input:
1700         do_cache = false;
1701         if (res.fi) {
1702                 if (!itag) {
1703                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1704                         if (rt_cache_valid(rth)) {
1705                                 skb_dst_set_noref(skb, &rth->dst);
1706                                 err = 0;
1707                                 goto out;
1708                         }
1709                         do_cache = true;
1710                 }
1711         }
1712
1713         rth = rt_dst_alloc(net->loopback_dev,
1714                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1715         if (!rth)
1716                 goto e_nobufs;
1717
1718         rth->dst.input = ip_local_deliver;
1719         rth->dst.output = ip_rt_bug;
1720 #ifdef CONFIG_IP_ROUTE_CLASSID
1721         rth->dst.tclassid = itag;
1722 #endif
1723
1724         rth->rt_genid = rt_genid(net);
1725         rth->rt_flags   = flags|RTCF_LOCAL;
1726         rth->rt_type    = res.type;
1727         rth->rt_is_input = 1;
1728         rth->rt_iif     = 0;
1729         rth->rt_pmtu    = 0;
1730         rth->rt_gateway = 0;
1731         rth->rt_uses_gateway = 0;
1732         INIT_LIST_HEAD(&rth->rt_uncached);
1733         RT_CACHE_STAT_INC(in_slow_tot);
1734         if (res.type == RTN_UNREACHABLE) {
1735                 rth->dst.input = ip_error;
1736                 rth->dst.error = -err;
1737                 rth->rt_flags   &= ~RTCF_LOCAL;
1738         }
1739         if (do_cache) {
1740                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1741                         rth->dst.flags |= DST_NOCACHE;
1742                         rt_add_uncached_list(rth);
1743                 }
1744         }
1745         skb_dst_set(skb, &rth->dst);
1746         err = 0;
1747         goto out;
1748
1749 no_route:
1750         RT_CACHE_STAT_INC(in_no_route);
1751         res.type = RTN_UNREACHABLE;
1752         if (err == -ESRCH)
1753                 err = -ENETUNREACH;
1754         goto local_input;
1755
1756         /*
1757          *      Do not cache martian addresses: they should be logged (RFC 1812)
1758          */
1759 martian_destination:
1760         RT_CACHE_STAT_INC(in_martian_dst);
1761 #ifdef CONFIG_IP_ROUTE_VERBOSE
1762         if (IN_DEV_LOG_MARTIANS(in_dev))
1763                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1764                                      &daddr, &saddr, dev->name);
1765 #endif
1766
1767 e_inval:
1768         err = -EINVAL;
1769         goto out;
1770
1771 e_nobufs:
1772         err = -ENOBUFS;
1773         goto out;
1774
1775 martian_source:
1776         err = -EINVAL;
1777 martian_source_keep_err:
1778         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1779         goto out;
1780 }
1781
1782 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1783                          u8 tos, struct net_device *dev)
1784 {
1785         int res;
1786
1787         rcu_read_lock();
1788
1789         /* Multicast recognition logic was moved from the route cache to here.
1790            The problem was that too many Ethernet cards have broken/missing
1791            hardware multicast filters :-( As a result, a host on a multicast
1792            network acquires a lot of useless route cache entries, e.g. for
1793            SDR messages from all over the world. Now we try to get rid of them.
1794            Really, provided the software IP multicast filter is organized
1795            reasonably (at least, hashed), it does not cause a slowdown
1796            compared with route cache reject entries.
1797            Note that multicast routers are not affected, because a
1798            route cache entry is created eventually.
1799          */
1800         if (ipv4_is_multicast(daddr)) {
1801                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1802
1803                 if (in_dev) {
1804                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1805                                                   ip_hdr(skb)->protocol);
1806                         if (our
1807 #ifdef CONFIG_IP_MROUTE
1808                                 ||
1809                             (!ipv4_is_local_multicast(daddr) &&
1810                              IN_DEV_MFORWARD(in_dev))
1811 #endif
1812                            ) {
1813                                 int res = ip_route_input_mc(skb, daddr, saddr,
1814                                                             tos, dev, our);
1815                                 rcu_read_unlock();
1816                                 return res;
1817                         }
1818                 }
1819                 rcu_read_unlock();
1820                 return -EINVAL;
1821         }
1822         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1823         rcu_read_unlock();
1824         return res;
1825 }
1826 EXPORT_SYMBOL(ip_route_input_noref);
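/*
 * Example (illustrative sketch, not part of this file): roughly how the
 * receive path hands packets to the resolver above; compare
 * ip_rcv_finish() in ip_input.c. The function name example_route_input
 * is hypothetical.
 */
static int __maybe_unused example_route_input(struct sk_buff *skb)
{
        const struct iphdr *iph = ip_hdr(skb);
        int err;

        /* Attach a dst to the skb; on failure it is undeliverable. */
        err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                   iph->tos, skb->dev);
        if (unlikely(err)) {
                kfree_skb(skb);         /* e.g. -EINVAL for a martian source */
                return err;
        }
        /* dst.input is now ip_local_deliver, ip_forward, ... */
        return dst_input(skb);
}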
1827
1828 /* called with rcu_read_lock() */
1829 static struct rtable *__mkroute_output(const struct fib_result *res,
1830                                        const struct flowi4 *fl4, int orig_oif,
1831                                        struct net_device *dev_out,
1832                                        unsigned int flags)
1833 {
1834         struct fib_info *fi = res->fi;
1835         struct fib_nh_exception *fnhe;
1836         struct in_device *in_dev;
1837         u16 type = res->type;
1838         struct rtable *rth;
1839         bool do_cache;
1840
1841         in_dev = __in_dev_get_rcu(dev_out);
1842         if (!in_dev)
1843                 return ERR_PTR(-EINVAL);
1844
1845         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1846                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1847                         return ERR_PTR(-EINVAL);
1848
1849         if (ipv4_is_lbcast(fl4->daddr))
1850                 type = RTN_BROADCAST;
1851         else if (ipv4_is_multicast(fl4->daddr))
1852                 type = RTN_MULTICAST;
1853         else if (ipv4_is_zeronet(fl4->daddr))
1854                 return ERR_PTR(-EINVAL);
1855
1856         if (dev_out->flags & IFF_LOOPBACK)
1857                 flags |= RTCF_LOCAL;
1858
1859         do_cache = true;
1860         if (type == RTN_BROADCAST) {
1861                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1862                 fi = NULL;
1863         } else if (type == RTN_MULTICAST) {
1864                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1865                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1866                                      fl4->flowi4_proto))
1867                         flags &= ~RTCF_LOCAL;
1868                 else
1869                         do_cache = false;
1870                 /* If a multicast route does not exist, use the
1871                  * default one, but do not use a gateway in this case.
1872                  * Yes, it is a hack.
1873                  */
1874                 if (fi && res->prefixlen < 4)
1875                         fi = NULL;
1876         }
1877
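        /* Prefer a dst cached on a nexthop exception (PMTU/redirect state)
         * over the per-cpu output dst for this nexthop.
         */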
1878         fnhe = NULL;
1879         do_cache &= fi != NULL;
1880         if (do_cache) {
1881                 struct rtable __rcu **prth;
1882                 struct fib_nh *nh = &FIB_RES_NH(*res);
1883
1884                 fnhe = find_exception(nh, fl4->daddr);
1885                 if (fnhe)
1886                         prth = &fnhe->fnhe_rth;
1887                 else {
1888                         if (unlikely(fl4->flowi4_flags &
1889                                      FLOWI_FLAG_KNOWN_NH &&
1890                                      !(nh->nh_gw &&
1891                                        nh->nh_scope == RT_SCOPE_LINK))) {
1892                                 do_cache = false;
1893                                 goto add;
1894                         }
1895                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1896                 }
1897                 rth = rcu_dereference(*prth);
1898                 if (rt_cache_valid(rth)) {
1899                         dst_hold(&rth->dst);
1900                         return rth;
1901                 }
1902         }
1903
1904 add:
1905         rth = rt_dst_alloc(dev_out,
1906                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1907                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1908                            do_cache);
1909         if (!rth)
1910                 return ERR_PTR(-ENOBUFS);
1911
1912         rth->dst.output = ip_output;
1913
1914         rth->rt_genid = rt_genid(dev_net(dev_out));
1915         rth->rt_flags   = flags;
1916         rth->rt_type    = type;
1917         rth->rt_is_input = 0;
1918         rth->rt_iif     = orig_oif ? : 0;
1919         rth->rt_pmtu    = 0;
1920         rth->rt_gateway = 0;
1921         rth->rt_uses_gateway = 0;
1922         INIT_LIST_HEAD(&rth->rt_uncached);
1923
1924         RT_CACHE_STAT_INC(out_slow_tot);
1925
1926         if (flags & RTCF_LOCAL)
1927                 rth->dst.input = ip_local_deliver;
1928         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1929                 if (flags & RTCF_LOCAL &&
1930                     !(dev_out->flags & IFF_LOOPBACK)) {
1931                         rth->dst.output = ip_mc_output;
1932                         RT_CACHE_STAT_INC(out_slow_mc);
1933                 }
1934 #ifdef CONFIG_IP_MROUTE
1935                 if (type == RTN_MULTICAST) {
1936                         if (IN_DEV_MFORWARD(in_dev) &&
1937                             !ipv4_is_local_multicast(fl4->daddr)) {
1938                                 rth->dst.input = ip_mr_input;
1939                                 rth->dst.output = ip_mc_output;
1940                         }
1941                 }
1942 #endif
1943         }
1944
1945         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1946
1947         return rth;
1948 }
1949
1950 /*
1951  * Major route resolver routine.
1952  */
1953
1954 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1955 {
1956         struct net_device *dev_out = NULL;
1957         __u8 tos = RT_FL_TOS(fl4);
1958         unsigned int flags = 0;
1959         struct fib_result res;
1960         struct rtable *rth;
1961         int orig_oif;
1962
1963         res.tclassid    = 0;
1964         res.fi          = NULL;
1965         res.table       = NULL;
1966
1967         orig_oif = fl4->flowi4_oif;
1968
1969         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1970         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1971         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1972                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1973
1974         rcu_read_lock();
1975         if (fl4->saddr) {
1976                 rth = ERR_PTR(-EINVAL);
1977                 if (ipv4_is_multicast(fl4->saddr) ||
1978                     ipv4_is_lbcast(fl4->saddr) ||
1979                     ipv4_is_zeronet(fl4->saddr))
1980                         goto out;
1981
1982                 /* I removed the check for oif == dev_out->oif here.
1983                    It was wrong for two reasons:
1984                    1. ip_dev_find(net, saddr) can return the wrong iface if
1985                       saddr is assigned to multiple interfaces.
1986                    2. Moreover, we are allowed to send packets with the saddr
1987                       of another iface. --ANK
1988                  */
1989
1990                 if (fl4->flowi4_oif == 0 &&
1991                     (ipv4_is_multicast(fl4->daddr) ||
1992                      ipv4_is_lbcast(fl4->daddr))) {
1993                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1994                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1995                         if (dev_out == NULL)
1996                                 goto out;
1997
1998                         /* Special hack: the user can direct multicasts
1999                            and limited broadcast via the necessary interface
2000                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2001                            This hack is not just for fun; it allows
2002                            vic, vat and friends to work.
2003                            They bind a socket to loopback, set the ttl to zero
2004                            and expect that it will work.
2005                            From the viewpoint of the routing cache they are broken,
2006                            because we are not allowed to build a multicast path
2007                            with a loopback source addr (look: the routing cache
2008                            cannot know that the ttl is zero, so the packet
2009                            will not leave this host, making the route valid).
2010                            Luckily, this hack is a good workaround.
2011                          */
2012
2013                         fl4->flowi4_oif = dev_out->ifindex;
2014                         goto make_route;
2015                 }
2016
2017                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2018                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2019                         if (!__ip_dev_find(net, fl4->saddr, false))
2020                                 goto out;
2021                 }
2022         }
2023
2024
2025         if (fl4->flowi4_oif) {
2026                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2027                 rth = ERR_PTR(-ENODEV);
2028                 if (dev_out == NULL)
2029                         goto out;
2030
2031                 /* RACE: Check return value of inet_select_addr instead. */
2032                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2033                         rth = ERR_PTR(-ENETUNREACH);
2034                         goto out;
2035                 }
2036                 if (ipv4_is_local_multicast(fl4->daddr) ||
2037                     ipv4_is_lbcast(fl4->daddr)) {
2038                         if (!fl4->saddr)
2039                                 fl4->saddr = inet_select_addr(dev_out, 0,
2040                                                               RT_SCOPE_LINK);
2041                         goto make_route;
2042                 }
2043                 if (!fl4->saddr) {
2044                         if (ipv4_is_multicast(fl4->daddr))
2045                                 fl4->saddr = inet_select_addr(dev_out, 0,
2046                                                               fl4->flowi4_scope);
2047                         else if (!fl4->daddr)
2048                                 fl4->saddr = inet_select_addr(dev_out, 0,
2049                                                               RT_SCOPE_HOST);
2050                 }
2051         }
2052
2053         if (!fl4->daddr) {
2054                 fl4->daddr = fl4->saddr;
2055                 if (!fl4->daddr)
2056                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2057                 dev_out = net->loopback_dev;
2058                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2059                 res.type = RTN_LOCAL;
2060                 flags |= RTCF_LOCAL;
2061                 goto make_route;
2062         }
2063
2064         if (fib_lookup(net, fl4, &res)) {
2065                 res.fi = NULL;
2066                 res.table = NULL;
2067                 if (fl4->flowi4_oif) {
2068                         /* Apparently, the routing tables are wrong. Assume
2069                            that the destination is on-link.
2070
2071                            WHY? DW.
2072                            Because we are allowed to send to an iface
2073                            even if it has NO routes and NO assigned
2074                            addresses. When oif is specified, the routing
2075                            tables are looked up with only one purpose:
2076                            to catch whether the destination is gatewayed,
2077                            rather than direct. Moreover, if MSG_DONTROUTE is
2078                            set, we send the packet, ignoring both the routing
2079                            tables and the ifaddr state. --ANK
2080
2081
2082                            We could do the same even when oif is unknown,
2083                            as IPv6 likely does, but we do not.
2084                          */
2085
2086                         if (fl4->saddr == 0)
2087                                 fl4->saddr = inet_select_addr(dev_out, 0,
2088                                                               RT_SCOPE_LINK);
2089                         res.type = RTN_UNICAST;
2090                         goto make_route;
2091                 }
2092                 rth = ERR_PTR(-ENETUNREACH);
2093                 goto out;
2094         }
2095
2096         if (res.type == RTN_LOCAL) {
2097                 if (!fl4->saddr) {
2098                         if (res.fi->fib_prefsrc)
2099                                 fl4->saddr = res.fi->fib_prefsrc;
2100                         else
2101                                 fl4->saddr = fl4->daddr;
2102                 }
2103                 dev_out = net->loopback_dev;
2104                 fl4->flowi4_oif = dev_out->ifindex;
2105                 flags |= RTCF_LOCAL;
2106                 goto make_route;
2107         }
2108
2109 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2110         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2111                 fib_select_multipath(&res);
2112         else
2113 #endif
2114         if (!res.prefixlen &&
2115             res.table->tb_num_default > 1 &&
2116             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2117                 fib_select_default(&res);
2118
2119         if (!fl4->saddr)
2120                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2121
2122         dev_out = FIB_RES_DEV(res);
2123         fl4->flowi4_oif = dev_out->ifindex;
2124
2125
2126 make_route:
2127         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2128
2129 out:
2130         rcu_read_unlock();
2131         return rth;
2132 }
2133 EXPORT_SYMBOL_GPL(__ip_route_output_key);
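/*
 * Example (minimal sketch, assuming a caller that knows only the
 * destination): __ip_route_output_key() fills in the source address and
 * output interface it chose. The name example_output_lookup is
 * hypothetical.
 */
static int __maybe_unused example_output_lookup(struct net *net, __be32 daddr)
{
        struct flowi4 fl4;
        struct rtable *rt;

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = daddr;      /* saddr/oif left zero: the kernel picks */

        rt = __ip_route_output_key(net, &fl4);
        if (IS_ERR(rt))
                return PTR_ERR(rt);     /* e.g. -ENETUNREACH */

        /* fl4.saddr and fl4.flowi4_oif now describe the chosen route. */
        ip_rt_put(rt);
        return 0;
}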
2134
2135 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2136 {
2137         return NULL;
2138 }
2139
2140 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2141 {
2142         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2143
2144         return mtu ? : dst->dev->mtu;
2145 }
2146
2147 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2148                                           struct sk_buff *skb, u32 mtu)
2149 {
2150 }
2151
2152 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2153                                        struct sk_buff *skb)
2154 {
2155 }
2156
2157 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2158                                           unsigned long old)
2159 {
2160         return NULL;
2161 }
2162
2163 static struct dst_ops ipv4_dst_blackhole_ops = {
2164         .family                 =       AF_INET,
2165         .protocol               =       cpu_to_be16(ETH_P_IP),
2166         .check                  =       ipv4_blackhole_dst_check,
2167         .mtu                    =       ipv4_blackhole_mtu,
2168         .default_advmss         =       ipv4_default_advmss,
2169         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2170         .redirect               =       ipv4_rt_blackhole_redirect,
2171         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2172         .neigh_lookup           =       ipv4_neigh_lookup,
2173 };
2174
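/*
 * A blackhole route mirrors the metadata of dst_orig but discards all
 * traffic: input and output are dst_discard, and PMTU/redirect updates
 * are no-ops. The xfrm code falls back to this when it cannot yet
 * return a usable route (e.g. while IPsec state is still being resolved).
 */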
2175 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2176 {
2177         struct rtable *ort = (struct rtable *) dst_orig;
2178         struct rtable *rt;
2179
2180         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2181         if (rt) {
2182                 struct dst_entry *new = &rt->dst;
2183
2184                 new->__use = 1;
2185                 new->input = dst_discard;
2186                 new->output = dst_discard;
2187
2188                 new->dev = ort->dst.dev;
2189                 if (new->dev)
2190                         dev_hold(new->dev);
2191
2192                 rt->rt_is_input = ort->rt_is_input;
2193                 rt->rt_iif = ort->rt_iif;
2194                 rt->rt_pmtu = ort->rt_pmtu;
2195
2196                 rt->rt_genid = rt_genid(net);
2197                 rt->rt_flags = ort->rt_flags;
2198                 rt->rt_type = ort->rt_type;
2199                 rt->rt_gateway = ort->rt_gateway;
2200                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2201
2202                 INIT_LIST_HEAD(&rt->rt_uncached);
2203
2204                 dst_free(new);
2205         }
2206
2207         dst_release(dst_orig);
2208
2209         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2210 }
2211
2212 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2213                                     struct sock *sk)
2214 {
2215         struct rtable *rt = __ip_route_output_key(net, flp4);
2216
2217         if (IS_ERR(rt))
2218                 return rt;
2219
2220         if (flp4->flowi4_proto)
2221                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2222                                                    flowi4_to_flowi(flp4),
2223                                                    sk, 0);
2224
2225         return rt;
2226 }
2227 EXPORT_SYMBOL_GPL(ip_route_output_flow);
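/*
 * Example (sketch): protocol code that knows the full flow should go
 * through ip_route_output_flow() so that a matching IPsec policy is
 * applied. The helper name example_udp_route and its parameters are
 * hypothetical; flowi4_init_output() is the stock helper from
 * <net/flow.h>.
 */
static struct rtable * __maybe_unused
example_udp_route(struct net *net, struct flowi4 *fl4,
                  __be32 daddr, __be16 dport)
{
        flowi4_init_output(fl4, 0, 0, 0, RT_SCOPE_UNIVERSE,
                           IPPROTO_UDP, 0, daddr, 0, dport, 0);
        return ip_route_output_flow(net, fl4, NULL);
}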
2228
2229 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2230                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2231                         u32 seq, int event, int nowait, unsigned int flags)
2232 {
2233         struct rtable *rt = skb_rtable(skb);
2234         struct rtmsg *r;
2235         struct nlmsghdr *nlh;
2236         unsigned long expires = 0;
2237         u32 error;
2238         u32 metrics[RTAX_MAX];
2239
2240         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2241         if (nlh == NULL)
2242                 return -EMSGSIZE;
2243
2244         r = nlmsg_data(nlh);
2245         r->rtm_family    = AF_INET;
2246         r->rtm_dst_len  = 32;
2247         r->rtm_src_len  = 0;
2248         r->rtm_tos      = fl4->flowi4_tos;
2249         r->rtm_table    = RT_TABLE_MAIN;
2250         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2251                 goto nla_put_failure;
2252         r->rtm_type     = rt->rt_type;
2253         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2254         r->rtm_protocol = RTPROT_UNSPEC;
2255         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2256         if (rt->rt_flags & RTCF_NOTIFY)
2257                 r->rtm_flags |= RTM_F_NOTIFY;
2258
2259         if (nla_put_be32(skb, RTA_DST, dst))
2260                 goto nla_put_failure;
2261         if (src) {
2262                 r->rtm_src_len = 32;
2263                 if (nla_put_be32(skb, RTA_SRC, src))
2264                         goto nla_put_failure;
2265         }
2266         if (rt->dst.dev &&
2267             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2268                 goto nla_put_failure;
2269 #ifdef CONFIG_IP_ROUTE_CLASSID
2270         if (rt->dst.tclassid &&
2271             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2272                 goto nla_put_failure;
2273 #endif
2274         if (!rt_is_input_route(rt) &&
2275             fl4->saddr != src) {
2276                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2277                         goto nla_put_failure;
2278         }
2279         if (rt->rt_uses_gateway &&
2280             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2281                 goto nla_put_failure;
2282
2283         expires = rt->dst.expires;
2284         if (expires) {
2285                 unsigned long now = jiffies;
2286
2287                 if (time_before(now, expires))
2288                         expires -= now;
2289                 else
2290                         expires = 0;
2291         }
2292
2293         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2294         if (rt->rt_pmtu && expires)
2295                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2296         if (rtnetlink_put_metrics(skb, metrics) < 0)
2297                 goto nla_put_failure;
2298
2299         if (fl4->flowi4_mark &&
2300             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2301                 goto nla_put_failure;
2302
2303         error = rt->dst.error;
2304
2305         if (rt_is_input_route(rt)) {
2306 #ifdef CONFIG_IP_MROUTE
2307                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2308                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2309                         int err = ipmr_get_route(net, skb,
2310                                                  fl4->saddr, fl4->daddr,
2311                                                  r, nowait);
2312                         if (err <= 0) {
2313                                 if (!nowait) {
2314                                         if (err == 0)
2315                                                 return 0;
2316                                         goto nla_put_failure;
2317                                 } else {
2318                                         if (err == -EMSGSIZE)
2319                                                 goto nla_put_failure;
2320                                         error = err;
2321                                 }
2322                         }
2323                 } else
2324 #endif
2325                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2326                                 goto nla_put_failure;
2327         }
2328
2329         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2330                 goto nla_put_failure;
2331
2332         return nlmsg_end(skb, nlh);
2333
2334 nla_put_failure:
2335         nlmsg_cancel(skb, nlh);
2336         return -EMSGSIZE;
2337 }
2338
2339 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2340 {
2341         struct net *net = sock_net(in_skb->sk);
2342         struct rtmsg *rtm;
2343         struct nlattr *tb[RTA_MAX+1];
2344         struct rtable *rt = NULL;
2345         struct flowi4 fl4;
2346         __be32 dst = 0;
2347         __be32 src = 0;
2348         u32 iif;
2349         int err;
2350         int mark;
2351         struct sk_buff *skb;
2352
2353         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2354         if (err < 0)
2355                 goto errout;
2356
2357         rtm = nlmsg_data(nlh);
2358
2359         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2360         if (skb == NULL) {
2361                 err = -ENOBUFS;
2362                 goto errout;
2363         }
2364
2365         /* Reserve room for dummy headers; this skb can pass
2366            through a good chunk of the routing engine.
2367          */
2368         skb_reset_mac_header(skb);
2369         skb_reset_network_header(skb);
2370
2371         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2372         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2373         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2374
2375         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2376         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2377         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2378         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2379
2380         memset(&fl4, 0, sizeof(fl4));
2381         fl4.daddr = dst;
2382         fl4.saddr = src;
2383         fl4.flowi4_tos = rtm->rtm_tos;
2384         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2385         fl4.flowi4_mark = mark;
2386
2387         if (iif) {
2388                 struct net_device *dev;
2389
2390                 dev = __dev_get_by_index(net, iif);
2391                 if (dev == NULL) {
2392                         err = -ENODEV;
2393                         goto errout_free;
2394                 }
2395
2396                 skb->protocol   = htons(ETH_P_IP);
2397                 skb->dev        = dev;
2398                 skb->mark       = mark;
2399                 local_bh_disable();
2400                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2401                 local_bh_enable();
2402
2403                 rt = skb_rtable(skb);
2404                 if (err == 0 && rt->dst.error)
2405                         err = -rt->dst.error;
2406         } else {
2407                 rt = ip_route_output_key(net, &fl4);
2408
2409                 err = 0;
2410                 if (IS_ERR(rt))
2411                         err = PTR_ERR(rt);
2412         }
2413
2414         if (err)
2415                 goto errout_free;
2416
2417         skb_dst_set(skb, &rt->dst);
2418         if (rtm->rtm_flags & RTM_F_NOTIFY)
2419                 rt->rt_flags |= RTCF_NOTIFY;
2420
2421         err = rt_fill_info(net, dst, src, &fl4, skb,
2422                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2423                            RTM_NEWROUTE, 0, 0);
2424         if (err <= 0)
2425                 goto errout_free;
2426
2427         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2428 errout:
2429         return err;
2430
2431 errout_free:
2432         kfree_skb(skb);
2433         goto errout;
2434 }
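/*
 * Example (userspace sketch, kept out of the build): the request that
 * lands in inet_rtm_getroute() above, essentially what `ip route get`
 * sends. Error handling is elided and the address is hypothetical.
 */
#if 0
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static void route_get_example(void)
{
        struct {
                struct nlmsghdr nlh;
                struct rtmsg    rtm;
                char            attr[RTA_LENGTH(sizeof(struct in_addr))];
        } req;
        struct rtattr *rta = (struct rtattr *)req.attr;
        char reply[4096];
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        memset(&req, 0, sizeof(req));
        req.nlh.nlmsg_len   = sizeof(req);
        req.nlh.nlmsg_type  = RTM_GETROUTE;
        req.nlh.nlmsg_flags = NLM_F_REQUEST;
        req.rtm.rtm_family  = AF_INET;
        req.rtm.rtm_dst_len = 32;

        rta->rta_type = RTA_DST;
        rta->rta_len  = RTA_LENGTH(sizeof(struct in_addr));
        inet_pton(AF_INET, "8.8.8.8", RTA_DATA(rta));

        send(fd, &req, sizeof(req), 0);
        /* The reply is the RTM_NEWROUTE message built by rt_fill_info(). */
        recv(fd, reply, sizeof(reply), 0);
}
#endif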
2435
2436 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2437 {
2438         return skb->len;
2439 }
2440
2441 void ip_rt_multicast_event(struct in_device *in_dev)
2442 {
2443         rt_cache_flush(dev_net(in_dev->dev));
2444 }
2445
2446 #ifdef CONFIG_SYSCTL
2447 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2448 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
2449 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2450 static int ip_rt_gc_elasticity __read_mostly    = 8;
2451
2452 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2453                                         void __user *buffer,
2454                                         size_t *lenp, loff_t *ppos)
2455 {
2456         if (write) {
2457                 rt_cache_flush((struct net *)__ctl->extra1);
2458                 return 0;
2459         }
2460
2461         return -EINVAL;
2462 }
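/*
 * Usage note: the handler above backs /proc/sys/net/ipv4/route/flush
 * (registered below). Writing any value flushes the cached routes, e.g.
 *
 *      echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * while reading the file fails with EINVAL.
 */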
2463
2464 static ctl_table ipv4_route_table[] = {
2465         {
2466                 .procname       = "gc_thresh",
2467                 .data           = &ipv4_dst_ops.gc_thresh,
2468                 .maxlen         = sizeof(int),
2469                 .mode           = 0644,
2470                 .proc_handler   = proc_dointvec,
2471         },
2472         {
2473                 .procname       = "max_size",
2474                 .data           = &ip_rt_max_size,
2475                 .maxlen         = sizeof(int),
2476                 .mode           = 0644,
2477                 .proc_handler   = proc_dointvec,
2478         },
2479         {
2480                 /*  Deprecated. Use gc_min_interval_ms */
2481
2482                 .procname       = "gc_min_interval",
2483                 .data           = &ip_rt_gc_min_interval,
2484                 .maxlen         = sizeof(int),
2485                 .mode           = 0644,
2486                 .proc_handler   = proc_dointvec_jiffies,
2487         },
2488         {
2489                 .procname       = "gc_min_interval_ms",
2490                 .data           = &ip_rt_gc_min_interval,
2491                 .maxlen         = sizeof(int),
2492                 .mode           = 0644,
2493                 .proc_handler   = proc_dointvec_ms_jiffies,
2494         },
2495         {
2496                 .procname       = "gc_timeout",
2497                 .data           = &ip_rt_gc_timeout,
2498                 .maxlen         = sizeof(int),
2499                 .mode           = 0644,
2500                 .proc_handler   = proc_dointvec_jiffies,
2501         },
2502         {
2503                 .procname       = "gc_interval",
2504                 .data           = &ip_rt_gc_interval,
2505                 .maxlen         = sizeof(int),
2506                 .mode           = 0644,
2507                 .proc_handler   = proc_dointvec_jiffies,
2508         },
2509         {
2510                 .procname       = "redirect_load",
2511                 .data           = &ip_rt_redirect_load,
2512                 .maxlen         = sizeof(int),
2513                 .mode           = 0644,
2514                 .proc_handler   = proc_dointvec,
2515         },
2516         {
2517                 .procname       = "redirect_number",
2518                 .data           = &ip_rt_redirect_number,
2519                 .maxlen         = sizeof(int),
2520                 .mode           = 0644,
2521                 .proc_handler   = proc_dointvec,
2522         },
2523         {
2524                 .procname       = "redirect_silence",
2525                 .data           = &ip_rt_redirect_silence,
2526                 .maxlen         = sizeof(int),
2527                 .mode           = 0644,
2528                 .proc_handler   = proc_dointvec,
2529         },
2530         {
2531                 .procname       = "error_cost",
2532                 .data           = &ip_rt_error_cost,
2533                 .maxlen         = sizeof(int),
2534                 .mode           = 0644,
2535                 .proc_handler   = proc_dointvec,
2536         },
2537         {
2538                 .procname       = "error_burst",
2539                 .data           = &ip_rt_error_burst,
2540                 .maxlen         = sizeof(int),
2541                 .mode           = 0644,
2542                 .proc_handler   = proc_dointvec,
2543         },
2544         {
2545                 .procname       = "gc_elasticity",
2546                 .data           = &ip_rt_gc_elasticity,
2547                 .maxlen         = sizeof(int),
2548                 .mode           = 0644,
2549                 .proc_handler   = proc_dointvec,
2550         },
2551         {
2552                 .procname       = "mtu_expires",
2553                 .data           = &ip_rt_mtu_expires,
2554                 .maxlen         = sizeof(int),
2555                 .mode           = 0644,
2556                 .proc_handler   = proc_dointvec_jiffies,
2557         },
2558         {
2559                 .procname       = "min_pmtu",
2560                 .data           = &ip_rt_min_pmtu,
2561                 .maxlen         = sizeof(int),
2562                 .mode           = 0644,
2563                 .proc_handler   = proc_dointvec,
2564         },
2565         {
2566                 .procname       = "min_adv_mss",
2567                 .data           = &ip_rt_min_advmss,
2568                 .maxlen         = sizeof(int),
2569                 .mode           = 0644,
2570                 .proc_handler   = proc_dointvec,
2571         },
2572         { }
2573 };
2574
2575 static struct ctl_table ipv4_route_flush_table[] = {
2576         {
2577                 .procname       = "flush",
2578                 .maxlen         = sizeof(int),
2579                 .mode           = 0200,
2580                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2581         },
2582         { },
2583 };
2584
2585 static __net_init int sysctl_route_net_init(struct net *net)
2586 {
2587         struct ctl_table *tbl;
2588
2589         tbl = ipv4_route_flush_table;
2590         if (!net_eq(net, &init_net)) {
2591                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2592                 if (tbl == NULL)
2593                         goto err_dup;
2594
2595                 /* Don't export sysctls to unprivileged users */
2596                 if (net->user_ns != &init_user_ns)
2597                         tbl[0].procname = NULL;
2598         }
2599         tbl[0].extra1 = net;
2600
2601         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2602         if (net->ipv4.route_hdr == NULL)
2603                 goto err_reg;
2604         return 0;
2605
2606 err_reg:
2607         if (tbl != ipv4_route_flush_table)
2608                 kfree(tbl);
2609 err_dup:
2610         return -ENOMEM;
2611 }
2612
2613 static __net_exit void sysctl_route_net_exit(struct net *net)
2614 {
2615         struct ctl_table *tbl;
2616
2617         tbl = net->ipv4.route_hdr->ctl_table_arg;
2618         unregister_net_sysctl_table(net->ipv4.route_hdr);
2619         BUG_ON(tbl == ipv4_route_flush_table);
2620         kfree(tbl);
2621 }
2622
2623 static __net_initdata struct pernet_operations sysctl_route_ops = {
2624         .init = sysctl_route_net_init,
2625         .exit = sysctl_route_net_exit,
2626 };
2627 #endif
2628
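/*
 * Each netns carries a route-cache generation counter. Bumping it (as
 * rt_cache_flush() does) lazily invalidates every cached dst whose
 * rt_genid no longer matches rt_genid(net).
 */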
2629 static __net_init int rt_genid_init(struct net *net)
2630 {
2631         atomic_set(&net->rt_genid, 0);
2632         get_random_bytes(&net->ipv4.dev_addr_genid,
2633                          sizeof(net->ipv4.dev_addr_genid));
2634         return 0;
2635 }
2636
2637 static __net_initdata struct pernet_operations rt_genid_ops = {
2638         .init = rt_genid_init,
2639 };
2640
2641 static int __net_init ipv4_inetpeer_init(struct net *net)
2642 {
2643         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2644
2645         if (!bp)
2646                 return -ENOMEM;
2647         inet_peer_base_init(bp);
2648         net->ipv4.peers = bp;
2649         return 0;
2650 }
2651
2652 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2653 {
2654         struct inet_peer_base *bp = net->ipv4.peers;
2655
2656         net->ipv4.peers = NULL;
2657         inetpeer_invalidate_tree(bp);
2658         kfree(bp);
2659 }
2660
2661 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2662         .init   =       ipv4_inetpeer_init,
2663         .exit   =       ipv4_inetpeer_exit,
2664 };
2665
2666 #ifdef CONFIG_IP_ROUTE_CLASSID
2667 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2668 #endif /* CONFIG_IP_ROUTE_CLASSID */
2669
2670 int __init ip_rt_init(void)
2671 {
2672         int rc = 0;
2673
2674         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2675         if (!ip_idents)
2676                 panic("IP: failed to allocate ip_idents\n");
2677
2678         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2679
2680 #ifdef CONFIG_IP_ROUTE_CLASSID
2681         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2682         if (!ip_rt_acct)
2683                 panic("IP: failed to allocate ip_rt_acct\n");
2684 #endif
2685
2686         ipv4_dst_ops.kmem_cachep =
2687                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2688                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2689
2690         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2691
2692         if (dst_entries_init(&ipv4_dst_ops) < 0)
2693                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2694
2695         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2696                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2697
2698         ipv4_dst_ops.gc_thresh = ~0;
2699         ip_rt_max_size = INT_MAX;
2700
2701         devinet_init();
2702         ip_fib_init();
2703
2704         if (ip_rt_proc_init())
2705                 pr_err("Unable to create route proc files\n");
2706 #ifdef CONFIG_XFRM
2707         xfrm_init();
2708         xfrm4_init();
2709 #endif
2710         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2711
2712 #ifdef CONFIG_SYSCTL
2713         register_pernet_subsys(&sysctl_route_ops);
2714 #endif
2715         register_pernet_subsys(&rt_genid_ops);
2716         register_pernet_subsys(&ipv4_inetpeer_ops);
2717         return rc;
2718 }
2719
2720 #ifdef CONFIG_SYSCTL
2721 /*
2722  * We really need to sanitize the damn ipv4 init order, then all
2723  * this nonsense will go away.
2724  */
2725 void __init ip_static_sysctl_init(void)
2726 {
2727         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2728 }
2729 #endif