net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

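/*
 * TOS-to-priority mapping; rt_tos2priority() indexes this table with
 * IPTOS_TOS(tos) >> 1.  Each ECN_OR_COST() entry expands to the same
 * TC_PRIO_* value as its even-index neighbour, so the lowest TOS bit
 * does not change the resulting band.
 */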
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}

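/*
 * Resolve the neighbour for a route: prefer the route's gateway as the
 * ARP key, fall back to the destination of the packet at hand, then to
 * the daddr argument.  Creates the neighbour entry if the lookup misses.
 */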
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u
struct ip_ident_bucket {
        atomic_t        id;
        u32             stamp32;
};

static struct ip_ident_bucket *ip_idents __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
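/*
 * The 32x32->64 multiply below draws delta roughly uniformly from
 * [0, now - old): prandom_u32() scaled by the bucket's idle time in
 * jiffies.  The longer the bucket sat idle, the larger the random jump
 * in the ID sequence.
 */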
u32 ip_idents_reserve(u32 hash, int segs)
{
        struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(bucket->stamp32);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
                u64 x = prandom_u32();

                x *= (now - old);
                delta = (u32)(x >> 32);
        }

        return atomic_add_return(segs + delta, &bucket->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        static bool hashrnd_initialized = false;
        u32 hash, id;

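        /* The flag test below is racy on first use: another CPU can see
         * hashrnd_initialized == true before get_random_bytes() has
         * written the seed and briefly hash with the initial zero value.
         * Later kernels use net_get_random_once() here instead.
         */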
        if (unlikely(!hashrnd_initialized)) {
                hashrnd_initialized = true;
                get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
        }

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

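/*
 * Pick the reclaim victim in a bucket: walk the chain for the entry
 * with the oldest fnhe_stamp, drop its cached route (if any) via RCU,
 * and hand the entry back for reuse.  Caller holds fnhe_lock.
 */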
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;
        struct rtable *orig;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        orig = rcu_dereference(oldest->fnhe_rth);
        if (orig) {
                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
                rt_free(orig);
        }
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = expires;
                }
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
        return;
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

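/*
 * With the defaults above this works out to: once a host has ignored
 * rate_tokens redirects, the next one is held off for
 * ip_rt_redirect_load << rate_tokens jiffies, i.e. gaps doubling from
 * 40 ms up to ~5.1 s; after ip_rt_redirect_number (9) ignored redirects
 * we go silent until ip_rt_redirect_silence (~20.5 s) has elapsed.
 */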
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

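        /* Token bucket on the inet_peer: one token accrues per jiffy up
         * to ip_rt_error_burst (5 * HZ), and each ICMP error costs
         * ip_rt_error_cost (HZ) - on average at most one ICMP error per
         * second per peer, with bursts of up to five.
         */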
        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

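/*
 * Apply a learned PMTU to @rt and record it in a nexthop exception.
 * A route carrying no cached PMTU yet is marked DST_OBSOLETE_KILL so
 * the next dst_check() forces callers to relookup and pick up the
 * exception created below; otherwise rt_pmtu is updated in place and
 * the expiry timer armed.
 */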
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (!rt->rt_pmtu) {
                dst->obsolete = DST_OBSOLETE_KILL;
        } else {
                rt->rt_pmtu = mtu;
                dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
        }

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

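/*
 * Like __ipv4_sk_update_pmtu(), but also refreshes the socket's cached
 * dst: if the socket is owned by user context or has no cached route we
 * fall back to a plain lookup; otherwise the (possibly invalidated)
 * cached route is revalidated or replaced under the socket lock.
 */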
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);
        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a
         * route, this is indicated by setting obsolete to
         * DST_OBSOLETE_KILL.
         */
        if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" may be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

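/*
 * Report the route MTU: a still-valid learned PMTU, else the RTAX_MTU
 * metric if set; only when neither exists do we fall back to the device
 * MTU, clamped to 576 when the metric is locked on a gatewayed route
 * and capped at IP_MAX_MTU.
 */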
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        if (mtu > IP_MAX_MTU)
                mtu = IP_MAX_MTU;

        return mtu;
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

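/*
 * Bind @rt as the cached route of a nexthop exception: stale PMTU and
 * gateway data on an expired entry are cleared first, anything still
 * valid is copied into the route, and the previously cached route is
 * freed via RCU.  Returns false if the exception no longer covers
 * @daddr.
 */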
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
                if (orig && rt_is_expired(orig)) {
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                }
                if (fnhe->fnhe_pmtu) {
                        unsigned long expires = fnhe->fnhe_expires;
                        unsigned long diff = expires - jiffies;

                        if (time_before(jiffies, expires)) {
                                rt->rt_pmtu = fnhe->fnhe_pmtu;
                                dst_set_expires(&rt->dst, diff);
                        }
                }
                if (fnhe->fnhe_gw) {
                        rt->rt_flags |= RTCF_REDIRECTED;
                        rt->rt_gateway = fnhe->fnhe_gw;
                        rt->rt_uses_gateway = 1;
                } else if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                rcu_assign_pointer(fnhe->fnhe_rth, rt);
                if (orig)
                        rt_free(orig);

                fnhe->fnhe_stamp = jiffies;
                ret = true;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

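/*
 * Publish @rt in the nexthop cache (nh_rth_input for input routes, the
 * per-CPU output slot otherwise) with a single cmpxchg().  On success
 * the displaced route is RCU-freed; if we lose the race we return false
 * and the caller falls back to DST_NOCACHE.
 */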
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      Per RFC 1812: if the source is martian,
                 *      the only hint we can log is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag = 0;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy arp. DNAT routes are always valid.
                 *
                 * The proxy arp feature has been extended to allow ARP
                 * replies back out the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        err = -EINVAL;
                        goto cleanup;
                }
        }

        if (do_cache) {
                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
                }
        }

        rth = rt_dst_alloc(out_dev->dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }

        rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
        rth->rt_flags = flags;
        rth->rt_type = res->type;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        RT_CACHE_STAT_INC(in_slow_tot);

        rth->dst.input = ip_forward;
        rth->dst.output = ip_output;

        rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
        skb_dst_set(skb, &rth->dst);
out:
        err = 0;
 cleanup:
        return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
                            struct fib_result *res,
                            const struct flowi4 *fl4,
                            struct in_device *in_dev,
                            __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi && res->fi->fib_nhs > 1)
                fib_select_multipath(res);
#endif

        /* create a routing cache entry */
        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

1598 /*
1599  *      NOTE. We drop all packets that have a local source
1600  *      address, because every properly looped-back packet
1601  *      must already have the correct destination attached by the output routine.
1602  *
1603  *      This approach solves two big problems:
1604  *      1. Non-simplex devices are handled properly.
1605  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1606  *      Called with rcu_read_lock().
1607  */
1608
1609 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1610                                u8 tos, struct net_device *dev)
1611 {
1612         struct fib_result res;
1613         struct in_device *in_dev = __in_dev_get_rcu(dev);
1614         struct flowi4   fl4;
1615         unsigned int    flags = 0;
1616         u32             itag = 0;
1617         struct rtable   *rth;
1618         int             err = -EINVAL;
1619         struct net    *net = dev_net(dev);
1620         bool do_cache;
1621
1622         /* IP on this device is disabled. */
1623
1624         if (!in_dev)
1625                 goto out;
1626
1627         /* Check for the weirdest martians, which cannot be detected
1628            by fib_lookup.
1629          */
1630
1631         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1632                 goto martian_source;
1633
1634         res.fi = NULL;
1635         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1636                 goto brd_input;
1637
1638         /* Accept zero addresses only for limited broadcast;
1639          * I do not even know whether to fix this or not. Waiting for complaints :-)
1640          */
1641         if (ipv4_is_zeronet(saddr))
1642                 goto martian_source;
1643
1644         if (ipv4_is_zeronet(daddr))
1645                 goto martian_destination;
1646
1647         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1648          * and calls it at most once, when daddr and/or saddr is a loopback address.
1649          */
1650         if (ipv4_is_loopback(daddr)) {
1651                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1652                         goto martian_destination;
1653         } else if (ipv4_is_loopback(saddr)) {
1654                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1655                         goto martian_source;
1656         }
1657
1658         /*
1659          *      Now we are ready to route packet.
1660          */
1661         fl4.flowi4_oif = 0;
1662         fl4.flowi4_iif = dev->ifindex;
1663         fl4.flowi4_mark = skb->mark;
1664         fl4.flowi4_tos = tos;
1665         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1666         fl4.daddr = daddr;
1667         fl4.saddr = saddr;
1668         err = fib_lookup(net, &fl4, &res);
1669         if (err != 0)
1670                 goto no_route;
1671
1672         if (res.type == RTN_BROADCAST)
1673                 goto brd_input;
1674
1675         if (res.type == RTN_LOCAL) {
1676                 err = fib_validate_source(skb, saddr, daddr, tos,
1677                                           LOOPBACK_IFINDEX,
1678                                           dev, in_dev, &itag);
1679                 if (err < 0)
1680                         goto martian_source_keep_err;
1681                 goto local_input;
1682         }
1683
1684         if (!IN_DEV_FORWARD(in_dev))
1685                 goto no_route;
1686         if (res.type != RTN_UNICAST)
1687                 goto martian_destination;
1688
1689         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1690 out:    return err;
1691
1692 brd_input:
1693         if (skb->protocol != htons(ETH_P_IP))
1694                 goto e_inval;
1695
1696         if (!ipv4_is_zeronet(saddr)) {
1697                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1698                                           in_dev, &itag);
1699                 if (err < 0)
1700                         goto martian_source_keep_err;
1701         }
1702         flags |= RTCF_BROADCAST;
1703         res.type = RTN_BROADCAST;
1704         RT_CACHE_STAT_INC(in_brd);
1705
1706 local_input:
1707         do_cache = false;
1708         if (res.fi) {
1709                 if (!itag) {
1710                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1711                         if (rt_cache_valid(rth)) {
1712                                 skb_dst_set_noref(skb, &rth->dst);
1713                                 err = 0;
1714                                 goto out;
1715                         }
1716                         do_cache = true;
1717                 }
1718         }
1719
1720         rth = rt_dst_alloc(net->loopback_dev,
1721                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1722         if (!rth)
1723                 goto e_nobufs;
1724
1725         rth->dst.input  = ip_local_deliver;
1726         rth->dst.output = ip_rt_bug;
1727 #ifdef CONFIG_IP_ROUTE_CLASSID
1728         rth->dst.tclassid = itag;
1729 #endif
1730
1731         rth->rt_genid = rt_genid(net);
1732         rth->rt_flags   = flags|RTCF_LOCAL;
1733         rth->rt_type    = res.type;
1734         rth->rt_is_input = 1;
1735         rth->rt_iif     = 0;
1736         rth->rt_pmtu    = 0;
1737         rth->rt_gateway = 0;
1738         rth->rt_uses_gateway = 0;
1739         INIT_LIST_HEAD(&rth->rt_uncached);
1740         RT_CACHE_STAT_INC(in_slow_tot);
1741         if (res.type == RTN_UNREACHABLE) {
1742                 rth->dst.input = ip_error;
1743                 rth->dst.error = -err;
1744                 rth->rt_flags   &= ~RTCF_LOCAL;
1745         }
1746         if (do_cache) {
1747                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1748                         rth->dst.flags |= DST_NOCACHE;
1749                         rt_add_uncached_list(rth);
1750                 }
1751         }
1752         skb_dst_set(skb, &rth->dst);
1753         err = 0;
1754         goto out;
1755
1756 no_route:
1757         RT_CACHE_STAT_INC(in_no_route);
1758         res.type = RTN_UNREACHABLE;
1759         if (err == -ESRCH)
1760                 err = -ENETUNREACH;
1761         goto local_input;
1762
1763         /*
1764          *      Do not cache martian addresses: they should be logged (RFC1812)
1765          */
1766 martian_destination:
1767         RT_CACHE_STAT_INC(in_martian_dst);
1768 #ifdef CONFIG_IP_ROUTE_VERBOSE
1769         if (IN_DEV_LOG_MARTIANS(in_dev))
1770                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1771                                      &daddr, &saddr, dev->name);
1772 #endif
1773
1774 e_inval:
1775         err = -EINVAL;
1776         goto out;
1777
1778 e_nobufs:
1779         err = -ENOBUFS;
1780         goto out;
1781
1782 martian_source:
1783         err = -EINVAL;
1784 martian_source_keep_err:
1785         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1786         goto out;
1787 }
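
/* Editor's note (summary added for clarity, not in the original file):
 * outcomes of the slow-path classification above:
 *
 *	RTN_BROADCAST   -> brd_input -> local_input, dst.input = ip_local_deliver
 *	RTN_LOCAL       -> local_input,              dst.input = ip_local_deliver
 *	RTN_UNICAST     -> ip_mkroute_input(),       dst.input = ip_forward
 *	RTN_UNREACHABLE -> no_route -> local_input,  dst.input = ip_error
 *	martians        -> logged and rejected, never cached (RFC1812)
 */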
1788
1789 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1790                          u8 tos, struct net_device *dev)
1791 {
1792         int res;
1793
1794         rcu_read_lock();
1795
1796         /* Multicast recognition logic was moved from the route cache to here.
1797            The problem was that too many Ethernet cards have broken/missing
1798            hardware multicast filters :-( As a result, a host on a multicast
1799            network acquires a lot of useless route cache entries, e.g. from
1800            SDR messages from all over the world. Now we try to get rid of them.
1801            Really, provided the software IP multicast filter is organized
1802            reasonably (at least, hashed), it does not result in a slowdown
1803            compared with route cache reject entries.
1804            Note that multicast routers are not affected, because
1805            a route cache entry is created eventually.
1806          */
1807         if (ipv4_is_multicast(daddr)) {
1808                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1809
1810                 if (in_dev) {
1811                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1812                                                   ip_hdr(skb)->protocol);
1813                         if (our
1814 #ifdef CONFIG_IP_MROUTE
1815                                 ||
1816                             (!ipv4_is_local_multicast(daddr) &&
1817                              IN_DEV_MFORWARD(in_dev))
1818 #endif
1819                            ) {
1820                                 int res = ip_route_input_mc(skb, daddr, saddr,
1821                                                             tos, dev, our);
1822                                 rcu_read_unlock();
1823                                 return res;
1824                         }
1825                 }
1826                 rcu_read_unlock();
1827                 return -EINVAL;
1828         }
1829         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1830         rcu_read_unlock();
1831         return res;
1832 }
1833 EXPORT_SYMBOL(ip_route_input_noref);
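
/* Editor's note: a minimal usage sketch (hypothetical, not part of this
 * file) of the exported helper above. demo_rx_route() mirrors the way the
 * receive path (ip_rcv_finish()) resolves an input route; on success the
 * dst is attached to the skb without an extra reference, so it must only
 * be used inside the RCU-protected receive path.
 */
#if 0	/* illustrative only */
static int demo_rx_route(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* Fills skb_dst(skb) on success, returns a negative errno on failure */
	return ip_route_input_noref(skb, iph->daddr, iph->saddr,
				    iph->tos, dev);
}
#endif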
1834
1835 /* called with rcu_read_lock() */
1836 static struct rtable *__mkroute_output(const struct fib_result *res,
1837                                        const struct flowi4 *fl4, int orig_oif,
1838                                        struct net_device *dev_out,
1839                                        unsigned int flags)
1840 {
1841         struct fib_info *fi = res->fi;
1842         struct fib_nh_exception *fnhe;
1843         struct in_device *in_dev;
1844         u16 type = res->type;
1845         struct rtable *rth;
1846         bool do_cache;
1847
1848         in_dev = __in_dev_get_rcu(dev_out);
1849         if (!in_dev)
1850                 return ERR_PTR(-EINVAL);
1851
1852         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1853                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1854                         return ERR_PTR(-EINVAL);
1855
1856         if (ipv4_is_lbcast(fl4->daddr))
1857                 type = RTN_BROADCAST;
1858         else if (ipv4_is_multicast(fl4->daddr))
1859                 type = RTN_MULTICAST;
1860         else if (ipv4_is_zeronet(fl4->daddr))
1861                 return ERR_PTR(-EINVAL);
1862
1863         if (dev_out->flags & IFF_LOOPBACK)
1864                 flags |= RTCF_LOCAL;
1865
1866         do_cache = true;
1867         if (type == RTN_BROADCAST) {
1868                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1869                 fi = NULL;
1870         } else if (type == RTN_MULTICAST) {
1871                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1872                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1873                                      fl4->flowi4_proto))
1874                         flags &= ~RTCF_LOCAL;
1875                 else
1876                         do_cache = false;
1877                 /* If a multicast route does not exist, use
1878                  * the default one, but do not gateway in this case.
1879                  * Yes, it is a hack.
1880                  */
1881                 if (fi && res->prefixlen < 4)
1882                         fi = NULL;
1883         }
1884
1885         fnhe = NULL;
1886         do_cache &= fi != NULL;
1887         if (do_cache) {
1888                 struct rtable __rcu **prth;
1889                 struct fib_nh *nh = &FIB_RES_NH(*res);
1890
1891                 fnhe = find_exception(nh, fl4->daddr);
1892                 if (fnhe)
1893                         prth = &fnhe->fnhe_rth;
1894                 else {
1895                         if (unlikely(fl4->flowi4_flags &
1896                                      FLOWI_FLAG_KNOWN_NH &&
1897                                      !(nh->nh_gw &&
1898                                        nh->nh_scope == RT_SCOPE_LINK))) {
1899                                 do_cache = false;
1900                                 goto add;
1901                         }
1902                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1903                 }
1904                 rth = rcu_dereference(*prth);
1905                 if (rt_cache_valid(rth)) {
1906                         dst_hold(&rth->dst);
1907                         return rth;
1908                 }
1909         }
1910
1911 add:
1912         rth = rt_dst_alloc(dev_out,
1913                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1914                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1915                            do_cache);
1916         if (!rth)
1917                 return ERR_PTR(-ENOBUFS);
1918
1919         rth->dst.output = ip_output;
1920
1921         rth->rt_genid = rt_genid(dev_net(dev_out));
1922         rth->rt_flags   = flags;
1923         rth->rt_type    = type;
1924         rth->rt_is_input = 0;
1925         rth->rt_iif     = orig_oif ? : 0;
1926         rth->rt_pmtu    = 0;
1927         rth->rt_gateway = 0;
1928         rth->rt_uses_gateway = 0;
1929         INIT_LIST_HEAD(&rth->rt_uncached);
1930
1931         RT_CACHE_STAT_INC(out_slow_tot);
1932
1933         if (flags & RTCF_LOCAL)
1934                 rth->dst.input = ip_local_deliver;
1935         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1936                 if (flags & RTCF_LOCAL &&
1937                     !(dev_out->flags & IFF_LOOPBACK)) {
1938                         rth->dst.output = ip_mc_output;
1939                         RT_CACHE_STAT_INC(out_slow_mc);
1940                 }
1941 #ifdef CONFIG_IP_MROUTE
1942                 if (type == RTN_MULTICAST) {
1943                         if (IN_DEV_MFORWARD(in_dev) &&
1944                             !ipv4_is_local_multicast(fl4->daddr)) {
1945                                 rth->dst.input = ip_mr_input;
1946                                 rth->dst.output = ip_mc_output;
1947                         }
1948                 }
1949 #endif
1950         }
1951
1952         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1953
1954         return rth;
1955 }
1956
1957 /*
1958  * Major route resolver routine.
1959  */
1960
1961 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1962 {
1963         struct net_device *dev_out = NULL;
1964         __u8 tos = RT_FL_TOS(fl4);
1965         unsigned int flags = 0;
1966         struct fib_result res;
1967         struct rtable *rth;
1968         int orig_oif;
1969
1970         res.tclassid    = 0;
1971         res.fi          = NULL;
1972         res.table       = NULL;
1973
1974         orig_oif = fl4->flowi4_oif;
1975
1976         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1977         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1978         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1979                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1980
1981         rcu_read_lock();
1982         if (fl4->saddr) {
1983                 rth = ERR_PTR(-EINVAL);
1984                 if (ipv4_is_multicast(fl4->saddr) ||
1985                     ipv4_is_lbcast(fl4->saddr) ||
1986                     ipv4_is_zeronet(fl4->saddr))
1987                         goto out;
1988
1989                 /* I removed the check for oif == dev_out->oif here.
1990                    It was wrong for two reasons:
1991                    1. ip_dev_find(net, saddr) can return the wrong iface if
1992                       saddr is assigned to multiple interfaces.
1993                    2. Moreover, we are allowed to send packets with the saddr
1994                       of another iface. --ANK
1995                  */
1996
1997                 if (fl4->flowi4_oif == 0 &&
1998                     (ipv4_is_multicast(fl4->daddr) ||
1999                      ipv4_is_lbcast(fl4->daddr))) {
2000                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2001                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2002                         if (dev_out == NULL)
2003                                 goto out;
2004
2005                         /* Special hack: the user can direct multicasts
2006                            and limited broadcast via the necessary interface
2007                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2008                            This hack is not just for fun; it allows
2009                            vic, vat and friends to work.
2010                            They bind a socket to loopback, set ttl to zero
2011                            and expect that it will work.
2012                            From the viewpoint of the routing cache they are broken,
2013                            because we are not allowed to build a multicast path
2014                            with a loopback source addr (look, the routing cache
2015                            cannot know that ttl is zero, so the packet
2016                            will not leave this host and the route is valid).
2017                            Luckily, this hack is a good workaround.
2018                          */
2019
2020                         fl4->flowi4_oif = dev_out->ifindex;
2021                         goto make_route;
2022                 }
2023
2024                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2025                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2026                         if (!__ip_dev_find(net, fl4->saddr, false))
2027                                 goto out;
2028                 }
2029         }
2030
2031
2032         if (fl4->flowi4_oif) {
2033                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2034                 rth = ERR_PTR(-ENODEV);
2035                 if (dev_out == NULL)
2036                         goto out;
2037
2038                 /* RACE: Check return value of inet_select_addr instead. */
2039                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2040                         rth = ERR_PTR(-ENETUNREACH);
2041                         goto out;
2042                 }
2043                 if (ipv4_is_local_multicast(fl4->daddr) ||
2044                     ipv4_is_lbcast(fl4->daddr)) {
2045                         if (!fl4->saddr)
2046                                 fl4->saddr = inet_select_addr(dev_out, 0,
2047                                                               RT_SCOPE_LINK);
2048                         goto make_route;
2049                 }
2050                 if (!fl4->saddr) {
2051                         if (ipv4_is_multicast(fl4->daddr))
2052                                 fl4->saddr = inet_select_addr(dev_out, 0,
2053                                                               fl4->flowi4_scope);
2054                         else if (!fl4->daddr)
2055                                 fl4->saddr = inet_select_addr(dev_out, 0,
2056                                                               RT_SCOPE_HOST);
2057                 }
2058         }
2059
2060         if (!fl4->daddr) {
2061                 fl4->daddr = fl4->saddr;
2062                 if (!fl4->daddr)
2063                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2064                 dev_out = net->loopback_dev;
2065                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2066                 res.type = RTN_LOCAL;
2067                 flags |= RTCF_LOCAL;
2068                 goto make_route;
2069         }
2070
2071         if (fib_lookup(net, fl4, &res)) {
2072                 res.fi = NULL;
2073                 res.table = NULL;
2074                 if (fl4->flowi4_oif) {
2075                         /* Apparently, the routing tables are wrong. Assume
2076                            that the destination is on-link.
2077
2078                            WHY? DW.
2079                            Because we are allowed to send to an iface
2080                            even if it has NO routes and NO assigned
2081                            addresses. When oif is specified, the routing
2082                            tables are looked up with only one purpose:
2083                            to catch whether the destination is gatewayed
2084                            rather than direct. Moreover, if MSG_DONTROUTE is
2085                            set, we send the packet, ignoring both the routing
2086                            tables and the ifaddr state. --ANK
2087
2088
2089                            We could do this even when oif is unknown
2090                            (as IPv6 likely does), but we do not.
2091                          */
2092
2093                         if (fl4->saddr == 0)
2094                                 fl4->saddr = inet_select_addr(dev_out, 0,
2095                                                               RT_SCOPE_LINK);
2096                         res.type = RTN_UNICAST;
2097                         goto make_route;
2098                 }
2099                 rth = ERR_PTR(-ENETUNREACH);
2100                 goto out;
2101         }
2102
2103         if (res.type == RTN_LOCAL) {
2104                 if (!fl4->saddr) {
2105                         if (res.fi->fib_prefsrc)
2106                                 fl4->saddr = res.fi->fib_prefsrc;
2107                         else
2108                                 fl4->saddr = fl4->daddr;
2109                 }
2110                 dev_out = net->loopback_dev;
2111                 fl4->flowi4_oif = dev_out->ifindex;
2112                 flags |= RTCF_LOCAL;
2113                 goto make_route;
2114         }
2115
2116 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2117         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2118                 fib_select_multipath(&res);
2119         else
2120 #endif
2121         if (!res.prefixlen &&
2122             res.table->tb_num_default > 1 &&
2123             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2124                 fib_select_default(&res);
2125
2126         if (!fl4->saddr)
2127                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2128
2129         dev_out = FIB_RES_DEV(res);
2130         fl4->flowi4_oif = dev_out->ifindex;
2131
2132
2133 make_route:
2134         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2135
2136 out:
2137         rcu_read_unlock();
2138         return rth;
2139 }
2140 EXPORT_SYMBOL_GPL(__ip_route_output_key);
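
/* Editor's note: a minimal sketch (hypothetical) of driving the resolver
 * above. ip_route_output_key() is assumed to be the usual inline wrapper
 * around __ip_route_output_key(); only the destination is filled in, and
 * the resolver completes fl4 (saddr, oif) as a side effect.
 */
#if 0	/* illustrative only */
static int demo_output_route(struct net *net, __be32 daddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;

	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* fl4.saddr and fl4.flowi4_oif now describe the chosen route */
	ip_rt_put(rt);
	return 0;
}
#endif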
2141
2142 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2143 {
2144         return NULL;
2145 }
2146
2147 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2148 {
2149         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2150
2151         return mtu ? : dst->dev->mtu;
2152 }
2153
2154 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2155                                           struct sk_buff *skb, u32 mtu)
2156 {
2157 }
2158
2159 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2160                                        struct sk_buff *skb)
2161 {
2162 }
2163
2164 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2165                                           unsigned long old)
2166 {
2167         return NULL;
2168 }
2169
2170 static struct dst_ops ipv4_dst_blackhole_ops = {
2171         .family                 =       AF_INET,
2172         .protocol               =       cpu_to_be16(ETH_P_IP),
2173         .check                  =       ipv4_blackhole_dst_check,
2174         .mtu                    =       ipv4_blackhole_mtu,
2175         .default_advmss         =       ipv4_default_advmss,
2176         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2177         .redirect               =       ipv4_rt_blackhole_redirect,
2178         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2179         .neigh_lookup           =       ipv4_neigh_lookup,
2180 };
2181
2182 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2183 {
2184         struct rtable *ort = (struct rtable *) dst_orig;
2185         struct rtable *rt;
2186
2187         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2188         if (rt) {
2189                 struct dst_entry *new = &rt->dst;
2190
2191                 new->__use = 1;
2192                 new->input = dst_discard;
2193                 new->output = dst_discard;
2194
2195                 new->dev = ort->dst.dev;
2196                 if (new->dev)
2197                         dev_hold(new->dev);
2198
2199                 rt->rt_is_input = ort->rt_is_input;
2200                 rt->rt_iif = ort->rt_iif;
2201                 rt->rt_pmtu = ort->rt_pmtu;
2202
2203                 rt->rt_genid = rt_genid(net);
2204                 rt->rt_flags = ort->rt_flags;
2205                 rt->rt_type = ort->rt_type;
2206                 rt->rt_gateway = ort->rt_gateway;
2207                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2208
2209                 INIT_LIST_HEAD(&rt->rt_uncached);
2210
2211                 dst_free(new);
2212         }
2213
2214         dst_release(dst_orig);
2215
2216         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2217 }
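
/* Editor's note: the blackhole copy built above discards traffic in both
 * directions (input and output point at dst_discard), and its ->check op,
 * ipv4_blackhole_dst_check(), always returns NULL, so a socket caching
 * this dst re-resolves its route on the next dst check. It is used
 * notably by the xfrm layer to stall flows it cannot transmit yet.
 */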
2218
2219 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2220                                     struct sock *sk)
2221 {
2222         struct rtable *rt = __ip_route_output_key(net, flp4);
2223
2224         if (IS_ERR(rt))
2225                 return rt;
2226
2227         if (flp4->flowi4_proto)
2228                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2229                                                    flowi4_to_flowi(flp4),
2230                                                    sk, 0);
2231
2232         return rt;
2233 }
2234 EXPORT_SYMBOL_GPL(ip_route_output_flow);
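
/* Editor's note: an illustrative sketch (hypothetical) of a socket-context
 * lookup through ip_route_output_flow(). A non-zero flowi4_proto is what
 * sends the result through xfrm_lookup() above, so IPsec policy is
 * consulted for the flow.
 */
#if 0	/* illustrative only */
static struct rtable *demo_connect_route(struct net *net, struct sock *sk,
					 __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_proto = IPPROTO_UDP;	/* non-zero: consult xfrm policy */

	/* May return an ERR_PTR(); the caller must check with IS_ERR() */
	return ip_route_output_flow(net, &fl4, sk);
}
#endif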
2235
2236 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2237                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2238                         u32 seq, int event, int nowait, unsigned int flags)
2239 {
2240         struct rtable *rt = skb_rtable(skb);
2241         struct rtmsg *r;
2242         struct nlmsghdr *nlh;
2243         unsigned long expires = 0;
2244         u32 error;
2245         u32 metrics[RTAX_MAX];
2246
2247         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2248         if (nlh == NULL)
2249                 return -EMSGSIZE;
2250
2251         r = nlmsg_data(nlh);
2252         r->rtm_family    = AF_INET;
2253         r->rtm_dst_len  = 32;
2254         r->rtm_src_len  = 0;
2255         r->rtm_tos      = fl4->flowi4_tos;
2256         r->rtm_table    = RT_TABLE_MAIN;
2257         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2258                 goto nla_put_failure;
2259         r->rtm_type     = rt->rt_type;
2260         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2261         r->rtm_protocol = RTPROT_UNSPEC;
2262         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2263         if (rt->rt_flags & RTCF_NOTIFY)
2264                 r->rtm_flags |= RTM_F_NOTIFY;
2265
2266         if (nla_put_be32(skb, RTA_DST, dst))
2267                 goto nla_put_failure;
2268         if (src) {
2269                 r->rtm_src_len = 32;
2270                 if (nla_put_be32(skb, RTA_SRC, src))
2271                         goto nla_put_failure;
2272         }
2273         if (rt->dst.dev &&
2274             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2275                 goto nla_put_failure;
2276 #ifdef CONFIG_IP_ROUTE_CLASSID
2277         if (rt->dst.tclassid &&
2278             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2279                 goto nla_put_failure;
2280 #endif
2281         if (!rt_is_input_route(rt) &&
2282             fl4->saddr != src) {
2283                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2284                         goto nla_put_failure;
2285         }
2286         if (rt->rt_uses_gateway &&
2287             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2288                 goto nla_put_failure;
2289
2290         expires = rt->dst.expires;
2291         if (expires) {
2292                 unsigned long now = jiffies;
2293
2294                 if (time_before(now, expires))
2295                         expires -= now;
2296                 else
2297                         expires = 0;
2298         }
2299
2300         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2301         if (rt->rt_pmtu && expires)
2302                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2303         if (rtnetlink_put_metrics(skb, metrics) < 0)
2304                 goto nla_put_failure;
2305
2306         if (fl4->flowi4_mark &&
2307             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2308                 goto nla_put_failure;
2309
2310         error = rt->dst.error;
2311
2312         if (rt_is_input_route(rt)) {
2313 #ifdef CONFIG_IP_MROUTE
2314                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2315                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2316                         int err = ipmr_get_route(net, skb,
2317                                                  fl4->saddr, fl4->daddr,
2318                                                  r, nowait);
2319                         if (err <= 0) {
2320                                 if (!nowait) {
2321                                         if (err == 0)
2322                                                 return 0;
2323                                         goto nla_put_failure;
2324                                 } else {
2325                                         if (err == -EMSGSIZE)
2326                                                 goto nla_put_failure;
2327                                         error = err;
2328                                 }
2329                         }
2330                 } else
2331 #endif
2332                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2333                                 goto nla_put_failure;
2334         }
2335
2336         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2337                 goto nla_put_failure;
2338
2339         return nlmsg_end(skb, nlh);
2340
2341 nla_put_failure:
2342         nlmsg_cancel(skb, nlh);
2343         return -EMSGSIZE;
2344 }
2345
2346 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2347 {
2348         struct net *net = sock_net(in_skb->sk);
2349         struct rtmsg *rtm;
2350         struct nlattr *tb[RTA_MAX+1];
2351         struct rtable *rt = NULL;
2352         struct flowi4 fl4;
2353         __be32 dst = 0;
2354         __be32 src = 0;
2355         u32 iif;
2356         int err;
2357         int mark;
2358         struct sk_buff *skb;
2359
2360         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2361         if (err < 0)
2362                 goto errout;
2363
2364         rtm = nlmsg_data(nlh);
2365
2366         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2367         if (skb == NULL) {
2368                 err = -ENOBUFS;
2369                 goto errout;
2370         }
2371
2372         /* Reserve room for dummy headers; this skb can pass
2373            through a good chunk of the routing engine.
2374          */
2375         skb_reset_mac_header(skb);
2376         skb_reset_network_header(skb);
2377
2378         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2379         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2380         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2381
2382         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2383         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2384         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2385         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2386
2387         memset(&fl4, 0, sizeof(fl4));
2388         fl4.daddr = dst;
2389         fl4.saddr = src;
2390         fl4.flowi4_tos = rtm->rtm_tos;
2391         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2392         fl4.flowi4_mark = mark;
2393
2394         if (iif) {
2395                 struct net_device *dev;
2396
2397                 dev = __dev_get_by_index(net, iif);
2398                 if (dev == NULL) {
2399                         err = -ENODEV;
2400                         goto errout_free;
2401                 }
2402
2403                 skb->protocol   = htons(ETH_P_IP);
2404                 skb->dev        = dev;
2405                 skb->mark       = mark;
2406                 local_bh_disable();
2407                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2408                 local_bh_enable();
2409
2410                 rt = skb_rtable(skb);
2411                 if (err == 0 && rt->dst.error)
2412                         err = -rt->dst.error;
2413         } else {
2414                 rt = ip_route_output_key(net, &fl4);
2415
2416                 err = 0;
2417                 if (IS_ERR(rt))
2418                         err = PTR_ERR(rt);
2419         }
2420
2421         if (err)
2422                 goto errout_free;
2423
2424         skb_dst_set(skb, &rt->dst);
2425         if (rtm->rtm_flags & RTM_F_NOTIFY)
2426                 rt->rt_flags |= RTCF_NOTIFY;
2427
2428         err = rt_fill_info(net, dst, src, &fl4, skb,
2429                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2430                            RTM_NEWROUTE, 0, 0);
2431         if (err <= 0)
2432                 goto errout_free;
2433
2434         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2435 errout:
2436         return err;
2437
2438 errout_free:
2439         kfree_skb(skb);
2440         goto errout;
2441 }
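
/* Editor's note: this handler is the kernel side of, e.g.,
 *
 *	$ ip route get 192.0.2.1
 *
 * iproute2 sends an RTM_GETROUTE request (with RTA_IIF to exercise the
 * input path instead of the output path) and receives the RTM_NEWROUTE
 * reply built by rt_fill_info() above.
 */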
2442
2443 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2444 {
2445         return skb->len;
2446 }
2447
2448 void ip_rt_multicast_event(struct in_device *in_dev)
2449 {
2450         rt_cache_flush(dev_net(in_dev->dev));
2451 }
2452
2453 #ifdef CONFIG_SYSCTL
2454 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2455 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
2456 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2457 static int ip_rt_gc_elasticity __read_mostly    = 8;
2458
2459 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2460                                         void __user *buffer,
2461                                         size_t *lenp, loff_t *ppos)
2462 {
2463         if (write) {
2464                 rt_cache_flush((struct net *)__ctl->extra1);
2465                 return 0;
2466         }
2467
2468         return -EINVAL;
2469 }
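
/* Editor's note: the handler above is write-only (mode 0200 below); a
 * flush is requested from userspace with, e.g.,
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * Any write triggers rt_cache_flush() for the namespace; the value
 * written is ignored.
 */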
2470
2471 static ctl_table ipv4_route_table[] = {
2472         {
2473                 .procname       = "gc_thresh",
2474                 .data           = &ipv4_dst_ops.gc_thresh,
2475                 .maxlen         = sizeof(int),
2476                 .mode           = 0644,
2477                 .proc_handler   = proc_dointvec,
2478         },
2479         {
2480                 .procname       = "max_size",
2481                 .data           = &ip_rt_max_size,
2482                 .maxlen         = sizeof(int),
2483                 .mode           = 0644,
2484                 .proc_handler   = proc_dointvec,
2485         },
2486         {
2487                 /* Deprecated. Use gc_min_interval_ms */
2488
2489                 .procname       = "gc_min_interval",
2490                 .data           = &ip_rt_gc_min_interval,
2491                 .maxlen         = sizeof(int),
2492                 .mode           = 0644,
2493                 .proc_handler   = proc_dointvec_jiffies,
2494         },
2495         {
2496                 .procname       = "gc_min_interval_ms",
2497                 .data           = &ip_rt_gc_min_interval,
2498                 .maxlen         = sizeof(int),
2499                 .mode           = 0644,
2500                 .proc_handler   = proc_dointvec_ms_jiffies,
2501         },
2502         {
2503                 .procname       = "gc_timeout",
2504                 .data           = &ip_rt_gc_timeout,
2505                 .maxlen         = sizeof(int),
2506                 .mode           = 0644,
2507                 .proc_handler   = proc_dointvec_jiffies,
2508         },
2509         {
2510                 .procname       = "gc_interval",
2511                 .data           = &ip_rt_gc_interval,
2512                 .maxlen         = sizeof(int),
2513                 .mode           = 0644,
2514                 .proc_handler   = proc_dointvec_jiffies,
2515         },
2516         {
2517                 .procname       = "redirect_load",
2518                 .data           = &ip_rt_redirect_load,
2519                 .maxlen         = sizeof(int),
2520                 .mode           = 0644,
2521                 .proc_handler   = proc_dointvec,
2522         },
2523         {
2524                 .procname       = "redirect_number",
2525                 .data           = &ip_rt_redirect_number,
2526                 .maxlen         = sizeof(int),
2527                 .mode           = 0644,
2528                 .proc_handler   = proc_dointvec,
2529         },
2530         {
2531                 .procname       = "redirect_silence",
2532                 .data           = &ip_rt_redirect_silence,
2533                 .maxlen         = sizeof(int),
2534                 .mode           = 0644,
2535                 .proc_handler   = proc_dointvec,
2536         },
2537         {
2538                 .procname       = "error_cost",
2539                 .data           = &ip_rt_error_cost,
2540                 .maxlen         = sizeof(int),
2541                 .mode           = 0644,
2542                 .proc_handler   = proc_dointvec,
2543         },
2544         {
2545                 .procname       = "error_burst",
2546                 .data           = &ip_rt_error_burst,
2547                 .maxlen         = sizeof(int),
2548                 .mode           = 0644,
2549                 .proc_handler   = proc_dointvec,
2550         },
2551         {
2552                 .procname       = "gc_elasticity",
2553                 .data           = &ip_rt_gc_elasticity,
2554                 .maxlen         = sizeof(int),
2555                 .mode           = 0644,
2556                 .proc_handler   = proc_dointvec,
2557         },
2558         {
2559                 .procname       = "mtu_expires",
2560                 .data           = &ip_rt_mtu_expires,
2561                 .maxlen         = sizeof(int),
2562                 .mode           = 0644,
2563                 .proc_handler   = proc_dointvec_jiffies,
2564         },
2565         {
2566                 .procname       = "min_pmtu",
2567                 .data           = &ip_rt_min_pmtu,
2568                 .maxlen         = sizeof(int),
2569                 .mode           = 0644,
2570                 .proc_handler   = proc_dointvec,
2571         },
2572         {
2573                 .procname       = "min_adv_mss",
2574                 .data           = &ip_rt_min_advmss,
2575                 .maxlen         = sizeof(int),
2576                 .mode           = 0644,
2577                 .proc_handler   = proc_dointvec,
2578         },
2579         { }
2580 };
2581
2582 static struct ctl_table ipv4_route_flush_table[] = {
2583         {
2584                 .procname       = "flush",
2585                 .maxlen         = sizeof(int),
2586                 .mode           = 0200,
2587                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2588         },
2589         { },
2590 };
2591
2592 static __net_init int sysctl_route_net_init(struct net *net)
2593 {
2594         struct ctl_table *tbl;
2595
2596         tbl = ipv4_route_flush_table;
2597         if (!net_eq(net, &init_net)) {
2598                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2599                 if (tbl == NULL)
2600                         goto err_dup;
2601
2602                 /* Don't export sysctls to unprivileged users */
2603                 if (net->user_ns != &init_user_ns)
2604                         tbl[0].procname = NULL;
2605         }
2606         tbl[0].extra1 = net;
2607
2608         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2609         if (net->ipv4.route_hdr == NULL)
2610                 goto err_reg;
2611         return 0;
2612
2613 err_reg:
2614         if (tbl != ipv4_route_flush_table)
2615                 kfree(tbl);
2616 err_dup:
2617         return -ENOMEM;
2618 }
2619
2620 static __net_exit void sysctl_route_net_exit(struct net *net)
2621 {
2622         struct ctl_table *tbl;
2623
2624         tbl = net->ipv4.route_hdr->ctl_table_arg;
2625         unregister_net_sysctl_table(net->ipv4.route_hdr);
2626         BUG_ON(tbl == ipv4_route_flush_table);
2627         kfree(tbl);
2628 }
2629
2630 static __net_initdata struct pernet_operations sysctl_route_ops = {
2631         .init = sysctl_route_net_init,
2632         .exit = sysctl_route_net_exit,
2633 };
2634 #endif
2635
2636 static __net_init int rt_genid_init(struct net *net)
2637 {
2638         atomic_set(&net->rt_genid, 0);
2639         get_random_bytes(&net->ipv4.dev_addr_genid,
2640                          sizeof(net->ipv4.dev_addr_genid));
2641         return 0;
2642 }
2643
2644 static __net_initdata struct pernet_operations rt_genid_ops = {
2645         .init = rt_genid_init,
2646 };
2647
2648 static int __net_init ipv4_inetpeer_init(struct net *net)
2649 {
2650         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2651
2652         if (!bp)
2653                 return -ENOMEM;
2654         inet_peer_base_init(bp);
2655         net->ipv4.peers = bp;
2656         return 0;
2657 }
2658
2659 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2660 {
2661         struct inet_peer_base *bp = net->ipv4.peers;
2662
2663         net->ipv4.peers = NULL;
2664         inetpeer_invalidate_tree(bp);
2665         kfree(bp);
2666 }
2667
2668 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2669         .init   =       ipv4_inetpeer_init,
2670         .exit   =       ipv4_inetpeer_exit,
2671 };
2672
2673 #ifdef CONFIG_IP_ROUTE_CLASSID
2674 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2675 #endif /* CONFIG_IP_ROUTE_CLASSID */
2676
2677 int __init ip_rt_init(void)
2678 {
2679         int rc = 0;
2680
2681         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2682         if (!ip_idents)
2683                 panic("IP: failed to allocate ip_idents\n");
2684
2685         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2686
2687 #ifdef CONFIG_IP_ROUTE_CLASSID
2688         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2689         if (!ip_rt_acct)
2690                 panic("IP: failed to allocate ip_rt_acct\n");
2691 #endif
2692
2693         ipv4_dst_ops.kmem_cachep =
2694                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2695                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2696
2697         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2698
2699         if (dst_entries_init(&ipv4_dst_ops) < 0)
2700                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2701
2702         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2703                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2704
2705         ipv4_dst_ops.gc_thresh = ~0;
2706         ip_rt_max_size = INT_MAX;
2707
2708         devinet_init();
2709         ip_fib_init();
2710
2711         if (ip_rt_proc_init())
2712                 pr_err("Unable to create route proc files\n");
2713 #ifdef CONFIG_XFRM
2714         xfrm_init();
2715         xfrm4_init();
2716 #endif
2717         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2718
2719 #ifdef CONFIG_SYSCTL
2720         register_pernet_subsys(&sysctl_route_ops);
2721 #endif
2722         register_pernet_subsys(&rt_genid_ops);
2723         register_pernet_subsys(&ipv4_inetpeer_ops);
2724         return rc;
2725 }
2726
2727 #ifdef CONFIG_SYSCTL
2728 /*
2729  * We really need to sanitize the damn ipv4 init order, then all
2730  * this nonsense will go away.
2731  */
2732 void __init ip_static_sysctl_init(void)
2733 {
2734         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2735 }
2736 #endif