/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD,
 *                                      though our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
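
/*
 * Editorial note: ip_tos2prio[] is indexed by the four TOS bits of the
 * IPv4 header, i.e. IPTOS_TOS(tos) >> 1, as rt_tos2priority() in
 * include/net/route.h does. A minimal userspace sketch of the lookup,
 * assuming the table above is compiled in:
 *
 *     #include <stdio.h>
 *
 *     #define IPTOS_TOS_MASK 0x1E
 *     #define IPTOS_TOS(tos) ((tos) & IPTOS_TOS_MASK)
 *
 *     // Mirrors rt_tos2priority(): fold the TOS byte down to a table index.
 *     static char rt_tos2priority(unsigned char tos)
 *     {
 *             return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *     }
 *
 *     int main(void)
 *     {
 *             // IPTOS_LOWDELAY (0x10) lands on index 8, TC_PRIO_INTERACTIVE.
 *             printf("prio(0x10) = %d\n", rt_tos2priority(0x10));
 *             return 0;
 *     }
 */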

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u
struct ip_ident_bucket {
        atomic_t        id;
        u32             stamp32;
};

static struct ip_ident_bucket *ip_idents __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(bucket->stamp32);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
                u64 x = prandom_u32();

                x *= (now - old);
                delta = (u32)(x >> 32);
        }

        return atomic_add_return(segs + delta, &bucket->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        static bool hashrnd_initialized = false;
        u32 hash, id;

        if (unlikely(!hashrnd_initialized)) {
                hashrnd_initialized = true;
                get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
        }

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
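
/*
 * Editorial note: a minimal userspace model of ip_idents_reserve() above,
 * single-threaded for clarity, with rand() standing in for prandom_u32()
 * and a caller-supplied tick standing in for jiffies; illustrative only,
 * not kernel API.
 *
 *     #include <stdint.h>
 *     #include <stdio.h>
 *     #include <stdlib.h>
 *
 *     struct bucket { uint32_t id; uint32_t stamp32; };
 *
 *     // Returns the first of 'segs' reserved IDs. When the bucket has
 *     // been idle, a random delta proportional to the idle time is
 *     // added so an observer cannot count the packets sent in between.
 *     static uint32_t idents_reserve(struct bucket *b, uint32_t now, int segs)
 *     {
 *             uint32_t delta = 0;
 *
 *             if (b->stamp32 != now) {
 *                     uint64_t x = (uint32_t)rand();
 *
 *                     x *= now - b->stamp32;
 *                     delta = (uint32_t)(x >> 32);
 *                     b->stamp32 = now;
 *             }
 *             b->id += segs + delta;
 *             return b->id - segs;
 *     }
 *
 *     int main(void)
 *     {
 *             struct bucket b = { 0, 0 };
 *
 *             printf("first id = %u\n", idents_reserve(&b, 100, 3));
 *             return 0;
 *     }
 */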

static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sk ? sock_i_uid(sk) : 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0,
                           sock_i_uid(sk));
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;
        struct rtable *orig;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        orig = rcu_dereference(oldest->fnhe_rth);
        if (orig) {
                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
                rt_free(orig);
        }
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}
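
/*
 * Editorial note: fnhe_hashfun() folds the destination address onto itself
 * with two shifted XORs and masks to the table size. A hedged userspace
 * sketch (assuming FNHE_HASH_SIZE is a power of two, 2048 in this tree):
 *
 *     #include <stdint.h>
 *     #include <stdio.h>
 *
 *     #define FNHE_HASH_SIZE 2048
 *
 *     // Same fold as above: mix high bits down, then mask to the table.
 *     static uint32_t fnhe_hash(uint32_t daddr)
 *     {
 *             uint32_t hval = daddr;
 *
 *             hval ^= (hval >> 11) ^ (hval >> 22);
 *             return hval & (FNHE_HASH_SIZE - 1);
 *     }
 *
 *     int main(void)
 *     {
 *             printf("bucket = %u\n", fnhe_hash(0xc0a80001));  // 192.168.0.1
 *             return 0;
 *     }
 */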

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = expires;
                }
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
        return;
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set dst.rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}
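
/*
 * Editorial note: with the defaults above (ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_number = 9), the required gap between redirects to one
 * peer is ip_rt_redirect_load << rate_tokens, so it doubles per redirect
 * until the peer is given up on. A small sketch printing that schedule,
 * assuming HZ = 100:
 *
 *     #include <stdio.h>
 *
 *     #define HZ 100
 *
 *     int main(void)
 *     {
 *             int load = HZ / 50, number = 9, t;
 *
 *             // After t redirects, the next one waits load << t jiffies.
 *             for (t = 1; t < number; t++)
 *                     printf("gap before redirect %d: %d jiffies\n",
 *                            t + 1, load << t);
 *             return 0;
 *     }
 */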

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
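
/*
 * Editorial note: ip_error() rate-limits ICMP errors per peer with a token
 * bucket: tokens accrue one per jiffy up to ip_rt_error_burst, and each
 * error sent costs ip_rt_error_cost. A minimal userspace model of that
 * accounting, assuming HZ = 100:
 *
 *     #include <stdbool.h>
 *     #include <stdio.h>
 *
 *     #define HZ 100
 *
 *     struct peer { unsigned long rate_tokens, rate_last; };
 *
 *     // Mirrors the bookkeeping above: refill by elapsed jiffies, clamp
 *     // to the burst, then charge 'cost' if we can afford to send.
 *     static bool may_send(struct peer *p, unsigned long now,
 *                          unsigned long cost, unsigned long burst)
 *     {
 *             p->rate_tokens += now - p->rate_last;
 *             if (p->rate_tokens > burst)
 *                     p->rate_tokens = burst;
 *             p->rate_last = now;
 *             if (p->rate_tokens < cost)
 *                     return false;
 *             p->rate_tokens -= cost;
 *             return true;
 *     }
 *
 *     int main(void)
 *     {
 *             struct peer p = { 0, 0 };
 *
 *             // A full 5*HZ burst pays for five back-to-back errors.
 *             printf("send? %d\n", may_send(&p, 5 * HZ, HZ, 5 * HZ));
 *             return 0;
 *     }
 */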

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (!rt->rt_pmtu) {
                dst->obsolete = DST_OBSOLETE_KILL;
        } else {
                rt->rt_pmtu = mtu;
                dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
        }

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);
        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a
         * route, this is indicated by setting obsolete to
         * DST_OBSOLETE_KILL.
         */
        if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        if (mtu > IP_MAX_MTU)
                mtu = IP_MAX_MTU;

        return mtu;
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
                if (orig && rt_is_expired(orig)) {
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                }
                if (fnhe->fnhe_pmtu) {
                        unsigned long expires = fnhe->fnhe_expires;
                        unsigned long diff = expires - jiffies;

                        if (time_before(jiffies, expires)) {
                                rt->rt_pmtu = fnhe->fnhe_pmtu;
                                dst_set_expires(&rt->dst, diff);
                        }
                }
                if (fnhe->fnhe_gw) {
                        rt->rt_flags |= RTCF_REDIRECTED;
                        rt->rt_gateway = fnhe->fnhe_gw;
                        rt->rt_uses_gateway = 1;
                } else if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                rcu_assign_pointer(fnhe->fnhe_rth, rt);
                if (orig)
                        rt_free(orig);

                fnhe->fnhe_stamp = jiffies;
                ret = true;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
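
/*
 * Editorial note: rt_cache_route() publishes the new route with a single
 * cmpxchg() and frees the old entry only if it won the race; a loser is
 * left uncached (the DST_NOCACHE path in rt_set_nexthop() below). A
 * hedged userspace analogue using C11 atomics:
 *
 *     #include <stdatomic.h>
 *     #include <stdbool.h>
 *     #include <stdio.h>
 *     #include <stdlib.h>
 *
 *     struct route { int id; };
 *
 *     // Swap in 'nrt' only if the slot still holds the value we read;
 *     // free() stands in for rt_free()'s RCU-deferred release.
 *     static bool cache_route(_Atomic(struct route *) *slot, struct route *nrt)
 *     {
 *             struct route *orig = atomic_load(slot);
 *
 *             if (!atomic_compare_exchange_strong(slot, &orig, nrt))
 *                     return false;
 *             free(orig);
 *             return true;
 *     }
 *
 *     int main(void)
 *     {
 *             static _Atomic(struct route *) slot;
 *             struct route *rt = malloc(sizeof(*rt));
 *
 *             printf("cached? %d\n", cache_route(&slot, rt));
 *             return 0;
 *     }
 */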

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      Per the RFC 1812 recommendation: if the source is
                 *      martian, the only hint is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag = 0;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy ARP. DNAT routes are always valid.
                 *
                 * The proxy ARP feature has been extended to allow ARP
                 * replies back on the same interface, to support
                 * private VLAN switch technologies. See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        err = -EINVAL;
                        goto cleanup;
                }
        }

        if (do_cache) {
                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
                }
        }

        rth = rt_dst_alloc(out_dev->dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }

        rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
        rth->rt_flags = flags;
        rth->rt_type = res->type;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        RT_CACHE_STAT_INC(in_slow_tot);

        rth->dst.input = ip_forward;
        rth->dst.output = ip_output;

        rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
        skb_dst_set(skb, &rth->dst);
out:
        err = 0;
cleanup:
        return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
                            struct fib_result *res,
                            const struct flowi4 *fl4,
                            struct in_device *in_dev,
                            __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi && res->fi->fib_nhs > 1)
                fib_select_multipath(res);
#endif

        /* create a routing cache entry */
        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

1600 /*
1601  *      NOTE. We drop all packets that have a local source
1602  *      address, because every properly looped-back packet must
1603  *      already have the correct destination attached by the output routine.
1604  *
1605  *      This approach solves two big problems:
1606  *      1. Non-simplex devices are handled properly.
1607  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1608  *      Called with rcu_read_lock().
1609  */
1610
1611 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1612                                u8 tos, struct net_device *dev)
1613 {
1614         struct fib_result res;
1615         struct in_device *in_dev = __in_dev_get_rcu(dev);
1616         struct flowi4   fl4;
1617         unsigned int    flags = 0;
1618         u32             itag = 0;
1619         struct rtable   *rth;
1620         int             err = -EINVAL;
1621         struct net    *net = dev_net(dev);
1622         bool do_cache;
1623
1624         /* IP on this device is disabled. */
1625
1626         if (!in_dev)
1627                 goto out;
1628
1629         /* Check for the weirdest martians, which cannot be detected
1630            by fib_lookup.
1631          */
1632
1633         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1634                 goto martian_source;
1635
1636         res.fi = NULL;
1637         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1638                 goto brd_input;
1639
1640         /* Accept zero addresses only for limited broadcast;
1641          * I do not even know whether to fix this or not. Waiting for complaints :-)
1642          */
1643         if (ipv4_is_zeronet(saddr))
1644                 goto martian_source;
1645
1646         if (ipv4_is_zeronet(daddr))
1647                 goto martian_destination;
1648
1649         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1650          * calling it at most once when daddr and/or saddr is a loopback address.
1651          */
1652         if (ipv4_is_loopback(daddr)) {
1653                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1654                         goto martian_destination;
1655         } else if (ipv4_is_loopback(saddr)) {
1656                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1657                         goto martian_source;
1658         }
1659
1660         /*
1661          *      Now we are ready to route the packet.
1662          */
1663         fl4.flowi4_oif = 0;
1664         fl4.flowi4_iif = dev->ifindex;
1665         fl4.flowi4_mark = skb->mark;
1666         fl4.flowi4_tos = tos;
1667         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1668         fl4.daddr = daddr;
1669         fl4.saddr = saddr;
1670         err = fib_lookup(net, &fl4, &res);
1671         if (err != 0)
1672                 goto no_route;
1673
1674         if (res.type == RTN_BROADCAST)
1675                 goto brd_input;
1676
1677         if (res.type == RTN_LOCAL) {
1678                 err = fib_validate_source(skb, saddr, daddr, tos,
1679                                           LOOPBACK_IFINDEX,
1680                                           dev, in_dev, &itag);
1681                 if (err < 0)
1682                         goto martian_source_keep_err;
1683                 goto local_input;
1684         }
1685
1686         if (!IN_DEV_FORWARD(in_dev))
1687                 goto no_route;
1688         if (res.type != RTN_UNICAST)
1689                 goto martian_destination;
1690
1691         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1692 out:    return err;
1693
1694 brd_input:
1695         if (skb->protocol != htons(ETH_P_IP))
1696                 goto e_inval;
1697
1698         if (!ipv4_is_zeronet(saddr)) {
1699                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1700                                           in_dev, &itag);
1701                 if (err < 0)
1702                         goto martian_source_keep_err;
1703         }
1704         flags |= RTCF_BROADCAST;
1705         res.type = RTN_BROADCAST;
1706         RT_CACHE_STAT_INC(in_brd);
1707
1708 local_input:
1709         do_cache = false;
1710         if (res.fi) {
1711                 if (!itag) {
1712                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1713                         if (rt_cache_valid(rth)) {
1714                                 skb_dst_set_noref(skb, &rth->dst);
1715                                 err = 0;
1716                                 goto out;
1717                         }
1718                         do_cache = true;
1719                 }
1720         }
1721
1722         rth = rt_dst_alloc(net->loopback_dev,
1723                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1724         if (!rth)
1725                 goto e_nobufs;
1726
1727         rth->dst.input = ip_local_deliver;
1728         rth->dst.output = ip_rt_bug;
1729 #ifdef CONFIG_IP_ROUTE_CLASSID
1730         rth->dst.tclassid = itag;
1731 #endif
1732
1733         rth->rt_genid = rt_genid(net);
1734         rth->rt_flags   = flags|RTCF_LOCAL;
1735         rth->rt_type    = res.type;
1736         rth->rt_is_input = 1;
1737         rth->rt_iif     = 0;
1738         rth->rt_pmtu    = 0;
1739         rth->rt_gateway = 0;
1740         rth->rt_uses_gateway = 0;
1741         INIT_LIST_HEAD(&rth->rt_uncached);
1742         RT_CACHE_STAT_INC(in_slow_tot);
1743         if (res.type == RTN_UNREACHABLE) {
1744                 rth->dst.input = ip_error;
1745                 rth->dst.error = -err;
1746                 rth->rt_flags   &= ~RTCF_LOCAL;
1747         }
1748         if (do_cache) {
1749                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1750                         rth->dst.flags |= DST_NOCACHE;
1751                         rt_add_uncached_list(rth);
1752                 }
1753         }
1754         skb_dst_set(skb, &rth->dst);
1755         err = 0;
1756         goto out;
1757
1758 no_route:
1759         RT_CACHE_STAT_INC(in_no_route);
1760         res.type = RTN_UNREACHABLE;
1761         if (err == -ESRCH)
1762                 err = -ENETUNREACH;
1763         goto local_input;
1764
1765         /*
1766          *      Do not cache martian addresses: they should be logged (RFC1812)
1767          */
1768 martian_destination:
1769         RT_CACHE_STAT_INC(in_martian_dst);
1770 #ifdef CONFIG_IP_ROUTE_VERBOSE
1771         if (IN_DEV_LOG_MARTIANS(in_dev))
1772                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1773                                      &daddr, &saddr, dev->name);
1774 #endif
1775
1776 e_inval:
1777         err = -EINVAL;
1778         goto out;
1779
1780 e_nobufs:
1781         err = -ENOBUFS;
1782         goto out;
1783
1784 martian_source:
1785         err = -EINVAL;
1786 martian_source_keep_err:
1787         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1788         goto out;
1789 }
1790
1791 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1792                          u8 tos, struct net_device *dev)
1793 {
1794         int res;
1795
1796         rcu_read_lock();
1797
1798         /* Multicast recognition logic was moved from the route cache to here.
1799            The problem was that too many Ethernet cards have broken/missing
1800            hardware multicast filters :-( As a result, a host on a multicast
1801            network acquires a lot of useless route cache entries, e.g. from
1802            SDR messages from all over the world. Now we try to get rid of them.
1803            Really, provided the software IP multicast filter is organized
1804            reasonably (at least, hashed), this does not cause a slowdown
1805            compared with route cache reject entries.
1806            Note that multicast routers are not affected, because a
1807            route cache entry is created eventually.
1808          */
1809         if (ipv4_is_multicast(daddr)) {
1810                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1811
1812                 if (in_dev) {
1813                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1814                                                   ip_hdr(skb)->protocol);
1815                         if (our
1816 #ifdef CONFIG_IP_MROUTE
1817                                 ||
1818                             (!ipv4_is_local_multicast(daddr) &&
1819                              IN_DEV_MFORWARD(in_dev))
1820 #endif
1821                            ) {
1822                                 int res = ip_route_input_mc(skb, daddr, saddr,
1823                                                             tos, dev, our);
1824                                 rcu_read_unlock();
1825                                 return res;
1826                         }
1827                 }
1828                 rcu_read_unlock();
1829                 return -EINVAL;
1830         }
1831         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1832         rcu_read_unlock();
1833         return res;
1834 }
1835 EXPORT_SYMBOL(ip_route_input_noref);
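
/*
 * Usage sketch (illustrative only; the error label is hypothetical):
 * a receive path such as ip_rcv_finish() resolves the input route
 * roughly like this, with skb->dev as the ingress device:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *
 * On success the dst is attached to the skb, possibly without taking
 * a reference (hence "noref"), so it must not be kept once the packet
 * leaves the current RCU read-side section.
 */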
1836
1837 /* called with rcu_read_lock() */
1838 static struct rtable *__mkroute_output(const struct fib_result *res,
1839                                        const struct flowi4 *fl4, int orig_oif,
1840                                        struct net_device *dev_out,
1841                                        unsigned int flags)
1842 {
1843         struct fib_info *fi = res->fi;
1844         struct fib_nh_exception *fnhe;
1845         struct in_device *in_dev;
1846         u16 type = res->type;
1847         struct rtable *rth;
1848         bool do_cache;
1849
1850         in_dev = __in_dev_get_rcu(dev_out);
1851         if (!in_dev)
1852                 return ERR_PTR(-EINVAL);
1853
1854         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1855                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1856                         return ERR_PTR(-EINVAL);
1857
1858         if (ipv4_is_lbcast(fl4->daddr))
1859                 type = RTN_BROADCAST;
1860         else if (ipv4_is_multicast(fl4->daddr))
1861                 type = RTN_MULTICAST;
1862         else if (ipv4_is_zeronet(fl4->daddr))
1863                 return ERR_PTR(-EINVAL);
1864
1865         if (dev_out->flags & IFF_LOOPBACK)
1866                 flags |= RTCF_LOCAL;
1867
1868         do_cache = true;
1869         if (type == RTN_BROADCAST) {
1870                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1871                 fi = NULL;
1872         } else if (type == RTN_MULTICAST) {
1873                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1874                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1875                                      fl4->flowi4_proto))
1876                         flags &= ~RTCF_LOCAL;
1877                 else
1878                         do_cache = false;
1879                 /* If a multicast route does not exist, use
1880                  * the default one, but do not use a gateway in this case.
1881                  * Yes, it is a hack.
1882                  */
1883                 if (fi && res->prefixlen < 4)
1884                         fi = NULL;
1885         }
1886
1887         fnhe = NULL;
1888         do_cache &= fi != NULL;
1889         if (do_cache) {
1890                 struct rtable __rcu **prth;
1891                 struct fib_nh *nh = &FIB_RES_NH(*res);
1892
1893                 fnhe = find_exception(nh, fl4->daddr);
1894                 if (fnhe)
1895                         prth = &fnhe->fnhe_rth;
1896                 else {
1897                         if (unlikely(fl4->flowi4_flags &
1898                                      FLOWI_FLAG_KNOWN_NH &&
1899                                      !(nh->nh_gw &&
1900                                        nh->nh_scope == RT_SCOPE_LINK))) {
1901                                 do_cache = false;
1902                                 goto add;
1903                         }
1904                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1905                 }
1906                 rth = rcu_dereference(*prth);
1907                 if (rt_cache_valid(rth)) {
1908                         dst_hold(&rth->dst);
1909                         return rth;
1910                 }
1911         }
1912
1913 add:
1914         rth = rt_dst_alloc(dev_out,
1915                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1916                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1917                            do_cache);
1918         if (!rth)
1919                 return ERR_PTR(-ENOBUFS);
1920
1921         rth->dst.output = ip_output;
1922
1923         rth->rt_genid = rt_genid(dev_net(dev_out));
1924         rth->rt_flags   = flags;
1925         rth->rt_type    = type;
1926         rth->rt_is_input = 0;
1927         rth->rt_iif     = orig_oif ? : 0;
1928         rth->rt_pmtu    = 0;
1929         rth->rt_gateway = 0;
1930         rth->rt_uses_gateway = 0;
1931         INIT_LIST_HEAD(&rth->rt_uncached);
1932
1933         RT_CACHE_STAT_INC(out_slow_tot);
1934
1935         if (flags & RTCF_LOCAL)
1936                 rth->dst.input = ip_local_deliver;
1937         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1938                 if (flags & RTCF_LOCAL &&
1939                     !(dev_out->flags & IFF_LOOPBACK)) {
1940                         rth->dst.output = ip_mc_output;
1941                         RT_CACHE_STAT_INC(out_slow_mc);
1942                 }
1943 #ifdef CONFIG_IP_MROUTE
1944                 if (type == RTN_MULTICAST) {
1945                         if (IN_DEV_MFORWARD(in_dev) &&
1946                             !ipv4_is_local_multicast(fl4->daddr)) {
1947                                 rth->dst.input = ip_mr_input;
1948                                 rth->dst.output = ip_mc_output;
1949                         }
1950                 }
1951 #endif
1952         }
1953
1954         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1955
1956         return rth;
1957 }
1958
1959 /*
1960  * Major route resolver routine.
1961  */
1962
1963 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1964 {
1965         struct net_device *dev_out = NULL;
1966         __u8 tos = RT_FL_TOS(fl4);
1967         unsigned int flags = 0;
1968         struct fib_result res;
1969         struct rtable *rth;
1970         int orig_oif;
1971
1972         res.tclassid    = 0;
1973         res.fi          = NULL;
1974         res.table       = NULL;
1975
1976         orig_oif = fl4->flowi4_oif;
1977
1978         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1979         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1980         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1981                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1982
1983         rcu_read_lock();
1984         if (fl4->saddr) {
1985                 rth = ERR_PTR(-EINVAL);
1986                 if (ipv4_is_multicast(fl4->saddr) ||
1987                     ipv4_is_lbcast(fl4->saddr) ||
1988                     ipv4_is_zeronet(fl4->saddr))
1989                         goto out;
1990
1991                 /* I removed the check for oif == dev_out->oif here.
1992                    It was wrong for two reasons:
1993                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
1994                       is assigned to multiple interfaces.
1995                    2. Moreover, we are allowed to send packets with the saddr
1996                       of another iface. --ANK
1997                  */
1998
1999                 if (fl4->flowi4_oif == 0 &&
2000                     (ipv4_is_multicast(fl4->daddr) ||
2001                      ipv4_is_lbcast(fl4->daddr))) {
2002                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2003                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2004                         if (dev_out == NULL)
2005                                 goto out;
2006
2007                         /* Special hack: the user can direct multicasts
2008                            and limited broadcast via the necessary interface
2009                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2010                            This hack is not just for fun; it allows
2011                            vic, vat and friends to work.
2012                            They bind a socket to loopback, set the ttl to zero
2013                            and expect that it will work.
2014                            From the viewpoint of the routing cache they are broken,
2015                            because we are not allowed to build a multicast path
2016                            with a loopback source addr (look, the routing cache
2017                            cannot know that the ttl is zero, so the packet
2018                            will not leave this host and the route is valid).
2019                            Luckily, this hack is a good workaround.
2020                          */
2021
2022                         fl4->flowi4_oif = dev_out->ifindex;
2023                         goto make_route;
2024                 }
2025
2026                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2027                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2028                         if (!__ip_dev_find(net, fl4->saddr, false))
2029                                 goto out;
2030                 }
2031         }
2032
2034         if (fl4->flowi4_oif) {
2035                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2036                 rth = ERR_PTR(-ENODEV);
2037                 if (dev_out == NULL)
2038                         goto out;
2039
2040                 /* RACE: Check return value of inet_select_addr instead. */
2041                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2042                         rth = ERR_PTR(-ENETUNREACH);
2043                         goto out;
2044                 }
2045                 if (ipv4_is_local_multicast(fl4->daddr) ||
2046                     ipv4_is_lbcast(fl4->daddr)) {
2047                         if (!fl4->saddr)
2048                                 fl4->saddr = inet_select_addr(dev_out, 0,
2049                                                               RT_SCOPE_LINK);
2050                         goto make_route;
2051                 }
2052                 if (!fl4->saddr) {
2053                         if (ipv4_is_multicast(fl4->daddr))
2054                                 fl4->saddr = inet_select_addr(dev_out, 0,
2055                                                               fl4->flowi4_scope);
2056                         else if (!fl4->daddr)
2057                                 fl4->saddr = inet_select_addr(dev_out, 0,
2058                                                               RT_SCOPE_HOST);
2059                 }
2060         }
2061
2062         if (!fl4->daddr) {
2063                 fl4->daddr = fl4->saddr;
2064                 if (!fl4->daddr)
2065                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2066                 dev_out = net->loopback_dev;
2067                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2068                 res.type = RTN_LOCAL;
2069                 flags |= RTCF_LOCAL;
2070                 goto make_route;
2071         }
2072
2073         if (fib_lookup(net, fl4, &res)) {
2074                 res.fi = NULL;
2075                 res.table = NULL;
2076                 if (fl4->flowi4_oif) {
2077                         /* Apparently, the routing tables are wrong. Assume
2078                            that the destination is on-link.
2079
2080                            WHY? DW.
2081                            Because we are allowed to send to an iface
2082                            even if it has NO routes and NO assigned
2083                            addresses. When oif is specified, the routing
2084                            tables are looked up with only one purpose:
2085                            to catch whether the destination is gatewayed rather
2086                            than direct. Moreover, if MSG_DONTROUTE is set,
2087                            we send the packet, ignoring both routing tables
2088                            and ifaddr state. --ANK
2089
2090
2091                            We could do this even when oif is unknown
2092                            (as IPv6 likely does), but we do not.
2093                          */
2094
2095                         if (fl4->saddr == 0)
2096                                 fl4->saddr = inet_select_addr(dev_out, 0,
2097                                                               RT_SCOPE_LINK);
2098                         res.type = RTN_UNICAST;
2099                         goto make_route;
2100                 }
2101                 rth = ERR_PTR(-ENETUNREACH);
2102                 goto out;
2103         }
2104
2105         if (res.type == RTN_LOCAL) {
2106                 if (!fl4->saddr) {
2107                         if (res.fi->fib_prefsrc)
2108                                 fl4->saddr = res.fi->fib_prefsrc;
2109                         else
2110                                 fl4->saddr = fl4->daddr;
2111                 }
2112                 dev_out = net->loopback_dev;
2113                 fl4->flowi4_oif = dev_out->ifindex;
2114                 flags |= RTCF_LOCAL;
2115                 goto make_route;
2116         }
2117
2118 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2119         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2120                 fib_select_multipath(&res);
2121         else
2122 #endif
2123         if (!res.prefixlen &&
2124             res.table->tb_num_default > 1 &&
2125             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2126                 fib_select_default(&res);
2127
2128         if (!fl4->saddr)
2129                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2130
2131         dev_out = FIB_RES_DEV(res);
2132         fl4->flowi4_oif = dev_out->ifindex;
2133
2135 make_route:
2136         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2137
2138 out:
2139         rcu_read_unlock();
2140         return rth;
2141 }
2142 EXPORT_SYMBOL_GPL(__ip_route_output_key);
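
/*
 * Usage sketch (illustrative only; daddr, saddr and oif are
 * hypothetical locals): callers zero a flowi4, fill in the selectors
 * they know, and drop their reference when done, much as
 * inet_rtm_getroute() below does:
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = daddr;
 *	fl4.saddr = saddr;
 *	fl4.flowi4_oif = oif;
 *
 *	rt = __ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * On return the resolver has written back the source address and
 * device it actually chose into fl4.saddr and fl4.flowi4_oif.
 */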
2143
2144 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2145 {
2146         return NULL;
2147 }
2148
2149 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2150 {
2151         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2152
2153         return mtu ? : dst->dev->mtu;
2154 }
2155
2156 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2157                                           struct sk_buff *skb, u32 mtu)
2158 {
2159 }
2160
2161 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2162                                        struct sk_buff *skb)
2163 {
2164 }
2165
2166 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2167                                           unsigned long old)
2168 {
2169         return NULL;
2170 }
2171
2172 static struct dst_ops ipv4_dst_blackhole_ops = {
2173         .family                 =       AF_INET,
2174         .protocol               =       cpu_to_be16(ETH_P_IP),
2175         .check                  =       ipv4_blackhole_dst_check,
2176         .mtu                    =       ipv4_blackhole_mtu,
2177         .default_advmss         =       ipv4_default_advmss,
2178         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2179         .redirect               =       ipv4_rt_blackhole_redirect,
2180         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2181         .neigh_lookup           =       ipv4_neigh_lookup,
2182 };
2183
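/*
 * Clone dst_orig into a "blackhole" dst: the routing metadata is
 * preserved, but input/output simply discard packets and all of the
 * ipv4_dst_blackhole_ops above are no-ops. The xfrm code, for one,
 * uses this (through its blackhole_route hook) when it must hand back
 * a usable dst whose traffic is silently dropped, e.g. while security
 * associations are still being resolved.
 */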
2184 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2185 {
2186         struct rtable *ort = (struct rtable *) dst_orig;
2187         struct rtable *rt;
2188
2189         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2190         if (rt) {
2191                 struct dst_entry *new = &rt->dst;
2192
2193                 new->__use = 1;
2194                 new->input = dst_discard;
2195                 new->output = dst_discard;
2196
2197                 new->dev = ort->dst.dev;
2198                 if (new->dev)
2199                         dev_hold(new->dev);
2200
2201                 rt->rt_is_input = ort->rt_is_input;
2202                 rt->rt_iif = ort->rt_iif;
2203                 rt->rt_pmtu = ort->rt_pmtu;
2204
2205                 rt->rt_genid = rt_genid(net);
2206                 rt->rt_flags = ort->rt_flags;
2207                 rt->rt_type = ort->rt_type;
2208                 rt->rt_gateway = ort->rt_gateway;
2209                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2210
2211                 INIT_LIST_HEAD(&rt->rt_uncached);
2212
2213                 dst_free(new);
2214         }
2215
2216         dst_release(dst_orig);
2217
2218         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2219 }
2220
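/*
 * Like __ip_route_output_key(), but when the flow carries a transport
 * protocol the result is additionally passed through xfrm_lookup(), so
 * any matching IPsec policy is applied to the returned dst.
 */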
2221 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2222                                     struct sock *sk)
2223 {
2224         struct rtable *rt = __ip_route_output_key(net, flp4);
2225
2226         if (IS_ERR(rt))
2227                 return rt;
2228
2229         if (flp4->flowi4_proto)
2230                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2231                                                    flowi4_to_flowi(flp4),
2232                                                    sk, 0);
2233
2234         return rt;
2235 }
2236 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2237
2238 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2239                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2240                         u32 seq, int event, int nowait, unsigned int flags)
2241 {
2242         struct rtable *rt = skb_rtable(skb);
2243         struct rtmsg *r;
2244         struct nlmsghdr *nlh;
2245         unsigned long expires = 0;
2246         u32 error;
2247         u32 metrics[RTAX_MAX];
2248
2249         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2250         if (nlh == NULL)
2251                 return -EMSGSIZE;
2252
2253         r = nlmsg_data(nlh);
2254         r->rtm_family    = AF_INET;
2255         r->rtm_dst_len  = 32;
2256         r->rtm_src_len  = 0;
2257         r->rtm_tos      = fl4->flowi4_tos;
2258         r->rtm_table    = RT_TABLE_MAIN;
2259         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2260                 goto nla_put_failure;
2261         r->rtm_type     = rt->rt_type;
2262         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2263         r->rtm_protocol = RTPROT_UNSPEC;
2264         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2265         if (rt->rt_flags & RTCF_NOTIFY)
2266                 r->rtm_flags |= RTM_F_NOTIFY;
2267
2268         if (nla_put_be32(skb, RTA_DST, dst))
2269                 goto nla_put_failure;
2270         if (src) {
2271                 r->rtm_src_len = 32;
2272                 if (nla_put_be32(skb, RTA_SRC, src))
2273                         goto nla_put_failure;
2274         }
2275         if (rt->dst.dev &&
2276             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2277                 goto nla_put_failure;
2278 #ifdef CONFIG_IP_ROUTE_CLASSID
2279         if (rt->dst.tclassid &&
2280             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2281                 goto nla_put_failure;
2282 #endif
2283         if (!rt_is_input_route(rt) &&
2284             fl4->saddr != src) {
2285                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2286                         goto nla_put_failure;
2287         }
2288         if (rt->rt_uses_gateway &&
2289             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2290                 goto nla_put_failure;
2291
2292         expires = rt->dst.expires;
2293         if (expires) {
2294                 unsigned long now = jiffies;
2295
2296                 if (time_before(now, expires))
2297                         expires -= now;
2298                 else
2299                         expires = 0;
2300         }
2301
2302         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2303         if (rt->rt_pmtu && expires)
2304                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2305         if (rtnetlink_put_metrics(skb, metrics) < 0)
2306                 goto nla_put_failure;
2307
2308         if (fl4->flowi4_mark &&
2309             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2310                 goto nla_put_failure;
2311
2312         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2313             nla_put_u32(skb, RTA_UID,
2314                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2315                 goto nla_put_failure;
2316
2317         error = rt->dst.error;
2318
2319         if (rt_is_input_route(rt)) {
2320 #ifdef CONFIG_IP_MROUTE
2321                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2322                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2323                         int err = ipmr_get_route(net, skb,
2324                                                  fl4->saddr, fl4->daddr,
2325                                                  r, nowait);
2326                         if (err <= 0) {
2327                                 if (!nowait) {
2328                                         if (err == 0)
2329                                                 return 0;
2330                                         goto nla_put_failure;
2331                                 } else {
2332                                         if (err == -EMSGSIZE)
2333                                                 goto nla_put_failure;
2334                                         error = err;
2335                                 }
2336                         }
2337                 } else
2338 #endif
2339                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2340                                 goto nla_put_failure;
2341         }
2342
2343         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2344                 goto nla_put_failure;
2345
2346         return nlmsg_end(skb, nlh);
2347
2348 nla_put_failure:
2349         nlmsg_cancel(skb, nlh);
2350         return -EMSGSIZE;
2351 }
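
/*
 * rt_fill_info() above composes the RTM_NEWROUTE reply returned to
 * userspace by inet_rtm_getroute() below; note that RTAX_MTU is
 * reported from rt_pmtu only while the learned PMTU has not expired.
 */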
2352
2353 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2354 {
2355         struct net *net = sock_net(in_skb->sk);
2356         struct rtmsg *rtm;
2357         struct nlattr *tb[RTA_MAX+1];
2358         struct rtable *rt = NULL;
2359         struct flowi4 fl4;
2360         __be32 dst = 0;
2361         __be32 src = 0;
2362         u32 iif;
2363         int err;
2364         int mark;
2365         struct sk_buff *skb;
2366         kuid_t uid;
2367
2368         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2369         if (err < 0)
2370                 goto errout;
2371
2372         rtm = nlmsg_data(nlh);
2373
2374         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2375         if (skb == NULL) {
2376                 err = -ENOBUFS;
2377                 goto errout;
2378         }
2379
2380         /* Reserve room for dummy headers; this skb can pass
2381            through a good chunk of the routing engine.
2382          */
2383         skb_reset_mac_header(skb);
2384         skb_reset_network_header(skb);
2385
2386         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2387         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2388         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2389
2390         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2391         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2392         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2393         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2394         if (tb[RTA_UID])
2395                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2396         else
2397                 uid = (iif ? INVALID_UID : current_uid());
2398
2399         memset(&fl4, 0, sizeof(fl4));
2400         fl4.daddr = dst;
2401         fl4.saddr = src;
2402         fl4.flowi4_tos = rtm->rtm_tos;
2403         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2404         fl4.flowi4_mark = mark;
2405         fl4.flowi4_uid = uid;
2406
2407         if (iif) {
2408                 struct net_device *dev;
2409
2410                 dev = __dev_get_by_index(net, iif);
2411                 if (dev == NULL) {
2412                         err = -ENODEV;
2413                         goto errout_free;
2414                 }
2415
2416                 skb->protocol   = htons(ETH_P_IP);
2417                 skb->dev        = dev;
2418                 skb->mark       = mark;
2419                 local_bh_disable();
2420                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2421                 local_bh_enable();
2422
2423                 rt = skb_rtable(skb);
2424                 if (err == 0 && rt->dst.error)
2425                         err = -rt->dst.error;
2426         } else {
2427                 rt = ip_route_output_key(net, &fl4);
2428
2429                 err = 0;
2430                 if (IS_ERR(rt))
2431                         err = PTR_ERR(rt);
2432         }
2433
2434         if (err)
2435                 goto errout_free;
2436
2437         skb_dst_set(skb, &rt->dst);
2438         if (rtm->rtm_flags & RTM_F_NOTIFY)
2439                 rt->rt_flags |= RTCF_NOTIFY;
2440
2441         err = rt_fill_info(net, dst, src, &fl4, skb,
2442                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2443                            RTM_NEWROUTE, 0, 0);
2444         if (err <= 0)
2445                 goto errout_free;
2446
2447         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2448 errout:
2449         return err;
2450
2451 errout_free:
2452         kfree_skb(skb);
2453         goto errout;
2454 }
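
/*
 * Userspace reaches the handler above via RTM_GETROUTE; for example
 * (illustrative commands):
 *
 *	ip route get 8.8.8.8
 *	ip route get 10.0.0.1 from 10.0.0.2 iif eth0
 *
 * exercise the output and input lookup paths, respectively.
 */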
2455
2456 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2457 {
2458         return skb->len;
2459 }
2460
2461 void ip_rt_multicast_event(struct in_device *in_dev)
2462 {
2463         rt_cache_flush(dev_net(in_dev->dev));
2464 }
2465
2466 #ifdef CONFIG_SYSCTL
2467 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2468 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2469 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2470 static int ip_rt_gc_elasticity __read_mostly    = 8;
2471
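/*
 * Write-only handler behind /proc/sys/net/ipv4/route/flush (registered
 * through ipv4_route_flush_table below). Writing anything to it, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * invalidates this namespace's cached routes via rt_cache_flush().
 */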
2472 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2473                                         void __user *buffer,
2474                                         size_t *lenp, loff_t *ppos)
2475 {
2476         if (write) {
2477                 rt_cache_flush((struct net *)__ctl->extra1);
2478                 return 0;
2479         }
2480
2481         return -EINVAL;
2482 }
2483
2484 static ctl_table ipv4_route_table[] = {
2485         {
2486                 .procname       = "gc_thresh",
2487                 .data           = &ipv4_dst_ops.gc_thresh,
2488                 .maxlen         = sizeof(int),
2489                 .mode           = 0644,
2490                 .proc_handler   = proc_dointvec,
2491         },
2492         {
2493                 .procname       = "max_size",
2494                 .data           = &ip_rt_max_size,
2495                 .maxlen         = sizeof(int),
2496                 .mode           = 0644,
2497                 .proc_handler   = proc_dointvec,
2498         },
2499         {
2500                 /*  Deprecated. Use gc_min_interval_ms */
2501
2502                 .procname       = "gc_min_interval",
2503                 .data           = &ip_rt_gc_min_interval,
2504                 .maxlen         = sizeof(int),
2505                 .mode           = 0644,
2506                 .proc_handler   = proc_dointvec_jiffies,
2507         },
2508         {
2509                 .procname       = "gc_min_interval_ms",
2510                 .data           = &ip_rt_gc_min_interval,
2511                 .maxlen         = sizeof(int),
2512                 .mode           = 0644,
2513                 .proc_handler   = proc_dointvec_ms_jiffies,
2514         },
2515         {
2516                 .procname       = "gc_timeout",
2517                 .data           = &ip_rt_gc_timeout,
2518                 .maxlen         = sizeof(int),
2519                 .mode           = 0644,
2520                 .proc_handler   = proc_dointvec_jiffies,
2521         },
2522         {
2523                 .procname       = "gc_interval",
2524                 .data           = &ip_rt_gc_interval,
2525                 .maxlen         = sizeof(int),
2526                 .mode           = 0644,
2527                 .proc_handler   = proc_dointvec_jiffies,
2528         },
2529         {
2530                 .procname       = "redirect_load",
2531                 .data           = &ip_rt_redirect_load,
2532                 .maxlen         = sizeof(int),
2533                 .mode           = 0644,
2534                 .proc_handler   = proc_dointvec,
2535         },
2536         {
2537                 .procname       = "redirect_number",
2538                 .data           = &ip_rt_redirect_number,
2539                 .maxlen         = sizeof(int),
2540                 .mode           = 0644,
2541                 .proc_handler   = proc_dointvec,
2542         },
2543         {
2544                 .procname       = "redirect_silence",
2545                 .data           = &ip_rt_redirect_silence,
2546                 .maxlen         = sizeof(int),
2547                 .mode           = 0644,
2548                 .proc_handler   = proc_dointvec,
2549         },
2550         {
2551                 .procname       = "error_cost",
2552                 .data           = &ip_rt_error_cost,
2553                 .maxlen         = sizeof(int),
2554                 .mode           = 0644,
2555                 .proc_handler   = proc_dointvec,
2556         },
2557         {
2558                 .procname       = "error_burst",
2559                 .data           = &ip_rt_error_burst,
2560                 .maxlen         = sizeof(int),
2561                 .mode           = 0644,
2562                 .proc_handler   = proc_dointvec,
2563         },
2564         {
2565                 .procname       = "gc_elasticity",
2566                 .data           = &ip_rt_gc_elasticity,
2567                 .maxlen         = sizeof(int),
2568                 .mode           = 0644,
2569                 .proc_handler   = proc_dointvec,
2570         },
2571         {
2572                 .procname       = "mtu_expires",
2573                 .data           = &ip_rt_mtu_expires,
2574                 .maxlen         = sizeof(int),
2575                 .mode           = 0644,
2576                 .proc_handler   = proc_dointvec_jiffies,
2577         },
2578         {
2579                 .procname       = "min_pmtu",
2580                 .data           = &ip_rt_min_pmtu,
2581                 .maxlen         = sizeof(int),
2582                 .mode           = 0644,
2583                 .proc_handler   = proc_dointvec,
2584         },
2585         {
2586                 .procname       = "min_adv_mss",
2587                 .data           = &ip_rt_min_advmss,
2588                 .maxlen         = sizeof(int),
2589                 .mode           = 0644,
2590                 .proc_handler   = proc_dointvec,
2591         },
2592         { }
2593 };
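
/*
 * The table above is registered under "net/ipv4/route" by
 * ip_static_sysctl_init() at the end of this file, so these knobs
 * appear as e.g. net.ipv4.route.gc_thresh, i.e.
 * /proc/sys/net/ipv4/route/gc_thresh.
 */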
2594
2595 static struct ctl_table ipv4_route_flush_table[] = {
2596         {
2597                 .procname       = "flush",
2598                 .maxlen         = sizeof(int),
2599                 .mode           = 0200,
2600                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2601         },
2602         { },
2603 };
2604
2605 static __net_init int sysctl_route_net_init(struct net *net)
2606 {
2607         struct ctl_table *tbl;
2608
2609         tbl = ipv4_route_flush_table;
2610         if (!net_eq(net, &init_net)) {
2611                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2612                 if (tbl == NULL)
2613                         goto err_dup;
2614
2615                 /* Don't export sysctls to unprivileged users */
2616                 if (net->user_ns != &init_user_ns)
2617                         tbl[0].procname = NULL;
2618         }
2619         tbl[0].extra1 = net;
2620
2621         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2622         if (net->ipv4.route_hdr == NULL)
2623                 goto err_reg;
2624         return 0;
2625
2626 err_reg:
2627         if (tbl != ipv4_route_flush_table)
2628                 kfree(tbl);
2629 err_dup:
2630         return -ENOMEM;
2631 }
2632
2633 static __net_exit void sysctl_route_net_exit(struct net *net)
2634 {
2635         struct ctl_table *tbl;
2636
2637         tbl = net->ipv4.route_hdr->ctl_table_arg;
2638         unregister_net_sysctl_table(net->ipv4.route_hdr);
2639         BUG_ON(tbl == ipv4_route_flush_table);
2640         kfree(tbl);
2641 }
2642
2643 static __net_initdata struct pernet_operations sysctl_route_ops = {
2644         .init = sysctl_route_net_init,
2645         .exit = sysctl_route_net_exit,
2646 };
2647 #endif
2648
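/*
 * Seed the per-namespace route generation ID. rt_cache_flush() bumps
 * this counter, and cached rtables whose rt_genid no longer matches
 * (see the rt_genid assignments above) are treated as expired, so a
 * single bump invalidates every cached route in the namespace.
 */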
2649 static __net_init int rt_genid_init(struct net *net)
2650 {
2651         atomic_set(&net->rt_genid, 0);
2652         get_random_bytes(&net->ipv4.dev_addr_genid,
2653                          sizeof(net->ipv4.dev_addr_genid));
2654         return 0;
2655 }
2656
2657 static __net_initdata struct pernet_operations rt_genid_ops = {
2658         .init = rt_genid_init,
2659 };
2660
2661 static int __net_init ipv4_inetpeer_init(struct net *net)
2662 {
2663         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2664
2665         if (!bp)
2666                 return -ENOMEM;
2667         inet_peer_base_init(bp);
2668         net->ipv4.peers = bp;
2669         return 0;
2670 }
2671
2672 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2673 {
2674         struct inet_peer_base *bp = net->ipv4.peers;
2675
2676         net->ipv4.peers = NULL;
2677         inetpeer_invalidate_tree(bp);
2678         kfree(bp);
2679 }
2680
2681 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2682         .init   =       ipv4_inetpeer_init,
2683         .exit   =       ipv4_inetpeer_exit,
2684 };
2685
2686 #ifdef CONFIG_IP_ROUTE_CLASSID
2687 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2688 #endif /* CONFIG_IP_ROUTE_CLASSID */
2689
2690 int __init ip_rt_init(void)
2691 {
2692         int rc = 0;
2693
2694         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2695         if (!ip_idents)
2696                 panic("IP: failed to allocate ip_idents\n");
2697
2698         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2699
2700 #ifdef CONFIG_IP_ROUTE_CLASSID
2701         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2702         if (!ip_rt_acct)
2703                 panic("IP: failed to allocate ip_rt_acct\n");
2704 #endif
2705
2706         ipv4_dst_ops.kmem_cachep =
2707                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2708                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2709
2710         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2711
2712         if (dst_entries_init(&ipv4_dst_ops) < 0)
2713                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2714
2715         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2716                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2717
2718         ipv4_dst_ops.gc_thresh = ~0;
2719         ip_rt_max_size = INT_MAX;
2720
2721         devinet_init();
2722         ip_fib_init();
2723
2724         if (ip_rt_proc_init())
2725                 pr_err("Unable to create route proc files\n");
2726 #ifdef CONFIG_XFRM
2727         xfrm_init();
2728         xfrm4_init();
2729 #endif
2730         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2731
2732 #ifdef CONFIG_SYSCTL
2733         register_pernet_subsys(&sysctl_route_ops);
2734 #endif
2735         register_pernet_subsys(&rt_genid_ops);
2736         register_pernet_subsys(&ipv4_inetpeer_ops);
2737         return rc;
2738 }
2739
2740 #ifdef CONFIG_SYSCTL
2741 /*
2742  * We really need to sanitize the damn ipv4 init order, then all
2743  * this nonsense will go away.
2744  */
2745 void __init ip_static_sysctl_init(void)
2746 {
2747         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2748 }
2749 #endif