/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely
 *                                      with BSD; our system is still very
 *                                      different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based
 *                                      routing, routing caches and better
 *                                      behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

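/*
 * Map the historical IP TOS field to a traffic-control priority band
 * (consumed via rt_tos2priority()).  The table is indexed by the four
 * RFC 1349 TOS bits; the odd ECN_OR_COST() entries correspond to
 * values whose low "minimize monetary cost" bit (which nowadays
 * overlaps the ECN field) is set.
 */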
#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

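/*
 * Cached routes are invalidated in bulk: each rtable records the
 * generation id of its namespace at creation time, and rt_cache_flush()
 * simply bumps that id, which makes every older entry report expired.
 */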
static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u
struct ip_ident_bucket {
        atomic_t        id;
        u32             stamp32;
};

static struct ip_ident_bucket *ip_idents __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(bucket->stamp32);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
                u64 x = prandom_u32();

                x *= (now - old);
                delta = (u32)(x >> 32);
        }

        return atomic_add_return(segs + delta, &bucket->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

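/*
 * Choose the IP ID for an outgoing header: the (daddr, saddr, protocol)
 * tuple is hashed with a lazily initialized random key to pick one of
 * the IP_IDENTS_SZ generators above, so unrelated flows do not share an
 * easily observable counter.
 */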
void __ip_select_ident(struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        static bool hashrnd_initialized = false;
        u32 hash, id;

        if (unlikely(!hashrnd_initialized)) {
                hashrnd_initialized = true;
                get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
        }

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

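/*
 * Fill in a flow key from a received IP header.  When a socket is
 * supplied, its bound device, mark, TOS and protocol take precedence
 * over the values derived from the packet itself.
 */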
static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sk ? sock_i_uid(sk) : 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0,
                           sock_i_uid(sk));
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;
        struct rtable *orig;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        orig = rcu_dereference(oldest->fnhe_rth);
        if (orig) {
                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
                rt_free(orig);
        }
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}

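/*
 * Record learned PMTU or redirect state as a next-hop exception.
 * Exceptions hang off the FIB nexthop in a small hash table keyed by
 * destination; once a chain grows past FNHE_RECLAIM_DEPTH, the entry
 * with the oldest stamp is recycled rather than allocating a new one.
 */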
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = expires;
                }
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
        return;
}

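/*
 * Handle an ICMP redirect.  The new gateway is accepted only if the
 * message comes from the route's current gateway, the interface policy
 * permits redirects, and the advertised address passes the sanity
 * checks below; it is then recorded as a next-hop exception rather
 * than by rewriting the FIB.
 */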
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

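/*
 * With the defaults above (ip_rt_redirect_number = 9,
 * ip_rt_redirect_load = HZ/50) the gap enforced between successive
 * redirects is (HZ/50) << rate_tokens, i.e. roughly 40ms, 80ms, ...
 * doubling up to about 5s; after nine unanswered redirects we go
 * silent until ip_rt_redirect_silence (roughly 20s) elapses.
 */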
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

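/*
 * ICMP errors generated here are rate-limited with a token bucket kept
 * in the inet_peer: tokens accrue at one per jiffy up to
 * ip_rt_error_burst (5 * HZ) and each transmitted error costs
 * ip_rt_error_cost (HZ), which works out to bursts of about five
 * errors and roughly one per second sustained, per source address.
 */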
static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

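/*
 * Apply a learned path MTU: clamp it to ip_rt_min_pmtu, let it expire
 * after ip_rt_mtu_expires (ten minutes by default), and store it in a
 * next-hop exception so later lookups for this destination see it too.
 * A route that does not already carry a PMTU (rt_pmtu == 0) is marked
 * DST_OBSOLETE_KILL so the next dst_check() forces a fresh lookup.
 */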
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (!rt->rt_pmtu) {
                dst->obsolete = DST_OBSOLETE_KILL;
        } else {
                rt->rt_pmtu = mtu;
                dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
        }

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

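/*
 * Socket-aware PMTU update.  If the socket is owned by user context we
 * must not touch its cached route and fall back to the plain routing
 * lookup above; otherwise the (possibly re-resolved) route is updated
 * and re-installed with sk_dst_set().
 */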
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);
        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPv4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a
         * route, this is indicated by setting obsolete to
         * DST_OBSOLETE_KILL.
         */
        if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" may be unaligned in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

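/*
 * Effective MTU of a route, in order of preference: a still-valid
 * learned PMTU, then the RTAX_MTU metric, then the device MTU (clamped
 * to 576 for locked-metric routes via a gateway, and to IP_MAX_MTU).
 */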
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        if (mtu > IP_MAX_MTU)
                mtu = IP_MAX_MTU;

        return mtu;
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

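/*
 * Bind a route to its next-hop exception under fnhe_lock: the
 * exception's gateway and PMTU state are copied into the route, and
 * the previously bound rtable, if any, is replaced and freed via RCU.
 */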
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);

                if (orig && rt_is_expired(orig)) {
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                }
                if (fnhe->fnhe_pmtu) {
                        unsigned long expires = fnhe->fnhe_expires;
                        unsigned long diff = expires - jiffies;

                        if (time_before(jiffies, expires)) {
                                rt->rt_pmtu = fnhe->fnhe_pmtu;
                                dst_set_expires(&rt->dst, diff);
                        }
                }
                if (fnhe->fnhe_gw) {
                        rt->rt_flags |= RTCF_REDIRECTED;
                        rt->rt_gateway = fnhe->fnhe_gw;
                        rt->rt_uses_gateway = 1;
                } else if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                rcu_assign_pointer(fnhe->fnhe_rth, rt);
                if (orig)
                        rt_free(orig);

                fnhe->fnhe_stamp = jiffies;
                ret = true;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

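/*
 * Try to store a route in its nexthop cache slot: the nexthop's input
 * slot for input routes, or its per-cpu output slot otherwise.  A
 * lockless cmpxchg() swaps the route in; if we lose the race the
 * caller must treat the route as uncacheable.
 */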
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

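/*
 * Finish building a route from a FIB result: take the gateway and
 * metrics from the nexthop, bind to a matching exception if one was
 * found, and either cache the route or put it on the uncached list so
 * rt_flush_dev() can still find it on device teardown.
 */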
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

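/*
 * Allocate a route.  Every IPv4 route starts with DST_OBSOLETE_FORCE_CHK
 * so that each use is funnelled through ipv4_dst_check(); DST_NOCACHE
 * marks routes we do not intend to store in the nexthop cache.
 */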
static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      Per RFC 1812 recommendation: if the source is
                 *      martian, the only hint we can give is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

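/*
 * Build the route for a forwarded packet: validate the source address
 * against the FIB, decide whether the sender should be advised to
 * redirect, and reuse the nexthop's cached input route when possible.
 */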
1499 /* called in rcu_read_lock() section */
1500 static int __mkroute_input(struct sk_buff *skb,
1501                            const struct fib_result *res,
1502                            struct in_device *in_dev,
1503                            __be32 daddr, __be32 saddr, u32 tos)
1504 {
1505         struct rtable *rth;
1506         int err;
1507         struct in_device *out_dev;
1508         unsigned int flags = 0;
1509         bool do_cache;
1510         u32 itag = 0;
1511
1512         /* get a working reference to the output device */
1513         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1514         if (out_dev == NULL) {
1515                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1516                 return -EINVAL;
1517         }
1518
1519         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1520                                   in_dev->dev, in_dev, &itag);
1521         if (err < 0) {
1522                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1523                                          saddr);
1524
1525                 goto cleanup;
1526         }
1527
1528         do_cache = res->fi && !itag;
1529         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1530             skb->protocol == htons(ETH_P_IP) &&
1531             (IN_DEV_SHARED_MEDIA(out_dev) ||
1532              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1533                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1534
1535         if (skb->protocol != htons(ETH_P_IP)) {
1536                 /* Not IP (i.e. ARP). Do not create route, if it is
1537                  * invalid for proxy arp. DNAT routes are always valid.
1538                  *
1539                  * Proxy arp feature have been extended to allow, ARP
1540                  * replies back to the same interface, to support
1541                  * Private VLAN switch technologies. See arp.c.
1542                  */
1543                 if (out_dev == in_dev &&
1544                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1545                         err = -EINVAL;
1546                         goto cleanup;
1547                 }
1548         }
1549
1550         if (do_cache) {
1551                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1552                 if (rt_cache_valid(rth)) {
1553                         skb_dst_set_noref(skb, &rth->dst);
1554                         goto out;
1555                 }
1556         }
1557
1558         rth = rt_dst_alloc(out_dev->dev,
1559                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1560                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1561         if (!rth) {
1562                 err = -ENOBUFS;
1563                 goto cleanup;
1564         }
1565
1566         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1567         rth->rt_flags = flags;
1568         rth->rt_type = res->type;
1569         rth->rt_is_input = 1;
1570         rth->rt_iif     = 0;
1571         rth->rt_pmtu    = 0;
1572         rth->rt_gateway = 0;
1573         rth->rt_uses_gateway = 0;
1574         INIT_LIST_HEAD(&rth->rt_uncached);
1575         RT_CACHE_STAT_INC(in_slow_tot);
1576
1577         rth->dst.input = ip_forward;
1578         rth->dst.output = ip_output;
1579
1580         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1581         skb_dst_set(skb, &rth->dst);
1582 out:
1583         err = 0;
1584  cleanup:
1585         return err;
1586 }
1587
1588 static int ip_mkroute_input(struct sk_buff *skb,
1589                             struct fib_result *res,
1590                             const struct flowi4 *fl4,
1591                             struct in_device *in_dev,
1592                             __be32 daddr, __be32 saddr, u32 tos)
1593 {
1594 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1595         if (res->fi && res->fi->fib_nhs > 1)
1596                 fib_select_multipath(res);
1597 #endif
1598
1599         /* create a routing cache entry */
1600         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1601 }
1602
1603 /*
1604  *      NOTE. We drop all packets that have local source
1605  *      addresses, because every properly looped-back packet
1606  *      must already have the correct destination attached by the output routine.
1607  *
1608  *      This approach solves two big problems:
1609  *      1. Non-simplex devices are handled properly.
1610  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1611  *      Called with rcu_read_lock().
1612  */
1613
1614 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1615                                u8 tos, struct net_device *dev)
1616 {
1617         struct fib_result res;
1618         struct in_device *in_dev = __in_dev_get_rcu(dev);
1619         struct flowi4   fl4;
1620         unsigned int    flags = 0;
1621         u32             itag = 0;
1622         struct rtable   *rth;
1623         int             err = -EINVAL;
1624         struct net    *net = dev_net(dev);
1625         bool do_cache;
1626
1627         /* IP on this device is disabled. */
1628
1629         if (!in_dev)
1630                 goto out;
1631
1632         /* Check for the weirdest martians, which cannot be detected
1633            by fib_lookup.
1634          */
1635
1636         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1637                 goto martian_source;
1638
1639         res.fi = NULL;
1640         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1641                 goto brd_input;
1642
1643         /* Accept zero addresses only for limited broadcast;
1644          * I do not even know whether to fix this or not. Waiting for complaints :-)
1645          */
1646         if (ipv4_is_zeronet(saddr))
1647                 goto martian_source;
1648
1649         if (ipv4_is_zeronet(daddr))
1650                 goto martian_destination;
1651
1652         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1653          * and calls it at most once if daddr and/or saddr are loopback addresses.
1654          */
1655         if (ipv4_is_loopback(daddr)) {
1656                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1657                         goto martian_destination;
1658         } else if (ipv4_is_loopback(saddr)) {
1659                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1660                         goto martian_source;
1661         }
1662
1663         /*
1664          *      Now we are ready to route packet.
1665          */
1666         fl4.flowi4_oif = 0;
1667         fl4.flowi4_iif = dev->ifindex;
1668         fl4.flowi4_mark = skb->mark;
1669         fl4.flowi4_tos = tos;
1670         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1671         fl4.daddr = daddr;
1672         fl4.saddr = saddr;
1673         err = fib_lookup(net, &fl4, &res);
1674         if (err != 0)
1675                 goto no_route;
1676
1677         if (res.type == RTN_BROADCAST)
1678                 goto brd_input;
1679
1680         if (res.type == RTN_LOCAL) {
1681                 err = fib_validate_source(skb, saddr, daddr, tos,
1682                                           LOOPBACK_IFINDEX,
1683                                           dev, in_dev, &itag);
1684                 if (err < 0)
1685                         goto martian_source_keep_err;
1686                 goto local_input;
1687         }
1688
1689         if (!IN_DEV_FORWARD(in_dev))
1690                 goto no_route;
1691         if (res.type != RTN_UNICAST)
1692                 goto martian_destination;
1693
1694         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1695 out:    return err;
1696
1697 brd_input:
1698         if (skb->protocol != htons(ETH_P_IP))
1699                 goto e_inval;
1700
1701         if (!ipv4_is_zeronet(saddr)) {
1702                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1703                                           in_dev, &itag);
1704                 if (err < 0)
1705                         goto martian_source_keep_err;
1706         }
1707         flags |= RTCF_BROADCAST;
1708         res.type = RTN_BROADCAST;
1709         RT_CACHE_STAT_INC(in_brd);
1710
1711 local_input:
1712         do_cache = false;
1713         if (res.fi) {
1714                 if (!itag) {
1715                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1716                         if (rt_cache_valid(rth)) {
1717                                 skb_dst_set_noref(skb, &rth->dst);
1718                                 err = 0;
1719                                 goto out;
1720                         }
1721                         do_cache = true;
1722                 }
1723         }
1724
1725         rth = rt_dst_alloc(net->loopback_dev,
1726                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1727         if (!rth)
1728                 goto e_nobufs;
1729
1730         rth->dst.input = ip_local_deliver;
1731         rth->dst.output = ip_rt_bug;
1732 #ifdef CONFIG_IP_ROUTE_CLASSID
1733         rth->dst.tclassid = itag;
1734 #endif
1735
1736         rth->rt_genid = rt_genid(net);
1737         rth->rt_flags   = flags|RTCF_LOCAL;
1738         rth->rt_type    = res.type;
1739         rth->rt_is_input = 1;
1740         rth->rt_iif     = 0;
1741         rth->rt_pmtu    = 0;
1742         rth->rt_gateway = 0;
1743         rth->rt_uses_gateway = 0;
1744         INIT_LIST_HEAD(&rth->rt_uncached);
1745         RT_CACHE_STAT_INC(in_slow_tot);
1746         if (res.type == RTN_UNREACHABLE) {
1747                 rth->dst.input = ip_error;
1748                 rth->dst.error = -err;
1749                 rth->rt_flags   &= ~RTCF_LOCAL;
1750         }
1751         if (do_cache) {
1752                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1753                         rth->dst.flags |= DST_NOCACHE;
1754                         rt_add_uncached_list(rth);
1755                 }
1756         }
1757         skb_dst_set(skb, &rth->dst);
1758         err = 0;
1759         goto out;
1760
1761 no_route:
1762         RT_CACHE_STAT_INC(in_no_route);
1763         res.type = RTN_UNREACHABLE;
1764         if (err == -ESRCH)
1765                 err = -ENETUNREACH;
1766         goto local_input;
1767
1768         /*
1769          *      Do not cache martian addresses: they should be logged (RFC1812)
1770          */
1771 martian_destination:
1772         RT_CACHE_STAT_INC(in_martian_dst);
1773 #ifdef CONFIG_IP_ROUTE_VERBOSE
1774         if (IN_DEV_LOG_MARTIANS(in_dev))
1775                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1776                                      &daddr, &saddr, dev->name);
1777 #endif
1778
1779 e_inval:
1780         err = -EINVAL;
1781         goto out;
1782
1783 e_nobufs:
1784         err = -ENOBUFS;
1785         goto out;
1786
1787 martian_source:
1788         err = -EINVAL;
1789 martian_source_keep_err:
1790         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1791         goto out;
1792 }
1793
1794 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1795                          u8 tos, struct net_device *dev)
1796 {
1797         int res;
1798
1799         rcu_read_lock();
1800
1801         /* Multicast recognition logic was moved from the route cache to here.
1802            The problem was that too many Ethernet cards have broken/missing
1803            hardware multicast filters :-( As a result, a host on a multicast
1804            network acquires a lot of useless route cache entries, e.g. from
1805            SDR messages from all over the world. Now we try to get rid of them.
1806            Really, provided the software IP multicast filter is organized
1807            reasonably (at least, hashed), it does not result in a slowdown
1808            compared with route cache reject entries.
1809            Note that multicast routers are not affected, because a
1810            route cache entry is created eventually.
1811          */
1812         if (ipv4_is_multicast(daddr)) {
1813                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1814
1815                 if (in_dev) {
1816                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1817                                                   ip_hdr(skb)->protocol);
1818                         if (our
1819 #ifdef CONFIG_IP_MROUTE
1820                                 ||
1821                             (!ipv4_is_local_multicast(daddr) &&
1822                              IN_DEV_MFORWARD(in_dev))
1823 #endif
1824                            ) {
1825                                 int res = ip_route_input_mc(skb, daddr, saddr,
1826                                                             tos, dev, our);
1827                                 rcu_read_unlock();
1828                                 return res;
1829                         }
1830                 }
1831                 rcu_read_unlock();
1832                 return -EINVAL;
1833         }
1834         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1835         rcu_read_unlock();
1836         return res;
1837 }
1838 EXPORT_SYMBOL(ip_route_input_noref);
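
/* Illustrative sketch, not part of route.c: roughly how the receive path
 * uses ip_route_input_noref().  ip_rcv_finish() in net/ipv4/ip_input.c
 * calls it with addresses taken straight from the IP header; on success
 * the skb carries a dst and is dispatched via dst_input().  The function
 * name below is hypothetical and error handling is abbreviated.
 *
 *	static int example_route_incoming_skb(struct sk_buff *skb)
 *	{
 *		const struct iphdr *iph = ip_hdr(skb);
 *		int err;
 *
 *		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *					   iph->tos, skb->dev);
 *		if (err)
 *			return err;	// e.g. martian source -> drop
 *
 *		return dst_input(skb);	// ip_local_deliver or ip_forward
 *	}
 */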
1839
1840 /* called with rcu_read_lock() */
1841 static struct rtable *__mkroute_output(const struct fib_result *res,
1842                                        const struct flowi4 *fl4, int orig_oif,
1843                                        struct net_device *dev_out,
1844                                        unsigned int flags)
1845 {
1846         struct fib_info *fi = res->fi;
1847         struct fib_nh_exception *fnhe;
1848         struct in_device *in_dev;
1849         u16 type = res->type;
1850         struct rtable *rth;
1851         bool do_cache;
1852
1853         in_dev = __in_dev_get_rcu(dev_out);
1854         if (!in_dev)
1855                 return ERR_PTR(-EINVAL);
1856
1857         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1858                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1859                         return ERR_PTR(-EINVAL);
1860
1861         if (ipv4_is_lbcast(fl4->daddr))
1862                 type = RTN_BROADCAST;
1863         else if (ipv4_is_multicast(fl4->daddr))
1864                 type = RTN_MULTICAST;
1865         else if (ipv4_is_zeronet(fl4->daddr))
1866                 return ERR_PTR(-EINVAL);
1867
1868         if (dev_out->flags & IFF_LOOPBACK)
1869                 flags |= RTCF_LOCAL;
1870
1871         do_cache = true;
1872         if (type == RTN_BROADCAST) {
1873                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1874                 fi = NULL;
1875         } else if (type == RTN_MULTICAST) {
1876                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1877                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1878                                      fl4->flowi4_proto))
1879                         flags &= ~RTCF_LOCAL;
1880                 else
1881                         do_cache = false;
1882                 /* If a multicast route does not exist, use the
1883                  * default one, but do not use the gateway in this case.
1884                  * Yes, it is a hack.
1885                  */
1886                 if (fi && res->prefixlen < 4)
1887                         fi = NULL;
1888         }
1889
1890         fnhe = NULL;
1891         do_cache &= fi != NULL;
1892         if (do_cache) {
1893                 struct rtable __rcu **prth;
1894                 struct fib_nh *nh = &FIB_RES_NH(*res);
1895
1896                 fnhe = find_exception(nh, fl4->daddr);
1897                 if (fnhe)
1898                         prth = &fnhe->fnhe_rth;
1899                 else {
1900                         if (unlikely(fl4->flowi4_flags &
1901                                      FLOWI_FLAG_KNOWN_NH &&
1902                                      !(nh->nh_gw &&
1903                                        nh->nh_scope == RT_SCOPE_LINK))) {
1904                                 do_cache = false;
1905                                 goto add;
1906                         }
1907                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1908                 }
1909                 rth = rcu_dereference(*prth);
1910                 if (rt_cache_valid(rth)) {
1911                         dst_hold(&rth->dst);
1912                         return rth;
1913                 }
1914         }
1915
1916 add:
1917         rth = rt_dst_alloc(dev_out,
1918                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1919                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1920                            do_cache);
1921         if (!rth)
1922                 return ERR_PTR(-ENOBUFS);
1923
1924         rth->dst.output = ip_output;
1925
1926         rth->rt_genid = rt_genid(dev_net(dev_out));
1927         rth->rt_flags   = flags;
1928         rth->rt_type    = type;
1929         rth->rt_is_input = 0;
1930         rth->rt_iif     = orig_oif ? : 0;
1931         rth->rt_pmtu    = 0;
1932         rth->rt_gateway = 0;
1933         rth->rt_uses_gateway = 0;
1934         INIT_LIST_HEAD(&rth->rt_uncached);
1935
1936         RT_CACHE_STAT_INC(out_slow_tot);
1937
1938         if (flags & RTCF_LOCAL)
1939                 rth->dst.input = ip_local_deliver;
1940         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1941                 if (flags & RTCF_LOCAL &&
1942                     !(dev_out->flags & IFF_LOOPBACK)) {
1943                         rth->dst.output = ip_mc_output;
1944                         RT_CACHE_STAT_INC(out_slow_mc);
1945                 }
1946 #ifdef CONFIG_IP_MROUTE
1947                 if (type == RTN_MULTICAST) {
1948                         if (IN_DEV_MFORWARD(in_dev) &&
1949                             !ipv4_is_local_multicast(fl4->daddr)) {
1950                                 rth->dst.input = ip_mr_input;
1951                                 rth->dst.output = ip_mc_output;
1952                         }
1953                 }
1954 #endif
1955         }
1956
1957         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1958
1959         return rth;
1960 }
1961
1962 /*
1963  * Major route resolver routine.
1964  */
1965
1966 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1967 {
1968         struct net_device *dev_out = NULL;
1969         __u8 tos = RT_FL_TOS(fl4);
1970         unsigned int flags = 0;
1971         struct fib_result res;
1972         struct rtable *rth;
1973         int orig_oif;
1974
1975         res.tclassid    = 0;
1976         res.fi          = NULL;
1977         res.table       = NULL;
1978
1979         orig_oif = fl4->flowi4_oif;
1980
1981         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1982         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1983         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1984                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1985
1986         rcu_read_lock();
1987         if (fl4->saddr) {
1988                 rth = ERR_PTR(-EINVAL);
1989                 if (ipv4_is_multicast(fl4->saddr) ||
1990                     ipv4_is_lbcast(fl4->saddr) ||
1991                     ipv4_is_zeronet(fl4->saddr))
1992                         goto out;
1993
1994                 /* I removed the check for oif == dev_out->oif here.
1995                    It was wrong for two reasons:
1996                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
1997                       is assigned to multiple interfaces.
1998                    2. Moreover, we are allowed to send packets with the saddr
1999                       of another iface. --ANK
2000                  */
2001
2002                 if (fl4->flowi4_oif == 0 &&
2003                     (ipv4_is_multicast(fl4->daddr) ||
2004                      ipv4_is_lbcast(fl4->daddr))) {
2005                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2006                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2007                         if (dev_out == NULL)
2008                                 goto out;
2009
2010                         /* Special hack: the user can direct multicasts
2011                            and limited broadcasts via the necessary interface
2012                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2013                            This hack is not just for fun, it allows
2014                            vic, vat and friends to work.
2015                            They bind a socket to loopback, set the ttl to zero
2016                            and expect that it will work.
2017                            From the viewpoint of the routing cache they are broken,
2018                            because we are not allowed to build a multicast path
2019                            with a loopback source addr (the routing cache
2020                            cannot know that the ttl is zero, so the packet
2021                            will not leave this host and the route is valid).
2022                            Luckily, this hack is a good workaround.
2023                          */
2024
2025                         fl4->flowi4_oif = dev_out->ifindex;
2026                         goto make_route;
2027                 }
2028
2029                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2030                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2031                         if (!__ip_dev_find(net, fl4->saddr, false))
2032                                 goto out;
2033                 }
2034         }
2035
2036
2037         if (fl4->flowi4_oif) {
2038                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2039                 rth = ERR_PTR(-ENODEV);
2040                 if (dev_out == NULL)
2041                         goto out;
2042
2043                 /* RACE: Check return value of inet_select_addr instead. */
2044                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2045                         rth = ERR_PTR(-ENETUNREACH);
2046                         goto out;
2047                 }
2048                 if (ipv4_is_local_multicast(fl4->daddr) ||
2049                     ipv4_is_lbcast(fl4->daddr)) {
2050                         if (!fl4->saddr)
2051                                 fl4->saddr = inet_select_addr(dev_out, 0,
2052                                                               RT_SCOPE_LINK);
2053                         goto make_route;
2054                 }
2055                 if (!fl4->saddr) {
2056                         if (ipv4_is_multicast(fl4->daddr))
2057                                 fl4->saddr = inet_select_addr(dev_out, 0,
2058                                                               fl4->flowi4_scope);
2059                         else if (!fl4->daddr)
2060                                 fl4->saddr = inet_select_addr(dev_out, 0,
2061                                                               RT_SCOPE_HOST);
2062                 }
2063         }
2064
2065         if (!fl4->daddr) {
2066                 fl4->daddr = fl4->saddr;
2067                 if (!fl4->daddr)
2068                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2069                 dev_out = net->loopback_dev;
2070                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2071                 res.type = RTN_LOCAL;
2072                 flags |= RTCF_LOCAL;
2073                 goto make_route;
2074         }
2075
2076         if (fib_lookup(net, fl4, &res)) {
2077                 res.fi = NULL;
2078                 res.table = NULL;
2079                 if (fl4->flowi4_oif) {
2080                         /* Apparently, the routing tables are wrong. Assume
2081                            that the destination is on-link.
2082
2083                            WHY? DW.
2084                            Because we are allowed to send to an iface
2085                            even if it has NO routes and NO assigned
2086                            addresses. When oif is specified, the routing
2087                            tables are looked up with only one purpose:
2088                            to catch whether the destination is gatewayed rather
2089                            than direct. Moreover, if MSG_DONTROUTE is set,
2090                            we send the packet, ignoring both the routing tables
2091                            and the ifaddr state. --ANK
2092
2093
2094                            We could do this even when oif is unknown,
2095                            as IPv6 likely does, but we do not.
2096                          */
2097
2098                         if (fl4->saddr == 0)
2099                                 fl4->saddr = inet_select_addr(dev_out, 0,
2100                                                               RT_SCOPE_LINK);
2101                         res.type = RTN_UNICAST;
2102                         goto make_route;
2103                 }
2104                 rth = ERR_PTR(-ENETUNREACH);
2105                 goto out;
2106         }
2107
2108         if (res.type == RTN_LOCAL) {
2109                 if (!fl4->saddr) {
2110                         if (res.fi->fib_prefsrc)
2111                                 fl4->saddr = res.fi->fib_prefsrc;
2112                         else
2113                                 fl4->saddr = fl4->daddr;
2114                 }
2115                 dev_out = net->loopback_dev;
2116                 fl4->flowi4_oif = dev_out->ifindex;
2117                 flags |= RTCF_LOCAL;
2118                 goto make_route;
2119         }
2120
2121 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2122         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2123                 fib_select_multipath(&res);
2124         else
2125 #endif
2126         if (!res.prefixlen &&
2127             res.table->tb_num_default > 1 &&
2128             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2129                 fib_select_default(&res);
2130
2131         if (!fl4->saddr)
2132                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2133
2134         dev_out = FIB_RES_DEV(res);
2135         fl4->flowi4_oif = dev_out->ifindex;
2136
2137
2138 make_route:
2139         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2140
2141 out:
2142         rcu_read_unlock();
2143         return rth;
2144 }
2145 EXPORT_SYMBOL_GPL(__ip_route_output_key);
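
/* Illustrative sketch, not part of route.c: a minimal output-route lookup
 * through __ip_route_output_key().  Real callers normally use the
 * ip_route_output*() wrappers in include/net/route.h, which build the
 * flowi4 similarly.  The function name below is hypothetical.
 *
 *	static int example_lookup_output_route(struct net *net, __be32 daddr)
 *	{
 *		struct flowi4 fl4;
 *		struct rtable *rt;
 *
 *		memset(&fl4, 0, sizeof(fl4));
 *		fl4.daddr = daddr;		// destination to resolve
 *
 *		rt = __ip_route_output_key(net, &fl4);
 *		if (IS_ERR(rt))
 *			return PTR_ERR(rt);	// e.g. -ENETUNREACH
 *
 *		// ... use rt->dst.dev, the now-filled fl4.saddr, etc. ...
 *
 *		ip_rt_put(rt);			// drop the lookup's reference
 *		return 0;
 *	}
 */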
2146
2147 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2148 {
2149         return NULL;
2150 }
2151
2152 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2153 {
2154         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2155
2156         return mtu ? : dst->dev->mtu;
2157 }
2158
2159 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2160                                           struct sk_buff *skb, u32 mtu)
2161 {
2162 }
2163
2164 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2165                                        struct sk_buff *skb)
2166 {
2167 }
2168
2169 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2170                                           unsigned long old)
2171 {
2172         return NULL;
2173 }
2174
2175 static struct dst_ops ipv4_dst_blackhole_ops = {
2176         .family                 =       AF_INET,
2177         .protocol               =       cpu_to_be16(ETH_P_IP),
2178         .check                  =       ipv4_blackhole_dst_check,
2179         .mtu                    =       ipv4_blackhole_mtu,
2180         .default_advmss         =       ipv4_default_advmss,
2181         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2182         .redirect               =       ipv4_rt_blackhole_redirect,
2183         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2184         .neigh_lookup           =       ipv4_neigh_lookup,
2185 };
2186
2187 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2188 {
2189         struct rtable *ort = (struct rtable *) dst_orig;
2190         struct rtable *rt;
2191
2192         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2193         if (rt) {
2194                 struct dst_entry *new = &rt->dst;
2195
2196                 new->__use = 1;
2197                 new->input = dst_discard;
2198                 new->output = dst_discard;
2199
2200                 new->dev = ort->dst.dev;
2201                 if (new->dev)
2202                         dev_hold(new->dev);
2203
2204                 rt->rt_is_input = ort->rt_is_input;
2205                 rt->rt_iif = ort->rt_iif;
2206                 rt->rt_pmtu = ort->rt_pmtu;
2207
2208                 rt->rt_genid = rt_genid(net);
2209                 rt->rt_flags = ort->rt_flags;
2210                 rt->rt_type = ort->rt_type;
2211                 rt->rt_gateway = ort->rt_gateway;
2212                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2213
2214                 INIT_LIST_HEAD(&rt->rt_uncached);
2215
2216                 dst_free(new);
2217         }
2218
2219         dst_release(dst_orig);
2220
2221         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2222 }
2223
2224 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2225                                     struct sock *sk)
2226 {
2227         struct rtable *rt = __ip_route_output_key(net, flp4);
2228
2229         if (IS_ERR(rt))
2230                 return rt;
2231
2232         if (flp4->flowi4_proto)
2233                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2234                                                    flowi4_to_flowi(flp4),
2235                                                    sk, 0);
2236
2237         return rt;
2238 }
2239 EXPORT_SYMBOL_GPL(ip_route_output_flow);
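
/* Illustrative sketch, not part of route.c: how a connected socket might
 * resolve its route via ip_route_output_flow(); setting flowi4_proto is
 * what triggers the xfrm_lookup() pass above.  Real callers typically go
 * through the ip_route_connect()/ip_route_output_ports() wrappers in
 * include/net/route.h.  The function name below is hypothetical.
 *
 *	static struct rtable *example_route_for_sock(struct net *net,
 *						     struct sock *sk,
 *						     __be32 daddr)
 *	{
 *		struct flowi4 fl4;
 *
 *		memset(&fl4, 0, sizeof(fl4));
 *		fl4.daddr = daddr;
 *		fl4.flowi4_oif = sk->sk_bound_dev_if;	// 0 if unbound
 *		fl4.flowi4_proto = sk->sk_protocol;	// enables the xfrm pass
 *
 *		return ip_route_output_flow(net, &fl4, sk);
 *	}
 */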
2240
2241 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2242                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2243                         u32 seq, int event, int nowait, unsigned int flags)
2244 {
2245         struct rtable *rt = skb_rtable(skb);
2246         struct rtmsg *r;
2247         struct nlmsghdr *nlh;
2248         unsigned long expires = 0;
2249         u32 error;
2250         u32 metrics[RTAX_MAX];
2251
2252         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2253         if (nlh == NULL)
2254                 return -EMSGSIZE;
2255
2256         r = nlmsg_data(nlh);
2257         r->rtm_family    = AF_INET;
2258         r->rtm_dst_len  = 32;
2259         r->rtm_src_len  = 0;
2260         r->rtm_tos      = fl4->flowi4_tos;
2261         r->rtm_table    = RT_TABLE_MAIN;
2262         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2263                 goto nla_put_failure;
2264         r->rtm_type     = rt->rt_type;
2265         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2266         r->rtm_protocol = RTPROT_UNSPEC;
2267         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2268         if (rt->rt_flags & RTCF_NOTIFY)
2269                 r->rtm_flags |= RTM_F_NOTIFY;
2270         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2271                 r->rtm_flags |= RTCF_DOREDIRECT;
2272
2273         if (nla_put_be32(skb, RTA_DST, dst))
2274                 goto nla_put_failure;
2275         if (src) {
2276                 r->rtm_src_len = 32;
2277                 if (nla_put_be32(skb, RTA_SRC, src))
2278                         goto nla_put_failure;
2279         }
2280         if (rt->dst.dev &&
2281             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2282                 goto nla_put_failure;
2283 #ifdef CONFIG_IP_ROUTE_CLASSID
2284         if (rt->dst.tclassid &&
2285             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2286                 goto nla_put_failure;
2287 #endif
2288         if (!rt_is_input_route(rt) &&
2289             fl4->saddr != src) {
2290                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2291                         goto nla_put_failure;
2292         }
2293         if (rt->rt_uses_gateway &&
2294             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2295                 goto nla_put_failure;
2296
2297         expires = rt->dst.expires;
2298         if (expires) {
2299                 unsigned long now = jiffies;
2300
2301                 if (time_before(now, expires))
2302                         expires -= now;
2303                 else
2304                         expires = 0;
2305         }
2306
2307         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2308         if (rt->rt_pmtu && expires)
2309                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2310         if (rtnetlink_put_metrics(skb, metrics) < 0)
2311                 goto nla_put_failure;
2312
2313         if (fl4->flowi4_mark &&
2314             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2315                 goto nla_put_failure;
2316
2317         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2318             nla_put_u32(skb, RTA_UID,
2319                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2320                 goto nla_put_failure;
2321
2322         error = rt->dst.error;
2323
2324         if (rt_is_input_route(rt)) {
2325 #ifdef CONFIG_IP_MROUTE
2326                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2327                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2328                         int err = ipmr_get_route(net, skb,
2329                                                  fl4->saddr, fl4->daddr,
2330                                                  r, nowait);
2331                         if (err <= 0) {
2332                                 if (!nowait) {
2333                                         if (err == 0)
2334                                                 return 0;
2335                                         goto nla_put_failure;
2336                                 } else {
2337                                         if (err == -EMSGSIZE)
2338                                                 goto nla_put_failure;
2339                                         error = err;
2340                                 }
2341                         }
2342                 } else
2343 #endif
2344                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2345                                 goto nla_put_failure;
2346         }
2347
2348         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2349                 goto nla_put_failure;
2350
2351         return nlmsg_end(skb, nlh);
2352
2353 nla_put_failure:
2354         nlmsg_cancel(skb, nlh);
2355         return -EMSGSIZE;
2356 }
2357
2358 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2359 {
2360         struct net *net = sock_net(in_skb->sk);
2361         struct rtmsg *rtm;
2362         struct nlattr *tb[RTA_MAX+1];
2363         struct rtable *rt = NULL;
2364         struct flowi4 fl4;
2365         __be32 dst = 0;
2366         __be32 src = 0;
2367         u32 iif;
2368         int err;
2369         int mark;
2370         struct sk_buff *skb;
2371         kuid_t uid;
2372
2373         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2374         if (err < 0)
2375                 goto errout;
2376
2377         rtm = nlmsg_data(nlh);
2378
2379         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2380         if (skb == NULL) {
2381                 err = -ENOBUFS;
2382                 goto errout;
2383         }
2384
2385         /* Reserve room for dummy headers; this skb can pass
2386            through a good chunk of the routing engine.
2387          */
2388         skb_reset_mac_header(skb);
2389         skb_reset_network_header(skb);
2390
2391         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2392         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2393         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2394
2395         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2396         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2397         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2398         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2399         if (tb[RTA_UID])
2400                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2401         else
2402                 uid = (iif ? INVALID_UID : current_uid());
2403
2404         memset(&fl4, 0, sizeof(fl4));
2405         fl4.daddr = dst;
2406         fl4.saddr = src;
2407         fl4.flowi4_tos = rtm->rtm_tos;
2408         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2409         fl4.flowi4_mark = mark;
2410         fl4.flowi4_uid = uid;
2411
2412         if (iif) {
2413                 struct net_device *dev;
2414
2415                 dev = __dev_get_by_index(net, iif);
2416                 if (dev == NULL) {
2417                         err = -ENODEV;
2418                         goto errout_free;
2419                 }
2420
2421                 skb->protocol   = htons(ETH_P_IP);
2422                 skb->dev        = dev;
2423                 skb->mark       = mark;
2424                 local_bh_disable();
2425                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2426                 local_bh_enable();
2427
2428                 rt = skb_rtable(skb);
2429                 if (err == 0 && rt->dst.error)
2430                         err = -rt->dst.error;
2431         } else {
2432                 rt = ip_route_output_key(net, &fl4);
2433
2434                 err = 0;
2435                 if (IS_ERR(rt))
2436                         err = PTR_ERR(rt);
2437         }
2438
2439         if (err)
2440                 goto errout_free;
2441
2442         skb_dst_set(skb, &rt->dst);
2443         if (rtm->rtm_flags & RTM_F_NOTIFY)
2444                 rt->rt_flags |= RTCF_NOTIFY;
2445
2446         err = rt_fill_info(net, dst, src, &fl4, skb,
2447                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2448                            RTM_NEWROUTE, 0, 0);
2449         if (err <= 0)
2450                 goto errout_free;
2451
2452         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2453 errout:
2454         return err;
2455
2456 errout_free:
2457         kfree_skb(skb);
2458         goto errout;
2459 }
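
/* Illustrative note, not part of route.c: inet_rtm_getroute() is the
 * kernel side of "ip route get".  Userspace sends an RTM_GETROUTE request
 * carrying RTA_DST (and optionally RTA_SRC, RTA_IIF, RTA_OIF, RTA_MARK,
 * RTA_UID) and receives a single RTM_NEWROUTE message built by
 * rt_fill_info().  Roughly:
 *
 *	$ ip route get 192.0.2.1
 *	192.0.2.1 via 198.51.100.1 dev eth0 src 198.51.100.23
 *
 * The exact output depends on the local tables; the addresses above are
 * RFC 5737 documentation examples.
 */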
2460
2461 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2462 {
2463         return skb->len;
2464 }
2465
2466 void ip_rt_multicast_event(struct in_device *in_dev)
2467 {
2468         rt_cache_flush(dev_net(in_dev->dev));
2469 }
2470
2471 #ifdef CONFIG_SYSCTL
2472 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2473 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2474 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2475 static int ip_rt_gc_elasticity __read_mostly    = 8;
2476
2477 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2478                                         void __user *buffer,
2479                                         size_t *lenp, loff_t *ppos)
2480 {
2481         if (write) {
2482                 rt_cache_flush((struct net *)__ctl->extra1);
2483                 return 0;
2484         }
2485
2486         return -EINVAL;
2487 }
2488
2489 static ctl_table ipv4_route_table[] = {
2490         {
2491                 .procname       = "gc_thresh",
2492                 .data           = &ipv4_dst_ops.gc_thresh,
2493                 .maxlen         = sizeof(int),
2494                 .mode           = 0644,
2495                 .proc_handler   = proc_dointvec,
2496         },
2497         {
2498                 .procname       = "max_size",
2499                 .data           = &ip_rt_max_size,
2500                 .maxlen         = sizeof(int),
2501                 .mode           = 0644,
2502                 .proc_handler   = proc_dointvec,
2503         },
2504         {
2505                 /*  Deprecated. Use gc_min_interval_ms */
2506
2507                 .procname       = "gc_min_interval",
2508                 .data           = &ip_rt_gc_min_interval,
2509                 .maxlen         = sizeof(int),
2510                 .mode           = 0644,
2511                 .proc_handler   = proc_dointvec_jiffies,
2512         },
2513         {
2514                 .procname       = "gc_min_interval_ms",
2515                 .data           = &ip_rt_gc_min_interval,
2516                 .maxlen         = sizeof(int),
2517                 .mode           = 0644,
2518                 .proc_handler   = proc_dointvec_ms_jiffies,
2519         },
2520         {
2521                 .procname       = "gc_timeout",
2522                 .data           = &ip_rt_gc_timeout,
2523                 .maxlen         = sizeof(int),
2524                 .mode           = 0644,
2525                 .proc_handler   = proc_dointvec_jiffies,
2526         },
2527         {
2528                 .procname       = "gc_interval",
2529                 .data           = &ip_rt_gc_interval,
2530                 .maxlen         = sizeof(int),
2531                 .mode           = 0644,
2532                 .proc_handler   = proc_dointvec_jiffies,
2533         },
2534         {
2535                 .procname       = "redirect_load",
2536                 .data           = &ip_rt_redirect_load,
2537                 .maxlen         = sizeof(int),
2538                 .mode           = 0644,
2539                 .proc_handler   = proc_dointvec,
2540         },
2541         {
2542                 .procname       = "redirect_number",
2543                 .data           = &ip_rt_redirect_number,
2544                 .maxlen         = sizeof(int),
2545                 .mode           = 0644,
2546                 .proc_handler   = proc_dointvec,
2547         },
2548         {
2549                 .procname       = "redirect_silence",
2550                 .data           = &ip_rt_redirect_silence,
2551                 .maxlen         = sizeof(int),
2552                 .mode           = 0644,
2553                 .proc_handler   = proc_dointvec,
2554         },
2555         {
2556                 .procname       = "error_cost",
2557                 .data           = &ip_rt_error_cost,
2558                 .maxlen         = sizeof(int),
2559                 .mode           = 0644,
2560                 .proc_handler   = proc_dointvec,
2561         },
2562         {
2563                 .procname       = "error_burst",
2564                 .data           = &ip_rt_error_burst,
2565                 .maxlen         = sizeof(int),
2566                 .mode           = 0644,
2567                 .proc_handler   = proc_dointvec,
2568         },
2569         {
2570                 .procname       = "gc_elasticity",
2571                 .data           = &ip_rt_gc_elasticity,
2572                 .maxlen         = sizeof(int),
2573                 .mode           = 0644,
2574                 .proc_handler   = proc_dointvec,
2575         },
2576         {
2577                 .procname       = "mtu_expires",
2578                 .data           = &ip_rt_mtu_expires,
2579                 .maxlen         = sizeof(int),
2580                 .mode           = 0644,
2581                 .proc_handler   = proc_dointvec_jiffies,
2582         },
2583         {
2584                 .procname       = "min_pmtu",
2585                 .data           = &ip_rt_min_pmtu,
2586                 .maxlen         = sizeof(int),
2587                 .mode           = 0644,
2588                 .proc_handler   = proc_dointvec,
2589         },
2590         {
2591                 .procname       = "min_adv_mss",
2592                 .data           = &ip_rt_min_advmss,
2593                 .maxlen         = sizeof(int),
2594                 .mode           = 0644,
2595                 .proc_handler   = proc_dointvec,
2596         },
2597         { }
2598 };
2599
2600 static struct ctl_table ipv4_route_flush_table[] = {
2601         {
2602                 .procname       = "flush",
2603                 .maxlen         = sizeof(int),
2604                 .mode           = 0200,
2605                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2606         },
2607         { },
2608 };
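
/* Illustrative note, not part of route.c: both tables register under
 * /proc/sys/net/ipv4/route/.  Writing anything to the per-namespace
 * "flush" entry invokes ipv4_sysctl_rtcache_flush(), which bumps the
 * routing genid via rt_cache_flush():
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * The tunables in ipv4_route_table are plain integers, e.g.:
 *
 *	# cat /proc/sys/net/ipv4/route/gc_thresh
 */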
2609
2610 static __net_init int sysctl_route_net_init(struct net *net)
2611 {
2612         struct ctl_table *tbl;
2613
2614         tbl = ipv4_route_flush_table;
2615         if (!net_eq(net, &init_net)) {
2616                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2617                 if (tbl == NULL)
2618                         goto err_dup;
2619
2620                 /* Don't export sysctls to unprivileged users */
2621                 if (net->user_ns != &init_user_ns)
2622                         tbl[0].procname = NULL;
2623         }
2624         tbl[0].extra1 = net;
2625
2626         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2627         if (net->ipv4.route_hdr == NULL)
2628                 goto err_reg;
2629         return 0;
2630
2631 err_reg:
2632         if (tbl != ipv4_route_flush_table)
2633                 kfree(tbl);
2634 err_dup:
2635         return -ENOMEM;
2636 }
2637
2638 static __net_exit void sysctl_route_net_exit(struct net *net)
2639 {
2640         struct ctl_table *tbl;
2641
2642         tbl = net->ipv4.route_hdr->ctl_table_arg;
2643         unregister_net_sysctl_table(net->ipv4.route_hdr);
2644         BUG_ON(tbl == ipv4_route_flush_table);
2645         kfree(tbl);
2646 }
2647
2648 static __net_initdata struct pernet_operations sysctl_route_ops = {
2649         .init = sysctl_route_net_init,
2650         .exit = sysctl_route_net_exit,
2651 };
2652 #endif
2653
2654 static __net_init int rt_genid_init(struct net *net)
2655 {
2656         atomic_set(&net->rt_genid, 0);
2657         get_random_bytes(&net->ipv4.dev_addr_genid,
2658                          sizeof(net->ipv4.dev_addr_genid));
2659         return 0;
2660 }
2661
2662 static __net_initdata struct pernet_operations rt_genid_ops = {
2663         .init = rt_genid_init,
2664 };
2665
2666 static int __net_init ipv4_inetpeer_init(struct net *net)
2667 {
2668         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2669
2670         if (!bp)
2671                 return -ENOMEM;
2672         inet_peer_base_init(bp);
2673         net->ipv4.peers = bp;
2674         return 0;
2675 }
2676
2677 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2678 {
2679         struct inet_peer_base *bp = net->ipv4.peers;
2680
2681         net->ipv4.peers = NULL;
2682         inetpeer_invalidate_tree(bp);
2683         kfree(bp);
2684 }
2685
2686 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2687         .init   =       ipv4_inetpeer_init,
2688         .exit   =       ipv4_inetpeer_exit,
2689 };
2690
2691 #ifdef CONFIG_IP_ROUTE_CLASSID
2692 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2693 #endif /* CONFIG_IP_ROUTE_CLASSID */
2694
2695 int __init ip_rt_init(void)
2696 {
2697         int rc = 0;
2698
2699         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2700         if (!ip_idents)
2701                 panic("IP: failed to allocate ip_idents\n");
2702
2703         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2704
2705 #ifdef CONFIG_IP_ROUTE_CLASSID
2706         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2707         if (!ip_rt_acct)
2708                 panic("IP: failed to allocate ip_rt_acct\n");
2709 #endif
2710
2711         ipv4_dst_ops.kmem_cachep =
2712                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2713                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2714
2715         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2716
2717         if (dst_entries_init(&ipv4_dst_ops) < 0)
2718                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2719
2720         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2721                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2722
2723         ipv4_dst_ops.gc_thresh = ~0;
2724         ip_rt_max_size = INT_MAX;
2725
2726         devinet_init();
2727         ip_fib_init();
2728
2729         if (ip_rt_proc_init())
2730                 pr_err("Unable to create route proc files\n");
2731 #ifdef CONFIG_XFRM
2732         xfrm_init();
2733         xfrm4_init();
2734 #endif
2735         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2736
2737 #ifdef CONFIG_SYSCTL
2738         register_pernet_subsys(&sysctl_route_ops);
2739 #endif
2740         register_pernet_subsys(&rt_genid_ops);
2741         register_pernet_subsys(&ipv4_inetpeer_ops);
2742         return rc;
2743 }
2744
2745 #ifdef CONFIG_SYSCTL
2746 /*
2747  * We really need to sanitize the damn ipv4 init order, then all
2748  * this nonsense will go away.
2749  */
2750 void __init ip_static_sysctl_init(void)
2751 {
2752         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2753 }
2754 #endif