ipv4: Avoid crashing in ip_error
[firefly-linux-kernel-4.4.55.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD;
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #include <linux/kmemleak.h>
110 #endif
111 #include <net/secure_seq.h>
112
113 #define RT_FL_TOS(oldflp4) \
114         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly  = 9;
122 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
123 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly       = HZ;
125 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
126 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
127 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly       = 256;
129
130 /*
131  *      Interface to generic destination cache.
132  */
133
134 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
135 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
136 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
137 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
138 static void              ipv4_link_failure(struct sk_buff *skb);
139 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
140                                            struct sk_buff *skb, u32 mtu);
141 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
142                                         struct sk_buff *skb);
143 static void             ipv4_dst_destroy(struct dst_entry *dst);
144
145 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
146                             int how)
147 {
148 }
149
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152         WARN_ON(1);
153         return NULL;
154 }
155
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157                                            struct sk_buff *skb,
158                                            const void *daddr);
159
160 static struct dst_ops ipv4_dst_ops = {
161         .family =               AF_INET,
162         .protocol =             cpu_to_be16(ETH_P_IP),
163         .check =                ipv4_dst_check,
164         .default_advmss =       ipv4_default_advmss,
165         .mtu =                  ipv4_mtu,
166         .cow_metrics =          ipv4_cow_metrics,
167         .destroy =              ipv4_dst_destroy,
168         .ifdown =               ipv4_dst_ifdown,
169         .negative_advice =      ipv4_negative_advice,
170         .link_failure =         ipv4_link_failure,
171         .update_pmtu =          ip_rt_update_pmtu,
172         .redirect =             ip_do_redirect,
173         .local_out =            __ip_local_out,
174         .neigh_lookup =         ipv4_neigh_lookup,
175 };
176
177 #define ECN_OR_COST(class)      TC_PRIO_##class
178
179 const __u8 ip_tos2prio[16] = {
180         TC_PRIO_BESTEFFORT,
181         ECN_OR_COST(BESTEFFORT),
182         TC_PRIO_BESTEFFORT,
183         ECN_OR_COST(BESTEFFORT),
184         TC_PRIO_BULK,
185         ECN_OR_COST(BULK),
186         TC_PRIO_BULK,
187         ECN_OR_COST(BULK),
188         TC_PRIO_INTERACTIVE,
189         ECN_OR_COST(INTERACTIVE),
190         TC_PRIO_INTERACTIVE,
191         ECN_OR_COST(INTERACTIVE),
192         TC_PRIO_INTERACTIVE_BULK,
193         ECN_OR_COST(INTERACTIVE_BULK),
194         TC_PRIO_INTERACTIVE_BULK,
195         ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
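
/* Illustrative sketch (not part of the original file): callers index this
 * table through rt_tos2priority() in include/net/route.h, which masks the
 * TOS byte and drops the low bit:
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * e.g. tos = 0x10 (IPTOS_LOWDELAY) gives index 8, TC_PRIO_INTERACTIVE,
 * and tos = 0x08 (IPTOS_THROUGHPUT) gives index 4, TC_PRIO_BULK.
 */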
198
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
201
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205         if (*pos)
206                 return NULL;
207         return SEQ_START_TOKEN;
208 }
209
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212         ++*pos;
213         return NULL;
214 }
215
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222         if (v == SEQ_START_TOKEN)
223                 seq_printf(seq, "%-127s\n",
224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226                            "HHUptod\tSpecDst");
227         return 0;
228 }
229
230 static const struct seq_operations rt_cache_seq_ops = {
231         .start  = rt_cache_seq_start,
232         .next   = rt_cache_seq_next,
233         .stop   = rt_cache_seq_stop,
234         .show   = rt_cache_seq_show,
235 };
236
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239         return seq_open(file, &rt_cache_seq_ops);
240 }
241
242 static const struct file_operations rt_cache_seq_fops = {
243         .owner   = THIS_MODULE,
244         .open    = rt_cache_seq_open,
245         .read    = seq_read,
246         .llseek  = seq_lseek,
247         .release = seq_release,
248 };
249
250
251 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
252 {
253         int cpu;
254
255         if (*pos == 0)
256                 return SEQ_START_TOKEN;
257
258         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
259                 if (!cpu_possible(cpu))
260                         continue;
261                 *pos = cpu+1;
262                 return &per_cpu(rt_cache_stat, cpu);
263         }
264         return NULL;
265 }
266
267 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268 {
269         int cpu;
270
271         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
272                 if (!cpu_possible(cpu))
273                         continue;
274                 *pos = cpu+1;
275                 return &per_cpu(rt_cache_stat, cpu);
276         }
277         return NULL;
278
279 }
280
281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282 {
283
284 }
285
286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 {
288         struct rt_cache_stat *st = v;
289
290         if (v == SEQ_START_TOKEN) {
291                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292                 return 0;
293         }
294
295         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
296                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297                    dst_entries_get_slow(&ipv4_dst_ops),
298                    st->in_hit,
299                    st->in_slow_tot,
300                    st->in_slow_mc,
301                    st->in_no_route,
302                    st->in_brd,
303                    st->in_martian_dst,
304                    st->in_martian_src,
305
306                    st->out_hit,
307                    st->out_slow_tot,
308                    st->out_slow_mc,
309
310                    st->gc_total,
311                    st->gc_ignored,
312                    st->gc_goal_miss,
313                    st->gc_dst_overflow,
314                    st->in_hlist_search,
315                    st->out_hlist_search
316                 );
317         return 0;
318 }
319
320 static const struct seq_operations rt_cpu_seq_ops = {
321         .start  = rt_cpu_seq_start,
322         .next   = rt_cpu_seq_next,
323         .stop   = rt_cpu_seq_stop,
324         .show   = rt_cpu_seq_show,
325 };
326
327
328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 {
330         return seq_open(file, &rt_cpu_seq_ops);
331 }
332
333 static const struct file_operations rt_cpu_seq_fops = {
334         .owner   = THIS_MODULE,
335         .open    = rt_cpu_seq_open,
336         .read    = seq_read,
337         .llseek  = seq_lseek,
338         .release = seq_release,
339 };
340
341 #ifdef CONFIG_IP_ROUTE_CLASSID
342 static int rt_acct_proc_show(struct seq_file *m, void *v)
343 {
344         struct ip_rt_acct *dst, *src;
345         unsigned int i, j;
346
347         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
348         if (!dst)
349                 return -ENOMEM;
350
351         for_each_possible_cpu(i) {
352                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
353                 for (j = 0; j < 256; j++) {
354                         dst[j].o_bytes   += src[j].o_bytes;
355                         dst[j].o_packets += src[j].o_packets;
356                         dst[j].i_bytes   += src[j].i_bytes;
357                         dst[j].i_packets += src[j].i_packets;
358                 }
359         }
360
361         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
362         kfree(dst);
363         return 0;
364 }
365
366 static int rt_acct_proc_open(struct inode *inode, struct file *file)
367 {
368         return single_open(file, rt_acct_proc_show, NULL);
369 }
370
371 static const struct file_operations rt_acct_proc_fops = {
372         .owner          = THIS_MODULE,
373         .open           = rt_acct_proc_open,
374         .read           = seq_read,
375         .llseek         = seq_lseek,
376         .release        = single_release,
377 };
378 #endif
379
380 static int __net_init ip_rt_do_proc_init(struct net *net)
381 {
382         struct proc_dir_entry *pde;
383
384         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
385                           &rt_cache_seq_fops);
386         if (!pde)
387                 goto err1;
388
389         pde = proc_create("rt_cache", S_IRUGO,
390                           net->proc_net_stat, &rt_cpu_seq_fops);
391         if (!pde)
392                 goto err2;
393
394 #ifdef CONFIG_IP_ROUTE_CLASSID
395         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
396         if (!pde)
397                 goto err3;
398 #endif
399         return 0;
400
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 err3:
403         remove_proc_entry("rt_cache", net->proc_net_stat);
404 #endif
405 err2:
406         remove_proc_entry("rt_cache", net->proc_net);
407 err1:
408         return -ENOMEM;
409 }
410
411 static void __net_exit ip_rt_do_proc_exit(struct net *net)
412 {
413         remove_proc_entry("rt_cache", net->proc_net_stat);
414         remove_proc_entry("rt_cache", net->proc_net);
415 #ifdef CONFIG_IP_ROUTE_CLASSID
416         remove_proc_entry("rt_acct", net->proc_net);
417 #endif
418 }
419
420 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
421         .init = ip_rt_do_proc_init,
422         .exit = ip_rt_do_proc_exit,
423 };
424
425 static int __init ip_rt_proc_init(void)
426 {
427         return register_pernet_subsys(&ip_rt_proc_ops);
428 }
429
430 #else
431 static inline int ip_rt_proc_init(void)
432 {
433         return 0;
434 }
435 #endif /* CONFIG_PROC_FS */
436
437 static inline bool rt_is_expired(const struct rtable *rth)
438 {
439         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
440 }
441
442 void rt_cache_flush(struct net *net)
443 {
444         rt_genid_bump(net);
445 }
446
447 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
448                                            struct sk_buff *skb,
449                                            const void *daddr)
450 {
451         struct net_device *dev = dst->dev;
452         const __be32 *pkey = daddr;
453         const struct rtable *rt;
454         struct neighbour *n;
455
456         rt = (const struct rtable *) dst;
457         if (rt->rt_gateway)
458                 pkey = (const __be32 *) &rt->rt_gateway;
459         else if (skb)
460                 pkey = &ip_hdr(skb)->daddr;
461
462         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
463         if (n)
464                 return n;
465         return neigh_create(&arp_tbl, pkey, dev);
466 }
467
468 #define IP_IDENTS_SZ 2048u
469 struct ip_ident_bucket {
470         atomic_t        id;
471         u32             stamp32;
472 };
473
474 static struct ip_ident_bucket *ip_idents __read_mostly;
475
476 /* In order to protect privacy, we add a perturbation to identifiers
477  * if one generator is seldom used. This makes it hard for an attacker
478  * to infer how many packets were sent between two points in time.
479  */
480 u32 ip_idents_reserve(u32 hash, int segs)
481 {
482         struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
483         u32 old = ACCESS_ONCE(bucket->stamp32);
484         u32 now = (u32)jiffies;
485         u32 delta = 0;
486
487         if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
488                 u64 x = prandom_u32();
489
490                 x *= (now - old);
491                 delta = (u32)(x >> 32);
492         }
493
494         return atomic_add_return(segs + delta, &bucket->id) - segs;
495 }
496 EXPORT_SYMBOL(ip_idents_reserve);
497
498 void __ip_select_ident(struct iphdr *iph, int segs)
499 {
500         static u32 ip_idents_hashrnd __read_mostly;
501         static bool hashrnd_initialized = false;
502         u32 hash, id;
503
504         if (unlikely(!hashrnd_initialized)) {
505                 hashrnd_initialized = true;
506                 get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
507         }
508
509         hash = jhash_3words((__force u32)iph->daddr,
510                             (__force u32)iph->saddr,
511                             iph->protocol,
512                             ip_idents_hashrnd);
513         id = ip_idents_reserve(hash, segs);
514         iph->id = htons(id);
515 }
516 EXPORT_SYMBOL(__ip_select_ident);
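
/* Usage sketch (illustrative, not part of the original file): a sender that
 * emits "segs" datagrams toward the same (daddr, saddr, protocol) bucket
 * reserves a contiguous block of IDs:
 *
 *	id = ip_idents_reserve(hash, segs);	// first ID of the block
 *	iph->id = htons(id);			// IDs id..id+segs-1 are ours
 *
 * Because of the privacy perturbation above, two reservations from a bucket
 * that has been idle may be separated by a random gap rather than exactly
 * "segs".
 */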
517
518 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
519                              const struct iphdr *iph,
520                              int oif, u8 tos,
521                              u8 prot, u32 mark, int flow_flags)
522 {
523         if (sk) {
524                 const struct inet_sock *inet = inet_sk(sk);
525
526                 oif = sk->sk_bound_dev_if;
527                 mark = sk->sk_mark;
528                 tos = RT_CONN_FLAGS(sk);
529                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
530         }
531         flowi4_init_output(fl4, oif, mark, tos,
532                            RT_SCOPE_UNIVERSE, prot,
533                            flow_flags,
534                            iph->daddr, iph->saddr, 0, 0);
535 }
536
537 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
538                                const struct sock *sk)
539 {
540         const struct iphdr *iph = ip_hdr(skb);
541         int oif = skb->dev->ifindex;
542         u8 tos = RT_TOS(iph->tos);
543         u8 prot = iph->protocol;
544         u32 mark = skb->mark;
545
546         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
547 }
548
549 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
550 {
551         const struct inet_sock *inet = inet_sk(sk);
552         const struct ip_options_rcu *inet_opt;
553         __be32 daddr = inet->inet_daddr;
554
555         rcu_read_lock();
556         inet_opt = rcu_dereference(inet->inet_opt);
557         if (inet_opt && inet_opt->opt.srr)
558                 daddr = inet_opt->opt.faddr;
559         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
560                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
561                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
562                            inet_sk_flowi_flags(sk),
563                            daddr, inet->inet_saddr, 0, 0);
564         rcu_read_unlock();
565 }
566
567 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
568                                  const struct sk_buff *skb)
569 {
570         if (skb)
571                 build_skb_flow_key(fl4, skb, sk);
572         else
573                 build_sk_flow_key(fl4, sk);
574 }
575
576 static inline void rt_free(struct rtable *rt)
577 {
578         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
579 }
580
581 static DEFINE_SPINLOCK(fnhe_lock);
582
583 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
584 {
585         struct fib_nh_exception *fnhe, *oldest;
586         struct rtable *orig;
587
588         oldest = rcu_dereference(hash->chain);
589         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
590              fnhe = rcu_dereference(fnhe->fnhe_next)) {
591                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
592                         oldest = fnhe;
593         }
594         orig = rcu_dereference(oldest->fnhe_rth);
595         if (orig) {
596                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
597                 rt_free(orig);
598         }
599         return oldest;
600 }
601
602 static inline u32 fnhe_hashfun(__be32 daddr)
603 {
604         u32 hval;
605
606         hval = (__force u32) daddr;
607         hval ^= (hval >> 11) ^ (hval >> 22);
608
609         return hval & (FNHE_HASH_SIZE - 1);
610 }
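
/* Illustrative note: the two XOR folds mix the high bits of the destination
 * address into the low bits before masking, so addresses that differ only
 * in their upper bits still tend to land in different hash buckets.
 */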
611
612 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
613                                   u32 pmtu, unsigned long expires)
614 {
615         struct fnhe_hash_bucket *hash;
616         struct fib_nh_exception *fnhe;
617         int depth;
618         u32 hval = fnhe_hashfun(daddr);
619
620         spin_lock_bh(&fnhe_lock);
621
622         hash = nh->nh_exceptions;
623         if (!hash) {
624                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
625                 if (!hash)
626                         goto out_unlock;
627                 nh->nh_exceptions = hash;
628         }
629
630         hash += hval;
631
632         depth = 0;
633         for (fnhe = rcu_dereference(hash->chain); fnhe;
634              fnhe = rcu_dereference(fnhe->fnhe_next)) {
635                 if (fnhe->fnhe_daddr == daddr)
636                         break;
637                 depth++;
638         }
639
640         if (fnhe) {
641                 if (gw)
642                         fnhe->fnhe_gw = gw;
643                 if (pmtu) {
644                         fnhe->fnhe_pmtu = pmtu;
645                         fnhe->fnhe_expires = expires;
646                 }
647         } else {
648                 if (depth > FNHE_RECLAIM_DEPTH)
649                         fnhe = fnhe_oldest(hash);
650                 else {
651                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
652                         if (!fnhe)
653                                 goto out_unlock;
654
655                         fnhe->fnhe_next = hash->chain;
656                         rcu_assign_pointer(hash->chain, fnhe);
657                 }
658                 fnhe->fnhe_daddr = daddr;
659                 fnhe->fnhe_gw = gw;
660                 fnhe->fnhe_pmtu = pmtu;
661                 fnhe->fnhe_expires = expires;
662         }
663
664         fnhe->fnhe_stamp = jiffies;
665
666 out_unlock:
667         spin_unlock_bh(&fnhe_lock);
668         return;
669 }
670
671 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
672                              bool kill_route)
673 {
674         __be32 new_gw = icmp_hdr(skb)->un.gateway;
675         __be32 old_gw = ip_hdr(skb)->saddr;
676         struct net_device *dev = skb->dev;
677         struct in_device *in_dev;
678         struct fib_result res;
679         struct neighbour *n;
680         struct net *net;
681
682         switch (icmp_hdr(skb)->code & 7) {
683         case ICMP_REDIR_NET:
684         case ICMP_REDIR_NETTOS:
685         case ICMP_REDIR_HOST:
686         case ICMP_REDIR_HOSTTOS:
687                 break;
688
689         default:
690                 return;
691         }
692
693         if (rt->rt_gateway != old_gw)
694                 return;
695
696         in_dev = __in_dev_get_rcu(dev);
697         if (!in_dev)
698                 return;
699
700         net = dev_net(dev);
701         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
702             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
703             ipv4_is_zeronet(new_gw))
704                 goto reject_redirect;
705
706         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
707                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
708                         goto reject_redirect;
709                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
710                         goto reject_redirect;
711         } else {
712                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
713                         goto reject_redirect;
714         }
715
716         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
717         if (n) {
718                 if (!(n->nud_state & NUD_VALID)) {
719                         neigh_event_send(n, NULL);
720                 } else {
721                         if (fib_lookup(net, fl4, &res) == 0) {
722                                 struct fib_nh *nh = &FIB_RES_NH(res);
723
724                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
725                                                       0, 0);
726                         }
727                         if (kill_route)
728                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
729                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
730                 }
731                 neigh_release(n);
732         }
733         return;
734
735 reject_redirect:
736 #ifdef CONFIG_IP_ROUTE_VERBOSE
737         if (IN_DEV_LOG_MARTIANS(in_dev)) {
738                 const struct iphdr *iph = (const struct iphdr *) skb->data;
739                 __be32 daddr = iph->daddr;
740                 __be32 saddr = iph->saddr;
741
742                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
743                                      "  Advised path = %pI4 -> %pI4\n",
744                                      &old_gw, dev->name, &new_gw,
745                                      &saddr, &daddr);
746         }
747 #endif
748         ;
749 }
750
751 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
752 {
753         struct rtable *rt;
754         struct flowi4 fl4;
755         const struct iphdr *iph = (const struct iphdr *) skb->data;
756         int oif = skb->dev->ifindex;
757         u8 tos = RT_TOS(iph->tos);
758         u8 prot = iph->protocol;
759         u32 mark = skb->mark;
760
761         rt = (struct rtable *) dst;
762
763         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
764         __ip_do_redirect(rt, skb, &fl4, true);
765 }
766
767 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
768 {
769         struct rtable *rt = (struct rtable *)dst;
770         struct dst_entry *ret = dst;
771
772         if (rt) {
773                 if (dst->obsolete > 0) {
774                         ip_rt_put(rt);
775                         ret = NULL;
776                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
777                            rt->dst.expires) {
778                         ip_rt_put(rt);
779                         ret = NULL;
780                 }
781         }
782         return ret;
783 }
784
785 /*
786  * Algorithm:
787  *      1. The first ip_rt_redirect_number redirects are sent
788  *         with exponential backoff, then we stop sending them at all,
789  *         assuming that the host ignores our redirects.
790  *      2. If we did not see packets requiring redirects
791  *         during ip_rt_redirect_silence, we assume that the host
792  *         forgot the redirected route, and we start sending redirects again.
793  *
794  * This algorithm is much cheaper and more intelligent than dumb load limiting
795  * in icmp.c.
796  *
797  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
798  * and "frag. need" (breaks PMTU discovery) in icmp.c.
799  */
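
/* Worked example (illustrative, assuming HZ = 1000): ip_rt_redirect_load is
 * HZ / 50 = 20 jiffies, so after n redirects have been sent the next one
 * waits until rate_last + (20 << n) jiffies: 40 ms after the first, 80 ms
 * after the second, up to 5.12 s before the 9th. Once ip_rt_redirect_number
 * (9) is reached we stay silent until ip_rt_redirect_silence
 * ((HZ / 50) << 10, about 20.5 s) of quiet resets rate_tokens.
 */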
800
801 void ip_rt_send_redirect(struct sk_buff *skb)
802 {
803         struct rtable *rt = skb_rtable(skb);
804         struct in_device *in_dev;
805         struct inet_peer *peer;
806         struct net *net;
807         int log_martians;
808
809         rcu_read_lock();
810         in_dev = __in_dev_get_rcu(rt->dst.dev);
811         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
812                 rcu_read_unlock();
813                 return;
814         }
815         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
816         rcu_read_unlock();
817
818         net = dev_net(rt->dst.dev);
819         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
820         if (!peer) {
821                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
822                           rt_nexthop(rt, ip_hdr(skb)->daddr));
823                 return;
824         }
825
826         /* No redirected packets during ip_rt_redirect_silence;
827          * reset the algorithm.
828          */
829         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
830                 peer->rate_tokens = 0;
831
832         /* Too many ignored redirects; do not send anything and
833          * set peer->rate_last to the time of the last redirected packet.
834          */
835         if (peer->rate_tokens >= ip_rt_redirect_number) {
836                 peer->rate_last = jiffies;
837                 goto out_put_peer;
838         }
839
840         /* Check for load limit; set rate_last to the latest sent
841          * redirect.
842          */
843         if (peer->rate_tokens == 0 ||
844             time_after(jiffies,
845                        (peer->rate_last +
846                         (ip_rt_redirect_load << peer->rate_tokens)))) {
847                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
848
849                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
850                 peer->rate_last = jiffies;
851                 ++peer->rate_tokens;
852 #ifdef CONFIG_IP_ROUTE_VERBOSE
853                 if (log_martians &&
854                     peer->rate_tokens == ip_rt_redirect_number)
855                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
856                                              &ip_hdr(skb)->saddr, inet_iif(skb),
857                                              &ip_hdr(skb)->daddr, &gw);
858 #endif
859         }
860 out_put_peer:
861         inet_putpeer(peer);
862 }
863
864 static int ip_error(struct sk_buff *skb)
865 {
866         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
867         struct rtable *rt = skb_rtable(skb);
868         struct inet_peer *peer;
869         unsigned long now;
870         struct net *net;
871         bool send;
872         int code;
873
874         /* IP on this device is disabled. */
875         if (!in_dev)
876                 goto out;
877
878         net = dev_net(rt->dst.dev);
879         if (!IN_DEV_FORWARD(in_dev)) {
880                 switch (rt->dst.error) {
881                 case EHOSTUNREACH:
882                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
883                         break;
884
885                 case ENETUNREACH:
886                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
887                         break;
888                 }
889                 goto out;
890         }
891
892         switch (rt->dst.error) {
893         case EINVAL:
894         default:
895                 goto out;
896         case EHOSTUNREACH:
897                 code = ICMP_HOST_UNREACH;
898                 break;
899         case ENETUNREACH:
900                 code = ICMP_NET_UNREACH;
901                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
902                 break;
903         case EACCES:
904                 code = ICMP_PKT_FILTERED;
905                 break;
906         }
907
908         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
909
910         send = true;
911         if (peer) {
912                 now = jiffies;
913                 peer->rate_tokens += now - peer->rate_last;
914                 if (peer->rate_tokens > ip_rt_error_burst)
915                         peer->rate_tokens = ip_rt_error_burst;
916                 peer->rate_last = now;
917                 if (peer->rate_tokens >= ip_rt_error_cost)
918                         peer->rate_tokens -= ip_rt_error_cost;
919                 else
920                         send = false;
921                 inet_putpeer(peer);
922         }
923         if (send)
924                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
925
926 out:    kfree_skb(skb);
927         return 0;
928 }
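
/* Rate-limit arithmetic above (illustrative, assuming HZ = 1000): the peer
 * accrues one token per jiffy since rate_last, capped at ip_rt_error_burst
 * (5 * HZ), and each ICMP error costs ip_rt_error_cost (HZ) tokens. That
 * allows a burst of up to 5 errors, then a steady state of roughly one
 * ICMP_DEST_UNREACH per second per source address.
 */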
929
930 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
931 {
932         struct dst_entry *dst = &rt->dst;
933         struct fib_result res;
934
935         if (dst_metric_locked(dst, RTAX_MTU))
936                 return;
937
938         if (dst->dev->mtu < mtu)
939                 return;
940
941         if (mtu < ip_rt_min_pmtu)
942                 mtu = ip_rt_min_pmtu;
943
944         if (!rt->rt_pmtu) {
945                 dst->obsolete = DST_OBSOLETE_KILL;
946         } else {
947                 rt->rt_pmtu = mtu;
948                 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
949         }
950
951         rcu_read_lock();
952         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
953                 struct fib_nh *nh = &FIB_RES_NH(res);
954
955                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
956                                       jiffies + ip_rt_mtu_expires);
957         }
958         rcu_read_unlock();
959 }
960
961 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
962                               struct sk_buff *skb, u32 mtu)
963 {
964         struct rtable *rt = (struct rtable *) dst;
965         struct flowi4 fl4;
966
967         ip_rt_build_flow_key(&fl4, sk, skb);
968         __ip_rt_update_pmtu(rt, &fl4, mtu);
969 }
970
971 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
972                       int oif, u32 mark, u8 protocol, int flow_flags)
973 {
974         const struct iphdr *iph = (const struct iphdr *) skb->data;
975         struct flowi4 fl4;
976         struct rtable *rt;
977
978         __build_flow_key(&fl4, NULL, iph, oif,
979                          RT_TOS(iph->tos), protocol, mark, flow_flags);
980         rt = __ip_route_output_key(net, &fl4);
981         if (!IS_ERR(rt)) {
982                 __ip_rt_update_pmtu(rt, &fl4, mtu);
983                 ip_rt_put(rt);
984         }
985 }
986 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
987
988 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
989 {
990         const struct iphdr *iph = (const struct iphdr *) skb->data;
991         struct flowi4 fl4;
992         struct rtable *rt;
993
994         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
995         rt = __ip_route_output_key(sock_net(sk), &fl4);
996         if (!IS_ERR(rt)) {
997                 __ip_rt_update_pmtu(rt, &fl4, mtu);
998                 ip_rt_put(rt);
999         }
1000 }
1001
1002 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1003 {
1004         const struct iphdr *iph = (const struct iphdr *) skb->data;
1005         struct flowi4 fl4;
1006         struct rtable *rt;
1007         struct dst_entry *odst = NULL;
1008         bool new = false;
1009
1010         bh_lock_sock(sk);
1011         odst = sk_dst_get(sk);
1012
1013         if (sock_owned_by_user(sk) || !odst) {
1014                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1015                 goto out;
1016         }
1017
1018         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1019
1020         rt = (struct rtable *)odst;
1021         if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
1022                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1023                 if (IS_ERR(rt))
1024                         goto out;
1025
1026                 new = true;
1027         }
1028
1029         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1030
1031         if (!dst_check(&rt->dst, 0)) {
1032                 if (new)
1033                         dst_release(&rt->dst);
1034
1035                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1036                 if (IS_ERR(rt))
1037                         goto out;
1038
1039                 new = true;
1040         }
1041
1042         if (new)
1043                 sk_dst_set(sk, &rt->dst);
1044
1045 out:
1046         bh_unlock_sock(sk);
1047         dst_release(odst);
1048 }
1049 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1050
1051 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1052                    int oif, u32 mark, u8 protocol, int flow_flags)
1053 {
1054         const struct iphdr *iph = (const struct iphdr *) skb->data;
1055         struct flowi4 fl4;
1056         struct rtable *rt;
1057
1058         __build_flow_key(&fl4, NULL, iph, oif,
1059                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1060         rt = __ip_route_output_key(net, &fl4);
1061         if (!IS_ERR(rt)) {
1062                 __ip_do_redirect(rt, skb, &fl4, false);
1063                 ip_rt_put(rt);
1064         }
1065 }
1066 EXPORT_SYMBOL_GPL(ipv4_redirect);
1067
1068 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1069 {
1070         const struct iphdr *iph = (const struct iphdr *) skb->data;
1071         struct flowi4 fl4;
1072         struct rtable *rt;
1073
1074         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1075         rt = __ip_route_output_key(sock_net(sk), &fl4);
1076         if (!IS_ERR(rt)) {
1077                 __ip_do_redirect(rt, skb, &fl4, false);
1078                 ip_rt_put(rt);
1079         }
1080 }
1081 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1082
1083 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1084 {
1085         struct rtable *rt = (struct rtable *) dst;
1086
1087         /* All IPv4 dsts are created with ->obsolete set to the value
1088          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1089          * into this function always.
1090          *
1091          * When a PMTU/redirect information update invalidates a
1092          * route, this is indicated by setting obsolete to
1093          * DST_OBSOLETE_KILL.
1094          */
1095         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1096                 return NULL;
1097         return dst;
1098 }
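
/* Illustrative note: callers treat a NULL return from ->check() as "this
 * route must be re-looked-up". The PMTU path above does exactly that:
 *
 *	if (odst->obsolete && odst->ops->check(odst, 0) == NULL)
 *		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 */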
1099
1100 static void ipv4_link_failure(struct sk_buff *skb)
1101 {
1102         struct rtable *rt;
1103
1104         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1105
1106         rt = skb_rtable(skb);
1107         if (rt)
1108                 dst_set_expires(&rt->dst, 0);
1109 }
1110
1111 static int ip_rt_bug(struct sk_buff *skb)
1112 {
1113         pr_debug("%s: %pI4 -> %pI4, %s\n",
1114                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1115                  skb->dev ? skb->dev->name : "?");
1116         kfree_skb(skb);
1117         WARN_ON(1);
1118         return 0;
1119 }
1120
1121 /*
1122    We do not cache the source address of the outgoing interface,
1123    because it is used only by the IP RR, TS and SRR options,
1124    so it is out of the fast path.
1125
1126    BTW remember: "addr" may be unaligned
1127    in IP options!
1128  */
1129
1130 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1131 {
1132         __be32 src;
1133
1134         if (rt_is_output_route(rt))
1135                 src = ip_hdr(skb)->saddr;
1136         else {
1137                 struct fib_result res;
1138                 struct flowi4 fl4;
1139                 struct iphdr *iph;
1140
1141                 iph = ip_hdr(skb);
1142
1143                 memset(&fl4, 0, sizeof(fl4));
1144                 fl4.daddr = iph->daddr;
1145                 fl4.saddr = iph->saddr;
1146                 fl4.flowi4_tos = RT_TOS(iph->tos);
1147                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1148                 fl4.flowi4_iif = skb->dev->ifindex;
1149                 fl4.flowi4_mark = skb->mark;
1150
1151                 rcu_read_lock();
1152                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1153                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1154                 else
1155                         src = inet_select_addr(rt->dst.dev,
1156                                                rt_nexthop(rt, iph->daddr),
1157                                                RT_SCOPE_UNIVERSE);
1158                 rcu_read_unlock();
1159         }
1160         memcpy(addr, &src, 4);
1161 }
1162
1163 #ifdef CONFIG_IP_ROUTE_CLASSID
1164 static void set_class_tag(struct rtable *rt, u32 tag)
1165 {
1166         if (!(rt->dst.tclassid & 0xFFFF))
1167                 rt->dst.tclassid |= tag & 0xFFFF;
1168         if (!(rt->dst.tclassid & 0xFFFF0000))
1169                 rt->dst.tclassid |= tag & 0xFFFF0000;
1170 }
1171 #endif
1172
1173 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1174 {
1175         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1176
1177         if (advmss == 0) {
1178                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1179                                ip_rt_min_advmss);
1180                 if (advmss > 65535 - 40)
1181                         advmss = 65535 - 40;
1182         }
1183         return advmss;
1184 }
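
/* Example (illustrative): with no RTAX_ADVMSS metric set, a 1500-byte
 * Ethernet MTU yields advmss = 1500 - 40 = 1460 (40 bytes covers the
 * minimal IPv4 + TCP headers), clamped between ip_rt_min_advmss (256)
 * and 65535 - 40.
 */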
1185
1186 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1187 {
1188         const struct rtable *rt = (const struct rtable *) dst;
1189         unsigned int mtu = rt->rt_pmtu;
1190
1191         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1192                 mtu = dst_metric_raw(dst, RTAX_MTU);
1193
1194         if (mtu)
1195                 return mtu;
1196
1197         mtu = dst->dev->mtu;
1198
1199         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1200                 if (rt->rt_uses_gateway && mtu > 576)
1201                         mtu = 576;
1202         }
1203
1204         if (mtu > IP_MAX_MTU)
1205                 mtu = IP_MAX_MTU;
1206
1207         return mtu;
1208 }
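
/* Selection order above (illustrative summary): a valid, unexpired learned
 * PMTU wins; otherwise the RTAX_MTU metric; otherwise the device MTU,
 * clamped to the historic 576-byte default when the metric is locked and
 * the route uses a gateway, with the device-MTU fallback capped at
 * IP_MAX_MTU (0xFFF0).
 */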
1209
1210 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1211 {
1212         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1213         struct fib_nh_exception *fnhe;
1214         u32 hval;
1215
1216         if (!hash)
1217                 return NULL;
1218
1219         hval = fnhe_hashfun(daddr);
1220
1221         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1222              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1223                 if (fnhe->fnhe_daddr == daddr)
1224                         return fnhe;
1225         }
1226         return NULL;
1227 }
1228
1229 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1230                               __be32 daddr)
1231 {
1232         bool ret = false;
1233
1234         spin_lock_bh(&fnhe_lock);
1235
1236         if (daddr == fnhe->fnhe_daddr) {
1237                 struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
1238                 if (orig && rt_is_expired(orig)) {
1239                         fnhe->fnhe_gw = 0;
1240                         fnhe->fnhe_pmtu = 0;
1241                         fnhe->fnhe_expires = 0;
1242                 }
1243                 if (fnhe->fnhe_pmtu) {
1244                         unsigned long expires = fnhe->fnhe_expires;
1245                         unsigned long diff = expires - jiffies;
1246
1247                         if (time_before(jiffies, expires)) {
1248                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1249                                 dst_set_expires(&rt->dst, diff);
1250                         }
1251                 }
1252                 if (fnhe->fnhe_gw) {
1253                         rt->rt_flags |= RTCF_REDIRECTED;
1254                         rt->rt_gateway = fnhe->fnhe_gw;
1255                         rt->rt_uses_gateway = 1;
1256                 } else if (!rt->rt_gateway)
1257                         rt->rt_gateway = daddr;
1258
1259                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1260                 if (orig)
1261                         rt_free(orig);
1262
1263                 fnhe->fnhe_stamp = jiffies;
1264                 ret = true;
1265         }
1266         spin_unlock_bh(&fnhe_lock);
1267
1268         return ret;
1269 }
1270
1271 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1272 {
1273         struct rtable *orig, *prev, **p;
1274         bool ret = true;
1275
1276         if (rt_is_input_route(rt)) {
1277                 p = (struct rtable **)&nh->nh_rth_input;
1278         } else {
1279                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1280         }
1281         orig = *p;
1282
1283         prev = cmpxchg(p, orig, rt);
1284         if (prev == orig) {
1285                 if (orig)
1286                         rt_free(orig);
1287         } else
1288                 ret = false;
1289
1290         return ret;
1291 }
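
/* Caching here is lockless (illustrative note): cmpxchg() publishes the new
 * route only if nobody raced with us. On failure the caller, rt_set_nexthop(),
 * falls back to marking the route DST_NOCACHE and putting it on the
 * uncached list.
 */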
1292
1293 static DEFINE_SPINLOCK(rt_uncached_lock);
1294 static LIST_HEAD(rt_uncached_list);
1295
1296 static void rt_add_uncached_list(struct rtable *rt)
1297 {
1298         spin_lock_bh(&rt_uncached_lock);
1299         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1300         spin_unlock_bh(&rt_uncached_lock);
1301 }
1302
1303 static void ipv4_dst_destroy(struct dst_entry *dst)
1304 {
1305         struct rtable *rt = (struct rtable *) dst;
1306
1307         if (!list_empty(&rt->rt_uncached)) {
1308                 spin_lock_bh(&rt_uncached_lock);
1309                 list_del(&rt->rt_uncached);
1310                 spin_unlock_bh(&rt_uncached_lock);
1311         }
1312 }
1313
1314 void rt_flush_dev(struct net_device *dev)
1315 {
1316         if (!list_empty(&rt_uncached_list)) {
1317                 struct net *net = dev_net(dev);
1318                 struct rtable *rt;
1319
1320                 spin_lock_bh(&rt_uncached_lock);
1321                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1322                         if (rt->dst.dev != dev)
1323                                 continue;
1324                         rt->dst.dev = net->loopback_dev;
1325                         dev_hold(rt->dst.dev);
1326                         dev_put(dev);
1327                 }
1328                 spin_unlock_bh(&rt_uncached_lock);
1329         }
1330 }
1331
1332 static bool rt_cache_valid(const struct rtable *rt)
1333 {
1334         return  rt &&
1335                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1336                 !rt_is_expired(rt);
1337 }
1338
1339 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1340                            const struct fib_result *res,
1341                            struct fib_nh_exception *fnhe,
1342                            struct fib_info *fi, u16 type, u32 itag)
1343 {
1344         bool cached = false;
1345
1346         if (fi) {
1347                 struct fib_nh *nh = &FIB_RES_NH(*res);
1348
1349                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1350                         rt->rt_gateway = nh->nh_gw;
1351                         rt->rt_uses_gateway = 1;
1352                 }
1353                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1354 #ifdef CONFIG_IP_ROUTE_CLASSID
1355                 rt->dst.tclassid = nh->nh_tclassid;
1356 #endif
1357                 if (unlikely(fnhe))
1358                         cached = rt_bind_exception(rt, fnhe, daddr);
1359                 else if (!(rt->dst.flags & DST_NOCACHE))
1360                         cached = rt_cache_route(nh, rt);
1361                 if (unlikely(!cached)) {
1362                         /* Routes we intend to cache in the nexthop exception or
1363                          * FIB nexthop have the DST_NOCACHE bit clear.
1364                          * However, if we are unsuccessful at storing this
1365                          * route into the cache we really need to set it.
1366                          */
1367                         rt->dst.flags |= DST_NOCACHE;
1368                         if (!rt->rt_gateway)
1369                                 rt->rt_gateway = daddr;
1370                         rt_add_uncached_list(rt);
1371                 }
1372         } else
1373                 rt_add_uncached_list(rt);
1374
1375 #ifdef CONFIG_IP_ROUTE_CLASSID
1376 #ifdef CONFIG_IP_MULTIPLE_TABLES
1377         set_class_tag(rt, res->tclassid);
1378 #endif
1379         set_class_tag(rt, itag);
1380 #endif
1381 }
1382
1383 static struct rtable *rt_dst_alloc(struct net_device *dev,
1384                                    bool nopolicy, bool noxfrm, bool will_cache)
1385 {
1386         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1387                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1388                          (nopolicy ? DST_NOPOLICY : 0) |
1389                          (noxfrm ? DST_NOXFRM : 0));
1390 }
1391
1392 /* called in rcu_read_lock() section */
1393 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1394                                 u8 tos, struct net_device *dev, int our)
1395 {
1396         struct rtable *rth;
1397         struct in_device *in_dev = __in_dev_get_rcu(dev);
1398         u32 itag = 0;
1399         int err;
1400
1401         /* Primary sanity checks. */
1402
1403         if (in_dev == NULL)
1404                 return -EINVAL;
1405
1406         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1407             skb->protocol != htons(ETH_P_IP))
1408                 goto e_inval;
1409
1410         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1411                 if (ipv4_is_loopback(saddr))
1412                         goto e_inval;
1413
1414         if (ipv4_is_zeronet(saddr)) {
1415                 if (!ipv4_is_local_multicast(daddr))
1416                         goto e_inval;
1417         } else {
1418                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1419                                           in_dev, &itag);
1420                 if (err < 0)
1421                         goto e_err;
1422         }
1423         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1424                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1425         if (!rth)
1426                 goto e_nobufs;
1427
1428 #ifdef CONFIG_IP_ROUTE_CLASSID
1429         rth->dst.tclassid = itag;
1430 #endif
1431         rth->dst.output = ip_rt_bug;
1432
1433         rth->rt_genid   = rt_genid(dev_net(dev));
1434         rth->rt_flags   = RTCF_MULTICAST;
1435         rth->rt_type    = RTN_MULTICAST;
1436         rth->rt_is_input= 1;
1437         rth->rt_iif     = 0;
1438         rth->rt_pmtu    = 0;
1439         rth->rt_gateway = 0;
1440         rth->rt_uses_gateway = 0;
1441         INIT_LIST_HEAD(&rth->rt_uncached);
1442         if (our) {
1443                 rth->dst.input= ip_local_deliver;
1444                 rth->rt_flags |= RTCF_LOCAL;
1445         }
1446
1447 #ifdef CONFIG_IP_MROUTE
1448         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1449                 rth->dst.input = ip_mr_input;
1450 #endif
1451         RT_CACHE_STAT_INC(in_slow_mc);
1452
1453         skb_dst_set(skb, &rth->dst);
1454         return 0;
1455
1456 e_nobufs:
1457         return -ENOBUFS;
1458 e_inval:
1459         return -EINVAL;
1460 e_err:
1461         return err;
1462 }
1463
1464
1465 static void ip_handle_martian_source(struct net_device *dev,
1466                                      struct in_device *in_dev,
1467                                      struct sk_buff *skb,
1468                                      __be32 daddr,
1469                                      __be32 saddr)
1470 {
1471         RT_CACHE_STAT_INC(in_martian_src);
1472 #ifdef CONFIG_IP_ROUTE_VERBOSE
1473         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1474                 /*
1475                  *      RFC 1812 recommendation: if the source is martian,
1476                  *      the only hint is the MAC header.
1477                  */
1478                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1479                         &daddr, &saddr, dev->name);
1480                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1481                         print_hex_dump(KERN_WARNING, "ll header: ",
1482                                        DUMP_PREFIX_OFFSET, 16, 1,
1483                                        skb_mac_header(skb),
1484                                        dev->hard_header_len, true);
1485                 }
1486         }
1487 #endif
1488 }
1489
1490 /* called in rcu_read_lock() section */
1491 static int __mkroute_input(struct sk_buff *skb,
1492                            const struct fib_result *res,
1493                            struct in_device *in_dev,
1494                            __be32 daddr, __be32 saddr, u32 tos)
1495 {
1496         struct rtable *rth;
1497         int err;
1498         struct in_device *out_dev;
1499         unsigned int flags = 0;
1500         bool do_cache;
1501         u32 itag = 0;
1502
1503         /* get a working reference to the output device */
1504         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1505         if (out_dev == NULL) {
1506                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1507                 return -EINVAL;
1508         }
1509
1510         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1511                                   in_dev->dev, in_dev, &itag);
1512         if (err < 0) {
1513                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1514                                          saddr);
1515
1516                 goto cleanup;
1517         }
1518
1519         do_cache = res->fi && !itag;
1520         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1521             skb->protocol == htons(ETH_P_IP) &&
1522             (IN_DEV_SHARED_MEDIA(out_dev) ||
1523              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1524                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1525
1526         if (skb->protocol != htons(ETH_P_IP)) {
1527                 /* Not IP (i.e. ARP). Do not create a route if it is
1528                  * invalid for proxy arp. DNAT routes are always valid.
1529                  *
1530                  * The proxy arp feature has been extended to allow ARP
1531                  * replies back on the same interface, to support
1532                  * Private VLAN switch technologies. See arp.c.
1533                  */
1534                 if (out_dev == in_dev &&
1535                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1536                         err = -EINVAL;
1537                         goto cleanup;
1538                 }
1539         }
1540
1541         if (do_cache) {
1542                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1543                 if (rt_cache_valid(rth)) {
1544                         skb_dst_set_noref(skb, &rth->dst);
1545                         goto out;
1546                 }
1547         }
1548
1549         rth = rt_dst_alloc(out_dev->dev,
1550                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1551                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1552         if (!rth) {
1553                 err = -ENOBUFS;
1554                 goto cleanup;
1555         }
1556
1557         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1558         rth->rt_flags = flags;
1559         rth->rt_type = res->type;
1560         rth->rt_is_input = 1;
1561         rth->rt_iif     = 0;
1562         rth->rt_pmtu    = 0;
1563         rth->rt_gateway = 0;
1564         rth->rt_uses_gateway = 0;
1565         INIT_LIST_HEAD(&rth->rt_uncached);
1566         RT_CACHE_STAT_INC(in_slow_tot);
1567
1568         rth->dst.input = ip_forward;
1569         rth->dst.output = ip_output;
1570
1571         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1572         skb_dst_set(skb, &rth->dst);
1573 out:
1574         err = 0;
1575  cleanup:
1576         return err;
1577 }
1578
1579 static int ip_mkroute_input(struct sk_buff *skb,
1580                             struct fib_result *res,
1581                             const struct flowi4 *fl4,
1582                             struct in_device *in_dev,
1583                             __be32 daddr, __be32 saddr, u32 tos)
1584 {
1585 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1586         if (res->fi && res->fi->fib_nhs > 1)
1587                 fib_select_multipath(res);
1588 #endif
1589
1590         /* create a routing cache entry */
1591         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1592 }
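/* Note on the multipath branch above: with CONFIG_IP_ROUTE_MULTIPATH,
 * when the matched fib_info carries more than one nexthop (fib_nhs > 1),
 * fib_select_multipath() settles *res on a single nexthop before the
 * cache entry is built, so __mkroute_input() always sees one resolved
 * nexthop.
 */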
1593
1594 /*
1595  *      NOTE. We drop all packets that have a local source
1596  *      address, because every properly looped-back packet
1597  *      must already have the correct destination attached by the output routine.
1598  *
1599  *      This approach solves two big problems:
1600  *      1. Non-simplex devices are handled properly.
1601  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1602  *      Called with rcu_read_lock().
1603  */
1604
1605 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1606                                u8 tos, struct net_device *dev)
1607 {
1608         struct fib_result res;
1609         struct in_device *in_dev = __in_dev_get_rcu(dev);
1610         struct flowi4   fl4;
1611         unsigned int    flags = 0;
1612         u32             itag = 0;
1613         struct rtable   *rth;
1614         int             err = -EINVAL;
1615         struct net    *net = dev_net(dev);
1616         bool do_cache;
1617
1618         /* IP on this device is disabled. */
1619
1620         if (!in_dev)
1621                 goto out;
1622
1623         /* Check for the weirdest martians, which cannot be detected
1624            by fib_lookup.
1625          */
1626
1627         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1628                 goto martian_source;
1629
1630         res.fi = NULL;
1631         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1632                 goto brd_input;
1633
1634         /* Accept zero addresses only for limited broadcast;
1635          * I do not even know whether to fix this or not. Waiting for complaints :-)
1636          */
1637         if (ipv4_is_zeronet(saddr))
1638                 goto martian_source;
1639
1640         if (ipv4_is_zeronet(daddr))
1641                 goto martian_destination;
1642
1643         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1644          * more than once, calling it only when daddr and/or saddr is a loopback address.
1645          */
1646         if (ipv4_is_loopback(daddr)) {
1647                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1648                         goto martian_destination;
1649         } else if (ipv4_is_loopback(saddr)) {
1650                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1651                         goto martian_source;
1652         }
1653
1654         /*
1655          *      Now we are ready to route the packet.
1656          */
1657         fl4.flowi4_oif = 0;
1658         fl4.flowi4_iif = dev->ifindex;
1659         fl4.flowi4_mark = skb->mark;
1660         fl4.flowi4_tos = tos;
1661         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1662         fl4.daddr = daddr;
1663         fl4.saddr = saddr;
1664         err = fib_lookup(net, &fl4, &res);
1665         if (err != 0)
1666                 goto no_route;
1667
1668         if (res.type == RTN_BROADCAST)
1669                 goto brd_input;
1670
1671         if (res.type == RTN_LOCAL) {
1672                 err = fib_validate_source(skb, saddr, daddr, tos,
1673                                           LOOPBACK_IFINDEX,
1674                                           dev, in_dev, &itag);
1675                 if (err < 0)
1676                         goto martian_source_keep_err;
1677                 goto local_input;
1678         }
1679
1680         if (!IN_DEV_FORWARD(in_dev))
1681                 goto no_route;
1682         if (res.type != RTN_UNICAST)
1683                 goto martian_destination;
1684
1685         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1686 out:    return err;
1687
1688 brd_input:
1689         if (skb->protocol != htons(ETH_P_IP))
1690                 goto e_inval;
1691
1692         if (!ipv4_is_zeronet(saddr)) {
1693                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1694                                           in_dev, &itag);
1695                 if (err < 0)
1696                         goto martian_source_keep_err;
1697         }
1698         flags |= RTCF_BROADCAST;
1699         res.type = RTN_BROADCAST;
1700         RT_CACHE_STAT_INC(in_brd);
1701
1702 local_input:
1703         do_cache = false;
1704         if (res.fi) {
1705                 if (!itag) {
1706                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1707                         if (rt_cache_valid(rth)) {
1708                                 skb_dst_set_noref(skb, &rth->dst);
1709                                 err = 0;
1710                                 goto out;
1711                         }
1712                         do_cache = true;
1713                 }
1714         }
1715
1716         rth = rt_dst_alloc(net->loopback_dev,
1717                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1718         if (!rth)
1719                 goto e_nobufs;
1720
1721         rth->dst.input= ip_local_deliver;
1722         rth->dst.output= ip_rt_bug;
1723 #ifdef CONFIG_IP_ROUTE_CLASSID
1724         rth->dst.tclassid = itag;
1725 #endif
1726
1727         rth->rt_genid = rt_genid(net);
1728         rth->rt_flags   = flags|RTCF_LOCAL;
1729         rth->rt_type    = res.type;
1730         rth->rt_is_input = 1;
1731         rth->rt_iif     = 0;
1732         rth->rt_pmtu    = 0;
1733         rth->rt_gateway = 0;
1734         rth->rt_uses_gateway = 0;
1735         INIT_LIST_HEAD(&rth->rt_uncached);
1736         RT_CACHE_STAT_INC(in_slow_tot);
1737         if (res.type == RTN_UNREACHABLE) {
1738                 rth->dst.input= ip_error;
1739                 rth->dst.error= -err;
1740                 rth->rt_flags   &= ~RTCF_LOCAL;
1741         }
1742         if (do_cache) {
1743                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1744                         rth->dst.flags |= DST_NOCACHE;
1745                         rt_add_uncached_list(rth);
1746                 }
1747         }
1748         skb_dst_set(skb, &rth->dst);
1749         err = 0;
1750         goto out;
1751
1752 no_route:
1753         RT_CACHE_STAT_INC(in_no_route);
1754         res.type = RTN_UNREACHABLE;
1755         if (err == -ESRCH)
1756                 err = -ENETUNREACH;
1757         goto local_input;
1758
1759         /*
1760          *      Do not cache martian addresses: they should be logged (RFC1812)
1761          */
1762 martian_destination:
1763         RT_CACHE_STAT_INC(in_martian_dst);
1764 #ifdef CONFIG_IP_ROUTE_VERBOSE
1765         if (IN_DEV_LOG_MARTIANS(in_dev))
1766                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1767                                      &daddr, &saddr, dev->name);
1768 #endif
1769
1770 e_inval:
1771         err = -EINVAL;
1772         goto out;
1773
1774 e_nobufs:
1775         err = -ENOBUFS;
1776         goto out;
1777
1778 martian_source:
1779         err = -EINVAL;
1780 martian_source_keep_err:
1781         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1782         goto out;
1783 }
1784
1785 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1786                          u8 tos, struct net_device *dev)
1787 {
1788         int res;
1789
1790         rcu_read_lock();
1791
1792         /* Multicast recognition logic was moved from the route cache to here.
1793            The problem was that too many Ethernet cards have broken/missing
1794            hardware multicast filters :-( As a result, a host on a multicast
1795            network acquires a lot of useless route cache entries, e.g. from
1796            SDR messages from all over the world. Now we try to get rid of them.
1797            Really, provided the software IP multicast filter is organized
1798            reasonably (at least, hashed), it does not cause a slowdown
1799            compared with route cache reject entries.
1800            Note that multicast routers are not affected, because a
1801            route cache entry is created eventually.
1802          */
1803         if (ipv4_is_multicast(daddr)) {
1804                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1805
1806                 if (in_dev) {
1807                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1808                                                   ip_hdr(skb)->protocol);
1809                         if (our
1810 #ifdef CONFIG_IP_MROUTE
1811                                 ||
1812                             (!ipv4_is_local_multicast(daddr) &&
1813                              IN_DEV_MFORWARD(in_dev))
1814 #endif
1815                            ) {
1816                                 int res = ip_route_input_mc(skb, daddr, saddr,
1817                                                             tos, dev, our);
1818                                 rcu_read_unlock();
1819                                 return res;
1820                         }
1821                 }
1822                 rcu_read_unlock();
1823                 return -EINVAL;
1824         }
1825         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1826         rcu_read_unlock();
1827         return res;
1828 }
1829 EXPORT_SYMBOL(ip_route_input_noref);
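/* Usage sketch, not part of this file: a receive-path caller such as
 * ip_rcv_finish() routes an input skb straight from its IP header; the
 * rcu protection is taken by ip_route_input_noref() itself. The helper
 * name below is hypothetical.
 */
static inline int example_route_input_sketch(struct sk_buff *skb,
                                             struct net_device *dev)
{
        const struct iphdr *iph = ip_hdr(skb);

        /* On success, skb_dst(skb) has been set and the packet can
         * proceed to dst_input(skb); a negative value is the routing
         * error.
         */
        return ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                    iph->tos, dev);
}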
1830
1831 /* called with rcu_read_lock() */
1832 static struct rtable *__mkroute_output(const struct fib_result *res,
1833                                        const struct flowi4 *fl4, int orig_oif,
1834                                        struct net_device *dev_out,
1835                                        unsigned int flags)
1836 {
1837         struct fib_info *fi = res->fi;
1838         struct fib_nh_exception *fnhe;
1839         struct in_device *in_dev;
1840         u16 type = res->type;
1841         struct rtable *rth;
1842         bool do_cache;
1843
1844         in_dev = __in_dev_get_rcu(dev_out);
1845         if (!in_dev)
1846                 return ERR_PTR(-EINVAL);
1847
1848         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1849                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1850                         return ERR_PTR(-EINVAL);
1851
1852         if (ipv4_is_lbcast(fl4->daddr))
1853                 type = RTN_BROADCAST;
1854         else if (ipv4_is_multicast(fl4->daddr))
1855                 type = RTN_MULTICAST;
1856         else if (ipv4_is_zeronet(fl4->daddr))
1857                 return ERR_PTR(-EINVAL);
1858
1859         if (dev_out->flags & IFF_LOOPBACK)
1860                 flags |= RTCF_LOCAL;
1861
1862         do_cache = true;
1863         if (type == RTN_BROADCAST) {
1864                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1865                 fi = NULL;
1866         } else if (type == RTN_MULTICAST) {
1867                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1868                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1869                                      fl4->flowi4_proto))
1870                         flags &= ~RTCF_LOCAL;
1871                 else
1872                         do_cache = false;
1873                 /* If a multicast route does not exist, use the
1874                  * default one, but do not use a gateway in this case.
1875                  * Yes, it is a hack.
1876                  */
1877                 if (fi && res->prefixlen < 4)
1878                         fi = NULL;
1879         }
1880
1881         fnhe = NULL;
1882         do_cache &= fi != NULL;
1883         if (do_cache) {
1884                 struct rtable __rcu **prth;
1885                 struct fib_nh *nh = &FIB_RES_NH(*res);
1886
1887                 fnhe = find_exception(nh, fl4->daddr);
1888                 if (fnhe)
1889                         prth = &fnhe->fnhe_rth;
1890                 else {
1891                         if (unlikely(fl4->flowi4_flags &
1892                                      FLOWI_FLAG_KNOWN_NH &&
1893                                      !(nh->nh_gw &&
1894                                        nh->nh_scope == RT_SCOPE_LINK))) {
1895                                 do_cache = false;
1896                                 goto add;
1897                         }
1898                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1899                 }
1900                 rth = rcu_dereference(*prth);
1901                 if (rt_cache_valid(rth)) {
1902                         dst_hold(&rth->dst);
1903                         return rth;
1904                 }
1905         }
1906
1907 add:
1908         rth = rt_dst_alloc(dev_out,
1909                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1910                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1911                            do_cache);
1912         if (!rth)
1913                 return ERR_PTR(-ENOBUFS);
1914
1915         rth->dst.output = ip_output;
1916
1917         rth->rt_genid = rt_genid(dev_net(dev_out));
1918         rth->rt_flags   = flags;
1919         rth->rt_type    = type;
1920         rth->rt_is_input = 0;
1921         rth->rt_iif     = orig_oif ? : 0;
1922         rth->rt_pmtu    = 0;
1923         rth->rt_gateway = 0;
1924         rth->rt_uses_gateway = 0;
1925         INIT_LIST_HEAD(&rth->rt_uncached);
1926
1927         RT_CACHE_STAT_INC(out_slow_tot);
1928
1929         if (flags & RTCF_LOCAL)
1930                 rth->dst.input = ip_local_deliver;
1931         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1932                 if (flags & RTCF_LOCAL &&
1933                     !(dev_out->flags & IFF_LOOPBACK)) {
1934                         rth->dst.output = ip_mc_output;
1935                         RT_CACHE_STAT_INC(out_slow_mc);
1936                 }
1937 #ifdef CONFIG_IP_MROUTE
1938                 if (type == RTN_MULTICAST) {
1939                         if (IN_DEV_MFORWARD(in_dev) &&
1940                             !ipv4_is_local_multicast(fl4->daddr)) {
1941                                 rth->dst.input = ip_mr_input;
1942                                 rth->dst.output = ip_mc_output;
1943                         }
1944                 }
1945 #endif
1946         }
1947
1948         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1949
1950         return rth;
1951 }
1952
1953 /*
1954  * Major route resolver routine.
1955  */
1956
1957 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1958 {
1959         struct net_device *dev_out = NULL;
1960         __u8 tos = RT_FL_TOS(fl4);
1961         unsigned int flags = 0;
1962         struct fib_result res;
1963         struct rtable *rth;
1964         int orig_oif;
1965
1966         res.tclassid    = 0;
1967         res.fi          = NULL;
1968         res.table       = NULL;
1969
1970         orig_oif = fl4->flowi4_oif;
1971
1972         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1973         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1974         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1975                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1976
1977         rcu_read_lock();
1978         if (fl4->saddr) {
1979                 rth = ERR_PTR(-EINVAL);
1980                 if (ipv4_is_multicast(fl4->saddr) ||
1981                     ipv4_is_lbcast(fl4->saddr) ||
1982                     ipv4_is_zeronet(fl4->saddr))
1983                         goto out;
1984
1985                 /* I removed the check for oif == dev_out->oif here.
1986                    It was wrong for two reasons:
1987                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
1988                       is assigned to multiple interfaces.
1989                    2. Moreover, we are allowed to send packets with the saddr
1990                       of another iface. --ANK
1991                  */
1992
1993                 if (fl4->flowi4_oif == 0 &&
1994                     (ipv4_is_multicast(fl4->daddr) ||
1995                      ipv4_is_lbcast(fl4->daddr))) {
1996                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1997                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1998                         if (dev_out == NULL)
1999                                 goto out;
2000
2001                         /* Special hack: the user can direct multicasts
2002                            and limited broadcasts via the necessary interface
2003                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2004                            This hack is not just for fun; it allows
2005                            vic, vat and friends to work.
2006                            They bind a socket to loopback, set the ttl to zero
2007                            and expect that it will work.
2008                            From the viewpoint of the routing cache they are broken,
2009                            because we are not allowed to build a multicast path
2010                            with a loopback source address (the routing cache
2011                            cannot know that the ttl is zero, so the packet
2012                            will not leave this host and the route is valid).
2013                            Luckily, this hack is a good workaround.
2014                          */
2015
2016                         fl4->flowi4_oif = dev_out->ifindex;
2017                         goto make_route;
2018                 }
2019
2020                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2021                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2022                         if (!__ip_dev_find(net, fl4->saddr, false))
2023                                 goto out;
2024                 }
2025         }
2026
2027
2028         if (fl4->flowi4_oif) {
2029                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2030                 rth = ERR_PTR(-ENODEV);
2031                 if (dev_out == NULL)
2032                         goto out;
2033
2034                 /* RACE: Check return value of inet_select_addr instead. */
2035                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2036                         rth = ERR_PTR(-ENETUNREACH);
2037                         goto out;
2038                 }
2039                 if (ipv4_is_local_multicast(fl4->daddr) ||
2040                     ipv4_is_lbcast(fl4->daddr)) {
2041                         if (!fl4->saddr)
2042                                 fl4->saddr = inet_select_addr(dev_out, 0,
2043                                                               RT_SCOPE_LINK);
2044                         goto make_route;
2045                 }
2046                 if (!fl4->saddr) {
2047                         if (ipv4_is_multicast(fl4->daddr))
2048                                 fl4->saddr = inet_select_addr(dev_out, 0,
2049                                                               fl4->flowi4_scope);
2050                         else if (!fl4->daddr)
2051                                 fl4->saddr = inet_select_addr(dev_out, 0,
2052                                                               RT_SCOPE_HOST);
2053                 }
2054         }
2055
2056         if (!fl4->daddr) {
2057                 fl4->daddr = fl4->saddr;
2058                 if (!fl4->daddr)
2059                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2060                 dev_out = net->loopback_dev;
2061                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2062                 res.type = RTN_LOCAL;
2063                 flags |= RTCF_LOCAL;
2064                 goto make_route;
2065         }
2066
2067         if (fib_lookup(net, fl4, &res)) {
2068                 res.fi = NULL;
2069                 res.table = NULL;
2070                 if (fl4->flowi4_oif) {
2071                         /* Apparently, the routing tables are wrong. Assume
2072                            that the destination is on-link.
2073
2074                            WHY? DW.
2075                            Because we are allowed to send to an iface
2076                            even if it has NO routes and NO assigned
2077                            addresses. When oif is specified, the routing
2078                            tables are looked up with only one purpose:
2079                            to determine whether the destination is gatewayed
2080                            rather than direct. Moreover, if MSG_DONTROUTE is set,
2081                            we send the packet ignoring both routing tables
2082                            and ifaddr state. --ANK
2083
2084
2085                            We could do this even when oif is unknown,
2086                            as IPv6 likely does, but we do not.
2087                          */
2088
2089                         if (fl4->saddr == 0)
2090                                 fl4->saddr = inet_select_addr(dev_out, 0,
2091                                                               RT_SCOPE_LINK);
2092                         res.type = RTN_UNICAST;
2093                         goto make_route;
2094                 }
2095                 rth = ERR_PTR(-ENETUNREACH);
2096                 goto out;
2097         }
2098
2099         if (res.type == RTN_LOCAL) {
2100                 if (!fl4->saddr) {
2101                         if (res.fi->fib_prefsrc)
2102                                 fl4->saddr = res.fi->fib_prefsrc;
2103                         else
2104                                 fl4->saddr = fl4->daddr;
2105                 }
2106                 dev_out = net->loopback_dev;
2107                 fl4->flowi4_oif = dev_out->ifindex;
2108                 flags |= RTCF_LOCAL;
2109                 goto make_route;
2110         }
2111
2112 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2113         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2114                 fib_select_multipath(&res);
2115         else
2116 #endif
2117         if (!res.prefixlen &&
2118             res.table->tb_num_default > 1 &&
2119             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2120                 fib_select_default(&res);
2121
2122         if (!fl4->saddr)
2123                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2124
2125         dev_out = FIB_RES_DEV(res);
2126         fl4->flowi4_oif = dev_out->ifindex;
2127
2128
2129 make_route:
2130         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2131
2132 out:
2133         rcu_read_unlock();
2134         return rth;
2135 }
2136 EXPORT_SYMBOL_GPL(__ip_route_output_key);
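/* Usage sketch, not part of this file: resolve an output route by key
 * only, without xfrm processing, and release it when done. The helper
 * name is hypothetical and the address is from the 192.0.2.0/24
 * documentation range.
 */
static inline int example_output_lookup_sketch(struct net *net)
{
        struct rtable *rt;
        struct flowi4 fl4 = {
                .daddr = htonl(0xc0000201),     /* 192.0.2.1 */
        };

        rt = __ip_route_output_key(net, &fl4);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        /* the lookup has filled in fl4.saddr and fl4.flowi4_oif */
        ip_rt_put(rt);
        return 0;
}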
2137
2138 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2139 {
2140         return NULL;
2141 }
2142
2143 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2144 {
2145         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2146
2147         return mtu ? : dst->dev->mtu;
2148 }
2149
2150 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2151                                           struct sk_buff *skb, u32 mtu)
2152 {
2153 }
2154
2155 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2156                                        struct sk_buff *skb)
2157 {
2158 }
2159
2160 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2161                                           unsigned long old)
2162 {
2163         return NULL;
2164 }
2165
2166 static struct dst_ops ipv4_dst_blackhole_ops = {
2167         .family                 =       AF_INET,
2168         .protocol               =       cpu_to_be16(ETH_P_IP),
2169         .check                  =       ipv4_blackhole_dst_check,
2170         .mtu                    =       ipv4_blackhole_mtu,
2171         .default_advmss         =       ipv4_default_advmss,
2172         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2173         .redirect               =       ipv4_rt_blackhole_redirect,
2174         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2175         .neigh_lookup           =       ipv4_neigh_lookup,
2176 };
2177
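/* Clone an existing route into a "blackhole" copy: the metadata is
 * preserved, but input/output are dst_discard and the blackhole dst_ops
 * above never report PMTU or redirect changes, so a caller still holding
 * the dst keeps a valid reference whose traffic is silently dropped.
 */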
2178 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2179 {
2180         struct rtable *ort = (struct rtable *) dst_orig;
2181         struct rtable *rt;
2182
2183         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2184         if (rt) {
2185                 struct dst_entry *new = &rt->dst;
2186
2187                 new->__use = 1;
2188                 new->input = dst_discard;
2189                 new->output = dst_discard;
2190
2191                 new->dev = ort->dst.dev;
2192                 if (new->dev)
2193                         dev_hold(new->dev);
2194
2195                 rt->rt_is_input = ort->rt_is_input;
2196                 rt->rt_iif = ort->rt_iif;
2197                 rt->rt_pmtu = ort->rt_pmtu;
2198
2199                 rt->rt_genid = rt_genid(net);
2200                 rt->rt_flags = ort->rt_flags;
2201                 rt->rt_type = ort->rt_type;
2202                 rt->rt_gateway = ort->rt_gateway;
2203                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2204
2205                 INIT_LIST_HEAD(&rt->rt_uncached);
2206
2207                 dst_free(new);
2208         }
2209
2210         dst_release(dst_orig);
2211
2212         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2213 }
2214
2215 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2216                                     struct sock *sk)
2217 {
2218         struct rtable *rt = __ip_route_output_key(net, flp4);
2219
2220         if (IS_ERR(rt))
2221                 return rt;
2222
2223         if (flp4->flowi4_proto)
2224                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2225                                                    flowi4_to_flowi(flp4),
2226                                                    sk, 0);
2227
2228         return rt;
2229 }
2230 EXPORT_SYMBOL_GPL(ip_route_output_flow);
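/* Note: this differs from __ip_route_output_key() only in that a flow
 * with flowi4_proto set is additionally passed through xfrm_lookup(),
 * giving IPsec policy a chance to transform or replace the route.
 */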
2231
2232 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2233                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2234                         u32 seq, int event, int nowait, unsigned int flags)
2235 {
2236         struct rtable *rt = skb_rtable(skb);
2237         struct rtmsg *r;
2238         struct nlmsghdr *nlh;
2239         unsigned long expires = 0;
2240         u32 error;
2241         u32 metrics[RTAX_MAX];
2242
2243         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2244         if (nlh == NULL)
2245                 return -EMSGSIZE;
2246
2247         r = nlmsg_data(nlh);
2248         r->rtm_family    = AF_INET;
2249         r->rtm_dst_len  = 32;
2250         r->rtm_src_len  = 0;
2251         r->rtm_tos      = fl4->flowi4_tos;
2252         r->rtm_table    = RT_TABLE_MAIN;
2253         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2254                 goto nla_put_failure;
2255         r->rtm_type     = rt->rt_type;
2256         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2257         r->rtm_protocol = RTPROT_UNSPEC;
2258         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2259         if (rt->rt_flags & RTCF_NOTIFY)
2260                 r->rtm_flags |= RTM_F_NOTIFY;
2261         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2262                 r->rtm_flags |= RTCF_DOREDIRECT;
2263
2264         if (nla_put_be32(skb, RTA_DST, dst))
2265                 goto nla_put_failure;
2266         if (src) {
2267                 r->rtm_src_len = 32;
2268                 if (nla_put_be32(skb, RTA_SRC, src))
2269                         goto nla_put_failure;
2270         }
2271         if (rt->dst.dev &&
2272             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2273                 goto nla_put_failure;
2274 #ifdef CONFIG_IP_ROUTE_CLASSID
2275         if (rt->dst.tclassid &&
2276             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2277                 goto nla_put_failure;
2278 #endif
2279         if (!rt_is_input_route(rt) &&
2280             fl4->saddr != src) {
2281                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2282                         goto nla_put_failure;
2283         }
2284         if (rt->rt_uses_gateway &&
2285             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2286                 goto nla_put_failure;
2287
2288         expires = rt->dst.expires;
2289         if (expires) {
2290                 unsigned long now = jiffies;
2291
2292                 if (time_before(now, expires))
2293                         expires -= now;
2294                 else
2295                         expires = 0;
2296         }
2297
2298         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2299         if (rt->rt_pmtu && expires)
2300                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2301         if (rtnetlink_put_metrics(skb, metrics) < 0)
2302                 goto nla_put_failure;
2303
2304         if (fl4->flowi4_mark &&
2305             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2306                 goto nla_put_failure;
2307
2308         error = rt->dst.error;
2309
2310         if (rt_is_input_route(rt)) {
2311 #ifdef CONFIG_IP_MROUTE
2312                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2313                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2314                         int err = ipmr_get_route(net, skb,
2315                                                  fl4->saddr, fl4->daddr,
2316                                                  r, nowait);
2317                         if (err <= 0) {
2318                                 if (!nowait) {
2319                                         if (err == 0)
2320                                                 return 0;
2321                                         goto nla_put_failure;
2322                                 } else {
2323                                         if (err == -EMSGSIZE)
2324                                                 goto nla_put_failure;
2325                                         error = err;
2326                                 }
2327                         }
2328                 } else
2329 #endif
2330                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2331                                 goto nla_put_failure;
2332         }
2333
2334         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2335                 goto nla_put_failure;
2336
2337         return nlmsg_end(skb, nlh);
2338
2339 nla_put_failure:
2340         nlmsg_cancel(skb, nlh);
2341         return -EMSGSIZE;
2342 }
2343
2344 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2345 {
2346         struct net *net = sock_net(in_skb->sk);
2347         struct rtmsg *rtm;
2348         struct nlattr *tb[RTA_MAX+1];
2349         struct rtable *rt = NULL;
2350         struct flowi4 fl4;
2351         __be32 dst = 0;
2352         __be32 src = 0;
2353         u32 iif;
2354         int err;
2355         int mark;
2356         struct sk_buff *skb;
2357
2358         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2359         if (err < 0)
2360                 goto errout;
2361
2362         rtm = nlmsg_data(nlh);
2363
2364         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2365         if (skb == NULL) {
2366                 err = -ENOBUFS;
2367                 goto errout;
2368         }
2369
2370         /* Reserve room for dummy headers; this skb can pass
2371            through a good chunk of the routing engine.
2372          */
2373         skb_reset_mac_header(skb);
2374         skb_reset_network_header(skb);
2375
2376         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2377         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2378         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2379
2380         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2381         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2382         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2383         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2384
2385         memset(&fl4, 0, sizeof(fl4));
2386         fl4.daddr = dst;
2387         fl4.saddr = src;
2388         fl4.flowi4_tos = rtm->rtm_tos;
2389         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2390         fl4.flowi4_mark = mark;
2391
2392         if (iif) {
2393                 struct net_device *dev;
2394
2395                 dev = __dev_get_by_index(net, iif);
2396                 if (dev == NULL) {
2397                         err = -ENODEV;
2398                         goto errout_free;
2399                 }
2400
2401                 skb->protocol   = htons(ETH_P_IP);
2402                 skb->dev        = dev;
2403                 skb->mark       = mark;
2404                 local_bh_disable();
2405                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2406                 local_bh_enable();
2407
2408                 rt = skb_rtable(skb);
2409                 if (err == 0 && rt->dst.error)
2410                         err = -rt->dst.error;
2411         } else {
2412                 rt = ip_route_output_key(net, &fl4);
2413
2414                 err = 0;
2415                 if (IS_ERR(rt))
2416                         err = PTR_ERR(rt);
2417         }
2418
2419         if (err)
2420                 goto errout_free;
2421
2422         skb_dst_set(skb, &rt->dst);
2423         if (rtm->rtm_flags & RTM_F_NOTIFY)
2424                 rt->rt_flags |= RTCF_NOTIFY;
2425
2426         err = rt_fill_info(net, dst, src, &fl4, skb,
2427                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2428                            RTM_NEWROUTE, 0, 0);
2429         if (err <= 0)
2430                 goto errout_free;
2431
2432         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2433 errout:
2434         return err;
2435
2436 errout_free:
2437         kfree_skb(skb);
2438         goto errout;
2439 }
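/* Userspace reaches inet_rtm_getroute() via an RTM_GETROUTE netlink
 * request; "ip route get 192.0.2.1" (hypothetical address) is the usual
 * way to exercise this path and dump the rt_fill_info() reply.
 */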
2440
2441 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2442 {
2443         return skb->len;
2444 }
2445
2446 void ip_rt_multicast_event(struct in_device *in_dev)
2447 {
2448         rt_cache_flush(dev_net(in_dev->dev));
2449 }
2450
2451 #ifdef CONFIG_SYSCTL
2452 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2453 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2454 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2455 static int ip_rt_gc_elasticity __read_mostly    = 8;
2456
2457 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2458                                         void __user *buffer,
2459                                         size_t *lenp, loff_t *ppos)
2460 {
2461         if (write) {
2462                 rt_cache_flush((struct net *)__ctl->extra1);
2463                 return 0;
2464         }
2465
2466         return -EINVAL;
2467 }
2468
2469 static ctl_table ipv4_route_table[] = {
2470         {
2471                 .procname       = "gc_thresh",
2472                 .data           = &ipv4_dst_ops.gc_thresh,
2473                 .maxlen         = sizeof(int),
2474                 .mode           = 0644,
2475                 .proc_handler   = proc_dointvec,
2476         },
2477         {
2478                 .procname       = "max_size",
2479                 .data           = &ip_rt_max_size,
2480                 .maxlen         = sizeof(int),
2481                 .mode           = 0644,
2482                 .proc_handler   = proc_dointvec,
2483         },
2484         {
2485                 /*  Deprecated. Use gc_min_interval_ms */
2486
2487                 .procname       = "gc_min_interval",
2488                 .data           = &ip_rt_gc_min_interval,
2489                 .maxlen         = sizeof(int),
2490                 .mode           = 0644,
2491                 .proc_handler   = proc_dointvec_jiffies,
2492         },
2493         {
2494                 .procname       = "gc_min_interval_ms",
2495                 .data           = &ip_rt_gc_min_interval,
2496                 .maxlen         = sizeof(int),
2497                 .mode           = 0644,
2498                 .proc_handler   = proc_dointvec_ms_jiffies,
2499         },
2500         {
2501                 .procname       = "gc_timeout",
2502                 .data           = &ip_rt_gc_timeout,
2503                 .maxlen         = sizeof(int),
2504                 .mode           = 0644,
2505                 .proc_handler   = proc_dointvec_jiffies,
2506         },
2507         {
2508                 .procname       = "gc_interval",
2509                 .data           = &ip_rt_gc_interval,
2510                 .maxlen         = sizeof(int),
2511                 .mode           = 0644,
2512                 .proc_handler   = proc_dointvec_jiffies,
2513         },
2514         {
2515                 .procname       = "redirect_load",
2516                 .data           = &ip_rt_redirect_load,
2517                 .maxlen         = sizeof(int),
2518                 .mode           = 0644,
2519                 .proc_handler   = proc_dointvec,
2520         },
2521         {
2522                 .procname       = "redirect_number",
2523                 .data           = &ip_rt_redirect_number,
2524                 .maxlen         = sizeof(int),
2525                 .mode           = 0644,
2526                 .proc_handler   = proc_dointvec,
2527         },
2528         {
2529                 .procname       = "redirect_silence",
2530                 .data           = &ip_rt_redirect_silence,
2531                 .maxlen         = sizeof(int),
2532                 .mode           = 0644,
2533                 .proc_handler   = proc_dointvec,
2534         },
2535         {
2536                 .procname       = "error_cost",
2537                 .data           = &ip_rt_error_cost,
2538                 .maxlen         = sizeof(int),
2539                 .mode           = 0644,
2540                 .proc_handler   = proc_dointvec,
2541         },
2542         {
2543                 .procname       = "error_burst",
2544                 .data           = &ip_rt_error_burst,
2545                 .maxlen         = sizeof(int),
2546                 .mode           = 0644,
2547                 .proc_handler   = proc_dointvec,
2548         },
2549         {
2550                 .procname       = "gc_elasticity",
2551                 .data           = &ip_rt_gc_elasticity,
2552                 .maxlen         = sizeof(int),
2553                 .mode           = 0644,
2554                 .proc_handler   = proc_dointvec,
2555         },
2556         {
2557                 .procname       = "mtu_expires",
2558                 .data           = &ip_rt_mtu_expires,
2559                 .maxlen         = sizeof(int),
2560                 .mode           = 0644,
2561                 .proc_handler   = proc_dointvec_jiffies,
2562         },
2563         {
2564                 .procname       = "min_pmtu",
2565                 .data           = &ip_rt_min_pmtu,
2566                 .maxlen         = sizeof(int),
2567                 .mode           = 0644,
2568                 .proc_handler   = proc_dointvec,
2569         },
2570         {
2571                 .procname       = "min_adv_mss",
2572                 .data           = &ip_rt_min_advmss,
2573                 .maxlen         = sizeof(int),
2574                 .mode           = 0644,
2575                 .proc_handler   = proc_dointvec,
2576         },
2577         { }
2578 };
2579
2580 static struct ctl_table ipv4_route_flush_table[] = {
2581         {
2582                 .procname       = "flush",
2583                 .maxlen         = sizeof(int),
2584                 .mode           = 0200,
2585                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2586         },
2587         { },
2588 };
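/* Once the table above is registered under "net/ipv4/route", any write
 * to the (write-only, mode 0200) flush entry invalidates the cache, e.g.:
 *
 *      echo 1 > /proc/sys/net/ipv4/route/flush
 */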
2589
2590 static __net_init int sysctl_route_net_init(struct net *net)
2591 {
2592         struct ctl_table *tbl;
2593
2594         tbl = ipv4_route_flush_table;
2595         if (!net_eq(net, &init_net)) {
2596                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2597                 if (tbl == NULL)
2598                         goto err_dup;
2599
2600                 /* Don't export sysctls to unprivileged users */
2601                 if (net->user_ns != &init_user_ns)
2602                         tbl[0].procname = NULL;
2603         }
2604         tbl[0].extra1 = net;
2605
2606         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2607         if (net->ipv4.route_hdr == NULL)
2608                 goto err_reg;
2609         return 0;
2610
2611 err_reg:
2612         if (tbl != ipv4_route_flush_table)
2613                 kfree(tbl);
2614 err_dup:
2615         return -ENOMEM;
2616 }
2617
2618 static __net_exit void sysctl_route_net_exit(struct net *net)
2619 {
2620         struct ctl_table *tbl;
2621
2622         tbl = net->ipv4.route_hdr->ctl_table_arg;
2623         unregister_net_sysctl_table(net->ipv4.route_hdr);
2624         BUG_ON(tbl == ipv4_route_flush_table);
2625         kfree(tbl);
2626 }
2627
2628 static __net_initdata struct pernet_operations sysctl_route_ops = {
2629         .init = sysctl_route_net_init,
2630         .exit = sysctl_route_net_exit,
2631 };
2632 #endif
2633
2634 static __net_init int rt_genid_init(struct net *net)
2635 {
2636         atomic_set(&net->rt_genid, 0);
2637         get_random_bytes(&net->ipv4.dev_addr_genid,
2638                          sizeof(net->ipv4.dev_addr_genid));
2639         return 0;
2640 }
2641
2642 static __net_initdata struct pernet_operations rt_genid_ops = {
2643         .init = rt_genid_init,
2644 };
2645
2646 static int __net_init ipv4_inetpeer_init(struct net *net)
2647 {
2648         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2649
2650         if (!bp)
2651                 return -ENOMEM;
2652         inet_peer_base_init(bp);
2653         net->ipv4.peers = bp;
2654         return 0;
2655 }
2656
2657 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2658 {
2659         struct inet_peer_base *bp = net->ipv4.peers;
2660
2661         net->ipv4.peers = NULL;
2662         inetpeer_invalidate_tree(bp);
2663         kfree(bp);
2664 }
2665
2666 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2667         .init   =       ipv4_inetpeer_init,
2668         .exit   =       ipv4_inetpeer_exit,
2669 };
2670
2671 #ifdef CONFIG_IP_ROUTE_CLASSID
2672 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2673 #endif /* CONFIG_IP_ROUTE_CLASSID */
2674
2675 int __init ip_rt_init(void)
2676 {
2677         int rc = 0;
2678
2679         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2680         if (!ip_idents)
2681                 panic("IP: failed to allocate ip_idents\n");
2682
2683         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2684
2685 #ifdef CONFIG_IP_ROUTE_CLASSID
2686         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2687         if (!ip_rt_acct)
2688                 panic("IP: failed to allocate ip_rt_acct\n");
2689 #endif
2690
2691         ipv4_dst_ops.kmem_cachep =
2692                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2693                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2694
2695         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2696
2697         if (dst_entries_init(&ipv4_dst_ops) < 0)
2698                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2699
2700         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2701                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2702
2703         ipv4_dst_ops.gc_thresh = ~0;
2704         ip_rt_max_size = INT_MAX;
2705
2706         devinet_init();
2707         ip_fib_init();
2708
2709         if (ip_rt_proc_init())
2710                 pr_err("Unable to create route proc files\n");
2711 #ifdef CONFIG_XFRM
2712         xfrm_init();
2713         xfrm4_init();
2714 #endif
2715         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2716
2717 #ifdef CONFIG_SYSCTL
2718         register_pernet_subsys(&sysctl_route_ops);
2719 #endif
2720         register_pernet_subsys(&rt_genid_ops);
2721         register_pernet_subsys(&ipv4_inetpeer_ops);
2722         return rc;
2723 }
2724
2725 #ifdef CONFIG_SYSCTL
2726 /*
2727  * We really need to sanitize the damn ipv4 init order, then all
2728  * this nonsense will go away.
2729  */
2730 void __init ip_static_sysctl_init(void)
2731 {
2732         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2733 }
2734 #endif