net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD,
 *                                      though our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;
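
/* These defaults back the net.ipv4.route.* sysctls (redirect_number,
 * redirect_load, redirect_silence, error_cost, error_burst, mtu_expires,
 * min_pmtu, min_adv_mss); see ipv4_route_table further down in this file.
 * The redirect_* values are jiffy counts used by the exponential backoff
 * in ip_rt_send_redirect(), and error_cost/error_burst form the token
 * bucket in ip_error().
 */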

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
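
/* rt_tos2priority() indexes this table with IPTOS_TOS(tos) >> 1, i.e. the
 * four TOS bits of the IPv4 TOS byte, to derive a default skb->priority.
 * For example, IPTOS_LOWDELAY (0x10) yields index 8, TC_PRIO_INTERACTIVE.
 * The odd entries correspond to the historical "minimize monetary cost"
 * bit (a position later reused by ECN); with ECN_OR_COST defined as above
 * they map to the same priority band as their even neighbours.
 */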

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}
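
/* /proc/net/rt_cache deliberately shows nothing but the header line: the
 * IPv4 routing cache was removed in Linux 3.6, and the file is kept only
 * so that existing userspace parsers continue to work.
 */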

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}
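
/* The seq_file position encodes "cpu + 1": position 0 is reserved for
 * SEQ_START_TOKEN (the header line), so *pos - 1 is the first candidate
 * CPU, and *pos is set to cpu + 1 after each per-cpu entry is returned.
 */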

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

atomic_t *ip_idents __read_mostly;
EXPORT_SYMBOL(ip_idents);

void __ip_select_ident(struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
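
/* IP ID selection: the destination address is hashed with a lazily
 * initialized random key to pick one of the ip_idents counters, and
 * ip_idents_reserve() returns a batch of "segs" consecutive IDs from it.
 * Callers pass segs > 1 for GSO packets so the individual segments end up
 * with consecutive IDs; only the first one is written into this header.
 */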

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for this nexthop
                 * as stale, so that anyone caching them rechecks whether the
                 * exception applies.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}
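
/* Next-hop exceptions (fnhe) record per-destination state learned from
 * ICMP, i.e. a redirected gateway and/or a discovered PMTU with its expiry,
 * layered on top of the FIB nexthop.  Lookups go through find_exception(),
 * and routes bound to an exception get their gateway/PMTU overridden by
 * fill_route_from_fnhe().  Buckets deeper than FNHE_RECLAIM_DEPTH recycle
 * their oldest entry (by fnhe_stamp) instead of growing further.
 */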

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}
/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the time of the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
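
/* ip_error() rate-limits the ICMP errors it generates with a token bucket
 * per source peer: rate_tokens accumulates one token per jiffy since
 * rate_last, capped at ip_rt_error_burst (5 * HZ), and each ICMP sent
 * costs ip_rt_error_cost (HZ) - i.e. a sustained rate of roughly one
 * error per second with a burst of five.
 */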

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}
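
/* A learned PMTU is clamped upward to ip_rt_min_pmtu, ignored if it
 * exceeds the device MTU, and stored as a nexthop exception that expires
 * after ip_rt_mtu_expires (10 minutes).  If the same value is already
 * recorded with more than half of its lifetime left, the update is
 * skipped to avoid churning the exception entry.
 */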

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *dst;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        rt = (struct rtable *) __sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !rt) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!__sk_dst_check(sk, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        dst = dst_check(&rt->dst, 0);
        if (!dst) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                __sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}
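
/* MTU precedence for a route: a still-valid learned PMTU (rt_pmtu) wins,
 * then an explicit RTAX_MTU metric, then the device MTU as a fallback.
 * In the fallback case a locked MTU metric on a gateway route is clamped
 * to the classic 576-byte default, and the result never exceeds
 * IP_MAX_MTU.
 */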

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
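
/* Lockless route caching: input routes go into the nexthop's single
 * nh_rth_input slot, output routes into a per-cpu nh_pcpu_rth_output
 * slot.  cmpxchg() installs the new route only if the slot still holds
 * the value we read, and the displaced route is freed via RCU; on a lost
 * race we return false and the caller falls back to DST_NOCACHE handling.
 */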

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}
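
/* Every dst is born with DST_OBSOLETE_FORCE_CHK so that ipv4_dst_check()
 * is consulted on each use (see the comment there).  Routes that will not
 * be cached carry DST_NOCACHE (plus DST_HOST), while DST_NOPOLICY and
 * DST_NOXFRM skip IPsec policy lookups for the route.
 */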

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input= 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input= ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC1812 recommendation, if source is martian,
                 *      the only hint is MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct fib_nh_exception *fnhe;
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag = 0;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy arp. DNAT routes are always valid.
                 *
                 * The proxy arp feature has been extended to allow ARP
                 * replies back out the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
1544                 if (out_dev == in_dev &&
1545                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1546                         err = -EINVAL;
1547                         goto cleanup;
1548                 }
1549         }
1550
1551         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1552         if (do_cache) {
1553                 if (fnhe != NULL)
1554                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1555                 else
1556                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1557
1558                 if (rt_cache_valid(rth)) {
1559                         skb_dst_set_noref(skb, &rth->dst);
1560                         goto out;
1561                 }
1562         }
1563
1564         rth = rt_dst_alloc(out_dev->dev,
1565                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1566                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1567         if (!rth) {
1568                 err = -ENOBUFS;
1569                 goto cleanup;
1570         }
1571
1572         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1573         rth->rt_flags = flags;
1574         rth->rt_type = res->type;
1575         rth->rt_is_input = 1;
1576         rth->rt_iif     = 0;
1577         rth->rt_pmtu    = 0;
1578         rth->rt_gateway = 0;
1579         rth->rt_uses_gateway = 0;
1580         INIT_LIST_HEAD(&rth->rt_uncached);
1581         RT_CACHE_STAT_INC(in_slow_tot);
1582
1583         rth->dst.input = ip_forward;
1584         rth->dst.output = ip_output;
1585
1586         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1587         skb_dst_set(skb, &rth->dst);
1588 out:
1589         err = 0;
1590  cleanup:
1591         return err;
1592 }
1593
1594 static int ip_mkroute_input(struct sk_buff *skb,
1595                             struct fib_result *res,
1596                             const struct flowi4 *fl4,
1597                             struct in_device *in_dev,
1598                             __be32 daddr, __be32 saddr, u32 tos)
1599 {
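        /* With CONFIG_IP_ROUTE_MULTIPATH, pick one of the route's next hops
         * before building the cache entry below.
         */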
1600 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1601         if (res->fi && res->fi->fib_nhs > 1)
1602                 fib_select_multipath(res);
1603 #endif
1604
1605         /* create a routing cache entry */
1606         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1607 }
1608
1609 /*
1610  *      NOTE. We drop all packets that have a local source
1611  *      address, because every properly looped-back packet must
1612  *      already have the correct destination attached by the output routine.
1613  *
1614  *      This approach solves two big problems:
1615  *      1. Non-simplex devices are handled properly.
1616  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1617  *      called with rcu_read_lock()
1618  */
1619
1620 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1621                                u8 tos, struct net_device *dev)
1622 {
1623         struct fib_result res;
1624         struct in_device *in_dev = __in_dev_get_rcu(dev);
1625         struct flowi4   fl4;
1626         unsigned int    flags = 0;
1627         u32             itag = 0;
1628         struct rtable   *rth;
1629         int             err = -EINVAL;
1630         struct net    *net = dev_net(dev);
1631         bool do_cache;
1632
1633         /* IP on this device is disabled. */
1634
1635         if (!in_dev)
1636                 goto out;
1637
1638         /* Check for the weirdest martians, which cannot be detected
1639            by fib_lookup.
1640          */
1641
1642         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1643                 goto martian_source;
1644
1645         res.fi = NULL;
1646         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1647                 goto brd_input;
1648
1649         /* Accept zero addresses only for limited broadcast;
1650          * I do not even know whether to fix this or not. Waiting for complaints :-)
1651          */
1652         if (ipv4_is_zeronet(saddr))
1653                 goto martian_source;
1654
1655         if (ipv4_is_zeronet(daddr))
1656                 goto martian_destination;
1657
1658         /* The following code avoids calling IN_DEV_NET_ROUTE_LOCALNET() more
1659          * than once, and only when daddr and/or saddr is a loopback address.
1660          */
1661         if (ipv4_is_loopback(daddr)) {
1662                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1663                         goto martian_destination;
1664         } else if (ipv4_is_loopback(saddr)) {
1665                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1666                         goto martian_source;
1667         }
1668
1669         /*
1670          *      Now we are ready to route the packet.
1671          */
1672         fl4.flowi4_oif = 0;
1673         fl4.flowi4_iif = dev->ifindex;
1674         fl4.flowi4_mark = skb->mark;
1675         fl4.flowi4_tos = tos;
1676         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1677         fl4.daddr = daddr;
1678         fl4.saddr = saddr;
1679         err = fib_lookup(net, &fl4, &res);
1680         if (err != 0) {
1681                 if (!IN_DEV_FORWARD(in_dev))
1682                         err = -EHOSTUNREACH;
1683                 goto no_route;
1684         }
1685
1686         if (res.type == RTN_BROADCAST)
1687                 goto brd_input;
1688
1689         if (res.type == RTN_LOCAL) {
1690                 err = fib_validate_source(skb, saddr, daddr, tos,
1691                                           0, dev, in_dev, &itag);
1692                 if (err < 0)
1693                         goto martian_source_keep_err;
1694                 goto local_input;
1695         }
1696
1697         if (!IN_DEV_FORWARD(in_dev)) {
1698                 err = -EHOSTUNREACH;
1699                 goto no_route;
1700         }
1701         if (res.type != RTN_UNICAST)
1702                 goto martian_destination;
1703
1704         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1705 out:    return err;
1706
1707 brd_input:
1708         if (skb->protocol != htons(ETH_P_IP))
1709                 goto e_inval;
1710
1711         if (!ipv4_is_zeronet(saddr)) {
1712                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1713                                           in_dev, &itag);
1714                 if (err < 0)
1715                         goto martian_source_keep_err;
1716         }
1717         flags |= RTCF_BROADCAST;
1718         res.type = RTN_BROADCAST;
1719         RT_CACHE_STAT_INC(in_brd);
1720
1721 local_input:
1722         do_cache = false;
1723         if (res.fi) {
1724                 if (!itag) {
1725                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1726                         if (rt_cache_valid(rth)) {
1727                                 skb_dst_set_noref(skb, &rth->dst);
1728                                 err = 0;
1729                                 goto out;
1730                         }
1731                         do_cache = true;
1732                 }
1733         }
1734
1735         rth = rt_dst_alloc(net->loopback_dev,
1736                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1737         if (!rth)
1738                 goto e_nobufs;
1739
1740         rth->dst.input = ip_local_deliver;
1741         rth->dst.output = ip_rt_bug;
1742 #ifdef CONFIG_IP_ROUTE_CLASSID
1743         rth->dst.tclassid = itag;
1744 #endif
1745
1746         rth->rt_genid = rt_genid_ipv4(net);
1747         rth->rt_flags   = flags|RTCF_LOCAL;
1748         rth->rt_type    = res.type;
1749         rth->rt_is_input = 1;
1750         rth->rt_iif     = 0;
1751         rth->rt_pmtu    = 0;
1752         rth->rt_gateway = 0;
1753         rth->rt_uses_gateway = 0;
1754         INIT_LIST_HEAD(&rth->rt_uncached);
1755         RT_CACHE_STAT_INC(in_slow_tot);
1756         if (res.type == RTN_UNREACHABLE) {
1757                 rth->dst.input = ip_error;
1758                 rth->dst.error = -err;
1759                 rth->rt_flags   &= ~RTCF_LOCAL;
1760         }
1761         if (do_cache) {
1762                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1763                         rth->dst.flags |= DST_NOCACHE;
1764                         rt_add_uncached_list(rth);
1765                 }
1766         }
1767         skb_dst_set(skb, &rth->dst);
1768         err = 0;
1769         goto out;
1770
1771 no_route:
1772         RT_CACHE_STAT_INC(in_no_route);
1773         res.type = RTN_UNREACHABLE;
1774         if (err == -ESRCH)
1775                 err = -ENETUNREACH;
1776         goto local_input;
1777
1778         /*
1779          *      Do not cache martian addresses: they should be logged (RFC1812)
1780          */
1781 martian_destination:
1782         RT_CACHE_STAT_INC(in_martian_dst);
1783 #ifdef CONFIG_IP_ROUTE_VERBOSE
1784         if (IN_DEV_LOG_MARTIANS(in_dev))
1785                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1786                                      &daddr, &saddr, dev->name);
1787 #endif
1788
1789 e_inval:
1790         err = -EINVAL;
1791         goto out;
1792
1793 e_nobufs:
1794         err = -ENOBUFS;
1795         goto out;
1796
1797 martian_source:
1798         err = -EINVAL;
1799 martian_source_keep_err:
1800         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1801         goto out;
1802 }
1803
1804 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1805                          u8 tos, struct net_device *dev)
1806 {
1807         int res;
1808
1809         rcu_read_lock();
1810
1811         /* Multicast recognition logic was moved from the route cache to here.
1812            The problem was that too many Ethernet cards have broken/missing
1813            hardware multicast filters :-( As a result, a host on a multicast
1814            network acquires a lot of useless route cache entries, e.g. for
1815            SDR messages from all over the world. Now we try to get rid of them.
1816            Really, provided the software IP multicast filter is organized
1817            reasonably (at least, hashed), this does not result in a slowdown
1818            compared with route cache reject entries.
1819            Note that multicast routers are not affected, because a
1820            route cache entry is created for them eventually.
1821          */
1822         if (ipv4_is_multicast(daddr)) {
1823                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1824
1825                 if (in_dev) {
1826                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1827                                                   ip_hdr(skb)->protocol);
1828                         if (our
1829 #ifdef CONFIG_IP_MROUTE
1830                                 ||
1831                             (!ipv4_is_local_multicast(daddr) &&
1832                              IN_DEV_MFORWARD(in_dev))
1833 #endif
1834                            ) {
1835                                 int res = ip_route_input_mc(skb, daddr, saddr,
1836                                                             tos, dev, our);
1837                                 rcu_read_unlock();
1838                                 return res;
1839                         }
1840                 }
1841                 rcu_read_unlock();
1842                 return -EINVAL;
1843         }
1844         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1845         rcu_read_unlock();
1846         return res;
1847 }
1848 EXPORT_SYMBOL(ip_route_input_noref);
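
/*
 * Example: a minimal, hypothetical sketch (not used anywhere in this file)
 * of how an input-path caller such as ip_rcv_finish() typically resolves
 * the route for a freshly received packet.  ip_route_input_noref() takes
 * rcu_read_lock() itself and attaches the dst to the skb without holding
 * a reference on it.
 */
static inline int example_route_incoming_skb(struct sk_buff *skb)
{
        const struct iphdr *iph = ip_hdr(skb);

        return ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                    iph->tos, skb->dev);
}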
1849
1850 /* called with rcu_read_lock() */
1851 static struct rtable *__mkroute_output(const struct fib_result *res,
1852                                        const struct flowi4 *fl4, int orig_oif,
1853                                        struct net_device *dev_out,
1854                                        unsigned int flags)
1855 {
1856         struct fib_info *fi = res->fi;
1857         struct fib_nh_exception *fnhe;
1858         struct in_device *in_dev;
1859         u16 type = res->type;
1860         struct rtable *rth;
1861         bool do_cache;
1862
1863         in_dev = __in_dev_get_rcu(dev_out);
1864         if (!in_dev)
1865                 return ERR_PTR(-EINVAL);
1866
1867         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1868                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1869                         return ERR_PTR(-EINVAL);
1870
1871         if (ipv4_is_lbcast(fl4->daddr))
1872                 type = RTN_BROADCAST;
1873         else if (ipv4_is_multicast(fl4->daddr))
1874                 type = RTN_MULTICAST;
1875         else if (ipv4_is_zeronet(fl4->daddr))
1876                 return ERR_PTR(-EINVAL);
1877
1878         if (dev_out->flags & IFF_LOOPBACK)
1879                 flags |= RTCF_LOCAL;
1880
1881         do_cache = true;
1882         if (type == RTN_BROADCAST) {
1883                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1884                 fi = NULL;
1885         } else if (type == RTN_MULTICAST) {
1886                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1887                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1888                                      fl4->flowi4_proto))
1889                         flags &= ~RTCF_LOCAL;
1890                 else
1891                         do_cache = false;
1892                 /* If a multicast route does not exist, use
1893                  * the default one, but do not use a gateway in this case.
1894                  * Yes, it is a hack.
1895                  */
1896                 if (fi && res->prefixlen < 4)
1897                         fi = NULL;
1898         }
1899
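        /* Routes without a fib_info (e.g. broadcast) have no nexthop to
         * hang a cached dst on, so caching is skipped for them below.
         */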
1900         fnhe = NULL;
1901         do_cache &= fi != NULL;
1902         if (do_cache) {
1903                 struct rtable __rcu **prth;
1904                 struct fib_nh *nh = &FIB_RES_NH(*res);
1905
1906                 fnhe = find_exception(nh, fl4->daddr);
1907                 if (fnhe)
1908                         prth = &fnhe->fnhe_rth_output;
1909                 else {
1910                         if (unlikely(fl4->flowi4_flags &
1911                                      FLOWI_FLAG_KNOWN_NH &&
1912                                      !(nh->nh_gw &&
1913                                        nh->nh_scope == RT_SCOPE_LINK))) {
1914                                 do_cache = false;
1915                                 goto add;
1916                         }
1917                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1918                 }
1919                 rth = rcu_dereference(*prth);
1920                 if (rt_cache_valid(rth)) {
1921                         dst_hold(&rth->dst);
1922                         return rth;
1923                 }
1924         }
1925
1926 add:
1927         rth = rt_dst_alloc(dev_out,
1928                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1929                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1930                            do_cache);
1931         if (!rth)
1932                 return ERR_PTR(-ENOBUFS);
1933
1934         rth->dst.output = ip_output;
1935
1936         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1937         rth->rt_flags   = flags;
1938         rth->rt_type    = type;
1939         rth->rt_is_input = 0;
1940         rth->rt_iif     = orig_oif ? : 0;
1941         rth->rt_pmtu    = 0;
1942         rth->rt_gateway = 0;
1943         rth->rt_uses_gateway = 0;
1944         INIT_LIST_HEAD(&rth->rt_uncached);
1945
1946         RT_CACHE_STAT_INC(out_slow_tot);
1947
1948         if (flags & RTCF_LOCAL)
1949                 rth->dst.input = ip_local_deliver;
1950         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1951                 if (flags & RTCF_LOCAL &&
1952                     !(dev_out->flags & IFF_LOOPBACK)) {
1953                         rth->dst.output = ip_mc_output;
1954                         RT_CACHE_STAT_INC(out_slow_mc);
1955                 }
1956 #ifdef CONFIG_IP_MROUTE
1957                 if (type == RTN_MULTICAST) {
1958                         if (IN_DEV_MFORWARD(in_dev) &&
1959                             !ipv4_is_local_multicast(fl4->daddr)) {
1960                                 rth->dst.input = ip_mr_input;
1961                                 rth->dst.output = ip_mc_output;
1962                         }
1963                 }
1964 #endif
1965         }
1966
1967         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1968
1969         return rth;
1970 }
1971
1972 /*
1973  * Major route resolver routine.
1974  */
1975
1976 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1977 {
1978         struct net_device *dev_out = NULL;
1979         __u8 tos = RT_FL_TOS(fl4);
1980         unsigned int flags = 0;
1981         struct fib_result res;
1982         struct rtable *rth;
1983         int orig_oif;
1984
1985         res.tclassid    = 0;
1986         res.fi          = NULL;
1987         res.table       = NULL;
1988
1989         orig_oif = fl4->flowi4_oif;
1990
1991         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1992         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1993         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1994                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1995
1996         rcu_read_lock();
1997         if (fl4->saddr) {
1998                 rth = ERR_PTR(-EINVAL);
1999                 if (ipv4_is_multicast(fl4->saddr) ||
2000                     ipv4_is_lbcast(fl4->saddr) ||
2001                     ipv4_is_zeronet(fl4->saddr))
2002                         goto out;
2003
2004                 /* I removed the check for oif == dev_out->oif here.
2005                    It was wrong for two reasons:
2006                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2007                       is assigned to multiple interfaces.
2008                    2. Moreover, we are allowed to send packets with the saddr
2009                       of another iface. --ANK
2010                  */
2011
2012                 if (fl4->flowi4_oif == 0 &&
2013                     (ipv4_is_multicast(fl4->daddr) ||
2014                      ipv4_is_lbcast(fl4->daddr))) {
2015                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2016                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2017                         if (dev_out == NULL)
2018                                 goto out;
2019
2020                         /* Special hack: the user can direct multicasts
2021                            and limited broadcast via the necessary interface
2022                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2023                            This hack is not just for fun, it allows
2024                            vic, vat and friends to work.
2025                            They bind a socket to loopback, set the ttl to zero
2026                            and expect that it will work.
2027                            From the viewpoint of the routing cache they are broken,
2028                            because we are not allowed to build a multicast path
2029                            with a loopback source addr (look, the routing cache
2030                            cannot know that the ttl is zero, so the packet
2031                            will never leave this host and the route is valid).
2032                            Luckily, this hack is a good workaround.
2033                          */
2034
2035                         fl4->flowi4_oif = dev_out->ifindex;
2036                         goto make_route;
2037                 }
2038
2039                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2040                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2041                         if (!__ip_dev_find(net, fl4->saddr, false))
2042                                 goto out;
2043                 }
2044         }
2045
2047         if (fl4->flowi4_oif) {
2048                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2049                 rth = ERR_PTR(-ENODEV);
2050                 if (dev_out == NULL)
2051                         goto out;
2052
2053                 /* RACE: Check return value of inet_select_addr instead. */
2054                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2055                         rth = ERR_PTR(-ENETUNREACH);
2056                         goto out;
2057                 }
2058                 if (ipv4_is_local_multicast(fl4->daddr) ||
2059                     ipv4_is_lbcast(fl4->daddr)) {
2060                         if (!fl4->saddr)
2061                                 fl4->saddr = inet_select_addr(dev_out, 0,
2062                                                               RT_SCOPE_LINK);
2063                         goto make_route;
2064                 }
2065                 if (!fl4->saddr) {
2066                         if (ipv4_is_multicast(fl4->daddr))
2067                                 fl4->saddr = inet_select_addr(dev_out, 0,
2068                                                               fl4->flowi4_scope);
2069                         else if (!fl4->daddr)
2070                                 fl4->saddr = inet_select_addr(dev_out, 0,
2071                                                               RT_SCOPE_HOST);
2072                 }
2073         }
2074
2075         if (!fl4->daddr) {
2076                 fl4->daddr = fl4->saddr;
2077                 if (!fl4->daddr)
2078                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2079                 dev_out = net->loopback_dev;
2080                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2081                 res.type = RTN_LOCAL;
2082                 flags |= RTCF_LOCAL;
2083                 goto make_route;
2084         }
2085
2086         if (fib_lookup(net, fl4, &res)) {
2087                 res.fi = NULL;
2088                 res.table = NULL;
2089                 if (fl4->flowi4_oif) {
2090                         /* Apparently, the routing tables are wrong. Assume
2091                            that the destination is on-link.
2092
2093                            WHY? DW.
2094                            Because we are allowed to send to an iface
2095                            even if it has NO routes and NO assigned
2096                            addresses. When oif is specified, the routing
2097                            tables are looked up with only one purpose:
2098                            to catch whether the destination is gatewayed, rather
2099                            than direct. Moreover, if MSG_DONTROUTE is set,
2100                            we send the packet, ignoring both the routing tables
2101                            and the ifaddr state. --ANK
2102
2103
2104                            We could do this even when oif is unknown,
2105                            as IPv6 likely does, but we do not.
2106                          */
2107
2108                         if (fl4->saddr == 0)
2109                                 fl4->saddr = inet_select_addr(dev_out, 0,
2110                                                               RT_SCOPE_LINK);
2111                         res.type = RTN_UNICAST;
2112                         goto make_route;
2113                 }
2114                 rth = ERR_PTR(-ENETUNREACH);
2115                 goto out;
2116         }
2117
2118         if (res.type == RTN_LOCAL) {
2119                 if (!fl4->saddr) {
2120                         if (res.fi->fib_prefsrc)
2121                                 fl4->saddr = res.fi->fib_prefsrc;
2122                         else
2123                                 fl4->saddr = fl4->daddr;
2124                 }
2125                 dev_out = net->loopback_dev;
2126                 fl4->flowi4_oif = dev_out->ifindex;
2127                 flags |= RTCF_LOCAL;
2128                 goto make_route;
2129         }
2130
2131 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2132         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2133                 fib_select_multipath(&res);
2134         else
2135 #endif
2136         if (!res.prefixlen &&
2137             res.table->tb_num_default > 1 &&
2138             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2139                 fib_select_default(&res);
2140
2141         if (!fl4->saddr)
2142                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2143
2144         dev_out = FIB_RES_DEV(res);
2145         fl4->flowi4_oif = dev_out->ifindex;
2146
2148 make_route:
2149         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2150
2151 out:
2152         rcu_read_unlock();
2153         return rth;
2154 }
2155 EXPORT_SYMBOL_GPL(__ip_route_output_key);
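
/*
 * Example: a minimal, hypothetical sketch (not used anywhere in this file)
 * of resolving an output route by hand.  Most callers use the
 * ip_route_output_key() or ip_route_output_flow() wrappers rather than
 * calling __ip_route_output_key() directly.  On success the resolver fills
 * in fl4.saddr and fl4.flowi4_oif, and the returned route is released with
 * ip_rt_put().
 */
static inline struct rtable *example_resolve_output(struct net *net,
                                                    __be32 daddr, __be32 saddr)
{
        struct flowi4 fl4;

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = daddr;      /* destination we want to reach */
        fl4.saddr = saddr;      /* may be 0; the resolver then picks one */

        return __ip_route_output_key(net, &fl4);
}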
2156
2157 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2158 {
2159         return NULL;
2160 }
2161
2162 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2163 {
2164         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2165
2166         return mtu ? : dst->dev->mtu;
2167 }
2168
2169 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2170                                           struct sk_buff *skb, u32 mtu)
2171 {
2172 }
2173
2174 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2175                                        struct sk_buff *skb)
2176 {
2177 }
2178
2179 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2180                                           unsigned long old)
2181 {
2182         return NULL;
2183 }
2184
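/* The blackhole dst_ops are deliberately inert: the dst never revalidates,
 * never learns PMTU or redirect information, and never COWs metrics, so a
 * copied route can be held indefinitely without being updated.
 */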
2185 static struct dst_ops ipv4_dst_blackhole_ops = {
2186         .family                 =       AF_INET,
2187         .protocol               =       cpu_to_be16(ETH_P_IP),
2188         .check                  =       ipv4_blackhole_dst_check,
2189         .mtu                    =       ipv4_blackhole_mtu,
2190         .default_advmss         =       ipv4_default_advmss,
2191         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2192         .redirect               =       ipv4_rt_blackhole_redirect,
2193         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2194         .neigh_lookup           =       ipv4_neigh_lookup,
2195 };
2196
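/* Clone a route into a standalone "blackhole" copy whose input/output
 * handlers simply discard packets.  This is reached via the xfrm
 * blackhole_route hook, e.g. while IPsec SAs are still being negotiated
 * and traffic must be swallowed rather than sent unprotected.
 */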
2197 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2198 {
2199         struct rtable *ort = (struct rtable *) dst_orig;
2200         struct rtable *rt;
2201
2202         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2203         if (rt) {
2204                 struct dst_entry *new = &rt->dst;
2205
2206                 new->__use = 1;
2207                 new->input = dst_discard;
2208                 new->output = dst_discard_sk;
2209
2210                 new->dev = ort->dst.dev;
2211                 if (new->dev)
2212                         dev_hold(new->dev);
2213
2214                 rt->rt_is_input = ort->rt_is_input;
2215                 rt->rt_iif = ort->rt_iif;
2216                 rt->rt_pmtu = ort->rt_pmtu;
2217
2218                 rt->rt_genid = rt_genid_ipv4(net);
2219                 rt->rt_flags = ort->rt_flags;
2220                 rt->rt_type = ort->rt_type;
2221                 rt->rt_gateway = ort->rt_gateway;
2222                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2223
2224                 INIT_LIST_HEAD(&rt->rt_uncached);
2225
2226                 dst_free(new);
2227         }
2228
2229         dst_release(dst_orig);
2230
2231         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2232 }
2233
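/* Resolve an output route and, when a protocol is given, pass the result
 * through xfrm_lookup() so that a matching IPsec policy can substitute a
 * transformed dst.
 */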
2234 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2235                                     struct sock *sk)
2236 {
2237         struct rtable *rt = __ip_route_output_key(net, flp4);
2238
2239         if (IS_ERR(rt))
2240                 return rt;
2241
2242         if (flp4->flowi4_proto)
2243                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2244                                                    flowi4_to_flowi(flp4),
2245                                                    sk, 0);
2246
2247         return rt;
2248 }
2249 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2250
2251 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2252                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2253                         u32 seq, int event, int nowait, unsigned int flags)
2254 {
2255         struct rtable *rt = skb_rtable(skb);
2256         struct rtmsg *r;
2257         struct nlmsghdr *nlh;
2258         unsigned long expires = 0;
2259         u32 error;
2260         u32 metrics[RTAX_MAX];
2261
2262         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2263         if (nlh == NULL)
2264                 return -EMSGSIZE;
2265
2266         r = nlmsg_data(nlh);
2267         r->rtm_family    = AF_INET;
2268         r->rtm_dst_len  = 32;
2269         r->rtm_src_len  = 0;
2270         r->rtm_tos      = fl4->flowi4_tos;
2271         r->rtm_table    = RT_TABLE_MAIN;
2272         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2273                 goto nla_put_failure;
2274         r->rtm_type     = rt->rt_type;
2275         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2276         r->rtm_protocol = RTPROT_UNSPEC;
2277         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2278         if (rt->rt_flags & RTCF_NOTIFY)
2279                 r->rtm_flags |= RTM_F_NOTIFY;
2280
2281         if (nla_put_be32(skb, RTA_DST, dst))
2282                 goto nla_put_failure;
2283         if (src) {
2284                 r->rtm_src_len = 32;
2285                 if (nla_put_be32(skb, RTA_SRC, src))
2286                         goto nla_put_failure;
2287         }
2288         if (rt->dst.dev &&
2289             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2290                 goto nla_put_failure;
2291 #ifdef CONFIG_IP_ROUTE_CLASSID
2292         if (rt->dst.tclassid &&
2293             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2294                 goto nla_put_failure;
2295 #endif
2296         if (!rt_is_input_route(rt) &&
2297             fl4->saddr != src) {
2298                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2299                         goto nla_put_failure;
2300         }
2301         if (rt->rt_uses_gateway &&
2302             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2303                 goto nla_put_failure;
2304
2305         expires = rt->dst.expires;
2306         if (expires) {
2307                 unsigned long now = jiffies;
2308
2309                 if (time_before(now, expires))
2310                         expires -= now;
2311                 else
2312                         expires = 0;
2313         }
2314
2315         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2316         if (rt->rt_pmtu && expires)
2317                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2318         if (rtnetlink_put_metrics(skb, metrics) < 0)
2319                 goto nla_put_failure;
2320
2321         if (fl4->flowi4_mark &&
2322             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2323                 goto nla_put_failure;
2324
2325         error = rt->dst.error;
2326
2327         if (rt_is_input_route(rt)) {
2328 #ifdef CONFIG_IP_MROUTE
2329                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2330                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2331                         int err = ipmr_get_route(net, skb,
2332                                                  fl4->saddr, fl4->daddr,
2333                                                  r, nowait);
2334                         if (err <= 0) {
2335                                 if (!nowait) {
2336                                         if (err == 0)
2337                                                 return 0;
2338                                         goto nla_put_failure;
2339                                 } else {
2340                                         if (err == -EMSGSIZE)
2341                                                 goto nla_put_failure;
2342                                         error = err;
2343                                 }
2344                         }
2345                 } else
2346 #endif
2347                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2348                                 goto nla_put_failure;
2349         }
2350
2351         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2352                 goto nla_put_failure;
2353
2354         return nlmsg_end(skb, nlh);
2355
2356 nla_put_failure:
2357         nlmsg_cancel(skb, nlh);
2358         return -EMSGSIZE;
2359 }
2360
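/* RTM_GETROUTE handler; this is what "ip route get <addr>" exercises from
 * userspace.  With RTA_IIF the request is answered by running the input
 * path on a dummy skb; otherwise the output path is consulted.
 */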
2361 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2362 {
2363         struct net *net = sock_net(in_skb->sk);
2364         struct rtmsg *rtm;
2365         struct nlattr *tb[RTA_MAX+1];
2366         struct rtable *rt = NULL;
2367         struct flowi4 fl4;
2368         __be32 dst = 0;
2369         __be32 src = 0;
2370         u32 iif;
2371         int err;
2372         int mark;
2373         struct sk_buff *skb;
2374
2375         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2376         if (err < 0)
2377                 goto errout;
2378
2379         rtm = nlmsg_data(nlh);
2380
2381         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2382         if (skb == NULL) {
2383                 err = -ENOBUFS;
2384                 goto errout;
2385         }
2386
2387         /* Reserve room for dummy headers; this skb can pass
2388            through a good chunk of the routing engine.
2389          */
2390         skb_reset_mac_header(skb);
2391         skb_reset_network_header(skb);
2392
2393         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2394         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2395         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2396
2397         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2398         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2399         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2400         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2401
2402         memset(&fl4, 0, sizeof(fl4));
2403         fl4.daddr = dst;
2404         fl4.saddr = src;
2405         fl4.flowi4_tos = rtm->rtm_tos;
2406         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2407         fl4.flowi4_mark = mark;
2408
2409         if (iif) {
2410                 struct net_device *dev;
2411
2412                 dev = __dev_get_by_index(net, iif);
2413                 if (dev == NULL) {
2414                         err = -ENODEV;
2415                         goto errout_free;
2416                 }
2417
2418                 skb->protocol   = htons(ETH_P_IP);
2419                 skb->dev        = dev;
2420                 skb->mark       = mark;
2421                 local_bh_disable();
2422                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2423                 local_bh_enable();
2424
2425                 rt = skb_rtable(skb);
2426                 if (err == 0 && rt->dst.error)
2427                         err = -rt->dst.error;
2428         } else {
2429                 rt = ip_route_output_key(net, &fl4);
2430
2431                 err = 0;
2432                 if (IS_ERR(rt))
2433                         err = PTR_ERR(rt);
2434         }
2435
2436         if (err)
2437                 goto errout_free;
2438
2439         skb_dst_set(skb, &rt->dst);
2440         if (rtm->rtm_flags & RTM_F_NOTIFY)
2441                 rt->rt_flags |= RTCF_NOTIFY;
2442
2443         err = rt_fill_info(net, dst, src, &fl4, skb,
2444                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2445                            RTM_NEWROUTE, 0, 0);
2446         if (err <= 0)
2447                 goto errout_free;
2448
2449         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2450 errout:
2451         return err;
2452
2453 errout_free:
2454         kfree_skb(skb);
2455         goto errout;
2456 }
2457
2458 void ip_rt_multicast_event(struct in_device *in_dev)
2459 {
2460         rt_cache_flush(dev_net(in_dev->dev));
2461 }
2462
2463 #ifdef CONFIG_SYSCTL
2464 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2465 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2466 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2467 static int ip_rt_gc_elasticity __read_mostly    = 8;
2468
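/* Writing any value to /proc/sys/net/ipv4/route/flush, e.g.
 * "echo 1 > /proc/sys/net/ipv4/route/flush", invalidates cached routes by
 * bumping the generation counters; the file cannot be read back.
 */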
2469 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2470                                         void __user *buffer,
2471                                         size_t *lenp, loff_t *ppos)
2472 {
2473         struct net *net = (struct net *)__ctl->extra1;
2474
2475         if (write) {
2476                 rt_cache_flush(net);
2477                 fnhe_genid_bump(net);
2478                 return 0;
2479         }
2480
2481         return -EINVAL;
2482 }
2483
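/* These knobs live under /proc/sys/net/ipv4/route/ (registered in
 * ip_static_sysctl_init() below) and can be tuned at runtime, e.g.
 * "sysctl -w net.ipv4.route.min_pmtu=552".
 */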
2484 static struct ctl_table ipv4_route_table[] = {
2485         {
2486                 .procname       = "gc_thresh",
2487                 .data           = &ipv4_dst_ops.gc_thresh,
2488                 .maxlen         = sizeof(int),
2489                 .mode           = 0644,
2490                 .proc_handler   = proc_dointvec,
2491         },
2492         {
2493                 .procname       = "max_size",
2494                 .data           = &ip_rt_max_size,
2495                 .maxlen         = sizeof(int),
2496                 .mode           = 0644,
2497                 .proc_handler   = proc_dointvec,
2498         },
2499         {
2500                 /*  Deprecated. Use gc_min_interval_ms */
2501
2502                 .procname       = "gc_min_interval",
2503                 .data           = &ip_rt_gc_min_interval,
2504                 .maxlen         = sizeof(int),
2505                 .mode           = 0644,
2506                 .proc_handler   = proc_dointvec_jiffies,
2507         },
2508         {
2509                 .procname       = "gc_min_interval_ms",
2510                 .data           = &ip_rt_gc_min_interval,
2511                 .maxlen         = sizeof(int),
2512                 .mode           = 0644,
2513                 .proc_handler   = proc_dointvec_ms_jiffies,
2514         },
2515         {
2516                 .procname       = "gc_timeout",
2517                 .data           = &ip_rt_gc_timeout,
2518                 .maxlen         = sizeof(int),
2519                 .mode           = 0644,
2520                 .proc_handler   = proc_dointvec_jiffies,
2521         },
2522         {
2523                 .procname       = "gc_interval",
2524                 .data           = &ip_rt_gc_interval,
2525                 .maxlen         = sizeof(int),
2526                 .mode           = 0644,
2527                 .proc_handler   = proc_dointvec_jiffies,
2528         },
2529         {
2530                 .procname       = "redirect_load",
2531                 .data           = &ip_rt_redirect_load,
2532                 .maxlen         = sizeof(int),
2533                 .mode           = 0644,
2534                 .proc_handler   = proc_dointvec,
2535         },
2536         {
2537                 .procname       = "redirect_number",
2538                 .data           = &ip_rt_redirect_number,
2539                 .maxlen         = sizeof(int),
2540                 .mode           = 0644,
2541                 .proc_handler   = proc_dointvec,
2542         },
2543         {
2544                 .procname       = "redirect_silence",
2545                 .data           = &ip_rt_redirect_silence,
2546                 .maxlen         = sizeof(int),
2547                 .mode           = 0644,
2548                 .proc_handler   = proc_dointvec,
2549         },
2550         {
2551                 .procname       = "error_cost",
2552                 .data           = &ip_rt_error_cost,
2553                 .maxlen         = sizeof(int),
2554                 .mode           = 0644,
2555                 .proc_handler   = proc_dointvec,
2556         },
2557         {
2558                 .procname       = "error_burst",
2559                 .data           = &ip_rt_error_burst,
2560                 .maxlen         = sizeof(int),
2561                 .mode           = 0644,
2562                 .proc_handler   = proc_dointvec,
2563         },
2564         {
2565                 .procname       = "gc_elasticity",
2566                 .data           = &ip_rt_gc_elasticity,
2567                 .maxlen         = sizeof(int),
2568                 .mode           = 0644,
2569                 .proc_handler   = proc_dointvec,
2570         },
2571         {
2572                 .procname       = "mtu_expires",
2573                 .data           = &ip_rt_mtu_expires,
2574                 .maxlen         = sizeof(int),
2575                 .mode           = 0644,
2576                 .proc_handler   = proc_dointvec_jiffies,
2577         },
2578         {
2579                 .procname       = "min_pmtu",
2580                 .data           = &ip_rt_min_pmtu,
2581                 .maxlen         = sizeof(int),
2582                 .mode           = 0644,
2583                 .proc_handler   = proc_dointvec,
2584         },
2585         {
2586                 .procname       = "min_adv_mss",
2587                 .data           = &ip_rt_min_advmss,
2588                 .maxlen         = sizeof(int),
2589                 .mode           = 0644,
2590                 .proc_handler   = proc_dointvec,
2591         },
2592         { }
2593 };
2594
2595 static struct ctl_table ipv4_route_flush_table[] = {
2596         {
2597                 .procname       = "flush",
2598                 .maxlen         = sizeof(int),
2599                 .mode           = 0200,
2600                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2601         },
2602         { },
2603 };
2604
2605 static __net_init int sysctl_route_net_init(struct net *net)
2606 {
2607         struct ctl_table *tbl;
2608
2609         tbl = ipv4_route_flush_table;
2610         if (!net_eq(net, &init_net)) {
2611                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2612                 if (tbl == NULL)
2613                         goto err_dup;
2614
2615                 /* Don't export sysctls to unprivileged users */
2616                 if (net->user_ns != &init_user_ns)
2617                         tbl[0].procname = NULL;
2618         }
2619         tbl[0].extra1 = net;
2620
2621         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2622         if (net->ipv4.route_hdr == NULL)
2623                 goto err_reg;
2624         return 0;
2625
2626 err_reg:
2627         if (tbl != ipv4_route_flush_table)
2628                 kfree(tbl);
2629 err_dup:
2630         return -ENOMEM;
2631 }
2632
2633 static __net_exit void sysctl_route_net_exit(struct net *net)
2634 {
2635         struct ctl_table *tbl;
2636
2637         tbl = net->ipv4.route_hdr->ctl_table_arg;
2638         unregister_net_sysctl_table(net->ipv4.route_hdr);
2639         BUG_ON(tbl == ipv4_route_flush_table);
2640         kfree(tbl);
2641 }
2642
2643 static __net_initdata struct pernet_operations sysctl_route_ops = {
2644         .init = sysctl_route_net_init,
2645         .exit = sysctl_route_net_exit,
2646 };
2647 #endif
2648
2649 static __net_init int rt_genid_init(struct net *net)
2650 {
2651         atomic_set(&net->ipv4.rt_genid, 0);
2652         atomic_set(&net->fnhe_genid, 0);
2653         get_random_bytes(&net->ipv4.dev_addr_genid,
2654                          sizeof(net->ipv4.dev_addr_genid));
2655         return 0;
2656 }
2657
2658 static __net_initdata struct pernet_operations rt_genid_ops = {
2659         .init = rt_genid_init,
2660 };
2661
2662 static int __net_init ipv4_inetpeer_init(struct net *net)
2663 {
2664         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2665
2666         if (!bp)
2667                 return -ENOMEM;
2668         inet_peer_base_init(bp);
2669         net->ipv4.peers = bp;
2670         return 0;
2671 }
2672
2673 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2674 {
2675         struct inet_peer_base *bp = net->ipv4.peers;
2676
2677         net->ipv4.peers = NULL;
2678         inetpeer_invalidate_tree(bp);
2679         kfree(bp);
2680 }
2681
2682 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2683         .init   =       ipv4_inetpeer_init,
2684         .exit   =       ipv4_inetpeer_exit,
2685 };
2686
2687 #ifdef CONFIG_IP_ROUTE_CLASSID
2688 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2689 #endif /* CONFIG_IP_ROUTE_CLASSID */
2690
2691 int __init ip_rt_init(void)
2692 {
2693         int rc = 0;
2694
2695         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2696         if (!ip_idents)
2697                 panic("IP: failed to allocate ip_idents\n");
2698
2699         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2700
2701 #ifdef CONFIG_IP_ROUTE_CLASSID
2702         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2703         if (!ip_rt_acct)
2704                 panic("IP: failed to allocate ip_rt_acct\n");
2705 #endif
2706
2707         ipv4_dst_ops.kmem_cachep =
2708                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2709                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2710
2711         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2712
2713         if (dst_entries_init(&ipv4_dst_ops) < 0)
2714                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2715
2716         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2717                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2718
2719         ipv4_dst_ops.gc_thresh = ~0;
2720         ip_rt_max_size = INT_MAX;
2721
2722         devinet_init();
2723         ip_fib_init();
2724
2725         if (ip_rt_proc_init())
2726                 pr_err("Unable to create route proc files\n");
2727 #ifdef CONFIG_XFRM
2728         xfrm_init();
2729         xfrm4_init();
2730 #endif
2731         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2732
2733 #ifdef CONFIG_SYSCTL
2734         register_pernet_subsys(&sysctl_route_ops);
2735 #endif
2736         register_pernet_subsys(&rt_genid_ops);
2737         register_pernet_subsys(&ipv4_inetpeer_ops);
2738         return rc;
2739 }
2740
2741 #ifdef CONFIG_SYSCTL
2742 /*
2743  * We really need to sanitize the damn ipv4 init order, then all
2744  * this nonsense will go away.
2745  */
2746 void __init ip_static_sysctl_init(void)
2747 {
2748         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2749 }
2750 #endif