net: core: Support UID-based routing.
net/ipv4/route.c (firefly-linux-kernel-4.4.55.git)
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

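/* Extract the route-lookup-relevant bits of ->flowi4_tos: the legacy
 * TOS bits covered by IPTOS_RT_MASK plus the RTO_ONLINK flag.
 */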
#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

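/* Resolve the neighbour (ARP) entry for this dst: key on the route's
 * gateway when one is set, otherwise on the packet's destination address.
 */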
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

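/* Build a flow key for route lookup. When a socket is supplied, its bound
 * device, mark, TOS and protocol take precedence over the per-packet values,
 * and the owning socket's UID is passed to flowi4_init_output() so that
 * UID-based routing rules can match on it.
 */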
static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_i_uid(sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0,
                           sock_i_uid(sk));
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

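/* Caller holds fnhe_lock. Pick the entry on this chain with the oldest
 * fnhe_stamp for reuse, dropping its cached routes first.
 */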
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

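/* Record PMTU/redirect state learned for daddr as an exception on this
 * nexthop. The hash table is allocated lazily; once a chain grows past
 * FNHE_RECLAIM_DEPTH the oldest entry is recycled instead of allocating.
 * Cached routes for the nexthop are marked obsolete so that subsequent
 * lookups revalidate against the new exception.
 */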
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

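/* For example, with the defaults above and assuming HZ=100, the load unit
 * is 20 ms and the silence window is roughly 20 s: successive redirects go
 * out with gaps of 40 ms, 80 ms, 160 ms, ...
 * (ip_rt_redirect_load << rate_tokens); after ip_rt_redirect_number of them
 * we go quiet for that peer until no redirect-worthy packet has been seen
 * for ip_rt_redirect_silence.
 */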
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

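/* Input path error handler: send an ICMP destination unreachable matching
 * rt->dst.error, rate limited per source address by a token bucket on the
 * inetpeer entry (bucket depth ip_rt_error_burst, ip_rt_error_cost per
 * message). The skb is always consumed.
 */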
static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

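/* Record a freshly learned path MTU. Locked MTU metrics are respected, the
 * value is clamped to at least ip_rt_min_pmtu, and the result is stored as
 * a nexthop exception that expires ip_rt_mtu_expires from now.
 */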
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW, remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

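/* Effective MTU for this route: prefer a still-valid learned PMTU, then the
 * RTAX_MTU metric, then the device MTU (clamped to 576 for locked-metric
 * routes via a gateway), never exceeding IP_MAX_MTU.
 */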
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

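/* Attach a nexthop exception's state to a new route and, unless the route
 * is marked DST_NOCACHE, cache the route in the exception itself. A genid
 * mismatch means the exception predates the last flush, so its learned
 * state is reset first. Returns true if the route was cached.
 */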
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

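/* Try to park this route in the nexthop's cache slot (the shared input slot
 * or this CPU's output slot) with a lockless cmpxchg; the previous occupant
 * is freed via RCU. Returns false if another CPU raced in first.
 */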
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

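/* Fill in the nexthop-derived fields of a new route (gateway, metrics,
 * classid, lightweight tunnel state) and try to cache it, either in the
 * matching exception or in the nexthop itself; on failure the route is
 * marked DST_NOCACHE and tracked on the uncached list.
 */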
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

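/* Allocate and minimally initialise a struct rtable; will_cache decides
 * whether the DST_HOST|DST_NOCACHE flags are set, and RTCF_LOCAL routes
 * get the local-delivery input handler.
 */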
static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   unsigned int flags, u16 type,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                       (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                       (nopolicy ? DST_NOPOLICY : 0) |
                       (noxfrm ? DST_NOXFRM : 0));

        if (rt) {
                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
                rt->rt_flags = flags;
                rt->rt_type = type;
                rt->rt_is_input = 0;
                rt->rt_iif = 0;
                rt->rt_pmtu = 0;
                rt->rt_gateway = 0;
                rt->rt_uses_gateway = 0;
                rt->rt_table_id = 0;
                INIT_LIST_HEAD(&rt->rt_uncached);

                rt->dst.output = ip_output;
                if (flags & RTCF_LOCAL)
                        rt->dst.input = ip_local_deliver;
        }

        return rt;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        unsigned int flags = RTCF_MULTICAST;
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (!in_dev)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
                goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        if (our)
                flags |= RTCF_LOCAL;

        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;
        rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC1812 recommendation: if the source is martian,
                 *      the only hint is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

1561 /* called in rcu_read_lock() section */
1562 static int __mkroute_input(struct sk_buff *skb,
1563                            const struct fib_result *res,
1564                            struct in_device *in_dev,
1565                            __be32 daddr, __be32 saddr, u32 tos)
1566 {
1567         struct fib_nh_exception *fnhe;
1568         struct rtable *rth;
1569         int err;
1570         struct in_device *out_dev;
1571         bool do_cache;
1572         u32 itag = 0;
1573
1574         /* get a working reference to the output device */
1575         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1576         if (!out_dev) {
1577                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1578                 return -EINVAL;
1579         }
1580
1581         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1582                                   in_dev->dev, in_dev, &itag);
1583         if (err < 0) {
1584                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1585                                          saddr);
1586
1587                 goto cleanup;
1588         }
1589
1590         do_cache = res->fi && !itag;
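        /* The packet is being forwarded back out the interface it arrived on;
         * flag it so ip_forward() can generate an ICMP redirect if allowed.
         */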
1591         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1592             skb->protocol == htons(ETH_P_IP) &&
1593             (IN_DEV_SHARED_MEDIA(out_dev) ||
1594              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1595                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1596
1597         if (skb->protocol != htons(ETH_P_IP)) {
1598                 /* Not IP (i.e. ARP). Do not create a route if it is
1599                  * invalid for proxy ARP. DNAT routes are always valid.
1600                  *
1601                  * The proxy ARP feature has been extended to allow ARP
1602                  * replies back out of the same interface, to support
1603                  * Private VLAN switch technologies. See arp.c.
1604                  */
1605                 if (out_dev == in_dev &&
1606                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1607                         err = -EINVAL;
1608                         goto cleanup;
1609                 }
1610         }
1611
1612         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1613         if (do_cache) {
1614                 if (fnhe)
1615                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1616                 else
1617                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1618
1619                 if (rt_cache_valid(rth)) {
1620                         skb_dst_set_noref(skb, &rth->dst);
1621                         goto out;
1622                 }
1623         }
1624
1625         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1626                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1627                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1628         if (!rth) {
1629                 err = -ENOBUFS;
1630                 goto cleanup;
1631         }
1632
1633         rth->rt_is_input = 1;
1634         if (res->table)
1635                 rth->rt_table_id = res->table->tb_id;
1636         RT_CACHE_STAT_INC(in_slow_tot);
1637
1638         rth->dst.input = ip_forward;
1639
1640         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1641         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1642                 rth->dst.lwtstate->orig_output = rth->dst.output;
1643                 rth->dst.output = lwtunnel_output;
1644         }
1645         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1646                 rth->dst.lwtstate->orig_input = rth->dst.input;
1647                 rth->dst.input = lwtunnel_input;
1648         }
1649         skb_dst_set(skb, &rth->dst);
1650 out:
1651         err = 0;
1652  cleanup:
1653         return err;
1654 }
1655
1656 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1657
1658 /* To make ICMP error packets follow the right flow, the multipath hash is
1659  * calculated from the quoted inner IP addresses, in reverse order.
1660  */
1661 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1662 {
1663         const struct iphdr *outer_iph = ip_hdr(skb);
1664         struct icmphdr _icmph;
1665         const struct icmphdr *icmph;
1666         struct iphdr _inner_iph;
1667         const struct iphdr *inner_iph;
1668
1669         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1670                 goto standard_hash;
1671
1672         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1673                                    &_icmph);
1674         if (!icmph)
1675                 goto standard_hash;
1676
1677         if (icmph->type != ICMP_DEST_UNREACH &&
1678             icmph->type != ICMP_REDIRECT &&
1679             icmph->type != ICMP_TIME_EXCEEDED &&
1680             icmph->type != ICMP_PARAMETERPROB) {
1681                 goto standard_hash;
1682         }
1683
1684         inner_iph = skb_header_pointer(skb,
1685                                        outer_iph->ihl * 4 + sizeof(_icmph),
1686                                        sizeof(_inner_iph), &_inner_iph);
1687         if (!inner_iph)
1688                 goto standard_hash;
1689
1690         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1691
1692 standard_hash:
1693         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1694 }
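
/* Worked example (illustrative, not in the original source): a flow
 * A -> B hashes here as fib_multipath_hash(A, B).  An ICMP error about
 * that flow quotes the offending header, so the inner saddr/daddr are
 * A/B, and hashing them in reverse order gives fib_multipath_hash(B, A):
 * exactly what plain B -> A traffic gets, so the error rides the same
 * multipath leg as the reverse direction of the flow it reports on.
 */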
1695
1696 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1697
1698 static int ip_mkroute_input(struct sk_buff *skb,
1699                             struct fib_result *res,
1700                             const struct flowi4 *fl4,
1701                             struct in_device *in_dev,
1702                             __be32 daddr, __be32 saddr, u32 tos)
1703 {
1704 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1705         if (res->fi && res->fi->fib_nhs > 1) {
1706                 int h;
1707
1708                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1709                         h = ip_multipath_icmp_hash(skb);
1710                 else
1711                         h = fib_multipath_hash(saddr, daddr);
1712                 fib_select_multipath(res, h);
1713         }
1714 #endif
1715
1716         /* create a routing cache entry */
1717         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1718 }
1719
1720 /*
1721  *      NOTE. We drop all packets that have a local source
1722  *      address, because every properly looped-back packet must
1723  *      already have the correct destination attached by the output routine.
1724  *
1725  *      This approach solves two big problems:
1726  *      1. Non-simplex devices are handled properly.
1727  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1728  *      Called with rcu_read_lock().
1729  */
1730
1731 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1732                                u8 tos, struct net_device *dev)
1733 {
1734         struct fib_result res;
1735         struct in_device *in_dev = __in_dev_get_rcu(dev);
1736         struct ip_tunnel_info *tun_info;
1737         struct flowi4   fl4;
1738         unsigned int    flags = 0;
1739         u32             itag = 0;
1740         struct rtable   *rth;
1741         int             err = -EINVAL;
1742         struct net    *net = dev_net(dev);
1743         bool do_cache;
1744
1745         /* IP on this device is disabled. */
1746
1747         if (!in_dev)
1748                 goto out;
1749
1750         /* Check for the most weird martians, which cannot be detected
1751            by fib_lookup.
1752          */
1753
1754         tun_info = skb_tunnel_info(skb);
1755         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1756                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1757         else
1758                 fl4.flowi4_tun_key.tun_id = 0;
1759         skb_dst_drop(skb);
1760
1761         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1762                 goto martian_source;
1763
1764         res.fi = NULL;
1765         res.table = NULL;
1766         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1767                 goto brd_input;
1768
1769         /* Accept zero addresses only for the limited broadcast;
1770          * I do not even know whether to fix this or not. Waiting for complaints :-)
1771          */
1772         if (ipv4_is_zeronet(saddr))
1773                 goto martian_source;
1774
1775         if (ipv4_is_zeronet(daddr))
1776                 goto martian_destination;
1777
1778         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1779          * and calls it at most once, when daddr and/or saddr is a loopback address.
1780          */
1781         if (ipv4_is_loopback(daddr)) {
1782                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1783                         goto martian_destination;
1784         } else if (ipv4_is_loopback(saddr)) {
1785                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1786                         goto martian_source;
1787         }
1788
1789         /*
1790          *      Now we are ready to route the packet.
1791          */
1792         fl4.flowi4_oif = 0;
1793         fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
1794         fl4.flowi4_mark = skb->mark;
1795         fl4.flowi4_tos = tos;
1796         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1797         fl4.flowi4_flags = 0;
1798         fl4.daddr = daddr;
1799         fl4.saddr = saddr;
1800         err = fib_lookup(net, &fl4, &res, 0);
1801         if (err != 0) {
1802                 if (!IN_DEV_FORWARD(in_dev))
1803                         err = -EHOSTUNREACH;
1804                 goto no_route;
1805         }
1806
1807         if (res.type == RTN_BROADCAST)
1808                 goto brd_input;
1809
1810         if (res.type == RTN_LOCAL) {
1811                 err = fib_validate_source(skb, saddr, daddr, tos,
1812                                           0, dev, in_dev, &itag);
1813                 if (err < 0)
1814                         goto martian_source;
1815                 goto local_input;
1816         }
1817
1818         if (!IN_DEV_FORWARD(in_dev)) {
1819                 err = -EHOSTUNREACH;
1820                 goto no_route;
1821         }
1822         if (res.type != RTN_UNICAST)
1823                 goto martian_destination;
1824
1825         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1826 out:    return err;
1827
1828 brd_input:
1829         if (skb->protocol != htons(ETH_P_IP))
1830                 goto e_inval;
1831
1832         if (!ipv4_is_zeronet(saddr)) {
1833                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1834                                           in_dev, &itag);
1835                 if (err < 0)
1836                         goto martian_source;
1837         }
1838         flags |= RTCF_BROADCAST;
1839         res.type = RTN_BROADCAST;
1840         RT_CACHE_STAT_INC(in_brd);
1841
1842 local_input:
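        /* Deliver locally: reuse the per-nexthop cached input route when it
         * is still valid, otherwise allocate one bound to the loopback device.
         */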
1843         do_cache = false;
1844         if (res.fi) {
1845                 if (!itag) {
1846                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1847                         if (rt_cache_valid(rth)) {
1848                                 skb_dst_set_noref(skb, &rth->dst);
1849                                 err = 0;
1850                                 goto out;
1851                         }
1852                         do_cache = true;
1853                 }
1854         }
1855
1856         rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1857                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1858         if (!rth)
1859                 goto e_nobufs;
1860
1861         rth->dst.output = ip_rt_bug;
1862 #ifdef CONFIG_IP_ROUTE_CLASSID
1863         rth->dst.tclassid = itag;
1864 #endif
1865         rth->rt_is_input = 1;
1866         if (res.table)
1867                 rth->rt_table_id = res.table->tb_id;
1868
1869         RT_CACHE_STAT_INC(in_slow_tot);
1870         if (res.type == RTN_UNREACHABLE) {
1871                 rth->dst.input = ip_error;
1872                 rth->dst.error = -err;
1873                 rth->rt_flags   &= ~RTCF_LOCAL;
1874         }
1875         if (do_cache) {
1876                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1877                         rth->dst.flags |= DST_NOCACHE;
1878                         rt_add_uncached_list(rth);
1879                 }
1880         }
1881         skb_dst_set(skb, &rth->dst);
1882         err = 0;
1883         goto out;
1884
1885 no_route:
1886         RT_CACHE_STAT_INC(in_no_route);
1887         res.type = RTN_UNREACHABLE;
1888         res.fi = NULL;
1889         res.table = NULL;
1890         goto local_input;
1891
1892         /*
1893          *      Do not cache martian addresses: they should be logged (RFC1812)
1894          */
1895 martian_destination:
1896         RT_CACHE_STAT_INC(in_martian_dst);
1897 #ifdef CONFIG_IP_ROUTE_VERBOSE
1898         if (IN_DEV_LOG_MARTIANS(in_dev))
1899                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1900                                      &daddr, &saddr, dev->name);
1901 #endif
1902
1903 e_inval:
1904         err = -EINVAL;
1905         goto out;
1906
1907 e_nobufs:
1908         err = -ENOBUFS;
1909         goto out;
1910
1911 martian_source:
1912         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1913         goto out;
1914 }
1915
1916 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1917                          u8 tos, struct net_device *dev)
1918 {
1919         int res;
1920
1921         rcu_read_lock();
1922
1923         /* Multicast recognition logic was moved from the route cache to here.
1924            The problem was that too many Ethernet cards have broken/missing
1925            hardware multicast filters :-( As a result, a host on a multicast
1926            network acquires a lot of useless route cache entries, e.g. for
1927            SDR messages from all over the world. Now we try to get rid of them.
1928            Really, provided the software IP multicast filter is organized
1929            reasonably (at least, hashed), this does not result in a slowdown
1930            compared with route cache reject entries.
1931            Note that multicast routers are not affected, because a
1932            route cache entry is created eventually.
1933          */
1934         if (ipv4_is_multicast(daddr)) {
1935                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1936
1937                 if (in_dev) {
1938                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1939                                                   ip_hdr(skb)->protocol);
1940                         if (our
1941 #ifdef CONFIG_IP_MROUTE
1942                                 ||
1943                             (!ipv4_is_local_multicast(daddr) &&
1944                              IN_DEV_MFORWARD(in_dev))
1945 #endif
1946                            ) {
1947                                 int res = ip_route_input_mc(skb, daddr, saddr,
1948                                                             tos, dev, our);
1949                                 rcu_read_unlock();
1950                                 return res;
1951                         }
1952                 }
1953                 rcu_read_unlock();
1954                 return -EINVAL;
1955         }
1956         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1957         rcu_read_unlock();
1958         return res;
1959 }
1960 EXPORT_SYMBOL(ip_route_input_noref);
1961
1962 /* called with rcu_read_lock() */
1963 static struct rtable *__mkroute_output(const struct fib_result *res,
1964                                        const struct flowi4 *fl4, int orig_oif,
1965                                        struct net_device *dev_out,
1966                                        unsigned int flags)
1967 {
1968         struct fib_info *fi = res->fi;
1969         struct fib_nh_exception *fnhe;
1970         struct in_device *in_dev;
1971         u16 type = res->type;
1972         struct rtable *rth;
1973         bool do_cache;
1974
1975         in_dev = __in_dev_get_rcu(dev_out);
1976         if (!in_dev)
1977                 return ERR_PTR(-EINVAL);
1978
1979         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1980                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1981                         return ERR_PTR(-EINVAL);
1982
1983         if (ipv4_is_lbcast(fl4->daddr))
1984                 type = RTN_BROADCAST;
1985         else if (ipv4_is_multicast(fl4->daddr))
1986                 type = RTN_MULTICAST;
1987         else if (ipv4_is_zeronet(fl4->daddr))
1988                 return ERR_PTR(-EINVAL);
1989
1990         if (dev_out->flags & IFF_LOOPBACK)
1991                 flags |= RTCF_LOCAL;
1992
1993         do_cache = true;
1994         if (type == RTN_BROADCAST) {
1995                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1996                 fi = NULL;
1997         } else if (type == RTN_MULTICAST) {
1998                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1999                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2000                                      fl4->flowi4_proto))
2001                         flags &= ~RTCF_LOCAL;
2002                 else
2003                         do_cache = false;
2004                 /* If a multicast route does not exist, use
2005                  * the default one, but do not use the gateway in this case.
2006                  * Yes, it is a hack.
2007                  */
2008                 if (fi && res->prefixlen < 4)
2009                         fi = NULL;
2010         }
2011
2012         fnhe = NULL;
2013         do_cache &= fi != NULL;
2014         if (do_cache) {
2015                 struct rtable __rcu **prth;
2016                 struct fib_nh *nh = &FIB_RES_NH(*res);
2017
2018                 fnhe = find_exception(nh, fl4->daddr);
2019                 if (fnhe)
2020                         prth = &fnhe->fnhe_rth_output;
2021                 else {
2022                         if (unlikely(fl4->flowi4_flags &
2023                                      FLOWI_FLAG_KNOWN_NH &&
2024                                      !(nh->nh_gw &&
2025                                        nh->nh_scope == RT_SCOPE_LINK))) {
2026                                 do_cache = false;
2027                                 goto add;
2028                         }
2029                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2030                 }
2031                 rth = rcu_dereference(*prth);
2032                 if (rt_cache_valid(rth)) {
2033                         dst_hold(&rth->dst);
2034                         return rth;
2035                 }
2036         }
2037
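/* Cache miss, caching disabled, or FLOWI_FLAG_KNOWN_NH in effect:
 * build a fresh route below.
 */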
2038 add:
2039         rth = rt_dst_alloc(dev_out, flags, type,
2040                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2041                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2042                            do_cache);
2043         if (!rth)
2044                 return ERR_PTR(-ENOBUFS);
2045
2046         rth->rt_iif     = orig_oif ? : 0;
2047         if (res->table)
2048                 rth->rt_table_id = res->table->tb_id;
2049
2050         RT_CACHE_STAT_INC(out_slow_tot);
2051
2052         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2053                 if (flags & RTCF_LOCAL &&
2054                     !(dev_out->flags & IFF_LOOPBACK)) {
2055                         rth->dst.output = ip_mc_output;
2056                         RT_CACHE_STAT_INC(out_slow_mc);
2057                 }
2058 #ifdef CONFIG_IP_MROUTE
2059                 if (type == RTN_MULTICAST) {
2060                         if (IN_DEV_MFORWARD(in_dev) &&
2061                             !ipv4_is_local_multicast(fl4->daddr)) {
2062                                 rth->dst.input = ip_mr_input;
2063                                 rth->dst.output = ip_mc_output;
2064                         }
2065                 }
2066 #endif
2067         }
2068
2069         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2070         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2071                 rth->dst.output = lwtunnel_output;
2072
2073         return rth;
2074 }
2075
2076 /*
2077  * Major route resolver routine.
2078  */
2079
2080 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2081                                           int mp_hash)
2082 {
2083         struct net_device *dev_out = NULL;
2084         __u8 tos = RT_FL_TOS(fl4);
2085         unsigned int flags = 0;
2086         struct fib_result res;
2087         struct rtable *rth;
2088         int orig_oif;
2089         int err = -ENETUNREACH;
2090
2091         res.tclassid    = 0;
2092         res.fi          = NULL;
2093         res.table       = NULL;
2094
2095         orig_oif = fl4->flowi4_oif;
2096
2097         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2098         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2099         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2100                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
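        /* RTO_ONLINK in the tos asks for a link-scope lookup, i.e. the
         * destination is to be treated as directly reachable.
         */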
2101
2102         rcu_read_lock();
2103         if (fl4->saddr) {
2104                 rth = ERR_PTR(-EINVAL);
2105                 if (ipv4_is_multicast(fl4->saddr) ||
2106                     ipv4_is_lbcast(fl4->saddr) ||
2107                     ipv4_is_zeronet(fl4->saddr))
2108                         goto out;
2109
2110                 /* I removed the check for oif == dev_out->oif here.
2111                    It was wrong for two reasons:
2112                    1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2113                       is assigned to multiple interfaces.
2114                    2. Moreover, we are allowed to send packets with the saddr
2115                       of another iface. --ANK
2116                  */
2117
2118                 if (fl4->flowi4_oif == 0 &&
2119                     (ipv4_is_multicast(fl4->daddr) ||
2120                      ipv4_is_lbcast(fl4->daddr))) {
2121                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2122                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2123                         if (!dev_out)
2124                                 goto out;
2125
2126                         /* Special hack: the user can direct multicasts
2127                            and limited broadcast via the necessary interface
2128                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2129                            This hack is not just for fun, it allows
2130                            vic, vat and friends to work.
2131                            They bind a socket to loopback, set the ttl to zero
2132                            and expect that it will work.
2133                            From the viewpoint of the routing cache they are broken,
2134                            because we are not allowed to build a multicast path
2135                            with a loopback source addr (look, the routing cache
2136                            cannot know that the ttl is zero, so the packet
2137                            will not leave this host and the route is valid).
2138                            Luckily, this hack is a good workaround.
2139                          */
2140
2141                         fl4->flowi4_oif = dev_out->ifindex;
2142                         goto make_route;
2143                 }
2144
2145                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2146                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2147                         if (!__ip_dev_find(net, fl4->saddr, false))
2148                                 goto out;
2149                 }
2150         }
2151
2152
2153         if (fl4->flowi4_oif) {
2154                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2155                 rth = ERR_PTR(-ENODEV);
2156                 if (!dev_out)
2157                         goto out;
2158
2159                 /* RACE: Check return value of inet_select_addr instead. */
2160                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2161                         rth = ERR_PTR(-ENETUNREACH);
2162                         goto out;
2163                 }
2164                 if (ipv4_is_local_multicast(fl4->daddr) ||
2165                     ipv4_is_lbcast(fl4->daddr) ||
2166                     fl4->flowi4_proto == IPPROTO_IGMP) {
2167                         if (!fl4->saddr)
2168                                 fl4->saddr = inet_select_addr(dev_out, 0,
2169                                                               RT_SCOPE_LINK);
2170                         goto make_route;
2171                 }
2172                 if (!fl4->saddr) {
2173                         if (ipv4_is_multicast(fl4->daddr))
2174                                 fl4->saddr = inet_select_addr(dev_out, 0,
2175                                                               fl4->flowi4_scope);
2176                         else if (!fl4->daddr)
2177                                 fl4->saddr = inet_select_addr(dev_out, 0,
2178                                                               RT_SCOPE_HOST);
2179                 }
2180
2181                 rth = l3mdev_get_rtable(dev_out, fl4);
2182                 if (rth)
2183                         goto out;
2184         }
2185
2186         if (!fl4->daddr) {
2187                 fl4->daddr = fl4->saddr;
2188                 if (!fl4->daddr)
2189                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2190                 dev_out = net->loopback_dev;
2191                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2192                 res.type = RTN_LOCAL;
2193                 flags |= RTCF_LOCAL;
2194                 goto make_route;
2195         }
2196
2197         err = fib_lookup(net, fl4, &res, 0);
2198         if (err) {
2199                 res.fi = NULL;
2200                 res.table = NULL;
2201                 if (fl4->flowi4_oif &&
2202                     !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2203                         /* Apparently, the routing tables are wrong. Assume
2204                            that the destination is on-link.
2205
2206                            WHY? DW.
2207                            Because we are allowed to send to an iface
2208                            even if it has NO routes and NO assigned
2209                            addresses. When oif is specified, the routing
2210                            tables are looked up with only one purpose:
2211                            to catch whether the destination is gatewayed rather than
2212                            direct. Moreover, if MSG_DONTROUTE is set,
2213                            we send the packet, ignoring both routing tables
2214                            and ifaddr state. --ANK
2215
2216
2217                            We could do this even when oif is unknown
2218                            (IPv6 likely does), but we do not.
2219                          */
2220
2221                         if (fl4->saddr == 0)
2222                                 fl4->saddr = inet_select_addr(dev_out, 0,
2223                                                               RT_SCOPE_LINK);
2224                         res.type = RTN_UNICAST;
2225                         goto make_route;
2226                 }
2227                 rth = ERR_PTR(err);
2228                 goto out;
2229         }
2230
2231         if (res.type == RTN_LOCAL) {
2232                 if (!fl4->saddr) {
2233                         if (res.fi->fib_prefsrc)
2234                                 fl4->saddr = res.fi->fib_prefsrc;
2235                         else
2236                                 fl4->saddr = fl4->daddr;
2237                 }
2238                 dev_out = net->loopback_dev;
2239                 fl4->flowi4_oif = dev_out->ifindex;
2240                 flags |= RTCF_LOCAL;
2241                 goto make_route;
2242         }
2243
2244         fib_select_path(net, &res, fl4, mp_hash);
2245
2246         dev_out = FIB_RES_DEV(res);
2247         fl4->flowi4_oif = dev_out->ifindex;
2248
2249
2250 make_route:
2251         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2252
2253 out:
2254         rcu_read_unlock();
2255         return rth;
2256 }
2257 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2258
2259 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2260 {
2261         return NULL;
2262 }
2263
2264 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2265 {
2266         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2267
2268         return mtu ? : dst->dev->mtu;
2269 }
2270
2271 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2272                                           struct sk_buff *skb, u32 mtu)
2273 {
2274 }
2275
2276 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2277                                        struct sk_buff *skb)
2278 {
2279 }
2280
2281 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2282                                           unsigned long old)
2283 {
2284         return NULL;
2285 }
2286
2287 static struct dst_ops ipv4_dst_blackhole_ops = {
2288         .family                 =       AF_INET,
2289         .check                  =       ipv4_blackhole_dst_check,
2290         .mtu                    =       ipv4_blackhole_mtu,
2291         .default_advmss         =       ipv4_default_advmss,
2292         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2293         .redirect               =       ipv4_rt_blackhole_redirect,
2294         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2295         .neigh_lookup           =       ipv4_neigh_lookup,
2296 };
2297
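/* Blackhole routes accept and silently discard everything; they are
 * substituted (e.g. by the xfrm code) when a dst is needed but traffic
 * must not actually flow.
 */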
2298 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2299 {
2300         struct rtable *ort = (struct rtable *) dst_orig;
2301         struct rtable *rt;
2302
2303         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2304         if (rt) {
2305                 struct dst_entry *new = &rt->dst;
2306
2307                 new->__use = 1;
2308                 new->input = dst_discard;
2309                 new->output = dst_discard_out;
2310
2311                 new->dev = ort->dst.dev;
2312                 if (new->dev)
2313                         dev_hold(new->dev);
2314
2315                 rt->rt_is_input = ort->rt_is_input;
2316                 rt->rt_iif = ort->rt_iif;
2317                 rt->rt_pmtu = ort->rt_pmtu;
2318
2319                 rt->rt_genid = rt_genid_ipv4(net);
2320                 rt->rt_flags = ort->rt_flags;
2321                 rt->rt_type = ort->rt_type;
2322                 rt->rt_gateway = ort->rt_gateway;
2323                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2324
2325                 INIT_LIST_HEAD(&rt->rt_uncached);
2326                 dst_free(new);
2327         }
2328
2329         dst_release(dst_orig);
2330
2331         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2332 }
2333
2334 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2335                                     const struct sock *sk)
2336 {
2337         struct rtable *rt = __ip_route_output_key(net, flp4);
2338
2339         if (IS_ERR(rt))
2340                 return rt;
2341
2342         if (flp4->flowi4_proto)
2343                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2344                                                         flowi4_to_flowi(flp4),
2345                                                         sk, 0);
2346
2347         return rt;
2348 }
2349 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2350
2351 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2352                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2353                         u32 seq, int event, int nowait, unsigned int flags)
2354 {
2355         struct rtable *rt = skb_rtable(skb);
2356         struct rtmsg *r;
2357         struct nlmsghdr *nlh;
2358         unsigned long expires = 0;
2359         u32 error;
2360         u32 metrics[RTAX_MAX];
2361
2362         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2363         if (!nlh)
2364                 return -EMSGSIZE;
2365
2366         r = nlmsg_data(nlh);
2367         r->rtm_family    = AF_INET;
2368         r->rtm_dst_len  = 32;
2369         r->rtm_src_len  = 0;
2370         r->rtm_tos      = fl4->flowi4_tos;
2371         r->rtm_table    = table_id;
2372         if (nla_put_u32(skb, RTA_TABLE, table_id))
2373                 goto nla_put_failure;
2374         r->rtm_type     = rt->rt_type;
2375         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2376         r->rtm_protocol = RTPROT_UNSPEC;
2377         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2378         if (rt->rt_flags & RTCF_NOTIFY)
2379                 r->rtm_flags |= RTM_F_NOTIFY;
2380         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2381                 r->rtm_flags |= RTCF_DOREDIRECT;
2382
2383         if (nla_put_in_addr(skb, RTA_DST, dst))
2384                 goto nla_put_failure;
2385         if (src) {
2386                 r->rtm_src_len = 32;
2387                 if (nla_put_in_addr(skb, RTA_SRC, src))
2388                         goto nla_put_failure;
2389         }
2390         if (rt->dst.dev &&
2391             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2392                 goto nla_put_failure;
2393 #ifdef CONFIG_IP_ROUTE_CLASSID
2394         if (rt->dst.tclassid &&
2395             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2396                 goto nla_put_failure;
2397 #endif
2398         if (!rt_is_input_route(rt) &&
2399             fl4->saddr != src) {
2400                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2401                         goto nla_put_failure;
2402         }
2403         if (rt->rt_uses_gateway &&
2404             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2405                 goto nla_put_failure;
2406
2407         expires = rt->dst.expires;
2408         if (expires) {
2409                 unsigned long now = jiffies;
2410
2411                 if (time_before(now, expires))
2412                         expires -= now;
2413                 else
2414                         expires = 0;
2415         }
2416
2417         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2418         if (rt->rt_pmtu && expires)
2419                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2420         if (rtnetlink_put_metrics(skb, metrics) < 0)
2421                 goto nla_put_failure;
2422
2423         if (fl4->flowi4_mark &&
2424             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2425                 goto nla_put_failure;
2426
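        /* Report the UID the route lookup was performed for, if any. */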
2427         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2428             nla_put_u32(skb, RTA_UID,
2429                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2430                 goto nla_put_failure;
2431
2432         error = rt->dst.error;
2433
2434         if (rt_is_input_route(rt)) {
2435 #ifdef CONFIG_IP_MROUTE
2436                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2437                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2438                         int err = ipmr_get_route(net, skb,
2439                                                  fl4->saddr, fl4->daddr,
2440                                                  r, nowait);
2441                         if (err <= 0) {
2442                                 if (!nowait) {
2443                                         if (err == 0)
2444                                                 return 0;
2445                                         goto nla_put_failure;
2446                                 } else {
2447                                         if (err == -EMSGSIZE)
2448                                                 goto nla_put_failure;
2449                                         error = err;
2450                                 }
2451                         }
2452                 } else
2453 #endif
2454                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2455                                 goto nla_put_failure;
2456         }
2457
2458         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2459                 goto nla_put_failure;
2460
2461         nlmsg_end(skb, nlh);
2462         return 0;
2463
2464 nla_put_failure:
2465         nlmsg_cancel(skb, nlh);
2466         return -EMSGSIZE;
2467 }
2468
2469 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2470 {
2471         struct net *net = sock_net(in_skb->sk);
2472         struct rtmsg *rtm;
2473         struct nlattr *tb[RTA_MAX+1];
2474         struct rtable *rt = NULL;
2475         struct flowi4 fl4;
2476         __be32 dst = 0;
2477         __be32 src = 0;
2478         u32 iif;
2479         int err;
2480         int mark;
2481         struct sk_buff *skb;
2482         u32 table_id = RT_TABLE_MAIN;
2483         kuid_t uid;
2484
2485         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2486         if (err < 0)
2487                 goto errout;
2488
2489         rtm = nlmsg_data(nlh);
2490
2491         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2492         if (!skb) {
2493                 err = -ENOBUFS;
2494                 goto errout;
2495         }
2496
2497         /* Reserve room for dummy headers; this skb can pass
2498            through a good chunk of the routing engine.
2499          */
2500         skb_reset_mac_header(skb);
2501         skb_reset_network_header(skb);
2502
2503         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2504         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2505         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2506
2507         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2508         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2509         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2510         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
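        /* UID-based routing: honour a UID supplied via RTA_UID; otherwise
         * attribute output lookups to the requesting task, while input
         * lookups (iif set) carry no UID.
         */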
2511         if (tb[RTA_UID])
2512                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2513         else
2514                 uid = (iif ? INVALID_UID : current_uid());
2515
2516         memset(&fl4, 0, sizeof(fl4));
2517         fl4.daddr = dst;
2518         fl4.saddr = src;
2519         fl4.flowi4_tos = rtm->rtm_tos;
2520         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2521         fl4.flowi4_mark = mark;
2522         fl4.flowi4_uid = uid;
2523
2524         if (netif_index_is_l3_master(net, fl4.flowi4_oif))
2525                 fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
2526
2527         if (iif) {
2528                 struct net_device *dev;
2529
2530                 dev = __dev_get_by_index(net, iif);
2531                 if (!dev) {
2532                         err = -ENODEV;
2533                         goto errout_free;
2534                 }
2535
2536                 skb->protocol   = htons(ETH_P_IP);
2537                 skb->dev        = dev;
2538                 skb->mark       = mark;
2539                 local_bh_disable();
2540                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2541                 local_bh_enable();
2542
2543                 rt = skb_rtable(skb);
2544                 if (err == 0 && rt->dst.error)
2545                         err = -rt->dst.error;
2546         } else {
2547                 rt = ip_route_output_key(net, &fl4);
2548
2549                 err = 0;
2550                 if (IS_ERR(rt))
2551                         err = PTR_ERR(rt);
2552         }
2553
2554         if (err)
2555                 goto errout_free;
2556
2557         skb_dst_set(skb, &rt->dst);
2558         if (rtm->rtm_flags & RTM_F_NOTIFY)
2559                 rt->rt_flags |= RTCF_NOTIFY;
2560
2561         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2562                 table_id = rt->rt_table_id;
2563
2564         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2565                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2566                            RTM_NEWROUTE, 0, 0);
2567         if (err < 0)
2568                 goto errout_free;
2569
2570         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2571 errout:
2572         return err;
2573
2574 errout_free:
2575         kfree_skb(skb);
2576         goto errout;
2577 }
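
/* Illustrative userspace sketch (not part of this file): RTM_GETROUTE
 * requests may carry RTA_UID so the lookup runs as if issued by that
 * UID.  Using libmnl-style helpers it could look roughly like:
 *
 *      struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
 *      struct rtmsg *rtm;
 *
 *      nlh->nlmsg_type = RTM_GETROUTE;
 *      nlh->nlmsg_flags = NLM_F_REQUEST;
 *      rtm = mnl_nlmsg_put_extra_header(nlh, sizeof(*rtm));
 *      rtm->rtm_family = AF_INET;
 *      mnl_attr_put_u32(nlh, RTA_DST, dst_be32);
 *      mnl_attr_put_u32(nlh, RTA_UID, uid);
 */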
2578
2579 void ip_rt_multicast_event(struct in_device *in_dev)
2580 {
2581         rt_cache_flush(dev_net(in_dev->dev));
2582 }
2583
2584 #ifdef CONFIG_SYSCTL
2585 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2586 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2587 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2588 static int ip_rt_gc_elasticity __read_mostly    = 8;
2589
2590 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2591                                         void __user *buffer,
2592                                         size_t *lenp, loff_t *ppos)
2593 {
2594         struct net *net = (struct net *)__ctl->extra1;
2595
2596         if (write) {
2597                 rt_cache_flush(net);
2598                 fnhe_genid_bump(net);
2599                 return 0;
2600         }
2601
2602         return -EINVAL;
2603 }
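
/* Usage sketch (assuming the usual procfs sysctl mount): the file is
 * write-only, and any write flushes the cache and bumps the fnhe genid:
 *
 *      echo 1 > /proc/sys/net/ipv4/route/flush
 */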
2604
2605 static struct ctl_table ipv4_route_table[] = {
2606         {
2607                 .procname       = "gc_thresh",
2608                 .data           = &ipv4_dst_ops.gc_thresh,
2609                 .maxlen         = sizeof(int),
2610                 .mode           = 0644,
2611                 .proc_handler   = proc_dointvec,
2612         },
2613         {
2614                 .procname       = "max_size",
2615                 .data           = &ip_rt_max_size,
2616                 .maxlen         = sizeof(int),
2617                 .mode           = 0644,
2618                 .proc_handler   = proc_dointvec,
2619         },
2620         {
2621                 /*  Deprecated. Use gc_min_interval_ms */
2622
2623                 .procname       = "gc_min_interval",
2624                 .data           = &ip_rt_gc_min_interval,
2625                 .maxlen         = sizeof(int),
2626                 .mode           = 0644,
2627                 .proc_handler   = proc_dointvec_jiffies,
2628         },
2629         {
2630                 .procname       = "gc_min_interval_ms",
2631                 .data           = &ip_rt_gc_min_interval,
2632                 .maxlen         = sizeof(int),
2633                 .mode           = 0644,
2634                 .proc_handler   = proc_dointvec_ms_jiffies,
2635         },
2636         {
2637                 .procname       = "gc_timeout",
2638                 .data           = &ip_rt_gc_timeout,
2639                 .maxlen         = sizeof(int),
2640                 .mode           = 0644,
2641                 .proc_handler   = proc_dointvec_jiffies,
2642         },
2643         {
2644                 .procname       = "gc_interval",
2645                 .data           = &ip_rt_gc_interval,
2646                 .maxlen         = sizeof(int),
2647                 .mode           = 0644,
2648                 .proc_handler   = proc_dointvec_jiffies,
2649         },
2650         {
2651                 .procname       = "redirect_load",
2652                 .data           = &ip_rt_redirect_load,
2653                 .maxlen         = sizeof(int),
2654                 .mode           = 0644,
2655                 .proc_handler   = proc_dointvec,
2656         },
2657         {
2658                 .procname       = "redirect_number",
2659                 .data           = &ip_rt_redirect_number,
2660                 .maxlen         = sizeof(int),
2661                 .mode           = 0644,
2662                 .proc_handler   = proc_dointvec,
2663         },
2664         {
2665                 .procname       = "redirect_silence",
2666                 .data           = &ip_rt_redirect_silence,
2667                 .maxlen         = sizeof(int),
2668                 .mode           = 0644,
2669                 .proc_handler   = proc_dointvec,
2670         },
2671         {
2672                 .procname       = "error_cost",
2673                 .data           = &ip_rt_error_cost,
2674                 .maxlen         = sizeof(int),
2675                 .mode           = 0644,
2676                 .proc_handler   = proc_dointvec,
2677         },
2678         {
2679                 .procname       = "error_burst",
2680                 .data           = &ip_rt_error_burst,
2681                 .maxlen         = sizeof(int),
2682                 .mode           = 0644,
2683                 .proc_handler   = proc_dointvec,
2684         },
2685         {
2686                 .procname       = "gc_elasticity",
2687                 .data           = &ip_rt_gc_elasticity,
2688                 .maxlen         = sizeof(int),
2689                 .mode           = 0644,
2690                 .proc_handler   = proc_dointvec,
2691         },
2692         {
2693                 .procname       = "mtu_expires",
2694                 .data           = &ip_rt_mtu_expires,
2695                 .maxlen         = sizeof(int),
2696                 .mode           = 0644,
2697                 .proc_handler   = proc_dointvec_jiffies,
2698         },
2699         {
2700                 .procname       = "min_pmtu",
2701                 .data           = &ip_rt_min_pmtu,
2702                 .maxlen         = sizeof(int),
2703                 .mode           = 0644,
2704                 .proc_handler   = proc_dointvec,
2705         },
2706         {
2707                 .procname       = "min_adv_mss",
2708                 .data           = &ip_rt_min_advmss,
2709                 .maxlen         = sizeof(int),
2710                 .mode           = 0644,
2711                 .proc_handler   = proc_dointvec,
2712         },
2713         { }
2714 };
2715
2716 static struct ctl_table ipv4_route_flush_table[] = {
2717         {
2718                 .procname       = "flush",
2719                 .maxlen         = sizeof(int),
2720                 .mode           = 0200,
2721                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2722         },
2723         { },
2724 };
2725
2726 static __net_init int sysctl_route_net_init(struct net *net)
2727 {
2728         struct ctl_table *tbl;
2729
2730         tbl = ipv4_route_flush_table;
2731         if (!net_eq(net, &init_net)) {
2732                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2733                 if (!tbl)
2734                         goto err_dup;
2735
2736                 /* Don't export sysctls to unprivileged users */
2737                 if (net->user_ns != &init_user_ns)
2738                         tbl[0].procname = NULL;
2739         }
2740         tbl[0].extra1 = net;
2741
2742         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2743         if (!net->ipv4.route_hdr)
2744                 goto err_reg;
2745         return 0;
2746
2747 err_reg:
2748         if (tbl != ipv4_route_flush_table)
2749                 kfree(tbl);
2750 err_dup:
2751         return -ENOMEM;
2752 }
2753
2754 static __net_exit void sysctl_route_net_exit(struct net *net)
2755 {
2756         struct ctl_table *tbl;
2757
2758         tbl = net->ipv4.route_hdr->ctl_table_arg;
2759         unregister_net_sysctl_table(net->ipv4.route_hdr);
2760         BUG_ON(tbl == ipv4_route_flush_table);
2761         kfree(tbl);
2762 }
2763
2764 static __net_initdata struct pernet_operations sysctl_route_ops = {
2765         .init = sysctl_route_net_init,
2766         .exit = sysctl_route_net_exit,
2767 };
2768 #endif
2769
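/* Per-netns generation counters: bumping rt_genid invalidates all cached
 * routes at once, and fnhe_genid does the same for next-hop exceptions.
 */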
2770 static __net_init int rt_genid_init(struct net *net)
2771 {
2772         atomic_set(&net->ipv4.rt_genid, 0);
2773         atomic_set(&net->fnhe_genid, 0);
2774         get_random_bytes(&net->ipv4.dev_addr_genid,
2775                          sizeof(net->ipv4.dev_addr_genid));
2776         return 0;
2777 }
2778
2779 static __net_initdata struct pernet_operations rt_genid_ops = {
2780         .init = rt_genid_init,
2781 };
2782
2783 static int __net_init ipv4_inetpeer_init(struct net *net)
2784 {
2785         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2786
2787         if (!bp)
2788                 return -ENOMEM;
2789         inet_peer_base_init(bp);
2790         net->ipv4.peers = bp;
2791         return 0;
2792 }
2793
2794 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2795 {
2796         struct inet_peer_base *bp = net->ipv4.peers;
2797
2798         net->ipv4.peers = NULL;
2799         inetpeer_invalidate_tree(bp);
2800         kfree(bp);
2801 }
2802
2803 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2804         .init   =       ipv4_inetpeer_init,
2805         .exit   =       ipv4_inetpeer_exit,
2806 };
2807
2808 #ifdef CONFIG_IP_ROUTE_CLASSID
2809 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2810 #endif /* CONFIG_IP_ROUTE_CLASSID */
2811
2812 int __init ip_rt_init(void)
2813 {
2814         int rc = 0;
2815         int cpu;
2816
2817         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2818         if (!ip_idents)
2819                 panic("IP: failed to allocate ip_idents\n");
2820
2821         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2822
2823         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2824         if (!ip_tstamps)
2825                 panic("IP: failed to allocate ip_tstamps\n");
2826
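        /* Set up the per-cpu lists that track routes living outside the
         * per-nexthop cache (the "uncached" routes).
         */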
2827         for_each_possible_cpu(cpu) {
2828                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2829
2830                 INIT_LIST_HEAD(&ul->head);
2831                 spin_lock_init(&ul->lock);
2832         }
2833 #ifdef CONFIG_IP_ROUTE_CLASSID
2834         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2835         if (!ip_rt_acct)
2836                 panic("IP: failed to allocate ip_rt_acct\n");
2837 #endif
2838
2839         ipv4_dst_ops.kmem_cachep =
2840                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2841                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2842
2843         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2844
2845         if (dst_entries_init(&ipv4_dst_ops) < 0)
2846                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2847
2848         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2849                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2850
2851         ipv4_dst_ops.gc_thresh = ~0;
2852         ip_rt_max_size = INT_MAX;
2853
2854         devinet_init();
2855         ip_fib_init();
2856
2857         if (ip_rt_proc_init())
2858                 pr_err("Unable to create route proc files\n");
2859 #ifdef CONFIG_XFRM
2860         xfrm_init();
2861         xfrm4_init();
2862 #endif
2863         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2864
2865 #ifdef CONFIG_SYSCTL
2866         register_pernet_subsys(&sysctl_route_ops);
2867 #endif
2868         register_pernet_subsys(&rt_genid_ops);
2869         register_pernet_subsys(&ipv4_inetpeer_ops);
2870         return rc;
2871 }
2872
2873 #ifdef CONFIG_SYSCTL
2874 /*
2875  * We really need to sanitize the damn ipv4 init order, then all
2876  * this nonsense will go away.
2877  */
2878 void __init ip_static_sysctl_init(void)
2879 {
2880         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2881 }
2882 #endif