/* net/ipv4/ip_tunnel.c */

/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	if (dst) {
		if (dst->flags & DST_NOCACHE)
			dst = NULL;
		else
			dst_clone(dst);
	}
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
}

static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
{
	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL);
}

static void tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
}

static struct dst_entry *tunnel_dst_get(struct ip_tunnel *t)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
	if (dst)
		dst_hold(dst);
	rcu_read_unlock();
	return dst;
}

static struct dst_entry *tunnel_dst_check(struct ip_tunnel *t, u32 cookie)
{
	struct dst_entry *dst = tunnel_dst_get(t);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		tunnel_dst_reset(t);
		return NULL;
	}

	return dst;
}
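
/* The per-cpu dst cache above follows a common RCU publish/consume
 * pattern: __tunnel_dst_set() publishes with xchg() and drops the old
 * entry with dst_release(), while readers take their own reference
 * under rcu_read_lock(). A minimal caller sketch (illustrative only;
 * do_route_lookup() is a hypothetical helper):
 *
 *	struct dst_entry *dst = tunnel_dst_check(tunnel, 0);
 *
 *	if (!dst)
 *		dst = do_route_lookup(tunnel);
 *	...
 *	dst_release(dst);
 *
 * tunnel_dst_check() either returns the cached entry with a reference
 * held, or resets a cache entry that failed its validity check and
 * returns NULL, letting the caller fall back to a full route lookup.
 */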

/* Often modified stats are per-cpu; others are shared (netdev->stats). */
struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
						struct rtnl_link_stats64 *tot)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_sw_netstats *tstats =
						   per_cpu_ptr(dev->tstats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_bh(&tstats->syncp);
			rx_packets = tstats->rx_packets;
			tx_packets = tstats->tx_packets;
			rx_bytes = tstats->rx_bytes;
			tx_bytes = tstats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;
	}

	tot->multicast = dev->stats.multicast;

	tot->rx_crc_errors = dev->stats.rx_crc_errors;
	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
	tot->rx_errors = dev->stats.rx_errors;

	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
	tot->tx_errors = dev->stats.tx_errors;

	tot->collisions  = dev->stats.collisions;

	return tot;
}
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
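
/* The fetch_begin/fetch_retry loop above is the standard u64_stats
 * reader pattern: on 32-bit SMP the seqcount detects torn 64-bit
 * reads and retries; on 64-bit it compiles down to plain loads.
 * A minimal reader sketch (illustrative, mirroring the loop above):
 *
 *	unsigned int start;
 *	u64 packets;
 *
 *	do {
 *		start = u64_stats_fetch_begin_bh(&tstats->syncp);
 *		packets = tstats->rx_packets;
 *	} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
 */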

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the incoming
   packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
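
/* Example caller (hypothetical, modeled on what a decapsulating
 * protocol handler such as GRE does): parse the outer headers into
 * tpi, then resolve the receiving tunnel. Note the argument order:
 * "remote" is the peer that sent the packet (outer saddr) and "local"
 * is our own address (outer daddr).
 *
 *	struct ip_tunnel *t;
 *
 *	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *			     iph->saddr, iph->daddr, tpi->key);
 *	if (t)
 *		return ip_tunnel_rcv(t, skb, tpi, log_ecn_error);
 */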

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	h = ip_tunnel_hash(parms->i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
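
/* Worked example for the arithmetic above, assuming a GRE tunnel with
 * no options (tunnel->hlen == 4) on top of plain Ethernet: t_hlen is
 * 4 + 20 = 24, so with tdev->mtu == 1500 and hard_header_len == 0 on
 * the tunnel device itself, the resulting tunnel mtu is
 * 1500 - 24 = 1476. The 68-byte floor is the minimum IPv4 MTU.
 */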

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt, *fbt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	fbt = netdev_priv(itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return NULL;

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
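
/* A caller is expected to have parsed the outer encapsulation into a
 * struct tnl_ptk_info and pulled the skb to the inner packet before
 * calling ip_tunnel_rcv(). A minimal sketch (hypothetical values,
 * for illustration only):
 *
 *	struct tnl_ptk_info tpi = {
 *		.flags	= TUNNEL_KEY,
 *		.proto	= htons(ETH_P_IP),
 *		.key	= htonl(42),
 *		.seq	= 0,
 *	};
 *
 *	ip_tunnel_rcv(tunnel, skb, &tpi, true);
 */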

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, const u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt = NULL;	/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected = true;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (connected)
		rt = (struct rtable *)tunnel_dst_check(tunnel, 0);

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len;
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
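
/* Typical use from a driver's ndo_start_xmit, modeled on ipip (the
 * exact wrapper is the driver's business; this is only a sketch):
 *
 *	static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
 *					    struct net_device *dev)
 *	{
 *		struct ip_tunnel *tunnel = netdev_priv(dev);
 *		const struct iphdr *tiph = &tunnel->parms.iph;
 *
 *		ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
 *		return NETDEV_TX_OK;
 *	}
 *
 * ip_tunnel_xmit() consumes the skb on both success and error paths,
 * so the caller always returns NETDEV_TX_OK.
 */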

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == itn->fb_tunnel_dev)
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags&TUNNEL_KEY))
			p->i_key = 0;
		if (!(p->o_flags&TUNNEL_KEY))
			p->o_key = 0;

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (!t && (cmd == SIOCADDTUNNEL))
			t = ip_tunnel_create(net, itn, p);

		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
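
/* Worked example for the upper bound above: 0xFFF8 (65528) is the
 * largest 8-byte-aligned IPv4 payload size. For a GRE tunnel with
 * t_hlen == 24 and hard_header_len == 0, new_mtu may range from 68
 * to 65504.
 */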

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to move to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_RET(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
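
/* Typical use from a protocol module's pernet init, modeled on ipip:
 *
 *	static int __net_init ipip_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, ipip_net_id,
 *					  &ipip_link_ops, "tunl0");
 *	}
 *
 * Passing ops == NULL sets up only the hash table, without creating a
 * fallback device.
 */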

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int i, err;

	dev->destructor = ip_tunnel_dev_free;
	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		struct pcpu_sw_netstats *ipt_stats;

		ipt_stats = per_cpu_ptr(dev->tstats, i);
		u64_stats_init(&ipt_stats->syncp);
	}

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));

	tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");