2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
36 #include <net/protocol.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
57 1. The most important issue is detecting local dead loops.
58 They would cause complete host lockup in transmit, which
59 would be "resolved" by stack overflow or, if queueing is enabled,
60 with infinite looping in net_bh.
62 We cannot track such dead loops during route installation,
63 it is infeasible task. The most general solutions would be
64 to keep skb->encapsulation counter (sort of local ttl),
65 and silently drop packet when it expires. It is the best
66 solution, but it supposes maintaining a new variable in ALL
67 skb, even if no tunneling is used.
69 Current solution: HARD_TX_LOCK lock breaks dead loops.
73 2. Networking dead loops would not kill routers, but would really
74 kill network. IP hop limit plays role of "t->recursion" in this case,
75 if we copy it from packet being encapsulated to upper header.
76 It is very good solution, but it introduces two problems:
78 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
79 do not work over tunnels.
80 - traceroute does not work. I planned to relay ICMP from tunnel,
81 so that this problem would be solved and traceroute output
82 would be even more informative. This idea appeared to be wrong:
83 only Linux complies to rfc1812 now (yes, guys, Linux is the only
84 true router now :-)), all routers (at least, in neighbourhood of mine)
85 return only 8 bytes of payload. It is the end.
87 Hence, if we want that OSPF worked or traceroute said something reasonable,
88 we should search for another solution.
90 One of them is to parse packet trying to detect inner encapsulation
91 made by our node. It is difficult or even impossible, especially,
92 taking into account fragmentation. To be short, it is not a solution at all.
94 Current solution: The solution was UNEXPECTEDLY SIMPLE.
95 We force DF flag on tunnels with preconfigured hop limit,
96 that is ALL. :-) Well, it does not remove the problem completely,
97 but exponential growth of network traffic is changed to linear
98 (branches, that exceed pmtu are pruned) and tunnel mtu
99 fastly degrades to value <68, where looping stops.
100 Yes, it is not good if there exists a router in the loop,
101 which does not force DF, even when encapsulating packets have DF set.
102 But it is not our problem! Nobody could accuse us, we made
103 all that we could make. Even if it is your gated who injected
104 fatal route to network, even if it were you who configured
105 fatal static route: you are innocent. :-)
109 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
110 practically identical code. It would be good to glue them
111 together, but it is not very evident, how to make them modular.
112 sit is integral part of IPv6, ipip and gre are naturally modular.
113 We could extract common parts (hash table, ioctl etc)
114 to a separate module (ip_tunnel.c).
119 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
122 static int ipgre_tunnel_bind_dev(struct net_device *dev);
124 /* Fallback tunnel: no source, no destination, no key, no options */
128 static int ipgre_net_id __read_mostly;
130 struct ip_tunnel *tunnels[4][HASH_SIZE];
132 struct net_device *fb_tunnel_dev;
135 /* Tunnel hash table */
145 We require exact key match i.e. if a key is present in packet
146 it will match only tunnel with the same key; if it is not present,
147 it will match only keyless tunnel.
149 All keyless packets, if not matched by configured keyless tunnels
150 will match fallback tunnel.
/*
 * HASH() folds an address or key into a 4-bit bucket index (0..15) by
 * XOR-ing the value with itself shifted right by four bits and masking
 * with 0xF.
 *
 * The tunnels_* names alias the four rows of the per-namespace tunnels[]
 * table.  As used by ipgre_tunnel_lookup(): tunnels_r_l entries are matched
 * on both remote and local address, tunnels_r on the remote address,
 * tunnels_l on the local address, and tunnels_wc on the key alone.
 */
153 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
155 #define tunnels_r_l tunnels[3]
156 #define tunnels_r tunnels[2]
157 #define tunnels_l tunnels[1]
158 #define tunnels_wc tunnels[0]
160 * Locking : hash tables are protected by RCU and a spinlock
162 static DEFINE_SPINLOCK(ipgre_lock);
/*
 * Walk a tunnel chain starting at 'start', reading each link with
 * rcu_dereference().  The cursor is a variable named 't' that the
 * caller must have declared.
 */
164 #define for_each_ip_tunnel_rcu(start) \
165 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 /* Given src, dst and key, find appropriate for input tunnel. */
/*
 * Select the tunnel that should handle a packet seen on 'dev' with the
 * given outer remote/local addresses, GRE key and GRE protocol.
 *
 * The four hash tables are scanned in turn; in every pass an entry is
 * rejected when its key differs or its device is not IFF_UP.  Each pass
 * compares a per-entry 'score' with 'cand_score' (initially 4).
 * NOTE(review): the score computation, the candidate bookkeeping and the
 * early returns are on lines not visible in this extract.
 * The last visible step returns the fallback device's tunnel if that
 * device is up.
 */
169 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
170 __be32 remote, __be32 local,
171 __be32 key, __be16 gre_proto)
173 struct net *net = dev_net(dev);
174 int link = dev->ifindex;
175 unsigned h0 = HASH(remote);
176 unsigned h1 = HASH(key);
177 struct ip_tunnel *t, *cand = NULL;
178 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
/* ETH_P_TEB payloads are matched against ARPHRD_ETHER tunnel devices. */
179 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
180 ARPHRD_ETHER : ARPHRD_IPGRE;
181 int score, cand_score = 4;
/* Pass 1: tunnels with both this local and this remote address. */
183 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
184 if (local != t->parms.iph.saddr ||
185 remote != t->parms.iph.daddr ||
186 key != t->parms.i_key ||
187 !(t->dev->flags & IFF_UP))
190 if (t->dev->type != ARPHRD_IPGRE &&
191 t->dev->type != dev_type)
195 if (t->parms.link != link)
197 if (t->dev->type != dev_type)
202 if (score < cand_score) {
/* Pass 2: tunnels matched on the remote address only. */
208 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
209 if (remote != t->parms.iph.daddr ||
210 key != t->parms.i_key ||
211 !(t->dev->flags & IFF_UP))
214 if (t->dev->type != ARPHRD_IPGRE &&
215 t->dev->type != dev_type)
219 if (t->parms.link != link)
221 if (t->dev->type != dev_type)
226 if (score < cand_score) {
/*
 * Pass 3: tunnels matched on the local address; a multicast 'local'
 * may instead equal the tunnel's configured destination.
 */
232 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
233 if ((local != t->parms.iph.saddr &&
234 (local != t->parms.iph.daddr ||
235 !ipv4_is_multicast(local))) ||
236 key != t->parms.i_key ||
237 !(t->dev->flags & IFF_UP))
240 if (t->dev->type != ARPHRD_IPGRE &&
241 t->dev->type != dev_type)
245 if (t->parms.link != link)
247 if (t->dev->type != dev_type)
252 if (score < cand_score) {
/* Pass 4: wildcard tunnels, matched on the key alone. */
258 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
259 if (t->parms.i_key != key ||
260 !(t->dev->flags & IFF_UP))
263 if (t->dev->type != ARPHRD_IPGRE &&
264 t->dev->type != dev_type)
268 if (t->parms.link != link)
270 if (t->dev->type != dev_type)
275 if (score < cand_score) {
/* Last resort: the per-namespace fallback device, if it is up. */
284 dev = ign->fb_tunnel_dev;
285 if (dev->flags & IFF_UP)
286 return netdev_priv(dev);
/*
 * Return the address of the hash-chain head for a tunnel with the given
 * parameters: row 'prio' of ign->tunnels[], column 'h' (seeded from
 * HASH(i_key)).  A non-multicast remote address is one of the inputs to
 * the row selection.
 * NOTE(review): the lines that declare and compute 'prio' (and any further
 * adjustment of 'h') are not visible in this extract.
 */
291 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
292 struct ip_tunnel_parm *parms)
294 __be32 remote = parms->iph.daddr;
295 __be32 local = parms->iph.saddr;
296 __be32 key = parms->i_key;
297 unsigned h = HASH(key);
302 if (remote && !ipv4_is_multicast(remote)) {
307 return &ign->tunnels[prio][h];
/*
 * Convenience wrapper: find the hash-chain head for an existing tunnel
 * 't' from its stored parameters.
 */
310 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
313 return __ipgre_bucket(ign, &t->parms);
/*
 * Publish tunnel 't' at the head of its hash bucket.  The head pointer is
 * updated with rcu_assign_pointer() while holding ipgre_lock (BH-safe).
 * NOTE(review): a statement between the lock and the publish is not
 * visible here; presumably it chains t->next to the old head -- confirm.
 */
316 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
318 struct ip_tunnel **tp = ipgre_bucket(ign, t);
320 spin_lock_bh(&ipgre_lock);
322 rcu_assign_pointer(*tp, t);
323 spin_unlock_bh(&ipgre_lock);
/*
 * Remove tunnel 't' from its hash bucket.  The chain is walked through
 * the address of each link so the matching link can be rewritten in
 * place; the rewrite happens with ipgre_lock held.
 * NOTE(review): the match test and the pointer update itself are on
 * lines not visible in this extract.
 */
326 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
328 struct ip_tunnel **tp;
330 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
332 spin_lock_bh(&ipgre_lock);
334 spin_unlock_bh(&ipgre_lock);
/*
 * Exact-match search for a configured tunnel.  Scans the bucket selected
 * by 'parms' for an entry whose local address, remote address, input key,
 * link and device type all equal the requested values.
 * NOTE(review): the declaration of the 'type' parameter and the return
 * statements are on lines not visible in this extract.
 */
340 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
341 struct ip_tunnel_parm *parms,
344 __be32 remote = parms->iph.daddr;
345 __be32 local = parms->iph.saddr;
346 __be32 key = parms->i_key;
347 int link = parms->link;
348 struct ip_tunnel *t, **tp;
349 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
352 if (local == t->parms.iph.saddr &&
353 remote == t->parms.iph.daddr &&
354 key == t->parms.i_key &&
355 link == t->parms.link &&
356 type == t->dev->type)
/*
 * Look up an ARPHRD_IPGRE tunnel matching 'parms' and, on the creation
 * path, build a new one:
 *  - the device name comes from parms->name, or the "gre%d" template
 *    (the sprintf format "gre%%d" produces a literal "gre%d");
 *  - a name still containing '%' is resolved with dev_alloc_name();
 *  - the MTU is taken from ipgre_tunnel_bind_dev();
 *  - the device is registered and the tunnel linked into the hash table.
 * NOTE(review): the tests on the lookup result and on 'create', the
 * declaration of 'name', the copy of parms into the new tunnel and the
 * failure/return paths are on lines not visible in this extract.
 */
362 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
363 struct ip_tunnel_parm *parms, int create)
365 struct ip_tunnel *t, *nt;
366 struct net_device *dev;
368 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
370 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
375 strlcpy(name, parms->name, IFNAMSIZ);
377 sprintf(name, "gre%%d");
379 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
383 dev_net_set(dev, net);
385 if (strchr(name, '%')) {
386 if (dev_alloc_name(dev, name) < 0)
390 nt = netdev_priv(dev);
392 dev->rtnl_link_ops = &ipgre_link_ops;
394 dev->mtu = ipgre_tunnel_bind_dev(dev);
396 if (register_netdevice(dev) < 0)
400 ipgre_tunnel_link(ign, nt);
/*
 * .ndo_uninit hook: take the device's tunnel out of the hash table of the
 * network namespace the device belongs to.
 */
408 static void ipgre_tunnel_uninit(struct net_device *dev)
410 struct net *net = dev_net(dev);
411 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
413 ipgre_tunnel_unlink(ign, netdev_priv(dev));
/*
 * ICMP error handler for GRE (installed as .err_handler of
 * ipgre_protocol).
 *
 * skb->data is treated as the IP header of the offending packet quoted in
 * the ICMP message, followed by its GRE header.  The GRE flags decide how
 * long the quoted header must be (grehlen); a too-short quote is detected
 * with skb_headlen().  The ICMP type/code are then classified, and the
 * tunnel is looked up with the quoted addresses swapped: the quoted
 * destination is passed as 'remote' and the quoted source as 'local'.
 * The final checks single out tunnels without a unicast remote,
 * TIME_EXCEEDED on tunnels whose ttl is 0, and errors arriving within
 * IPTUNNEL_ERR_TIMEO of the previous one; the time of the error is stored
 * in t->err_time.
 * NOTE(review): the branch bodies (returns, counters) are on lines not
 * visible in this extract.
 */
418 static void ipgre_err(struct sk_buff *skb, u32 info)
421 /* All the routers (except for Linux) return only
422 8 bytes of packet payload. It means, that precise relaying of
423 ICMP in the real Internet is absolutely infeasible.
425 Moreover, Cisco "wise men" put GRE key to the third word
426 in GRE header. It makes impossible maintaining even soft state for keyed
427 GRE tunnels with enabled checksum. Tell them "thank you".
429 Well, I wonder, rfc1812 was written by Cisco employee,
430 what the hell these idiots break standrads established
434 struct iphdr *iph = (struct iphdr *)skb->data;
435 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
436 int grehlen = (iph->ihl<<2) + 4;
437 const int type = icmp_hdr(skb)->type;
438 const int code = icmp_hdr(skb)->code;
443 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
444 if (flags&(GRE_VERSION|GRE_ROUTING))
453 /* If only 8 bytes returned, keyed message will be dropped here */
454 if (skb_headlen(skb) < grehlen)
459 case ICMP_PARAMETERPROB:
462 case ICMP_DEST_UNREACH:
465 case ICMP_PORT_UNREACH:
466 /* Impossible event. */
468 case ICMP_FRAG_NEEDED:
469 /* Soft state for pmtu is maintained by IP core. */
472 /* All others are translated to HOST_UNREACH.
473 rfc2003 contains "deep thoughts" about NET_UNREACH,
474 I believe they are just ether pollution. --ANK
479 case ICMP_TIME_EXCEEDED:
480 if (code != ICMP_EXC_TTL)
486 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
488 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
490 if (t == NULL || t->parms.iph.daddr == 0 ||
491 ipv4_is_multicast(t->parms.iph.daddr))
494 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
497 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
501 t->err_time = jiffies;
/*
 * Propagate congestion marking inward on receive: when the outer header
 * 'iph' carries the ECN CE codepoint in its TOS, set CE on the inner
 * IPv4 or IPv6 header of 'skb' (selected by skb->protocol).  Other inner
 * protocols are left untouched.
 */
507 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
509 if (INET_ECN_is_ce(iph->tos)) {
510 if (skb->protocol == htons(ETH_P_IP)) {
511 IP_ECN_set_ce(ip_hdr(skb));
512 } else if (skb->protocol == htons(ETH_P_IPV6)) {
513 IP6_ECN_set_ce(ipv6_hdr(skb));
/*
 * Compute the TOS byte for the outer header on transmit.  The inner
 * packet's DS field is read from the IPv4 tos or the IPv6 dsfield
 * (selected by skb->protocol) and combined with the requested 'tos' by
 * INET_ECN_encapsulate().
 * NOTE(review): the return type and the declaration/initial value of
 * 'inner' are on lines not visible in this extract.
 */
519 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
522 if (skb->protocol == htons(ETH_P_IP))
523 inner = old_iph->tos;
524 else if (skb->protocol == htons(ETH_P_IPV6))
525 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
526 return INET_ECN_encapsulate(tos, inner);
/*
 * Receive handler for GRE packets (installed as .handler of
 * ipgre_protocol).
 *
 * Visible steps:
 *  - require 16 bytes to be pullable, then parse the GRE flags; version
 *    and routing bits are tested separately from the optional fields;
 *  - with GRE_CSUM, obtain the checksum result from skb->csum or
 *    __skb_checksum_complete(); read the key and sequence number at
 *    'offset'; read the GRE protocol from bytes 2-3 of the GRE header;
 *  - pick the tunnel with ipgre_tunnel_lookup() using the outer source as
 *    'remote' and the outer destination as 'local';
 *  - WCCP with no flags is relabelled as IPv4;
 *  - pull the GRE header and fix up the receive checksum;
 *  - a bad checksum, or a missing one when the tunnel's i_flags demand it,
 *    counts as rx_crc_errors; a missing or stale sequence number when the
 *    tunnel demands GRE_SEQ counts as rx_fifo_errors;
 *  - ARPHRD_ETHER tunnels need ETH_HLEN more bytes and run
 *    eth_type_trans(); the skb is re-homed on the tunnel device and the
 *    ECN CE mark is propagated inward.
 * The icmp_send(PORT_UNREACH) near the end presumably covers the case
 * where no tunnel matched -- NOTE(review): confirm, the surrounding
 * control flow is not visible in this extract.
 */
529 static int ipgre_rcv(struct sk_buff *skb)
537 struct ip_tunnel *tunnel;
542 if (!pskb_may_pull(skb, 16))
549 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
550 /* - Version must be 0.
551 - We do not support routing headers.
553 if (flags&(GRE_VERSION|GRE_ROUTING))
556 if (flags&GRE_CSUM) {
557 switch (skb->ip_summed) {
558 case CHECKSUM_COMPLETE:
559 csum = csum_fold(skb->csum);
565 csum = __skb_checksum_complete(skb);
566 skb->ip_summed = CHECKSUM_COMPLETE;
571 key = *(__be32*)(h + offset);
575 seqno = ntohl(*(__be32*)(h + offset));
580 gre_proto = *(__be16 *)(h + 2);
583 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
584 iph->saddr, iph->daddr, key,
586 struct net_device_stats *stats = &tunnel->dev->stats;
590 skb->protocol = gre_proto;
591 /* WCCP version 1 and 2 protocol decoding.
592 * - Change protocol to IP
593 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
595 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
596 skb->protocol = htons(ETH_P_IP);
597 if ((*(h + offset) & 0xF0) != 0x40)
601 skb->mac_header = skb->network_header;
602 __pskb_pull(skb, offset);
603 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
604 skb->pkt_type = PACKET_HOST;
605 #ifdef CONFIG_NET_IPGRE_BROADCAST
606 if (ipv4_is_multicast(iph->daddr)) {
607 /* Looped back packet, drop it! */
608 if (skb_rtable(skb)->fl.iif == 0)
611 skb->pkt_type = PACKET_BROADCAST;
615 if (((flags&GRE_CSUM) && csum) ||
616 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
617 stats->rx_crc_errors++;
621 if (tunnel->parms.i_flags&GRE_SEQ) {
622 if (!(flags&GRE_SEQ) ||
623 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
624 stats->rx_fifo_errors++;
628 tunnel->i_seqno = seqno + 1;
633 /* Warning: All skb pointers will be invalidated! */
634 if (tunnel->dev->type == ARPHRD_ETHER) {
635 if (!pskb_may_pull(skb, ETH_HLEN)) {
636 stats->rx_length_errors++;
642 skb->protocol = eth_type_trans(skb, tunnel->dev);
643 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
647 stats->rx_bytes += len;
648 skb->dev = tunnel->dev;
652 skb_reset_network_header(skb);
653 ipgre_ecn_decapsulate(iph, skb);
659 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/*
 * .ndo_start_xmit hook: wrap 'skb' in an outer IPv4 + GRE header and hand
 * it on towards the tunnel's remote endpoint.
 *
 * Outline of the visible steps:
 *  - choose the template outer header: the one already at skb->data when
 *    the device has header_ops and is ARPHRD_IPGRE, else
 *    tunnel->parms.iph (with gre_hlen = tunnel->hlen);
 *  - if the template has no destination, derive one from the inner
 *    packet: the IPv4 route's gateway, or for IPv6 the neighbour key /
 *    header destination, which must be an IPv4-compatible address;
 *  - route the outer packet; failure counts as tx_carrier_errors;
 *  - compute the usable MTU, update the inner dst's PMTU, and answer with
 *    ICMP FRAG_NEEDED (IPv4 with DF) or ICMPv6 PKT_TOOBIG when the inner
 *    packet does not fit;
 *  - while err_count is pending and the last error is recent, signal
 *    dst_link_failure(); otherwise reset err_count;
 *  - ensure enough writable headroom, reallocating the skb if needed;
 *  - push the outer header: TOS via ipgre_ecn_encapsulate(); TTL from the
 *    template, or if that is 0 inherited from the inner IPv4/IPv6 header,
 *    else the route's hop-limit metric; then GRE flags, protocol
 *    (ETH_P_TEB for ARPHRD_ETHER devices) and the optional sequence
 *    number, key and checksum selected by o_flags, written from the end
 *    of the header backwards.
 * NOTE(review): error labels, statistics updates and the final send are
 * on lines not visible in this extract.
 */
668 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
670 struct ip_tunnel *tunnel = netdev_priv(dev);
671 struct net_device_stats *stats = &dev->stats;
672 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
673 struct iphdr *old_iph = ip_hdr(skb);
677 struct rtable *rt; /* Route to the other host */
678 struct net_device *tdev; /* Device to other host */
679 struct iphdr *iph; /* Our new IP header */
680 unsigned int max_headroom; /* The extra header space needed */
685 if (dev->type == ARPHRD_ETHER)
686 IPCB(skb)->flags = 0;
688 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
690 tiph = (struct iphdr *)skb->data;
692 gre_hlen = tunnel->hlen;
693 tiph = &tunnel->parms.iph;
696 if ((dst = tiph->daddr) == 0) {
699 if (skb_dst(skb) == NULL) {
700 stats->tx_fifo_errors++;
704 if (skb->protocol == htons(ETH_P_IP)) {
705 rt = skb_rtable(skb);
706 if ((dst = rt->rt_gateway) == 0)
710 else if (skb->protocol == htons(ETH_P_IPV6)) {
711 struct in6_addr *addr6;
713 struct neighbour *neigh = skb_dst(skb)->neighbour;
718 addr6 = (struct in6_addr *)&neigh->primary_key;
719 addr_type = ipv6_addr_type(addr6);
721 if (addr_type == IPV6_ADDR_ANY) {
722 addr6 = &ipv6_hdr(skb)->daddr;
723 addr_type = ipv6_addr_type(addr6);
726 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
729 dst = addr6->s6_addr32[3];
739 if (skb->protocol == htons(ETH_P_IP))
744 struct flowi fl = { .oif = tunnel->parms.link,
747 .saddr = tiph->saddr,
748 .tos = RT_TOS(tos) } },
749 .proto = IPPROTO_GRE };
750 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
751 stats->tx_carrier_errors++;
755 tdev = rt->u.dst.dev;
765 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
767 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
770 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
772 if (skb->protocol == htons(ETH_P_IP)) {
773 df |= (old_iph->frag_off&htons(IP_DF));
775 if ((old_iph->frag_off&htons(IP_DF)) &&
776 mtu < ntohs(old_iph->tot_len)) {
777 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
783 else if (skb->protocol == htons(ETH_P_IPV6)) {
784 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
786 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
787 if ((tunnel->parms.iph.daddr &&
788 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
789 rt6->rt6i_dst.plen == 128) {
790 rt6->rt6i_flags |= RTF_MODIFIED;
791 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
795 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
796 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
803 if (tunnel->err_count > 0) {
804 if (time_before(jiffies,
805 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
808 dst_link_failure(skb);
810 tunnel->err_count = 0;
813 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
815 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
816 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
817 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
825 skb_set_owner_w(new_skb, skb->sk);
828 old_iph = ip_hdr(skb);
831 skb_reset_transport_header(skb);
832 skb_push(skb, gre_hlen);
833 skb_reset_network_header(skb);
834 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
835 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
838 skb_dst_set(skb, &rt->u.dst);
841 * Push down and install the IPIP header.
846 iph->ihl = sizeof(struct iphdr) >> 2;
848 iph->protocol = IPPROTO_GRE;
849 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
850 iph->daddr = rt->rt_dst;
851 iph->saddr = rt->rt_src;
853 if ((iph->ttl = tiph->ttl) == 0) {
854 if (skb->protocol == htons(ETH_P_IP))
855 iph->ttl = old_iph->ttl;
857 else if (skb->protocol == htons(ETH_P_IPV6))
858 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
861 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
864 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
865 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
866 htons(ETH_P_TEB) : skb->protocol;
868 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
869 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
871 if (tunnel->parms.o_flags&GRE_SEQ) {
873 *ptr = htonl(tunnel->o_seqno);
876 if (tunnel->parms.o_flags&GRE_KEY) {
877 *ptr = tunnel->parms.o_key;
880 if (tunnel->parms.o_flags&GRE_CSUM) {
882 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
892 dst_link_failure(skb);
/*
 * Work out header sizes and a suitable MTU for tunnel device 'dev'.
 *
 * The underlying output device is guessed from a route lookup towards the
 * configured remote, or failing that from parms.link; when found, its
 * hard_header_len + needed_headroom replaces the LL_MAX_HEADER default.
 * 'addend' starts as the IPv4 header plus the 4-byte base GRE header and
 * is adjusted for each of GRE_CSUM / GRE_KEY / GRE_SEQ set in o_flags.
 * The function sets dev->iflink, dev->needed_headroom and tunnel->hlen;
 * several callers store its result in dev->mtu.
 * NOTE(review): the per-option increments, the MTU clamp and the return
 * statement are on lines not visible in this extract.
 */
900 static int ipgre_tunnel_bind_dev(struct net_device *dev)
902 struct net_device *tdev = NULL;
903 struct ip_tunnel *tunnel;
905 int hlen = LL_MAX_HEADER;
906 int mtu = ETH_DATA_LEN;
907 int addend = sizeof(struct iphdr) + 4;
909 tunnel = netdev_priv(dev);
910 iph = &tunnel->parms.iph;
912 /* Guess output device to choose reasonable mtu and needed_headroom */
915 struct flowi fl = { .oif = tunnel->parms.link,
917 { .daddr = iph->daddr,
919 .tos = RT_TOS(iph->tos) } },
920 .proto = IPPROTO_GRE };
922 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
923 tdev = rt->u.dst.dev;
927 if (dev->type != ARPHRD_ETHER)
928 dev->flags |= IFF_POINTOPOINT;
931 if (!tdev && tunnel->parms.link)
932 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
935 hlen = tdev->hard_header_len + tdev->needed_headroom;
938 dev->iflink = tunnel->parms.link;
940 /* Precalculate GRE options length */
941 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
942 if (tunnel->parms.o_flags&GRE_CSUM)
944 if (tunnel->parms.o_flags&GRE_KEY)
946 if (tunnel->parms.o_flags&GRE_SEQ)
949 dev->needed_headroom = addend + hlen;
950 mtu -= dev->hard_header_len + addend;
955 tunnel->hlen = addend;
/*
 * .ndo_do_ioctl hook implementing the tunnel query / add / change / delete
 * requests.  NOTE(review): the switch and its case labels are on lines not
 * visible in this extract; the only command names that appear are
 * SIOCADDTUNNEL and SIOCCHGTUNNEL.
 *
 *  - Query path: on the fallback device the target tunnel is located from
 *    user-supplied parms; the tunnel's parms are copied back to user space.
 *  - Add/change path: needs CAP_NET_ADMIN.  The template IP header must be
 *    version 4, protocol GRE, ihl 5, carry no frag_off bits other than DF,
 *    and neither flag word may contain GRE_VERSION or GRE_ROUTING.  When
 *    changing a non-fallback device, the tunnel is unlinked, its
 *    addresses and keys are replaced, and it is linked again (it may hash
 *    to a different bucket).  A change also updates ttl, tos, frag_off and
 *    the link; a new link triggers ipgre_tunnel_bind_dev().
 *  - Delete path: needs CAP_NET_ADMIN; on the fallback device the victim
 *    is located from user parms, and the fallback's own tunnel is tested
 *    for specially before unregister_netdevice().
 */
961 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
964 struct ip_tunnel_parm p;
966 struct net *net = dev_net(dev);
967 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
972 if (dev == ign->fb_tunnel_dev) {
973 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
977 t = ipgre_tunnel_locate(net, &p, 0);
980 t = netdev_priv(dev);
981 memcpy(&p, &t->parms, sizeof(p));
982 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
989 if (!capable(CAP_NET_ADMIN))
993 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
997 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
998 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
999 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1002 p.iph.frag_off |= htons(IP_DF);
1004 if (!(p.i_flags&GRE_KEY))
1006 if (!(p.o_flags&GRE_KEY))
1009 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1011 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1013 if (t->dev != dev) {
1018 unsigned nflags = 0;
1020 t = netdev_priv(dev);
1022 if (ipv4_is_multicast(p.iph.daddr))
1023 nflags = IFF_BROADCAST;
1024 else if (p.iph.daddr)
1025 nflags = IFF_POINTOPOINT;
1027 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1031 ipgre_tunnel_unlink(ign, t);
1032 t->parms.iph.saddr = p.iph.saddr;
1033 t->parms.iph.daddr = p.iph.daddr;
1034 t->parms.i_key = p.i_key;
1035 t->parms.o_key = p.o_key;
1036 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1037 memcpy(dev->broadcast, &p.iph.daddr, 4);
1038 ipgre_tunnel_link(ign, t);
1039 netdev_state_change(dev);
1045 if (cmd == SIOCCHGTUNNEL) {
1046 t->parms.iph.ttl = p.iph.ttl;
1047 t->parms.iph.tos = p.iph.tos;
1048 t->parms.iph.frag_off = p.iph.frag_off;
1049 if (t->parms.link != p.link) {
1050 t->parms.link = p.link;
1051 dev->mtu = ipgre_tunnel_bind_dev(dev);
1052 netdev_state_change(dev);
1055 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1058 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1063 if (!capable(CAP_NET_ADMIN))
1066 if (dev == ign->fb_tunnel_dev) {
1068 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1071 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1074 if (t == netdev_priv(ign->fb_tunnel_dev))
1078 unregister_netdevice(dev);
/*
 * .ndo_change_mtu hook.  The visible part of the range check bounds
 * new_mtu from above by 0xFFF8 minus the device's hard_header_len and the
 * tunnel header length.
 * NOTE(review): the lower bound, the error return and the assignment to
 * dev->mtu are on lines not visible in this extract.
 */
1090 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1092 struct ip_tunnel *tunnel = netdev_priv(dev);
1094 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1100 /* Nice toy. Unfortunately, useless in real life :-)
1101 It allows to construct virtual multiprotocol broadcast "LAN"
1102 over the Internet, provided multicast routing is tuned.
1105 I have no idea was this bicycle invented before me,
1106 so that I had to set ARPHRD_IPGRE to a random value.
1107 I have an impression, that Cisco could make something similar,
1108 but this feature is apparently missing in IOS<=11.2(8).
1110 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1111 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1113 ping -t 255 224.66.66.66
1115 If nobody answers, mbone does not work.
1117 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1118 ip addr add 10.66.66.<somewhat>/24 dev Universe
1119 ifconfig Universe up
1120 ifconfig Universe add fe80::<Your_real_addr>/10
1121 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1124 ftp fec0:6666:6666::193.233.7.65
/*
 * header_ops .create callback: prebuild the outer header in front of the
 * payload.  Pushes t->hlen bytes, copies the tunnel's template IPv4
 * header, stores o_flags as the first 16-bit word after it, and copies
 * the supplied 4-byte 'saddr' / 'daddr' into the outer source and
 * destination fields.
 * NOTE(review): the second GRE word, the conditions guarding the two
 * address copies and the return values are on lines not visible here.
 */
1129 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1130 unsigned short type,
1131 const void *daddr, const void *saddr, unsigned len)
1133 struct ip_tunnel *t = netdev_priv(dev);
1134 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1135 __be16 *p = (__be16*)(iph+1);
1137 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1138 p[0] = t->parms.o_flags;
1142 * Set the source hardware address.
1146 memcpy(&iph->saddr, saddr, 4);
1148 memcpy(&iph->daddr, daddr, 4);
/*
 * header_ops .parse callback: treat the skb's MAC header as an IPv4
 * header and copy its 4-byte source address into 'haddr'.
 * NOTE(review): the return statement is not visible in this extract.
 */
1155 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1157 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1158 memcpy(haddr, &iph->saddr, 4);
/*
 * Link-layer header callbacks for GRE devices; installed on a device by
 * ipgre_tunnel_init().
 */
1162 static const struct header_ops ipgre_header_ops = {
1163 .create = ipgre_header,
1164 .parse = ipgre_header_parse,
1167 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * .ndo_open hook (only built with CONFIG_NET_IPGRE_BROADCAST).  For a
 * tunnel whose remote is a multicast group, route towards the group to
 * find the underlying device, remember its ifindex in t->mlink and join
 * the group on it.  Returns -EADDRNOTAVAIL when no route exists or the
 * underlying device has no in_device.  Note that 'dev' is reassigned to
 * the underlying device inside the multicast branch.
 */
1168 static int ipgre_open(struct net_device *dev)
1170 struct ip_tunnel *t = netdev_priv(dev);
1172 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1173 struct flowi fl = { .oif = t->parms.link,
1175 { .daddr = t->parms.iph.daddr,
1176 .saddr = t->parms.iph.saddr,
1177 .tos = RT_TOS(t->parms.iph.tos) } },
1178 .proto = IPPROTO_GRE };
1180 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1181 return -EADDRNOTAVAIL;
1182 dev = rt->u.dst.dev;
1184 if (__in_dev_get_rtnl(dev) == NULL)
1185 return -EADDRNOTAVAIL;
1186 t->mlink = dev->ifindex;
1187 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
/*
 * .ndo_stop hook, counterpart of ipgre_open(): if the tunnel's remote is
 * a multicast group and an underlying device index was recorded in
 * t->mlink, look that device's in_device up and leave the group.
 * NOTE(review): the NULL check on in_dev, the reference drop and the
 * return are on lines not visible in this extract.
 */
1192 static int ipgre_close(struct net_device *dev)
1194 struct ip_tunnel *t = netdev_priv(dev);
1196 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1197 struct in_device *in_dev;
1198 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1200 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
/*
 * Device operations for GRE tunnel devices (installed by
 * ipgre_tunnel_setup()).  The open/stop hooks exist only in builds with
 * CONFIG_NET_IPGRE_BROADCAST.
 */
1209 static const struct net_device_ops ipgre_netdev_ops = {
1210 .ndo_init = ipgre_tunnel_init,
1211 .ndo_uninit = ipgre_tunnel_uninit,
1212 #ifdef CONFIG_NET_IPGRE_BROADCAST
1213 .ndo_open = ipgre_open,
1214 .ndo_stop = ipgre_close,
1216 .ndo_start_xmit = ipgre_tunnel_xmit,
1217 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1218 .ndo_change_mtu = ipgre_tunnel_change_mtu,
/*
 * Setup callback for GRE tunnel devices (passed to alloc_netdev() and
 * used as .setup of ipgre_link_ops).  Installs the device operations and
 * free_netdev as destructor, marks the device ARPHRD_IPGRE / IFF_NOARP
 * and namespace-local, and sets default headroom and MTU for an IPv4
 * header plus the 4-byte base GRE header.  IFF_XMIT_DST_RELEASE is
 * cleared; the transmit routine reads skb_dst() of the packets it sends.
 */
1221 static void ipgre_tunnel_setup(struct net_device *dev)
1223 dev->netdev_ops = &ipgre_netdev_ops;
1224 dev->destructor = free_netdev;
1226 dev->type = ARPHRD_IPGRE;
1227 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1228 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1229 dev->flags = IFF_NOARP;
1232 dev->features |= NETIF_F_NETNS_LOCAL;
1233 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
/*
 * .ndo_init hook for GRE tunnel devices.  Records the device name in the
 * tunnel parms and exposes the tunnel's local and remote IPv4 addresses
 * as the device's hardware and broadcast addresses.  In builds with
 * CONFIG_NET_IPGRE_BROADCAST a multicast remote makes the device
 * IFF_BROADCAST with ipgre_header_ops installed; ipgre_header_ops is also
 * installed on a second path.
 * NOTE(review): the conditions selecting between those paths and the
 * return value are on lines not visible in this extract.
 */
1236 static int ipgre_tunnel_init(struct net_device *dev)
1238 struct ip_tunnel *tunnel;
1241 tunnel = netdev_priv(dev);
1242 iph = &tunnel->parms.iph;
1245 strcpy(tunnel->parms.name, dev->name);
1247 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1248 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1251 #ifdef CONFIG_NET_IPGRE_BROADCAST
1252 if (ipv4_is_multicast(iph->daddr)) {
1255 dev->flags = IFF_BROADCAST;
1256 dev->header_ops = &ipgre_header_ops;
1260 dev->header_ops = &ipgre_header_ops;
/*
 * Initialise the per-namespace fallback tunnel device: record its name,
 * set the template header's protocol to GRE, use the minimal header
 * length (IPv4 header + 4-byte GRE header) and install the tunnel
 * directly as entry 0 of the wildcard table.
 * NOTE(review): further template-header fields are set on lines not
 * visible in this extract.
 */
1265 static void ipgre_fb_tunnel_init(struct net_device *dev)
1267 struct ip_tunnel *tunnel = netdev_priv(dev);
1268 struct iphdr *iph = &tunnel->parms.iph;
1269 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1272 strcpy(tunnel->parms.name, dev->name);
1275 iph->protocol = IPPROTO_GRE;
1277 tunnel->hlen = sizeof(struct iphdr) + 4;
1280 ign->tunnels_wc[0] = tunnel;
/*
 * Protocol hooks registered for IPPROTO_GRE by ipgre_init(): packet
 * reception and ICMP error handling.
 */
1284 static const struct net_protocol ipgre_protocol = {
1285 .handler = ipgre_rcv,
1286 .err_handler = ipgre_err,
/*
 * Queue tunnel devices from all four hash tables (every priority row and
 * every bucket) on 'head' for later batched unregistration.
 * NOTE(review): the loop that follows each bucket's chain is on lines
 * not visible in this extract.
 */
1290 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1294 for (prio = 0; prio < 4; prio++) {
1296 for (h = 0; h < HASH_SIZE; h++) {
1297 struct ip_tunnel *t = ign->tunnels[prio][h];
1300 unregister_netdevice_queue(t->dev, head);
/*
 * Per-namespace initialisation (.init of ipgre_net_ops): allocate the
 * fallback device "gre0", bind it to 'net', initialise it as the fallback
 * tunnel, attach ipgre_link_ops and register it.  The device is freed on
 * the visible error path.
 * NOTE(review): error codes, labels and the return statements are on
 * lines not visible in this extract.
 */
1307 static int __net_init ipgre_init_net(struct net *net)
1309 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1312 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1313 ipgre_tunnel_setup);
1314 if (!ign->fb_tunnel_dev) {
1318 dev_net_set(ign->fb_tunnel_dev, net);
1320 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1321 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1323 if ((err = register_netdev(ign->fb_tunnel_dev)))
1329 free_netdev(ign->fb_tunnel_dev);
/*
 * Per-namespace teardown (.exit of ipgre_net_ops): collect the
 * namespace's tunnel devices on a local list and unregister them in one
 * batch.
 * NOTE(review): the list declaration and any locking around these calls
 * are on lines not visible in this extract.
 */
1334 static void __net_exit ipgre_exit_net(struct net *net)
1336 struct ipgre_net *ign;
1339 ign = net_generic(net, ipgre_net_id);
1341 ipgre_destroy_tunnels(ign, &list);
1342 unregister_netdevice_many(&list);
/*
 * Per-network-namespace registration: init/exit callbacks plus the id
 * and size used to obtain a struct ipgre_net through
 * net_generic(net, ipgre_net_id).
 */
1346 static struct pernet_operations ipgre_net_ops = {
1347 .init = ipgre_init_net,
1348 .exit = ipgre_exit_net,
1349 .id = &ipgre_net_id,
1350 .size = sizeof(struct ipgre_net),
/*
 * rtnl_link_ops .validate callback for "gre" links: OR together the
 * IFLA_GRE_IFLAGS and IFLA_GRE_OFLAGS attributes (when present) and test
 * the result for GRE_VERSION or GRE_ROUTING bits.
 * NOTE(review): the declaration of 'flags', the handling of a missing
 * 'data' array and the return values are on lines not visible here.
 */
1353 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1361 if (data[IFLA_GRE_IFLAGS])
1362 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1363 if (data[IFLA_GRE_OFLAGS])
1364 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1365 if (flags & (GRE_VERSION|GRE_ROUTING))
/*
 * rtnl_link_ops .validate callback for "gretap" links.  A supplied
 * IFLA_ADDRESS must be exactly ETH_ALEN bytes and a valid Ethernet
 * address (-EADDRNOTAVAIL otherwise); IFLA_GRE_REMOTE, when present, is
 * read into 'daddr' for a check whose body is not visible here.  The
 * generic GRE flag validation is then applied.
 */
1371 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1375 if (tb[IFLA_ADDRESS]) {
1376 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1378 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1379 return -EADDRNOTAVAIL;
1385 if (data[IFLA_GRE_REMOTE]) {
1386 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1392 return ipgre_tunnel_validate(tb, data);
/*
 * Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm.
 * 'parms' is zeroed first and the template protocol set to GRE, so any
 * attribute that is absent leaves its field at zero.  Path-MTU discovery
 * is on by default: DF is set in the template unless IFLA_GRE_PMTUDISC
 * is present with value 0.
 * NOTE(review): an early return for a missing 'data' array may exist on
 * lines not visible in this extract.
 */
1395 static void ipgre_netlink_parms(struct nlattr *data[],
1396 struct ip_tunnel_parm *parms)
1398 memset(parms, 0, sizeof(*parms));
1400 parms->iph.protocol = IPPROTO_GRE;
1405 if (data[IFLA_GRE_LINK])
1406 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1408 if (data[IFLA_GRE_IFLAGS])
1409 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1411 if (data[IFLA_GRE_OFLAGS])
1412 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1414 if (data[IFLA_GRE_IKEY])
1415 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1417 if (data[IFLA_GRE_OKEY])
1418 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1420 if (data[IFLA_GRE_LOCAL])
1421 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1423 if (data[IFLA_GRE_REMOTE])
1424 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1426 if (data[IFLA_GRE_TTL])
1427 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1429 if (data[IFLA_GRE_TOS])
1430 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1432 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1433 parms->iph.frag_off = htons(IP_DF);
/*
 * .ndo_init hook for gretap devices: record the device name in the
 * tunnel parms and compute header sizes via ipgre_tunnel_bind_dev(),
 * whose return value is not used here.
 * NOTE(review): the return statement is not visible in this extract.
 */
1436 static int ipgre_tap_init(struct net_device *dev)
1438 struct ip_tunnel *tunnel;
1440 tunnel = netdev_priv(dev);
1443 strcpy(tunnel->parms.name, dev->name);
1445 ipgre_tunnel_bind_dev(dev);
/*
 * Device operations for gretap devices.  Transmit, uninit and MTU
 * handling are shared with the plain GRE device; MAC address handling
 * uses the generic Ethernet helpers.
 */
1450 static const struct net_device_ops ipgre_tap_netdev_ops = {
1451 .ndo_init = ipgre_tap_init,
1452 .ndo_uninit = ipgre_tunnel_uninit,
1453 .ndo_start_xmit = ipgre_tunnel_xmit,
1454 .ndo_set_mac_address = eth_mac_addr,
1455 .ndo_validate_addr = eth_validate_addr,
1456 .ndo_change_mtu = ipgre_tunnel_change_mtu,
/*
 * Setup callback for gretap devices (.setup of ipgre_tap_ops): install
 * the tap device operations and free_netdev as destructor, and mark the
 * device namespace-local.
 * NOTE(review): additional setup statements are on lines not visible in
 * this extract.
 */
1459 static void ipgre_tap_setup(struct net_device *dev)
1464 dev->netdev_ops = &ipgre_tap_netdev_ops;
1465 dev->destructor = free_netdev;
1468 dev->features |= NETIF_F_NETNS_LOCAL;
/*
 * rtnl_link_ops .newlink callback shared by "gre" and "gretap".
 * Fills the new tunnel's parms from the netlink attributes, checks
 * whether a tunnel with the same parameters and device type already
 * exists, gives Ethernet-type devices a random MAC address when none was
 * supplied, computes the MTU with ipgre_tunnel_bind_dev(), registers the
 * device and links the tunnel into the hash table.
 * NOTE(review): the duplicate-tunnel branch, the MTU assignment and the
 * error/return paths are on lines not visible in this extract.
 */
1471 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1472 struct nlattr *data[])
1474 struct ip_tunnel *nt;
1475 struct net *net = dev_net(dev);
1476 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1480 nt = netdev_priv(dev);
1481 ipgre_netlink_parms(data, &nt->parms);
1483 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1486 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1487 random_ether_addr(dev->dev_addr);
1489 mtu = ipgre_tunnel_bind_dev(dev);
1493 err = register_netdevice(dev);
1498 ipgre_tunnel_link(ign, nt);
/*
 * rtnl_link_ops .changelink callback shared by "gre" and "gretap".
 * The fallback device is singled out first.  The new parameters are
 * parsed into 'p' and used to look for an existing tunnel.  For
 * non-Ethernet devices the POINTOPOINT/BROADCAST flag implied by the new
 * remote is compared with the device's current flags.  The tunnel is
 * then unlinked, given the new addresses and input key, and linked again
 * (it may hash to a different bucket); non-Ethernet devices also get
 * their hardware/broadcast addresses refreshed.  Finally the output key,
 * ttl, tos, frag_off and link are updated, re-running
 * ipgre_tunnel_bind_dev() when the link changed.
 * NOTE(review): the branch bodies for the fallback device, for a
 * conflicting existing tunnel and for a flag mismatch, plus the MTU
 * update and return, are on lines not visible in this extract.
 */
1504 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1505 struct nlattr *data[])
1507 struct ip_tunnel *t, *nt;
1508 struct net *net = dev_net(dev);
1509 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1510 struct ip_tunnel_parm p;
1513 if (dev == ign->fb_tunnel_dev)
1516 nt = netdev_priv(dev);
1517 ipgre_netlink_parms(data, &p);
1519 t = ipgre_tunnel_locate(net, &p, 0);
1527 if (dev->type != ARPHRD_ETHER) {
1528 unsigned nflags = 0;
1530 if (ipv4_is_multicast(p.iph.daddr))
1531 nflags = IFF_BROADCAST;
1532 else if (p.iph.daddr)
1533 nflags = IFF_POINTOPOINT;
1535 if ((dev->flags ^ nflags) &
1536 (IFF_POINTOPOINT | IFF_BROADCAST))
1540 ipgre_tunnel_unlink(ign, t);
1541 t->parms.iph.saddr = p.iph.saddr;
1542 t->parms.iph.daddr = p.iph.daddr;
1543 t->parms.i_key = p.i_key;
1544 if (dev->type != ARPHRD_ETHER) {
1545 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1546 memcpy(dev->broadcast, &p.iph.daddr, 4);
1548 ipgre_tunnel_link(ign, t);
1549 netdev_state_change(dev);
1552 t->parms.o_key = p.o_key;
1553 t->parms.iph.ttl = p.iph.ttl;
1554 t->parms.iph.tos = p.iph.tos;
1555 t->parms.iph.frag_off = p.iph.frag_off;
1557 if (t->parms.link != p.link) {
1558 t->parms.link = p.link;
1559 mtu = ipgre_tunnel_bind_dev(dev);
1562 netdev_state_change(dev);
/*
 * rtnl_link_ops .get_size callback: reports the attribute space needed
 * for a link's GRE attributes, built from one term per IFLA_GRE_*
 * attribute.
 * NOTE(review): the size expressions themselves and several of the
 * per-attribute labels are on lines not visible in this extract.
 */
1568 static size_t ipgre_get_size(const struct net_device *dev)
1573 /* IFLA_GRE_IFLAGS */
1575 /* IFLA_GRE_OFLAGS */
1581 /* IFLA_GRE_LOCAL */
1583 /* IFLA_GRE_REMOTE */
1589 /* IFLA_GRE_PMTUDISC */
/*
 * rtnl_link_ops .fill_info callback: dump the tunnel's parameters into
 * 'skb' as IFLA_GRE_* attributes, one per field.  IFLA_GRE_PMTUDISC is
 * reported as 1 when DF is set in the template header's frag_off and 0
 * otherwise.
 * NOTE(review): the return statements and the failure label used by the
 * NLA_PUT_* macros are on lines not visible in this extract.
 */
1594 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1596 struct ip_tunnel *t = netdev_priv(dev);
1597 struct ip_tunnel_parm *p = &t->parms;
1599 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1600 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1601 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1602 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1603 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1604 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1605 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1606 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1607 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1608 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
/*
 * Netlink attribute policy for IFLA_GRE_* attributes.  The local and
 * remote addresses are constrained by length (the size of the IPv4
 * header's address fields) rather than by a typed policy entry.
 */
1616 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1617 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1618 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1619 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1620 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1621 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
1622 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1623 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1624 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1625 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1626 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
/*
 * rtnetlink link operations for plain GRE tunnel devices.
 * NOTE(review): the .kind member is on a line not visible in this
 * extract.
 */
1629 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1631 .maxtype = IFLA_GRE_MAX,
1632 .policy = ipgre_policy,
1633 .priv_size = sizeof(struct ip_tunnel),
1634 .setup = ipgre_tunnel_setup,
1635 .validate = ipgre_tunnel_validate,
1636 .newlink = ipgre_newlink,
1637 .changelink = ipgre_changelink,
1638 .get_size = ipgre_get_size,
1639 .fill_info = ipgre_fill_info,
/*
 * rtnetlink link operations for Ethernet-over-GRE (tap) devices.  Only
 * .setup and .validate differ from ipgre_link_ops.
 * NOTE(review): the .kind member is on a line not visible in this
 * extract.
 */
1642 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1644 .maxtype = IFLA_GRE_MAX,
1645 .policy = ipgre_policy,
1646 .priv_size = sizeof(struct ip_tunnel),
1647 .setup = ipgre_tap_setup,
1648 .validate = ipgre_tap_validate,
1649 .newlink = ipgre_newlink,
1650 .changelink = ipgre_changelink,
1651 .get_size = ipgre_get_size,
1652 .fill_info = ipgre_fill_info,
1656 * And now the modules code and kernel interface.
/*
 * Module initialisation.  Registers, in order: the per-namespace
 * operations, the IPPROTO_GRE protocol handler, and the two rtnetlink
 * link types.  The calls at the end undo the earlier registrations in
 * reverse order for the failure paths named by the gotos.
 * NOTE(review): the error tests, the label lines and the return
 * statements are on lines not visible in this extract.
 */
1659 static int __init ipgre_init(void)
1663 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1665 err = register_pernet_device(&ipgre_net_ops);
1669 err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
1671 printk(KERN_INFO "ipgre init: can't add protocol\n");
1672 goto add_proto_failed;
1675 err = rtnl_link_register(&ipgre_link_ops);
1677 goto rtnl_link_failed;
1679 err = rtnl_link_register(&ipgre_tap_ops);
1681 goto tap_ops_failed;
1687 rtnl_link_unregister(&ipgre_link_ops);
1689 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1691 unregister_pernet_device(&ipgre_net_ops);
/*
 * Module teardown: unregister everything ipgre_init() registered, in
 * reverse order.  A failure to remove the protocol handler is only
 * logged.
 */
1695 static void __exit ipgre_fini(void)
1697 rtnl_link_unregister(&ipgre_tap_ops);
1698 rtnl_link_unregister(&ipgre_link_ops);
1699 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1700 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1701 unregister_pernet_device(&ipgre_net_ops);
1704 module_init(ipgre_init);
1705 module_exit(ipgre_fini);
1706 MODULE_LICENSE("GPL");
1707 MODULE_ALIAS_RTNL_LINK("gre");
1708 MODULE_ALIAS_RTNL_LINK("gretap");