2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after a year-long coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
53 #define pr_fmt(fmt) "TCP: " fmt
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
66 #include <net/net_namespace.h>
68 #include <net/inet_hashtables.h>
70 #include <net/transp_v6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94 __be32 daddr, __be32 saddr, const struct tcphdr *th);
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
100 static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 return secure_tcp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr,
105 tcp_hdr(skb)->dest, tcp_hdr(skb)->source);
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
113 /* With PAWS, it is safe from the viewpoint
114 of data integrity. Even without PAWS it is safe provided sequence
115 spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
117 Actually, the idea is close to VJ's one, only the timestamp cache is
118 held not per host, but per port pair, and the TW bucket is used as the state holder.
121 If the TW bucket has already been destroyed we fall back to VJ's scheme
122 and use the initial timestamp retrieved from the peer table.
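In practice (a sketch of the rule implemented below): with
net.ipv4.tcp_tw_reuse enabled, an outgoing connect() may reuse a port
pair still in TIME_WAIT once more than one second has passed since the
last recorded timestamp, and write_seq is started beyond tw_snd_nxt so
the new sequence space cannot overlap the old one.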
124 if (tcptw->tw_ts_recent_stamp &&
125 (!twp || (sysctl_tcp_tw_reuse &&
126 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128 if (tp->write_seq == 0) tp->write_seq = 1;
130 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
131 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144 struct inet_sock *inet = inet_sk(sk);
145 struct tcp_sock *tp = tcp_sk(sk);
146 __be16 orig_sport, orig_dport;
147 __be32 daddr, nexthop;
151 struct ip_options_rcu *inet_opt;
153 if (addr_len < sizeof(struct sockaddr_in))
156 if (usin->sin_family != AF_INET)
157 return -EAFNOSUPPORT;
159 nexthop = daddr = usin->sin_addr.s_addr;
160 inet_opt = rcu_dereference_protected(inet->inet_opt,
161 sock_owned_by_user(sk));
162 if (inet_opt && inet_opt->opt.srr) {
165 nexthop = inet_opt->opt.faddr;
168 orig_sport = inet->inet_sport;
169 orig_dport = usin->sin_port;
170 fl4 = &inet->cork.fl.u.ip4;
171 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174 orig_sport, orig_dport, sk);
177 if (err == -ENETUNREACH)
178 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
182 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187 if (!inet_opt || !inet_opt->opt.srr)
190 if (!inet->inet_saddr)
191 inet->inet_saddr = fl4->saddr;
192 sk_rcv_saddr_set(sk, inet->inet_saddr);
194 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195 /* Reset inherited state */
196 tp->rx_opt.ts_recent = 0;
197 tp->rx_opt.ts_recent_stamp = 0;
198 if (likely(!tp->repair)) tp->write_seq = 0;
202 if (tcp_death_row.sysctl_tw_recycle &&
203 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204 tcp_fetch_timewait_stamp(sk, &rt->dst);
206 inet->inet_dport = usin->sin_port;
207 sk_daddr_set(sk, daddr);
209 inet_csk(sk)->icsk_ext_hdr_len = 0;
211 if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
213 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
215 /* Socket identity is still unknown (sport may be zero).
216 * However we set the state to SYN-SENT and, without releasing the socket
217 * lock, select a source port, enter ourselves into the hash tables and
218 * complete initialization after this.
220 tcp_set_state(sk, TCP_SYN_SENT);
221 err = inet_hash_connect(&tcp_death_row, sk);
227 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 inet->inet_sport, inet->inet_dport, sk);
234 /* OK, now commit destination to socket. */
235 sk->sk_gso_type = SKB_GSO_TCPV4;
236 sk_setup_caps(sk, &rt->dst);
238 if (!tp->write_seq && likely(!tp->repair))
239 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, inet->inet_daddr, inet->inet_sport, usin->sin_port);
244 inet->inet_id = tp->write_seq ^ jiffies;
246 err = tcp_connect(sk);
256 * This unhashes the socket and releases the local port, if necessary.
259 tcp_set_state(sk, TCP_CLOSE);
261 sk->sk_route_caps = 0;
262 inet->inet_dport = 0;
265 EXPORT_SYMBOL(tcp_v4_connect);
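/* Illustrative userspace sketch of the path above (example only, not part
 * of the kernel source; error handling omitted):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * connect() on a TCP socket reaches tcp_v4_connect() through
 * inet_stream_connect() and sk->sk_prot->connect, which is where the route
 * lookup, source port selection and SYN transmission above are driven from.
 */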
268 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269 * It can be called through tcp_release_cb() if socket was owned by user
270 * at the time tcp_v4_err() was called to handle ICMP message.
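 * In that case tcp_v4_err() only records tp->mtu_info and sets the
 * TCP_MTU_REDUCED_DEFERRED flag, and tcp_release_cb() calls back into this
 * function once the socket lock is released (see the deferral in
 * tcp_v4_err() below).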
272 void tcp_v4_mtu_reduced(struct sock *sk)
274 struct dst_entry *dst;
275 struct inet_sock *inet = inet_sk(sk);
276 u32 mtu = tcp_sk(sk)->mtu_info;
278 dst = inet_csk_update_pmtu(sk, mtu);
282 /* Something is about to go wrong... Remember the soft error
283 * for the case that this connection will not be able to recover.
285 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 sk->sk_err_soft = EMSGSIZE;
290 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 ip_sk_accept_pmtu(sk) &&
292 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293 tcp_sync_mss(sk, mtu);
295 /* Resend the TCP packet because it's
296 * clear that the old packet has been
297 * dropped. This is the new "fast" path mtu discovery.
300 tcp_simple_retransmit(sk);
301 } /* else let the usual retransmit timer handle it */
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
307 struct dst_entry *dst = __sk_dst_check(sk, 0);
310 if (dst) dst->ops->redirect(dst, sk, skb);
314 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
315 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
317 struct request_sock *req = inet_reqsk(sk);
318 struct net *net = sock_net(sk);
320 /* ICMPs are not backlogged, hence we cannot get
321 * an established socket here.
323 if (seq != tcp_rsk(req)->snt_isn) {
324 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
327 * Still in SYN_RECV, just remove it silently.
328 * There is no good way to pass the error to the newly
329 * created socket, and POSIX does not want network
330 * errors returned from accept().
332 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
333 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
337 EXPORT_SYMBOL(tcp_req_err);
340 * This routine is called by the ICMP module when it gets some
341 * sort of error condition. If err < 0 then the socket should
342 * be closed and the error returned to the user. If err > 0
343 * it's just the icmp type << 8 | icmp code. After adjustment
344 * header points to the first 8 bytes of the tcp header. We need
345 * to find the appropriate port.
347 * The locking strategy used here is very "optimistic". When
348 * someone else accesses the socket the ICMP is just dropped
349 * and for some paths there is no check at all.
350 * A more general error queue to queue errors for later handling
351 * is probably better.
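 * For example, a port-unreachable ICMP error (type ICMP_DEST_UNREACH == 3,
 * code ICMP_PORT_UNREACH == 3) arrives here with the error encoded as
 * (3 << 8) | 3, and icmp_err_convert[ICMP_PORT_UNREACH].errno maps it to
 * ECONNREFUSED, a hard error for a connecting socket.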
355 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
358 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
359 struct inet_connection_sock *icsk;
361 struct inet_sock *inet;
362 const int type = icmp_hdr(icmp_skb)->type;
363 const int code = icmp_hdr(icmp_skb)->code;
366 struct request_sock *fastopen;
370 struct net *net = dev_net(icmp_skb->dev);
372 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
373 th->dest, iph->saddr, ntohs(th->source),
376 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
379 if (sk->sk_state == TCP_TIME_WAIT) {
380 inet_twsk_put(inet_twsk(sk));
383 seq = ntohl(th->seq);
384 if (sk->sk_state == TCP_NEW_SYN_RECV)
385 return tcp_req_err(sk, seq,
386 type == ICMP_PARAMETERPROB ||
387 type == ICMP_TIME_EXCEEDED ||
388 (type == ICMP_DEST_UNREACH &&
389 (code == ICMP_NET_UNREACH ||
390 code == ICMP_HOST_UNREACH)));
393 /* If too many ICMPs get dropped on busy
394 * servers this needs to be solved differently.
395 * We do take care of the PMTU discovery (RFC 1191) special case:
396 * we can receive locally generated ICMP messages while the socket is held.
398 if (sock_owned_by_user(sk)) {
399 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
400 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
402 if (sk->sk_state == TCP_CLOSE)
405 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
406 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
412 /* XXX (TFO) - tp->snd_una should be ISN (see tcp_create_openreq_child()) */
413 fastopen = tp->fastopen_rsk;
414 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
415 if (sk->sk_state != TCP_LISTEN &&
416 !between(seq, snd_una, tp->snd_nxt)) {
417 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
423 do_redirect(icmp_skb, sk);
425 case ICMP_SOURCE_QUENCH:
426 /* Just silently ignore these. */
428 case ICMP_PARAMETERPROB:
431 case ICMP_DEST_UNREACH:
432 if (code > NR_ICMP_UNREACH)
435 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
436 /* We are not interested in TCP_LISTEN and open_requests
437 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
438 * they should go through unfragmented).
440 if (sk->sk_state == TCP_LISTEN)
444 if (!sock_owned_by_user(sk)) {
445 tcp_v4_mtu_reduced(sk);
447 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) sock_hold(sk);
453 err = icmp_err_convert[code].errno;
454 /* check if icmp_skb allows revert of backoff
455 * (see draft-zimmermann-tcp-lcd) */
456 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
458 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
459 !icsk->icsk_backoff || fastopen)
462 if (sock_owned_by_user(sk))
465 icsk->icsk_backoff--;
466 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
468 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
470 skb = tcp_write_queue_head(sk);
473 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
475 tcp_time_stamp - tcp_skb_timestamp(skb));
478 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
479 remaining, TCP_RTO_MAX);
481 /* The RTO revert clocked out the retransmission.
482 * Will retransmit now. */
483 tcp_retransmit_timer(sk);
487 case ICMP_TIME_EXCEEDED:
494 switch (sk->sk_state) {
497 /* Only in fast or simultaneous open. If a fast open socket
498 * is already accepted it is treated as a connected one below.
500 if (fastopen && !fastopen->sk)
503 if (!sock_owned_by_user(sk)) {
506 sk->sk_error_report(sk);
510 sk->sk_err_soft = err;
515 /* If we've already connected we will keep trying
516 * until we time out, or the user gives up.
518 * RFC 1122 4.2.3.9 allows us to consider as hard errors
519 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
520 * but it is obsoleted by pmtu discovery).
522 * Note that in the modern internet, where routing is unreliable
523 * and broken firewalls sit in each dark corner, sending random
524 * errors ordered by their masters, even these two messages finally lose
525 * their original sense (even Linux sends invalid PORT_UNREACHs).
527 * Now we are in compliance with RFCs.
532 if (!sock_owned_by_user(sk) && inet->recverr) {
534 sk->sk_error_report(sk);
535 } else { /* Only an error on timeout */
536 sk->sk_err_soft = err;
544 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
546 struct tcphdr *th = tcp_hdr(skb);
548 if (skb->ip_summed == CHECKSUM_PARTIAL) {
549 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
550 skb->csum_start = skb_transport_header(skb) - skb->head;
551 skb->csum_offset = offsetof(struct tcphdr, check);
553 th->check = tcp_v4_check(skb->len, saddr, daddr, csum_partial(th, th->doff << 2, skb->csum));
560 /* This routine computes an IPv4 TCP checksum. */
561 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
563 const struct inet_sock *inet = inet_sk(sk);
565 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
567 EXPORT_SYMBOL(tcp_v4_send_check);
570 * This routine will send an RST to the other tcp.
572 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
574 * Answer: if a packet caused an RST, it is not for a socket
575 * existing in our system; if it is matched to a socket,
576 * it is just a duplicate segment or a bug in the other side's TCP.
577 * So we build the reply based only on the parameters
578 * that arrived with the segment.
579 * Exception: precedence violation. We do not implement it in any case.
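 *
 * Per RFC 793 reset generation (a summary of the code below): if the
 * incoming segment carries an ACK, the RST takes its sequence number from
 * that ACK field and carries no ACK of its own; otherwise the RST has
 * seq == 0 and acks SEG.SEQ + SEG.LEN so the other end will accept it.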
582 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
584 const struct tcphdr *th = tcp_hdr(skb);
587 #ifdef CONFIG_TCP_MD5SIG
588 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
591 struct ip_reply_arg arg;
592 #ifdef CONFIG_TCP_MD5SIG
593 struct tcp_md5sig_key *key;
594 const __u8 *hash_location = NULL;
595 unsigned char newhash[16];
597 struct sock *sk1 = NULL;
601 /* Never send a reset in response to a reset. */
605 /* If sk is not NULL, it means we did a successful lookup and the incoming
606 * route had to be correct. prequeue might have dropped our dst.
608 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
611 /* Swap the send and the receive. */
612 memset(&rep, 0, sizeof(rep));
613 rep.th.dest = th->source;
614 rep.th.source = th->dest;
615 rep.th.doff = sizeof(struct tcphdr) / 4;
619 rep.th.seq = th->ack_seq;
622 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
623 skb->len - (th->doff << 2));
626 memset(&arg, 0, sizeof(arg));
627 arg.iov[0].iov_base = (unsigned char *)&rep;
628 arg.iov[0].iov_len = sizeof(rep.th);
630 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
631 #ifdef CONFIG_TCP_MD5SIG
632 hash_location = tcp_parse_md5sig_option(th);
633 if (!sk && hash_location) {
635 * active side is lost. Try to find the listening socket through the
636 * source port, and then find the md5 key through the listening socket.
637 * We do not lose security here:
638 * the incoming packet is checked with the md5 hash of the key we find;
639 * no RST is generated if the md5 hash doesn't match.
641 sk1 = __inet_lookup_listener(net,
642 &tcp_hashinfo, ip_hdr(skb)->saddr,
643 th->source, ip_hdr(skb)->daddr,
644 ntohs(th->source), inet_iif(skb));
645 /* don't send an rst if we can't find the key */
649 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
650 &ip_hdr(skb)->saddr, AF_INET);
654 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
655 if (genhash || memcmp(hash_location, newhash, 16) != 0)
658 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
664 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
666 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
668 /* Update length and the length the header thinks exists */
669 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
670 rep.th.doff = arg.iov[0].iov_len / 4;
672 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
673 key, ip_hdr(skb)->saddr,
674 ip_hdr(skb)->daddr, &rep.th);
677 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
678 ip_hdr(skb)->saddr, /* XXX */
679 arg.iov[0].iov_len, IPPROTO_TCP, 0);
680 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
681 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
682 /* When the socket is gone, all binding information is lost.
683 * Routing might fail in this case. No choice here: if we choose to force
684 * the input interface, we will misroute in case of an asymmetric route.
687 arg.bound_dev_if = sk->sk_bound_dev_if;
689 arg.tos = ip_hdr(skb)->tos;
690 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
691 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
692 skb, &TCP_SKB_CB(skb)->header.h4.opt,
693 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
694 &arg, arg.iov[0].iov_len);
696 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
697 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
699 #ifdef CONFIG_TCP_MD5SIG
708 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
709 outside of socket context, is certainly ugly. What can I do?
712 static void tcp_v4_send_ack(const struct sock *sk,
713 struct sk_buff *skb, u32 seq, u32 ack,
714 u32 win, u32 tsval, u32 tsecr, int oif,
715 struct tcp_md5sig_key *key,
716 int reply_flags, u8 tos)
718 const struct tcphdr *th = tcp_hdr(skb);
721 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
722 #ifdef CONFIG_TCP_MD5SIG
723 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 struct net *net = sock_net(sk);
728 struct ip_reply_arg arg;
730 memset(&rep.th, 0, sizeof(struct tcphdr));
731 memset(&arg, 0, sizeof(arg));
733 arg.iov[0].iov_base = (unsigned char *)&rep;
734 arg.iov[0].iov_len = sizeof(rep.th);
736 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
737 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
739 rep.opt[1] = htonl(tsval);
740 rep.opt[2] = htonl(tsecr);
741 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744 /* Swap the send and the receive. */
745 rep.th.dest = th->source;
746 rep.th.source = th->dest;
747 rep.th.doff = arg.iov[0].iov_len / 4;
748 rep.th.seq = htonl(seq);
749 rep.th.ack_seq = htonl(ack);
751 rep.th.window = htons(win);
753 #ifdef CONFIG_TCP_MD5SIG
755 int offset = (tsecr) ? 3 : 0;
757 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
759 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
761 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
762 rep.th.doff = arg.iov[0].iov_len/4;
764 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
765 key, ip_hdr(skb)->saddr,
766 ip_hdr(skb)->daddr, &rep.th);
769 arg.flags = reply_flags;
770 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
771 ip_hdr(skb)->saddr, /* XXX */
772 arg.iov[0].iov_len, IPPROTO_TCP, 0);
773 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
775 arg.bound_dev_if = oif;
777 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
778 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
779 skb, &TCP_SKB_CB(skb)->header.h4.opt,
780 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
781 &arg, arg.iov[0].iov_len);
783 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
786 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
788 struct inet_timewait_sock *tw = inet_twsk(sk);
789 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
791 tcp_v4_send_ack(sk, skb,
792 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
793 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
794 tcp_time_stamp + tcptw->tw_ts_offset,
797 tcp_twsk_md5_key(tcptw),
798 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
805 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
806 struct request_sock *req)
808 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
809 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
811 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_rsk(req)->snt_nxt;
815 * The window field (SEG.WND) of every outgoing segment, with the
816 * exception of <SYN> segments, MUST be right-shifted by
817 * Rcv.Wind.Shift bits:
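 *
 * For example, a 131072-byte receive window with rcv_wscale == 7 is
 * advertised on the wire as 131072 >> 7 == 1024.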
819 tcp_v4_send_ack(sk, skb, seq,
820 tcp_rsk(req)->rcv_nxt,
821 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
825 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
827 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
832 * Send a SYN-ACK after having received a SYN.
833 * This still operates on a request_sock only, not on a big socket.
836 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
838 struct request_sock *req,
839 struct tcp_fastopen_cookie *foc,
842 const struct inet_request_sock *ireq = inet_rsk(req);
847 /* First, grab a route. */
848 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
851 skb = tcp_make_synack(sk, dst, req, foc, attach_req);
854 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
856 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
859 err = net_xmit_eval(err);
866 * IPv4 request_sock destructor.
868 static void tcp_v4_reqsk_destructor(struct request_sock *req)
870 kfree(inet_rsk(req)->opt);
874 #ifdef CONFIG_TCP_MD5SIG
876 * RFC2385 MD5 checksumming requires a mapping of
877 * IP address->MD5 Key.
878 * We need to maintain these in the sk structure.
881 /* Find the Key structure for an address. */
882 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
883 const union tcp_md5_addr *addr,
886 const struct tcp_sock *tp = tcp_sk(sk);
887 struct tcp_md5sig_key *key;
888 unsigned int size = sizeof(struct in_addr);
889 const struct tcp_md5sig_info *md5sig;
891 /* caller either holds rcu_read_lock() or socket lock */
892 md5sig = rcu_dereference_check(tp->md5sig_info,
893 sock_owned_by_user(sk) ||
894 lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
897 #if IS_ENABLED(CONFIG_IPV6)
898 if (family == AF_INET6)
899 size = sizeof(struct in6_addr);
901 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
902 if (key->family != family)
904 if (!memcmp(&key->addr, addr, size))
909 EXPORT_SYMBOL(tcp_md5_do_lookup);
911 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
912 const struct sock *addr_sk)
914 const union tcp_md5_addr *addr;
916 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
917 return tcp_md5_do_lookup(sk, addr, AF_INET);
919 EXPORT_SYMBOL(tcp_v4_md5_lookup);
921 /* This can be called on a newly created socket, from other files */
922 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
923 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
925 /* Add Key to the list */
926 struct tcp_md5sig_key *key;
927 struct tcp_sock *tp = tcp_sk(sk);
928 struct tcp_md5sig_info *md5sig;
930 key = tcp_md5_do_lookup(sk, addr, family);
932 /* Pre-existing entry - just update that one. */
933 memcpy(key->key, newkey, newkeylen);
934 key->keylen = newkeylen;
938 md5sig = rcu_dereference_protected(tp->md5sig_info,
939 sock_owned_by_user(sk) ||
940 lockdep_is_held(&sk->sk_lock.slock));
942 md5sig = kmalloc(sizeof(*md5sig), gfp);
946 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
947 INIT_HLIST_HEAD(&md5sig->head);
948 rcu_assign_pointer(tp->md5sig_info, md5sig);
951 key = sock_kmalloc(sk, sizeof(*key), gfp);
954 if (!tcp_alloc_md5sig_pool()) {
955 sock_kfree_s(sk, key, sizeof(*key));
959 memcpy(key->key, newkey, newkeylen);
960 key->keylen = newkeylen;
961 key->family = family;
962 memcpy(&key->addr, addr,
963 (family == AF_INET6) ? sizeof(struct in6_addr) :
964 sizeof(struct in_addr));
965 hlist_add_head_rcu(&key->node, &md5sig->head);
968 EXPORT_SYMBOL(tcp_md5_do_add);
970 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
972 struct tcp_md5sig_key *key;
974 key = tcp_md5_do_lookup(sk, addr, family);
977 hlist_del_rcu(&key->node);
978 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
982 EXPORT_SYMBOL(tcp_md5_do_del);
984 static void tcp_clear_md5_list(struct sock *sk)
986 struct tcp_sock *tp = tcp_sk(sk);
987 struct tcp_md5sig_key *key;
988 struct hlist_node *n;
989 struct tcp_md5sig_info *md5sig;
991 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
993 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
994 hlist_del_rcu(&key->node);
995 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1000 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1003 struct tcp_md5sig cmd;
1004 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1006 if (optlen < sizeof(cmd))
1009 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1012 if (sin->sin_family != AF_INET)
1015 if (!cmd.tcpm_keylen)
1016 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1019 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1022 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1023 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
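/* Illustrative userspace usage of the TCP_MD5SIG option parsed above
 * (a sketch only, not part of this file; error handling omitted):
 *
 *	struct tcp_md5sig md5;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	memset(&md5, 0, sizeof(md5));
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * The same key must be configured on the peer for its segments to be
 * accepted (RFC 2385).
 */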
1027 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1028 __be32 daddr, __be32 saddr, int nbytes)
1030 struct tcp4_pseudohdr *bp;
1031 struct scatterlist sg;
1033 bp = &hp->md5_blk.ip4;
1036 * 1. the TCP pseudo-header (in the order: source IP address,
1037 * destination IP address, zero-padded protocol number, and segment length)
1043 bp->protocol = IPPROTO_TCP;
1044 bp->len = cpu_to_be16(nbytes);
1046 sg_init_one(&sg, bp, sizeof(*bp));
1047 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
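/* For reference, the IPv4 pseudo-header hashed above (struct tcp4_pseudohdr,
 * declared in <net/tcp.h>) lays out as:
 *
 *	__be32	saddr;		source IP address
 *	__be32	daddr;		destination IP address
 *	__u8	pad;		always zero
 *	__u8	protocol;	IPPROTO_TCP
 *	__be16	len;		TCP segment length
 */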
1050 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1051 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1053 struct tcp_md5sig_pool *hp;
1054 struct hash_desc *desc;
1056 hp = tcp_get_md5sig_pool();
1058 goto clear_hash_noput;
1059 desc = &hp->md5_desc;
1061 if (crypto_hash_init(desc))
1063 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1065 if (tcp_md5_hash_header(hp, th))
1067 if (tcp_md5_hash_key(hp, key))
1069 if (crypto_hash_final(desc, md5_hash))
1072 tcp_put_md5sig_pool();
1076 tcp_put_md5sig_pool();
1078 memset(md5_hash, 0, 16);
1082 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1083 const struct sock *sk,
1084 const struct sk_buff *skb)
1086 struct tcp_md5sig_pool *hp;
1087 struct hash_desc *desc;
1088 const struct tcphdr *th = tcp_hdr(skb);
1089 __be32 saddr, daddr;
1091 if (sk) { /* valid for established/request sockets */
1092 saddr = sk->sk_rcv_saddr;
1093 daddr = sk->sk_daddr;
1095 const struct iphdr *iph = ip_hdr(skb);
1100 hp = tcp_get_md5sig_pool();
1102 goto clear_hash_noput;
1103 desc = &hp->md5_desc;
1105 if (crypto_hash_init(desc))
1108 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1110 if (tcp_md5_hash_header(hp, th))
1112 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1114 if (tcp_md5_hash_key(hp, key))
1116 if (crypto_hash_final(desc, md5_hash))
1119 tcp_put_md5sig_pool();
1123 tcp_put_md5sig_pool();
1125 memset(md5_hash, 0, 16);
1128 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1132 /* Called with rcu_read_lock() */
1133 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1134 const struct sk_buff *skb)
1136 #ifdef CONFIG_TCP_MD5SIG
1138 * This gets called for each TCP segment that arrives
1139 * so we want to be efficient.
1140 * We have 3 drop cases:
1141 * o No MD5 hash and one expected.
1142 * o MD5 hash and we're not expecting one.
1143 * o MD5 hash and it's wrong.
1145 const __u8 *hash_location = NULL;
1146 struct tcp_md5sig_key *hash_expected;
1147 const struct iphdr *iph = ip_hdr(skb);
1148 const struct tcphdr *th = tcp_hdr(skb);
1150 unsigned char newhash[16];
1152 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1154 hash_location = tcp_parse_md5sig_option(th);
1156 /* We've parsed the options - do we have a hash? */
1157 if (!hash_expected && !hash_location)
1160 if (hash_expected && !hash_location) {
1161 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1165 if (!hash_expected && hash_location) {
1166 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1170 /* Okay, so this is hash_expected and hash_location -
1171 * so we need to calculate the checksum.
1173 genhash = tcp_v4_md5_hash_skb(newhash,
1177 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1178 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1179 &iph->saddr, ntohs(th->source),
1180 &iph->daddr, ntohs(th->dest),
1181 genhash ? " tcp_v4_calc_md5_hash failed"
1190 static void tcp_v4_init_req(struct request_sock *req,
1191 const struct sock *sk_listener,
1192 struct sk_buff *skb)
1194 struct inet_request_sock *ireq = inet_rsk(req);
1196 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1197 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1198 ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1199 ireq->opt = tcp_v4_save_options(skb);
1202 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1204 const struct request_sock *req,
1207 struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1210 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1219 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1221 .obj_size = sizeof(struct tcp_request_sock),
1222 .rtx_syn_ack = tcp_rtx_synack,
1223 .send_ack = tcp_v4_reqsk_send_ack,
1224 .destructor = tcp_v4_reqsk_destructor,
1225 .send_reset = tcp_v4_send_reset,
1226 .syn_ack_timeout = tcp_syn_ack_timeout,
1229 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1230 .mss_clamp = TCP_MSS_DEFAULT,
1231 #ifdef CONFIG_TCP_MD5SIG
1232 .req_md5_lookup = tcp_v4_md5_lookup,
1233 .calc_md5_hash = tcp_v4_md5_hash_skb,
1235 .init_req = tcp_v4_init_req,
1236 #ifdef CONFIG_SYN_COOKIES
1237 .cookie_init_seq = cookie_v4_init_sequence,
1239 .route_req = tcp_v4_route_req,
1240 .init_seq = tcp_v4_init_sequence,
1241 .send_synack = tcp_v4_send_synack,
1244 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1246 /* Never answer SYNs sent to broadcast or multicast */
1247 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1250 return tcp_conn_request(&tcp_request_sock_ops,
1251 &tcp_request_sock_ipv4_ops, sk, skb);
1254 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1257 EXPORT_SYMBOL(tcp_v4_conn_request);
1261 * The three-way handshake has completed - we got a valid ACK -
1262 * now create the new socket.
1264 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1265 struct request_sock *req,
1266 struct dst_entry *dst,
1267 struct request_sock *req_unhash,
1270 struct inet_request_sock *ireq;
1271 struct inet_sock *newinet;
1272 struct tcp_sock *newtp;
1274 #ifdef CONFIG_TCP_MD5SIG
1275 struct tcp_md5sig_key *key;
1277 struct ip_options_rcu *inet_opt;
1279 if (sk_acceptq_is_full(sk))
1282 newsk = tcp_create_openreq_child(sk, req, skb);
1286 newsk->sk_gso_type = SKB_GSO_TCPV4;
1287 inet_sk_rx_dst_set(newsk, skb);
1289 newtp = tcp_sk(newsk);
1290 newinet = inet_sk(newsk);
1291 ireq = inet_rsk(req);
1292 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1293 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1294 newinet->inet_saddr = ireq->ir_loc_addr;
1295 inet_opt = ireq->opt;
1296 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1298 newinet->mc_index = inet_iif(skb);
1299 newinet->mc_ttl = ip_hdr(skb)->ttl;
1300 newinet->rcv_tos = ip_hdr(skb)->tos;
1301 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1303 if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1304 newinet->inet_id = newtp->write_seq ^ jiffies;
1307 dst = inet_csk_route_child_sock(sk, newsk, req);
1311 /* syncookie case : see end of cookie_v4_check() */
1313 sk_setup_caps(newsk, dst);
1315 tcp_ca_openreq_child(newsk, dst);
1317 tcp_sync_mss(newsk, dst_mtu(dst));
1318 newtp->advmss = dst_metric_advmss(dst);
1319 if (tcp_sk(sk)->rx_opt.user_mss &&
1320 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1321 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1323 tcp_initialize_rcv_mss(newsk);
1325 #ifdef CONFIG_TCP_MD5SIG
1326 /* Copy over the MD5 key from the original socket */
1327 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1331 * We're using one, so create a matching key
1332 * on the newsk structure. If we fail to get
1333 * memory, then we end up not copying the key across. Shucks.
1336 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1337 AF_INET, key->key, key->keylen, GFP_ATOMIC);
1338 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1342 if (__inet_inherit_port(sk, newsk) < 0)
1344 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1346 tcp_move_syn(newtp, req);
1351 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1355 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1358 inet_csk_prepare_forced_close(newsk);
1362 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1364 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1366 #ifdef CONFIG_SYN_COOKIES
1367 const struct tcphdr *th = tcp_hdr(skb);
1370 sk = cookie_v4_check(sk, skb);
1375 /* The socket must have its spinlock held when we get
1376 * here, unless it is a TCP_LISTEN socket.
1378 * We have a potential double-lock case here, so even when
1379 * doing backlog processing we use the BH locking scheme.
1380 * This is because we cannot sleep with the original spinlock held.
1383 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1387 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1388 struct dst_entry *dst = sk->sk_rx_dst;
1390 sock_rps_save_rxhash(sk, skb);
1391 sk_mark_napi_id(sk, skb);
1393 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1394 !dst->ops->check(dst, 0)) {
1396 sk->sk_rx_dst = NULL;
1399 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1403 if (tcp_checksum_complete(skb))
1406 if (sk->sk_state == TCP_LISTEN) {
1407 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1412 sock_rps_save_rxhash(nsk, skb);
1413 sk_mark_napi_id(nsk, skb);
1414 if (tcp_child_process(sk, nsk, skb)) {
1421 sock_rps_save_rxhash(sk, skb);
1423 if (tcp_rcv_state_process(sk, skb)) {
1430 tcp_v4_send_reset(rsk, skb);
1433 /* Be careful here. If this function gets more complicated and
1434 * gcc suffers from register pressure on the x86, sk (in %ebx)
1435 * might be destroyed here. This current version compiles correctly,
1436 * but you have been warned.
1441 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1442 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1445 EXPORT_SYMBOL(tcp_v4_do_rcv);
1447 void tcp_v4_early_demux(struct sk_buff *skb)
1449 const struct iphdr *iph;
1450 const struct tcphdr *th;
1453 if (skb->pkt_type != PACKET_HOST)
1456 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1462 if (th->doff < sizeof(struct tcphdr) / 4)
1465 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1466 iph->saddr, th->source,
1467 iph->daddr, ntohs(th->dest),
1471 skb->destructor = sock_edemux;
1472 if (sk_fullsock(sk)) {
1473 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1475 if (dst)
1476 dst = dst_check(dst, 0);
1477 if (dst && inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1479 skb_dst_set_noref(skb, dst);
1484 /* Packet is added to VJ-style prequeue for processing in process
1485 * context, if a reader task is waiting. Apparently, this exciting
1486 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1487 * failed somewhere. Latency? Burstiness? Well, at least now we will
1488 * see why it failed. 8)8) --ANK
1491 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1493 struct tcp_sock *tp = tcp_sk(sk);
1495 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1498 if (skb->len <= tcp_hdrlen(skb) &&
1499 skb_queue_len(&tp->ucopy.prequeue) == 0)
1502 /* Before escaping RCU protected region, we need to take care of skb
1503 * dst. Prequeue is only enabled for established sockets.
1504 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1505 * Instead of doing a full sk_rx_dst validity check here, let's perform
1506 * an optimistic check.
1508 if (likely(sk->sk_rx_dst))
1511 skb_dst_force_safe(skb);
1513 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1514 tp->ucopy.memory += skb->truesize;
1515 if (tp->ucopy.memory > sk->sk_rcvbuf) {
1516 struct sk_buff *skb1;
1518 BUG_ON(sock_owned_by_user(sk));
1520 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1521 sk_backlog_rcv(sk, skb1);
1522 NET_INC_STATS_BH(sock_net(sk),
1523 LINUX_MIB_TCPPREQUEUEDROPPED);
1526 tp->ucopy.memory = 0;
1527 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1528 wake_up_interruptible_sync_poll(sk_sleep(sk),
1529 POLLIN | POLLRDNORM | POLLRDBAND);
1530 if (!inet_csk_ack_scheduled(sk))
1531 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1532 (3 * tcp_rto_min(sk)) / 4,
1537 EXPORT_SYMBOL(tcp_prequeue);
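/* Note: setting the sysctl net.ipv4.tcp_low_latency to 1, or having no
 * reader task in tp->ucopy, disables the prequeue above entirely and keeps
 * all receive processing in softirq context.
 */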
1539 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1541 struct tcphdr *th = (struct tcphdr *)skb->data;
1542 unsigned int eaten = skb->len;
1545 err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1548 TCP_SKB_CB(skb)->end_seq -= eaten;
1552 EXPORT_SYMBOL(tcp_filter);
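/* tcp_filter() runs the socket's attached socket filter, if any, via
 * sk_filter_trim_cap(), which may trim the skb down to the TCP header;
 * the end_seq adjustment above keeps sequence accounting consistent with
 * whatever payload survived the trim.
 */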
1558 int tcp_v4_rcv(struct sk_buff *skb)
1560 const struct iphdr *iph;
1561 const struct tcphdr *th;
1564 struct net *net = dev_net(skb->dev);
1566 if (skb->pkt_type != PACKET_HOST)
1569 /* Count it even if it's bad */
1570 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1572 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1577 if (th->doff < sizeof(struct tcphdr) / 4)
1579 if (!pskb_may_pull(skb, th->doff * 4))
1582 /* An explanation is required here, I think.
1583 * Packet length and doff are validated by header prediction,
1584 * provided the case of th->doff == 0 is eliminated.
1585 * So, we defer the checks. */
1587 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1592 /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB();
1593 * barrier() makes sure the compiler won't play aliasing games.
1595 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1596 sizeof(struct inet_skb_parm));
1599 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1600 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1601 skb->len - th->doff * 4);
1602 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1603 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1604 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1605 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1606 TCP_SKB_CB(skb)->sacked = 0;
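	/* Example: a segment carrying SYN plus 10 payload bytes occupies
	 * eleven sequence numbers, so end_seq == seq + 1 + 10; SYN and FIN
	 * each consume one sequence number of their own.
	 */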
1609 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1614 if (sk->sk_state == TCP_TIME_WAIT)
1617 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1618 struct request_sock *req = inet_reqsk(sk);
1621 sk = req->rsk_listener;
1622 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1626 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1627 inet_csk_reqsk_queue_drop_and_put(sk, req);
1631 nsk = tcp_check_req(sk, skb, req, false);
1634 goto discard_and_relse;
1638 } else if (tcp_child_process(sk, nsk, skb)) {
1639 tcp_v4_send_reset(nsk, skb);
1640 goto discard_and_relse;
1646 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1647 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1648 goto discard_and_relse;
1651 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1652 goto discard_and_relse;
1654 if (tcp_v4_inbound_md5_hash(sk, skb))
1655 goto discard_and_relse;
1659 if (tcp_filter(sk, skb))
1660 goto discard_and_relse;
1661 th = (const struct tcphdr *)skb->data;
1666 if (sk->sk_state == TCP_LISTEN) {
1667 ret = tcp_v4_do_rcv(sk, skb);
1668 goto put_and_return;
1671 sk_incoming_cpu_update(sk);
1673 bh_lock_sock_nested(sk);
1674 tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1676 if (!sock_owned_by_user(sk)) {
1677 if (!tcp_prequeue(sk, skb))
1678 ret = tcp_v4_do_rcv(sk, skb);
1679 } else if (unlikely(sk_add_backlog(sk, skb,
1680 sk->sk_rcvbuf + sk->sk_sndbuf))) {
1682 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1683 goto discard_and_relse;
1693 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1696 if (tcp_checksum_complete(skb)) {
1698 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1700 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1702 tcp_v4_send_reset(NULL, skb);
1706 /* Discard frame. */
1715 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1716 inet_twsk_put(inet_twsk(sk));
1720 if (tcp_checksum_complete(skb)) {
1721 inet_twsk_put(inet_twsk(sk));
1724 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1726 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
1728 iph->saddr, th->source,
1729 iph->daddr, th->dest, inet_iif(skb));
1732 inet_twsk_deschedule_put(inet_twsk(sk));
1736 /* Fall through to ACK */
1739 tcp_v4_timewait_ack(sk, skb);
1743 case TCP_TW_SUCCESS:;
1748 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1749 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1750 .twsk_unique = tcp_twsk_unique,
1751 .twsk_destructor= tcp_twsk_destructor,
1754 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1756 struct dst_entry *dst = skb_dst(skb);
1758 if (dst && dst_hold_safe(dst)) {
1759 sk->sk_rx_dst = dst;
1760 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1763 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1765 const struct inet_connection_sock_af_ops ipv4_specific = {
1766 .queue_xmit = ip_queue_xmit,
1767 .send_check = tcp_v4_send_check,
1768 .rebuild_header = inet_sk_rebuild_header,
1769 .sk_rx_dst_set = inet_sk_rx_dst_set,
1770 .conn_request = tcp_v4_conn_request,
1771 .syn_recv_sock = tcp_v4_syn_recv_sock,
1772 .net_header_len = sizeof(struct iphdr),
1773 .setsockopt = ip_setsockopt,
1774 .getsockopt = ip_getsockopt,
1775 .addr2sockaddr = inet_csk_addr2sockaddr,
1776 .sockaddr_len = sizeof(struct sockaddr_in),
1777 .bind_conflict = inet_csk_bind_conflict,
1778 #ifdef CONFIG_COMPAT
1779 .compat_setsockopt = compat_ip_setsockopt,
1780 .compat_getsockopt = compat_ip_getsockopt,
1782 .mtu_reduced = tcp_v4_mtu_reduced,
1784 EXPORT_SYMBOL(ipv4_specific);
1786 #ifdef CONFIG_TCP_MD5SIG
1787 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1788 .md5_lookup = tcp_v4_md5_lookup,
1789 .calc_md5_hash = tcp_v4_md5_hash_skb,
1790 .md5_parse = tcp_v4_parse_md5_keys,
1794 /* NOTE: A lot of things set to zero explicitly by call to
1795 * sk_alloc() so need not be done here.
1797 static int tcp_v4_init_sock(struct sock *sk)
1799 struct inet_connection_sock *icsk = inet_csk(sk);
1803 icsk->icsk_af_ops = &ipv4_specific;
1805 #ifdef CONFIG_TCP_MD5SIG
1806 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1812 void tcp_v4_destroy_sock(struct sock *sk)
1814 struct tcp_sock *tp = tcp_sk(sk);
1816 tcp_clear_xmit_timers(sk);
1818 tcp_cleanup_congestion_control(sk);
1820 /* Clean up the write buffer. */
1821 tcp_write_queue_purge(sk);
1823 /* Cleans up our, hopefully empty, out_of_order_queue. */
1824 __skb_queue_purge(&tp->out_of_order_queue);
1826 #ifdef CONFIG_TCP_MD5SIG
1827 /* Clean up the MD5 key list, if any */
1828 if (tp->md5sig_info) {
1829 tcp_clear_md5_list(sk);
1830 kfree_rcu(tp->md5sig_info, rcu);
1831 tp->md5sig_info = NULL;
1835 /* Clean the prequeue; it really must be empty */
1836 __skb_queue_purge(&tp->ucopy.prequeue);
1838 /* Clean up a referenced TCP bind bucket. */
1839 if (inet_csk(sk)->icsk_bind_hash)
1842 BUG_ON(tp->fastopen_rsk);
1844 /* If socket is aborted during connect operation */
1845 tcp_free_fastopen_req(tp);
1846 tcp_saved_syn_free(tp);
1848 sk_sockets_allocated_dec(sk);
1849 sock_release_memcg(sk);
1851 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1853 #ifdef CONFIG_PROC_FS
1854 /* Proc filesystem TCP sock list dumping. */
1857 * Get the next listener socket following cur. If cur is NULL, get the first socket
1858 * starting from the bucket given in st->bucket; when st->bucket is zero the
1859 * very first socket in the hash table is returned.
1861 static void *listening_get_next(struct seq_file *seq, void *cur)
1863 struct inet_connection_sock *icsk;
1864 struct hlist_nulls_node *node;
1865 struct sock *sk = cur;
1866 struct inet_listen_hashbucket *ilb;
1867 struct tcp_iter_state *st = seq->private;
1868 struct net *net = seq_file_net(seq);
1871 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1872 spin_lock_bh(&ilb->lock);
1873 sk = sk_nulls_head(&ilb->head);
1877 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1881 sk = sk_nulls_next(sk);
1883 sk_nulls_for_each_from(sk, node) {
1884 if (!net_eq(sock_net(sk), net))
1886 if (sk->sk_family == st->family) {
1890 icsk = inet_csk(sk);
1892 spin_unlock_bh(&ilb->lock);
1894 if (++st->bucket < INET_LHTABLE_SIZE) {
1895 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1896 spin_lock_bh(&ilb->lock);
1897 sk = sk_nulls_head(&ilb->head);
1905 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1907 struct tcp_iter_state *st = seq->private;
1912 rc = listening_get_next(seq, NULL);
1914 while (rc && *pos) {
1915 rc = listening_get_next(seq, rc);
1921 static inline bool empty_bucket(const struct tcp_iter_state *st)
1923 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1927 * Get first established socket starting from bucket given in st->bucket.
1928 * If st->bucket is zero, the very first socket in the hash is returned.
1930 static void *established_get_first(struct seq_file *seq)
1932 struct tcp_iter_state *st = seq->private;
1933 struct net *net = seq_file_net(seq);
1937 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1939 struct hlist_nulls_node *node;
1940 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1942 /* Lockless fast path for the common case of empty buckets */
1943 if (empty_bucket(st))
1947 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1948 if (sk->sk_family != st->family ||
1949 !net_eq(sock_net(sk), net)) {
1955 spin_unlock_bh(lock);
1961 static void *established_get_next(struct seq_file *seq, void *cur)
1963 struct sock *sk = cur;
1964 struct hlist_nulls_node *node;
1965 struct tcp_iter_state *st = seq->private;
1966 struct net *net = seq_file_net(seq);
1971 sk = sk_nulls_next(sk);
1973 sk_nulls_for_each_from(sk, node) {
1974 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1978 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1980 return established_get_first(seq);
1983 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1985 struct tcp_iter_state *st = seq->private;
1989 rc = established_get_first(seq);
1992 rc = established_get_next(seq, rc);
1998 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2001 struct tcp_iter_state *st = seq->private;
2003 st->state = TCP_SEQ_STATE_LISTENING;
2004 rc = listening_get_idx(seq, &pos);
2007 st->state = TCP_SEQ_STATE_ESTABLISHED;
2008 rc = established_get_idx(seq, pos);
2014 static void *tcp_seek_last_pos(struct seq_file *seq)
2016 struct tcp_iter_state *st = seq->private;
2017 int offset = st->offset;
2018 int orig_num = st->num;
2021 switch (st->state) {
2022 case TCP_SEQ_STATE_LISTENING:
2023 if (st->bucket >= INET_LHTABLE_SIZE)
2025 st->state = TCP_SEQ_STATE_LISTENING;
2026 rc = listening_get_next(seq, NULL);
2027 while (offset-- && rc)
2028 rc = listening_get_next(seq, rc);
2032 st->state = TCP_SEQ_STATE_ESTABLISHED;
2034 case TCP_SEQ_STATE_ESTABLISHED:
2035 if (st->bucket > tcp_hashinfo.ehash_mask)
2037 rc = established_get_first(seq);
2038 while (offset-- && rc)
2039 rc = established_get_next(seq, rc);
2047 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2049 struct tcp_iter_state *st = seq->private;
2052 if (*pos && *pos == st->last_pos) {
2053 rc = tcp_seek_last_pos(seq);
2058 st->state = TCP_SEQ_STATE_LISTENING;
2062 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2065 st->last_pos = *pos;
2069 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2071 struct tcp_iter_state *st = seq->private;
2074 if (v == SEQ_START_TOKEN) {
2075 rc = tcp_get_idx(seq, 0);
2079 switch (st->state) {
2080 case TCP_SEQ_STATE_LISTENING:
2081 rc = listening_get_next(seq, v);
2083 st->state = TCP_SEQ_STATE_ESTABLISHED;
2086 rc = established_get_first(seq);
2089 case TCP_SEQ_STATE_ESTABLISHED:
2090 rc = established_get_next(seq, v);
2095 st->last_pos = *pos;
2099 static void tcp_seq_stop(struct seq_file *seq, void *v)
2101 struct tcp_iter_state *st = seq->private;
2103 switch (st->state) {
2104 case TCP_SEQ_STATE_LISTENING:
2105 if (v != SEQ_START_TOKEN)
2106 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2108 case TCP_SEQ_STATE_ESTABLISHED:
2110 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2115 int tcp_seq_open(struct inode *inode, struct file *file)
2117 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2118 struct tcp_iter_state *s;
2121 err = seq_open_net(inode, file, &afinfo->seq_ops,
2122 sizeof(struct tcp_iter_state));
2126 s = ((struct seq_file *)file->private_data)->private;
2127 s->family = afinfo->family;
2131 EXPORT_SYMBOL(tcp_seq_open);
2133 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2136 struct proc_dir_entry *p;
2138 afinfo->seq_ops.start = tcp_seq_start;
2139 afinfo->seq_ops.next = tcp_seq_next;
2140 afinfo->seq_ops.stop = tcp_seq_stop;
2142 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2143 afinfo->seq_fops, afinfo);
2148 EXPORT_SYMBOL(tcp_proc_register);
2150 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2152 remove_proc_entry(afinfo->name, net->proc_net);
2154 EXPORT_SYMBOL(tcp_proc_unregister);
2156 static void get_openreq4(const struct request_sock *req,
2157 struct seq_file *f, int i)
2159 const struct inet_request_sock *ireq = inet_rsk(req);
2160 long delta = req->rsk_timer.expires - jiffies;
2162 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2163 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2168 ntohs(ireq->ir_rmt_port),
2170 0, 0, /* could print option size, but that is af dependent. */
2171 1, /* timers active (only the expire timer) */
2172 jiffies_delta_to_clock_t(delta),
2174 from_kuid_munged(seq_user_ns(f),
2175 sock_i_uid(req->rsk_listener)),
2176 0, /* non standard timer */
2177 0, /* open_requests have no inode */
2182 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2185 unsigned long timer_expires;
2186 const struct tcp_sock *tp = tcp_sk(sk);
2187 const struct inet_connection_sock *icsk = inet_csk(sk);
2188 const struct inet_sock *inet = inet_sk(sk);
2189 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2190 __be32 dest = inet->inet_daddr;
2191 __be32 src = inet->inet_rcv_saddr;
2192 __u16 destp = ntohs(inet->inet_dport);
2193 __u16 srcp = ntohs(inet->inet_sport);
2197 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2198 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2199 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2201 timer_expires = icsk->icsk_timeout;
2202 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2204 timer_expires = icsk->icsk_timeout;
2205 } else if (timer_pending(&sk->sk_timer)) {
2207 timer_expires = sk->sk_timer.expires;
2210 timer_expires = jiffies;
2213 state = sk_state_load(sk);
2214 if (state == TCP_LISTEN)
2215 rx_queue = sk->sk_ack_backlog;
2217 /* Because we don't lock the socket,
2218 * we might find a transient negative value.
2220 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2222 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2223 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2224 i, src, srcp, dest, destp, state,
2225 tp->write_seq - tp->snd_una,
2228 jiffies_delta_to_clock_t(timer_expires - jiffies),
2229 icsk->icsk_retransmits,
2230 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2231 icsk->icsk_probes_out,
2233 atomic_read(&sk->sk_refcnt), sk,
2234 jiffies_to_clock_t(icsk->icsk_rto),
2235 jiffies_to_clock_t(icsk->icsk_ack.ato),
2236 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2238 state == TCP_LISTEN ?
2239 fastopenq->max_qlen :
2240 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2243 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2244 struct seq_file *f, int i)
2246 long delta = tw->tw_timer.expires - jiffies;
2250 dest = tw->tw_daddr;
2251 src = tw->tw_rcv_saddr;
2252 destp = ntohs(tw->tw_dport);
2253 srcp = ntohs(tw->tw_sport);
2255 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2256 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2257 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2258 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2259 atomic_read(&tw->tw_refcnt), tw);
2264 static int tcp4_seq_show(struct seq_file *seq, void *v)
2266 struct tcp_iter_state *st;
2267 struct sock *sk = v;
2269 seq_setwidth(seq, TMPSZ - 1);
2270 if (v == SEQ_START_TOKEN) {
2271 seq_puts(seq, " sl local_address rem_address st tx_queue "
2272 "rx_queue tr tm->when retrnsmt uid timeout "
2278 if (sk->sk_state == TCP_TIME_WAIT)
2279 get_timewait4_sock(v, seq, st->num);
2280 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2281 get_openreq4(v, seq, st->num);
2283 get_tcp4_sock(v, seq, st->num);
2289 static const struct file_operations tcp_afinfo_seq_fops = {
2290 .owner = THIS_MODULE,
2291 .open = tcp_seq_open,
2293 .llseek = seq_lseek,
2294 .release = seq_release_net
2297 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2300 .seq_fops = &tcp_afinfo_seq_fops,
2302 .show = tcp4_seq_show,
2306 static int __net_init tcp4_proc_init_net(struct net *net)
2308 return tcp_proc_register(net, &tcp4_seq_afinfo);
2311 static void __net_exit tcp4_proc_exit_net(struct net *net)
2313 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2316 static struct pernet_operations tcp4_net_ops = {
2317 .init = tcp4_proc_init_net,
2318 .exit = tcp4_proc_exit_net,
2321 int __init tcp4_proc_init(void)
2323 return register_pernet_subsys(&tcp4_net_ops);
2326 void tcp4_proc_exit(void)
2328 unregister_pernet_subsys(&tcp4_net_ops);
2330 #endif /* CONFIG_PROC_FS */
2332 struct proto tcp_prot = {
2334 .owner = THIS_MODULE,
2336 .connect = tcp_v4_connect,
2337 .disconnect = tcp_disconnect,
2338 .accept = inet_csk_accept,
2340 .init = tcp_v4_init_sock,
2341 .destroy = tcp_v4_destroy_sock,
2342 .shutdown = tcp_shutdown,
2343 .setsockopt = tcp_setsockopt,
2344 .getsockopt = tcp_getsockopt,
2345 .recvmsg = tcp_recvmsg,
2346 .sendmsg = tcp_sendmsg,
2347 .sendpage = tcp_sendpage,
2348 .backlog_rcv = tcp_v4_do_rcv,
2349 .release_cb = tcp_release_cb,
2351 .unhash = inet_unhash,
2352 .get_port = inet_csk_get_port,
2353 .enter_memory_pressure = tcp_enter_memory_pressure,
2354 .stream_memory_free = tcp_stream_memory_free,
2355 .sockets_allocated = &tcp_sockets_allocated,
2356 .orphan_count = &tcp_orphan_count,
2357 .memory_allocated = &tcp_memory_allocated,
2358 .memory_pressure = &tcp_memory_pressure,
2359 .sysctl_mem = sysctl_tcp_mem,
2360 .sysctl_wmem = sysctl_tcp_wmem,
2361 .sysctl_rmem = sysctl_tcp_rmem,
2362 .max_header = MAX_TCP_HEADER,
2363 .obj_size = sizeof(struct tcp_sock),
2364 .slab_flags = SLAB_DESTROY_BY_RCU,
2365 .twsk_prot = &tcp_timewait_sock_ops,
2366 .rsk_prot = &tcp_request_sock_ops,
2367 .h.hashinfo = &tcp_hashinfo,
2368 .no_autobind = true,
2369 #ifdef CONFIG_COMPAT
2370 .compat_setsockopt = compat_tcp_setsockopt,
2371 .compat_getsockopt = compat_tcp_getsockopt,
2373 #ifdef CONFIG_MEMCG_KMEM
2374 .init_cgroup = tcp_init_cgroup,
2375 .destroy_cgroup = tcp_destroy_cgroup,
2376 .proto_cgroup = tcp_proto_cgroup,
2378 .diag_destroy = tcp_abort,
2380 EXPORT_SYMBOL(tcp_prot);
2382 static void __net_exit tcp_sk_exit(struct net *net)
2386 for_each_possible_cpu(cpu)
2387 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2388 free_percpu(net->ipv4.tcp_sk);
2391 static int __net_init tcp_sk_init(struct net *net)
2395 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2396 if (!net->ipv4.tcp_sk)
2399 for_each_possible_cpu(cpu) {
2402 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2406 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2409 net->ipv4.sysctl_tcp_ecn = 2;
2410 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2412 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2413 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2414 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2423 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2425 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2428 static struct pernet_operations __net_initdata tcp_sk_ops = {
2429 .init = tcp_sk_init,
2430 .exit = tcp_sk_exit,
2431 .exit_batch = tcp_sk_exit_batch,
2434 void __init tcp_v4_init(void)
2436 inet_hashinfo_init(&tcp_hashinfo);
2437 if (register_pernet_subsys(&tcp_sk_ops))
2438 panic("Failed to create the TCP control socket.\n");