2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after a year-long coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
53 #define pr_fmt(fmt) "TCP: " fmt
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
66 #include <net/net_namespace.h>
68 #include <net/inet_hashtables.h>
70 #include <net/transp_v6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94 __be32 daddr, __be32 saddr, const struct tcphdr *th);
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
100 static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 return secure_tcp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr,
105 tcp_hdr(skb)->dest, tcp_hdr(skb)->source);
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
113 /* With PAWS, it is safe from the viewpoint
114 of data integrity. Even without PAWS it is safe provided sequence
115 spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
117 Actually, the idea is close to VJ's one, only the timestamp cache is
118 held not per host, but per port pair, and the TW bucket is used as the state holder.
121 If the TW bucket has already been destroyed we fall back to VJ's scheme
122 and use the initial timestamp retrieved from the peer table.
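In practice (a sketch of the rule implemented below): with
net.ipv4.tcp_tw_reuse enabled, an outgoing connect() may reuse a port
pair still in TIME_WAIT once more than one second has passed since the
last recorded timestamp, and write_seq is started beyond tw_snd_nxt so
the new sequence space cannot overlap the old one.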
124 if (tcptw->tw_ts_recent_stamp &&
125 (!twp || (sysctl_tcp_tw_reuse &&
126 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128 if (tp->write_seq == 0) tp->write_seq = 1;
130 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
131 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144 struct inet_sock *inet = inet_sk(sk);
145 struct tcp_sock *tp = tcp_sk(sk);
146 __be16 orig_sport, orig_dport;
147 __be32 daddr, nexthop;
151 struct ip_options_rcu *inet_opt;
153 if (addr_len < sizeof(struct sockaddr_in))
156 if (usin->sin_family != AF_INET)
157 return -EAFNOSUPPORT;
159 nexthop = daddr = usin->sin_addr.s_addr;
160 inet_opt = rcu_dereference_protected(inet->inet_opt,
161 sock_owned_by_user(sk));
162 if (inet_opt && inet_opt->opt.srr) {
165 nexthop = inet_opt->opt.faddr;
168 orig_sport = inet->inet_sport;
169 orig_dport = usin->sin_port;
170 fl4 = &inet->cork.fl.u.ip4;
171 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174 orig_sport, orig_dport, sk);
177 if (err == -ENETUNREACH)
178 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
182 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187 if (!inet_opt || !inet_opt->opt.srr)
190 if (!inet->inet_saddr)
191 inet->inet_saddr = fl4->saddr;
192 sk_rcv_saddr_set(sk, inet->inet_saddr);
194 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195 /* Reset inherited state */
196 tp->rx_opt.ts_recent = 0;
197 tp->rx_opt.ts_recent_stamp = 0;
198 if (likely(!tp->repair)) tp->write_seq = 0;
202 if (tcp_death_row.sysctl_tw_recycle &&
203 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204 tcp_fetch_timewait_stamp(sk, &rt->dst);
206 inet->inet_dport = usin->sin_port;
207 sk_daddr_set(sk, daddr);
209 inet_csk(sk)->icsk_ext_hdr_len = 0;
211 if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
213 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
215 /* Socket identity is still unknown (sport may be zero).
216 * However we set the state to SYN-SENT and, without releasing the socket
217 * lock, select a source port, enter ourselves into the hash tables and
218 * complete initialization after this.
220 tcp_set_state(sk, TCP_SYN_SENT);
221 err = inet_hash_connect(&tcp_death_row, sk);
227 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 inet->inet_sport, inet->inet_dport, sk);
234 /* OK, now commit destination to socket. */
235 sk->sk_gso_type = SKB_GSO_TCPV4;
236 sk_setup_caps(sk, &rt->dst);
238 if (!tp->write_seq && likely(!tp->repair))
239 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, inet->inet_daddr, inet->inet_sport, usin->sin_port);
244 inet->inet_id = tp->write_seq ^ jiffies;
246 err = tcp_connect(sk);
256 * This unhashes the socket and releases the local port, if necessary.
259 tcp_set_state(sk, TCP_CLOSE);
261 sk->sk_route_caps = 0;
262 inet->inet_dport = 0;
265 EXPORT_SYMBOL(tcp_v4_connect);
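/* Illustrative userspace sketch of the path above (example only, not part
 * of the kernel source; error handling omitted):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * connect() on a TCP socket reaches tcp_v4_connect() through
 * inet_stream_connect() and sk->sk_prot->connect, which is where the route
 * lookup, source port selection and SYN transmission above are driven from.
 */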
268 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269 * It can be called through tcp_release_cb() if socket was owned by user
270 * at the time tcp_v4_err() was called to handle ICMP message.
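 * In that case tcp_v4_err() only records tp->mtu_info and sets the
 * TCP_MTU_REDUCED_DEFERRED flag, and tcp_release_cb() calls back into this
 * function once the socket lock is released (see the deferral in
 * tcp_v4_err() below).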
272 void tcp_v4_mtu_reduced(struct sock *sk)
274 struct dst_entry *dst;
275 struct inet_sock *inet = inet_sk(sk);
276 u32 mtu = tcp_sk(sk)->mtu_info;
278 dst = inet_csk_update_pmtu(sk, mtu);
282 /* Something is about to go wrong... Remember the soft error
283 * for the case that this connection will not be able to recover.
285 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 sk->sk_err_soft = EMSGSIZE;
290 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 ip_sk_accept_pmtu(sk) &&
292 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293 tcp_sync_mss(sk, mtu);
295 /* Resend the TCP packet because it's
296 * clear that the old packet has been
297 * dropped. This is the new "fast" path mtu discovery.
300 tcp_simple_retransmit(sk);
301 } /* else let the usual retransmit timer handle it */
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
307 struct dst_entry *dst = __sk_dst_check(sk, 0);
310 if (dst) dst->ops->redirect(dst, sk, skb);
314 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
315 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
317 struct request_sock *req = inet_reqsk(sk);
318 struct net *net = sock_net(sk);
320 /* ICMPs are not backlogged, hence we cannot get
321 * an established socket here.
323 if (seq != tcp_rsk(req)->snt_isn) {
324 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
327 * Still in SYN_RECV, just remove it silently.
328 * There is no good way to pass the error to the newly
329 * created socket, and POSIX does not want network
330 * errors returned from accept().
332 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
333 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
337 EXPORT_SYMBOL(tcp_req_err);
340 * This routine is called by the ICMP module when it gets some
341 * sort of error condition. If err < 0 then the socket should
342 * be closed and the error returned to the user. If err > 0
343 * it's just the icmp type << 8 | icmp code. After adjustment
344 * header points to the first 8 bytes of the tcp header. We need
345 * to find the appropriate port.
347 * The locking strategy used here is very "optimistic". When
348 * someone else accesses the socket the ICMP is just dropped
349 * and for some paths there is no check at all.
350 * A more general error queue to queue errors for later handling
351 * is probably better.
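 * For example, a port-unreachable ICMP error (type ICMP_DEST_UNREACH == 3,
 * code ICMP_PORT_UNREACH == 3) arrives here with the error encoded as
 * (3 << 8) | 3, and icmp_err_convert[ICMP_PORT_UNREACH].errno maps it to
 * ECONNREFUSED, a hard error for a connecting socket.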
355 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
358 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
359 struct inet_connection_sock *icsk;
361 struct inet_sock *inet;
362 const int type = icmp_hdr(icmp_skb)->type;
363 const int code = icmp_hdr(icmp_skb)->code;
366 struct request_sock *fastopen;
370 struct net *net = dev_net(icmp_skb->dev);
372 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
373 th->dest, iph->saddr, ntohs(th->source),
376 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
379 if (sk->sk_state == TCP_TIME_WAIT) {
380 inet_twsk_put(inet_twsk(sk));
383 seq = ntohl(th->seq);
384 if (sk->sk_state == TCP_NEW_SYN_RECV)
385 return tcp_req_err(sk, seq,
386 type == ICMP_PARAMETERPROB ||
387 type == ICMP_TIME_EXCEEDED ||
388 (type == ICMP_DEST_UNREACH &&
389 (code == ICMP_NET_UNREACH ||
390 code == ICMP_HOST_UNREACH)));
393 /* If too many ICMPs get dropped on busy
394 * servers this needs to be solved differently.
395 * We do take care of the PMTU discovery (RFC 1191) special case:
396 * we can receive locally generated ICMP messages while the socket is held.
398 if (sock_owned_by_user(sk)) {
399 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
400 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
402 if (sk->sk_state == TCP_CLOSE)
405 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
406 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
412 /* XXX (TFO) - tp->snd_una should be ISN (see tcp_create_openreq_child()) */
413 fastopen = tp->fastopen_rsk;
414 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
415 if (sk->sk_state != TCP_LISTEN &&
416 !between(seq, snd_una, tp->snd_nxt)) {
417 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
423 do_redirect(icmp_skb, sk);
425 case ICMP_SOURCE_QUENCH:
426 /* Just silently ignore these. */
428 case ICMP_PARAMETERPROB:
431 case ICMP_DEST_UNREACH:
432 if (code > NR_ICMP_UNREACH)
435 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
436 /* We are not interested in TCP_LISTEN and open_requests
437 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
438 * they should go through unfragmented).
440 if (sk->sk_state == TCP_LISTEN)
444 if (!sock_owned_by_user(sk)) {
445 tcp_v4_mtu_reduced(sk);
447 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) sock_hold(sk);
453 err = icmp_err_convert[code].errno;
454 /* check if icmp_skb allows revert of backoff
455 * (see draft-zimmermann-tcp-lcd) */
456 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
458 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
459 !icsk->icsk_backoff || fastopen)
462 if (sock_owned_by_user(sk))
465 icsk->icsk_backoff--;
466 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
468 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
470 skb = tcp_write_queue_head(sk);
473 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
475 tcp_time_stamp - tcp_skb_timestamp(skb));
478 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
479 remaining, TCP_RTO_MAX);
481 /* The RTO revert clocked out the retransmission.
482 * Will retransmit now. */
483 tcp_retransmit_timer(sk);
487 case ICMP_TIME_EXCEEDED:
494 switch (sk->sk_state) {
497 /* Only in fast or simultaneous open. If a fast open socket
498 * is already accepted it is treated as a connected one below.
500 if (fastopen && !fastopen->sk)
503 if (!sock_owned_by_user(sk)) {
506 sk->sk_error_report(sk);
510 sk->sk_err_soft = err;
515 /* If we've already connected we will keep trying
516 * until we time out, or the user gives up.
518 * RFC 1122 4.2.3.9 allows us to consider as hard errors
519 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
520 * but it is obsoleted by pmtu discovery).
522 * Note that in the modern internet, where routing is unreliable
523 * and broken firewalls sit in each dark corner, sending random
524 * errors ordered by their masters, even these two messages finally lose
525 * their original sense (even Linux sends invalid PORT_UNREACHs).
527 * Now we are in compliance with RFCs.
532 if (!sock_owned_by_user(sk) && inet->recverr) {
534 sk->sk_error_report(sk);
535 } else { /* Only an error on timeout */
536 sk->sk_err_soft = err;
544 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
546 struct tcphdr *th = tcp_hdr(skb);
548 if (skb->ip_summed == CHECKSUM_PARTIAL) {
549 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
550 skb->csum_start = skb_transport_header(skb) - skb->head;
551 skb->csum_offset = offsetof(struct tcphdr, check);
553 th->check = tcp_v4_check(skb->len, saddr, daddr, csum_partial(th, th->doff << 2, skb->csum));
560 /* This routine computes an IPv4 TCP checksum. */
561 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
563 const struct inet_sock *inet = inet_sk(sk);
565 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
567 EXPORT_SYMBOL(tcp_v4_send_check);
570 * This routine will send an RST to the other tcp.
572 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
574 * Answer: if a packet caused an RST, it is not for a socket
575 * existing in our system; if it is matched to a socket,
576 * it is just a duplicate segment or a bug in the other side's TCP.
577 * So we build the reply based only on the parameters
578 * that arrived with the segment.
579 * Exception: precedence violation. We do not implement it in any case.
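 *
 * Per RFC 793 reset generation (a summary of the code below): if the
 * incoming segment carries an ACK, the RST takes its sequence number from
 * that ACK field and carries no ACK of its own; otherwise the RST has
 * seq == 0 and acks SEG.SEQ + SEG.LEN so the other end will accept it.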
582 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
584 const struct tcphdr *th = tcp_hdr(skb);
587 #ifdef CONFIG_TCP_MD5SIG
588 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
591 struct ip_reply_arg arg;
592 #ifdef CONFIG_TCP_MD5SIG
593 struct tcp_md5sig_key *key;
594 const __u8 *hash_location = NULL;
595 unsigned char newhash[16];
597 struct sock *sk1 = NULL;
601 /* Never send a reset in response to a reset. */
605 /* If sk is not NULL, it means we did a successful lookup and the incoming
606 * route had to be correct. prequeue might have dropped our dst.
608 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
611 /* Swap the send and the receive. */
612 memset(&rep, 0, sizeof(rep));
613 rep.th.dest = th->source;
614 rep.th.source = th->dest;
615 rep.th.doff = sizeof(struct tcphdr) / 4;
619 rep.th.seq = th->ack_seq;
622 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
623 skb->len - (th->doff << 2));
626 memset(&arg, 0, sizeof(arg));
627 arg.iov[0].iov_base = (unsigned char *)&rep;
628 arg.iov[0].iov_len = sizeof(rep.th);
630 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
631 #ifdef CONFIG_TCP_MD5SIG
632 hash_location = tcp_parse_md5sig_option(th);
633 if (!sk && hash_location) {
635 * active side is lost. Try to find the listening socket through the
636 * source port, and then find the md5 key through the listening socket.
637 * We do not lose security here:
638 * the incoming packet is checked with the md5 hash of the key we find;
639 * no RST is generated if the md5 hash doesn't match.
641 sk1 = __inet_lookup_listener(net,
642 &tcp_hashinfo, ip_hdr(skb)->saddr,
643 th->source, ip_hdr(skb)->daddr,
644 ntohs(th->source), inet_iif(skb));
645 /* don't send an rst if we can't find the key */
649 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
650 &ip_hdr(skb)->saddr, AF_INET);
654 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
655 if (genhash || memcmp(hash_location, newhash, 16) != 0)
658 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
664 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
666 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
668 /* Update length and the length the header thinks exists */
669 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
670 rep.th.doff = arg.iov[0].iov_len / 4;
672 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
673 key, ip_hdr(skb)->saddr,
674 ip_hdr(skb)->daddr, &rep.th);
677 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
678 ip_hdr(skb)->saddr, /* XXX */
679 arg.iov[0].iov_len, IPPROTO_TCP, 0);
680 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
681 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
682 /* When the socket is gone, all binding information is lost.
683 * Routing might fail in this case. No choice here: if we choose to force
684 * the input interface, we will misroute in case of an asymmetric route.
687 arg.bound_dev_if = sk->sk_bound_dev_if;
689 arg.tos = ip_hdr(skb)->tos;
690 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
691 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
692 skb, &TCP_SKB_CB(skb)->header.h4.opt,
693 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
694 &arg, arg.iov[0].iov_len);
696 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
697 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
699 #ifdef CONFIG_TCP_MD5SIG
708 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
709 outside of socket context, is certainly ugly. What can I do?
712 static void tcp_v4_send_ack(const struct sock *sk,
713 struct sk_buff *skb, u32 seq, u32 ack,
714 u32 win, u32 tsval, u32 tsecr, int oif,
715 struct tcp_md5sig_key *key,
716 int reply_flags, u8 tos)
718 const struct tcphdr *th = tcp_hdr(skb);
721 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
722 #ifdef CONFIG_TCP_MD5SIG
723 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 struct net *net = sock_net(sk);
728 struct ip_reply_arg arg;
730 memset(&rep.th, 0, sizeof(struct tcphdr));
731 memset(&arg, 0, sizeof(arg));
733 arg.iov[0].iov_base = (unsigned char *)&rep;
734 arg.iov[0].iov_len = sizeof(rep.th);
736 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
737 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
739 rep.opt[1] = htonl(tsval);
740 rep.opt[2] = htonl(tsecr);
741 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744 /* Swap the send and the receive. */
745 rep.th.dest = th->source;
746 rep.th.source = th->dest;
747 rep.th.doff = arg.iov[0].iov_len / 4;
748 rep.th.seq = htonl(seq);
749 rep.th.ack_seq = htonl(ack);
751 rep.th.window = htons(win);
753 #ifdef CONFIG_TCP_MD5SIG
755 int offset = (tsecr) ? 3 : 0;
757 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
759 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
761 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
762 rep.th.doff = arg.iov[0].iov_len/4;
764 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
765 key, ip_hdr(skb)->saddr,
766 ip_hdr(skb)->daddr, &rep.th);
769 arg.flags = reply_flags;
770 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
771 ip_hdr(skb)->saddr, /* XXX */
772 arg.iov[0].iov_len, IPPROTO_TCP, 0);
773 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
775 arg.bound_dev_if = oif;
777 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
778 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
779 skb, &TCP_SKB_CB(skb)->header.h4.opt,
780 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
781 &arg, arg.iov[0].iov_len);
783 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
786 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
788 struct inet_timewait_sock *tw = inet_twsk(sk);
789 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
791 tcp_v4_send_ack(sk, skb,
792 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
793 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
794 tcp_time_stamp + tcptw->tw_ts_offset,
797 tcp_twsk_md5_key(tcptw),
798 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
805 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
806 struct request_sock *req)
808 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
809 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
811 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_rsk(req)->snt_nxt;
815 * The window field (SEG.WND) of every outgoing segment, with the
816 * exception of <SYN> segments, MUST be right-shifted by
817 * Rcv.Wind.Shift bits:
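 *
 * For example, a 131072-byte receive window with rcv_wscale == 7 is
 * advertised on the wire as 131072 >> 7 == 1024.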
819 tcp_v4_send_ack(sk, skb, seq,
820 tcp_rsk(req)->rcv_nxt,
821 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
825 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
827 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
832 * Send a SYN-ACK after having received a SYN.
833 * This still operates on a request_sock only, not on a big socket.
836 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
838 struct request_sock *req,
839 struct tcp_fastopen_cookie *foc,
842 const struct inet_request_sock *ireq = inet_rsk(req);
847 /* First, grab a route. */
848 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
851 skb = tcp_make_synack(sk, dst, req, foc, attach_req);
854 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
856 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
859 err = net_xmit_eval(err);
866 * IPv4 request_sock destructor.
868 static void tcp_v4_reqsk_destructor(struct request_sock *req)
870 kfree(inet_rsk(req)->opt);
874 #ifdef CONFIG_TCP_MD5SIG
876 * RFC2385 MD5 checksumming requires a mapping of
877 * IP address->MD5 Key.
878 * We need to maintain these in the sk structure.
881 /* Find the Key structure for an address. */
882 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
883 const union tcp_md5_addr *addr,
886 const struct tcp_sock *tp = tcp_sk(sk);
887 struct tcp_md5sig_key *key;
888 unsigned int size = sizeof(struct in_addr);
889 const struct tcp_md5sig_info *md5sig;
891 /* caller either holds rcu_read_lock() or socket lock */
892 md5sig = rcu_dereference_check(tp->md5sig_info,
893 sock_owned_by_user(sk) ||
894 lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
897 #if IS_ENABLED(CONFIG_IPV6)
898 if (family == AF_INET6)
899 size = sizeof(struct in6_addr);
901 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
902 if (key->family != family)
904 if (!memcmp(&key->addr, addr, size))
909 EXPORT_SYMBOL(tcp_md5_do_lookup);
911 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
912 const struct sock *addr_sk)
914 const union tcp_md5_addr *addr;
916 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
917 return tcp_md5_do_lookup(sk, addr, AF_INET);
919 EXPORT_SYMBOL(tcp_v4_md5_lookup);
921 /* This can be called on a newly created socket, from other files */
922 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
923 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
925 /* Add Key to the list */
926 struct tcp_md5sig_key *key;
927 struct tcp_sock *tp = tcp_sk(sk);
928 struct tcp_md5sig_info *md5sig;
930 key = tcp_md5_do_lookup(sk, addr, family);
932 /* Pre-existing entry - just update that one. */
933 memcpy(key->key, newkey, newkeylen);
934 key->keylen = newkeylen;
938 md5sig = rcu_dereference_protected(tp->md5sig_info,
939 sock_owned_by_user(sk) ||
940 lockdep_is_held(&sk->sk_lock.slock));
942 md5sig = kmalloc(sizeof(*md5sig), gfp);
946 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
947 INIT_HLIST_HEAD(&md5sig->head);
948 rcu_assign_pointer(tp->md5sig_info, md5sig);
951 key = sock_kmalloc(sk, sizeof(*key), gfp);
954 if (!tcp_alloc_md5sig_pool()) {
955 sock_kfree_s(sk, key, sizeof(*key));
959 memcpy(key->key, newkey, newkeylen);
960 key->keylen = newkeylen;
961 key->family = family;
962 memcpy(&key->addr, addr,
963 (family == AF_INET6) ? sizeof(struct in6_addr) :
964 sizeof(struct in_addr));
965 hlist_add_head_rcu(&key->node, &md5sig->head);
968 EXPORT_SYMBOL(tcp_md5_do_add);
970 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
972 struct tcp_md5sig_key *key;
974 key = tcp_md5_do_lookup(sk, addr, family);
977 hlist_del_rcu(&key->node);
978 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
982 EXPORT_SYMBOL(tcp_md5_do_del);
984 static void tcp_clear_md5_list(struct sock *sk)
986 struct tcp_sock *tp = tcp_sk(sk);
987 struct tcp_md5sig_key *key;
988 struct hlist_node *n;
989 struct tcp_md5sig_info *md5sig;
991 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
993 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
994 hlist_del_rcu(&key->node);
995 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1000 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1003 struct tcp_md5sig cmd;
1004 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1006 if (optlen < sizeof(cmd))
1009 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1012 if (sin->sin_family != AF_INET)
1015 if (!cmd.tcpm_keylen)
1016 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1019 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1022 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1023 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
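/* Illustrative userspace usage of the TCP_MD5SIG option parsed above
 * (a sketch only, not part of this file; error handling omitted):
 *
 *	struct tcp_md5sig md5;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	memset(&md5, 0, sizeof(md5));
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * The same key must be configured on the peer for its segments to be
 * accepted (RFC 2385).
 */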
1027 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1028 __be32 daddr, __be32 saddr, int nbytes)
1030 struct tcp4_pseudohdr *bp;
1031 struct scatterlist sg;
1033 bp = &hp->md5_blk.ip4;
1036 * 1. the TCP pseudo-header (in the order: source IP address,
1037 * destination IP address, zero-padded protocol number, and segment length)
1043 bp->protocol = IPPROTO_TCP;
1044 bp->len = cpu_to_be16(nbytes);
1046 sg_init_one(&sg, bp, sizeof(*bp));
1047 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
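/* For reference, the IPv4 pseudo-header hashed above (struct tcp4_pseudohdr,
 * declared in <net/tcp.h>) lays out as:
 *
 *	__be32	saddr;		source IP address
 *	__be32	daddr;		destination IP address
 *	__u8	pad;		always zero
 *	__u8	protocol;	IPPROTO_TCP
 *	__be16	len;		TCP segment length
 */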
1050 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1051 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1053 struct tcp_md5sig_pool *hp;
1054 struct hash_desc *desc;
1056 hp = tcp_get_md5sig_pool();
1058 goto clear_hash_noput;
1059 desc = &hp->md5_desc;
1061 if (crypto_hash_init(desc))
1063 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1065 if (tcp_md5_hash_header(hp, th))
1067 if (tcp_md5_hash_key(hp, key))
1069 if (crypto_hash_final(desc, md5_hash))
1072 tcp_put_md5sig_pool();
1076 tcp_put_md5sig_pool();
1078 memset(md5_hash, 0, 16);
1082 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1083 const struct sock *sk,
1084 const struct sk_buff *skb)
1086 struct tcp_md5sig_pool *hp;
1087 struct hash_desc *desc;
1088 const struct tcphdr *th = tcp_hdr(skb);
1089 __be32 saddr, daddr;
1091 if (sk) { /* valid for established/request sockets */
1092 saddr = sk->sk_rcv_saddr;
1093 daddr = sk->sk_daddr;
1095 const struct iphdr *iph = ip_hdr(skb);
1100 hp = tcp_get_md5sig_pool();
1102 goto clear_hash_noput;
1103 desc = &hp->md5_desc;
1105 if (crypto_hash_init(desc))
1108 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1110 if (tcp_md5_hash_header(hp, th))
1112 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1114 if (tcp_md5_hash_key(hp, key))
1116 if (crypto_hash_final(desc, md5_hash))
1119 tcp_put_md5sig_pool();
1123 tcp_put_md5sig_pool();
1125 memset(md5_hash, 0, 16);
1128 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1132 /* Called with rcu_read_lock() */
1133 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1134 const struct sk_buff *skb)
1136 #ifdef CONFIG_TCP_MD5SIG
1138 * This gets called for each TCP segment that arrives
1139 * so we want to be efficient.
1140 * We have 3 drop cases:
1141 * o No MD5 hash and one expected.
1142 * o MD5 hash and we're not expecting one.
1143 * o MD5 hash and it's wrong.
1145 const __u8 *hash_location = NULL;
1146 struct tcp_md5sig_key *hash_expected;
1147 const struct iphdr *iph = ip_hdr(skb);
1148 const struct tcphdr *th = tcp_hdr(skb);
1150 unsigned char newhash[16];
1152 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1154 hash_location = tcp_parse_md5sig_option(th);
1156 /* We've parsed the options - do we have a hash? */
1157 if (!hash_expected && !hash_location)
1160 if (hash_expected && !hash_location) {
1161 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1165 if (!hash_expected && hash_location) {
1166 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1170 /* Okay, so this is hash_expected and hash_location -
1171 * so we need to calculate the checksum.
1173 genhash = tcp_v4_md5_hash_skb(newhash,
1177 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1178 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1179 &iph->saddr, ntohs(th->source),
1180 &iph->daddr, ntohs(th->dest),
1181 genhash ? " tcp_v4_calc_md5_hash failed"
1190 static void tcp_v4_init_req(struct request_sock *req,
1191 const struct sock *sk_listener,
1192 struct sk_buff *skb)
1194 struct inet_request_sock *ireq = inet_rsk(req);
1196 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1197 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1198 ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1199 ireq->opt = tcp_v4_save_options(skb);
1202 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1204 const struct request_sock *req,
1207 struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1210 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1219 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1221 .obj_size = sizeof(struct tcp_request_sock),
1222 .rtx_syn_ack = tcp_rtx_synack,
1223 .send_ack = tcp_v4_reqsk_send_ack,
1224 .destructor = tcp_v4_reqsk_destructor,
1225 .send_reset = tcp_v4_send_reset,
1226 .syn_ack_timeout = tcp_syn_ack_timeout,
1229 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1230 .mss_clamp = TCP_MSS_DEFAULT,
1231 #ifdef CONFIG_TCP_MD5SIG
1232 .req_md5_lookup = tcp_v4_md5_lookup,
1233 .calc_md5_hash = tcp_v4_md5_hash_skb,
1235 .init_req = tcp_v4_init_req,
1236 #ifdef CONFIG_SYN_COOKIES
1237 .cookie_init_seq = cookie_v4_init_sequence,
1239 .route_req = tcp_v4_route_req,
1240 .init_seq = tcp_v4_init_sequence,
1241 .send_synack = tcp_v4_send_synack,
1244 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1246 /* Never answer SYNs sent to broadcast or multicast */
1247 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1250 return tcp_conn_request(&tcp_request_sock_ops,
1251 &tcp_request_sock_ipv4_ops, sk, skb);
1254 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1257 EXPORT_SYMBOL(tcp_v4_conn_request);
1261 * The three-way handshake has completed - we got a valid ACK -
1262 * now create the new socket.
1264 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1265 struct request_sock *req,
1266 struct dst_entry *dst,
1267 struct request_sock *req_unhash,
1270 struct inet_request_sock *ireq;
1271 struct inet_sock *newinet;
1272 struct tcp_sock *newtp;
1274 #ifdef CONFIG_TCP_MD5SIG
1275 struct tcp_md5sig_key *key;
1277 struct ip_options_rcu *inet_opt;
1279 if (sk_acceptq_is_full(sk))
1282 newsk = tcp_create_openreq_child(sk, req, skb);
1286 newsk->sk_gso_type = SKB_GSO_TCPV4;
1287 inet_sk_rx_dst_set(newsk, skb);
1289 newtp = tcp_sk(newsk);
1290 newinet = inet_sk(newsk);
1291 ireq = inet_rsk(req);
1292 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1293 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1294 newinet->inet_saddr = ireq->ir_loc_addr;
1295 inet_opt = ireq->opt;
1296 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1298 newinet->mc_index = inet_iif(skb);
1299 newinet->mc_ttl = ip_hdr(skb)->ttl;
1300 newinet->rcv_tos = ip_hdr(skb)->tos;
1301 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1303 if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1304 newinet->inet_id = newtp->write_seq ^ jiffies;
1307 dst = inet_csk_route_child_sock(sk, newsk, req);
1311 /* syncookie case : see end of cookie_v4_check() */
1313 sk_setup_caps(newsk, dst);
1315 tcp_ca_openreq_child(newsk, dst);
1317 tcp_sync_mss(newsk, dst_mtu(dst));
1318 newtp->advmss = dst_metric_advmss(dst);
1319 if (tcp_sk(sk)->rx_opt.user_mss &&
1320 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1321 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1323 tcp_initialize_rcv_mss(newsk);
1325 #ifdef CONFIG_TCP_MD5SIG
1326 /* Copy over the MD5 key from the original socket */
1327 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1331 * We're using one, so create a matching key
1332 * on the newsk structure. If we fail to get
1333 * memory, then we end up not copying the key across. Shucks.
1336 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1337 AF_INET, key->key, key->keylen, GFP_ATOMIC);
1338 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1342 if (__inet_inherit_port(sk, newsk) < 0)
1344 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1346 tcp_move_syn(newtp, req);
1351 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1355 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1358 inet_csk_prepare_forced_close(newsk);
1362 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1364 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1366 #ifdef CONFIG_SYN_COOKIES
1367 const struct tcphdr *th = tcp_hdr(skb);
1370 sk = cookie_v4_check(sk, skb);
1375 /* The socket must have its spinlock held when we get
1376 * here, unless it is a TCP_LISTEN socket.
1378 * We have a potential double-lock case here, so even when
1379 * doing backlog processing we use the BH locking scheme.
1380 * This is because we cannot sleep with the original spinlock held.
1383 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1387 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1388 struct dst_entry *dst = sk->sk_rx_dst;
1390 sock_rps_save_rxhash(sk, skb);
1391 sk_mark_napi_id(sk, skb);
1393 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1394 !dst->ops->check(dst, 0)) {
1396 sk->sk_rx_dst = NULL;
1399 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1403 if (tcp_checksum_complete(skb))
1406 if (sk->sk_state == TCP_LISTEN) {
1407 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1412 sock_rps_save_rxhash(nsk, skb);
1413 sk_mark_napi_id(nsk, skb);
1414 if (tcp_child_process(sk, nsk, skb)) {
1421 sock_rps_save_rxhash(sk, skb);
1423 if (tcp_rcv_state_process(sk, skb)) {
1430 tcp_v4_send_reset(rsk, skb);
1433 /* Be careful here. If this function gets more complicated and
1434 * gcc suffers from register pressure on the x86, sk (in %ebx)
1435 * might be destroyed here. This current version compiles correctly,
1436 * but you have been warned.
1441 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1442 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1445 EXPORT_SYMBOL(tcp_v4_do_rcv);
1447 void tcp_v4_early_demux(struct sk_buff *skb)
1449 const struct iphdr *iph;
1450 const struct tcphdr *th;
1453 if (skb->pkt_type != PACKET_HOST)
1456 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1462 if (th->doff < sizeof(struct tcphdr) / 4)
1465 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1466 iph->saddr, th->source,
1467 iph->daddr, ntohs(th->dest),
1471 skb->destructor = sock_edemux;
1472 if (sk_fullsock(sk)) {
1473 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1475 if (dst)
1476 dst = dst_check(dst, 0);
1477 if (dst && inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1479 skb_dst_set_noref(skb, dst);
1484 /* Packet is added to VJ-style prequeue for processing in process
1485 * context, if a reader task is waiting. Apparently, this exciting
1486 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1487 * failed somewhere. Latency? Burstiness? Well, at least now we will
1488 * see why it failed. 8)8) --ANK
1491 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1493 struct tcp_sock *tp = tcp_sk(sk);
1495 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1498 if (skb->len <= tcp_hdrlen(skb) &&
1499 skb_queue_len(&tp->ucopy.prequeue) == 0)
1502 /* Before escaping RCU protected region, we need to take care of skb
1503 * dst. Prequeue is only enabled for established sockets.
1504 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1505 * Instead of doing a full sk_rx_dst validity check here, let's perform
1506 * an optimistic check.
1508 if (likely(sk->sk_rx_dst))
1511 skb_dst_force_safe(skb);
1513 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1514 tp->ucopy.memory += skb->truesize;
1515 if (tp->ucopy.memory > sk->sk_rcvbuf) {
1516 struct sk_buff *skb1;
1518 BUG_ON(sock_owned_by_user(sk));
1520 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1521 sk_backlog_rcv(sk, skb1);
1522 NET_INC_STATS_BH(sock_net(sk),
1523 LINUX_MIB_TCPPREQUEUEDROPPED);
1526 tp->ucopy.memory = 0;
1527 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1528 wake_up_interruptible_sync_poll(sk_sleep(sk),
1529 POLLIN | POLLRDNORM | POLLRDBAND);
1530 if (!inet_csk_ack_scheduled(sk))
1531 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1532 (3 * tcp_rto_min(sk)) / 4,
1537 EXPORT_SYMBOL(tcp_prequeue);
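/* Note: setting the sysctl net.ipv4.tcp_low_latency to 1, or having no
 * reader task in tp->ucopy, disables the prequeue above entirely and keeps
 * all receive processing in softirq context.
 */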
1539 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1541 struct tcphdr *th = (struct tcphdr *)skb->data;
1542 unsigned int eaten = skb->len;
1545 err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1548 TCP_SKB_CB(skb)->end_seq -= eaten;
1552 EXPORT_SYMBOL(tcp_filter);
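/* tcp_filter() runs the socket's attached socket filter, if any, via
 * sk_filter_trim_cap(), which may trim the skb down to the TCP header;
 * the end_seq adjustment above keeps sequence accounting consistent with
 * whatever payload survived the trim.
 */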
1558 int tcp_v4_rcv(struct sk_buff *skb)
1560 const struct iphdr *iph;
1561 const struct tcphdr *th;
1564 struct net *net = dev_net(skb->dev);
1566 if (skb->pkt_type != PACKET_HOST)
1569 /* Count it even if it's bad */
1570 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1572 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1577 if (th->doff < sizeof(struct tcphdr) / 4)
1579 if (!pskb_may_pull(skb, th->doff * 4))
1582 /* An explanation is required here, I think.
1583 * Packet length and doff are validated by header prediction,
1584 * provided the case of th->doff == 0 is eliminated.
1585 * So, we defer the checks. */
1587 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1592 /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB();
1593 * barrier() makes sure the compiler won't play aliasing games.
1595 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1596 sizeof(struct inet_skb_parm));
1599 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1600 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1601 skb->len - th->doff * 4);
1602 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1603 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1604 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1605 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1606 TCP_SKB_CB(skb)->sacked = 0;
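	/* Example: a segment carrying SYN plus 10 payload bytes occupies
	 * eleven sequence numbers, so end_seq == seq + 1 + 10; SYN and FIN
	 * each consume one sequence number of their own.
	 */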
1609 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1614 if (sk->sk_state == TCP_TIME_WAIT)
1617 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1618 struct request_sock *req = inet_reqsk(sk);
1621 sk = req->rsk_listener;
1622 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1626 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1627 inet_csk_reqsk_queue_drop_and_put(sk, req);
1631 nsk = tcp_check_req(sk, skb, req, false);
1634 goto discard_and_relse;
1638 } else if (tcp_child_process(sk, nsk, skb)) {
1639 tcp_v4_send_reset(nsk, skb);
1640 goto discard_and_relse;
1646 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1647 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1648 goto discard_and_relse;
1651 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1652 goto discard_and_relse;
1654 if (tcp_v4_inbound_md5_hash(sk, skb))
1655 goto discard_and_relse;
1659 if (tcp_filter(sk, skb))
1660 goto discard_and_relse;
1661 th = (const struct tcphdr *)skb->data;
1666 if (sk->sk_state == TCP_LISTEN) {
1667 ret = tcp_v4_do_rcv(sk, skb);
1668 goto put_and_return;
1671 sk_incoming_cpu_update(sk);
1673 bh_lock_sock_nested(sk);
1674 tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1676 if (!sock_owned_by_user(sk)) {
1677 if (!tcp_prequeue(sk, skb))
1678 ret = tcp_v4_do_rcv(sk, skb);
1679 } else if (unlikely(sk_add_backlog(sk, skb,
1680 sk->sk_rcvbuf + sk->sk_sndbuf))) {
1682 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1683 goto discard_and_relse;
1693 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1696 if (tcp_checksum_complete(skb)) {
1698 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1700 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1702 tcp_v4_send_reset(NULL, skb);
1706 /* Discard frame. */
1715 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1716 inet_twsk_put(inet_twsk(sk));
1720 if (tcp_checksum_complete(skb)) {
1721 inet_twsk_put(inet_twsk(sk));
1724 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1726 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
1728 iph->saddr, th->source,
1729 iph->daddr, th->dest, inet_iif(skb));
1732 inet_twsk_deschedule_put(inet_twsk(sk));
1736 /* Fall through to ACK */
1739 tcp_v4_timewait_ack(sk, skb);
1743 case TCP_TW_SUCCESS:;
1748 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1749 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1750 .twsk_unique = tcp_twsk_unique,
1751 .twsk_destructor= tcp_twsk_destructor,
1754 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1756 struct dst_entry *dst = skb_dst(skb);
1758 if (dst && dst_hold_safe(dst)) {
1759 sk->sk_rx_dst = dst;
1760 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1763 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1765 const struct inet_connection_sock_af_ops ipv4_specific = {
1766 .queue_xmit = ip_queue_xmit,
1767 .send_check = tcp_v4_send_check,
1768 .rebuild_header = inet_sk_rebuild_header,
1769 .sk_rx_dst_set = inet_sk_rx_dst_set,
1770 .conn_request = tcp_v4_conn_request,
1771 .syn_recv_sock = tcp_v4_syn_recv_sock,
1772 .net_header_len = sizeof(struct iphdr),
1773 .setsockopt = ip_setsockopt,
1774 .getsockopt = ip_getsockopt,
1775 .addr2sockaddr = inet_csk_addr2sockaddr,
1776 .sockaddr_len = sizeof(struct sockaddr_in),
1777 .bind_conflict = inet_csk_bind_conflict,
1778 #ifdef CONFIG_COMPAT
1779 .compat_setsockopt = compat_ip_setsockopt,
1780 .compat_getsockopt = compat_ip_getsockopt,
1782 .mtu_reduced = tcp_v4_mtu_reduced,
1784 EXPORT_SYMBOL(ipv4_specific);
1786 #ifdef CONFIG_TCP_MD5SIG
1787 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1788 .md5_lookup = tcp_v4_md5_lookup,
1789 .calc_md5_hash = tcp_v4_md5_hash_skb,
1790 .md5_parse = tcp_v4_parse_md5_keys,
1794 /* NOTE: A lot of things set to zero explicitly by call to
1795 * sk_alloc() so need not be done here.
1797 static int tcp_v4_init_sock(struct sock *sk)
1799 struct inet_connection_sock *icsk = inet_csk(sk);
1803 icsk->icsk_af_ops = &ipv4_specific;
1805 #ifdef CONFIG_TCP_MD5SIG
1806 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1812 void tcp_v4_destroy_sock(struct sock *sk)
1814 struct tcp_sock *tp = tcp_sk(sk);
1816 tcp_clear_xmit_timers(sk);
1818 tcp_cleanup_congestion_control(sk);
1820 /* Clean up the write buffer. */
1821 tcp_write_queue_purge(sk);
1823 /* Cleans up our, hopefully empty, out_of_order_queue. */
1824 __skb_queue_purge(&tp->out_of_order_queue);
1826 #ifdef CONFIG_TCP_MD5SIG
1827 /* Clean up the MD5 key list, if any */
1828 if (tp->md5sig_info) {
1829 tcp_clear_md5_list(sk);
1830 kfree_rcu(tp->md5sig_info, rcu);
1831 tp->md5sig_info = NULL;
1835 /* Clean the prequeue; it really must be empty */
1836 __skb_queue_purge(&tp->ucopy.prequeue);
1838 /* Clean up a referenced TCP bind bucket. */
1839 if (inet_csk(sk)->icsk_bind_hash)
1842 BUG_ON(tp->fastopen_rsk);
1844 /* If socket is aborted during connect operation */
1845 tcp_free_fastopen_req(tp);
1846 tcp_saved_syn_free(tp);
1848 sk_sockets_allocated_dec(sk);
1849 sock_release_memcg(sk);
1851 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1853 #ifdef CONFIG_PROC_FS
1854 /* Proc filesystem TCP sock list dumping. */
1857 * Get the next listener socket following cur. If cur is NULL, get the first socket
1858 * starting from the bucket given in st->bucket; when st->bucket is zero the
1859 * very first socket in the hash table is returned.
1861 static void *listening_get_next(struct seq_file *seq, void *cur)
1863 struct inet_connection_sock *icsk;
1864 struct hlist_nulls_node *node;
1865 struct sock *sk = cur;
1866 struct inet_listen_hashbucket *ilb;
1867 struct tcp_iter_state *st = seq->private;
1868 struct net *net = seq_file_net(seq);
1871 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1872 spin_lock_bh(&ilb->lock);
1873 sk = sk_nulls_head(&ilb->head);
1877 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1881 sk = sk_nulls_next(sk);
1883 sk_nulls_for_each_from(sk, node) {
1884 if (!net_eq(sock_net(sk), net))
1886 if (sk->sk_family == st->family) {
1890 icsk = inet_csk(sk);
1892 spin_unlock_bh(&ilb->lock);
1894 if (++st->bucket < INET_LHTABLE_SIZE) {
1895 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1896 spin_lock_bh(&ilb->lock);
1897 sk = sk_nulls_head(&ilb->head);
1905 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1907 struct tcp_iter_state *st = seq->private;
1912 rc = listening_get_next(seq, NULL);
1914 while (rc && *pos) {
1915 rc = listening_get_next(seq, rc);
1921 static inline bool empty_bucket(const struct tcp_iter_state *st)
1923 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1927 * Get first established socket starting from bucket given in st->bucket.
1928 * If st->bucket is zero, the very first socket in the hash is returned.
1930 static void *established_get_first(struct seq_file *seq)
1932 struct tcp_iter_state *st = seq->private;
1933 struct net *net = seq_file_net(seq);
1937 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1939 struct hlist_nulls_node *node;
1940 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1942 /* Lockless fast path for the common case of empty buckets */
1943 if (empty_bucket(st))
1947 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1948 if (sk->sk_family != st->family ||
1949 !net_eq(sock_net(sk), net)) {
1955 spin_unlock_bh(lock);
1961 static void *established_get_next(struct seq_file *seq, void *cur)
1963 struct sock *sk = cur;
1964 struct hlist_nulls_node *node;
1965 struct tcp_iter_state *st = seq->private;
1966 struct net *net = seq_file_net(seq);
1971 sk = sk_nulls_next(sk);
1973 sk_nulls_for_each_from(sk, node) {
1974 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1978 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1980 return established_get_first(seq);
1983 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1985 struct tcp_iter_state *st = seq->private;
1989 rc = established_get_first(seq);
1992 rc = established_get_next(seq, rc);
1998 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2001 struct tcp_iter_state *st = seq->private;
2003 st->state = TCP_SEQ_STATE_LISTENING;
2004 rc = listening_get_idx(seq, &pos);
2007 st->state = TCP_SEQ_STATE_ESTABLISHED;
2008 rc = established_get_idx(seq, pos);
2014 static void *tcp_seek_last_pos(struct seq_file *seq)
2016 struct tcp_iter_state *st = seq->private;
2017 int offset = st->offset;
2018 int orig_num = st->num;
2021 switch (st->state) {
2022 case TCP_SEQ_STATE_LISTENING:
2023 if (st->bucket >= INET_LHTABLE_SIZE)
2025 st->state = TCP_SEQ_STATE_LISTENING;
2026 rc = listening_get_next(seq, NULL);
2027 while (offset-- && rc)
2028 rc = listening_get_next(seq, rc);
2032 st->state = TCP_SEQ_STATE_ESTABLISHED;
2034 case TCP_SEQ_STATE_ESTABLISHED:
2035 if (st->bucket > tcp_hashinfo.ehash_mask)
2037 rc = established_get_first(seq);
2038 while (offset-- && rc)
2039 rc = established_get_next(seq, rc);
2047 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2049 struct tcp_iter_state *st = seq->private;
2052 if (*pos && *pos == st->last_pos) {
2053 rc = tcp_seek_last_pos(seq);
2058 st->state = TCP_SEQ_STATE_LISTENING;
2062 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2065 st->last_pos = *pos;
2069 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2071 struct tcp_iter_state *st = seq->private;
2074 if (v == SEQ_START_TOKEN) {
2075 rc = tcp_get_idx(seq, 0);
2079 switch (st->state) {
2080 case TCP_SEQ_STATE_LISTENING:
2081 rc = listening_get_next(seq, v);
2083 st->state = TCP_SEQ_STATE_ESTABLISHED;
2086 rc = established_get_first(seq);
2089 case TCP_SEQ_STATE_ESTABLISHED:
2090 rc = established_get_next(seq, v);
2095 st->last_pos = *pos;
2099 static void tcp_seq_stop(struct seq_file *seq, void *v)
2101 struct tcp_iter_state *st = seq->private;
2103 switch (st->state) {
2104 case TCP_SEQ_STATE_LISTENING:
2105 if (v != SEQ_START_TOKEN)
2106 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2108 case TCP_SEQ_STATE_ESTABLISHED:
2110 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2115 int tcp_seq_open(struct inode *inode, struct file *file)
2117 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2118 struct tcp_iter_state *s;
2121 err = seq_open_net(inode, file, &afinfo->seq_ops,
2122 sizeof(struct tcp_iter_state));
2126 s = ((struct seq_file *)file->private_data)->private;
2127 s->family = afinfo->family;
2131 EXPORT_SYMBOL(tcp_seq_open);
2133 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2136 struct proc_dir_entry *p;
2138 afinfo->seq_ops.start = tcp_seq_start;
2139 afinfo->seq_ops.next = tcp_seq_next;
2140 afinfo->seq_ops.stop = tcp_seq_stop;
2142 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2143 afinfo->seq_fops, afinfo);
2148 EXPORT_SYMBOL(tcp_proc_register);
2150 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2152 remove_proc_entry(afinfo->name, net->proc_net);
2154 EXPORT_SYMBOL(tcp_proc_unregister);
2156 static void get_openreq4(const struct request_sock *req,
2157 struct seq_file *f, int i)
2159 const struct inet_request_sock *ireq = inet_rsk(req);
2160 long delta = req->rsk_timer.expires - jiffies;
2162 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2163 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2168 ntohs(ireq->ir_rmt_port),
2170 0, 0, /* could print option size, but that is af dependent. */
2171 1, /* timers active (only the expire timer) */
2172 jiffies_delta_to_clock_t(delta),
2174 from_kuid_munged(seq_user_ns(f),
2175 sock_i_uid(req->rsk_listener)),
2176 0, /* non standard timer */
2177 0, /* open_requests have no inode */
2182 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2185 unsigned long timer_expires;
2186 const struct tcp_sock *tp = tcp_sk(sk);
2187 const struct inet_connection_sock *icsk = inet_csk(sk);
2188 const struct inet_sock *inet = inet_sk(sk);
2189 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2190 __be32 dest = inet->inet_daddr;
2191 __be32 src = inet->inet_rcv_saddr;
2192 __u16 destp = ntohs(inet->inet_dport);
2193 __u16 srcp = ntohs(inet->inet_sport);
2197 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2198 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2199 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2201 timer_expires = icsk->icsk_timeout;
2202 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2204 timer_expires = icsk->icsk_timeout;
2205 } else if (timer_pending(&sk->sk_timer)) {
2207 timer_expires = sk->sk_timer.expires;
2210 timer_expires = jiffies;
2213 state = sk_state_load(sk);
2214 if (state == TCP_LISTEN)
2215 rx_queue = sk->sk_ack_backlog;
2217 /* Because we don't lock the socket,
2218 * we might find a transient negative value.
2220 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2222 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2223 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2224 i, src, srcp, dest, destp, state,
2225 tp->write_seq - tp->snd_una,
2228 jiffies_delta_to_clock_t(timer_expires - jiffies),
2229 icsk->icsk_retransmits,
2230 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2231 icsk->icsk_probes_out,
2233 atomic_read(&sk->sk_refcnt), sk,
2234 jiffies_to_clock_t(icsk->icsk_rto),
2235 jiffies_to_clock_t(icsk->icsk_ack.ato),
2236 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2238 state == TCP_LISTEN ?
2239 fastopenq->max_qlen :
2240 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2243 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2244 struct seq_file *f, int i)
2246 long delta = tw->tw_timer.expires - jiffies;
2250 dest = tw->tw_daddr;
2251 src = tw->tw_rcv_saddr;
2252 destp = ntohs(tw->tw_dport);
2253 srcp = ntohs(tw->tw_sport);
2255 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2256 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2257 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2258 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2259 atomic_read(&tw->tw_refcnt), tw);
2264 static int tcp4_seq_show(struct seq_file *seq, void *v)
2266 struct tcp_iter_state *st;
2267 struct sock *sk = v;
2269 seq_setwidth(seq, TMPSZ - 1);
2270 if (v == SEQ_START_TOKEN) {
2271 seq_puts(seq, " sl local_address rem_address st tx_queue "
2272 "rx_queue tr tm->when retrnsmt uid timeout "
2278 if (sk->sk_state == TCP_TIME_WAIT)
2279 get_timewait4_sock(v, seq, st->num);
2280 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2281 get_openreq4(v, seq, st->num);
2283 get_tcp4_sock(v, seq, st->num);
2289 static const struct file_operations tcp_afinfo_seq_fops = {
2290 .owner = THIS_MODULE,
2291 .open = tcp_seq_open,
2293 .llseek = seq_lseek,
2294 .release = seq_release_net
2297 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2300 .seq_fops = &tcp_afinfo_seq_fops,
2302 .show = tcp4_seq_show,
2306 static int __net_init tcp4_proc_init_net(struct net *net)
2308 return tcp_proc_register(net, &tcp4_seq_afinfo);
2311 static void __net_exit tcp4_proc_exit_net(struct net *net)
2313 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2316 static struct pernet_operations tcp4_net_ops = {
2317 .init = tcp4_proc_init_net,
2318 .exit = tcp4_proc_exit_net,
2321 int __init tcp4_proc_init(void)
2323 return register_pernet_subsys(&tcp4_net_ops);
2326 void tcp4_proc_exit(void)
2328 unregister_pernet_subsys(&tcp4_net_ops);
2330 #endif /* CONFIG_PROC_FS */
2332 struct proto tcp_prot = {
2334 .owner = THIS_MODULE,
2336 .connect = tcp_v4_connect,
2337 .disconnect = tcp_disconnect,
2338 .accept = inet_csk_accept,
2340 .init = tcp_v4_init_sock,
2341 .destroy = tcp_v4_destroy_sock,
2342 .shutdown = tcp_shutdown,
2343 .setsockopt = tcp_setsockopt,
2344 .getsockopt = tcp_getsockopt,
2345 .recvmsg = tcp_recvmsg,
2346 .sendmsg = tcp_sendmsg,
2347 .sendpage = tcp_sendpage,
2348 .backlog_rcv = tcp_v4_do_rcv,
2349 .release_cb = tcp_release_cb,
2351 .unhash = inet_unhash,
2352 .get_port = inet_csk_get_port,
2353 .enter_memory_pressure = tcp_enter_memory_pressure,
2354 .stream_memory_free = tcp_stream_memory_free,
2355 .sockets_allocated = &tcp_sockets_allocated,
2356 .orphan_count = &tcp_orphan_count,
2357 .memory_allocated = &tcp_memory_allocated,
2358 .memory_pressure = &tcp_memory_pressure,
2359 .sysctl_mem = sysctl_tcp_mem,
2360 .sysctl_wmem = sysctl_tcp_wmem,
2361 .sysctl_rmem = sysctl_tcp_rmem,
2362 .max_header = MAX_TCP_HEADER,
2363 .obj_size = sizeof(struct tcp_sock),
2364 .slab_flags = SLAB_DESTROY_BY_RCU,
2365 .twsk_prot = &tcp_timewait_sock_ops,
2366 .rsk_prot = &tcp_request_sock_ops,
2367 .h.hashinfo = &tcp_hashinfo,
2368 .no_autobind = true,
2369 #ifdef CONFIG_COMPAT
2370 .compat_setsockopt = compat_tcp_setsockopt,
2371 .compat_getsockopt = compat_tcp_getsockopt,
2373 #ifdef CONFIG_MEMCG_KMEM
2374 .init_cgroup = tcp_init_cgroup,
2375 .destroy_cgroup = tcp_destroy_cgroup,
2376 .proto_cgroup = tcp_proto_cgroup,
2378 .diag_destroy = tcp_abort,
2380 EXPORT_SYMBOL(tcp_prot);
2382 static void __net_exit tcp_sk_exit(struct net *net)
2386 for_each_possible_cpu(cpu)
2387 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2388 free_percpu(net->ipv4.tcp_sk);
2391 static int __net_init tcp_sk_init(struct net *net)
2395 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2396 if (!net->ipv4.tcp_sk)
2399 for_each_possible_cpu(cpu) {
2402 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2406 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2409 net->ipv4.sysctl_tcp_ecn = 2;
2410 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2412 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2413 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2414 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2423 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2425 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2428 static struct pernet_operations __net_initdata tcp_sk_ops = {
2429 .init = tcp_sk_init,
2430 .exit = tcp_sk_exit,
2431 .exit_batch = tcp_sk_exit_batch,
2434 void __init tcp_v4_init(void)
2436 inet_hashinfo_init(&tcp_hashinfo);
2437 if (register_pernet_subsys(&tcp_sk_ops))
2438 panic("Failed to create the TCP control socket.\n");