/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller :	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's: only the timestamp cache is
	   held not per host, but per port pair, and the TW bucket is used
	   as the state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
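
/* Illustrative sketch (not kernel code, not compiled here): the reuse test
 * above is gated by the tcp_tw_reuse sysctl, read into sysctl_tcp_tw_reuse.
 * A minimal userspace way to flip that knob is the standard procfs path;
 * error handling is kept deliberately short.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int enable_tw_reuse(void)
 *	{
 *		int fd = open("/proc/sys/net/ipv4/tcp_tw_reuse", O_WRONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		if (write(fd, "1", 1) != 1) {
 *			close(fd);
 *			return -1;
 *		}
 *		return close(fd);
 *	}
 */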

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete the initialization afterwards.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
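
/* Illustrative userspace sketch (not kernel code; the helper name and the
 * address are placeholder assumptions): a plain blocking connect(2) on an
 * AF_INET stream socket is what ends up in tcp_v4_connect() above.
 *
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int dial(const char *ip, unsigned short port)
 *	{
 *		struct sockaddr_in sin = { .sin_family = AF_INET };
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		sin.sin_port = htons(port);
 *		if (inet_pton(AF_INET, ip, &sin.sin_addr) != 1 ||
 *		    connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;
 *	}
 */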

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if the socket was owned by user
 * at the time tcp_v4_err() was called to handle the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
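
/* Illustrative sketch (not kernel code): the pmtudisc check above means the
 * MSS is only lowered for sockets that accept PMTU information. Userspace
 * selects that mode per socket with IP_MTU_DISCOVER; the helper name below
 * is an assumption for the example.
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	static int force_pmtu_discovery(int fd)
 *	{
 *		int val = IP_PMTUDISC_DO;	// always set DF, honour PMTU
 *
 *		return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
 *				  &val, sizeof(val));
 *	}
 */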

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				  type == ICMP_PARAMETERPROB ||
				  type == ICMP_TIME_EXCEEDED ||
				  (type == ICMP_DEST_UNREACH &&
				   (code == ICMP_NET_UNREACH ||
				    code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of the PMTU discovery (RFC1191) special case:
	 * we can receive locally generated ICMP messages while the socket
	 * is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in each dark corner sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
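
/* Worked note (illustrative, not compiled): in the CHECKSUM_PARTIAL branch
 * above only the complemented pseudo-header sum is seeded into th->check;
 * the device then completes the sum over the TCP header and payload from
 * csum_start/csum_offset. The RFC 793 pseudo-header that seed covers has
 * this layout (shown for illustration only):
 *
 *	struct pseudo_hdr {
 *		__be32 saddr;		// source IP address
 *		__be32 daddr;		// destination IP address
 *		__u8   zero;		// always 0
 *		__u8   protocol;	// IPPROTO_TCP (6)
 *		__be16 len;		// TCP header + payload length
 *	};
 */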

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk is not NULL, it means we did a successful lookup and the
	 * incoming route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is checked with the md5 hash of the
		 * found key; no RST is generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net,
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send an RST if we can't find a key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to
	 * force the input interface, we will misroute in case of an
	 * asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct net *net,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sock_net(sk), skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	tcp_v4_send_ack(sock_net(sk), skb, seq,
			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      bool attach_req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, attach_req);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}


#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk) ||
					   lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
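
/* Illustrative userspace counterpart (not kernel code): the command parsed
 * above arrives via setsockopt(TCP_MD5SIG). A minimal sketch, assuming a
 * libc that exposes struct tcp_md5sig through <netinet/tcp.h>; the peer
 * address and key are placeholders.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int set_md5_key(int fd, const struct sockaddr_in *peer,
 *			       const void *key, unsigned short keylen)
 *	{
 *		struct tcp_md5sig sig;
 *
 *		memset(&sig, 0, sizeof(sig));
 *		memcpy(&sig.tcpm_addr, peer, sizeof(*peer));
 *		sig.tcpm_keylen = keylen;	// <= TCP_MD5SIG_MAXKEYLEN;
 *						// keylen == 0 deletes the key
 *		memcpy(sig.tcpm_key, key, keylen);
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
 *				  &sig, sizeof(sig));
 *	}
 */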

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
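
/* Illustrative sketch (not kernel code; values are assumptions): incoming
 * SYNs reach tcp_v4_conn_request() via a listening socket created the
 * usual way; nothing below is specific to this file.
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int listen_on(unsigned short port)
 *	{
 *		struct sockaddr_in sin = {
 *			.sin_family = AF_INET,
 *			.sin_addr.s_addr = htonl(INADDR_ANY),
 *		};
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		sin.sin_port = htons(port);
 *		if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0 ||
 *		    listen(fd, 128) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;
 *	}
 */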


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

1355 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1356 {
1357 #ifdef CONFIG_SYN_COOKIES
1358         const struct tcphdr *th = tcp_hdr(skb);
1359
1360         if (!th->syn)
1361                 sk = cookie_v4_check(sk, skb);
1362 #endif
1363         return sk;
1364 }
1365
1366 /* The socket must have it's spinlock held when we get
1367  * here, unless it is a TCP_LISTEN socket.
1368  *
1369  * We have a potential double-lock case here, so even when
1370  * doing backlog processing we use the BH locking scheme.
1371  * This is because we cannot sleep with the original spinlock
1372  * held.
1373  */
1374 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1375 {
1376         struct sock *rsk;
1377
1378         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1379                 struct dst_entry *dst = sk->sk_rx_dst;
1380
1381                 sock_rps_save_rxhash(sk, skb);
1382                 sk_mark_napi_id(sk, skb);
1383                 if (dst) {
1384                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1385                             !dst->ops->check(dst, 0)) {
1386                                 dst_release(dst);
1387                                 sk->sk_rx_dst = NULL;
1388                         }
1389                 }
1390                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1391                 return 0;
1392         }
1393
1394         if (tcp_checksum_complete(skb))
1395                 goto csum_err;
1396
1397         if (sk->sk_state == TCP_LISTEN) {
1398                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1399
1400                 if (!nsk)
1401                         goto discard;
1402                 if (nsk != sk) {
1403                         sock_rps_save_rxhash(nsk, skb);
1404                         sk_mark_napi_id(nsk, skb);
1405                         if (tcp_child_process(sk, nsk, skb)) {
1406                                 rsk = nsk;
1407                                 goto reset;
1408                         }
1409                         return 0;
1410                 }
1411         } else
1412                 sock_rps_save_rxhash(sk, skb);
1413
1414         if (tcp_rcv_state_process(sk, skb)) {
1415                 rsk = sk;
1416                 goto reset;
1417         }
1418         return 0;
1419
1420 reset:
1421         tcp_v4_send_reset(rsk, skb);
1422 discard:
1423         kfree_skb(skb);
1424         /* Be careful here. If this function gets more complicated and
1425          * gcc suffers from register pressure on the x86, sk (in %ebx)
1426          * might be destroyed here. This current version compiles correctly,
1427          * but you have been warned.
1428          */
1429         return 0;
1430
1431 csum_err:
1432         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1433         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1434         goto discard;
1435 }
1436 EXPORT_SYMBOL(tcp_v4_do_rcv);
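
/* Illustrative sketch, not part of the build: a caller that is not handling
 * a TCP_LISTEN socket is expected to serialize with the socket lock (BH
 * variant) before handing the skb to tcp_v4_do_rcv(), roughly:
 *
 *	bh_lock_sock_nested(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = tcp_v4_do_rcv(sk, skb);
 *	else
 *		sk_add_backlog(sk, skb, limit);
 *	bh_unlock_sock(sk);
 *
 * A backlogged skb is replayed later through sk_backlog_rcv(). tcp_v4_rcv()
 * below is the canonical caller; "limit" here stands in for the
 * rcvbuf + sndbuf bound it actually uses.
 */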
1437
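/* Early demux, run from the IP input path before routing: look the packet up
 * in the established hash only and, if a full socket is found, attach it to
 * the skb and reuse the socket's cached rx dst so the per-packet route
 * lookup can be skipped.
 */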
1438 void tcp_v4_early_demux(struct sk_buff *skb)
1439 {
1440         const struct iphdr *iph;
1441         const struct tcphdr *th;
1442         struct sock *sk;
1443
1444         if (skb->pkt_type != PACKET_HOST)
1445                 return;
1446
1447         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1448                 return;
1449
1450         iph = ip_hdr(skb);
1451         th = tcp_hdr(skb);
1452
1453         if (th->doff < sizeof(struct tcphdr) / 4)
1454                 return;
1455
1456         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1457                                        iph->saddr, th->source,
1458                                        iph->daddr, ntohs(th->dest),
1459                                        skb->skb_iif);
1460         if (sk) {
1461                 skb->sk = sk;
1462                 skb->destructor = sock_edemux;
1463                 if (sk_fullsock(sk)) {
1464                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1465
1466                         if (dst)
1467                                 dst = dst_check(dst, 0);
1468                         if (dst &&
1469                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1470                                 skb_dst_set_noref(skb, dst);
1471                 }
1472         }
1473 }
1474
1475 /* A packet is added to the VJ-style prequeue for processing in process
1476  * context, if a reader task is waiting. Apparently, this exciting
1477  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1478  * failed somewhere. Latency? Burstiness? Well, at least now we will
1479  * see why it failed. 8)8)                               --ANK
1480  *
1481  */
1482 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1483 {
1484         struct tcp_sock *tp = tcp_sk(sk);
1485
1486         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1487                 return false;
1488
1489         if (skb->len <= tcp_hdrlen(skb) &&
1490             skb_queue_len(&tp->ucopy.prequeue) == 0)
1491                 return false;
1492
1493         /* Before escaping the RCU-protected region, we need to take care of
1494          * the skb dst. The prequeue is only enabled for established sockets,
1495          * and for such sockets we might need the skb dst only to set
1496          * sk->sk_rx_dst. Instead of doing a full sk_rx_dst validity check
1497          * here, let's perform an optimistic check.
1498          */
1499         if (likely(sk->sk_rx_dst))
1500                 skb_dst_drop(skb);
1501         else
1502                 skb_dst_force_safe(skb);
1503
1504         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1505         tp->ucopy.memory += skb->truesize;
1506         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1507                 struct sk_buff *skb1;
1508
1509                 BUG_ON(sock_owned_by_user(sk));
1510
1511                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1512                         sk_backlog_rcv(sk, skb1);
1513                         NET_INC_STATS_BH(sock_net(sk),
1514                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1515                 }
1516
1517                 tp->ucopy.memory = 0;
1518         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1519                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1520                                            POLLIN | POLLRDNORM | POLLRDBAND);
1521                 if (!inet_csk_ack_scheduled(sk))
1522                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1523                                                   (3 * tcp_rto_min(sk)) / 4,
1524                                                   TCP_RTO_MAX);
1525         }
1526         return true;
1527 }
1528 EXPORT_SYMBOL(tcp_prequeue);
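
/* Sketch of the consumer side, simplified from tcp_recvmsg() and
 * tcp_prequeue_process() in tcp.c: a blocking reader advertises itself via
 * tp->ucopy, and the queued segments are then drained in its context:
 *
 *	tp->ucopy.task = current;
 *	tp->ucopy.len = len;
 *	...
 *	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
 *		sk_backlog_rcv(sk, skb);
 */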
1529
1530 /*
1531  *      From tcp_input.c
1532  */
1533
1534 int tcp_v4_rcv(struct sk_buff *skb)
1535 {
1536         const struct iphdr *iph;
1537         const struct tcphdr *th;
1538         struct sock *sk;
1539         int ret;
1540         struct net *net = dev_net(skb->dev);
1541
1542         if (skb->pkt_type != PACKET_HOST)
1543                 goto discard_it;
1544
1545         /* Count it even if it's bad */
1546         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1547
1548         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1549                 goto discard_it;
1550
1551         th = tcp_hdr(skb);
1552
1553         if (th->doff < sizeof(struct tcphdr) / 4)
1554                 goto bad_packet;
1555         if (!pskb_may_pull(skb, th->doff * 4))
1556                 goto discard_it;
1557
1558         /* An explanation is required here, I think.
1559          * Packet length and doff are validated by header prediction,
1560          * provided the case of th->doff == 0 is eliminated.
1561          * So, we defer the checks. */
1562
1563         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1564                 goto csum_error;
1565
1566         th = tcp_hdr(skb);
1567         iph = ip_hdr(skb);
1568         /* This is tricky: we move the IPCB to its correct location inside
1569          * TCP_SKB_CB(); barrier() makes sure the compiler won't play
1570          * aliasing games. */
1571         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1572                 sizeof(struct inet_skb_parm));
1573         barrier();
1574
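        /* Stash the parsed header fields in the skb control block. Note that
         * end_seq counts one unit of sequence space for SYN and for FIN, in
         * addition to the payload bytes.
         */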
1575         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1576         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1577                                     skb->len - th->doff * 4);
1578         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1579         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1580         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1581         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1582         TCP_SKB_CB(skb)->sacked  = 0;
1583
1584 lookup:
1585         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1586         if (!sk)
1587                 goto no_tcp_socket;
1588
1589 process:
1590         if (sk->sk_state == TCP_TIME_WAIT)
1591                 goto do_time_wait;
1592
1593         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1594                 struct request_sock *req = inet_reqsk(sk);
1595                 struct sock *nsk;
1596
1597                 sk = req->rsk_listener;
1598                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1599                         reqsk_put(req);
1600                         goto discard_it;
1601                 }
1602                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1603                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1604                         goto lookup;
1605                 }
1606                 sock_hold(sk);
1607                 nsk = tcp_check_req(sk, skb, req, false);
1608                 if (!nsk) {
1609                         reqsk_put(req);
1610                         goto discard_and_relse;
1611                 }
1612                 if (nsk == sk) {
1613                         reqsk_put(req);
1614                 } else if (tcp_child_process(sk, nsk, skb)) {
1615                         tcp_v4_send_reset(nsk, skb);
1616                         goto discard_and_relse;
1617                 } else {
1618                         sock_put(sk);
1619                         return 0;
1620                 }
1621         }
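        /* Enforce IP_MINTTL (cf. RFC 5082 GTSM): drop segments arriving with
         * a smaller TTL than the application asked for.
         */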
1622         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1623                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1624                 goto discard_and_relse;
1625         }
1626
1627         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1628                 goto discard_and_relse;
1629
1630         if (tcp_v4_inbound_md5_hash(sk, skb))
1631                 goto discard_and_relse;
1632
1633         nf_reset(skb);
1634
1635         if (sk_filter(sk, skb))
1636                 goto discard_and_relse;
1637
1638         skb->dev = NULL;
1639
1640         if (sk->sk_state == TCP_LISTEN) {
1641                 ret = tcp_v4_do_rcv(sk, skb);
1642                 goto put_and_return;
1643         }
1644
1645         sk_incoming_cpu_update(sk);
1646
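        /* Serialize with process context: if a task currently owns the socket
         * lock, park the skb on the backlog (bounded by rcvbuf + sndbuf) to
         * be replayed by release_sock(); otherwise try the prequeue and fall
         * back to direct processing.
         */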
1647         bh_lock_sock_nested(sk);
1648         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1649         ret = 0;
1650         if (!sock_owned_by_user(sk)) {
1651                 if (!tcp_prequeue(sk, skb))
1652                         ret = tcp_v4_do_rcv(sk, skb);
1653         } else if (unlikely(sk_add_backlog(sk, skb,
1654                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1655                 bh_unlock_sock(sk);
1656                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1657                 goto discard_and_relse;
1658         }
1659         bh_unlock_sock(sk);
1660
1661 put_and_return:
1662         sock_put(sk);
1663
1664         return ret;
1665
1666 no_tcp_socket:
1667         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1668                 goto discard_it;
1669
1670         if (tcp_checksum_complete(skb)) {
1671 csum_error:
1672                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1673 bad_packet:
1674                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1675         } else {
1676                 tcp_v4_send_reset(NULL, skb);
1677         }
1678
1679 discard_it:
1680         /* Discard frame. */
1681         kfree_skb(skb);
1682         return 0;
1683
1684 discard_and_relse:
1685         sock_put(sk);
1686         goto discard_it;
1687
1688 do_time_wait:
1689         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1690                 inet_twsk_put(inet_twsk(sk));
1691                 goto discard_it;
1692         }
1693
1694         if (tcp_checksum_complete(skb)) {
1695                 inet_twsk_put(inet_twsk(sk));
1696                 goto csum_error;
1697         }
1698         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1699         case TCP_TW_SYN: {
1700                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1701                                                         &tcp_hashinfo,
1702                                                         iph->saddr, th->source,
1703                                                         iph->daddr, th->dest,
1704                                                         inet_iif(skb));
1705                 if (sk2) {
1706                         inet_twsk_deschedule_put(inet_twsk(sk));
1707                         sk = sk2;
1708                         goto process;
1709                 }
1710                 /* Fall through to ACK */
1711         }
1712         case TCP_TW_ACK:
1713                 tcp_v4_timewait_ack(sk, skb);
1714                 break;
1715         case TCP_TW_RST:
1716                 goto no_tcp_socket;
1717         case TCP_TW_SUCCESS:;
1718         }
1719         goto discard_it;
1720 }
1721
1722 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1723         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1724         .twsk_unique    = tcp_twsk_unique,
1725         .twsk_destructor = tcp_twsk_destructor,
1726 };
1727
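/* Cache the validated input route on the socket so the established fast path
 * in tcp_v4_do_rcv() can skip per-packet route lookups. dst_hold_safe()
 * guards against taking a reference on a dst that is already being released.
 */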
1728 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1729 {
1730         struct dst_entry *dst = skb_dst(skb);
1731
1732         if (dst && dst_hold_safe(dst)) {
1733                 sk->sk_rx_dst = dst;
1734                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1735         }
1736 }
1737 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1738
1739 const struct inet_connection_sock_af_ops ipv4_specific = {
1740         .queue_xmit        = ip_queue_xmit,
1741         .send_check        = tcp_v4_send_check,
1742         .rebuild_header    = inet_sk_rebuild_header,
1743         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1744         .conn_request      = tcp_v4_conn_request,
1745         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1746         .net_header_len    = sizeof(struct iphdr),
1747         .setsockopt        = ip_setsockopt,
1748         .getsockopt        = ip_getsockopt,
1749         .addr2sockaddr     = inet_csk_addr2sockaddr,
1750         .sockaddr_len      = sizeof(struct sockaddr_in),
1751         .bind_conflict     = inet_csk_bind_conflict,
1752 #ifdef CONFIG_COMPAT
1753         .compat_setsockopt = compat_ip_setsockopt,
1754         .compat_getsockopt = compat_ip_getsockopt,
1755 #endif
1756         .mtu_reduced       = tcp_v4_mtu_reduced,
1757 };
1758 EXPORT_SYMBOL(ipv4_specific);
1759
1760 #ifdef CONFIG_TCP_MD5SIG
1761 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1762         .md5_lookup             = tcp_v4_md5_lookup,
1763         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1764         .md5_parse              = tcp_v4_parse_md5_keys,
1765 };
1766 #endif
1767
1768 /* NOTE: A lot of things are set to zero explicitly by the call to
1769  *       sk_alloc(), so they need not be done here.
1770  */
1771 static int tcp_v4_init_sock(struct sock *sk)
1772 {
1773         struct inet_connection_sock *icsk = inet_csk(sk);
1774
1775         tcp_init_sock(sk);
1776
1777         icsk->icsk_af_ops = &ipv4_specific;
1778
1779 #ifdef CONFIG_TCP_MD5SIG
1780         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1781 #endif
1782
1783         return 0;
1784 }
1785
1786 void tcp_v4_destroy_sock(struct sock *sk)
1787 {
1788         struct tcp_sock *tp = tcp_sk(sk);
1789
1790         tcp_clear_xmit_timers(sk);
1791
1792         tcp_cleanup_congestion_control(sk);
1793
1794         /* Clean up the write buffer. */
1795         tcp_write_queue_purge(sk);
1796
1797         /* Clean up our, hopefully empty, out_of_order_queue. */
1798         __skb_queue_purge(&tp->out_of_order_queue);
1799
1800 #ifdef CONFIG_TCP_MD5SIG
1801         /* Clean up the MD5 key list, if any */
1802         if (tp->md5sig_info) {
1803                 tcp_clear_md5_list(sk);
1804                 kfree_rcu(tp->md5sig_info, rcu);
1805                 tp->md5sig_info = NULL;
1806         }
1807 #endif
1808
1809         /* Clean up the prequeue; it really should be empty by now. */
1810         __skb_queue_purge(&tp->ucopy.prequeue);
1811
1812         /* Clean up a referenced TCP bind bucket. */
1813         if (inet_csk(sk)->icsk_bind_hash)
1814                 inet_put_port(sk);
1815
1816         BUG_ON(tp->fastopen_rsk);
1817
1818         /* If the socket was aborted during the connect operation */
1819         tcp_free_fastopen_req(tp);
1820         tcp_saved_syn_free(tp);
1821
1822         sk_sockets_allocated_dec(sk);
1823         sock_release_memcg(sk);
1824 }
1825 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1826
1827 #ifdef CONFIG_PROC_FS
1828 /* Proc filesystem TCP sock list dumping. */
1829
1830 /*
1831  * Get the next listener socket following cur.  If cur is NULL, get the first
1832  * socket, starting from the bucket given in st->bucket; when st->bucket is zero, the
1833  * very first socket in the hash table is returned.
1834  */
1835 static void *listening_get_next(struct seq_file *seq, void *cur)
1836 {
1838         struct hlist_nulls_node *node;
1839         struct sock *sk = cur;
1840         struct inet_listen_hashbucket *ilb;
1841         struct tcp_iter_state *st = seq->private;
1842         struct net *net = seq_file_net(seq);
1843
1844         if (!sk) {
1845                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1846                 spin_lock_bh(&ilb->lock);
1847                 sk = sk_nulls_head(&ilb->head);
1848                 st->offset = 0;
1849                 goto get_sk;
1850         }
1851         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1852         ++st->num;
1853         ++st->offset;
1854
1855         sk = sk_nulls_next(sk);
1856 get_sk:
1857         sk_nulls_for_each_from(sk, node) {
1858                 if (!net_eq(sock_net(sk), net))
1859                         continue;
1860                 if (sk->sk_family == st->family) {
1861                         cur = sk;
1862                         goto out;
1863                 }
1865         }
1866         spin_unlock_bh(&ilb->lock);
1867         st->offset = 0;
1868         if (++st->bucket < INET_LHTABLE_SIZE) {
1869                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1870                 spin_lock_bh(&ilb->lock);
1871                 sk = sk_nulls_head(&ilb->head);
1872                 goto get_sk;
1873         }
1874         cur = NULL;
1875 out:
1876         return cur;
1877 }
1878
1879 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1880 {
1881         struct tcp_iter_state *st = seq->private;
1882         void *rc;
1883
1884         st->bucket = 0;
1885         st->offset = 0;
1886         rc = listening_get_next(seq, NULL);
1887
1888         while (rc && *pos) {
1889                 rc = listening_get_next(seq, rc);
1890                 --*pos;
1891         }
1892         return rc;
1893 }
1894
1895 static inline bool empty_bucket(const struct tcp_iter_state *st)
1896 {
1897         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1898 }
1899
1900 /*
1901  * Get the first established socket, starting from the bucket given in st->bucket.
1902  * If st->bucket is zero, the very first socket in the hash is returned.
1903  */
1904 static void *established_get_first(struct seq_file *seq)
1905 {
1906         struct tcp_iter_state *st = seq->private;
1907         struct net *net = seq_file_net(seq);
1908         void *rc = NULL;
1909
1910         st->offset = 0;
1911         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1912                 struct sock *sk;
1913                 struct hlist_nulls_node *node;
1914                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1915
1916                 /* Lockless fast path for the common case of empty buckets */
1917                 if (empty_bucket(st))
1918                         continue;
1919
1920                 spin_lock_bh(lock);
1921                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1922                         if (sk->sk_family != st->family ||
1923                             !net_eq(sock_net(sk), net)) {
1924                                 continue;
1925                         }
1926                         rc = sk;
1927                         goto out;
1928                 }
1929                 spin_unlock_bh(lock);
1930         }
1931 out:
1932         return rc;
1933 }
1934
1935 static void *established_get_next(struct seq_file *seq, void *cur)
1936 {
1937         struct sock *sk = cur;
1938         struct hlist_nulls_node *node;
1939         struct tcp_iter_state *st = seq->private;
1940         struct net *net = seq_file_net(seq);
1941
1942         ++st->num;
1943         ++st->offset;
1944
1945         sk = sk_nulls_next(sk);
1946
1947         sk_nulls_for_each_from(sk, node) {
1948                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1949                         return sk;
1950         }
1951
1952         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1953         ++st->bucket;
1954         return established_get_first(seq);
1955 }
1956
1957 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1958 {
1959         struct tcp_iter_state *st = seq->private;
1960         void *rc;
1961
1962         st->bucket = 0;
1963         rc = established_get_first(seq);
1964
1965         while (rc && pos) {
1966                 rc = established_get_next(seq, rc);
1967                 --pos;
1968         }
1969         return rc;
1970 }
1971
1972 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1973 {
1974         void *rc;
1975         struct tcp_iter_state *st = seq->private;
1976
1977         st->state = TCP_SEQ_STATE_LISTENING;
1978         rc        = listening_get_idx(seq, &pos);
1979
1980         if (!rc) {
1981                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1982                 rc        = established_get_idx(seq, pos);
1983         }
1984
1985         return rc;
1986 }
1987
1988 static void *tcp_seek_last_pos(struct seq_file *seq)
1989 {
1990         struct tcp_iter_state *st = seq->private;
1991         int offset = st->offset;
1992         int orig_num = st->num;
1993         void *rc = NULL;
1994
1995         switch (st->state) {
1996         case TCP_SEQ_STATE_LISTENING:
1997                 if (st->bucket >= INET_LHTABLE_SIZE)
1998                         break;
1999                 st->state = TCP_SEQ_STATE_LISTENING;
2000                 rc = listening_get_next(seq, NULL);
2001                 while (offset-- && rc)
2002                         rc = listening_get_next(seq, rc);
2003                 if (rc)
2004                         break;
2005                 st->bucket = 0;
2006                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2007                 /* Fallthrough */
2008         case TCP_SEQ_STATE_ESTABLISHED:
2009                 if (st->bucket > tcp_hashinfo.ehash_mask)
2010                         break;
2011                 rc = established_get_first(seq);
2012                 while (offset-- && rc)
2013                         rc = established_get_next(seq, rc);
2014         }
2015
2016         st->num = orig_num;
2017
2018         return rc;
2019 }
2020
2021 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2022 {
2023         struct tcp_iter_state *st = seq->private;
2024         void *rc;
2025
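        /* Fast path for sequential reads: if the caller resumes at the
         * position where the previous read stopped, seek straight to the
         * saved bucket/offset instead of rescanning from the first bucket.
         */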
2026         if (*pos && *pos == st->last_pos) {
2027                 rc = tcp_seek_last_pos(seq);
2028                 if (rc)
2029                         goto out;
2030         }
2031
2032         st->state = TCP_SEQ_STATE_LISTENING;
2033         st->num = 0;
2034         st->bucket = 0;
2035         st->offset = 0;
2036         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2037
2038 out:
2039         st->last_pos = *pos;
2040         return rc;
2041 }
2042
2043 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2044 {
2045         struct tcp_iter_state *st = seq->private;
2046         void *rc = NULL;
2047
2048         if (v == SEQ_START_TOKEN) {
2049                 rc = tcp_get_idx(seq, 0);
2050                 goto out;
2051         }
2052
2053         switch (st->state) {
2054         case TCP_SEQ_STATE_LISTENING:
2055                 rc = listening_get_next(seq, v);
2056                 if (!rc) {
2057                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2058                         st->bucket = 0;
2059                         st->offset = 0;
2060                         rc        = established_get_first(seq);
2061                 }
2062                 break;
2063         case TCP_SEQ_STATE_ESTABLISHED:
2064                 rc = established_get_next(seq, v);
2065                 break;
2066         }
2067 out:
2068         ++*pos;
2069         st->last_pos = *pos;
2070         return rc;
2071 }
2072
2073 static void tcp_seq_stop(struct seq_file *seq, void *v)
2074 {
2075         struct tcp_iter_state *st = seq->private;
2076
2077         switch (st->state) {
2078         case TCP_SEQ_STATE_LISTENING:
2079                 if (v != SEQ_START_TOKEN)
2080                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2081                 break;
2082         case TCP_SEQ_STATE_ESTABLISHED:
2083                 if (v)
2084                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2085                 break;
2086         }
2087 }
2088
2089 int tcp_seq_open(struct inode *inode, struct file *file)
2090 {
2091         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2092         struct tcp_iter_state *s;
2093         int err;
2094
2095         err = seq_open_net(inode, file, &afinfo->seq_ops,
2096                           sizeof(struct tcp_iter_state));
2097         if (err < 0)
2098                 return err;
2099
2100         s = ((struct seq_file *)file->private_data)->private;
2101         s->family               = afinfo->family;
2102         s->last_pos             = 0;
2103         return 0;
2104 }
2105 EXPORT_SYMBOL(tcp_seq_open);
2106
2107 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2108 {
2109         int rc = 0;
2110         struct proc_dir_entry *p;
2111
2112         afinfo->seq_ops.start           = tcp_seq_start;
2113         afinfo->seq_ops.next            = tcp_seq_next;
2114         afinfo->seq_ops.stop            = tcp_seq_stop;
2115
2116         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2117                              afinfo->seq_fops, afinfo);
2118         if (!p)
2119                 rc = -ENOMEM;
2120         return rc;
2121 }
2122 EXPORT_SYMBOL(tcp_proc_register);
2123
2124 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2125 {
2126         remove_proc_entry(afinfo->name, net->proc_net);
2127 }
2128 EXPORT_SYMBOL(tcp_proc_unregister);
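
/* Usage sketch with a hypothetical dumper "tcpX" (tcpX_seq_show and
 * tcpX_afinfo_seq_fops are made-up names; the real IPv4 wiring is
 * tcp4_seq_afinfo further down, and tcp_ipv6.c does the same for AF_INET6):
 *
 *	static struct tcp_seq_afinfo tcpX_seq_afinfo = {
 *		.name     = "tcpX",
 *		.family   = AF_INET,
 *		.seq_fops = &tcpX_afinfo_seq_fops,
 *		.seq_ops  = { .show = tcpX_seq_show },
 *	};
 *
 *	static int __net_init tcpX_proc_init_net(struct net *net)
 *	{
 *		return tcp_proc_register(net, &tcpX_seq_afinfo);
 *	}
 */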
2129
2130 static void get_openreq4(const struct request_sock *req,
2131                          struct seq_file *f, int i)
2132 {
2133         const struct inet_request_sock *ireq = inet_rsk(req);
2134         long delta = req->rsk_timer.expires - jiffies;
2135
2136         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2137                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2138                 i,
2139                 ireq->ir_loc_addr,
2140                 ireq->ir_num,
2141                 ireq->ir_rmt_addr,
2142                 ntohs(ireq->ir_rmt_port),
2143                 TCP_SYN_RECV,
2144                 0, 0, /* could print option size, but that is af dependent. */
2145                 1,    /* timers active (only the expire timer) */
2146                 jiffies_delta_to_clock_t(delta),
2147                 req->num_timeout,
2148                 from_kuid_munged(seq_user_ns(f),
2149                                  sock_i_uid(req->rsk_listener)),
2150                 0,  /* non-standard timer */
2151                 0, /* open_requests have no inode */
2152                 0,
2153                 req);
2154 }
2155
2156 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2157 {
2158         int timer_active;
2159         unsigned long timer_expires;
2160         const struct tcp_sock *tp = tcp_sk(sk);
2161         const struct inet_connection_sock *icsk = inet_csk(sk);
2162         const struct inet_sock *inet = inet_sk(sk);
2163         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2164         __be32 dest = inet->inet_daddr;
2165         __be32 src = inet->inet_rcv_saddr;
2166         __u16 destp = ntohs(inet->inet_dport);
2167         __u16 srcp = ntohs(inet->inet_sport);
2168         int rx_queue;
2169         int state;
2170
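        /* Encode the pending timer the way /proc/net/tcp consumers expect:
         * 1 = retransmit (or tail loss probe / early retransmit),
         * 2 = keepalive (sk_timer), 4 = zero-window probe, 0 = none.
         */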
2171         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2172             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2173             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2174                 timer_active    = 1;
2175                 timer_expires   = icsk->icsk_timeout;
2176         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2177                 timer_active    = 4;
2178                 timer_expires   = icsk->icsk_timeout;
2179         } else if (timer_pending(&sk->sk_timer)) {
2180                 timer_active    = 2;
2181                 timer_expires   = sk->sk_timer.expires;
2182         } else {
2183                 timer_active    = 0;
2184                 timer_expires = jiffies;
2185         }
2186
2187         state = sk_state_load(sk);
2188         if (state == TCP_LISTEN)
2189                 rx_queue = sk->sk_ack_backlog;
2190         else
2191                 /* Because we don't lock the socket,
2192                  * we might find a transient negative value.
2193                  */
2194                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2195
2196         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2197                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2198                 i, src, srcp, dest, destp, state,
2199                 tp->write_seq - tp->snd_una,
2200                 rx_queue,
2201                 timer_active,
2202                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2203                 icsk->icsk_retransmits,
2204                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2205                 icsk->icsk_probes_out,
2206                 sock_i_ino(sk),
2207                 atomic_read(&sk->sk_refcnt), sk,
2208                 jiffies_to_clock_t(icsk->icsk_rto),
2209                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2210                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2211                 tp->snd_cwnd,
2212                 state == TCP_LISTEN ?
2213                     fastopenq->max_qlen :
2214                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2215 }
2216
2217 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2218                                struct seq_file *f, int i)
2219 {
2220         long delta = tw->tw_timer.expires - jiffies;
2221         __be32 dest, src;
2222         __u16 destp, srcp;
2223
2224         dest  = tw->tw_daddr;
2225         src   = tw->tw_rcv_saddr;
2226         destp = ntohs(tw->tw_dport);
2227         srcp  = ntohs(tw->tw_sport);
2228
2229         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2230                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2231                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2232                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2233                 atomic_read(&tw->tw_refcnt), tw);
2234 }
2235
2236 #define TMPSZ 150
2237
2238 static int tcp4_seq_show(struct seq_file *seq, void *v)
2239 {
2240         struct tcp_iter_state *st;
2241         struct sock *sk = v;
2242
2243         seq_setwidth(seq, TMPSZ - 1);
2244         if (v == SEQ_START_TOKEN) {
2245                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2246                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2247                            "inode");
2248                 goto out;
2249         }
2250         st = seq->private;
2251
2252         if (sk->sk_state == TCP_TIME_WAIT)
2253                 get_timewait4_sock(v, seq, st->num);
2254         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2255                 get_openreq4(v, seq, st->num);
2256         else
2257                 get_tcp4_sock(v, seq, st->num);
2258 out:
2259         seq_pad(seq, '\n');
2260         return 0;
2261 }
2262
2263 static const struct file_operations tcp_afinfo_seq_fops = {
2264         .owner   = THIS_MODULE,
2265         .open    = tcp_seq_open,
2266         .read    = seq_read,
2267         .llseek  = seq_lseek,
2268         .release = seq_release_net
2269 };
2270
2271 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2272         .name           = "tcp",
2273         .family         = AF_INET,
2274         .seq_fops       = &tcp_afinfo_seq_fops,
2275         .seq_ops        = {
2276                 .show           = tcp4_seq_show,
2277         },
2278 };
2279
2280 static int __net_init tcp4_proc_init_net(struct net *net)
2281 {
2282         return tcp_proc_register(net, &tcp4_seq_afinfo);
2283 }
2284
2285 static void __net_exit tcp4_proc_exit_net(struct net *net)
2286 {
2287         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2288 }
2289
2290 static struct pernet_operations tcp4_net_ops = {
2291         .init = tcp4_proc_init_net,
2292         .exit = tcp4_proc_exit_net,
2293 };
2294
2295 int __init tcp4_proc_init(void)
2296 {
2297         return register_pernet_subsys(&tcp4_net_ops);
2298 }
2299
2300 void tcp4_proc_exit(void)
2301 {
2302         unregister_pernet_subsys(&tcp4_net_ops);
2303 }
2304 #endif /* CONFIG_PROC_FS */
2305
2306 struct proto tcp_prot = {
2307         .name                   = "TCP",
2308         .owner                  = THIS_MODULE,
2309         .close                  = tcp_close,
2310         .connect                = tcp_v4_connect,
2311         .disconnect             = tcp_disconnect,
2312         .accept                 = inet_csk_accept,
2313         .ioctl                  = tcp_ioctl,
2314         .init                   = tcp_v4_init_sock,
2315         .destroy                = tcp_v4_destroy_sock,
2316         .shutdown               = tcp_shutdown,
2317         .setsockopt             = tcp_setsockopt,
2318         .getsockopt             = tcp_getsockopt,
2319         .recvmsg                = tcp_recvmsg,
2320         .sendmsg                = tcp_sendmsg,
2321         .sendpage               = tcp_sendpage,
2322         .backlog_rcv            = tcp_v4_do_rcv,
2323         .release_cb             = tcp_release_cb,
2324         .hash                   = inet_hash,
2325         .unhash                 = inet_unhash,
2326         .get_port               = inet_csk_get_port,
2327         .enter_memory_pressure  = tcp_enter_memory_pressure,
2328         .stream_memory_free     = tcp_stream_memory_free,
2329         .sockets_allocated      = &tcp_sockets_allocated,
2330         .orphan_count           = &tcp_orphan_count,
2331         .memory_allocated       = &tcp_memory_allocated,
2332         .memory_pressure        = &tcp_memory_pressure,
2333         .sysctl_mem             = sysctl_tcp_mem,
2334         .sysctl_wmem            = sysctl_tcp_wmem,
2335         .sysctl_rmem            = sysctl_tcp_rmem,
2336         .max_header             = MAX_TCP_HEADER,
2337         .obj_size               = sizeof(struct tcp_sock),
2338         .slab_flags             = SLAB_DESTROY_BY_RCU,
2339         .twsk_prot              = &tcp_timewait_sock_ops,
2340         .rsk_prot               = &tcp_request_sock_ops,
2341         .h.hashinfo             = &tcp_hashinfo,
2342         .no_autobind            = true,
2343 #ifdef CONFIG_COMPAT
2344         .compat_setsockopt      = compat_tcp_setsockopt,
2345         .compat_getsockopt      = compat_tcp_getsockopt,
2346 #endif
2347 #ifdef CONFIG_MEMCG_KMEM
2348         .init_cgroup            = tcp_init_cgroup,
2349         .destroy_cgroup         = tcp_destroy_cgroup,
2350         .proto_cgroup           = tcp_proto_cgroup,
2351 #endif
2352         .diag_destroy           = tcp_abort,
2353 };
2354 EXPORT_SYMBOL(tcp_prot);
2355
2356 static void __net_exit tcp_sk_exit(struct net *net)
2357 {
2358         int cpu;
2359
2360         for_each_possible_cpu(cpu)
2361                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2362         free_percpu(net->ipv4.tcp_sk);
2363 }
2364
2365 static int __net_init tcp_sk_init(struct net *net)
2366 {
2367         int res, cpu;
2368
2369         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2370         if (!net->ipv4.tcp_sk)
2371                 return -ENOMEM;
2372
2373         for_each_possible_cpu(cpu) {
2374                 struct sock *sk;
2375
2376                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2377                                            IPPROTO_TCP, net);
2378                 if (res)
2379                         goto fail;
2380                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2381         }
2382
2383         net->ipv4.sysctl_tcp_ecn = 2;
2384         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2385
2386         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2387         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2388         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2389
2390         return 0;
2391 fail:
2392         tcp_sk_exit(net);
2393
2394         return res;
2395 }
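
/* The per-CPU control sockets allocated in tcp_sk_init() back the stateless
 * transmit paths in this file; tcp_v4_send_reset() and tcp_v4_send_ack(),
 * for example, send through *this_cpu_ptr(net->ipv4.tcp_sk) instead of
 * through the receiving socket.
 */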
2396
2397 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2398 {
2399         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2400 }
2401
2402 static struct pernet_operations __net_initdata tcp_sk_ops = {
2403        .init       = tcp_sk_init,
2404        .exit       = tcp_sk_exit,
2405        .exit_batch = tcp_sk_exit_batch,
2406 };
2407
2408 void __init tcp_v4_init(void)
2409 {
2410         inet_hashinfo_init(&tcp_hashinfo);
2411         if (register_pernet_subsys(&tcp_sk_ops))
2412                 panic("Failed to create the TCP control socket.\n");
2413 }