#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/tcp.h>
#include <linux/hash.h>
#include <linux/tcp_metrics.h>
#include <linux/vmalloc.h>

#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/genetlink.h>

int sysctl_tcp_nometrics_save __read_mostly;

static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
                                                   struct net *net, unsigned int hash);

struct tcp_fastopen_metrics {
        u16     mss;
        u16     syn_loss:10;            /* Recurring Fast Open SYN losses */
        unsigned long   last_syn_loss;  /* Last Fast Open SYN loss */
        struct  tcp_fastopen_cookie     cookie;
};

struct tcp_metrics_block {
        struct tcp_metrics_block __rcu  *tcpm_next;
        struct inetpeer_addr            tcpm_addr;
        unsigned long                   tcpm_stamp;
        u32                             tcpm_ts;
        u32                             tcpm_ts_stamp;
        u32                             tcpm_lock;
        u32                             tcpm_vals[TCP_METRIC_MAX + 1];
        struct tcp_fastopen_metrics     tcpm_fastopen;

        struct rcu_head                 rcu_head;
};

static bool tcp_metric_locked(struct tcp_metrics_block *tm,
                              enum tcp_metric_index idx)
{
        return tm->tcpm_lock & (1 << idx);
}

static u32 tcp_metric_get(struct tcp_metrics_block *tm,
                          enum tcp_metric_index idx)
{
        return tm->tcpm_vals[idx];
}

static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
                                  enum tcp_metric_index idx)
{
        return msecs_to_jiffies(tm->tcpm_vals[idx]);
}

static void tcp_metric_set(struct tcp_metrics_block *tm,
                           enum tcp_metric_index idx,
                           u32 val)
{
        tm->tcpm_vals[idx] = val;
}

static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
                                 enum tcp_metric_index idx,
                                 u32 val)
{
        tm->tcpm_vals[idx] = jiffies_to_msecs(val);
}

static bool addr_same(const struct inetpeer_addr *a,
                      const struct inetpeer_addr *b)
{
        const struct in6_addr *a6, *b6;

        if (a->family != b->family)
                return false;
        if (a->family == AF_INET)
                return a->addr.a4 == b->addr.a4;

        a6 = (const struct in6_addr *) &a->addr.a6[0];
        b6 = (const struct in6_addr *) &b->addr.a6[0];

        return ipv6_addr_equal(a6, b6);
}

struct tcpm_hash_bucket {
        struct tcp_metrics_block __rcu  *chain;
};

static DEFINE_SPINLOCK(tcp_metrics_lock);

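/* Refresh a metrics block from the route: note which metrics are locked
 * on the dst, copy the raw RTAX_* values, clear the cached timestamp
 * state and, if requested, the Fast Open state, and stamp the entry.
 */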
static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
                          bool fastopen_clear)
{
        u32 val;

        tm->tcpm_stamp = jiffies;

        val = 0;
        if (dst_metric_locked(dst, RTAX_RTT))
                val |= 1 << TCP_METRIC_RTT;
        if (dst_metric_locked(dst, RTAX_RTTVAR))
                val |= 1 << TCP_METRIC_RTTVAR;
        if (dst_metric_locked(dst, RTAX_SSTHRESH))
                val |= 1 << TCP_METRIC_SSTHRESH;
        if (dst_metric_locked(dst, RTAX_CWND))
                val |= 1 << TCP_METRIC_CWND;
        if (dst_metric_locked(dst, RTAX_REORDERING))
                val |= 1 << TCP_METRIC_REORDERING;
        tm->tcpm_lock = val;

        tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
        tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
        tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
        tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
        tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
        tm->tcpm_ts = 0;
        tm->tcpm_ts_stamp = 0;
        if (fastopen_clear) {
                tm->tcpm_fastopen.mss = 0;
                tm->tcpm_fastopen.syn_loss = 0;
                tm->tcpm_fastopen.cookie.len = 0;
        }
}

#define TCP_METRICS_TIMEOUT             (60 * 60 * HZ)

static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
{
        if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
                tcpm_suck_dst(tm, dst, false);
}

#define TCP_METRICS_RECLAIM_DEPTH       5
#define TCP_METRICS_RECLAIM_PTR         (struct tcp_metrics_block *) 0x1UL

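/* Create a metrics block for @addr, or recycle the oldest entry in the
 * chain when the lookup reported the chain as too long.  Serialised by
 * tcp_metrics_lock; the chain is re-checked under the lock in case
 * another CPU inserted the entry first.
 */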
static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
                                          struct inetpeer_addr *addr,
                                          unsigned int hash)
{
        struct tcp_metrics_block *tm;
        struct net *net;
        bool reclaim = false;

        spin_lock_bh(&tcp_metrics_lock);
        net = dev_net(dst->dev);

        /* While waiting for the spin-lock the cache might have been populated
         * with this entry and so we have to check again.
         */
        tm = __tcp_get_metrics(addr, net, hash);
        if (tm == TCP_METRICS_RECLAIM_PTR) {
                reclaim = true;
                tm = NULL;
        }
        if (tm) {
                tcpm_check_stamp(tm, dst);
                goto out_unlock;
        }

        if (unlikely(reclaim)) {
                struct tcp_metrics_block *oldest;

                oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
                for (tm = rcu_dereference(oldest->tcpm_next); tm;
                     tm = rcu_dereference(tm->tcpm_next)) {
                        if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
                                oldest = tm;
                }
                tm = oldest;
        } else {
                tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
                if (!tm)
                        goto out_unlock;
        }
        tm->tcpm_addr = *addr;

        tcpm_suck_dst(tm, dst, true);

        if (likely(!reclaim)) {
                tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
                rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
        }

out_unlock:
        spin_unlock_bh(&tcp_metrics_lock);
        return tm;
}

static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
{
        if (tm)
                return tm;
        if (depth > TCP_METRICS_RECLAIM_DEPTH)
                return TCP_METRICS_RECLAIM_PTR;
        return NULL;
}

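/* Walk one hash chain under RCU looking for @addr.  Returns the matching
 * block, NULL if it is absent, or TCP_METRICS_RECLAIM_PTR if the chain
 * has grown past TCP_METRICS_RECLAIM_DEPTH without a match.
 */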
static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
                                                   struct net *net, unsigned int hash)
{
        struct tcp_metrics_block *tm;
        int depth = 0;

        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_addr, addr))
                        break;
                depth++;
        }
        return tcp_get_encode(tm, depth);
}

static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
                                                       struct dst_entry *dst)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr addr;
        unsigned int hash;
        struct net *net;

        addr.family = req->rsk_ops->family;
        switch (addr.family) {
        case AF_INET:
                addr.addr.a4 = inet_rsk(req)->rmt_addr;
                hash = (__force unsigned int) addr.addr.a4;
                break;
        case AF_INET6:
                *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
                hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
                break;
        default:
                return NULL;
        }

        net = dev_net(dst->dev);
        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_addr, &addr))
                        break;
        }
        tcpm_check_stamp(tm, dst);
        return tm;
}

static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
{
        struct inet6_timewait_sock *tw6;
        struct tcp_metrics_block *tm;
        struct inetpeer_addr addr;
        unsigned int hash;
        struct net *net;

        addr.family = tw->tw_family;
        switch (addr.family) {
        case AF_INET:
                addr.addr.a4 = tw->tw_daddr;
                hash = (__force unsigned int) addr.addr.a4;
                break;
        case AF_INET6:
                tw6 = inet6_twsk((struct sock *)tw);
                *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
                hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
                break;
        default:
                return NULL;
        }

        net = twsk_net(tw);
        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_addr, &addr))
                        break;
        }
        return tm;
}

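/* Look up (and optionally create) the metrics block for the socket's
 * peer address.  Callers hold rcu_read_lock().
 */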
static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
                                                 struct dst_entry *dst,
                                                 bool create)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr addr;
        unsigned int hash;
        struct net *net;

        addr.family = sk->sk_family;
        switch (addr.family) {
        case AF_INET:
                addr.addr.a4 = inet_sk(sk)->inet_daddr;
                hash = (__force unsigned int) addr.addr.a4;
                break;
        case AF_INET6:
                *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
                hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
                break;
        default:
                return NULL;
        }

        net = dev_net(dst->dev);
        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

        tm = __tcp_get_metrics(&addr, net, hash);
        if (tm == TCP_METRICS_RECLAIM_PTR)
                tm = NULL;
        if (!tm && create)
                tm = tcpm_new(dst, &addr, hash);
        else
                tcpm_check_stamp(tm, dst);

        return tm;
}

/* Save metrics learned by this TCP session.  This function is called
 * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
 * or goes from LAST-ACK to CLOSE.
 */
void tcp_update_metrics(struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_metrics_block *tm;
        unsigned long rtt;
        u32 val;
        int m;

        if (sysctl_tcp_nometrics_save || !dst)
                return;

        if (dst->flags & DST_HOST)
                dst_confirm(dst);

        rcu_read_lock();
        if (icsk->icsk_backoff || !tp->srtt) {
                /* This session failed to estimate rtt. Why?
                 * Probably, no packets returned in time.  Reset our
                 * results.
                 */
                tm = tcp_get_metrics(sk, dst, false);
                if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
                        tcp_metric_set(tm, TCP_METRIC_RTT, 0);
                goto out_unlock;
        } else
                tm = tcp_get_metrics(sk, dst, true);

        if (!tm)
                goto out_unlock;

        rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
        m = rtt - tp->srtt;

        /* If the newly calculated rtt is larger than the stored one,
         * store the new one.  Otherwise, use an EWMA.  Remember, rtt
         * overestimation is always better than underestimation.
         */
        if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
                if (m <= 0)
                        rtt = tp->srtt;
                else
                        rtt -= (m >> 3);
                tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
        }

        if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
                unsigned long var;

                if (m < 0)
                        m = -m;

                /* Scale deviation to rttvar fixed point */
                m >>= 1;
                if (m < tp->mdev)
                        m = tp->mdev;

                var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
                if (m >= var)
                        var = m;
                else
                        var -= (var - m) >> 2;

                tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
        }

        if (tcp_in_initial_slowstart(tp)) {
                /* Slow start still did not finish. */
                if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
                        val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
                        if (val && (tp->snd_cwnd >> 1) > val)
                                tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                               tp->snd_cwnd >> 1);
                }
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        if (tp->snd_cwnd > val)
                                tcp_metric_set(tm, TCP_METRIC_CWND,
                                               tp->snd_cwnd);
                }
        } else if (tp->snd_cwnd > tp->snd_ssthresh &&
                   icsk->icsk_ca_state == TCP_CA_Open) {
                /* Cong. avoidance phase, cwnd is reliable. */
                if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
                        tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
                }
        } else {
                /* Else slow start did not finish, cwnd is nonsense and
                 * ssthresh may also be invalid.
                 */
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        tcp_metric_set(tm, TCP_METRIC_CWND,
                                       (val + tp->snd_ssthresh) >> 1);
                }
                if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
                        val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
                        if (val && tp->snd_ssthresh > val)
                                tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                               tp->snd_ssthresh);
                }
                if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
                        val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
                        if (val < tp->reordering &&
                            tp->reordering != sysctl_tcp_reordering)
                                tcp_metric_set(tm, TCP_METRIC_REORDERING,
                                               tp->reordering);
                }
        }
        tm->tcpm_stamp = jiffies;
out_unlock:
        rcu_read_unlock();
}

/* Initialize metrics on socket. */

void tcp_init_metrics(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_metrics_block *tm;
        u32 val;

        if (dst == NULL)
                goto reset;

        dst_confirm(dst);

        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, true);
        if (!tm) {
                rcu_read_unlock();
                goto reset;
        }

        if (tcp_metric_locked(tm, TCP_METRIC_CWND))
                tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);

        val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
        if (val) {
                tp->snd_ssthresh = val;
                if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
                        tp->snd_ssthresh = tp->snd_cwnd_clamp;
        } else {
                /* ssthresh may have been reduced unnecessarily during
                 * the 3WHS.  Restore it back to its initial default.
                 */
                tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
        }
        val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
        if (val && tp->reordering != val) {
                tcp_disable_fack(tp);
                tcp_disable_early_retrans(tp);
                tp->reordering = val;
        }

        val = tcp_metric_get(tm, TCP_METRIC_RTT);
        if (val == 0 || tp->srtt == 0) {
                rcu_read_unlock();
                goto reset;
        }
        /* The initial rtt is determined from the SYN and SYN-ACK.
         * Those segments are small and the rtt may appear much
         * lower than the real one.  Use the per-dst memory
         * to make it more realistic.
         *
         * A bit of theory.  RTT is the time that passes after a "normal"
         * sized packet is sent until it is ACKed.  In normal circumstances
         * sending small packets forces the peer to delay ACKs, so the
         * calculation is correct there too.  The algorithm is adaptive and,
         * provided we follow the specs, it NEVER underestimates RTT.  BUT!
         * If the peer plays clever tricks, sending "quick acks" for long
         * enough to push RTT down to a low value and then abruptly starting
         * to delay ACKs, expect trouble.
         */
        val = msecs_to_jiffies(val);
        if (val > tp->srtt) {
                tp->srtt = val;
                tp->rtt_seq = tp->snd_nxt;
        }
        val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
        if (val > tp->mdev) {
                tp->mdev = val;
                tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
        }
        rcu_read_unlock();

        tcp_set_rto(sk);
reset:
        if (tp->srtt == 0) {
                /* RFC6298 5.7: We've failed to get a valid RTT sample from
                 * the 3WHS.  This is most likely due to retransmission,
                 * possibly a spurious one.  Reset the RTO back to 3 secs
                 * from the more aggressive 1 sec to avoid further spurious
                 * retransmissions.
                 */
                tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
                inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
        }
        /* Cut cwnd down to 1 per RFC5681 if the SYN or SYN-ACK has been
         * retransmitted.  In light of RFC6298's more aggressive 1 sec
         * initRTO, we only reset cwnd when more than one SYN/SYN-ACK
         * retransmission has occurred.
         */
        if (tp->total_retrans > 1)
                tp->snd_cwnd = 1;
        else
                tp->snd_cwnd = tcp_init_cwnd(tp, dst);
        tp->snd_cwnd_stamp = tcp_time_stamp;
}

bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
{
        struct tcp_metrics_block *tm;
        bool ret;

        if (!dst)
                return false;

        rcu_read_lock();
        tm = __tcp_get_metrics_req(req, dst);
        if (paws_check) {
                if (tm &&
                    (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
                    (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
                        ret = false;
                else
                        ret = true;
        } else {
                if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
                        ret = true;
                else
                        ret = false;
        }
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(tcp_peer_is_proven);

void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
{
        struct tcp_metrics_block *tm;

        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, true);
        if (tm) {
                struct tcp_sock *tp = tcp_sk(sk);

                if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
                        tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
                        tp->rx_opt.ts_recent = tm->tcpm_ts;
                }
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);

/* VJ's idea. Save last timestamp seen from this destination and hold
 * it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter
 * synchronized state.
 */
bool tcp_remember_stamp(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        bool ret = false;

        if (dst) {
                struct tcp_metrics_block *tm;

                rcu_read_lock();
                tm = tcp_get_metrics(sk, dst, true);
                if (tm) {
                        struct tcp_sock *tp = tcp_sk(sk);

                        if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
                            ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
                             tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
                                tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
                                tm->tcpm_ts = tp->rx_opt.ts_recent;
                        }
                        ret = true;
                }
                rcu_read_unlock();
        }
        return ret;
}

bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
{
        struct tcp_metrics_block *tm;
        bool ret = false;

        rcu_read_lock();
        tm = __tcp_get_metrics_tw(tw);
        if (tm) {
                const struct tcp_timewait_sock *tcptw;
                struct sock *sk = (struct sock *) tw;

                tcptw = tcp_twsk(sk);
                if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
                    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
                     tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
                        tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
                        tm->tcpm_ts        = tcptw->tw_ts_recent;
                }
                ret = true;
        }
        rcu_read_unlock();

        return ret;
}

static DEFINE_SEQLOCK(fastopen_seqlock);

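/* Read the cached Fast Open state (MSS, cookie, SYN-loss count) for the
 * socket's destination, retrying under fastopen_seqlock if an update
 * races with us.
 */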
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
                            struct tcp_fastopen_cookie *cookie,
                            int *syn_loss, unsigned long *last_syn_loss)
{
        struct tcp_metrics_block *tm;

        rcu_read_lock();
        tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
        if (tm) {
                struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
                unsigned int seq;

                do {
                        seq = read_seqbegin(&fastopen_seqlock);
                        if (tfom->mss)
                                *mss = tfom->mss;
                        *cookie = tfom->cookie;
                        *syn_loss = tfom->syn_loss;
                        *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
                } while (read_seqretry(&fastopen_seqlock, seq));
        }
        rcu_read_unlock();
}

void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
                            struct tcp_fastopen_cookie *cookie, bool syn_lost)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_metrics_block *tm;

        if (!dst)
                return;
        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, true);
        if (tm) {
                struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;

                write_seqlock_bh(&fastopen_seqlock);
                tfom->mss = mss;
                if (cookie->len > 0)
                        tfom->cookie = *cookie;
                if (syn_lost) {
                        ++tfom->syn_loss;
                        tfom->last_syn_loss = jiffies;
                } else
                        tfom->syn_loss = 0;
                write_sequnlock_bh(&fastopen_seqlock);
        }
        rcu_read_unlock();
}

static struct genl_family tcp_metrics_nl_family = {
        .id             = GENL_ID_GENERATE,
        .hdrsize        = 0,
        .name           = TCP_METRICS_GENL_NAME,
        .version        = TCP_METRICS_GENL_VERSION,
        .maxattr        = TCP_METRICS_ATTR_MAX,
        .netnsok        = true,
};

static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
        [TCP_METRICS_ATTR_ADDR_IPV4]    = { .type = NLA_U32, },
        [TCP_METRICS_ATTR_ADDR_IPV6]    = { .type = NLA_BINARY,
                                            .len = sizeof(struct in6_addr), },
        /* Following attributes are not received for GET/DEL,
         * we keep them for reference
         */
#if 0
        [TCP_METRICS_ATTR_AGE]          = { .type = NLA_MSECS, },
        [TCP_METRICS_ATTR_TW_TSVAL]     = { .type = NLA_U32, },
        [TCP_METRICS_ATTR_TW_TS_STAMP]  = { .type = NLA_S32, },
        [TCP_METRICS_ATTR_VALS]         = { .type = NLA_NESTED, },
        [TCP_METRICS_ATTR_FOPEN_MSS]    = { .type = NLA_U16, },
        [TCP_METRICS_ATTR_FOPEN_SYN_DROPS]      = { .type = NLA_U16, },
        [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS]    = { .type = NLA_MSECS, },
        [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
                                            .len = TCP_FASTOPEN_COOKIE_MAX, },
#endif
};

/* Add attributes, caller cancels its header on failure */
static int tcp_metrics_fill_info(struct sk_buff *msg,
                                 struct tcp_metrics_block *tm)
{
        struct nlattr *nest;
        int i;

        switch (tm->tcpm_addr.family) {
        case AF_INET:
                if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
                                tm->tcpm_addr.addr.a4) < 0)
                        goto nla_put_failure;
                break;
        case AF_INET6:
                if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
                            tm->tcpm_addr.addr.a6) < 0)
                        goto nla_put_failure;
                break;
        default:
                return -EAFNOSUPPORT;
        }

        if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
                          jiffies - tm->tcpm_stamp) < 0)
                goto nla_put_failure;
        if (tm->tcpm_ts_stamp) {
                if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
                                (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
                        goto nla_put_failure;
                if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
                                tm->tcpm_ts) < 0)
                        goto nla_put_failure;
        }

        {
                int n = 0;

                nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
                if (!nest)
                        goto nla_put_failure;
                for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
                        if (!tm->tcpm_vals[i])
                                continue;
                        if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
                                goto nla_put_failure;
                        n++;
                }
                if (n)
                        nla_nest_end(msg, nest);
                else
                        nla_nest_cancel(msg, nest);
        }

        {
                struct tcp_fastopen_metrics tfom_copy[1], *tfom;
                unsigned int seq;

                do {
                        seq = read_seqbegin(&fastopen_seqlock);
                        tfom_copy[0] = tm->tcpm_fastopen;
                } while (read_seqretry(&fastopen_seqlock, seq));

                tfom = tfom_copy;
                if (tfom->mss &&
                    nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
                                tfom->mss) < 0)
                        goto nla_put_failure;
                if (tfom->syn_loss &&
                    (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
                                tfom->syn_loss) < 0 ||
                     nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
                                jiffies - tfom->last_syn_loss) < 0))
                        goto nla_put_failure;
                if (tfom->cookie.len > 0 &&
                    nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
                            tfom->cookie.len, tfom->cookie.val) < 0)
                        goto nla_put_failure;
        }

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

static int tcp_metrics_dump_info(struct sk_buff *skb,
                                 struct netlink_callback *cb,
                                 struct tcp_metrics_block *tm)
{
        void *hdr;

        hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
                          &tcp_metrics_nl_family, NLM_F_MULTI,
                          TCP_METRICS_CMD_GET);
        if (!hdr)
                return -EMSGSIZE;

        if (tcp_metrics_fill_info(skb, tm) < 0)
                goto nla_put_failure;

        return genlmsg_end(skb, hdr);

nla_put_failure:
        genlmsg_cancel(skb, hdr);
        return -EMSGSIZE;
}

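/* Netlink dump callback: walk every hash bucket and emit one message per
 * metrics entry, resuming from the row/column cursor kept in cb->args.
 */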
static int tcp_metrics_nl_dump(struct sk_buff *skb,
                               struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
        unsigned int row, s_row = cb->args[0];
        int s_col = cb->args[1], col = s_col;

        for (row = s_row; row < max_rows; row++, s_col = 0) {
                struct tcp_metrics_block *tm;
                struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;

                rcu_read_lock();
                for (col = 0, tm = rcu_dereference(hb->chain); tm;
                     tm = rcu_dereference(tm->tcpm_next), col++) {
                        if (col < s_col)
                                continue;
                        if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
                                rcu_read_unlock();
                                goto done;
                        }
                }
                rcu_read_unlock();
        }

done:
        cb->args[0] = row;
        cb->args[1] = col;
        return skb->len;
}

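/* Pull the peer address out of a genetlink request (IPv4 or IPv6
 * attribute) and compute the unfolded hash for it.  Returns 1 when no
 * address attribute is present and @optional is set.
 */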
static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
                         unsigned int *hash, int optional)
{
        struct nlattr *a;

        a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4];
        if (a) {
                addr->family = AF_INET;
                addr->addr.a4 = nla_get_be32(a);
                *hash = (__force unsigned int) addr->addr.a4;
                return 0;
        }
        a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
        if (a) {
                if (nla_len(a) != sizeof(struct in6_addr))
                        return -EINVAL;
                addr->family = AF_INET6;
                memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
                *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
                return 0;
        }
        return optional ? 1 : -EAFNOSUPPORT;
}

static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr addr;
        unsigned int hash;
        struct sk_buff *msg;
        struct net *net = genl_info_net(info);
        void *reply;
        int ret;

        ret = parse_nl_addr(info, &addr, &hash, 0);
        if (ret < 0)
                return ret;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
                                  info->genlhdr->cmd);
        if (!reply)
                goto nla_put_failure;

        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
        ret = -ESRCH;
        rcu_read_lock();
        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_addr, &addr)) {
                        ret = tcp_metrics_fill_info(msg, tm);
                        break;
                }
        }
        rcu_read_unlock();
        if (ret < 0)
                goto out_free;

        genlmsg_end(msg, reply);
        return genlmsg_reply(msg, info);

nla_put_failure:
        ret = -EMSGSIZE;

out_free:
        nlmsg_free(msg);
        return ret;
}

#define deref_locked_genl(p)    \
        rcu_dereference_protected(p, lockdep_genl_is_held() && \
                                     lockdep_is_held(&tcp_metrics_lock))

#define deref_genl(p)   rcu_dereference_protected(p, lockdep_genl_is_held())

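/* Unlink and free every metrics entry in this namespace (DEL command
 * with no address attribute).
 */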
static int tcp_metrics_flush_all(struct net *net)
{
        unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
        struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
        struct tcp_metrics_block *tm;
        unsigned int row;

        for (row = 0; row < max_rows; row++, hb++) {
                spin_lock_bh(&tcp_metrics_lock);
                tm = deref_locked_genl(hb->chain);
                if (tm)
                        hb->chain = NULL;
                spin_unlock_bh(&tcp_metrics_lock);
                while (tm) {
                        struct tcp_metrics_block *next;

                        next = deref_genl(tm->tcpm_next);
                        kfree_rcu(tm, rcu_head);
                        tm = next;
                }
        }
        return 0;
}

static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct tcpm_hash_bucket *hb;
        struct tcp_metrics_block *tm;
        struct tcp_metrics_block __rcu **pp;
        struct inetpeer_addr addr;
        unsigned int hash;
        struct net *net = genl_info_net(info);
        int ret;

        ret = parse_nl_addr(info, &addr, &hash, 1);
        if (ret < 0)
                return ret;
        if (ret > 0)
                return tcp_metrics_flush_all(net);

        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
        hb = net->ipv4.tcp_metrics_hash + hash;
        pp = &hb->chain;
        spin_lock_bh(&tcp_metrics_lock);
        for (tm = deref_locked_genl(*pp); tm;
             pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) {
                if (addr_same(&tm->tcpm_addr, &addr)) {
                        *pp = tm->tcpm_next;
                        break;
                }
        }
        spin_unlock_bh(&tcp_metrics_lock);
        if (!tm)
                return -ESRCH;
        kfree_rcu(tm, rcu_head);
        return 0;
}

static struct genl_ops tcp_metrics_nl_ops[] = {
        {
                .cmd = TCP_METRICS_CMD_GET,
                .doit = tcp_metrics_nl_cmd_get,
                .dumpit = tcp_metrics_nl_dump,
                .policy = tcp_metrics_nl_policy,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = TCP_METRICS_CMD_DEL,
                .doit = tcp_metrics_nl_cmd_del,
                .policy = tcp_metrics_nl_policy,
                .flags = GENL_ADMIN_PERM,
        },
};

static unsigned int tcpmhash_entries;
static int __init set_tcpmhash_entries(char *str)
{
        ssize_t ret;

        if (!str)
                return 0;

        ret = kstrtouint(str, 0, &tcpmhash_entries);
        if (ret)
                return 0;

        return 1;
}
__setup("tcpmhash_entries=", set_tcpmhash_entries);

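/* Per-namespace setup: size the hash table from the "tcpmhash_entries="
 * boot parameter or from available memory, then allocate it, falling
 * back to vzalloc() if kzalloc() fails.
 */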
static int __net_init tcp_net_metrics_init(struct net *net)
{
        size_t size;
        unsigned int slots;

        slots = tcpmhash_entries;
        if (!slots) {
                if (totalram_pages >= 128 * 1024)
                        slots = 16 * 1024;
                else
                        slots = 8 * 1024;
        }

        net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
        size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;

        net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
        if (!net->ipv4.tcp_metrics_hash)
                net->ipv4.tcp_metrics_hash = vzalloc(size);

        if (!net->ipv4.tcp_metrics_hash)
                return -ENOMEM;

        return 0;
}

static void __net_exit tcp_net_metrics_exit(struct net *net)
{
        unsigned int i;

        for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
                struct tcp_metrics_block *tm, *next;

                tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
                while (tm) {
                        next = rcu_dereference_protected(tm->tcpm_next, 1);
                        kfree(tm);
                        tm = next;
                }
        }
        if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash))
                vfree(net->ipv4.tcp_metrics_hash);
        else
                kfree(net->ipv4.tcp_metrics_hash);
}

static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
        .init   =       tcp_net_metrics_init,
        .exit   =       tcp_net_metrics_exit,
};

void __init tcp_metrics_init(void)
{
        int ret;

        ret = register_pernet_subsys(&tcp_net_metrics_ops);
        if (ret < 0)
                goto cleanup;
        ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
                                            tcp_metrics_nl_ops,
                                            ARRAY_SIZE(tcp_metrics_nl_ops));
        if (ret < 0)
                goto cleanup_subsys;
        return;

cleanup_subsys:
        unregister_pernet_subsys(&tcp_net_metrics_ops);

cleanup:
        return;
}