folly/stats/BucketedTimeSeries-defs.h

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <algorithm>
  20 #include <glog/logging.h>
  21 #include <folly/Likely.h>
  22
  23 namespace folly {
  24
  25 template <typename VT, typename CT>
  26 BucketedTimeSeries<VT, CT>::BucketedTimeSeries(
  27     size_t nBuckets,
  28     TimeType maxDuration)
  29     : firstTime_(1), latestTime_(0), duration_(maxDuration) {
  30   // For tracking all-time data we only use total_, and don't need to bother
  31   // with buckets_
  32   if (!isAllTime()) {
  33     // Round nBuckets down to duration_.count().
  34     //
  35     // There is no point in having more buckets than our timestamp
  36     // granularity: otherwise we would have buckets that could never be used.
  37     if (nBuckets > size_t(duration_.count())) {
  38       nBuckets = duration_.count();
  39     }
  40
  41     buckets_.resize(nBuckets, Bucket());
  42   }
  43 }
  44
  45 template <typename VT, typename CT>
  46 bool BucketedTimeSeries<VT, CT>::addValue(TimeType now, const ValueType& val) {
  47   return addValueAggregated(now, val, 1);
  48 }
  49
  50 template <typename VT, typename CT>
  51 bool BucketedTimeSeries<VT, CT>::addValue(
  52     TimeType now,
  53     const ValueType& val,
  54     int64_t times) {
  55   return addValueAggregated(now, val * times, times);
  56 }
  57
  58 template <typename VT, typename CT>
  59 bool BucketedTimeSeries<VT, CT>::addValueAggregated(
  60     TimeType now,
  61     const ValueType& total,
  62     int64_t nsamples) {
  63   if (isAllTime()) {
  64     if (UNLIKELY(empty())) {
  65       firstTime_ = now;
  66       latestTime_ = now;
  67     } else if (now > latestTime_) {
  68       latestTime_ = now;
  69     } else if (now < firstTime_) {
  70       firstTime_ = now;
  71     }
  72     total_.add(total, nsamples);
  73     return true;
  74   }
  75
  76   size_t bucketIdx;
  77   if (UNLIKELY(empty())) {
  78     // First data point we've ever seen
  79     firstTime_ = now;
  80     latestTime_ = now;
  81     bucketIdx = getBucketIdx(now);
  82   } else if (now > latestTime_) {
  83     // More recent time.  Need to update the buckets.
  84     bucketIdx = updateBuckets(now);
  85   } else if (LIKELY(now == latestTime_)) {
  86     // Current time.
  87     bucketIdx = getBucketIdx(now);
  88   } else {
  89     // An earlier time in the past.  We need to check if this time still falls
  90     // within our window.
  91     if (now < getEarliestTimeNonEmpty()) {
  92       return false;
  93     }
  94     bucketIdx = getBucketIdx(now);
  95   }
  96
  97   total_.add(total, nsamples);
  98   buckets_[bucketIdx].add(total, nsamples);
  99   return true;
 100 }
 101
 102 template <typename VT, typename CT>
 103 size_t BucketedTimeSeries<VT, CT>::update(TimeType now) {
 104   if (empty()) {
 105     // This is the first data point.
 106     firstTime_ = now;
 107   }
 108
 109   // For all-time data, all we need to do is update latestTime_
 110   if (isAllTime()) {
 111     latestTime_ = std::max(latestTime_, now);
 112     return 0;
 113   }
 114
 115   // Make sure time doesn't go backwards.
 116   // If the time is less than or equal to the latest time we have already seen,
 117   // we don't need to do anything.
 118   if (now <= latestTime_) {
 119     return getBucketIdx(latestTime_);
 120   }
 121
 122   return updateBuckets(now);
 123 }
 124
 125 template <typename VT, typename CT>
 126 size_t BucketedTimeSeries<VT, CT>::updateBuckets(TimeType now) {
 127   // We could cache nextBucketStart as a member variable, so we don't have to
 128   // recompute it each time update() is called with a new timestamp value.
 129   // This makes things faster when update() (or addValue()) is called once
 130   // per second, but slightly slower when update() is called multiple times a
 131   // second.  We care more about optimizing the cases where addValue() is being
 132   // called frequently.  If addValue() is only being called once every few
 133   // seconds, it doesn't matter as much if it is fast.
 134
 135   // Get info about the bucket that latestTime_ points at
 136   size_t currentBucket;
 137   TimeType currentBucketStart;
 138   TimeType nextBucketStart;
 139   getBucketInfo(latestTime_, &currentBucket,
 140                 &currentBucketStart, &nextBucketStart);
 141
 142   // Update latestTime_
 143   latestTime_ = now;
 144
 145   if (now < nextBucketStart) {
 146     // We're still in the same bucket.
 147     // We're done after updating latestTime_.
 148     return currentBucket;
 149   } else if (now >= currentBucketStart + duration_) {
 150     // It's been a while.  We have wrapped, and all of the buckets need to be
 151     // cleared.
 152     for (Bucket& bucket : buckets_) {
 153       bucket.clear();
 154     }
 155     total_.clear();
 156     return getBucketIdx(latestTime_);
 157   } else {
 158     // clear all the buckets between the last time and current time, meaning
 159     // buckets in the range [(currentBucket+1), newBucket]. Note that
 160     // the bucket (currentBucket+1) is always the oldest bucket we have. Since
 161     // our array is circular, loop when we reach the end.
 162     size_t newBucket = getBucketIdx(now);
 163     size_t idx = currentBucket;
 164     while (idx != newBucket) {
 165       ++idx;
 166       if (idx >= buckets_.size()) {
 167         idx = 0;
 168       }
 169       total_ -= buckets_[idx];
 170       buckets_[idx].clear();
 171     }
 172     return newBucket;
 173   }
 174 }
 175
 176 template <typename VT, typename CT>
 177 void BucketedTimeSeries<VT, CT>::clear() {
 178   for (Bucket& bucket : buckets_) {
 179     bucket.clear();
 180   }
 181   total_.clear();
 182   // Set firstTime_ larger than latestTime_,
 183   // to indicate that the timeseries is empty
 184   firstTime_ = TimeType(1);
 185   latestTime_ = TimeType(0);
 186 }
 187
 188 template <typename VT, typename CT>
 189 typename CT::duration BucketedTimeSeries<VT, CT>::getEarliestTime() const {
 190   if (empty()) {
 191     return TimeType(0);
 192   }
 193   if (isAllTime()) {
 194     return firstTime_;
 195   }
 196
 197   // Compute the earliest time we can track
 198   TimeType earliestTime = getEarliestTimeNonEmpty();
 199
 200   // We're never tracking data before firstTime_
 201   earliestTime = std::max(earliestTime, firstTime_);
 202
 203   return earliestTime;
 204 }
 205
 206 template <typename VT, typename CT>
 207 typename CT::duration BucketedTimeSeries<VT, CT>::getEarliestTimeNonEmpty()
 208     const {
 209   size_t currentBucket;
 210   TimeType currentBucketStart;
 211   TimeType nextBucketStart;
 212   getBucketInfo(latestTime_, &currentBucket,
 213                 &currentBucketStart, &nextBucketStart);
 214
 215   // Subtract 1 duration from the start of the next bucket to find the
 216   // earliest possible data point we could be tracking.
 217   return nextBucketStart - duration_;
 218 }
 219
 220 template <typename VT, typename CT>
 221 typename CT::duration BucketedTimeSeries<VT, CT>::elapsed() const {
 222   if (empty()) {
 223     return TimeType(0);
 224   }
 225
 226   // Add 1 since [latestTime_, earliestTime] is an inclusive interval.
 227   return latestTime_ - getEarliestTime() + TimeType(1);
 228 }
 229
 230 template <typename VT, typename CT>
 231 typename CT::duration BucketedTimeSeries<VT, CT>::elapsed(
 232     TimeType start,
 233     TimeType end) const {
 234   if (empty()) {
 235     return TimeType(0);
 236   }
 237   start = std::max(start, getEarliestTime());
 238   end = std::min(end, latestTime_ + TimeType(1));
 239   end = std::max(start, end);
 240   return end - start;
 241 }
 242
 243 template <typename VT, typename CT>
 244 VT BucketedTimeSeries<VT, CT>::sum(TimeType start, TimeType end) const {
 245   ValueType total = ValueType();
 246   forEachBucket(start, end, [&](const Bucket& bucket,
 247                                 TimeType bucketStart,
 248                                 TimeType nextBucketStart) -> bool {
 249     total += this->rangeAdjust(bucketStart, nextBucketStart, start, end,
 250                              bucket.sum);
 251     return true;
 252   });
 253
 254   return total;
 255 }
 256
 257 template <typename VT, typename CT>
 258 uint64_t BucketedTimeSeries<VT, CT>::count(TimeType start, TimeType end) const {
 259   uint64_t sample_count = 0;
 260   forEachBucket(start, end, [&](const Bucket& bucket,
 261                                 TimeType bucketStart,
 262                                 TimeType nextBucketStart) -> bool {
 263     sample_count += this->rangeAdjust(bucketStart, nextBucketStart, start, end,
 264                                bucket.count);
 265     return true;
 266   });
 267
 268   return sample_count;
 269 }
 270
 271 template <typename VT, typename CT>
 272 template <typename ReturnType>
 273 ReturnType BucketedTimeSeries<VT, CT>::avg(TimeType start, TimeType end) const {
 274   ValueType total = ValueType();
 275   uint64_t sample_count = 0;
 276   forEachBucket(start, end, [&](const Bucket& bucket,
 277                                 TimeType bucketStart,
 278                                 TimeType nextBucketStart) -> bool {
 279     total += this->rangeAdjust(bucketStart, nextBucketStart, start, end,
 280                              bucket.sum);
 281     sample_count += this->rangeAdjust(bucketStart, nextBucketStart, start, end,
 282                                bucket.count);
 283     return true;
 284   });
 285
 286   if (sample_count == 0) {
 287     return ReturnType(0);
 288   }
 289
 290   return detail::avgHelper<ReturnType>(total, sample_count);
 291 }
 292
 293 /*
 294  * A note about some of the bucket index calculations below:
 295  *
 296  * buckets_.size() may not divide evenly into duration_.  When this happens,
 297  * some buckets will be wider than others.  We still want to spread the data
 298  * out as evenly as possible among the buckets (as opposed to just making the
 299  * last bucket be significantly wider than all of the others).
 300  *
 301  * To make the division work out, we pretend that the buckets are each
 302  * duration_ wide, so that the overall duration becomes
  303  * buckets_.size() * duration_.
 304  *
 305  * To transform a real timestamp into the scale used by our buckets,
 306  * we have to multiply by buckets_.size().  To figure out which bucket it goes
 307  * into, we then divide by duration_.
 308  */
 309
 310 template <typename VT, typename CT>
 311 size_t BucketedTimeSeries<VT, CT>::getBucketIdx(TimeType time) const {
 312   // For all-time data we don't use buckets_.  Everything is tracked in total_.
 313   DCHECK(!isAllTime());
 314
 315   time %= duration_;
 316   return time.count() * buckets_.size() / duration_.count();
 317 }
 318
 319 /*
 320  * Compute the bucket index for the specified time, as well as the earliest
 321  * time that falls into this bucket.
 322  */
 323 template <typename VT, typename CT>
 324 void BucketedTimeSeries<VT, CT>::getBucketInfo(
 325     TimeType time,
 326     size_t* bucketIdx,
 327     TimeType* bucketStart,
 328     TimeType* nextBucketStart) const {
 329   typedef typename TimeType::rep TimeInt;
 330   DCHECK(!isAllTime());
 331
 332   // Keep these two lines together.  The compiler should be able to compute
 333   // both the division and modulus with a single operation.
 334   TimeType timeMod = time % duration_;
 335   TimeInt numFullDurations = time / duration_;
 336
 337   TimeInt scaledTime = timeMod.count() * buckets_.size();
 338
 339   // Keep these two lines together.  The compiler should be able to compute
 340   // both the division and modulus with a single operation.
 341   *bucketIdx = scaledTime / duration_.count();
 342   TimeInt scaledOffsetInBucket = scaledTime % duration_.count();
 343
 344   TimeInt scaledBucketStart = scaledTime - scaledOffsetInBucket;
 345   TimeInt scaledNextBucketStart = scaledBucketStart + duration_.count();
 346
 347   TimeType bucketStartMod((scaledBucketStart + buckets_.size() - 1) /
 348                           buckets_.size());
 349   TimeType nextBucketStartMod((scaledNextBucketStart + buckets_.size() - 1) /
 350                               buckets_.size());
 351
 352   TimeType durationStart(numFullDurations * duration_.count());
 353   *bucketStart = bucketStartMod + durationStart;
 354   *nextBucketStart = nextBucketStartMod + durationStart;
 355 }
 356
 357 template <typename VT, typename CT>
 358 template <typename Function>
 359 void BucketedTimeSeries<VT, CT>::forEachBucket(Function fn) const {
 360   if (isAllTime()) {
 361     fn(total_, firstTime_, latestTime_ + TimeType(1));
 362     return;
 363   }
 364
 365   typedef typename TimeType::rep TimeInt;
 366
 367   // Compute durationStart, latestBucketIdx, and scaledNextBucketStart,
 368   // the same way as in getBucketInfo().
 369   TimeType timeMod = latestTime_ % duration_;
 370   TimeInt numFullDurations = latestTime_ / duration_;
 371   TimeType durationStart(numFullDurations * duration_.count());
 372   TimeInt scaledTime = timeMod.count() * buckets_.size();
 373   size_t latestBucketIdx = scaledTime / duration_.count();
 374   TimeInt scaledOffsetInBucket = scaledTime % duration_.count();
 375   TimeInt scaledBucketStart = scaledTime - scaledOffsetInBucket;
 376   TimeInt scaledNextBucketStart = scaledBucketStart + duration_.count();
 377
 378   // Walk through the buckets, starting one past the current bucket.
 379   // The next bucket is from the previous cycle, so subtract 1 duration
 380   // from durationStart.
 381   size_t idx = latestBucketIdx;
 382   durationStart -= duration_;
 383
 384   TimeType nextBucketStart =
 385     TimeType((scaledNextBucketStart + buckets_.size() - 1) / buckets_.size()) +
 386     durationStart;
 387   while (true) {
 388     ++idx;
 389     if (idx >= buckets_.size()) {
 390       idx = 0;
 391       durationStart += duration_;
 392       scaledNextBucketStart = duration_.count();
 393     } else {
 394       scaledNextBucketStart += duration_.count();
 395     }
 396
 397     TimeType bucketStart = nextBucketStart;
 398     nextBucketStart = TimeType((scaledNextBucketStart + buckets_.size() - 1) /
 399                                buckets_.size()) + durationStart;
 400
 401     // Should we bother skipping buckets where firstTime_ >= nextBucketStart?
 402     // For now we go ahead and invoke the function with these buckets.
 403     // sum and count should always be 0 in these buckets.
 404
 405     DCHECK_LE(bucketStart.count(), latestTime_.count());
 406     bool ret = fn(buckets_[idx], bucketStart, nextBucketStart);
 407     if (!ret) {
 408       break;
 409     }
 410
 411     if (idx == latestBucketIdx) {
 412       // all done
 413       break;
 414     }
 415   }
 416 }
 417
 418 /*
 419  * Adjust the input value from the specified bucket to only account
 420  * for the desired range.
 421  *
 422  * For example, if the bucket spans time [10, 20), but we only care about the
 423  * range [10, 16), this will return 60% of the input value.
 424  */
 425 template <typename VT, typename CT>
 426 VT BucketedTimeSeries<VT, CT>::rangeAdjust(
 427     TimeType bucketStart,
 428     TimeType nextBucketStart,
 429     TimeType start,
 430     TimeType end,
 431     ValueType input) const {
 432   // If nextBucketStart is greater than latestTime_, treat nextBucketStart as
 433   // if it were latestTime_.  This makes us more accurate when someone is
 434   // querying for all of the data up to latestTime_.  Even though latestTime_
 435   // may only be partially through the bucket, we don't want to adjust
 436   // downwards in this case, because the bucket really only has data up to
 437   // latestTime_.
 438   if (bucketStart <= latestTime_ && nextBucketStart > latestTime_) {
 439     nextBucketStart = latestTime_ + TimeType(1);
 440   }
 441
 442   if (start <= bucketStart && end >= nextBucketStart) {
 443     // The bucket is wholly contained in the [start, end) interval
 444     return input;
 445   }
 446
 447   TimeType intervalStart = std::max(start, bucketStart);
 448   TimeType intervalEnd = std::min(end, nextBucketStart);
 449   return input * (intervalEnd - intervalStart) /
 450     (nextBucketStart - bucketStart);
 451 }
 452
 453 template <typename VT, typename CT>
 454 template <typename Function>
 455 void BucketedTimeSeries<VT, CT>::forEachBucket(
 456     TimeType start,
 457     TimeType end,
 458     Function fn) const {
 459   forEachBucket([&start, &end, &fn] (const Bucket& bucket, TimeType bucketStart,
 460                                      TimeType nextBucketStart) -> bool {
 461     if (start >= nextBucketStart) {
 462       return true;
 463     }
 464     if (end <= bucketStart) {
 465       return false;
 466     }
 467     bool ret = fn(bucket, bucketStart, nextBucketStart);
 468     return ret;
 469   });
 470 }
 471
 472 } // folly