folly/stats/MultiLevelTimeSeries.h

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <chrono>
  20 #include <stdexcept>
  21 #include <string>
  22 #include <vector>
  23
  24 #include <folly/String.h>
  25 #include <folly/stats/BucketedTimeSeries.h>
  26 #include <glog/logging.h>
  27
  28 namespace folly {
  29
  30 /*
  31  * This class represents a timeseries which keeps several levels of data
  32  * granularity (similar in principle to the loads reported by the UNIX
  33  * 'uptime' command).  It uses several instances (one per level) of
  34  * BucketedTimeSeries as the underlying storage.
  35  *
  36  * This can easily be used to track sums (and thus rates or averages) over
  37  * several predetermined time periods, as well as all-time sums.  For example,
  38  * you would use to it to track query rate or response speed over the last
  39  * 5, 15, 30, and 60 minutes.
  40  *
  41  * The MultiLevelTimeSeries takes a list of level durations as an input; the
  42  * durations must be strictly increasing.  Furthermore a special level can be
  43  * provided with a duration of '0' -- this will be an "all-time" level.  If
  44  * an all-time level is provided, it MUST be the last level present.
  45  *
  46  * The class assumes that time advances forward --  you can't retroactively add
  47  * values for events in the past -- the 'now' argument is provided for better
  48  * efficiency and ease of unittesting.
  49  *
  50  * The class is not thread-safe -- use your own synchronization!
  51  */
  52 template <typename VT, typename CT = LegacyStatsClock<std::chrono::seconds>>
  53 class MultiLevelTimeSeries {
  54  public:
  55   using ValueType = VT;
  56   using Clock = CT;
  57   using Duration = typename Clock::duration;
  58   using TimePoint = typename Clock::time_point;
  59   using Level = folly::BucketedTimeSeries<ValueType, Clock>;
  60
  61   /*
  62    * Create a new MultiLevelTimeSeries.
  63    *
  64    * This creates a new MultiLevelTimeSeries that tracks time series data at the
  65    * specified time durations (level). The time series data tracked at each
  66    * level is then further divided by numBuckets for memory efficiency.
  67    *
  68    * The durations must be strictly increasing. Furthermore a special level can
  69    * be provided with a duration of '0' -- this will be an "all-time" level. If
  70    * an all-time level is provided, it MUST be the last level present.
  71    */
  72   MultiLevelTimeSeries(
  73       size_t numBuckets,
  74       size_t numLevels,
  75       const Duration levelDurations[]);
  76
  77   MultiLevelTimeSeries(
  78       size_t numBuckets,
  79       std::initializer_list<Duration> durations);
  80
  81   /*
  82    * Return the number of buckets used to track time series at each level.
  83    */
  84   size_t numBuckets() const {
  85     // The constructor ensures that levels_ has at least one item
  86     return levels_[0].numBuckets();
  87   }
  88
  89   /*
  90    * Return the number of levels tracked by MultiLevelTimeSeries.
  91    */
  92   size_t numLevels() const { return levels_.size(); }
  93
  94   /*
  95    * Get the BucketedTimeSeries backing the specified level.
  96    *
  97    * Note: you should generally call update() or flush() before accessing the
  98    * data. Otherwise you may be reading stale data if update() or flush() has
  99    * not been called recently.
 100    */
 101   const Level& getLevel(size_t level) const {
 102     CHECK_LT(level, levels_.size());
 103     return levels_[level];
 104   }
 105
 106   /*
 107    * Get the highest granularity level that is still large enough to contain
 108    * data going back to the specified start time.
 109    *
 110    * Note: you should generally call update() or flush() before accessing the
 111    * data. Otherwise you may be reading stale data if update() or flush() has
 112    * not been called recently.
 113    */
 114   const Level& getLevel(TimePoint start) const {
 115     for (const auto& level : levels_) {
 116       if (level.isAllTime()) {
 117         return level;
 118       }
 119       // Note that we use duration() here rather than elapsed().
 120       // If duration is large enough to contain the start time then this level
 121       // is good enough, even if elapsed() indicates that no data was recorded
 122       // before the specified start time.
 123       if (level.getLatestTime() - level.duration() <= start) {
 124         return level;
 125       }
 126     }
 127     // We should always have an all-time level, so this is never reached.
 128     LOG(FATAL) << "No level of timeseries covers internval"
 129                << " from " << start.time_since_epoch().count() << " to now";
 130     return levels_.back();
 131   }
 132
 133   /*
 134    * Get the BucketedTimeSeries backing the specified level.
 135    *
 136    * Note: you should generally call update() or flush() before accessing the
 137    * data. Otherwise you may be reading stale data if update() or flush() has
 138    * not been called recently.
 139    */
 140   const Level& getLevelByDuration(Duration duration) const {
 141     // since the number of levels is expected to be small (less than 5 in most
 142     // cases), a simple linear scan would be efficient and is intentionally
 143     // chosen here over other alternatives for lookup.
 144     for (const auto& level : levels_) {
 145       if (level.duration() == duration) {
 146         return level;
 147       }
 148     }
 149     throw std::out_of_range(folly::to<std::string>(
 150         "No level of duration ", duration.count(), " found"));
 151   }
 152
 153   /*
 154    * Return the sum of all the data points currently tracked at this level.
 155    *
 156    * Note: you should generally call update() or flush() before accessing the
 157    * data. Otherwise you may be reading stale data if update() or flush() has
 158    * not been called recently.
 159    */
 160   ValueType sum(size_t level) const {
 161     return getLevel(level).sum();
 162   }
 163
 164   /*
 165    * Return the average (sum / count) of all the data points currently tracked
 166    * at this level.
 167    *
 168    * The return type may be specified to control whether floating-point or
 169    * integer division should be performed.
 170    *
 171    * Note: you should generally call update() or flush() before accessing the
 172    * data. Otherwise you may be reading stale data if update() or flush() has
 173    * not been called recently.
 174    */
 175   template <typename ReturnType = double>
 176   ReturnType avg(size_t level) const {
 177     return getLevel(level).template avg<ReturnType>();
 178   }
 179
 180   /*
 181    * Return the rate (sum divided by elaspsed time) of the all data points
 182    * currently tracked at this level.
 183    *
 184    * Note: you should generally call update() or flush() before accessing the
 185    * data. Otherwise you may be reading stale data if update() or flush() has
 186    * not been called recently.
 187    */
 188   template <typename ReturnType = double, typename Interval = Duration>
 189   ReturnType rate(size_t level) const {
 190     return getLevel(level).template rate<ReturnType, Interval>();
 191   }
 192
 193   /*
 194    * Return the number of data points currently tracked at this level.
 195    *
 196    * Note: you should generally call update() or flush() before accessing the
 197    * data. Otherwise you may be reading stale data if update() or flush() has
 198    * not been called recently.
 199    */
 200   uint64_t count(size_t level) const {
 201     return getLevel(level).count();
 202   }
 203
 204   /*
 205    * Return the count divided by the elapsed time tracked at this level.
 206    *
 207    * Note: you should generally call update() or flush() before accessing the
 208    * data. Otherwise you may be reading stale data if update() or flush() has
 209    * not been called recently.
 210    */
 211   template <typename ReturnType = double, typename Interval = Duration>
 212   ReturnType countRate(size_t level) const {
 213     return getLevel(level).template countRate<ReturnType, Interval>();
 214   }
 215
 216   /*
 217    * Return the sum of all the data points currently tracked at this level.
 218    *
 219    * This method is identical to sum(size_t level) above but takes in the
 220    * duration that the user is interested in querying as the parameter.
 221    *
 222    * Note: you should generally call update() or flush() before accessing the
 223    * data. Otherwise you may be reading stale data if update() or flush() has
 224    * not been called recently.
 225    */
 226   ValueType sum(Duration duration) const {
 227     return getLevelByDuration(duration).sum();
 228   }
 229
 230   /*
 231    * Return the average (sum / count) of all the data points currently tracked
 232    * at this level.
 233    *
 234    * This method is identical to avg(size_t level) above but takes in the
 235    * duration that the user is interested in querying as the parameter.
 236    *
 237    * Note: you should generally call update() or flush() before accessing the
 238    * data. Otherwise you may be reading stale data if update() or flush() has
 239    * not been called recently.
 240    */
 241   template <typename ReturnType = double>
 242   ReturnType avg(Duration duration) const {
 243     return getLevelByDuration(duration).template avg<ReturnType>();
 244   }
 245
 246   /*
 247    * Return the rate (sum divided by elaspsed time) of the all data points
 248    * currently tracked at this level.
 249    *
 250    * This method is identical to rate(size_t level) above but takes in the
 251    * duration that the user is interested in querying as the parameter.
 252    *
 253    * Note: you should generally call update() or flush() before accessing the
 254    * data. Otherwise you may be reading stale data if update() or flush() has
 255    * not been called recently.
 256    */
 257   template <typename ReturnType = double, typename Interval = Duration>
 258   ReturnType rate(Duration duration) const {
 259     return getLevelByDuration(duration).template rate<ReturnType, Interval>();
 260   }
 261
 262   /*
 263    * Return the number of data points currently tracked at this level.
 264    *
 265    * This method is identical to count(size_t level) above but takes in the
 266    * duration that the user is interested in querying as the parameter.
 267    *
 268    * Note: you should generally call update() or flush() before accessing the
 269    * data. Otherwise you may be reading stale data if update() or flush() has
 270    * not been called recently.
 271    */
 272   uint64_t count(Duration duration) const {
 273     return getLevelByDuration(duration).count();
 274   }
 275
 276   /*
 277    * Return the count divided by the elapsed time tracked at this level.
 278    *
 279    * This method is identical to countRate(size_t level) above but takes in the
 280    * duration that the user is interested in querying as the parameter.
 281    *
 282    * Note: you should generally call update() or flush() before accessing the
 283    * data. Otherwise you may be reading stale data if update() or flush() has
 284    * not been called recently.
 285    */
 286   template <typename ReturnType = double, typename Interval = Duration>
 287   ReturnType countRate(Duration duration) const {
 288     return getLevelByDuration(duration)
 289         .template countRate<ReturnType, Interval>();
 290   }
 291
 292   /*
 293    * Estimate the sum of the data points that occurred in the specified time
 294    * period at this level.
 295    *
 296    * The range queried is [start, end).
 297    * That is, start is inclusive, and end is exclusive.
 298    *
 299    * Note that data outside of the timeseries duration will no longer be
 300    * available for use in the estimation.  Specifying a start time earlier than
 301    * getEarliestTime() will not have much effect, since only data points after
 302    * that point in time will be counted.
 303    *
 304    * Note that the value returned is an estimate, and may not be precise.
 305    *
 306    * Note: you should generally call update() or flush() before accessing the
 307    * data. Otherwise you may be reading stale data if update() or flush() has
 308    * not been called recently.
 309    */
 310   ValueType sum(TimePoint start, TimePoint end) const {
 311     return getLevel(start).sum(start, end);
 312   }
 313
 314   /*
 315    * Estimate the average value during the specified time period.
 316    *
 317    * The same caveats documented in the sum(TimePoint start, TimePoint end)
 318    * comments apply here as well.
 319    *
 320    * Note: you should generally call update() or flush() before accessing the
 321    * data. Otherwise you may be reading stale data if update() or flush() has
 322    * not been called recently.
 323    */
 324   template <typename ReturnType = double>
 325   ReturnType avg(TimePoint start, TimePoint end) const {
 326     return getLevel(start).template avg<ReturnType>(start, end);
 327   }
 328
 329   /*
 330    * Estimate the rate during the specified time period.
 331    *
 332    * The same caveats documented in the sum(TimePoint start, TimePoint end)
 333    * comments apply here as well.
 334    *
 335    * Note: you should generally call update() or flush() before accessing the
 336    * data. Otherwise you may be reading stale data if update() or flush() has
 337    * not been called recently.
 338    */
 339   template <typename ReturnType = double>
 340   ReturnType rate(TimePoint start, TimePoint end) const {
 341     return getLevel(start).template rate<ReturnType>(start, end);
 342   }
 343
 344   /*
 345    * Estimate the count during the specified time period.
 346    *
 347    * The same caveats documented in the sum(TimePoint start, TimePoint end)
 348    * comments apply here as well.
 349    *
 350    * Note: you should generally call update() or flush() before accessing the
 351    * data. Otherwise you may be reading stale data if update() or flush() has
 352    * not been called recently.
 353    */
 354   uint64_t count(TimePoint start, TimePoint end) const {
 355     return getLevel(start).count(start, end);
 356   }
 357
 358   /*
 359    * Adds the value 'val' at time 'now' to all levels.
 360    *
 361    * Data points added at the same time point is cached internally here and not
 362    * propagated to the underlying levels until either flush() is called or when
 363    * update from a different time comes.
 364    *
 365    * This function expects time to always move forwards: it cannot be used to
 366    * add historical data points that have occurred in the past.  If now is
 367    * older than the another timestamp that has already been passed to
 368    * addValue() or update(), now will be ignored and the latest timestamp will
 369    * be used.
 370    */
 371   void addValue(TimePoint now, const ValueType& val);
 372
 373   /*
 374    * Adds the value 'val' at time 'now' to all levels.
 375    */
 376   void addValue(TimePoint now, const ValueType& val, uint64_t times);
 377
 378   /*
 379    * Adds the value 'total' at time 'now' to all levels as the sum of
 380    * 'nsamples' samples.
 381    */
 382   void
 383   addValueAggregated(TimePoint now, const ValueType& total, uint64_t nsamples);
 384
 385   /*
 386    * Update all the levels to the specified time, doing all the necessary
 387    * work to rotate the buckets and remove any stale data points.
 388    *
 389    * When reading data from the timeseries, you should make sure to manually
 390    * call update() before accessing the data. Otherwise you may be reading
 391    * stale data if update() has not been called recently.
 392    */
 393   void update(TimePoint now);
 394
 395   /*
 396    * Reset all the timeseries to an empty state as if no data points have ever
 397    * been added to it.
 398    */
 399   void clear();
 400
 401   /*
 402    * Flush all cached updates.
 403    */
 404   void flush();
 405
 406   /*
 407    * Legacy APIs that accept a Duration parameters rather than TimePoint.
 408    *
 409    * These treat the Duration as relative to the clock epoch.
 410    * Prefer using the correct TimePoint-based APIs instead.  These APIs will
 411    * eventually be deprecated and removed.
 412    */
 413   void update(Duration now) {
 414     update(TimePoint(now));
 415   }
 416   void addValue(Duration now, const ValueType& value) {
 417     addValue(TimePoint(now), value);
 418   }
 419   void addValue(Duration now, const ValueType& value, uint64_t times) {
 420     addValue(TimePoint(now), value, times);
 421   }
 422   void
 423   addValueAggregated(Duration now, const ValueType& total, uint64_t nsamples) {
 424     addValueAggregated(TimePoint(now), total, nsamples);
 425   }
 426
 427  private:
 428   std::vector<Level> levels_;
 429
 430   // Updates within the same time interval are cached
 431   // They are flushed out when updates from a different time comes,
 432   // or flush() is called.
 433   TimePoint cachedTime_;
 434   ValueType cachedSum_;
 435   uint64_t cachedCount_;
 436 };
 437
 438 } // folly