folly/stats/MultiLevelTimeSeries.h

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <chrono>
  20 #include <stdexcept>
  21 #include <string>
  22 #include <vector>
  23
  24 #include <folly/String.h>
  25 #include <folly/stats/BucketedTimeSeries.h>
  26 #include <glog/logging.h>
  27
  28 namespace folly {
  29
  30 /*
  31  * This class represents a timeseries which keeps several levels of data
  32  * granularity (similar in principle to the loads reported by the UNIX
  33  * 'uptime' command).  It uses several instances (one per level) of
  34  * BucketedTimeSeries as the underlying storage.
  35  *
  36  * This can easily be used to track sums (and thus rates or averages) over
  37  * several predetermined time periods, as well as all-time sums.  For example,
  38  * you would use to it to track query rate or response speed over the last
  39  * 5, 15, 30, and 60 minutes.
  40  *
  41  * The MultiLevelTimeSeries takes a list of level durations as an input; the
  42  * durations must be strictly increasing.  Furthermore a special level can be
  43  * provided with a duration of '0' -- this will be an "all-time" level.  If
  44  * an all-time level is provided, it MUST be the last level present.
  45  *
  46  * The class assumes that time advances forward --  you can't retroactively add
  47  * values for events in the past -- the 'now' argument is provided for better
  48  * efficiency and ease of unittesting.
  49  *
  50  * The class is not thread-safe -- use your own synchronization!
  51  */
  52 template <typename VT, typename CT = LegacyStatsClock<std::chrono::seconds>>
  53 class MultiLevelTimeSeries {
  54  public:
  55   using ValueType = VT;
  56   using Clock = CT;
  57   using Duration = typename Clock::duration;
  58   using TimePoint = typename Clock::time_point;
  59   using Level = folly::BucketedTimeSeries<ValueType, Clock>;
  60
  61   /*
  62    * Create a new MultiLevelTimeSeries.
  63    *
  64    * This creates a new MultiLevelTimeSeries that tracks time series data at the
  65    * specified time durations (level). The time series data tracked at each
  66    * level is then further divided by numBuckets for memory efficiency.
  67    *
  68    * The durations must be strictly increasing. Furthermore a special level can
  69    * be provided with a duration of '0' -- this will be an "all-time" level. If
  70    * an all-time level is provided, it MUST be the last level present.
  71    */
  72   MultiLevelTimeSeries(
  73       size_t numBuckets,
  74       size_t numLevels,
  75       const Duration levelDurations[]);
  76
  77   MultiLevelTimeSeries(
  78       size_t numBuckets,
  79       std::initializer_list<Duration> durations);
  80
  81   /*
  82    * Return the number of buckets used to track time series at each level.
  83    */
  84   size_t numBuckets() const {
  85     // The constructor ensures that levels_ has at least one item
  86     return levels_[0].numBuckets();
  87   }
  88
  89   /*
  90    * Return the number of levels tracked by MultiLevelTimeSeries.
  91    */
  92   size_t numLevels() const {
  93     return levels_.size();
  94   }
  95
  96   /*
  97    * Get the BucketedTimeSeries backing the specified level.
  98    *
  99    * Note: you should generally call update() or flush() before accessing the
 100    * data. Otherwise you may be reading stale data if update() or flush() has
 101    * not been called recently.
 102    */
 103   const Level& getLevel(size_t level) const {
 104     CHECK_LT(level, levels_.size());
 105     return levels_[level];
 106   }
 107
 108   /*
 109    * Get the highest granularity level that is still large enough to contain
 110    * data going back to the specified start time.
 111    *
 112    * Note: you should generally call update() or flush() before accessing the
 113    * data. Otherwise you may be reading stale data if update() or flush() has
 114    * not been called recently.
 115    */
 116   const Level& getLevel(TimePoint start) const {
 117     for (const auto& level : levels_) {
 118       if (level.isAllTime()) {
 119         return level;
 120       }
 121       // Note that we use duration() here rather than elapsed().
 122       // If duration is large enough to contain the start time then this level
 123       // is good enough, even if elapsed() indicates that no data was recorded
 124       // before the specified start time.
 125       if (level.getLatestTime() - level.duration() <= start) {
 126         return level;
 127       }
 128     }
 129     // We should always have an all-time level, so this is never reached.
 130     LOG(FATAL) << "No level of timeseries covers internval"
 131                << " from " << start.time_since_epoch().count() << " to now";
 132     return levels_.back();
 133   }
 134
 135   /*
 136    * Get the BucketedTimeSeries backing the specified level.
 137    *
 138    * Note: you should generally call update() or flush() before accessing the
 139    * data. Otherwise you may be reading stale data if update() or flush() has
 140    * not been called recently.
 141    */
 142   const Level& getLevelByDuration(Duration duration) const {
 143     // since the number of levels is expected to be small (less than 5 in most
 144     // cases), a simple linear scan would be efficient and is intentionally
 145     // chosen here over other alternatives for lookup.
 146     for (const auto& level : levels_) {
 147       if (level.duration() == duration) {
 148         return level;
 149       }
 150     }
 151     throw std::out_of_range(folly::to<std::string>(
 152         "No level of duration ", duration.count(), " found"));
 153   }
 154
 155   /*
 156    * Return the sum of all the data points currently tracked at this level.
 157    *
 158    * Note: you should generally call update() or flush() before accessing the
 159    * data. Otherwise you may be reading stale data if update() or flush() has
 160    * not been called recently.
 161    */
 162   ValueType sum(size_t level) const {
 163     return getLevel(level).sum();
 164   }
 165
 166   /*
 167    * Return the average (sum / count) of all the data points currently tracked
 168    * at this level.
 169    *
 170    * The return type may be specified to control whether floating-point or
 171    * integer division should be performed.
 172    *
 173    * Note: you should generally call update() or flush() before accessing the
 174    * data. Otherwise you may be reading stale data if update() or flush() has
 175    * not been called recently.
 176    */
 177   template <typename ReturnType = double>
 178   ReturnType avg(size_t level) const {
 179     return getLevel(level).template avg<ReturnType>();
 180   }
 181
 182   /*
 183    * Return the rate (sum divided by elaspsed time) of the all data points
 184    * currently tracked at this level.
 185    *
 186    * Note: you should generally call update() or flush() before accessing the
 187    * data. Otherwise you may be reading stale data if update() or flush() has
 188    * not been called recently.
 189    */
 190   template <typename ReturnType = double, typename Interval = Duration>
 191   ReturnType rate(size_t level) const {
 192     return getLevel(level).template rate<ReturnType, Interval>();
 193   }
 194
 195   /*
 196    * Return the number of data points currently tracked at this level.
 197    *
 198    * Note: you should generally call update() or flush() before accessing the
 199    * data. Otherwise you may be reading stale data if update() or flush() has
 200    * not been called recently.
 201    */
 202   uint64_t count(size_t level) const {
 203     return getLevel(level).count();
 204   }
 205
 206   /*
 207    * Return the count divided by the elapsed time tracked at this level.
 208    *
 209    * Note: you should generally call update() or flush() before accessing the
 210    * data. Otherwise you may be reading stale data if update() or flush() has
 211    * not been called recently.
 212    */
 213   template <typename ReturnType = double, typename Interval = Duration>
 214   ReturnType countRate(size_t level) const {
 215     return getLevel(level).template countRate<ReturnType, Interval>();
 216   }
 217
 218   /*
 219    * Return the sum of all the data points currently tracked at this level.
 220    *
 221    * This method is identical to sum(size_t level) above but takes in the
 222    * duration that the user is interested in querying as the parameter.
 223    *
 224    * Note: you should generally call update() or flush() before accessing the
 225    * data. Otherwise you may be reading stale data if update() or flush() has
 226    * not been called recently.
 227    */
 228   ValueType sum(Duration duration) const {
 229     return getLevelByDuration(duration).sum();
 230   }
 231
 232   /*
 233    * Return the average (sum / count) of all the data points currently tracked
 234    * at this level.
 235    *
 236    * This method is identical to avg(size_t level) above but takes in the
 237    * duration that the user is interested in querying as the parameter.
 238    *
 239    * Note: you should generally call update() or flush() before accessing the
 240    * data. Otherwise you may be reading stale data if update() or flush() has
 241    * not been called recently.
 242    */
 243   template <typename ReturnType = double>
 244   ReturnType avg(Duration duration) const {
 245     return getLevelByDuration(duration).template avg<ReturnType>();
 246   }
 247
 248   /*
 249    * Return the rate (sum divided by elaspsed time) of the all data points
 250    * currently tracked at this level.
 251    *
 252    * This method is identical to rate(size_t level) above but takes in the
 253    * duration that the user is interested in querying as the parameter.
 254    *
 255    * Note: you should generally call update() or flush() before accessing the
 256    * data. Otherwise you may be reading stale data if update() or flush() has
 257    * not been called recently.
 258    */
 259   template <typename ReturnType = double, typename Interval = Duration>
 260   ReturnType rate(Duration duration) const {
 261     return getLevelByDuration(duration).template rate<ReturnType, Interval>();
 262   }
 263
 264   /*
 265    * Return the number of data points currently tracked at this level.
 266    *
 267    * This method is identical to count(size_t level) above but takes in the
 268    * duration that the user is interested in querying as the parameter.
 269    *
 270    * Note: you should generally call update() or flush() before accessing the
 271    * data. Otherwise you may be reading stale data if update() or flush() has
 272    * not been called recently.
 273    */
 274   uint64_t count(Duration duration) const {
 275     return getLevelByDuration(duration).count();
 276   }
 277
 278   /*
 279    * Return the count divided by the elapsed time tracked at this level.
 280    *
 281    * This method is identical to countRate(size_t level) above but takes in the
 282    * duration that the user is interested in querying as the parameter.
 283    *
 284    * Note: you should generally call update() or flush() before accessing the
 285    * data. Otherwise you may be reading stale data if update() or flush() has
 286    * not been called recently.
 287    */
 288   template <typename ReturnType = double, typename Interval = Duration>
 289   ReturnType countRate(Duration duration) const {
 290     return getLevelByDuration(duration)
 291         .template countRate<ReturnType, Interval>();
 292   }
 293
 294   /*
 295    * Estimate the sum of the data points that occurred in the specified time
 296    * period at this level.
 297    *
 298    * The range queried is [start, end).
 299    * That is, start is inclusive, and end is exclusive.
 300    *
 301    * Note that data outside of the timeseries duration will no longer be
 302    * available for use in the estimation.  Specifying a start time earlier than
 303    * getEarliestTime() will not have much effect, since only data points after
 304    * that point in time will be counted.
 305    *
 306    * Note that the value returned is an estimate, and may not be precise.
 307    *
 308    * Note: you should generally call update() or flush() before accessing the
 309    * data. Otherwise you may be reading stale data if update() or flush() has
 310    * not been called recently.
 311    */
 312   ValueType sum(TimePoint start, TimePoint end) const {
 313     return getLevel(start).sum(start, end);
 314   }
 315
 316   /*
 317    * Estimate the average value during the specified time period.
 318    *
 319    * The same caveats documented in the sum(TimePoint start, TimePoint end)
 320    * comments apply here as well.
 321    *
 322    * Note: you should generally call update() or flush() before accessing the
 323    * data. Otherwise you may be reading stale data if update() or flush() has
 324    * not been called recently.
 325    */
 326   template <typename ReturnType = double>
 327   ReturnType avg(TimePoint start, TimePoint end) const {
 328     return getLevel(start).template avg<ReturnType>(start, end);
 329   }
 330
 331   /*
 332    * Estimate the rate during the specified time period.
 333    *
 334    * The same caveats documented in the sum(TimePoint start, TimePoint end)
 335    * comments apply here as well.
 336    *
 337    * Note: you should generally call update() or flush() before accessing the
 338    * data. Otherwise you may be reading stale data if update() or flush() has
 339    * not been called recently.
 340    */
 341   template <typename ReturnType = double>
 342   ReturnType rate(TimePoint start, TimePoint end) const {
 343     return getLevel(start).template rate<ReturnType>(start, end);
 344   }
 345
 346   /*
 347    * Estimate the count during the specified time period.
 348    *
 349    * The same caveats documented in the sum(TimePoint start, TimePoint end)
 350    * comments apply here as well.
 351    *
 352    * Note: you should generally call update() or flush() before accessing the
 353    * data. Otherwise you may be reading stale data if update() or flush() has
 354    * not been called recently.
 355    */
 356   uint64_t count(TimePoint start, TimePoint end) const {
 357     return getLevel(start).count(start, end);
 358   }
 359
 360   /*
 361    * Adds the value 'val' at time 'now' to all levels.
 362    *
 363    * Data points added at the same time point is cached internally here and not
 364    * propagated to the underlying levels until either flush() is called or when
 365    * update from a different time comes.
 366    *
 367    * This function expects time to always move forwards: it cannot be used to
 368    * add historical data points that have occurred in the past.  If now is
 369    * older than the another timestamp that has already been passed to
 370    * addValue() or update(), now will be ignored and the latest timestamp will
 371    * be used.
 372    */
 373   void addValue(TimePoint now, const ValueType& val);
 374
 375   /*
 376    * Adds the value 'val' at time 'now' to all levels.
 377    */
 378   void addValue(TimePoint now, const ValueType& val, uint64_t times);
 379
 380   /*
 381    * Adds the value 'total' at time 'now' to all levels as the sum of
 382    * 'nsamples' samples.
 383    */
 384   void
 385   addValueAggregated(TimePoint now, const ValueType& total, uint64_t nsamples);
 386
 387   /*
 388    * Update all the levels to the specified time, doing all the necessary
 389    * work to rotate the buckets and remove any stale data points.
 390    *
 391    * When reading data from the timeseries, you should make sure to manually
 392    * call update() before accessing the data. Otherwise you may be reading
 393    * stale data if update() has not been called recently.
 394    */
 395   void update(TimePoint now);
 396
 397   /*
 398    * Reset all the timeseries to an empty state as if no data points have ever
 399    * been added to it.
 400    */
 401   void clear();
 402
 403   /*
 404    * Flush all cached updates.
 405    */
 406   void flush();
 407
 408   /*
 409    * Legacy APIs that accept a Duration parameters rather than TimePoint.
 410    *
 411    * These treat the Duration as relative to the clock epoch.
 412    * Prefer using the correct TimePoint-based APIs instead.  These APIs will
 413    * eventually be deprecated and removed.
 414    */
 415   void update(Duration now) {
 416     update(TimePoint(now));
 417   }
 418   void addValue(Duration now, const ValueType& value) {
 419     addValue(TimePoint(now), value);
 420   }
 421   void addValue(Duration now, const ValueType& value, uint64_t times) {
 422     addValue(TimePoint(now), value, times);
 423   }
 424   void
 425   addValueAggregated(Duration now, const ValueType& total, uint64_t nsamples) {
 426     addValueAggregated(TimePoint(now), total, nsamples);
 427   }
 428
 429  private:
 430   std::vector<Level> levels_;
 431
 432   // Updates within the same time interval are cached
 433   // They are flushed out when updates from a different time comes,
 434   // or flush() is called.
 435   TimePoint cachedTime_;
 436   ValueType cachedSum_;
 437   uint64_t cachedCount_;
 438 };
 439
 440 } // namespace folly