folly/stats/Histogram.h

   1 /*
   2  * Copyright 2015 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_HISTOGRAM_H_
  18 #define FOLLY_HISTOGRAM_H_
  19
  20 #include <cstddef>
  21 #include <limits>
  22 #include <ostream>
  23 #include <string>
  24 #include <vector>
  25 #include <stdexcept>
  26
  27 #include <folly/detail/Stats.h>
  28
  29 namespace folly {
  30
  31 namespace detail {
  32
  33 /*
  34  * A helper class to manage a set of histogram buckets.
  35  */
  36 template <typename T, typename BucketT>
  37 class HistogramBuckets {
  38  public:
  39   typedef T ValueType;
  40   typedef BucketT BucketType;
  41
  42   /*
  43    * Create a set of histogram buckets.
  44    *
  45    * One bucket will be created for each bucketSize interval of values within
  46    * the specified range.  Additionally, one bucket will be created to track
  47    * all values that fall below the specified minimum, and one bucket will be
  48    * created for all values above the specified maximum.
  49    *
  50    * If (max - min) is not a multiple of bucketSize, the last bucket will cover
  51    * a smaller range of values than the other buckets.
  52    *
  53    * (max - min) must be larger than or equal to bucketSize.
  54    */
  55   HistogramBuckets(ValueType bucketSize, ValueType min, ValueType max,
  56                    const BucketType& defaultBucket);
  57
  58   /* Returns the bucket size of each bucket in the histogram. */
  59   ValueType getBucketSize() const {
  60     return bucketSize_;
  61   }
  62
  63   /* Returns the min value at which bucketing begins. */
  64   ValueType getMin() const {
  65     return min_;
  66   }
  67
  68   /* Returns the max value at which bucketing ends. */
  69   ValueType getMax() const {
  70     return max_;
  71   }
  72
  73   /*
  74    * Returns the number of buckets.
  75    *
  76    * This includes the total number of buckets for the [min, max) range,
  77    * plus 2 extra buckets, one for handling values less than min, and one for
  78    * values greater than max.
  79    */
  80   unsigned int getNumBuckets() const {
  81     return buckets_.size();
  82   }
  83
  84   /* Returns the bucket index into which the given value would fall. */
  85   unsigned int getBucketIdx(ValueType value) const;
  86
  87   /* Returns the bucket for the specified value */
  88   BucketType& getByValue(ValueType value) {
  89     return buckets_[getBucketIdx(value)];
  90   }
  91
  92   /* Returns the bucket for the specified value */
  93   const BucketType& getByValue(ValueType value) const {
  94     return buckets_[getBucketIdx(value)];
  95   }
  96
  97   /*
  98    * Returns the bucket at the specified index.
  99    *
 100    * Note that index 0 is the bucket for all values less than the specified
 101    * minimum.  Index 1 is the first bucket in the specified bucket range.
 102    */
 103   BucketType& getByIndex(unsigned int idx) {
 104     return buckets_[idx];
 105   }
 106
 107   /* Returns the bucket at the specified index. */
 108   const BucketType& getByIndex(unsigned int idx) const {
 109     return buckets_[idx];
 110   }
 111
 112   /*
 113    * Returns the minimum threshold for the bucket at the given index.
 114    *
 115    * The bucket at the specified index will store values in the range
 116    * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall
 117    * max is smaller than bucketMin + bucketSize.
 118    */
 119   ValueType getBucketMin(unsigned int idx) const {
 120     if (idx == 0) {
 121       return std::numeric_limits<ValueType>::min();
 122     }
 123     if (idx == buckets_.size() - 1) {
 124       return max_;
 125     }
 126
 127     return min_ + ((idx - 1) * bucketSize_);
 128   }
 129
 130   /*
 131    * Returns the maximum threshold for the bucket at the given index.
 132    *
 133    * The bucket at the specified index will store values in the range
 134    * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall
 135    * max is smaller than bucketMin + bucketSize.
 136    */
 137   ValueType getBucketMax(unsigned int idx) const {
 138     if (idx == buckets_.size() - 1) {
 139       return std::numeric_limits<ValueType>::max();
 140     }
 141
 142     return min_ + (idx * bucketSize_);
 143   }
 144
 145   /**
 146    * Computes the total number of values stored across all buckets.
 147    *
 148    * Runs in O(numBuckets)
 149    *
 150    * @param countFn A function that takes a const BucketType&, and returns the
 151    *                number of values in that bucket
 152    * @return Returns the total number of values stored across all buckets
 153    */
 154   template <typename CountFn>
 155   const uint64_t computeTotalCount(CountFn countFromBucket) const;
 156
 157   /**
 158    * Determine which bucket the specified percentile falls into.
 159    *
 160    * Looks for the bucket that contains the Nth percentile data point.
 161    *
 162    * @param pct     The desired percentile to find, as a value from 0.0 to 1.0.
 163    * @param countFn A function that takes a const BucketType&, and returns the
 164    *                number of values in that bucket.
 165    * @param lowPct  The lowest percentile stored in the selected bucket will be
 166    *                returned via this parameter.
 167    * @param highPct The highest percentile stored in the selected bucket will
 168    *                be returned via this parameter.
 169    *
 170    * @return Returns the index of the bucket that contains the Nth percentile
 171    *         data point.
 172    */
 173   template <typename CountFn>
 174   unsigned int getPercentileBucketIdx(double pct,
 175                                       CountFn countFromBucket,
 176                                       double* lowPct = nullptr,
 177                                       double* highPct = nullptr) const;
 178
 179   /**
 180    * Estimate the value at the specified percentile.
 181    *
 182    * @param pct     The desired percentile to find, as a value from 0.0 to 1.0.
 183    * @param countFn A function that takes a const BucketType&, and returns the
 184    *                number of values in that bucket.
 185    * @param avgFn   A function that takes a const BucketType&, and returns the
 186    *                average of all the values in that bucket.
 187    *
 188    * @return Returns an estimate for N, where N is the number where exactly pct
 189    *         percentage of the data points in the histogram are less than N.
 190    */
 191   template <typename CountFn, typename AvgFn>
 192   ValueType getPercentileEstimate(double pct,
 193                                   CountFn countFromBucket,
 194                                   AvgFn avgFromBucket) const;
 195
 196   /*
 197    * Iterator access to the buckets.
 198    *
 199    * Note that the first bucket is for all values less than min, and the last
 200    * bucket is for all values greater than max.  The buckets tracking values in
 201    * the [min, max) actually start at the second bucket.
 202    */
 203   typename std::vector<BucketType>::const_iterator begin() const {
 204     return buckets_.begin();
 205   }
 206   typename std::vector<BucketType>::iterator begin() {
 207     return buckets_.begin();
 208   }
 209   typename std::vector<BucketType>::const_iterator end() const {
 210     return buckets_.end();
 211   }
 212   typename std::vector<BucketType>::iterator end() {
 213     return buckets_.end();
 214   }
 215
 216  private:
 217   ValueType bucketSize_;
 218   ValueType min_;
 219   ValueType max_;
 220   std::vector<BucketType> buckets_;
 221 };
 222
 223 } // detail
 224
 225
 226 /*
 227  * A basic histogram class.
 228  *
 229  * Groups data points into equally-sized buckets, and stores the overall sum of
 230  * the data points in each bucket, as well as the number of data points in the
 231  * bucket.
 232  *
 233  * The caller must specify the minimum and maximum data points to expect ahead
 234  * of time, as well as the bucket width.
 235  */
 236 template <typename T>
 237 class Histogram {
 238  public:
 239   typedef T ValueType;
 240   typedef detail::Bucket<T> Bucket;
 241
 242   Histogram(ValueType bucketSize, ValueType min, ValueType max)
 243     : buckets_(bucketSize, min, max, Bucket()) {}
 244
 245   /* Add a data point to the histogram */
 246   void addValue(ValueType value) {
 247     Bucket& bucket = buckets_.getByValue(value);
 248     // TODO: It would be nice to handle overflow here.
 249     bucket.sum += value;
 250     bucket.count += 1;
 251   }
 252
 253   /* Add multiple same data points to the histogram */
 254   void addRepeatedValue(ValueType value, uint64_t nSamples) {
 255     Bucket& bucket = buckets_.getByValue(value);
 256     // TODO: It would be nice to handle overflow here.
 257     bucket.sum += value * nSamples;
 258     bucket.count += nSamples;
 259   }
 260
 261   /*
 262    * Remove a data point to the histogram
 263    *
 264    * Note that this method does not actually verify that this exact data point
 265    * had previously been added to the histogram; it merely subtracts the
 266    * requested value from the appropriate bucket's sum.
 267    */
 268   void removeValue(ValueType value) {
 269     Bucket& bucket = buckets_.getByValue(value);
 270     // TODO: It would be nice to handle overflow here.
 271     if (bucket.count > 0) {
 272       bucket.sum -= value;
 273       bucket.count -= 1;
 274     } else {
 275       bucket.sum = ValueType();
 276       bucket.count = 0;
 277     }
 278   }
 279
 280   /* Remove multiple same data points from the histogram */
 281   void removeRepeatedValue(ValueType value, uint64_t nSamples) {
 282     Bucket& bucket = buckets_.getByValue(value);
 283     // TODO: It would be nice to handle overflow here.
 284     if (bucket.count >= nSamples) {
 285       bucket.sum -= value * nSamples;
 286       bucket.count -= nSamples;
 287     } else {
 288       bucket.sum = ValueType();
 289       bucket.count = 0;
 290     }
 291   }
 292
 293   /* Remove all data points from the histogram */
 294   void clear() {
 295     for (unsigned int i = 0; i < buckets_.getNumBuckets(); i++) {
 296       buckets_.getByIndex(i).clear();
 297     }
 298   }
 299
 300   /* Subtract another histogram data from the histogram */
 301   void subtract(const Histogram &hist) {
 302     // the two histogram bucket definitions must match to support
 303     // subtract.
 304     if (getBucketSize() != hist.getBucketSize() ||
 305         getMin() != hist.getMin() ||
 306         getMax() != hist.getMax() ||
 307         getNumBuckets() != hist.getNumBuckets() ) {
 308       throw std::invalid_argument("Cannot subtract input histogram.");
 309     }
 310
 311     for (unsigned int i = 0; i < buckets_.getNumBuckets(); i++) {
 312       buckets_.getByIndex(i) -= hist.buckets_.getByIndex(i);
 313     }
 314   }
 315
 316   /* Merge two histogram data together */
 317   void merge(const Histogram &hist) {
 318     // the two histogram bucket definitions must match to support
 319     // a merge.
 320     if (getBucketSize() != hist.getBucketSize() ||
 321         getMin() != hist.getMin() ||
 322         getMax() != hist.getMax() ||
 323         getNumBuckets() != hist.getNumBuckets() ) {
 324       throw std::invalid_argument("Cannot merge from input histogram.");
 325     }
 326
 327     for (unsigned int i = 0; i < buckets_.getNumBuckets(); i++) {
 328       buckets_.getByIndex(i) += hist.buckets_.getByIndex(i);
 329     }
 330   }
 331
 332   /* Copy bucket values from another histogram */
 333   void copy(const Histogram &hist) {
 334     // the two histogram bucket definition must match
 335     if (getBucketSize() != hist.getBucketSize() ||
 336         getMin() != hist.getMin() ||
 337         getMax() != hist.getMax() ||
 338         getNumBuckets() != hist.getNumBuckets() ) {
 339       throw std::invalid_argument("Cannot copy from input histogram.");
 340     }
 341
 342     for (unsigned int i = 0; i < buckets_.getNumBuckets(); i++) {
 343       buckets_.getByIndex(i) = hist.buckets_.getByIndex(i);
 344     }
 345   }
 346
 347   /* Returns the bucket size of each bucket in the histogram. */
 348   ValueType getBucketSize() const {
 349     return buckets_.getBucketSize();
 350   }
 351   /* Returns the min value at which bucketing begins. */
 352   ValueType getMin() const {
 353     return buckets_.getMin();
 354   }
 355   /* Returns the max value at which bucketing ends. */
 356   ValueType getMax() const {
 357     return buckets_.getMax();
 358   }
 359   /* Returns the number of buckets */
 360   unsigned int getNumBuckets() const {
 361     return buckets_.getNumBuckets();
 362   }
 363
 364   /* Returns the specified bucket (for reading only!) */
 365   const Bucket& getBucketByIndex(int idx) const {
 366     return buckets_.getByIndex(idx);
 367   }
 368
 369   /*
 370    * Returns the minimum threshold for the bucket at the given index.
 371    *
 372    * The bucket at the specified index will store values in the range
 373    * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall
 374    * max is smaller than bucketMin + bucketSize.
 375    */
 376   ValueType getBucketMin(unsigned int idx) const {
 377     return buckets_.getBucketMin(idx);
 378   }
 379
 380   /*
 381    * Returns the maximum threshold for the bucket at the given index.
 382    *
 383    * The bucket at the specified index will store values in the range
 384    * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall
 385    * max is smaller than bucketMin + bucketSize.
 386    */
 387   ValueType getBucketMax(unsigned int idx) const {
 388     return buckets_.getBucketMax(idx);
 389   }
 390
 391   /**
 392    * Computes the total number of values stored across all buckets.
 393    *
 394    * Runs in O(numBuckets)
 395    */
 396   const uint64_t computeTotalCount() const {
 397     CountFromBucket countFn;
 398     return buckets_.computeTotalCount(countFn);
 399   }
 400
 401   /*
 402    * Get the bucket that the specified percentile falls into
 403    *
 404    * The lowest and highest percentile data points in returned bucket will be
 405    * returned in the lowPct and highPct arguments, if they are non-NULL.
 406    */
 407   unsigned int getPercentileBucketIdx(double pct,
 408                                       double* lowPct = nullptr,
 409                                       double* highPct = nullptr) const {
 410     // We unfortunately can't use lambdas here yet;
 411     // Some users of this code are still built with gcc-4.4.
 412     CountFromBucket countFn;
 413     return buckets_.getPercentileBucketIdx(pct, countFn, lowPct, highPct);
 414   }
 415
 416   /**
 417    * Estimate the value at the specified percentile.
 418    *
 419    * @param pct     The desired percentile to find, as a value from 0.0 to 1.0.
 420    *
 421    * @return Returns an estimate for N, where N is the number where exactly pct
 422    *         percentage of the data points in the histogram are less than N.
 423    */
 424   ValueType getPercentileEstimate(double pct) const {
 425     CountFromBucket countFn;
 426     AvgFromBucket avgFn;
 427     return buckets_.getPercentileEstimate(pct, countFn, avgFn);
 428   }
 429
 430   /*
 431    * Get a human-readable string describing the histogram contents
 432    */
 433   std::string debugString() const;
 434
 435   /*
 436    * Write the histogram contents in tab-separated values (TSV) format.
 437    * Format is "min max count sum".
 438    */
 439   void toTSV(std::ostream& out, bool skipEmptyBuckets = true) const;
 440
 441  private:
 442   struct CountFromBucket {
 443     uint64_t operator()(const Bucket& bucket) const {
 444       return bucket.count;
 445     }
 446   };
 447   struct AvgFromBucket {
 448     ValueType operator()(const Bucket& bucket) const {
 449       if (bucket.count == 0) {
 450         return ValueType(0);
 451       }
 452       // Cast bucket.count to a signed integer type.  This ensures that we
 453       // perform division properly here: If bucket.sum is a signed integer
 454       // type but we divide by an unsigned number, unsigned division will be
 455       // performed and bucket.sum will be converted to unsigned first.
 456       // If bucket.sum is unsigned, the code will still do unsigned division
 457       // correctly.
 458       //
 459       // The only downside is if bucket.count is large enough to be negative
 460       // when treated as signed.  That should be extremely unlikely, though.
 461       return bucket.sum / static_cast<int64_t>(bucket.count);
 462     }
 463   };
 464
 465   detail::HistogramBuckets<ValueType, Bucket> buckets_;
 466 };
 467
 468 } // folly
 469
 470 #endif // FOLLY_HISTOGRAM_H_