move Histogram.h to stats/

[folly.git] / folly / stats / Histogram-defs.h
diff --git a/folly/stats/Histogram-defs.h b/folly/stats/Histogram-defs.h

new file mode 100644 (file)

index 0000000..7253d5c
--- /dev/null
+++ b/folly/stats/Histogram-defs.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_HISTOGRAM_DEFS_H_
+#define FOLLY_HISTOGRAM_DEFS_H_
+
+#include "folly/Conv.h"
+
+#include <glog/logging.h>
+
+namespace folly {
+
+namespace detail {
+
+template <typename T, typename BucketT>
+HistogramBuckets<T, BucketT>::HistogramBuckets(ValueType bucketSize,
+                                               ValueType min,
+                                               ValueType max,
+                                               const BucketType& defaultBucket)
+  : bucketSize_(bucketSize),
+    min_(min),
+    max_(max) {
+  CHECK_GT(bucketSize_, ValueType(0));
+  CHECK_LT(min_, max_);
+
+  unsigned int numBuckets = (max - min) / bucketSize;
+  // Round up if the bucket size does not fit evenly
+  if (numBuckets * bucketSize < max - min) {
+    ++numBuckets;
+  }
+  // Add 2 for the extra 'below min' and 'above max' buckets
+  numBuckets += 2;
+  buckets_.assign(numBuckets, defaultBucket);
+}
+
+template <typename T, typename BucketType>
+unsigned int HistogramBuckets<T, BucketType>::getBucketIdx(
+    ValueType value) const {
+  if (value < min_) {
+    return 0;
+  } else if (value >= max_) {
+    return buckets_.size() - 1;
+  } else {
+    // the 1 is the below_min bucket
+    return ((value - min_) / bucketSize_) + 1;
+  }
+}
+
+template <typename T, typename BucketType>
+template <typename CountFn>
+unsigned int HistogramBuckets<T, BucketType>::getPercentileBucketIdx(
+    double pct,
+    CountFn countFromBucket,
+    double* lowPct, double* highPct) const {
+  CHECK_GE(pct, 0.0);
+  CHECK_LE(pct, 1.0);
+
+  unsigned int numBuckets = buckets_.size();
+
+  // Compute the counts in each bucket
+  std::vector<uint64_t> counts(numBuckets);
+  uint64_t totalCount = 0;
+  for (unsigned int n = 0; n < numBuckets; ++n) {
+    uint64_t bucketCount =
+      countFromBucket(const_cast<const BucketType&>(buckets_[n]));
+    counts[n] = bucketCount;
+    totalCount += bucketCount;
+  }
+
+  // If there are no elements, just return the lowest bucket.
+  // Note that we return bucket 1, which is the first bucket in the
+  // histogram range; bucket 0 is for all values below min_.
+  if (totalCount == 0) {
+    // Set lowPct and highPct both to 0.
+    // getPercentileEstimate() will recognize this to mean that the histogram
+    // is empty.
+    if (lowPct) {
+      *lowPct = 0.0;
+    }
+    if (highPct) {
+      *highPct = 0.0;
+    }
+    return 1;
+  }
+
+  // Loop through all the buckets, keeping track of each bucket's
+  // percentile range: [0,10], [10,17], [17,45], etc.  When we find a range
+  // that includes our desired percentile, we return that bucket index.
+  double prevPct = 0.0;
+  double curPct = 0.0;
+  uint64_t curCount = 0;
+  unsigned int idx;
+  for (idx = 0; idx < numBuckets; ++idx) {
+    if (counts[idx] == 0) {
+      // skip empty buckets
+      continue;
+    }
+
+    prevPct = curPct;
+    curCount += counts[idx];
+    curPct = static_cast<double>(curCount) / totalCount;
+    if (pct <= curPct) {
+      // This is the desired bucket
+      break;
+    }
+  }
+
+  if (lowPct) {
+    *lowPct = prevPct;
+  }
+  if (highPct) {
+    *highPct = curPct;
+  }
+  return idx;
+}
+
+template <typename T, typename BucketType>
+template <typename CountFn, typename AvgFn>
+T HistogramBuckets<T, BucketType>::getPercentileEstimate(
+    double pct,
+    CountFn countFromBucket,
+    AvgFn avgFromBucket) const {
+
+  // Find the bucket where this percentile falls
+  double lowPct;
+  double highPct;
+  unsigned int bucketIdx = getPercentileBucketIdx(pct, countFromBucket,
+                                                  &lowPct, &highPct);
+  if (lowPct == 0.0 && highPct == 0.0) {
+    // Invalid range -- the buckets must all be empty
+    // Return the default value for ValueType.
+    return ValueType();
+  }
+  if (lowPct == highPct) {
+    // Unlikely to have exact equality,
+    // but just return the bucket average in this case.
+    // We handle this here to avoid division by 0 below.
+    return avgFromBucket(buckets_[bucketIdx]);
+  }
+
+  CHECK_GE(pct, lowPct);
+  CHECK_LE(pct, highPct);
+  CHECK_LT(lowPct, highPct);
+
+  // Compute information about this bucket
+  ValueType avg = avgFromBucket(buckets_[bucketIdx]);
+  ValueType low;
+  ValueType high;
+  if (bucketIdx == 0) {
+    if (avg > min_) {
+      // This normally shouldn't happen.  This bucket is only supposed to track
+      // values less than min_.  Most likely this means that integer overflow
+      // occurred, and the code in avgFromBucket() returned a huge value
+      // instead of a small one.  Just return the minimum possible value for
+      // now.
+      //
+      // (Note that if the counter keeps being decremented, eventually it will
+      // wrap and become small enough that we won't detect this any more, and
+      // we will return bogus information.)
+      LOG(ERROR) << "invalid average value in histogram minimum bucket: " <<
+        avg << " > " << min_ << ": possible integer overflow?";
+      return getBucketMin(bucketIdx);
+    }
+    // For the below-min bucket, just assume the lowest value ever seen is
+    // twice as far away from min_ as avg.
+    high = min_;
+    low = high - (2 * (high - avg));
+    // Adjust low in case it wrapped
+    if (low > avg) {
+      low = std::numeric_limits<ValueType>::min();
+    }
+  } else if (bucketIdx == buckets_.size() - 1) {
+    if (avg < max_) {
+      // Most likely this means integer overflow occurred.  See the comments
+      // above in the minimum case.
+      LOG(ERROR) << "invalid average value in histogram maximum bucket: " <<
+        avg << " < " << max_ << ": possible integer overflow?";
+      return getBucketMax(bucketIdx);
+    }
+    // Similarly for the above-max bucket, assume the highest value ever seen
+    // is twice as far away from max_ as avg.
+    low = max_;
+    high = low + (2 * (avg - low));
+    // Adjust high in case it wrapped
+    if (high < avg) {
+      high = std::numeric_limits<ValueType>::max();
+    }
+  } else {
+    low = getBucketMin(bucketIdx);
+    high = getBucketMax(bucketIdx);
+    if (avg < low || avg > high) {
+      // Most likely this means an integer overflow occurred.
+      // See the comments above.  Return the midpoint between low and high
+      // as a best guess, since avg is meaningless.
+      LOG(ERROR) << "invalid average value in histogram bucket: " <<
+        avg << " not in range [" << low << ", " << high <<
+        "]: possible integer overflow?";
+      return (low + high) / 2;
+    }
+  }
+
+  // Since we know the average value in this bucket, we can do slightly better
+  // than just assuming the data points in this bucket are uniformly
+  // distributed between low and high.
+  //
+  // Assume that the median value in this bucket is the same as the average
+  // value.
+  double medianPct = (lowPct + highPct) / 2.0;
+  if (pct < medianPct) {
+    // Assume that the data points lower than the median of this bucket
+    // are uniformly distributed between low and avg
+    double pctThroughSection = (pct - lowPct) / (medianPct - lowPct);
+    return low + ((avg - low) * pctThroughSection);
+  } else {
+    // Assume that the data points greater than the median of this bucket
+    // are uniformly distributed between avg and high
+    double pctThroughSection = (pct - medianPct) / (highPct - medianPct);
+    return avg + ((high - avg) * pctThroughSection);
+  }
+}
+
+} // detail
+
+
+template <typename T>
+std::string Histogram<T>::debugString() const {
+  std::string ret = folly::to<std::string>(
+      "num buckets: ", buckets_.getNumBuckets(),
+      ", bucketSize: ", buckets_.getBucketSize(),
+      ", min: ", buckets_.getMin(), ", max: ", buckets_.getMax(), "\n");
+
+  for (unsigned int i = 0; i < buckets_.getNumBuckets(); ++i) {
+    folly::toAppend("  ", buckets_.getBucketMin(i), ": ",
+                    buckets_.getByIndex(i).count, "\n",
+                    &ret);
+  }
+
+  return ret;
+}
+
+template <typename T>
+void Histogram<T>::toTSV(std::ostream& out, bool skipEmptyBuckets) const {
+  for (unsigned int i = 0; i < buckets_.getNumBuckets(); ++i) {
+    // Do not output empty buckets in order to reduce data file size.
+    if (skipEmptyBuckets && getBucketByIndex(i).count == 0) {
+      continue;
+    }
+    const auto& bucket = getBucketByIndex(i);
+    out << getBucketMin(i) << '\t' << getBucketMax(i) << '\t'
+        << bucket.count << '\t' << bucket.sum << '\n';
+  }
+}
+
+} // folly
+
+#endif // FOLLY_HISTOGRAM_DEFS_H_