From: Adam Simpkins Date: Sat, 30 Mar 2013 01:55:16 +0000 (-0700) Subject: move Histogram.h to stats/ X-Git-Tag: v0.22.0~1008 X-Git-Url: http://plrg.eecs.uci.edu/git/?p=folly.git;a=commitdiff_plain;h=bac6c8f19e2dfb35b56c3d8985736fcdd1b1af1e move Histogram.h to stats/ Summary: Move Histogram.h into the stats/ subdirectory, along side the BucketedTimeSeries code. Eventually I plan to land more of our stats code in this subdirectory too. This also renames Histogram-inl.h to Histogram-defs.h, and no longer includes it by default from Histogram.h. Instead, this adds a new stats/Instantiations.cpp file which explicitly instantiates Histogram and BucketedTimeSeries. Most callers use these instantiations, and they now no longer need to include Histogram-defs.h. Only callers that need other instantiations need to include Histogram-defs.h. This will speed up build times. Test Plan: Ran "arc unit" to build all projects depending on folly, and verified they all still built and passed tests. Reviewed By: ldbrandy@fb.com FB internal diff: D761377 --- diff --git a/folly/stats/Histogram-defs.h b/folly/stats/Histogram-defs.h new file mode 100644 index 00000000..7253d5c9 --- /dev/null +++ b/folly/stats/Histogram-defs.h @@ -0,0 +1,269 @@ +/* + * Copyright 2013 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FOLLY_HISTOGRAM_DEFS_H_ +#define FOLLY_HISTOGRAM_DEFS_H_ + +#include "folly/Conv.h" + +#include + +namespace folly { + +namespace detail { + +template +HistogramBuckets::HistogramBuckets(ValueType bucketSize, + ValueType min, + ValueType max, + const BucketType& defaultBucket) + : bucketSize_(bucketSize), + min_(min), + max_(max) { + CHECK_GT(bucketSize_, ValueType(0)); + CHECK_LT(min_, max_); + + unsigned int numBuckets = (max - min) / bucketSize; + // Round up if the bucket size does not fit evenly + if (numBuckets * bucketSize < max - min) { + ++numBuckets; + } + // Add 2 for the extra 'below min' and 'above max' buckets + numBuckets += 2; + buckets_.assign(numBuckets, defaultBucket); +} + +template +unsigned int HistogramBuckets::getBucketIdx( + ValueType value) const { + if (value < min_) { + return 0; + } else if (value >= max_) { + return buckets_.size() - 1; + } else { + // the 1 is the below_min bucket + return ((value - min_) / bucketSize_) + 1; + } +} + +template +template +unsigned int HistogramBuckets::getPercentileBucketIdx( + double pct, + CountFn countFromBucket, + double* lowPct, double* highPct) const { + CHECK_GE(pct, 0.0); + CHECK_LE(pct, 1.0); + + unsigned int numBuckets = buckets_.size(); + + // Compute the counts in each bucket + std::vector counts(numBuckets); + uint64_t totalCount = 0; + for (unsigned int n = 0; n < numBuckets; ++n) { + uint64_t bucketCount = + countFromBucket(const_cast(buckets_[n])); + counts[n] = bucketCount; + totalCount += bucketCount; + } + + // If there are no elements, just return the lowest bucket. + // Note that we return bucket 1, which is the first bucket in the + // histogram range; bucket 0 is for all values below min_. + if (totalCount == 0) { + // Set lowPct and highPct both to 0. + // getPercentileEstimate() will recognize this to mean that the histogram + // is empty. + if (lowPct) { + *lowPct = 0.0; + } + if (highPct) { + *highPct = 0.0; + } + return 1; + } + + // Loop through all the buckets, keeping track of each bucket's + // percentile range: [0,10], [10,17], [17,45], etc. When we find a range + // that includes our desired percentile, we return that bucket index. + double prevPct = 0.0; + double curPct = 0.0; + uint64_t curCount = 0; + unsigned int idx; + for (idx = 0; idx < numBuckets; ++idx) { + if (counts[idx] == 0) { + // skip empty buckets + continue; + } + + prevPct = curPct; + curCount += counts[idx]; + curPct = static_cast(curCount) / totalCount; + if (pct <= curPct) { + // This is the desired bucket + break; + } + } + + if (lowPct) { + *lowPct = prevPct; + } + if (highPct) { + *highPct = curPct; + } + return idx; +} + +template +template +T HistogramBuckets::getPercentileEstimate( + double pct, + CountFn countFromBucket, + AvgFn avgFromBucket) const { + + // Find the bucket where this percentile falls + double lowPct; + double highPct; + unsigned int bucketIdx = getPercentileBucketIdx(pct, countFromBucket, + &lowPct, &highPct); + if (lowPct == 0.0 && highPct == 0.0) { + // Invalid range -- the buckets must all be empty + // Return the default value for ValueType. + return ValueType(); + } + if (lowPct == highPct) { + // Unlikely to have exact equality, + // but just return the bucket average in this case. + // We handle this here to avoid division by 0 below. + return avgFromBucket(buckets_[bucketIdx]); + } + + CHECK_GE(pct, lowPct); + CHECK_LE(pct, highPct); + CHECK_LT(lowPct, highPct); + + // Compute information about this bucket + ValueType avg = avgFromBucket(buckets_[bucketIdx]); + ValueType low; + ValueType high; + if (bucketIdx == 0) { + if (avg > min_) { + // This normally shouldn't happen. This bucket is only supposed to track + // values less than min_. Most likely this means that integer overflow + // occurred, and the code in avgFromBucket() returned a huge value + // instead of a small one. Just return the minimum possible value for + // now. + // + // (Note that if the counter keeps being decremented, eventually it will + // wrap and become small enough that we won't detect this any more, and + // we will return bogus information.) + LOG(ERROR) << "invalid average value in histogram minimum bucket: " << + avg << " > " << min_ << ": possible integer overflow?"; + return getBucketMin(bucketIdx); + } + // For the below-min bucket, just assume the lowest value ever seen is + // twice as far away from min_ as avg. + high = min_; + low = high - (2 * (high - avg)); + // Adjust low in case it wrapped + if (low > avg) { + low = std::numeric_limits::min(); + } + } else if (bucketIdx == buckets_.size() - 1) { + if (avg < max_) { + // Most likely this means integer overflow occurred. See the comments + // above in the minimum case. + LOG(ERROR) << "invalid average value in histogram maximum bucket: " << + avg << " < " << max_ << ": possible integer overflow?"; + return getBucketMax(bucketIdx); + } + // Similarly for the above-max bucket, assume the highest value ever seen + // is twice as far away from max_ as avg. + low = max_; + high = low + (2 * (avg - low)); + // Adjust high in case it wrapped + if (high < avg) { + high = std::numeric_limits::max(); + } + } else { + low = getBucketMin(bucketIdx); + high = getBucketMax(bucketIdx); + if (avg < low || avg > high) { + // Most likely this means an integer overflow occurred. + // See the comments above. Return the midpoint between low and high + // as a best guess, since avg is meaningless. + LOG(ERROR) << "invalid average value in histogram bucket: " << + avg << " not in range [" << low << ", " << high << + "]: possible integer overflow?"; + return (low + high) / 2; + } + } + + // Since we know the average value in this bucket, we can do slightly better + // than just assuming the data points in this bucket are uniformly + // distributed between low and high. + // + // Assume that the median value in this bucket is the same as the average + // value. + double medianPct = (lowPct + highPct) / 2.0; + if (pct < medianPct) { + // Assume that the data points lower than the median of this bucket + // are uniformly distributed between low and avg + double pctThroughSection = (pct - lowPct) / (medianPct - lowPct); + return low + ((avg - low) * pctThroughSection); + } else { + // Assume that the data points greater than the median of this bucket + // are uniformly distributed between avg and high + double pctThroughSection = (pct - medianPct) / (highPct - medianPct); + return avg + ((high - avg) * pctThroughSection); + } +} + +} // detail + + +template +std::string Histogram::debugString() const { + std::string ret = folly::to( + "num buckets: ", buckets_.getNumBuckets(), + ", bucketSize: ", buckets_.getBucketSize(), + ", min: ", buckets_.getMin(), ", max: ", buckets_.getMax(), "\n"); + + for (unsigned int i = 0; i < buckets_.getNumBuckets(); ++i) { + folly::toAppend(" ", buckets_.getBucketMin(i), ": ", + buckets_.getByIndex(i).count, "\n", + &ret); + } + + return ret; +} + +template +void Histogram::toTSV(std::ostream& out, bool skipEmptyBuckets) const { + for (unsigned int i = 0; i < buckets_.getNumBuckets(); ++i) { + // Do not output empty buckets in order to reduce data file size. + if (skipEmptyBuckets && getBucketByIndex(i).count == 0) { + continue; + } + const auto& bucket = getBucketByIndex(i); + out << getBucketMin(i) << '\t' << getBucketMax(i) << '\t' + << bucket.count << '\t' << bucket.sum << '\n'; + } +} + +} // folly + +#endif // FOLLY_HISTOGRAM_DEFS_H_ diff --git a/folly/stats/Histogram.h b/folly/stats/Histogram.h new file mode 100644 index 00000000..90cc1874 --- /dev/null +++ b/folly/stats/Histogram.h @@ -0,0 +1,406 @@ +/* + * Copyright 2013 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FOLLY_HISTOGRAM_H_ +#define FOLLY_HISTOGRAM_H_ + +#include +#include +#include +#include +#include +#include + +#include "folly/detail/Stats.h" + +namespace folly { + +namespace detail { + +/* + * A helper class to manage a set of histogram buckets. + */ +template +class HistogramBuckets { + public: + typedef T ValueType; + typedef BucketT BucketType; + + /* + * Create a set of histogram buckets. + * + * One bucket will be created for each bucketSize interval of values within + * the specified range. Additionally, one bucket will be created to track + * all values that fall below the specified minimum, and one bucket will be + * created for all values above the specified maximum. + * + * If (max - min) is not a multiple of bucketSize, the last bucket will cover + * a smaller range of values than the other buckets. + * + * (max - min) must be larger than or equal to bucketSize. + */ + HistogramBuckets(ValueType bucketSize, ValueType min, ValueType max, + const BucketType& defaultBucket); + + /* Returns the bucket size of each bucket in the histogram. */ + ValueType getBucketSize() const { + return bucketSize_; + } + + /* Returns the min value at which bucketing begins. */ + ValueType getMin() const { + return min_; + } + + /* Returns the max value at which bucketing ends. */ + ValueType getMax() const { + return max_; + } + + /* + * Returns the number of buckets. + * + * This includes the total number of buckets for the [min, max) range, + * plus 2 extra buckets, one for handling values less than min, and one for + * values greater than max. + */ + unsigned int getNumBuckets() const { + return buckets_.size(); + } + + /* Returns the bucket index into which the given value would fall. */ + unsigned int getBucketIdx(ValueType value) const; + + /* Returns the bucket for the specified value */ + BucketType& getByValue(ValueType value) { + return buckets_[getBucketIdx(value)]; + } + + /* Returns the bucket for the specified value */ + const BucketType& getByValue(ValueType value) const { + return buckets_[getBucketIdx(value)]; + } + + /* + * Returns the bucket at the specified index. + * + * Note that index 0 is the bucket for all values less than the specified + * minimum. Index 1 is the first bucket in the specified bucket range. + */ + BucketType& getByIndex(unsigned int idx) { + return buckets_[idx]; + } + + /* Returns the bucket at the specified index. */ + const BucketType& getByIndex(unsigned int idx) const { + return buckets_[idx]; + } + + /* + * Returns the minimum threshold for the bucket at the given index. + * + * The bucket at the specified index will store values in the range + * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall + * max is smaller than bucketMin + bucketSize. + */ + ValueType getBucketMin(unsigned int idx) const { + if (idx == 0) { + return std::numeric_limits::min(); + } + if (idx == buckets_.size() - 1) { + return max_; + } + + return min_ + ((idx - 1) * bucketSize_); + } + + /* + * Returns the maximum threshold for the bucket at the given index. + * + * The bucket at the specified index will store values in the range + * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall + * max is smaller than bucketMin + bucketSize. + */ + ValueType getBucketMax(unsigned int idx) const { + if (idx == buckets_.size() - 1) { + return std::numeric_limits::max(); + } + + return min_ + (idx * bucketSize_); + } + + /** + * Determine which bucket the specified percentile falls into. + * + * Looks for the bucket that contains the Nth percentile data point. + * + * @param pct The desired percentile to find, as a value from 0.0 to 1.0. + * @param countFn A function that takes a const BucketType&, and returns the + * number of values in that bucket. + * @param lowPct The lowest percentile stored in the selected bucket will be + * returned via this parameter. + * @param highPct The highest percentile stored in the selected bucket will + * be returned via this parameter. + * + * @return Returns the index of the bucket that contains the Nth percentile + * data point. + */ + template + unsigned int getPercentileBucketIdx(double pct, + CountFn countFromBucket, + double* lowPct = NULL, + double* highPct = NULL) const; + + /** + * Estimate the value at the specified percentile. + * + * @param pct The desired percentile to find, as a value from 0.0 to 1.0. + * @param countFn A function that takes a const BucketType&, and returns the + * number of values in that bucket. + * @param avgFn A function that takes a const BucketType&, and returns the + * average of all the values in that bucket. + * + * @return Returns an estimate for N, where N is the number where exactly pct + * percentage of the data points in the histogram are less than N. + */ + template + ValueType getPercentileEstimate(double pct, + CountFn countFromBucket, + AvgFn avgFromBucket) const; + + /* + * Iterator access to the buckets. + * + * Note that the first bucket is for all values less than min, and the last + * bucket is for all values greater than max. The buckets tracking values in + * the [min, max) actually start at the second bucket. + */ + typename std::vector::const_iterator begin() const { + return buckets_.begin(); + } + typename std::vector::iterator begin() { + return buckets_.begin(); + } + typename std::vector::const_iterator end() const { + return buckets_.end(); + } + typename std::vector::iterator end() { + return buckets_.end(); + } + + private: + const ValueType bucketSize_; + const ValueType min_; + const ValueType max_; + std::vector buckets_; +}; + +} // detail + + +/* + * A basic histogram class. + * + * Groups data points into equally-sized buckets, and stores the overall sum of + * the data points in each bucket, as well as the number of data points in the + * bucket. + * + * The caller must specify the minimum and maximum data points to expect ahead + * of time, as well as the bucket width. + */ +template +class Histogram { + public: + typedef T ValueType; + typedef detail::Bucket Bucket; + + Histogram(ValueType bucketSize, ValueType min, ValueType max) + : buckets_(bucketSize, min, max, Bucket()) {} + + /* Add a data point to the histogram */ + void addValue(ValueType value) { + Bucket& bucket = buckets_.getByValue(value); + // TODO: It would be nice to handle overflow here. + bucket.sum += value; + bucket.count += 1; + } + + /* + * Remove a data point to the histogram + * + * Note that this method does not actually verify that this exact data point + * had previously been added to the histogram; it merely subtracts the + * requested value from the appropriate bucket's sum. + */ + void removeValue(ValueType value) { + Bucket& bucket = buckets_.getByValue(value); + // TODO: It would be nice to handle overflow here. + bucket.sum -= value; + bucket.count -= 1; + } + + /* Remove all data points from the histogram */ + void clear() { + for (int i = 0; i < buckets_.getNumBuckets(); i++) { + buckets_.getByIndex(i).clear(); + } + } + + /* Merge two histogram data together */ + void merge(Histogram &hist) { + // the two histogram bucket definitions must match to support + // a merge. + if (getBucketSize() != hist.getBucketSize() || + getMin() != hist.getMin() || + getMax() != hist.getMax() || + getNumBuckets() != hist.getNumBuckets() ) { + throw std::invalid_argument("Cannot merge from input histogram."); + } + + for (int i = 0; i < buckets_.getNumBuckets(); i++) { + buckets_.getByIndex(i) += hist.buckets_.getByIndex(i); + } + } + + /* Copy bucket values from another histogram */ + void copy(Histogram &hist) { + // the two histogram bucket definition must match + if (getBucketSize() != hist.getBucketSize() || + getMin() != hist.getMin() || + getMax() != hist.getMax() || + getNumBuckets() != hist.getNumBuckets() ) { + throw std::invalid_argument("Cannot copy from input histogram."); + } + + for (int i = 0; i < buckets_.getNumBuckets(); i++) { + buckets_.getByIndex(i) = hist.buckets_.getByIndex(i); + } + } + + /* Returns the bucket size of each bucket in the histogram. */ + ValueType getBucketSize() const { + return buckets_.getBucketSize(); + } + /* Returns the min value at which bucketing begins. */ + ValueType getMin() const { + return buckets_.getMin(); + } + /* Returns the max value at which bucketing ends. */ + ValueType getMax() const { + return buckets_.getMax(); + } + /* Returns the number of buckets */ + unsigned int getNumBuckets() const { + return buckets_.getNumBuckets(); + } + + /* Returns the specified bucket (for reading only!) */ + const Bucket& getBucketByIndex(int idx) const { + return buckets_.getByIndex(idx); + } + + /* + * Returns the minimum threshold for the bucket at the given index. + * + * The bucket at the specified index will store values in the range + * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall + * max is smaller than bucketMin + bucketSize. + */ + ValueType getBucketMin(unsigned int idx) const { + return buckets_.getBucketMin(idx); + } + + /* + * Returns the maximum threshold for the bucket at the given index. + * + * The bucket at the specified index will store values in the range + * [bucketMin, bucketMin + bucketSize), or [bucketMin, max), if the overall + * max is smaller than bucketMin + bucketSize. + */ + ValueType getBucketMax(unsigned int idx) const { + return buckets_.getBucketMax(idx); + } + + /* + * Get the bucket that the specified percentile falls into + * + * The lowest and highest percentile data points in returned bucket will be + * returned in the lowPct and highPct arguments, if they are non-NULL. + */ + unsigned int getPercentileBucketIdx(double pct, + double* lowPct = NULL, + double* highPct = NULL) const { + // We unfortunately can't use lambdas here yet; + // Some users of this code are still built with gcc-4.4. + CountFromBucket countFn; + return buckets_.getPercentileBucketIdx(pct, countFn, lowPct, highPct); + } + + /** + * Estimate the value at the specified percentile. + * + * @param pct The desired percentile to find, as a value from 0.0 to 1.0. + * + * @return Returns an estimate for N, where N is the number where exactly pct + * percentage of the data points in the histogram are less than N. + */ + ValueType getPercentileEstimate(double pct) const { + CountFromBucket countFn; + AvgFromBucket avgFn; + return buckets_.getPercentileEstimate(pct, countFn, avgFn); + } + + /* + * Get a human-readable string describing the histogram contents + */ + std::string debugString() const; + + /* + * Write the histogram contents in tab-separated values (TSV) format. + * Format is "min max count sum". + */ + void toTSV(std::ostream& out, bool skipEmptyBuckets = true) const; + + private: + struct CountFromBucket { + uint64_t operator()(const Bucket& bucket) const { + return bucket.count; + } + }; + struct AvgFromBucket { + ValueType operator()(const Bucket& bucket) const { + if (bucket.count == 0) { + return ValueType(0); + } + // Cast bucket.count to a signed integer type. This ensures that we + // perform division properly here: If bucket.sum is a signed integer + // type but we divide by an unsigned number, unsigned division will be + // performed and bucket.sum will be converted to unsigned first. + // If bucket.sum is unsigned, the code will still do unsigned division + // correctly. + // + // The only downside is if bucket.count is large enough to be negative + // when treated as signed. That should be extremely unlikely, though. + return bucket.sum / static_cast(bucket.count); + } + }; + + detail::HistogramBuckets buckets_; +}; + +} // folly + +#endif // FOLLY_HISTOGRAM_H_ diff --git a/folly/stats/Instantiations.cpp b/folly/stats/Instantiations.cpp new file mode 100644 index 00000000..5ac7d39a --- /dev/null +++ b/folly/stats/Instantiations.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2013 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file contains explicit instantiations of stats template types. + * + * This allows most users to avoid having to include the template definition + * header files. + */ + +#include "folly/stats/BucketedTimeSeries.h" +#include "folly/stats/BucketedTimeSeries-defs.h" + +#include "folly/stats/Histogram.h" +#include "folly/stats/Histogram-defs.h" + +namespace folly { + +template class BucketedTimeSeries; +template class Histogram; +template class detail::HistogramBuckets::Bucket>; + +} // folly diff --git a/folly/test/HistogramBenchmark.cpp b/folly/test/HistogramBenchmark.cpp index 50598494..87a77b99 100644 --- a/folly/test/HistogramBenchmark.cpp +++ b/folly/test/HistogramBenchmark.cpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "folly/Histogram.h" +#include "folly/stats/Histogram.h" #include diff --git a/folly/test/HistogramTest.cpp b/folly/test/HistogramTest.cpp index 66104b10..ee088521 100644 --- a/folly/test/HistogramTest.cpp +++ b/folly/test/HistogramTest.cpp @@ -14,7 +14,8 @@ * limitations under the License. */ -#include "folly/Histogram.h" +#include "folly/stats/Histogram.h" +#include "folly/stats/Histogram-defs.h" #include #include diff --git a/folly/test/ProducerConsumerQueueBenchmark.cpp b/folly/test/ProducerConsumerQueueBenchmark.cpp index f4b44038..2d1f32e2 100644 --- a/folly/test/ProducerConsumerQueueBenchmark.cpp +++ b/folly/test/ProducerConsumerQueueBenchmark.cpp @@ -24,7 +24,8 @@ #include #include #include "folly/Benchmark.h" -#include "folly/Histogram.h" +#include "folly/stats/Histogram.h" +#include "folly/stats/Histogram-defs.h" #include "folly/ProducerConsumerQueue.h" namespace {