2 * Copyright 2012 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
19 #include "Benchmark.h"
31 DEFINE_bool(benchmark, false, "Run benchmarks.");
35 BenchmarkSuspender::NanosecondsSpent BenchmarkSuspender::nsSpent;
37 typedef function<uint64_t(unsigned int)> BenchmarkFun;
38 static vector<tuple<const char*, const char*, BenchmarkFun>> benchmarks;
40 // Add the global baseline
41 BENCHMARK(globalBenchmarkBaseline) {
45 void detail::addBenchmarkImpl(const char* file, const char* name,
47 benchmarks.emplace_back(file, name, std::move(fun));
51 * Given a point, gives density at that point as a number 0.0 < x <=
52 * 1.0. The result is 1.0 if all samples are equal to where, and
53 * decreases near 0 if all points are far away from it. The density is
54 * computed with the help of a radial basis function.
56 static double density(const double * begin, const double *const end,
57 const double where, const double bandwidth) {
59 assert(bandwidth > 0.0);
61 FOR_EACH_RANGE (i, begin, end) {
62 auto d = (*i - where) / bandwidth;
65 return sum / (end - begin);
69 * Computes mean and variance for a bunch of data points. Note that
70 * mean is currently not being used.
72 static pair<double, double>
73 meanVariance(const double * begin, const double *const end) {
75 double sum = 0.0, sum2 = 0.0;
76 FOR_EACH_RANGE (i, begin, end) {
80 auto const n = end - begin;
81 return make_pair(sum / n, sqrt((sum2 - sum * sum / n) / n));
85 * Computes the mode of a sample set through brute force. Assumes
88 static double mode(const double * begin, const double *const end) {
90 // Lower bound and upper bound for result and their respective
96 // Get the variance so we pass it down to density()
97 auto const sigma = meanVariance(begin, end).second;
99 // No variance means constant signal
103 FOR_EACH_RANGE (i, begin, end) {
104 assert(i == begin || *i >= i[-1]);
105 auto candidate = density(begin, end, *i, sigma * sqrt(2.0));
106 if (candidate > bestDensity) {
108 bestDensity = candidate;
111 // Density is decreasing... we could break here if we definitely
112 // knew this is unimodal.
120 * Given a bunch of benchmark samples, estimate the actual run time.
122 static double estimateTime(double * begin, double * end) {
125 // Current state of the art: get the minimum. After some
126 // experimentation, it seems taking the minimum is the best.
128 return *min_element(begin, end);
130 // What follows after estimates the time as the mode of the
133 // Select the awesomest (i.e. most frequent) result. We do this by
134 // sorting and then computing the longest run length.
137 // Eliminate outliers. A time much larger than the minimum time is
138 // considered an outlier.
139 while (end[-1] > 2.0 * *begin) {
149 /* Code used just for comparison purposes */ {
150 unsigned bestFrequency = 0;
151 unsigned candidateFrequency = 1;
152 double candidateValue = *begin;
153 for (auto current = begin + 1; ; ++current) {
154 if (current == end || *current != candidateValue) {
155 // Done with the current run, see if it was best
156 if (candidateFrequency > bestFrequency) {
157 bestFrequency = candidateFrequency;
158 result = candidateValue;
160 if (current == end) {
164 candidateValue = *current;
165 candidateFrequency = 1;
167 // Cool, inside a run, increase the frequency
168 ++candidateFrequency;
173 result = mode(begin, end);
178 static double runBenchmarkGetNSPerIteration(const BenchmarkFun& fun,
179 const double globalBaseline) {
180 // They key here is accuracy; too low numbers means the accuracy was
181 // coarse. We up the ante until we get to at least minNanoseconds
183 static uint64_t resolutionInNs = 0, coarseResolutionInNs = 0;
184 if (!resolutionInNs) {
186 CHECK_EQ(0, clock_getres(detail::DEFAULT_CLOCK_ID, &ts));
187 CHECK_EQ(0, ts.tv_sec) << "Clock sucks.";
188 CHECK_LT(0, ts.tv_nsec) << "Clock too fast for its own good.";
189 CHECK_EQ(1, ts.tv_nsec) << "Clock too coarse, upgrade your kernel.";
190 resolutionInNs = ts.tv_nsec;
192 // Whe choose a minimum minimum (sic) of 10,000 nanoseconds, but if
193 // the clock resolution is worse than that, it will be larger. In
194 // essence we're aiming at making the quantization noise 0.01%.
195 static const auto minNanoseconds = min(resolutionInNs * 100000, 1000000000UL);
197 // We do measurements in several epochs and take the minimum, to
198 // account for jitter.
199 static const unsigned int epochs = 1000;
200 // We establish a total time budget as we don't want a measurement
201 // to take too long. This will curtail the number of actual epochs.
202 static const uint64_t timeBudgetInNs = 1000000000;
204 CHECK_EQ(0, clock_gettime(CLOCK_REALTIME, &global));
206 double epochResults[epochs] = { 0 };
207 size_t actualEpochs = 0;
209 for (; actualEpochs < epochs; ++actualEpochs) {
210 for (unsigned int n = 1; n < (1U << 30); n *= 2) {
211 auto const nsecs = fun(n);
212 if (nsecs < minNanoseconds) {
215 // We got an accurate enough timing, done. But only save if
216 // smaller than the current result.
217 epochResults[actualEpochs] = max(0.0, double(nsecs) / n - globalBaseline);
218 // Done with the current epoch, we got a meaningful timing.
222 CHECK_EQ(0, clock_gettime(CLOCK_REALTIME, &now));
223 if (detail::timespecDiff(now, global) >= timeBudgetInNs) {
224 // No more time budget available.
230 // If the benchmark was basically drowned in baseline noise, it's
231 // possible it became negative.
232 return max(0.0, estimateTime(epochResults, epochResults + actualEpochs));
235 static string humanReadable(double n, unsigned int decimals) {
240 // Too big to be comprehended by the puny human brain
243 } else if (a >= 1E18) {
244 // "EXA" written with suffix 'X' so as to not create confusion
245 // with scientific notation.
248 } else if (a >= 1E15) {
252 } else if (a >= 1E12) {
256 } else if (a >= 1E9) {
260 } else if (a >= 1E6) {
264 } else if (a >= 1E3) {
268 } else if (a == 0.0) {
270 } else if (a < 1E-15) {
274 } else if (a < 1E-12) {
278 } else if (a < 1E-9) {
282 } else if (a < 1E-6) {
286 } else if (a < 1E-3) {
296 return stringPrintf("%*.*f%c", decimals + 3 + 1, decimals, n, suffix);
299 static void printBenchmarkResults(
300 const vector<tuple<const char*, const char*, double> >& data) {
302 static const uint columns = 76;
304 // Compute the longest benchmark name
305 size_t longestName = 0;
306 FOR_EACH_RANGE (i, 1, benchmarks.size()) {
307 longestName = max(longestName, strlen(get<1>(benchmarks[i])));
310 // Print a horizontal rule
311 auto separator = [&](char pad) {
312 puts(string(columns, pad).c_str());
315 // Print header for a file
316 auto header = [&](const char* file) {
318 printf("%-*srelative ns/iter iters/s\n",
323 double baselineNsPerIter = numeric_limits<double>::max();
324 const char* lastFile = "";
326 for (auto& datum : data) {
327 auto file = get<0>(datum);
328 if (strcmp(file, lastFile)) {
334 string s = get<1>(datum);
339 bool useBaseline /* = void */;
344 baselineNsPerIter = get<2>(datum);
347 s.resize(columns - 27, ' ');
348 auto nsPerIter = get<2>(datum);
349 auto itersPerSec = 1E9 / nsPerIter;
351 // Print without baseline
352 printf("%*s %s %s\n",
353 static_cast<int>(s.size()), s.c_str(),
354 humanReadable(nsPerIter, 2).c_str(),
355 humanReadable(itersPerSec, 2).c_str());
357 // Print with baseline
358 auto rel = baselineNsPerIter / nsPerIter * 100.0;
359 printf("%*s %7.2f%% %s %s\n",
360 static_cast<int>(s.size()), s.c_str(),
362 humanReadable(nsPerIter, 2).c_str(),
363 humanReadable(itersPerSec, 2).c_str());
369 void runBenchmarks() {
370 CHECK(!benchmarks.empty());
372 vector<tuple<const char*, const char*, double>> results;
373 results.reserve(benchmarks.size() - 1);
375 // PLEASE KEEP QUIET. MEASUREMENTS IN PROGRESS.
377 auto const globalBaseline = runBenchmarkGetNSPerIteration(
378 get<2>(benchmarks.front()), 0);
379 FOR_EACH_RANGE (i, 1, benchmarks.size()) {
380 auto elapsed = strcmp(get<1>(benchmarks[i]), "-") == 0
381 ? 0.0 // skip the separators
382 : runBenchmarkGetNSPerIteration(get<2>(benchmarks[i]),
384 results.emplace_back(get<0>(benchmarks[i]),
385 get<1>(benchmarks[i]), elapsed);
388 // PLEASE MAKE NOISE. MEASUREMENTS DONE.
390 printBenchmarkResults(results);