folly/Benchmark.cpp

   1 /*
   2  * Copyright 2012 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
  18
  19 #include "Benchmark.h"
  20 #include "Foreach.h"
  21 #include "json.h"
  22 #include "String.h"
  23 #include <algorithm>
  24 #include <cmath>
  25 #include <iostream>
  26 #include <limits>
  27 #include <utility>
  28 #include <vector>
  29
  30 using namespace std;
  31
  32 DEFINE_bool(benchmark, false, "Run benchmarks.");
  33 DEFINE_bool(json, false, "Output in JSON format.");
  34
  35 namespace folly {
  36
  37 BenchmarkSuspender::NanosecondsSpent BenchmarkSuspender::nsSpent;
  38
  39 typedef function<uint64_t(unsigned int)> BenchmarkFun;
  40 static vector<tuple<const char*, const char*, BenchmarkFun>> benchmarks;
  41
  42 // Add the global baseline
  43 BENCHMARK(globalBenchmarkBaseline) {
  44   asm volatile("");
  45 }
  46
  47 void detail::addBenchmarkImpl(const char* file, const char* name,
  48                               BenchmarkFun fun) {
  49   benchmarks.emplace_back(file, name, std::move(fun));
  50 }
  51
  52 /**
  53  * Given a point, gives density at that point as a number 0.0 < x <=
  54  * 1.0. The result is 1.0 if all samples are equal to where, and
  55  * decreases near 0 if all points are far away from it. The density is
  56  * computed with the help of a radial basis function.
  57  */
  58 static double density(const double * begin, const double *const end,
  59                       const double where, const double bandwidth) {
  60   assert(begin < end);
  61   assert(bandwidth > 0.0);
  62   double sum = 0.0;
  63   FOR_EACH_RANGE (i, begin, end) {
  64     auto d = (*i - where) / bandwidth;
  65     sum += exp(- d * d);
  66   }
  67   return sum / (end - begin);
  68 }
  69
  70 /**
  71  * Computes mean and variance for a bunch of data points. Note that
  72  * mean is currently not being used.
  73  */
  74 static pair<double, double>
  75 meanVariance(const double * begin, const double *const end) {
  76   assert(begin < end);
  77   double sum = 0.0, sum2 = 0.0;
  78   FOR_EACH_RANGE (i, begin, end) {
  79     sum += *i;
  80     sum2 += *i * *i;
  81   }
  82   auto const n = end - begin;
  83   return make_pair(sum / n, sqrt((sum2 - sum * sum / n) / n));
  84 }
  85
  86 /**
  87  * Computes the mode of a sample set through brute force. Assumes
  88  * input is sorted.
  89  */
  90 static double mode(const double * begin, const double *const end) {
  91   assert(begin < end);
  92   // Lower bound and upper bound for result and their respective
  93   // densities.
  94   auto
  95     result = 0.0,
  96     bestDensity = 0.0;
  97
  98   // Get the variance so we pass it down to density()
  99   auto const sigma = meanVariance(begin, end).second;
 100   if (!sigma) {
 101     // No variance means constant signal
 102     return *begin;
 103   }
 104
 105   FOR_EACH_RANGE (i, begin, end) {
 106     assert(i == begin || *i >= i[-1]);
 107     auto candidate = density(begin, end, *i, sigma * sqrt(2.0));
 108     if (candidate > bestDensity) {
 109       // Found a new best
 110       bestDensity = candidate;
 111       result = *i;
 112     } else {
 113       // Density is decreasing... we could break here if we definitely
 114       // knew this is unimodal.
 115     }
 116   }
 117
 118   return result;
 119 }
 120
 121 /**
 122  * Given a bunch of benchmark samples, estimate the actual run time.
 123  */
 124 static double estimateTime(double * begin, double * end) {
 125   assert(begin < end);
 126
 127   // Current state of the art: get the minimum. After some
 128   // experimentation, it seems taking the minimum is the best.
 129
 130   return *min_element(begin, end);
 131
 132   // What follows after estimates the time as the mode of the
 133   // distribution.
 134
 135   // Select the awesomest (i.e. most frequent) result. We do this by
 136   // sorting and then computing the longest run length.
 137   sort(begin, end);
 138
 139   // Eliminate outliers. A time much larger than the minimum time is
 140   // considered an outlier.
 141   while (end[-1] > 2.0 * *begin) {
 142     --end;
 143     if (begin == end) {
 144       LOG(INFO) << *begin;
 145     }
 146     assert(begin < end);
 147   }
 148
 149   double result = 0;
 150
 151   /* Code used just for comparison purposes */ {
 152     unsigned bestFrequency = 0;
 153     unsigned candidateFrequency = 1;
 154     double candidateValue = *begin;
 155     for (auto current = begin + 1; ; ++current) {
 156       if (current == end || *current != candidateValue) {
 157         // Done with the current run, see if it was best
 158         if (candidateFrequency > bestFrequency) {
 159           bestFrequency = candidateFrequency;
 160           result = candidateValue;
 161         }
 162         if (current == end) {
 163           break;
 164         }
 165         // Start a new run
 166         candidateValue = *current;
 167         candidateFrequency = 1;
 168       } else {
 169         // Cool, inside a run, increase the frequency
 170         ++candidateFrequency;
 171       }
 172     }
 173   }
 174
 175   result = mode(begin, end);
 176
 177   return result;
 178 }
 179
 180 static double runBenchmarkGetNSPerIteration(const BenchmarkFun& fun,
 181                                             const double globalBaseline) {
 182   // They key here is accuracy; too low numbers means the accuracy was
 183   // coarse. We up the ante until we get to at least minNanoseconds
 184   // timings.
 185   static uint64_t resolutionInNs = 0, coarseResolutionInNs = 0;
 186   if (!resolutionInNs) {
 187     timespec ts;
 188     CHECK_EQ(0, clock_getres(detail::DEFAULT_CLOCK_ID, &ts));
 189     CHECK_EQ(0, ts.tv_sec) << "Clock sucks.";
 190     CHECK_LT(0, ts.tv_nsec) << "Clock too fast for its own good.";
 191     CHECK_EQ(1, ts.tv_nsec) << "Clock too coarse, upgrade your kernel.";
 192     resolutionInNs = ts.tv_nsec;
 193   }
 194   // Whe choose a minimum minimum (sic) of 10,000 nanoseconds, but if
 195   // the clock resolution is worse than that, it will be larger. In
 196   // essence we're aiming at making the quantization noise 0.01%.
 197   static const auto minNanoseconds = min(resolutionInNs * 100000, 1000000000UL);
 198
 199   // We do measurements in several epochs and take the minimum, to
 200   // account for jitter.
 201   static const unsigned int epochs = 1000;
 202   // We establish a total time budget as we don't want a measurement
 203   // to take too long. This will curtail the number of actual epochs.
 204   static const uint64_t timeBudgetInNs = 1000000000;
 205   timespec global;
 206   CHECK_EQ(0, clock_gettime(CLOCK_REALTIME, &global));
 207
 208   double epochResults[epochs] = { 0 };
 209   size_t actualEpochs = 0;
 210
 211   for (; actualEpochs < epochs; ++actualEpochs) {
 212     for (unsigned int n = 1; n < (1U << 30); n *= 2) {
 213       auto const nsecs = fun(n);
 214       if (nsecs < minNanoseconds) {
 215         continue;
 216       }
 217       // We got an accurate enough timing, done. But only save if
 218       // smaller than the current result.
 219       epochResults[actualEpochs] = max(0.0, double(nsecs) / n - globalBaseline);
 220       // Done with the current epoch, we got a meaningful timing.
 221       break;
 222     }
 223     timespec now;
 224     CHECK_EQ(0, clock_gettime(CLOCK_REALTIME, &now));
 225     if (detail::timespecDiff(now, global) >= timeBudgetInNs) {
 226       // No more time budget available.
 227       ++actualEpochs;
 228       break;
 229     }
 230   }
 231
 232   // If the benchmark was basically drowned in baseline noise, it's
 233   // possible it became negative.
 234   return max(0.0, estimateTime(epochResults, epochResults + actualEpochs));
 235 }
 236
 237 static string humanReadable(double n, unsigned int decimals) {
 238   auto a = fabs(n);
 239   char suffix = ' ';
 240
 241   if (a >= 1E21) {
 242     // Too big to be comprehended by the puny human brain
 243     suffix = '!';
 244     n /= 1E21;
 245   } else if (a >= 1E18) {
 246     // "EXA" written with suffix 'X' so as to not create confusion
 247     // with scientific notation.
 248     suffix = 'X';
 249     n /= 1E18;
 250   } else if (a >= 1E15) {
 251     // "PETA"
 252     suffix = 'P';
 253     n /= 1E15;
 254   } else if (a >= 1E12) {
 255     // "TERA"
 256     suffix = 'T';
 257     n /= 1E12;
 258   } else if (a >= 1E9) {
 259     // "GIGA"
 260     suffix = 'G';
 261     n /= 1E9;
 262   } else if (a >= 1E6) {
 263     // "MEGA"
 264     suffix = 'M';
 265     n /= 1E6;
 266   } else if (a >= 1E3) {
 267     // "KILO"
 268     suffix = 'K';
 269     n /= 1E3;
 270   } else if (a == 0.0) {
 271     suffix = ' ';
 272   } else if (a < 1E-15) {
 273     // too small
 274     suffix = '?';
 275     n *= 1E18;
 276   } else if (a < 1E-12) {
 277     // "femto"
 278     suffix = 'f';
 279     n *= 1E15;
 280   } else if (a < 1E-9) {
 281     // "pico"
 282     suffix = 'p';
 283     n *= 1E12;
 284   } else if (a < 1E-6) {
 285     // "nano"
 286     suffix = 'n';
 287     n *= 1E9;
 288   } else if (a < 1E-3) {
 289     // "micro"
 290     suffix = 'u';
 291     n *= 1E6;
 292   } else if (a < 1) {
 293     // "mili"
 294     suffix = 'm';
 295     n *= 1E3;
 296   }
 297
 298   return stringPrintf("%*.*f%c", decimals + 3 + 1, decimals, n, suffix);
 299 }
 300
 301 static void printBenchmarkResultsAsTable(
 302   const vector<tuple<const char*, const char*, double> >& data) {
 303   // Width available
 304   static const uint columns = 76;
 305
 306   // Compute the longest benchmark name
 307   size_t longestName = 0;
 308   FOR_EACH_RANGE (i, 1, benchmarks.size()) {
 309     longestName = max(longestName, strlen(get<1>(benchmarks[i])));
 310   }
 311
 312   // Print a horizontal rule
 313   auto separator = [&](char pad) {
 314     puts(string(columns, pad).c_str());
 315   };
 316
 317   // Print header for a file
 318   auto header = [&](const char* file) {
 319     separator('=');
 320     printf("%-*srelative  ns/iter  iters/s\n",
 321            columns - 26, file);
 322     separator('=');
 323   };
 324
 325   double baselineNsPerIter = numeric_limits<double>::max();
 326   const char* lastFile = "";
 327
 328   for (auto& datum : data) {
 329     auto file = get<0>(datum);
 330     if (strcmp(file, lastFile)) {
 331       // New file starting
 332       header(file);
 333       lastFile = file;
 334     }
 335
 336     string s = get<1>(datum);
 337     if (s == "-") {
 338       separator('-');
 339       continue;
 340     }
 341     bool useBaseline /* = void */;
 342     if (s[0] == '%') {
 343       s.erase(0, 1);
 344       useBaseline = true;
 345     } else {
 346       baselineNsPerIter = get<2>(datum);
 347       useBaseline = false;
 348     }
 349     s.resize(columns - 27, ' ');
 350     auto nsPerIter = get<2>(datum);
 351     auto itersPerSec = 1E9 / nsPerIter;
 352     if (!useBaseline) {
 353       // Print without baseline
 354       printf("%*s           %s  %s\n",
 355              static_cast<int>(s.size()), s.c_str(),
 356              humanReadable(nsPerIter, 2).c_str(),
 357              humanReadable(itersPerSec, 2).c_str());
 358     } else {
 359       // Print with baseline
 360       auto rel = baselineNsPerIter / nsPerIter * 100.0;
 361       printf("%*s %7.2f%%  %s  %s\n",
 362              static_cast<int>(s.size()), s.c_str(),
 363              rel,
 364              humanReadable(nsPerIter, 2).c_str(),
 365              humanReadable(itersPerSec, 2).c_str());
 366     }
 367   }
 368   separator('=');
 369 }
 370
 371 static void printBenchmarkResultsAsJson(
 372   const vector<tuple<const char*, const char*, double> >& data) {
 373   dynamic d = dynamic::object;
 374   for (auto& datum: data) {
 375     d[std::get<1>(datum)] = std::get<2>(datum) * 1000.;
 376   }
 377
 378   printf("%s\n", toPrettyJson(d).c_str());
 379 }
 380
 381 static void printBenchmarkResults(
 382   const vector<tuple<const char*, const char*, double> >& data) {
 383
 384   if (FLAGS_json) {
 385     printBenchmarkResultsAsJson(data);
 386   } else {
 387     printBenchmarkResultsAsTable(data);
 388   }
 389 }
 390
 391 void runBenchmarks() {
 392   CHECK(!benchmarks.empty());
 393
 394   vector<tuple<const char*, const char*, double>> results;
 395   results.reserve(benchmarks.size() - 1);
 396
 397   // PLEASE KEEP QUIET. MEASUREMENTS IN PROGRESS.
 398
 399   auto const globalBaseline = runBenchmarkGetNSPerIteration(
 400     get<2>(benchmarks.front()), 0);
 401   FOR_EACH_RANGE (i, 1, benchmarks.size()) {
 402     auto elapsed = strcmp(get<1>(benchmarks[i]), "-") == 0
 403       ? 0.0 // skip the separators
 404       : runBenchmarkGetNSPerIteration(get<2>(benchmarks[i]),
 405                                       globalBaseline);
 406     results.emplace_back(get<0>(benchmarks[i]),
 407                          get<1>(benchmarks[i]), elapsed);
 408   }
 409
 410   // PLEASE MAKE NOISE. MEASUREMENTS DONE.
 411
 412   printBenchmarkResults(results);
 413 }
 414
 415 } // namespace folly