folly/Benchmark.h

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_BENCHMARK_H_
  18 #define FOLLY_BENCHMARK_H_
  19
  20 #include <folly/Portability.h>
  21 #include <folly/Preprocessor.h> // for FB_ANONYMOUS_VARIABLE
  22 #include <folly/ScopeGuard.h>
  23 #include <folly/portability/Time.h>
  24
  25 #include <cassert>
  26 #include <ctime>
  27 #include <boost/function_types/function_arity.hpp>
  28 #include <functional>
  29 #include <glog/logging.h>
  30 #include <gflags/gflags.h>
  31 #include <limits>
  32 #include <type_traits>
  33
  34 DECLARE_bool(benchmark);
  35
  36 namespace folly {
  37
   38 /**
   39  * Runs all benchmarks defined. Usually put in main().
       *
       * Only declared in this header; the definition is not visible here.
       * runBenchmarksOnFlag() below calls it when FLAGS_benchmark is set.
   40  */
   41 void runBenchmarks();
  42
  43 /**
  44  * Runs all benchmarks defined if and only if the --benchmark flag has
  45  * been passed to the program. Usually put in main().
  46  */
  47 inline bool runBenchmarksOnFlag() {
  48   if (FLAGS_benchmark) {
  49     runBenchmarks();
  50   }
  51   return FLAGS_benchmark;
  52 }
  53
  54 namespace detail {
  55
   56 /**
   57  * This is the clock ID used for measuring time. On older kernels, the
   58  * resolution of this clock will be very coarse, which will cause the
   59  * benchmarks to fail.
       *
       * DEFAULT_CLOCK_ID is set to CLOCK_REALTIME and is the clock id handed
       * to every clock_gettime() call in this header (BenchmarkSuspender and
       * addBenchmark), so all timestamps that are subtracted from one another
       * come from the same clock.
   60  */
   61 enum Clock { DEFAULT_CLOCK_ID = CLOCK_REALTIME };
  62
  63 typedef std::pair<uint64_t, unsigned int> TimeIterPair;
  64
   65 /**
   66  * Adds a benchmark wrapped in a std::function. Only used
   67  * internally. Pass by value is intentional.
       *
       * file: source file of the benchmark; the BENCHMARK macros pass __FILE__.
       * name: benchmark name string as assembled by the BENCHMARK macros.
       * third argument: callable that receives an iteration count and returns
       *   a TimeIterPair. The addBenchmark() wrapper in this header fills the
       *   pair with (elapsed nanoseconds minus BenchmarkSuspender::nsSpent,
       *   iterations reported by the benchmark).
       *
       * Only declared here; the definition is not visible in this header.
   68  */
   69 void addBenchmarkImpl(const char* file,
   70                       const char* name,
   71                       std::function<TimeIterPair(unsigned int)>);
  72
  73 /**
  74  * Takes the difference between two timespec values. end is assumed to
  75  * occur after start.
  76  */
  77 inline uint64_t timespecDiff(timespec end, timespec start) {
  78   if (end.tv_sec == start.tv_sec) {
  79     assert(end.tv_nsec >= start.tv_nsec);
  80     return end.tv_nsec - start.tv_nsec;
  81   }
  82   assert(end.tv_sec > start.tv_sec);
  83   auto diff = uint64_t(end.tv_sec - start.tv_sec);
  84   assert(diff <
  85          std::numeric_limits<uint64_t>::max() / 1000000000UL);
  86   return diff * 1000000000UL
  87     + end.tv_nsec - start.tv_nsec;
  88 }
  89
  90 /**
  91  * Takes the difference between two sets of timespec values. The first
  92  * two come from a high-resolution clock whereas the other two come
  93  * from a low-resolution clock. The crux of the matter is that
  94  * high-res values may be bogus as documented in
  95  * http://linux.die.net/man/3/clock_gettime. The trouble is when the
  96  * running process migrates from one CPU to another, which is more
  97  * likely for long-running processes. Therefore we watch for high
  98  * differences between the two timings.
  99  *
 100  * This function is subject to further improvements.
 101  */
 102 inline uint64_t timespecDiff(timespec end, timespec start,
 103                              timespec endCoarse, timespec startCoarse) {
 104   auto fine = timespecDiff(end, start);
 105   auto coarse = timespecDiff(endCoarse, startCoarse);
 106   if (coarse - fine >= 1000000) {
 107     // The fine time is in all likelihood bogus
 108     return coarse;
 109   }
 110   return fine;
 111 }
 112
 113 } // namespace detail
 114
 115 /**
 116  * Supporting type for BENCHMARK_SUSPEND defined below.
 117  */
 118 struct BenchmarkSuspender {
 119   BenchmarkSuspender() {
 120     CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &start));
 121   }
 122
 123   BenchmarkSuspender(const BenchmarkSuspender &) = delete;
 124   BenchmarkSuspender(BenchmarkSuspender && rhs) noexcept {
 125     start = rhs.start;
 126     rhs.start.tv_nsec = rhs.start.tv_sec = 0;
 127   }
 128
 129   BenchmarkSuspender& operator=(const BenchmarkSuspender &) = delete;
 130   BenchmarkSuspender& operator=(BenchmarkSuspender && rhs) {
 131     if (start.tv_nsec > 0 || start.tv_sec > 0) {
 132       tally();
 133     }
 134     start = rhs.start;
 135     rhs.start.tv_nsec = rhs.start.tv_sec = 0;
 136     return *this;
 137   }
 138
 139   ~BenchmarkSuspender() {
 140     if (start.tv_nsec > 0 || start.tv_sec > 0) {
 141       tally();
 142     }
 143   }
 144
 145   void dismiss() {
 146     assert(start.tv_nsec > 0 || start.tv_sec > 0);
 147     tally();
 148     start.tv_nsec = start.tv_sec = 0;
 149   }
 150
 151   void rehire() {
 152     assert(start.tv_nsec == 0 || start.tv_sec == 0);
 153     CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &start));
 154   }
 155
 156   template <class F>
 157   auto dismissing(F f) -> typename std::result_of<F()>::type {
 158     SCOPE_EXIT { rehire(); };
 159     dismiss();
 160     return f();
 161   }
 162
 163   /**
 164    * This is for use inside of if-conditions, used in BENCHMARK macros.
 165    * If-conditions bypass the explicit on operator bool.
 166    */
 167   explicit operator bool() const {
 168     return false;
 169   }
 170
 171   /**
 172    * Accumulates nanoseconds spent outside benchmark.
 173    */
 174   typedef uint64_t NanosecondsSpent;
 175   static NanosecondsSpent nsSpent;
 176
 177 private:
 178   void tally() {
 179     timespec end;
 180     CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &end));
 181     nsSpent += detail::timespecDiff(end, start);
 182     start = end;
 183   }
 184
 185   timespec start;
 186 };
 187
 188 /**
 189  * Adds a benchmark. Usually not called directly but instead through
 190  * the macro BENCHMARK defined below. The lambda function involved
 191  * must take exactly one parameter of type unsigned, and the benchmark
 192  * uses it with counter semantics (iteration occurs inside the
 193  * function).
 194  */
 195 template <typename Lambda>
 196 typename std::enable_if<
 197   boost::function_types::function_arity<decltype(&Lambda::operator())>::value
 198   == 2
 199 >::type
 200 addBenchmark(const char* file, const char* name, Lambda&& lambda) {
 201   auto execute = [=](unsigned int times) {
 202     BenchmarkSuspender::nsSpent = 0;
 203     timespec start, end;
 204     unsigned int niter;
 205
 206     // CORE MEASUREMENT STARTS
 207     auto const r1 = clock_gettime(detail::DEFAULT_CLOCK_ID, &start);
 208     niter = lambda(times);
 209     auto const r2 = clock_gettime(detail::DEFAULT_CLOCK_ID, &end);
 210     // CORE MEASUREMENT ENDS
 211
 212     CHECK_EQ(0, r1);
 213     CHECK_EQ(0, r2);
 214
 215     return detail::TimeIterPair(
 216       detail::timespecDiff(end, start) - BenchmarkSuspender::nsSpent,
 217       niter);
 218   };
 219
 220   detail::addBenchmarkImpl(file, name,
 221     std::function<detail::TimeIterPair(unsigned int)>(execute));
 222 }
 223
 224 /**
 225  * Adds a benchmark. Usually not called directly but instead through
 226  * the macro BENCHMARK defined below. The lambda function involved
 227  * must take zero parameters, and the benchmark calls it repeatedly
 228  * (iteration occurs outside the function).
 229  */
 230 template <typename Lambda>
 231 typename std::enable_if<
 232   boost::function_types::function_arity<decltype(&Lambda::operator())>::value
 233   == 1
 234 >::type
 235 addBenchmark(const char* file, const char* name, Lambda&& lambda) {
 236   addBenchmark(file, name, [=](unsigned int times) {
 237       unsigned int niter = 0;
 238       while (times-- > 0) {
 239         niter += lambda();
 240       }
 241       return niter;
 242     });
 243 }
 244
 245 /**
 246  * Call doNotOptimizeAway(var) against variables that you use for
 247  * benchmarking but otherwise are useless. The compiler tends to do a
 248  * good job at eliminating unused variables, and this function fools
 249  * it into thinking var is in fact needed.
 250  */
 251 #ifdef _MSC_VER
 252
 253 #pragma optimize("", off)
 254
 255 template <class T>
 256 void doNotOptimizeAway(T&& datum) {
 257   datum = datum;
 258 }
 259
 260 #pragma optimize("", on)
 261
 262 #elif defined(__clang__)
 263
 264 template <class T>
 265 __attribute__((__optnone__)) void doNotOptimizeAway(T&& /* datum */) {}
 266
 267 #else
 268
 269 template <class T>
 270 void doNotOptimizeAway(T&& datum) {
 271   asm volatile("" : "+r" (datum));
 272 }
 273
 274 #endif
 275
 276 } // namespace folly
 277
 278 /**
 279  * Introduces a benchmark function. Used internally, see BENCHMARK and
 280  * friends below.
 281  */
 282 #define BENCHMARK_IMPL(funName, stringName, rv, paramType, paramName)   \
 283   static void funName(paramType);                                       \
 284   static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = (           \
 285     ::folly::addBenchmark(__FILE__, stringName,                         \
 286       [](paramType paramName) -> unsigned { funName(paramName);         \
 287                                             return rv; }),              \
 288     true);                                                              \
 289   static void funName(paramType paramName)
 290
 291 /**
 292  * Introduces a benchmark function with support for returning the actual
 293  * number of iterations. Used internally, see BENCHMARK_MULTI and friends
 294  * below.
 295  */
 296 #define BENCHMARK_MULTI_IMPL(funName, stringName, paramType, paramName) \
 297   static unsigned funName(paramType);                                   \
 298   static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = (           \
 299     ::folly::addBenchmark(__FILE__, stringName,                         \
 300       [](paramType paramName) { return funName(paramName); }),          \
 301     true);                                                              \
 302   static unsigned funName(paramType paramName)
 303
 304 /**
 305  * Introduces a benchmark function. Use with either one or two arguments.
 306  * The first is the name of the benchmark. Use something descriptive, such
 307  * as insertVectorBegin. The second argument may be missing, or could be a
 308  * symbolic counter. The counter dictates how many internal iteration the
 309  * benchmark does. Example:
 310  *
 311  * BENCHMARK(vectorPushBack) {
 312  *   vector<int> v;
 313  *   v.push_back(42);
 314  * }
 315  *
 316  * BENCHMARK(insertVectorBegin, n) {
 317  *   vector<int> v;
 318  *   FOR_EACH_RANGE (i, 0, n) {
 319  *     v.insert(v.begin(), 42);
 320  *   }
 321  * }
 322  */
 323 #define BENCHMARK(name, ...)                                    \
 324   BENCHMARK_IMPL(                                               \
 325     name,                                                       \
 326     FB_STRINGIZE(name),                                         \
 327     FB_ARG_2_OR_1(1, ## __VA_ARGS__),                           \
 328     FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
 329     __VA_ARGS__)
 330
 331 /**
 332  * Like BENCHMARK above, but allows the user to return the actual
 333  * number of iterations executed in the function body. This can be
 334  * useful if the benchmark function doesn't know upfront how many
 335  * iterations it's going to run or if it runs through a certain
 336  * number of test cases, e.g.:
 337  *
 338  * BENCHMARK_MULTI(benchmarkSomething) {
 339  *   std::vector<int> testCases { 0, 1, 1, 2, 3, 5 };
 340  *   for (int c : testCases) {
 341  *     doSomething(c);
 342  *   }
 343  *   return testCases.size();
 344  * }
 345  */
 346 #define BENCHMARK_MULTI(name, ...)                              \
 347   BENCHMARK_MULTI_IMPL(                                         \
 348     name,                                                       \
 349     FB_STRINGIZE(name),                                         \
 350     FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
 351     __VA_ARGS__)
 352
 353 /**
 354  * Defines a benchmark that passes a parameter to another one. This is
 355  * common for benchmarks that need a "problem size" in addition to
 356  * "number of iterations". Consider:
 357  *
 358  * void pushBack(uint n, size_t initialSize) {
 359  *   vector<int> v;
 360  *   BENCHMARK_SUSPEND {
 361  *     v.resize(initialSize);
 362  *   }
 363  *   FOR_EACH_RANGE (i, 0, n) {
 364  *    v.push_back(i);
 365  *   }
 366  * }
 367  * BENCHMARK_PARAM(pushBack, 0)
 368  * BENCHMARK_PARAM(pushBack, 1000)
 369  * BENCHMARK_PARAM(pushBack, 1000000)
 370  *
 371  * The benchmark above estimates the speed of push_back at different
 372  * initial sizes of the vector. The framework will pass 0, 1000, and
 373  * 1000000 for initialSize, and the iteration count for n.
 374  */
 375 #define BENCHMARK_PARAM(name, param)                                    \
 376   BENCHMARK_NAMED_PARAM(name, param, param)
 377
 378 /**
 379  * Same as BENCHMARK_PARAM, but allows to return the actual number of
 380  * iterations that have been run.
 381  */
 382 #define BENCHMARK_PARAM_MULTI(name, param)                              \
 383   BENCHMARK_NAMED_PARAM_MULTI(name, param, param)
 384
 385 /*
 386  * Like BENCHMARK_PARAM(), but allows a custom name to be specified for each
 387  * parameter, rather than using the parameter value.
 388  *
 389  * Useful when the parameter value is not a valid token for string pasting,
 390  * of when you want to specify multiple parameter arguments.
 391  *
 392  * For example:
 393  *
 394  * void addValue(uint n, int64_t bucketSize, int64_t min, int64_t max) {
 395  *   Histogram<int64_t> hist(bucketSize, min, max);
 396  *   int64_t num = min;
 397  *   FOR_EACH_RANGE (i, 0, n) {
 398  *     hist.addValue(num);
 399  *     ++num;
 400  *     if (num > max) { num = min; }
 401  *   }
 402  * }
 403  *
 404  * BENCHMARK_NAMED_PARAM(addValue, 0_to_100, 1, 0, 100)
 405  * BENCHMARK_NAMED_PARAM(addValue, 0_to_1000, 10, 0, 1000)
 406  * BENCHMARK_NAMED_PARAM(addValue, 5k_to_20k, 250, 5000, 20000)
 407  */
 408 #define BENCHMARK_NAMED_PARAM(name, param_name, ...)                    \
 409   BENCHMARK_IMPL(                                                       \
 410       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
 411       FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",              \
 412       iters,                                                            \
 413       unsigned,                                                         \
 414       iters) {                                                          \
 415     name(iters, ## __VA_ARGS__);                                        \
 416   }
 417
 418 /**
 419  * Same as BENCHMARK_NAMED_PARAM, but allows to return the actual number
 420  * of iterations that have been run.
 421  */
 422 #define BENCHMARK_NAMED_PARAM_MULTI(name, param_name, ...)              \
 423   BENCHMARK_MULTI_IMPL(                                                 \
 424       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
 425       FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",              \
 426       unsigned,                                                         \
 427       iters) {                                                          \
 428     return name(iters, ## __VA_ARGS__);                                 \
 429   }
 430
 431 /**
 432  * Just like BENCHMARK, but prints the time relative to a
 433  * baseline. The baseline is the most recent BENCHMARK() seen in
 434  * lexical order. Example:
 435  *
 436  * // This is the baseline
 437  * BENCHMARK(insertVectorBegin, n) {
 438  *   vector<int> v;
 439  *   FOR_EACH_RANGE (i, 0, n) {
 440  *     v.insert(v.begin(), 42);
 441  *   }
 442  * }
 443  *
 444  * BENCHMARK_RELATIVE(insertListBegin, n) {
 445  *   list<int> s;
 446  *   FOR_EACH_RANGE (i, 0, n) {
 447  *     s.insert(s.begin(), 42);
 448  *   }
 449  * }
 450  *
 451  * Any number of relative benchmark can be associated with a
 452  * baseline. Another BENCHMARK() occurrence effectively establishes a
 453  * new baseline.
 454  */
 455 #define BENCHMARK_RELATIVE(name, ...)                           \
 456   BENCHMARK_IMPL(                                               \
 457     name,                                                       \
 458     "%" FB_STRINGIZE(name),                                     \
 459     FB_ARG_2_OR_1(1, ## __VA_ARGS__),                           \
 460     FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
 461     __VA_ARGS__)
 462
 463 /**
 464  * Same as BENCHMARK_RELATIVE, but allows to return the actual number
 465  * of iterations that have been run.
 466  */
 467 #define BENCHMARK_RELATIVE_MULTI(name, ...)                     \
 468   BENCHMARK_MULTI_IMPL(                                         \
 469     name,                                                       \
 470     "%" FB_STRINGIZE(name),                                     \
 471     FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
 472     __VA_ARGS__)
 473
 474 /**
 475  * A combination of BENCHMARK_RELATIVE and BENCHMARK_PARAM.
 476  */
 477 #define BENCHMARK_RELATIVE_PARAM(name, param)                           \
 478   BENCHMARK_RELATIVE_NAMED_PARAM(name, param, param)
 479
 480 /**
 481  * Same as BENCHMARK_RELATIVE_PARAM, but allows to return the actual
 482  * number of iterations that have been run.
 483  */
 484 #define BENCHMARK_RELATIVE_PARAM_MULTI(name, param)                     \
 485   BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param, param)
 486
 487 /**
 488  * A combination of BENCHMARK_RELATIVE and BENCHMARK_NAMED_PARAM.
 489  */
 490 #define BENCHMARK_RELATIVE_NAMED_PARAM(name, param_name, ...)           \
 491   BENCHMARK_IMPL(                                                       \
 492       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
 493       "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",          \
 494       iters,                                                            \
 495       unsigned,                                                         \
 496       iters) {                                                          \
 497     name(iters, ## __VA_ARGS__);                                        \
 498   }
 499
 500 /**
 501  * Same as BENCHMARK_RELATIVE_NAMED_PARAM, but allows to return the
 502  * actual number of iterations that have been run.
 503  */
 504 #define BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param_name, ...)     \
 505   BENCHMARK_MULTI_IMPL(                                                 \
 506       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
 507       "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",          \
 508       unsigned,                                                         \
 509       iters) {                                                          \
 510     return name(iters, ## __VA_ARGS__);                                 \
 511   }
 512
 513 /**
 514  * Draws a line of dashes.
 515  */
 516 #define BENCHMARK_DRAW_LINE()                                             \
 517   static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = (             \
 518     ::folly::addBenchmark(__FILE__, "-", []() -> unsigned { return 0; }), \
 519     true);
 520
 521 /**
 522  * Allows execution of code that doesn't count torward the benchmark's
 523  * time budget. Example:
 524  *
 525  * BENCHMARK_START_GROUP(insertVectorBegin, n) {
 526  *   vector<int> v;
 527  *   BENCHMARK_SUSPEND {
 528  *     v.reserve(n);
 529  *   }
 530  *   FOR_EACH_RANGE (i, 0, n) {
 531  *     v.insert(v.begin(), 42);
 532  *   }
 533  * }
 534  */
 535 #define BENCHMARK_SUSPEND                               \
 536   if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) =   \
 537       ::folly::BenchmarkSuspender()) {}                 \
 538   else
 539
 540 #endif // FOLLY_BENCHMARK_H_