folly/Benchmark.h

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <folly/Portability.h>
  20 #include <folly/Preprocessor.h> // for FB_ANONYMOUS_VARIABLE
  21 #include <folly/ScopeGuard.h>
  22 #include <folly/Traits.h>
  23 #include <folly/portability/GFlags.h>
  24 #include <folly/portability/Time.h>
  25
  26 #include <cassert>
  27 #include <ctime>
  28 #include <boost/function_types/function_arity.hpp>
  29 #include <functional>
  30 #include <glog/logging.h>
  31 #include <limits>
  32 #include <type_traits>
  33
  34 DECLARE_bool(benchmark);
  35
  36 namespace folly {
  37
  38 /**
  39  * Runs all benchmarks defined. Usually put in main().
  40  */
  41 void runBenchmarks();
  42
  43 /**
  44  * Runs all benchmarks defined if and only if the --benchmark flag has
  45  * been passed to the program. Usually put in main().
  46  */
  47 inline bool runBenchmarksOnFlag() {
  48   if (FLAGS_benchmark) {
  49     runBenchmarks();
  50   }
  51   return FLAGS_benchmark;
  52 }
  53
  54 namespace detail {
  55
  56 typedef std::pair<uint64_t, unsigned int> TimeIterPair;
  57
  58 /**
  59  * Adds a benchmark wrapped in a std::function. Only used
  60  * internally. Pass by value is intentional.
  61  */
  62 void addBenchmarkImpl(const char* file,
  63                       const char* name,
  64                       std::function<TimeIterPair(unsigned int)>);
  65
  66 /**
  67  * Takes the difference between two timespec values. end is assumed to
  68  * occur after start.
  69  */
  70 inline uint64_t timespecDiff(timespec end, timespec start) {
  71   if (end.tv_sec == start.tv_sec) {
  72     assert(end.tv_nsec >= start.tv_nsec);
  73     return end.tv_nsec - start.tv_nsec;
  74   }
  75   assert(end.tv_sec > start.tv_sec);
  76   auto diff = uint64_t(end.tv_sec - start.tv_sec);
  77   assert(diff <
  78          std::numeric_limits<uint64_t>::max() / 1000000000UL);
  79   return diff * 1000000000UL
  80     + end.tv_nsec - start.tv_nsec;
  81 }
  82
  83 /**
  84  * Takes the difference between two sets of timespec values. The first
  85  * two come from a high-resolution clock whereas the other two come
  86  * from a low-resolution clock. The crux of the matter is that
  87  * high-res values may be bogus as documented in
  88  * http://linux.die.net/man/3/clock_gettime. The trouble is when the
  89  * running process migrates from one CPU to another, which is more
  90  * likely for long-running processes. Therefore we watch for high
  91  * differences between the two timings.
  92  *
  93  * This function is subject to further improvements.
  94  */
  95 inline uint64_t timespecDiff(timespec end, timespec start,
  96                              timespec endCoarse, timespec startCoarse) {
  97   auto fine = timespecDiff(end, start);
  98   auto coarse = timespecDiff(endCoarse, startCoarse);
  99   if (coarse - fine >= 1000000) {
 100     // The fine time is in all likelihood bogus
 101     return coarse;
 102   }
 103   return fine;
 104 }
 105
 106 } // namespace detail
 107
 108 /**
 109  * Supporting type for BENCHMARK_SUSPEND defined below.
 110  */
 111 struct BenchmarkSuspender {
 112   BenchmarkSuspender() {
 113     CHECK_EQ(0, clock_gettime(CLOCK_REALTIME, &start));
 114   }
 115
 116   BenchmarkSuspender(const BenchmarkSuspender &) = delete;
 117   BenchmarkSuspender(BenchmarkSuspender && rhs) noexcept {
 118     start = rhs.start;
 119     rhs.start = {0, 0};
 120   }
 121
 122   BenchmarkSuspender& operator=(const BenchmarkSuspender &) = delete;
 123   BenchmarkSuspender& operator=(BenchmarkSuspender && rhs) {
 124     if (start.tv_nsec > 0 || start.tv_sec > 0) {
 125       tally();
 126     }
 127     start = rhs.start;
 128     rhs.start = {0, 0};
 129     return *this;
 130   }
 131
 132   ~BenchmarkSuspender() {
 133     if (start.tv_nsec > 0 || start.tv_sec > 0) {
 134       tally();
 135     }
 136   }
 137
 138   void dismiss() {
 139     assert(start.tv_nsec > 0 || start.tv_sec > 0);
 140     tally();
 141     start = {0, 0};
 142   }
 143
 144   void rehire() {
 145     assert(start.tv_nsec == 0 || start.tv_sec == 0);
 146     CHECK_EQ(0, clock_gettime(CLOCK_REALTIME, &start));
 147   }
 148
 149   template <class F>
 150   auto dismissing(F f) -> typename std::result_of<F()>::type {
 151     SCOPE_EXIT { rehire(); };
 152     dismiss();
 153     return f();
 154   }
 155
 156   /**
 157    * This is for use inside of if-conditions, used in BENCHMARK macros.
 158    * If-conditions bypass the explicit on operator bool.
 159    */
 160   explicit operator bool() const {
 161     return false;
 162   }
 163
 164   /**
 165    * Accumulates nanoseconds spent outside benchmark.
 166    */
 167   typedef uint64_t NanosecondsSpent;
 168   static NanosecondsSpent nsSpent;
 169
 170 private:
 171   void tally() {
 172     timespec end;
 173     CHECK_EQ(0, clock_gettime(CLOCK_REALTIME, &end));
 174     nsSpent += detail::timespecDiff(end, start);
 175     start = end;
 176   }
 177
 178   timespec start;
 179 };
 180
 181 /**
 182  * Adds a benchmark. Usually not called directly but instead through
 183  * the macro BENCHMARK defined below. The lambda function involved
 184  * must take exactly one parameter of type unsigned, and the benchmark
 185  * uses it with counter semantics (iteration occurs inside the
 186  * function).
 187  */
 188 template <typename Lambda>
 189 typename std::enable_if<
 190   boost::function_types::function_arity<decltype(&Lambda::operator())>::value
 191   == 2
 192 >::type
 193 addBenchmark(const char* file, const char* name, Lambda&& lambda) {
 194   auto execute = [=](unsigned int times) {
 195     BenchmarkSuspender::nsSpent = 0;
 196     timespec start, end;
 197     unsigned int niter;
 198
 199     // CORE MEASUREMENT STARTS
 200     auto const r1 = clock_gettime(CLOCK_REALTIME, &start);
 201     niter = lambda(times);
 202     auto const r2 = clock_gettime(CLOCK_REALTIME, &end);
 203     // CORE MEASUREMENT ENDS
 204
 205     CHECK_EQ(0, r1);
 206     CHECK_EQ(0, r2);
 207
 208     return detail::TimeIterPair(
 209       detail::timespecDiff(end, start) - BenchmarkSuspender::nsSpent,
 210       niter);
 211   };
 212
 213   detail::addBenchmarkImpl(file, name,
 214     std::function<detail::TimeIterPair(unsigned int)>(execute));
 215 }
 216
 217 /**
 218  * Adds a benchmark. Usually not called directly but instead through
 219  * the macro BENCHMARK defined below. The lambda function involved
 220  * must take zero parameters, and the benchmark calls it repeatedly
 221  * (iteration occurs outside the function).
 222  */
 223 template <typename Lambda>
 224 typename std::enable_if<
 225   boost::function_types::function_arity<decltype(&Lambda::operator())>::value
 226   == 1
 227 >::type
 228 addBenchmark(const char* file, const char* name, Lambda&& lambda) {
 229   addBenchmark(file, name, [=](unsigned int times) {
 230       unsigned int niter = 0;
 231       while (times-- > 0) {
 232         niter += lambda();
 233       }
 234       return niter;
 235     });
 236 }
 237
 238 /**
 239  * Call doNotOptimizeAway(var) to ensure that var will be computed even
 240  * post-optimization.  Use it for variables that are computed during
 241  * benchmarking but otherwise are useless. The compiler tends to do a
 242  * good job at eliminating unused variables, and this function fools it
 243  * into thinking var is in fact needed.
 244  *
 245  * Call makeUnpredictable(var) when you don't want the optimizer to use
 246  * its knowledge of var to shape the following code.  This is useful
 247  * when constant propagation or power reduction is possible during your
 248  * benchmark but not in real use cases.
 249  */
 250
 251 #ifdef _MSC_VER
 252
 253 #pragma optimize("", off)
 254
 255 inline void doNotOptimizeDependencySink(const void*) {}
 256
 257 #pragma optimize("", on)
 258
 259 template <class T>
 260 void doNotOptimizeAway(const T& datum) {
 261   doNotOptimizeDependencySink(&datum);
 262 }
 263
 264 template <typename T>
 265 void makeUnpredictable(T& datum) {
 266   doNotOptimizeDependencySink(&datum);
 267 }
 268
 269 #else
 270
 271 namespace detail {
 272 template <typename T>
 273 struct DoNotOptimizeAwayNeedsIndirect {
 274   using Decayed = typename std::decay<T>::type;
 275
 276   // First two constraints ensure it can be an "r" operand.
 277   // std::is_pointer check is because callers seem to expect that
 278   // doNotOptimizeAway(&x) is equivalent to doNotOptimizeAway(x).
 279   constexpr static bool value = !folly::IsTriviallyCopyable<Decayed>::value ||
 280       sizeof(Decayed) > sizeof(long) || std::is_pointer<Decayed>::value;
 281 };
 282 } // detail namespace
 283
 284 template <typename T>
 285 auto doNotOptimizeAway(const T& datum) -> typename std::enable_if<
 286     !detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
 287   // The "r" constraint forces the compiler to make datum available
 288   // in a register to the asm block, which means that it must have
 289   // computed/loaded it.  We use this path for things that are <=
 290   // sizeof(long) (they have to fit), trivial (otherwise the compiler
 291   // doesn't want to put them in a register), and not a pointer (because
 292   // doNotOptimizeAway(&foo) would otherwise be a foot gun that didn't
 293   // necessarily compute foo).
 294   //
 295   // An earlier version of this method had a more permissive input operand
 296   // constraint, but that caused unnecessary variation between clang and
 297   // gcc benchmarks.
 298   asm volatile("" ::"r"(datum));
 299 }
 300
 301 template <typename T>
 302 auto doNotOptimizeAway(const T& datum) -> typename std::enable_if<
 303     detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
 304   // This version of doNotOptimizeAway tells the compiler that the asm
 305   // block will read datum from memory, and that in addition it might read
 306   // or write from any memory location.  If the memory clobber could be
 307   // separated into input and output that would be preferrable.
 308   asm volatile("" ::"m"(datum) : "memory");
 309 }
 310
 311 template <typename T>
 312 auto makeUnpredictable(T& datum) -> typename std::enable_if<
 313     !detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
 314   asm volatile("" : "+r"(datum));
 315 }
 316
 317 template <typename T>
 318 auto makeUnpredictable(T& datum) -> typename std::enable_if<
 319     detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
 320   asm volatile("" ::"m"(datum) : "memory");
 321 }
 322
 323 #endif
 324
 325 } // namespace folly
 326
 327 /**
 328  * Introduces a benchmark function. Used internally, see BENCHMARK and
 329  * friends below.
 330  */
 331 #define BENCHMARK_IMPL(funName, stringName, rv, paramType, paramName)   \
 332   static void funName(paramType);                                       \
 333   static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = (           \
 334     ::folly::addBenchmark(__FILE__, stringName,                         \
 335       [](paramType paramName) -> unsigned { funName(paramName);         \
 336                                             return rv; }),              \
 337     true);                                                              \
 338   static void funName(paramType paramName)
 339
 340 /**
 341  * Introduces a benchmark function with support for returning the actual
 342  * number of iterations. Used internally, see BENCHMARK_MULTI and friends
 343  * below.
 344  */
 345 #define BENCHMARK_MULTI_IMPL(funName, stringName, paramType, paramName) \
 346   static unsigned funName(paramType);                                   \
 347   static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = (           \
 348     ::folly::addBenchmark(__FILE__, stringName,                         \
 349       [](paramType paramName) { return funName(paramName); }),          \
 350     true);                                                              \
 351   static unsigned funName(paramType paramName)
 352
 353 /**
 354  * Introduces a benchmark function. Use with either one or two arguments.
 355  * The first is the name of the benchmark. Use something descriptive, such
 356  * as insertVectorBegin. The second argument may be missing, or could be a
 357  * symbolic counter. The counter dictates how many internal iteration the
 358  * benchmark does. Example:
 359  *
 360  * BENCHMARK(vectorPushBack) {
 361  *   vector<int> v;
 362  *   v.push_back(42);
 363  * }
 364  *
 365  * BENCHMARK(insertVectorBegin, n) {
 366  *   vector<int> v;
 367  *   FOR_EACH_RANGE (i, 0, n) {
 368  *     v.insert(v.begin(), 42);
 369  *   }
 370  * }
 371  */
 372 #define BENCHMARK(name, ...)                                    \
 373   BENCHMARK_IMPL(                                               \
 374     name,                                                       \
 375     FB_STRINGIZE(name),                                         \
 376     FB_ARG_2_OR_1(1, ## __VA_ARGS__),                           \
 377     FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
 378     __VA_ARGS__)
 379
 380 /**
 381  * Like BENCHMARK above, but allows the user to return the actual
 382  * number of iterations executed in the function body. This can be
 383  * useful if the benchmark function doesn't know upfront how many
 384  * iterations it's going to run or if it runs through a certain
 385  * number of test cases, e.g.:
 386  *
 387  * BENCHMARK_MULTI(benchmarkSomething) {
 388  *   std::vector<int> testCases { 0, 1, 1, 2, 3, 5 };
 389  *   for (int c : testCases) {
 390  *     doSomething(c);
 391  *   }
 392  *   return testCases.size();
 393  * }
 394  */
 395 #define BENCHMARK_MULTI(name, ...)                              \
 396   BENCHMARK_MULTI_IMPL(                                         \
 397     name,                                                       \
 398     FB_STRINGIZE(name),                                         \
 399     FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
 400     __VA_ARGS__)
 401
 402 /**
 403  * Defines a benchmark that passes a parameter to another one. This is
 404  * common for benchmarks that need a "problem size" in addition to
 405  * "number of iterations". Consider:
 406  *
 407  * void pushBack(uint n, size_t initialSize) {
 408  *   vector<int> v;
 409  *   BENCHMARK_SUSPEND {
 410  *     v.resize(initialSize);
 411  *   }
 412  *   FOR_EACH_RANGE (i, 0, n) {
 413  *    v.push_back(i);
 414  *   }
 415  * }
 416  * BENCHMARK_PARAM(pushBack, 0)
 417  * BENCHMARK_PARAM(pushBack, 1000)
 418  * BENCHMARK_PARAM(pushBack, 1000000)
 419  *
 420  * The benchmark above estimates the speed of push_back at different
 421  * initial sizes of the vector. The framework will pass 0, 1000, and
 422  * 1000000 for initialSize, and the iteration count for n.
 423  */
 424 #define BENCHMARK_PARAM(name, param)                                    \
 425   BENCHMARK_NAMED_PARAM(name, param, param)
 426
 427 /**
 428  * Same as BENCHMARK_PARAM, but allows one to return the actual number of
 429  * iterations that have been run.
 430  */
 431 #define BENCHMARK_PARAM_MULTI(name, param)                              \
 432   BENCHMARK_NAMED_PARAM_MULTI(name, param, param)
 433
 434 /*
 435  * Like BENCHMARK_PARAM(), but allows a custom name to be specified for each
 436  * parameter, rather than using the parameter value.
 437  *
 438  * Useful when the parameter value is not a valid token for string pasting,
 439  * of when you want to specify multiple parameter arguments.
 440  *
 441  * For example:
 442  *
 443  * void addValue(uint n, int64_t bucketSize, int64_t min, int64_t max) {
 444  *   Histogram<int64_t> hist(bucketSize, min, max);
 445  *   int64_t num = min;
 446  *   FOR_EACH_RANGE (i, 0, n) {
 447  *     hist.addValue(num);
 448  *     ++num;
 449  *     if (num > max) { num = min; }
 450  *   }
 451  * }
 452  *
 453  * BENCHMARK_NAMED_PARAM(addValue, 0_to_100, 1, 0, 100)
 454  * BENCHMARK_NAMED_PARAM(addValue, 0_to_1000, 10, 0, 1000)
 455  * BENCHMARK_NAMED_PARAM(addValue, 5k_to_20k, 250, 5000, 20000)
 456  */
 457 #define BENCHMARK_NAMED_PARAM(name, param_name, ...)                    \
 458   BENCHMARK_IMPL(                                                       \
 459       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
 460       FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",              \
 461       iters,                                                            \
 462       unsigned,                                                         \
 463       iters) {                                                          \
 464     name(iters, ## __VA_ARGS__);                                        \
 465   }
 466
 467 /**
 468  * Same as BENCHMARK_NAMED_PARAM, but allows one to return the actual number
 469  * of iterations that have been run.
 470  */
 471 #define BENCHMARK_NAMED_PARAM_MULTI(name, param_name, ...)              \
 472   BENCHMARK_MULTI_IMPL(                                                 \
 473       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
 474       FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",              \
 475       unsigned,                                                         \
 476       iters) {                                                          \
 477     return name(iters, ## __VA_ARGS__);                                 \
 478   }
 479
 480 /**
 481  * Just like BENCHMARK, but prints the time relative to a
 482  * baseline. The baseline is the most recent BENCHMARK() seen in
 483  * the current scope. Example:
 484  *
 485  * // This is the baseline
 486  * BENCHMARK(insertVectorBegin, n) {
 487  *   vector<int> v;
 488  *   FOR_EACH_RANGE (i, 0, n) {
 489  *     v.insert(v.begin(), 42);
 490  *   }
 491  * }
 492  *
 493  * BENCHMARK_RELATIVE(insertListBegin, n) {
 494  *   list<int> s;
 495  *   FOR_EACH_RANGE (i, 0, n) {
 496  *     s.insert(s.begin(), 42);
 497  *   }
 498  * }
 499  *
 500  * Any number of relative benchmark can be associated with a
 501  * baseline. Another BENCHMARK() occurrence effectively establishes a
 502  * new baseline.
 503  */
 504 #define BENCHMARK_RELATIVE(name, ...)                           \
 505   BENCHMARK_IMPL(                                               \
 506     name,                                                       \
 507     "%" FB_STRINGIZE(name),                                     \
 508     FB_ARG_2_OR_1(1, ## __VA_ARGS__),                           \
 509     FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
 510     __VA_ARGS__)
 511
 512 /**
 513  * Same as BENCHMARK_RELATIVE, but allows one to return the actual number
 514  * of iterations that have been run.
 515  */
 516 #define BENCHMARK_RELATIVE_MULTI(name, ...)                     \
 517   BENCHMARK_MULTI_IMPL(                                         \
 518     name,                                                       \
 519     "%" FB_STRINGIZE(name),                                     \
 520     FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
 521     __VA_ARGS__)
 522
 523 /**
 524  * A combination of BENCHMARK_RELATIVE and BENCHMARK_PARAM.
 525  */
 526 #define BENCHMARK_RELATIVE_PARAM(name, param)                           \
 527   BENCHMARK_RELATIVE_NAMED_PARAM(name, param, param)
 528
 529 /**
 530  * Same as BENCHMARK_RELATIVE_PARAM, but allows one to return the actual
 531  * number of iterations that have been run.
 532  */
 533 #define BENCHMARK_RELATIVE_PARAM_MULTI(name, param)                     \
 534   BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param, param)
 535
 536 /**
 537  * A combination of BENCHMARK_RELATIVE and BENCHMARK_NAMED_PARAM.
 538  */
 539 #define BENCHMARK_RELATIVE_NAMED_PARAM(name, param_name, ...)           \
 540   BENCHMARK_IMPL(                                                       \
 541       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
 542       "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",          \
 543       iters,                                                            \
 544       unsigned,                                                         \
 545       iters) {                                                          \
 546     name(iters, ## __VA_ARGS__);                                        \
 547   }
 548
 549 /**
 550  * Same as BENCHMARK_RELATIVE_NAMED_PARAM, but allows one to return the
 551  * actual number of iterations that have been run.
 552  */
 553 #define BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param_name, ...)     \
 554   BENCHMARK_MULTI_IMPL(                                                 \
 555       FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),              \
 556       "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")",          \
 557       unsigned,                                                         \
 558       iters) {                                                          \
 559     return name(iters, ## __VA_ARGS__);                                 \
 560   }
 561
 562 /**
 563  * Draws a line of dashes.
 564  */
 565 #define BENCHMARK_DRAW_LINE()                                             \
 566   static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = (             \
 567     ::folly::addBenchmark(__FILE__, "-", []() -> unsigned { return 0; }), \
 568     true);
 569
 570 /**
 571  * Allows execution of code that doesn't count torward the benchmark's
 572  * time budget. Example:
 573  *
 574  * BENCHMARK_START_GROUP(insertVectorBegin, n) {
 575  *   vector<int> v;
 576  *   BENCHMARK_SUSPEND {
 577  *     v.reserve(n);
 578  *   }
 579  *   FOR_EACH_RANGE (i, 0, n) {
 580  *     v.insert(v.begin(), 42);
 581  *   }
 582  * }
 583  */
 584 #define BENCHMARK_SUSPEND                               \
 585   if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) =   \
 586       ::folly::BenchmarkSuspender()) {}                 \
 587   else