folly/Benchmark.h

   1 /*
   2  * Copyright 2012 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_BENCHMARK_H_
  18 #define FOLLY_BENCHMARK_H_
  19
  20 #include "folly/Preprocessor.h" // for FB_ANONYMOUS_VARIABLE
  21 #include <cassert>
  22 #include <ctime>
  23 #include <boost/function_types/function_arity.hpp>
  24 #include <functional>
  25 #include <glog/logging.h>
  26 #include <gflags/gflags.h>
  27 #include <limits>
  28
  29 DECLARE_bool(benchmark);
  30
  31
  32 namespace folly {
  33
  34 /**
  35  * Runs all benchmarks defined. Usually put in main().
  36  */
  37 void runBenchmarks();
  38
  39 /**
  40  * Runs all benchmarks defined if and only if the --benchmark flag has
  41  * been passed to the program. Usually put in main().
  42  */
  43 inline bool runBenchmarksOnFlag() {
  44   if (FLAGS_benchmark) {
  45     runBenchmarks();
  46   }
  47   return FLAGS_benchmark;
  48 }
  49
  50 namespace detail {
  51
  52 /**
  53  * This is the clock ID used for measuring time. On older kernels, the
  54  * resolution of this clock will be very coarse, which will cause the
  55  * benchmarks to fail.
  56  */
  57 enum Clock { DEFAULT_CLOCK_ID = CLOCK_REALTIME };
  58
  59 /**
  60  * Adds a benchmark wrapped in a std::function. Only used
  61  * internally. Pass by value is intentional.
  62  */
  63 void addBenchmarkImpl(const char* file,
  64                       const char* name,
  65                       std::function<uint64_t(unsigned int)>);
  66
  67 /**
  68  * Takes the difference between two timespec values. end is assumed to
  69  * occur after start.
  70  */
  71 inline uint64_t timespecDiff(timespec end, timespec start) {
  72   if (end.tv_sec == start.tv_sec) {
  73     assert(end.tv_nsec >= start.tv_nsec);
  74     return end.tv_nsec - start.tv_nsec;
  75   }
  76   assert(end.tv_sec > start.tv_sec &&
  77          end.tv_sec - start.tv_sec <
  78          std::numeric_limits<uint64_t>::max() / 1000000000UL);
  79   return (end.tv_sec - start.tv_sec) * 1000000000UL
  80     + end.tv_nsec - start.tv_nsec;
  81 }
  82
  83 /**
  84  * Takes the difference between two sets of timespec values. The first
  85  * two come from a high-resolution clock whereas the other two come
  86  * from a low-resolution clock. The crux of the matter is that
  87  * high-res values may be bogus as documented in
  88  * http://linux.die.net/man/3/clock_gettime. The trouble is when the
  89  * running process migrates from one CPU to another, which is more
  90  * likely for long-running processes. Therefore we watch for high
  91  * differences between the two timings.
  92  *
  93  * This function is subject to further improvements.
  94  */
  95 inline uint64_t timespecDiff(timespec end, timespec start,
  96                              timespec endCoarse, timespec startCoarse) {
  97   auto fine = timespecDiff(end, start);
  98   auto coarse = timespecDiff(endCoarse, startCoarse);
  99   if (coarse - fine >= 1000000) {
 100     // The fine time is in all likelihood bogus
 101     return coarse;
 102   }
 103   return fine;
 104 }
 105
 106 } // namespace detail
 107
 108 /**
 109  * Supporting type for BENCHMARK_SUSPEND defined below.
 110  */
 111 struct BenchmarkSuspender {
 112   BenchmarkSuspender() {
 113     CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &start));
 114   }
 115
 116   BenchmarkSuspender(const BenchmarkSuspender &) = delete;
 117   BenchmarkSuspender(BenchmarkSuspender && rhs) {
 118     start = rhs.start;
 119     rhs.start.tv_nsec = rhs.start.tv_sec = 0;
 120   }
 121
 122   BenchmarkSuspender& operator=(const BenchmarkSuspender &) = delete;
 123   BenchmarkSuspender& operator=(BenchmarkSuspender && rhs) {
 124     if (start.tv_nsec > 0 || start.tv_sec > 0) {
 125       tally();
 126     }
 127     start = rhs.start;
 128     rhs.start.tv_nsec = rhs.start.tv_sec = 0;
 129     return *this;
 130   }
 131
 132   ~BenchmarkSuspender() {
 133     if (start.tv_nsec > 0 || start.tv_sec > 0) {
 134       tally();
 135     }
 136   }
 137
 138   void dismiss() {
 139     assert(start.tv_nsec > 0 || start.tv_sec > 0);
 140     tally();
 141     start.tv_nsec = start.tv_sec = 0;
 142   }
 143
 144   void rehire() {
 145     assert(start.tv_nsec == 0 || start.tv_sec == 0);
 146     CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &start));
 147   }
 148
 149   /**
 150    * This helps the macro definition. To get around the dangers of
 151    * operator bool, returns a pointer to member (which allows no
 152    * arithmetic).
 153    */
 154   operator int BenchmarkSuspender::*() const {
 155     return nullptr;
 156   }
 157
 158   /**
 159    * Accumulates nanoseconds spent outside benchmark.
 160    */
 161   typedef uint64_t NanosecondsSpent;
 162   static NanosecondsSpent nsSpent;
 163
 164 private:
 165   void tally() {
 166     timespec end;
 167     CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &end));
 168     nsSpent += detail::timespecDiff(end, start);
 169     start = end;
 170   }
 171
 172   timespec start;
 173 };
 174
 175 /**
 176  * Adds a benchmark. Usually not called directly but instead through
 177  * the macro BENCHMARK defined below. The lambda function involved
 178  * must take exactly one parameter of type unsigned, and the benchmark
 179  * uses it with counter semantics (iteration occurs inside the
 180  * function).
 181  */
 182 template <typename Lambda>
 183 typename std::enable_if<
 184   boost::function_types::function_arity<decltype(&Lambda::operator())>::value
 185   == 2
 186 >::type
 187 addBenchmark(const char* file, const char* name, Lambda&& lambda) {
 188   auto execute = [=](unsigned int times) {
 189     BenchmarkSuspender::nsSpent = 0;
 190     timespec start, end;
 191
 192     // CORE MEASUREMENT STARTS
 193     CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &start));
 194     lambda(times);
 195     CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &end));
 196     // CORE MEASUREMENT ENDS
 197
 198     return detail::timespecDiff(end, start) - BenchmarkSuspender::nsSpent;
 199   };
 200
 201   detail::addBenchmarkImpl(file, name,
 202                            std::function<uint64_t(unsigned int)>(execute));
 203 }
 204
 205 /**
 206  * Adds a benchmark. Usually not called directly but instead through
 207  * the macro BENCHMARK defined below. The lambda function involved
 208  * must take zero parameters, and the benchmark calls it repeatedly
 209  * (iteration occurs outside the function).
 210  */
 211 template <typename Lambda>
 212 typename std::enable_if<
 213   boost::function_types::function_arity<decltype(&Lambda::operator())>::value
 214   == 1
 215 >::type
 216 addBenchmark(const char* file, const char* name, Lambda&& lambda) {
 217   addBenchmark(file, name, [=](unsigned int times) {
 218       while (times-- > 0) {
 219         lambda();
 220       }
 221     });
 222 }
 223
 224 /**
 225  * Call doNotOptimizeAway(var) against variables that you use for
 226  * benchmarking but otherwise are useless. The compiler tends to do a
 227  * good job at eliminating unused variables, and this function fools
 228  * it into thinking var is in fact needed.
 229  */
 230 template <class T>
 231 void doNotOptimizeAway(T&& datum) {
 232   asm volatile("" : "+r" (datum));
 233 }
 234
 235 } // namespace folly
 236
 237 /**
 238  * Introduces a benchmark function. Used internally, see BENCHMARK and
 239  * friends below.
 240  */
 241 #define BENCHMARK_IMPL(funName, stringName, paramType, paramName)       \
 242   static void funName(paramType);                                       \
 243   static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = (           \
 244     ::folly::addBenchmark(__FILE__, stringName,                         \
 245       [](paramType paramName) { funName(paramName); }),                 \
 246     true);                                                              \
 247   static void funName(paramType paramName)
 248
 249 /**
 250  * Introduces a benchmark function. Use with either one one or two
 251  * arguments. The first is the name of the benchmark. Use something
 252  * descriptive, such as insertVectorBegin. The second argument may be
 253  * missing, or could be a symbolic counter. The counter dictates how
 254  * many internal iteration the benchmark does. Example:
 255  *
 256  * BENCHMARK(vectorPushBack) {
 257  *   vector<int> v;
 258  *   v.push_back(42);
 259  * }
 260  *
 261  * BENCHMARK(insertVectorBegin, n) {
 262  *   vector<int> v;
 263  *   FOR_EACH_RANGE (i, 0, n) {
 264  *     v.insert(v.begin(), 42);
 265  *   }
 266  * }
 267  */
 268 #define BENCHMARK(name, ...)                                    \
 269   BENCHMARK_IMPL(                                               \
 270     name,                                                       \
 271     FB_STRINGIZE(name),                                         \
 272     FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
 273     __VA_ARGS__)
 274
 275 /**
 276  * Defines a benchmark that passes a parameter to another one. This is
 277  * common for benchmarks that need a "problem size" in addition to
 278  * "number of iterations". Consider:
 279  *
 280  * void pushBack(uint n, size_t initialSize) {
 281  *   vector<int> v;
 282  *   BENCHMARK_SUSPEND {
 283  *     v.resize(initialSize);
 284  *   }
 285  *   FOR_EACH_RANGE (i, 0, n) {
 286  *    v.push_back(i);
 287  *   }
 288  * }
 289  * BENCHMARK_PARAM(pushBack, 0)
 290  * BENCHMARK_PARAM(pushBack, 1000)
 291  * BENCHMARK_PARAM(pushBack, 1000000)
 292  *
 293  * The benchmark above estimates the speed of push_back at different
 294  * initial sizes of the vector. The framework will pass 0, 1000, and
 295  * 1000000 for initialSize, and the iteration count for n.
 296  */
 297 #define BENCHMARK_PARAM(name, param)                                    \
 298   BENCHMARK_IMPL(                                                       \
 299       FB_CONCATENATE(name, FB_CONCATENATE(_, param)),                   \
 300       FB_STRINGIZE(name) "(" FB_STRINGIZE(param) ")",                   \
 301       unsigned,                                                         \
 302       iters) {                                                          \
 303     name(iters, param);                                                 \
 304   }
 305
 306 /**
 307  * Just like BENCHMARK, but prints the time relative to a
 308  * baseline. The baseline is the most recent BENCHMARK() seen in
 309  * lexical order. Example:
 310  *
 311  * // This is the baseline
 312  * BENCHMARK(insertVectorBegin, n) {
 313  *   vector<int> v;
 314  *   FOR_EACH_RANGE (i, 0, n) {
 315  *     v.insert(v.begin(), 42);
 316  *   }
 317  * }
 318  *
 319  * BENCHMARK_RELATIVE(insertListBegin, n) {
 320  *   list<int> s;
 321  *   FOR_EACH_RANGE (i, 0, n) {
 322  *     s.insert(s.begin(), 42);
 323  *   }
 324  * }
 325  *
 326  * Any number of relative benchmark can be associated with a
 327  * baseline. Another BENCHMARK() occurrence effectively establishes a
 328  * new baseline.
 329  */
 330 #define BENCHMARK_RELATIVE(name, ...)                           \
 331   BENCHMARK_IMPL(                                               \
 332     name,                                                       \
 333     "%" FB_STRINGIZE(name),                                     \
 334     FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__),                   \
 335     __VA_ARGS__)
 336
 337 /**
 338  * A combination of BENCHMARK_RELATIVE and BENCHMARK_PARAM.
 339  */
 340 #define BENCHMARK_RELATIVE_PARAM(name, param)                           \
 341   BENCHMARK_IMPL(                                                       \
 342       FB_CONCATENATE(name, FB_CONCATENATE(_, param)),                   \
 343       "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param) ")",               \
 344       unsigned,                                                         \
 345       iters) {                                                          \
 346     name(iters, param);                                                 \
 347   }
 348
 349 /**
 350  * Draws a line of dashes.
 351  */
 352 #define BENCHMARK_DRAW_LINE()                                   \
 353   static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = (   \
 354     ::folly::addBenchmark(__FILE__, "-", []() { }),             \
 355     true);
 356
 357 /**
 358  * Allows execution of code that doesn't count torward the benchmark's
 359  * time budget. Example:
 360  *
 361  * BENCHMARK_START_GROUP(insertVectorBegin, n) {
 362  *   vector<int> v;
 363  *   BENCHMARK_SUSPEND {
 364  *     v.reserve(n);
 365  *   }
 366  *   FOR_EACH_RANGE (i, 0, n) {
 367  *     v.insert(v.begin(), 42);
 368  *   }
 369  * }
 370  */
 371 #define BENCHMARK_SUSPEND                               \
 372   if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) =   \
 373       ::folly::BenchmarkSuspender()) {}                 \
 374   else
 375
 376 #endif // FOLLY_BENCHMARK_H_