X-Git-Url: http://plrg.eecs.uci.edu/git/?p=folly.git;a=blobdiff_plain;f=folly%2FBenchmark.h;h=cf11ca75b20b02ca67da3dfbaa4fdf198d3b14de;hp=19806028a96242a2cd45cf288fdadaabbacb9633;hb=b3e7df8220f410398011fea71b280ba8be459fcc;hpb=22afce906d7e98d95f8c45c3301072d9fd891d41 diff --git a/folly/Benchmark.h b/folly/Benchmark.h index 19806028..cf11ca75 100644 --- a/folly/Benchmark.h +++ b/folly/Benchmark.h @@ -1,5 +1,5 @@ /* - * Copyright 2014 Facebook, Inc. + * Copyright 2017 Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,22 @@ * limitations under the License. */ -#ifndef FOLLY_BENCHMARK_H_ -#define FOLLY_BENCHMARK_H_ +#pragma once + +#include +#include // for FB_ANONYMOUS_VARIABLE +#include +#include +#include -#include "folly/Portability.h" -#include "folly/Preprocessor.h" // for FB_ANONYMOUS_VARIABLE #include -#include -#include +#include #include -#include -#include #include +#include + +#include +#include DECLARE_bool(benchmark); @@ -49,12 +53,8 @@ inline bool runBenchmarksOnFlag() { namespace detail { -/** - * This is the clock ID used for measuring time. On older kernels, the - * resolution of this clock will be very coarse, which will cause the - * benchmarks to fail. - */ -enum Clock { DEFAULT_CLOCK_ID = CLOCK_REALTIME }; +using TimeIterPair = + std::pair; /** * Adds a benchmark wrapped in a std::function. Only used @@ -62,46 +62,7 @@ enum Clock { DEFAULT_CLOCK_ID = CLOCK_REALTIME }; */ void addBenchmarkImpl(const char* file, const char* name, - std::function); - -/** - * Takes the difference between two timespec values. end is assumed to - * occur after start. - */ -inline uint64_t timespecDiff(timespec end, timespec start) { - if (end.tv_sec == start.tv_sec) { - assert(end.tv_nsec >= start.tv_nsec); - return end.tv_nsec - start.tv_nsec; - } - assert(end.tv_sec > start.tv_sec && - end.tv_sec - start.tv_sec < - std::numeric_limits::max() / 1000000000UL); - return (end.tv_sec - start.tv_sec) * 1000000000UL - + end.tv_nsec - start.tv_nsec; -} - -/** - * Takes the difference between two sets of timespec values. The first - * two come from a high-resolution clock whereas the other two come - * from a low-resolution clock. The crux of the matter is that - * high-res values may be bogus as documented in - * http://linux.die.net/man/3/clock_gettime. The trouble is when the - * running process migrates from one CPU to another, which is more - * likely for long-running processes. Therefore we watch for high - * differences between the two timings. - * - * This function is subject to further improvements. - */ -inline uint64_t timespecDiff(timespec end, timespec start, - timespec endCoarse, timespec startCoarse) { - auto fine = timespecDiff(end, start); - auto coarse = timespecDiff(endCoarse, startCoarse); - if (coarse - fine >= 1000000) { - // The fine time is in all likelihood bogus - return coarse; - } - return fine; -} + std::function); } // namespace detail @@ -109,67 +70,75 @@ inline uint64_t timespecDiff(timespec end, timespec start, * Supporting type for BENCHMARK_SUSPEND defined below. 
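 *
 * A minimal usage sketch (illustrative only; buildInput() and process()
 * are hypothetical helpers, not part of this header). Work done while a
 * suspender is alive, e.g. inside BENCHMARK_SUSPEND, is tallied
 * separately and subtracted from the measured run:
 *
 *   BENCHMARK(processInput, n) {
 *     std::vector<int> input;
 *     BENCHMARK_SUSPEND {
 *       input = buildInput();                     // setup, not timed
 *     }
 *     for (unsigned i = 0; i < n; ++i) {
 *       folly::doNotOptimizeAway(process(input));
 *     }
 *   }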
*/ struct BenchmarkSuspender { + using Clock = std::chrono::high_resolution_clock; + using TimePoint = Clock::time_point; + using Duration = Clock::duration; + BenchmarkSuspender() { - CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &start)); + start = Clock::now(); } BenchmarkSuspender(const BenchmarkSuspender &) = delete; - BenchmarkSuspender(BenchmarkSuspender && rhs) { + BenchmarkSuspender(BenchmarkSuspender && rhs) noexcept { start = rhs.start; - rhs.start.tv_nsec = rhs.start.tv_sec = 0; + rhs.start = {}; } BenchmarkSuspender& operator=(const BenchmarkSuspender &) = delete; BenchmarkSuspender& operator=(BenchmarkSuspender && rhs) { - if (start.tv_nsec > 0 || start.tv_sec > 0) { + if (start != TimePoint{}) { tally(); } start = rhs.start; - rhs.start.tv_nsec = rhs.start.tv_sec = 0; + rhs.start = {}; return *this; } ~BenchmarkSuspender() { - if (start.tv_nsec > 0 || start.tv_sec > 0) { + if (start != TimePoint{}) { tally(); } } void dismiss() { - assert(start.tv_nsec > 0 || start.tv_sec > 0); + assert(start != TimePoint{}); tally(); - start.tv_nsec = start.tv_sec = 0; + start = {}; } void rehire() { - assert(start.tv_nsec == 0 || start.tv_sec == 0); - CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &start)); + assert(start == TimePoint{}); + start = Clock::now(); + } + + template + auto dismissing(F f) -> typename std::result_of::type { + SCOPE_EXIT { rehire(); }; + dismiss(); + return f(); } /** - * This helps the macro definition. To get around the dangers of - * operator bool, returns a pointer to member (which allows no - * arithmetic). + * This is for use inside of if-conditions, used in BENCHMARK macros. + * If-conditions bypass the explicit on operator bool. */ - operator int BenchmarkSuspender::*() const { - return nullptr; + explicit operator bool() const { + return false; } /** - * Accumulates nanoseconds spent outside benchmark. + * Accumulates time spent outside benchmark. 
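 *
 * Accounting sketch (this mirrors the measurement lambda in addBenchmark()
 * later in this header): a run is reported as (end - start) - timeSpent,
 * so only intervals during which no suspender is live are charged to the
 * benchmark. The dismissing() helper above temporarily resumes timing;
 * for example (makeData() is a placeholder):
 *
 *   BenchmarkSuspender braces;   // timing suspended from here on
 *   auto data = braces.dismissing([] { return makeData(); });
 *   // suspended again: only the makeData() call above was measured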
*/ - typedef uint64_t NanosecondsSpent; - static NanosecondsSpent nsSpent; + static Duration timeSpent; -private: + private: void tally() { - timespec end; - CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &end)); - nsSpent += detail::timespecDiff(end, start); + auto end = Clock::now(); + timeSpent += end - start; start = end; } - timespec start; + TimePoint start; }; /** @@ -185,24 +154,22 @@ typename std::enable_if< == 2 >::type addBenchmark(const char* file, const char* name, Lambda&& lambda) { - auto execute = [=](unsigned int times) -> uint64_t { - BenchmarkSuspender::nsSpent = 0; - timespec start, end; + auto execute = [=](unsigned int times) { + BenchmarkSuspender::timeSpent = {}; + unsigned int niter; // CORE MEASUREMENT STARTS - auto const r1 = clock_gettime(detail::DEFAULT_CLOCK_ID, &start); - lambda(times); - auto const r2 = clock_gettime(detail::DEFAULT_CLOCK_ID, &end); + auto start = std::chrono::high_resolution_clock::now(); + niter = lambda(times); + auto end = std::chrono::high_resolution_clock::now(); // CORE MEASUREMENT ENDS - CHECK_EQ(0, r1); - CHECK_EQ(0, r2); - - return detail::timespecDiff(end, start) - BenchmarkSuspender::nsSpent; + return detail::TimeIterPair( + (end - start) - BenchmarkSuspender::timeSpent, niter); }; detail::addBenchmarkImpl(file, name, - std::function(execute)); + std::function(execute)); } /** @@ -218,43 +185,135 @@ typename std::enable_if< >::type addBenchmark(const char* file, const char* name, Lambda&& lambda) { addBenchmark(file, name, [=](unsigned int times) { + unsigned int niter = 0; while (times-- > 0) { - lambda(); + niter += lambda(); } + return niter; }); } /** - * Call doNotOptimizeAway(var) against variables that you use for + * Call doNotOptimizeAway(var) to ensure that var will be computed even + * post-optimization. Use it for variables that are computed during * benchmarking but otherwise are useless. The compiler tends to do a - * good job at eliminating unused variables, and this function fools - * it into thinking var is in fact needed. + * good job at eliminating unused variables, and this function fools it + * into thinking var is in fact needed. + * + * Call makeUnpredictable(var) when you don't want the optimizer to use + * its knowledge of var to shape the following code. This is useful + * when constant propagation or power reduction is possible during your + * benchmark but not in real use cases. */ + +#ifdef _MSC_VER + +#pragma optimize("", off) + +inline void doNotOptimizeDependencySink(const void*) {} + +#pragma optimize("", on) + template -void doNotOptimizeAway(T&& datum) { - asm volatile("" : "+r" (datum)); +void doNotOptimizeAway(const T& datum) { + doNotOptimizeDependencySink(&datum); +} + +template +void makeUnpredictable(T& datum) { + doNotOptimizeDependencySink(&datum); +} + +#else + +namespace detail { +template +struct DoNotOptimizeAwayNeedsIndirect { + using Decayed = typename std::decay::type; + + // First two constraints ensure it can be an "r" operand. + // std::is_pointer check is because callers seem to expect that + // doNotOptimizeAway(&x) is equivalent to doNotOptimizeAway(x). 
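  // For illustration (hypothetical snippet; computeChecksum(), render(),
  // runLoop() and buf are placeholders, not part of this header): an int
  // result is trivially copyable, register-sized and not a pointer, so it
  // takes the "r"-constraint overload below, whereas a std::string or a
  // pointer argument takes the indirect overload with the memory clobber.
  //
  //   int checksum = computeChecksum(buf);
  //   folly::doNotOptimizeAway(checksum);    // register path
  //
  //   std::string s = render(buf);
  //   folly::doNotOptimizeAway(s);           // indirect path
  //
  //   size_t n = 1000;
  //   folly::makeUnpredictable(n);           // keep 1000 out of constant
  //   runLoop(n);                            //   propagation / unrolling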
+ constexpr static bool value = !folly::IsTriviallyCopyable::value || + sizeof(Decayed) > sizeof(long) || std::is_pointer::value; +}; +} // detail namespace + +template +auto doNotOptimizeAway(const T& datum) -> typename std::enable_if< + !detail::DoNotOptimizeAwayNeedsIndirect::value>::type { + // The "r" constraint forces the compiler to make datum available + // in a register to the asm block, which means that it must have + // computed/loaded it. We use this path for things that are <= + // sizeof(long) (they have to fit), trivial (otherwise the compiler + // doesn't want to put them in a register), and not a pointer (because + // doNotOptimizeAway(&foo) would otherwise be a foot gun that didn't + // necessarily compute foo). + // + // An earlier version of this method had a more permissive input operand + // constraint, but that caused unnecessary variation between clang and + // gcc benchmarks. + asm volatile("" ::"r"(datum)); +} + +template +auto doNotOptimizeAway(const T& datum) -> typename std::enable_if< + detail::DoNotOptimizeAwayNeedsIndirect::value>::type { + // This version of doNotOptimizeAway tells the compiler that the asm + // block will read datum from memory, and that in addition it might read + // or write from any memory location. If the memory clobber could be + // separated into input and output that would be preferrable. + asm volatile("" ::"m"(datum) : "memory"); +} + +template +auto makeUnpredictable(T& datum) -> typename std::enable_if< + !detail::DoNotOptimizeAwayNeedsIndirect::value>::type { + asm volatile("" : "+r"(datum)); } +template +auto makeUnpredictable(T& datum) -> typename std::enable_if< + detail::DoNotOptimizeAwayNeedsIndirect::value>::type { + asm volatile("" ::"m"(datum) : "memory"); +} + +#endif + } // namespace folly /** * Introduces a benchmark function. Used internally, see BENCHMARK and * friends below. */ -#define BENCHMARK_IMPL(funName, stringName, paramType, paramName) \ +#define BENCHMARK_IMPL(funName, stringName, rv, paramType, paramName) \ static void funName(paramType); \ static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = ( \ ::folly::addBenchmark(__FILE__, stringName, \ - [](paramType paramName) { funName(paramName); }), \ + [](paramType paramName) -> unsigned { funName(paramName); \ + return rv; }), \ true); \ static void funName(paramType paramName) /** - * Introduces a benchmark function. Use with either one one or two - * arguments. The first is the name of the benchmark. Use something - * descriptive, such as insertVectorBegin. The second argument may be - * missing, or could be a symbolic counter. The counter dictates how - * many internal iteration the benchmark does. Example: + * Introduces a benchmark function with support for returning the actual + * number of iterations. Used internally, see BENCHMARK_MULTI and friends + * below. + */ +#define BENCHMARK_MULTI_IMPL(funName, stringName, paramType, paramName) \ + static unsigned funName(paramType); \ + static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = ( \ + ::folly::addBenchmark(__FILE__, stringName, \ + [](paramType paramName) { return funName(paramName); }), \ + true); \ + static unsigned funName(paramType paramName) + +/** + * Introduces a benchmark function. Use with either one or two arguments. + * The first is the name of the benchmark. Use something descriptive, such + * as insertVectorBegin. The second argument may be missing, or could be a + * symbolic counter. The counter dictates how many internal iteration the + * benchmark does. 
Example: * * BENCHMARK(vectorPushBack) { * vector v; @@ -270,6 +329,29 @@ void doNotOptimizeAway(T&& datum) { */ #define BENCHMARK(name, ...) \ BENCHMARK_IMPL( \ + name, \ + FB_STRINGIZE(name), \ + FB_ARG_2_OR_1(1, ## __VA_ARGS__), \ + FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__), \ + __VA_ARGS__) + +/** + * Like BENCHMARK above, but allows the user to return the actual + * number of iterations executed in the function body. This can be + * useful if the benchmark function doesn't know upfront how many + * iterations it's going to run or if it runs through a certain + * number of test cases, e.g.: + * + * BENCHMARK_MULTI(benchmarkSomething) { + * std::vector testCases { 0, 1, 1, 2, 3, 5 }; + * for (int c : testCases) { + * doSomething(c); + * } + * return testCases.size(); + * } + */ +#define BENCHMARK_MULTI(name, ...) \ + BENCHMARK_MULTI_IMPL( \ name, \ FB_STRINGIZE(name), \ FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__), \ @@ -300,6 +382,13 @@ void doNotOptimizeAway(T&& datum) { #define BENCHMARK_PARAM(name, param) \ BENCHMARK_NAMED_PARAM(name, param, param) +/** + * Same as BENCHMARK_PARAM, but allows one to return the actual number of + * iterations that have been run. + */ +#define BENCHMARK_PARAM_MULTI(name, param) \ + BENCHMARK_NAMED_PARAM_MULTI(name, param, param) + /* * Like BENCHMARK_PARAM(), but allows a custom name to be specified for each * parameter, rather than using the parameter value. @@ -327,15 +416,29 @@ void doNotOptimizeAway(T&& datum) { BENCHMARK_IMPL( \ FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \ FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \ + iters, \ unsigned, \ iters) { \ name(iters, ## __VA_ARGS__); \ } +/** + * Same as BENCHMARK_NAMED_PARAM, but allows one to return the actual number + * of iterations that have been run. + */ +#define BENCHMARK_NAMED_PARAM_MULTI(name, param_name, ...) \ + BENCHMARK_MULTI_IMPL( \ + FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \ + FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \ + unsigned, \ + iters) { \ + return name(iters, ## __VA_ARGS__); \ + } + /** * Just like BENCHMARK, but prints the time relative to a * baseline. The baseline is the most recent BENCHMARK() seen in - * lexical order. Example: + * the current scope. Example: * * // This is the baseline * BENCHMARK(insertVectorBegin, n) { @@ -358,6 +461,18 @@ void doNotOptimizeAway(T&& datum) { */ #define BENCHMARK_RELATIVE(name, ...) \ BENCHMARK_IMPL( \ + name, \ + "%" FB_STRINGIZE(name), \ + FB_ARG_2_OR_1(1, ## __VA_ARGS__), \ + FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__), \ + __VA_ARGS__) + +/** + * Same as BENCHMARK_RELATIVE, but allows one to return the actual number + * of iterations that have been run. + */ +#define BENCHMARK_RELATIVE_MULTI(name, ...) \ + BENCHMARK_MULTI_IMPL( \ name, \ "%" FB_STRINGIZE(name), \ FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__), \ @@ -369,6 +484,13 @@ void doNotOptimizeAway(T&& datum) { #define BENCHMARK_RELATIVE_PARAM(name, param) \ BENCHMARK_RELATIVE_NAMED_PARAM(name, param, param) +/** + * Same as BENCHMARK_RELATIVE_PARAM, but allows one to return the actual + * number of iterations that have been run. + */ +#define BENCHMARK_RELATIVE_PARAM_MULTI(name, param) \ + BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param, param) + /** * A combination of BENCHMARK_RELATIVE and BENCHMARK_NAMED_PARAM. 
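 *
 * For example (sketch; initVector() is a placeholder and the sizes are
 * arbitrary), a single parameterized function can serve as both the
 * baseline and the relative measurement:
 *
 *   void insertBack(unsigned iters, size_t initialSize) {
 *     auto v = initVector(initialSize);
 *     while (iters--) {
 *       v.push_back(42);
 *     }
 *   }
 *   BENCHMARK_NAMED_PARAM(insertBack, small, 16)
 *   BENCHMARK_RELATIVE_NAMED_PARAM(insertBack, large, 1 << 20)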
*/ @@ -376,17 +498,31 @@ void doNotOptimizeAway(T&& datum) { BENCHMARK_IMPL( \ FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \ "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \ + iters, \ unsigned, \ iters) { \ name(iters, ## __VA_ARGS__); \ } +/** + * Same as BENCHMARK_RELATIVE_NAMED_PARAM, but allows one to return the + * actual number of iterations that have been run. + */ +#define BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param_name, ...) \ + BENCHMARK_MULTI_IMPL( \ + FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \ + "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \ + unsigned, \ + iters) { \ + return name(iters, ## __VA_ARGS__); \ + } + /** * Draws a line of dashes. */ -#define BENCHMARK_DRAW_LINE() \ - static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = ( \ - ::folly::addBenchmark(__FILE__, "-", []() { }), \ +#define BENCHMARK_DRAW_LINE() \ + static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = ( \ + ::folly::addBenchmark(__FILE__, "-", []() -> unsigned { return 0; }), \ true); /** @@ -407,5 +543,3 @@ void doNotOptimizeAway(T&& datum) { if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) = \ ::folly::BenchmarkSuspender()) {} \ else - -#endif // FOLLY_BENCHMARK_H_
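/*
 * End-to-end sketch of a translation unit built on the macros above. This
 * assumes the conventional folly pattern: runBenchmarks() and
 * runBenchmarksOnFlag() are declared elsewhere in this header, outside the
 * hunks shown.
 *
 *   #include <folly/Benchmark.h>
 *
 *   BENCHMARK(stringConcat, n) {
 *     while (n--) {
 *       folly::doNotOptimizeAway(std::string("hello, ") + "world");
 *     }
 *   }
 *
 *   BENCHMARK_RELATIVE(stringAppend, n) {
 *     while (n--) {
 *       std::string s("hello, ");
 *       s.append("world");
 *       folly::doNotOptimizeAway(s);
 *     }
 *   }
 *
 *   BENCHMARK_DRAW_LINE();
 *
 *   int main(int argc, char** argv) {
 *     gflags::ParseCommandLineFlags(&argc, &argv, true);
 *     folly::runBenchmarks();
 *     return 0;
 *   }
 *
 * Test binaries often gate the call on the --benchmark flag declared at
 * the top of this header (via runBenchmarksOnFlag()) instead of calling
 * runBenchmarks() unconditionally.
 */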