X-Git-Url: http://plrg.eecs.uci.edu/git/?p=folly.git;a=blobdiff_plain;f=folly%2FBenchmark.h;h=dbe12145ef79bd4c5dde046c6efa2bf4fcf1e1fe;hp=3478d10f11cf22ac3ce428290cbc2fa0b1c2c29e;hb=b92bbedce7866bf3760863604e1af1e8e42db24a;hpb=f8147e89c8fc8c1096ccac2587adde4211317730 diff --git a/folly/Benchmark.h b/folly/Benchmark.h index 3478d10f..dbe12145 100644 --- a/folly/Benchmark.h +++ b/folly/Benchmark.h @@ -1,5 +1,5 @@ /* - * Copyright 2015 Facebook, Inc. + * Copyright 2017 Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,21 +14,23 @@ * limitations under the License. */ -#ifndef FOLLY_BENCHMARK_H_ -#define FOLLY_BENCHMARK_H_ +#pragma once -#include #include #include // for FB_ANONYMOUS_VARIABLE +#include +#include +#include + #include -#include -#include +#include #include -#include -#include #include #include +#include +#include + DECLARE_bool(benchmark); namespace folly { @@ -51,14 +53,21 @@ inline bool runBenchmarksOnFlag() { namespace detail { -/** - * This is the clock ID used for measuring time. On older kernels, the - * resolution of this clock will be very coarse, which will cause the - * benchmarks to fail. - */ -enum Clock { DEFAULT_CLOCK_ID = CLOCK_REALTIME }; +using TimeIterPair = + std::pair; +using BenchmarkFun = std::function; -typedef std::pair TimeIterPair; +struct BenchmarkRegistration { + std::string file; + std::string name; + BenchmarkFun func; +}; + +struct BenchmarkResult { + std::string file; + std::string name; + double timeInNs; +}; /** * Adds a benchmark wrapped in a std::function. Only used @@ -68,87 +77,51 @@ void addBenchmarkImpl(const char* file, const char* name, std::function); -/** - * Takes the difference between two timespec values. end is assumed to - * occur after start. - */ -inline uint64_t timespecDiff(timespec end, timespec start) { - if (end.tv_sec == start.tv_sec) { - assert(end.tv_nsec >= start.tv_nsec); - return end.tv_nsec - start.tv_nsec; - } - assert(end.tv_sec > start.tv_sec); - auto diff = uint64_t(end.tv_sec - start.tv_sec); - assert(diff < - std::numeric_limits::max() / 1000000000UL); - return diff * 1000000000UL - + end.tv_nsec - start.tv_nsec; -} - -/** - * Takes the difference between two sets of timespec values. The first - * two come from a high-resolution clock whereas the other two come - * from a low-resolution clock. The crux of the matter is that - * high-res values may be bogus as documented in - * http://linux.die.net/man/3/clock_gettime. The trouble is when the - * running process migrates from one CPU to another, which is more - * likely for long-running processes. Therefore we watch for high - * differences between the two timings. - * - * This function is subject to further improvements. - */ -inline uint64_t timespecDiff(timespec end, timespec start, - timespec endCoarse, timespec startCoarse) { - auto fine = timespecDiff(end, start); - auto coarse = timespecDiff(endCoarse, startCoarse); - if (coarse - fine >= 1000000) { - // The fine time is in all likelihood bogus - return coarse; - } - return fine; -} - } // namespace detail /** * Supporting type for BENCHMARK_SUSPEND defined below. */ struct BenchmarkSuspender { + using Clock = std::chrono::high_resolution_clock; + using TimePoint = Clock::time_point; + using Duration = Clock::duration; + BenchmarkSuspender() { - CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &start)); + start = Clock::now(); } BenchmarkSuspender(const BenchmarkSuspender &) = delete; BenchmarkSuspender(BenchmarkSuspender && rhs) noexcept { start = rhs.start; - rhs.start.tv_nsec = rhs.start.tv_sec = 0; + rhs.start = {}; } BenchmarkSuspender& operator=(const BenchmarkSuspender &) = delete; BenchmarkSuspender& operator=(BenchmarkSuspender && rhs) { - if (start.tv_nsec > 0 || start.tv_sec > 0) { + if (start != TimePoint{}) { tally(); } start = rhs.start; - rhs.start.tv_nsec = rhs.start.tv_sec = 0; + rhs.start = {}; return *this; } ~BenchmarkSuspender() { - if (start.tv_nsec > 0 || start.tv_sec > 0) { + if (start != TimePoint{}) { tally(); } } void dismiss() { - assert(start.tv_nsec > 0 || start.tv_sec > 0); + assert(start != TimePoint{}); tally(); - start.tv_nsec = start.tv_sec = 0; + start = {}; } void rehire() { - assert(start.tv_nsec == 0 || start.tv_sec == 0); - CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &start)); + assert(start == TimePoint{}); + start = Clock::now(); } template @@ -167,20 +140,18 @@ struct BenchmarkSuspender { } /** - * Accumulates nanoseconds spent outside benchmark. + * Accumulates time spent outside benchmark. */ - typedef uint64_t NanosecondsSpent; - static NanosecondsSpent nsSpent; + static Duration timeSpent; -private: + private: void tally() { - timespec end; - CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &end)); - nsSpent += detail::timespecDiff(end, start); + auto end = Clock::now(); + timeSpent += end - start; start = end; } - timespec start; + TimePoint start; }; /** @@ -197,22 +168,17 @@ typename std::enable_if< >::type addBenchmark(const char* file, const char* name, Lambda&& lambda) { auto execute = [=](unsigned int times) { - BenchmarkSuspender::nsSpent = 0; - timespec start, end; + BenchmarkSuspender::timeSpent = {}; unsigned int niter; // CORE MEASUREMENT STARTS - auto const r1 = clock_gettime(detail::DEFAULT_CLOCK_ID, &start); + auto start = std::chrono::high_resolution_clock::now(); niter = lambda(times); - auto const r2 = clock_gettime(detail::DEFAULT_CLOCK_ID, &end); + auto end = std::chrono::high_resolution_clock::now(); // CORE MEASUREMENT ENDS - CHECK_EQ(0, r1); - CHECK_EQ(0, r2); - return detail::TimeIterPair( - detail::timespecDiff(end, start) - BenchmarkSuspender::nsSpent, - niter); + (end - start) - BenchmarkSuspender::timeSpent, niter); }; detail::addBenchmarkImpl(file, name, @@ -241,29 +207,106 @@ addBenchmark(const char* file, const char* name, Lambda&& lambda) { } /** - * Call doNotOptimizeAway(var) against variables that you use for + * Call doNotOptimizeAway(var) to ensure that var will be computed even + * post-optimization. Use it for variables that are computed during * benchmarking but otherwise are useless. The compiler tends to do a - * good job at eliminating unused variables, and this function fools - * it into thinking var is in fact needed. + * good job at eliminating unused variables, and this function fools it + * into thinking var is in fact needed. + * + * Call makeUnpredictable(var) when you don't want the optimizer to use + * its knowledge of var to shape the following code. This is useful + * when constant propagation or power reduction is possible during your + * benchmark but not in real use cases. */ + #ifdef _MSC_VER #pragma optimize("", off) +inline void doNotOptimizeDependencySink(const void*) {} + +#pragma optimize("", on) + template -void doNotOptimizeAway(T&& datum) { - datum = datum; +void doNotOptimizeAway(const T& datum) { + doNotOptimizeDependencySink(&datum); } -#pragma optimize("", on) +template +void makeUnpredictable(T& datum) { + doNotOptimizeDependencySink(&datum); +} #else -template -void doNotOptimizeAway(T&& datum) { - asm volatile("" : "+r" (datum)); + +namespace detail { +template +struct DoNotOptimizeAwayNeedsIndirect { + using Decayed = typename std::decay::type; + + // First two constraints ensure it can be an "r" operand. + // std::is_pointer check is because callers seem to expect that + // doNotOptimizeAway(&x) is equivalent to doNotOptimizeAway(x). + constexpr static bool value = !folly::IsTriviallyCopyable::value || + sizeof(Decayed) > sizeof(long) || std::is_pointer::value; +}; +} // namespace detail + +template +auto doNotOptimizeAway(const T& datum) -> typename std::enable_if< + !detail::DoNotOptimizeAwayNeedsIndirect::value>::type { + // The "r" constraint forces the compiler to make datum available + // in a register to the asm block, which means that it must have + // computed/loaded it. We use this path for things that are <= + // sizeof(long) (they have to fit), trivial (otherwise the compiler + // doesn't want to put them in a register), and not a pointer (because + // doNotOptimizeAway(&foo) would otherwise be a foot gun that didn't + // necessarily compute foo). + // + // An earlier version of this method had a more permissive input operand + // constraint, but that caused unnecessary variation between clang and + // gcc benchmarks. + asm volatile("" ::"r"(datum)); } + +template +auto doNotOptimizeAway(const T& datum) -> typename std::enable_if< + detail::DoNotOptimizeAwayNeedsIndirect::value>::type { + // This version of doNotOptimizeAway tells the compiler that the asm + // block will read datum from memory, and that in addition it might read + // or write from any memory location. If the memory clobber could be + // separated into input and output that would be preferrable. + asm volatile("" ::"m"(datum) : "memory"); +} + +template +auto makeUnpredictable(T& datum) -> typename std::enable_if< + !detail::DoNotOptimizeAwayNeedsIndirect::value>::type { + asm volatile("" : "+r"(datum)); +} + +template +auto makeUnpredictable(T& datum) -> typename std::enable_if< + detail::DoNotOptimizeAwayNeedsIndirect::value>::type { + asm volatile("" ::"m"(datum) : "memory"); +} + #endif +struct dynamic; + +void benchmarkResultsToDynamic( + const std::vector& data, + dynamic&); + +void benchmarkResultsFromDynamic( + const dynamic&, + std::vector&); + +void printResultComparison( + const std::vector& base, + const std::vector& test); + } // namespace folly /** @@ -367,7 +410,7 @@ void doNotOptimizeAway(T&& datum) { BENCHMARK_NAMED_PARAM(name, param, param) /** - * Same as BENCHMARK_PARAM, but allows to return the actual number of + * Same as BENCHMARK_PARAM, but allows one to return the actual number of * iterations that have been run. */ #define BENCHMARK_PARAM_MULTI(name, param) \ @@ -407,7 +450,7 @@ void doNotOptimizeAway(T&& datum) { } /** - * Same as BENCHMARK_NAMED_PARAM, but allows to return the actual number + * Same as BENCHMARK_NAMED_PARAM, but allows one to return the actual number * of iterations that have been run. */ #define BENCHMARK_NAMED_PARAM_MULTI(name, param_name, ...) \ @@ -422,7 +465,7 @@ void doNotOptimizeAway(T&& datum) { /** * Just like BENCHMARK, but prints the time relative to a * baseline. The baseline is the most recent BENCHMARK() seen in - * lexical order. Example: + * the current scope. Example: * * // This is the baseline * BENCHMARK(insertVectorBegin, n) { @@ -452,7 +495,7 @@ void doNotOptimizeAway(T&& datum) { __VA_ARGS__) /** - * Same as BENCHMARK_RELATIVE, but allows to return the actual number + * Same as BENCHMARK_RELATIVE, but allows one to return the actual number * of iterations that have been run. */ #define BENCHMARK_RELATIVE_MULTI(name, ...) \ @@ -469,7 +512,7 @@ void doNotOptimizeAway(T&& datum) { BENCHMARK_RELATIVE_NAMED_PARAM(name, param, param) /** - * Same as BENCHMARK_RELATIVE_PARAM, but allows to return the actual + * Same as BENCHMARK_RELATIVE_PARAM, but allows one to return the actual * number of iterations that have been run. */ #define BENCHMARK_RELATIVE_PARAM_MULTI(name, param) \ @@ -489,7 +532,7 @@ void doNotOptimizeAway(T&& datum) { } /** - * Same as BENCHMARK_RELATIVE_NAMED_PARAM, but allows to return the + * Same as BENCHMARK_RELATIVE_NAMED_PARAM, but allows one to return the * actual number of iterations that have been run. */ #define BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param_name, ...) \ @@ -527,5 +570,3 @@ void doNotOptimizeAway(T&& datum) { if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) = \ ::folly::BenchmarkSuspender()) {} \ else - -#endif // FOLLY_BENCHMARK_H_