From: Michael Lee Date: Thu, 25 Feb 2016 22:58:16 +0000 (-0800) Subject: Split tests into test and benchmarks. X-Git-Tag: deprecate-dynamic-initializer~36 X-Git-Url: http://plrg.eecs.uci.edu/git/?p=folly.git;a=commitdiff_plain;h=705e7518ea5685ee7812d96dae277787e15d5ca8 Split tests into test and benchmarks. Summary:Split up io/test/IOBufCursorTest test/BitsTest test/CacheLocalityTest test/CallOnceTest test/ConvTest test/LoggingTest test/MemoryIdlerTest test/RandomTest test/ThreadLocalTest Reviewed By: yfeldblum Differential Revision: D2973132 fb-gh-sync-id: 9dadbdf49a31e82c3a2e34c2fdb6a2b47aa0928d shipit-source-id: 9dadbdf49a31e82c3a2e34c2fdb6a2b47aa0928d --- diff --git a/folly/Makefile.am b/folly/Makefile.am index 74f465f7..569879db 100644 --- a/folly/Makefile.am +++ b/folly/Makefile.am @@ -266,6 +266,7 @@ nobase_follyinclude_HEADERS = \ Portability.h \ portability/Constexpr.h \ portability/Environment.h \ + portability/GFlags.h \ portability/Syscall.h \ portability/SysUio.h \ Preprocessor.h \ diff --git a/folly/io/test/IOBufCursorBenchmark.cpp b/folly/io/test/IOBufCursorBenchmark.cpp new file mode 100644 index 00000000..e7142d9c --- /dev/null +++ b/folly/io/test/IOBufCursorBenchmark.cpp @@ -0,0 +1,111 @@ +/* + * Copyright 2016 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +DECLARE_bool(benchmark); + +using folly::ByteRange; +using folly::format; +using folly::IOBuf; +using folly::StringPiece; +using std::unique_ptr; +using namespace folly::io; + +int benchmark_size = 1000; +unique_ptr iobuf_benchmark; + +unique_ptr iobuf_read_benchmark; + +template +void runBenchmark() { + CursClass c(iobuf_benchmark.get()); + + for (int i = 0; i < benchmark_size; i++) { + c.write((uint8_t)0); + } +} + +BENCHMARK(rwPrivateCursorBenchmark, iters) { + while (iters--) { + runBenchmark(); + } +} + +BENCHMARK(rwUnshareCursorBenchmark, iters) { + while (iters--) { + runBenchmark(); + } +} + +BENCHMARK(cursorBenchmark, iters) { + while (iters--) { + Cursor c(iobuf_read_benchmark.get()); + for (int i = 0; i < benchmark_size; i++) { + c.read(); + } + } +} + +BENCHMARK(skipBenchmark, iters) { + while (iters--) { + Cursor c(iobuf_read_benchmark.get()); + for (int i = 0; i < benchmark_size; i++) { + c.peek(); + c.skip(1); + } + } +} + +// fbmake opt +// _bin/folly/experimental/io/test/iobuf_cursor_test -benchmark +// +// Benchmark Iters Total t t/iter iter/sec +// --------------------------------------------------------------------------- +// rwPrivateCursorBenchmark 100000 142.9 ms 1.429 us 683.5 k +// rwUnshareCursorBenchmark 100000 309.3 ms 3.093 us 315.7 k +// cursorBenchmark 100000 741.4 ms 7.414 us 131.7 k +// skipBenchmark 100000 738.9 ms 7.389 us 132.2 k +// +// uname -a: +// +// Linux dev2159.snc6.facebook.com 2.6.33-7_fbk15_104e4d0 #1 SMP +// Tue Oct 19 22:40:30 PDT 2010 x86_64 x86_64 x86_64 GNU/Linux +// +// 72GB RAM, 2 CPUs (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) +// hyperthreading disabled + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + iobuf_benchmark = IOBuf::create(benchmark_size); + iobuf_benchmark->append(benchmark_size); + + iobuf_read_benchmark = IOBuf::create(1); + for (int i = 0; i < benchmark_size; i++) { + unique_ptr iobuf2(IOBuf::create(1)); + iobuf2->append(1); + iobuf_read_benchmark->prependChain(std::move(iobuf2)); + } + + folly::runBenchmarks(); + return 0; +} diff --git a/folly/io/test/IOBufCursorTest.cpp b/folly/io/test/IOBufCursorTest.cpp index 8c65a042..ab8d1f83 100644 --- a/folly/io/test/IOBufCursorTest.cpp +++ b/folly/io/test/IOBufCursorTest.cpp @@ -16,16 +16,12 @@ #include -#include -#include -#include -#include #include #include #include #include -DECLARE_bool(benchmark); +#include using folly::ByteRange; using folly::format; @@ -779,90 +775,3 @@ TEST(IOBuf, StringOperations) { EXPECT_STREQ("hello", curs.readFixedString(5).c_str()); } } - -int benchmark_size = 1000; -unique_ptr iobuf_benchmark; - -unique_ptr iobuf_read_benchmark; - -template -void runBenchmark() { - CursClass c(iobuf_benchmark.get()); - - for(int i = 0; i < benchmark_size; i++) { - c.write((uint8_t)0); - } -} - -BENCHMARK(rwPrivateCursorBenchmark, iters) { - while (iters--) { - runBenchmark(); - } -} - -BENCHMARK(rwUnshareCursorBenchmark, iters) { - while (iters--) { - runBenchmark(); - } -} - - -BENCHMARK(cursorBenchmark, iters) { - while (iters--) { - Cursor c(iobuf_read_benchmark.get()); - for(int i = 0; i < benchmark_size ; i++) { - c.read(); - } - } -} - -BENCHMARK(skipBenchmark, iters) { - while (iters--) { - Cursor c(iobuf_read_benchmark.get()); - for(int i = 0; i < benchmark_size ; i++) { - c.peek(); - c.skip(1); - } - } -} - -// fbmake opt -// _bin/folly/experimental/io/test/iobuf_cursor_test -benchmark -// -// Benchmark Iters Total t t/iter iter/sec -// --------------------------------------------------------------------------- -// rwPrivateCursorBenchmark 100000 142.9 ms 1.429 us 683.5 k -// rwUnshareCursorBenchmark 100000 309.3 ms 3.093 us 315.7 k -// cursorBenchmark 100000 741.4 ms 7.414 us 131.7 k -// skipBenchmark 100000 738.9 ms 7.389 us 132.2 k -// -// uname -a: -// -// Linux dev2159.snc6.facebook.com 2.6.33-7_fbk15_104e4d0 #1 SMP -// Tue Oct 19 22:40:30 PDT 2010 x86_64 x86_64 x86_64 GNU/Linux -// -// 72GB RAM, 2 CPUs (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) -// hyperthreading disabled - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - gflags::ParseCommandLineFlags(&argc, &argv, true); - - auto ret = RUN_ALL_TESTS(); - - if (ret == 0 && FLAGS_benchmark) { - iobuf_benchmark = IOBuf::create(benchmark_size); - iobuf_benchmark->append(benchmark_size); - - iobuf_read_benchmark = IOBuf::create(1); - for (int i = 0; i < benchmark_size; i++) { - unique_ptr iobuf2(IOBuf::create(1)); - iobuf2->append(1); - iobuf_read_benchmark->prependChain(std::move(iobuf2)); - } - - folly::runBenchmarks(); - } - - return ret; -} diff --git a/folly/portability/GFlags.h b/folly/portability/GFlags.h new file mode 100644 index 00000000..976ecccc --- /dev/null +++ b/folly/portability/GFlags.h @@ -0,0 +1,28 @@ +/* + * Copyright 2016 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FOLLY_GFLAGS_H_ +#define FOLLY_GFLAGS_H_ + +#include + +#if FOLLY_HAVE_LIBGFLAGS +#include +#else +#define DEFINE_int32(_name, _default, _description) int FLAGS_##_name = _default +#endif + +#endif diff --git a/folly/test/BitsBenchmark.cpp b/folly/test/BitsBenchmark.cpp new file mode 100644 index 00000000..ac9e4e0d --- /dev/null +++ b/folly/test/BitsBenchmark.cpp @@ -0,0 +1,54 @@ +/* + * Copyright 2016 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// @author Tudor Bosman (tudorb@fb.com) + +#include + +#include + +using namespace folly; + +BENCHMARK(nextPowTwoClz, iters) { + for (unsigned long i = 0; i < iters; ++i) { + auto x = folly::nextPowTwo(iters); + folly::doNotOptimizeAway(x); + } +} + +BENCHMARK_DRAW_LINE(); +BENCHMARK(isPowTwo, iters) { + bool b; + for (unsigned long i = 0; i < iters; ++i) { + b = folly::isPowTwo(i); + folly::doNotOptimizeAway(b); + } +} + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + folly::runBenchmarks(); + return 0; +} + +/* +Benchmarks run on dual Xeon X5650's @ 2.67GHz w/hyperthreading enabled + (12 physical cores, 12 MB cache, 72 GB RAM) + +Benchmark Iters Total t t/iter iter/sec +------------------------------------------------------------------------------ +* nextPowTwoClz 1000000 1.659 ms 1.659 ns 574.8 M +*/ diff --git a/folly/test/BitsTest.cpp b/folly/test/BitsTest.cpp index 10a8a772..b5cf43be 100644 --- a/folly/test/BitsTest.cpp +++ b/folly/test/BitsTest.cpp @@ -16,9 +16,8 @@ // @author Tudor Bosman (tudorb@fb.com) -#include #include -#include + #include using namespace folly; @@ -117,13 +116,6 @@ TEST(Bits, nextPowTwoClz) { testPowTwo(nextPowTwo); } -BENCHMARK(nextPowTwoClz, iters) { - for (unsigned long i = 0; i < iters; ++i) { - auto x = folly::nextPowTwo(iters); - folly::doNotOptimizeAway(x); - } -} - TEST(Bits, isPowTwo) { EXPECT_FALSE(isPowTwo(0u)); EXPECT_TRUE(isPowTwo(1ul)); @@ -146,37 +138,9 @@ TEST(Bits, isPowTwo) { EXPECT_FALSE(isPowTwo((1ull<<63) + 1)); } -BENCHMARK_DRAW_LINE(); -BENCHMARK(isPowTwo, iters) { - bool b; - for (unsigned long i = 0; i < iters; ++i) { - b = folly::isPowTwo(i); - folly::doNotOptimizeAway(b); - } -} - TEST(Bits, popcount) { EXPECT_EQ(0, popcount(0U)); EXPECT_EQ(1, popcount(1U)); EXPECT_EQ(32, popcount(uint32_t(-1))); EXPECT_EQ(64, popcount(uint64_t(-1))); } - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - gflags::ParseCommandLineFlags(&argc, &argv, true); - auto ret = RUN_ALL_TESTS(); - if (!ret && FLAGS_benchmark) { - folly::runBenchmarks(); - } - return ret; -} - -/* -Benchmarks run on dual Xeon X5650's @ 2.67GHz w/hyperthreading enabled - (12 physical cores, 12 MB cache, 72 GB RAM) - -Benchmark Iters Total t t/iter iter/sec ------------------------------------------------------------------------------- -* nextPowTwoClz 1000000 1.659 ms 1.659 ns 574.8 M -*/ diff --git a/folly/test/CacheLocalityBenchmark.cpp b/folly/test/CacheLocalityBenchmark.cpp new file mode 100644 index 00000000..4815cc24 --- /dev/null +++ b/folly/test/CacheLocalityBenchmark.cpp @@ -0,0 +1,292 @@ +/* + * Copyright 2016 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace folly::detail; + +#define DECLARE_SPREADER_TAG(tag, locality, func) \ + namespace { \ + template \ + struct tag {}; \ + } \ + DECLARE_ACCESS_SPREADER_TYPE(tag) \ + namespace folly { \ + namespace detail { \ + template <> \ + const CacheLocality& CacheLocality::system() { \ + static auto* inst = new CacheLocality(locality); \ + return *inst; \ + } \ + template <> \ + Getcpu::Func AccessSpreader::pickGetcpuFunc() { \ + return func; \ + } \ + } \ + } + +DECLARE_SPREADER_TAG( + ThreadLocalTag, + CacheLocality::system<>(), + folly::detail::FallbackGetcpu>::getcpu) +DECLARE_SPREADER_TAG(PthreadSelfTag, + CacheLocality::system<>(), + folly::detail::FallbackGetcpu::getcpu) + +BENCHMARK(AccessSpreaderUse, iters) { + for (unsigned long i = 0; i < iters; ++i) { + auto x = AccessSpreader<>::current(16); + folly::doNotOptimizeAway(x); + } +} + +// Benchmark scores here reflect the time for 32 threads to perform an +// atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly, +// if we don't separate the counters onto unique 128 byte stripes the +// 1_stripe and 2_stripe results are identical, even though the L3 is +// claimed to have 64 byte cache lines. +// +// Getcpu refers to the vdso getcpu implementation. ThreadLocal refers +// to execution using SequentialThreadId, the fallback if the vdso +// getcpu isn't available. PthreadSelf hashes the value returned from +// pthread_self() as a fallback-fallback for systems that don't have +// thread-local support. +// +// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic, +// so since the stripe selection is 12 nanos the atomic increments in +// the L1 is ~17 nanos. At width 8_stripe_0_work the line is expected +// to ping-pong almost every operation, since the loops have the same +// duration. Widths 4 and 2 have the same behavior, but each tour of the +// cache line is 4 and 8 cores long, respectively. These all suggest a +// lower bound of 60 nanos for intra-chip handoff and increment between +// the L1s. +// +// With 420 nanos of busywork per contended increment, the system can +// hide all of the latency of a tour of length 4, but not quite one of +// length 8. I was a bit surprised at how much worse the non-striped +// version got. It seems that the inter-chip traffic also interferes +// with the L1-only localWork.load(). When the local work is doubled +// to about 1 microsecond we see that the inter-chip contention is still +// very important, but subdivisions on the same chip don't matter. +// +// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test +// --benchmark --bm_min_iters=1000000 +// ============================================================================ +// folly/test/CacheLocalityTest.cpp relative time/iter iters/s +// ============================================================================ +// AccessSpreaderUse 11.94ns 83.79M +// ---------------------------------------------------------------------------- +// contentionAtWidthGetcpu(1_stripe_0_work) 985.75ns 1.01M +// contentionAtWidthGetcpu(2_stripe_0_work) 424.02ns 2.36M +// contentionAtWidthGetcpu(4_stripe_0_work) 190.13ns 5.26M +// contentionAtWidthGetcpu(8_stripe_0_work) 91.86ns 10.89M +// contentionAtWidthGetcpu(16_stripe_0_work) 29.31ns 34.12M +// contentionAtWidthGetcpu(32_stripe_0_work) 29.53ns 33.86M +// contentionAtWidthGetcpu(64_stripe_0_work) 29.93ns 33.41M +// contentionAtWidthThreadLocal(2_stripe_0_work) 609.21ns 1.64M +// contentionAtWidthThreadLocal(4_stripe_0_work) 303.60ns 3.29M +// contentionAtWidthThreadLocal(8_stripe_0_work) 246.57ns 4.06M +// contentionAtWidthThreadLocal(16_stripe_0_work) 154.84ns 6.46M +// contentionAtWidthThreadLocal(32_stripe_0_work) 24.14ns 41.43M +// contentionAtWidthThreadLocal(64_stripe_0_work) 23.95ns 41.75M +// contentionAtWidthPthreadSelf(2_stripe_0_work) 722.01ns 1.39M +// contentionAtWidthPthreadSelf(4_stripe_0_work) 501.56ns 1.99M +// contentionAtWidthPthreadSelf(8_stripe_0_work) 474.58ns 2.11M +// contentionAtWidthPthreadSelf(16_stripe_0_work) 300.90ns 3.32M +// contentionAtWidthPthreadSelf(32_stripe_0_work) 175.77ns 5.69M +// contentionAtWidthPthreadSelf(64_stripe_0_work) 174.88ns 5.72M +// atomicIncrBaseline(local_incr_0_work) 16.81ns 59.51M +// ---------------------------------------------------------------------------- +// contentionAtWidthGetcpu(1_stripe_500_work) 1.82us 549.97K +// contentionAtWidthGetcpu(2_stripe_500_work) 533.71ns 1.87M +// contentionAtWidthGetcpu(4_stripe_500_work) 424.64ns 2.35M +// contentionAtWidthGetcpu(8_stripe_500_work) 451.85ns 2.21M +// contentionAtWidthGetcpu(16_stripe_500_work) 425.54ns 2.35M +// contentionAtWidthGetcpu(32_stripe_500_work) 501.66ns 1.99M +// atomicIncrBaseline(local_incr_500_work) 438.46ns 2.28M +// ---------------------------------------------------------------------------- +// contentionAtWidthGetcpu(1_stripe_1000_work) 1.88us 532.20K +// contentionAtWidthGetcpu(2_stripe_1000_work) 824.62ns 1.21M +// contentionAtWidthGetcpu(4_stripe_1000_work) 803.56ns 1.24M +// contentionAtWidthGetcpu(8_stripe_1000_work) 926.65ns 1.08M +// contentionAtWidthGetcpu(16_stripe_1000_work) 900.10ns 1.11M +// contentionAtWidthGetcpu(32_stripe_1000_work) 890.75ns 1.12M +// atomicIncrBaseline(local_incr_1000_work) 774.47ns 1.29M +// ============================================================================ +template