From: Michael Lee <mzlee@fb.com>
Date: Thu, 25 Feb 2016 22:58:16 +0000 (-0800)
Subject: Split tests into test and benchmarks.
X-Git-Tag: deprecate-dynamic-initializer~36
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=folly.git;a=commitdiff_plain;h=705e7518ea5685ee7812d96dae277787e15d5ca8

Split tests into test and benchmarks.

Summary: Split up the following into separate test and benchmark files:

  io/test/IOBufCursorTest
  test/BitsTest
  test/CacheLocalityTest
  test/CallOnceTest
  test/ConvTest
  test/LoggingTest
  test/MemoryIdlerTest
  test/RandomTest
  test/ThreadLocalTest

Reviewed By: yfeldblum

Differential Revision: D2973132

fb-gh-sync-id: 9dadbdf49a31e82c3a2e34c2fdb6a2b47aa0928d
shipit-source-id: 9dadbdf49a31e82c3a2e34c2fdb6a2b47aa0928d
---

diff --git a/folly/Makefile.am b/folly/Makefile.am
index 74f465f7..569879db 100644
--- a/folly/Makefile.am
+++ b/folly/Makefile.am
@@ -266,6 +266,7 @@ nobase_follyinclude_HEADERS = \
 	Portability.h \
 	portability/Constexpr.h \
 	portability/Environment.h \
+	portability/GFlags.h \
 	portability/Syscall.h \
 	portability/SysUio.h \
 	Preprocessor.h \
diff --git a/folly/io/test/IOBufCursorBenchmark.cpp b/folly/io/test/IOBufCursorBenchmark.cpp
new file mode 100644
index 00000000..e7142d9c
--- /dev/null
+++ b/folly/io/test/IOBufCursorBenchmark.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/io/IOBuf.h>
+
+#include <folly/Benchmark.h>
+#include <folly/Format.h>
+#include <folly/Range.h>
+#include <folly/io/Cursor.h>
+#include <folly/io/Cursor-defs.h>
+
+DECLARE_bool(benchmark);
+
+using folly::ByteRange;
+using folly::format;
+using folly::IOBuf;
+using folly::StringPiece;
+using std::unique_ptr;
+using namespace folly::io;
+
+int benchmark_size = 1000; // operations per pass; also sizes the buffers
+unique_ptr<IOBuf> iobuf_benchmark; // target of the write benchmarks, set in main
+
+unique_ptr<IOBuf> iobuf_read_benchmark; // chain read by the cursor benchmarks
+
+// Writes benchmark_size zero bytes to iobuf_benchmark via a new CursClass.
+template <class CursClass>
+void runBenchmark() {
+  CursClass writer(iobuf_benchmark.get());
+  for (int written = 0; written < benchmark_size; ++written) {
+    writer.write(static_cast<uint8_t>(0));
+  }
+}
+
+BENCHMARK(rwPrivateCursorBenchmark, iters) {
+  for (; iters > 0; --iters) { // one full write pass per iteration
+    runBenchmark<RWPrivateCursor>();
+  }
+}
+
+BENCHMARK(rwUnshareCursorBenchmark, iters) {
+  for (; iters > 0; --iters) { // one full write pass per iteration
+    runBenchmark<RWUnshareCursor>();
+  }
+}
+
+BENCHMARK(cursorBenchmark, iters) {
+  for (; iters > 0; --iters) { // fresh read cursor for every iteration
+    Cursor reader(iobuf_read_benchmark.get());
+    for (int consumed = 0; consumed < benchmark_size; ++consumed) {
+      reader.read<uint8_t>();
+    }
+  }
+}
+
+BENCHMARK(skipBenchmark, iters) {
+  for (; iters > 0; --iters) { // peek, then advance one byte at a time
+    Cursor reader(iobuf_read_benchmark.get());
+    for (int step = 0; step < benchmark_size; ++step) {
+      reader.peek();
+      reader.skip(1);
+    }
+  }
+}
+
+// fbmake opt
+// _bin/folly/experimental/io/test/iobuf_cursor_test -benchmark
+//
+// Benchmark                               Iters   Total t    t/iter iter/sec
+// ---------------------------------------------------------------------------
+// rwPrivateCursorBenchmark               100000  142.9 ms  1.429 us  683.5 k
+// rwUnshareCursorBenchmark               100000  309.3 ms  3.093 us  315.7 k
+// cursorBenchmark                        100000  741.4 ms  7.414 us  131.7 k
+// skipBenchmark                          100000  738.9 ms  7.389 us  132.2 k
+//
+// uname -a:
+//
+// Linux dev2159.snc6.facebook.com 2.6.33-7_fbk15_104e4d0 #1 SMP
+// Tue Oct 19 22:40:30 PDT 2010 x86_64 x86_64 x86_64 GNU/Linux
+//
+// 72GB RAM, 2 CPUs (Intel(R) Xeon(R) CPU L5630  @ 2.13GHz)
+// hyperthreading disabled
+
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  // write target: created with benchmark_size capacity, all of it appended
+  iobuf_benchmark = IOBuf::create(benchmark_size);
+  iobuf_benchmark->append(benchmark_size);
+  // read target: a chain of benchmark_size links, each appended by one byte
+  iobuf_read_benchmark = IOBuf::create(1);
+  for (int link = 0; link < benchmark_size; ++link) {
+    auto oneByte = IOBuf::create(1);
+    oneByte->append(1);
+    iobuf_read_benchmark->prependChain(std::move(oneByte));
+  }
+  folly::runBenchmarks();
+  return 0;
+}
diff --git a/folly/io/test/IOBufCursorTest.cpp b/folly/io/test/IOBufCursorTest.cpp
index 8c65a042..ab8d1f83 100644
--- a/folly/io/test/IOBufCursorTest.cpp
+++ b/folly/io/test/IOBufCursorTest.cpp
@@ -16,16 +16,12 @@
 
 #include <folly/io/IOBuf.h>
 
-#include <gflags/gflags.h>
-#include <boost/random.hpp>
-#include <gtest/gtest.h>
-#include <folly/Benchmark.h>
 #include <folly/Format.h>
 #include <folly/Range.h>
 #include <folly/io/Cursor.h>
 #include <folly/io/Cursor-defs.h>
 
-DECLARE_bool(benchmark);
+#include <gtest/gtest.h>
 
 using folly::ByteRange;
 using folly::format;
@@ -779,90 +775,3 @@ TEST(IOBuf, StringOperations) {
     EXPECT_STREQ("hello", curs.readFixedString(5).c_str());
   }
 }
-
-int benchmark_size = 1000;
-unique_ptr<IOBuf> iobuf_benchmark;
-
-unique_ptr<IOBuf> iobuf_read_benchmark;
-
-template <class CursClass>
-void runBenchmark() {
-  CursClass c(iobuf_benchmark.get());
-
-  for(int i = 0; i < benchmark_size; i++) {
-    c.write((uint8_t)0);
-  }
-}
-
-BENCHMARK(rwPrivateCursorBenchmark, iters) {
-  while (iters--) {
-    runBenchmark<RWPrivateCursor>();
-  }
-}
-
-BENCHMARK(rwUnshareCursorBenchmark, iters) {
-  while (iters--) {
-    runBenchmark<RWUnshareCursor>();
-  }
-}
-
-
-BENCHMARK(cursorBenchmark, iters) {
-  while (iters--) {
-    Cursor c(iobuf_read_benchmark.get());
-    for(int i = 0; i < benchmark_size ; i++) {
-      c.read<uint8_t>();
-    }
-  }
-}
-
-BENCHMARK(skipBenchmark, iters) {
-  while (iters--) {
-    Cursor c(iobuf_read_benchmark.get());
-    for(int i = 0; i < benchmark_size ; i++) {
-      c.peek();
-      c.skip(1);
-    }
-  }
-}
-
-// fbmake opt
-// _bin/folly/experimental/io/test/iobuf_cursor_test -benchmark
-//
-// Benchmark                               Iters   Total t    t/iter iter/sec
-// ---------------------------------------------------------------------------
-// rwPrivateCursorBenchmark               100000  142.9 ms  1.429 us  683.5 k
-// rwUnshareCursorBenchmark               100000  309.3 ms  3.093 us  315.7 k
-// cursorBenchmark                        100000  741.4 ms  7.414 us  131.7 k
-// skipBenchmark                          100000  738.9 ms  7.389 us  132.2 k
-//
-// uname -a:
-//
-// Linux dev2159.snc6.facebook.com 2.6.33-7_fbk15_104e4d0 #1 SMP
-// Tue Oct 19 22:40:30 PDT 2010 x86_64 x86_64 x86_64 GNU/Linux
-//
-// 72GB RAM, 2 CPUs (Intel(R) Xeon(R) CPU L5630  @ 2.13GHz)
-// hyperthreading disabled
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  auto ret = RUN_ALL_TESTS();
-
-  if (ret == 0 && FLAGS_benchmark) {
-    iobuf_benchmark = IOBuf::create(benchmark_size);
-    iobuf_benchmark->append(benchmark_size);
-
-    iobuf_read_benchmark = IOBuf::create(1);
-    for (int i = 0; i < benchmark_size; i++) {
-      unique_ptr<IOBuf> iobuf2(IOBuf::create(1));
-      iobuf2->append(1);
-      iobuf_read_benchmark->prependChain(std::move(iobuf2));
-    }
-
-    folly::runBenchmarks();
-  }
-
-  return ret;
-}
diff --git a/folly/portability/GFlags.h b/folly/portability/GFlags.h
new file mode 100644
index 00000000..976ecccc
--- /dev/null
+++ b/folly/portability/GFlags.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_GFLAGS_H_
+#define FOLLY_GFLAGS_H_
+// Pulls in gflags when FOLLY_HAVE_LIBGFLAGS is set, else a minimal stand-in.
+#include <folly/Portability.h>
+
+#if FOLLY_HAVE_LIBGFLAGS
+#include <gflags/gflags.h>
+#else // only DEFINE_int32 is stubbed: a plain int FLAGS_<name>, no description
+#define DEFINE_int32(_name, _default, _description) int FLAGS_##_name = _default
+#endif
+
+#endif // FOLLY_GFLAGS_H_
diff --git a/folly/test/BitsBenchmark.cpp b/folly/test/BitsBenchmark.cpp
new file mode 100644
index 00000000..ac9e4e0d
--- /dev/null
+++ b/folly/test/BitsBenchmark.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// @author Tudor Bosman (tudorb@fb.com)
+
+#include <folly/Bits.h>
+
+#include <folly/Benchmark.h>
+
+using namespace folly;
+
+BENCHMARK(nextPowTwoClz, iters) {
+  for (unsigned long left = iters; left > 0; --left) {
+    auto rounded = folly::nextPowTwo(iters); // same input on every pass
+    folly::doNotOptimizeAway(rounded);
+  }
+}
+
+BENCHMARK_DRAW_LINE();
+BENCHMARK(isPowTwo, iters) {
+  for (unsigned long candidate = 0; candidate < iters; ++candidate) {
+    // the loop index itself is the value under test
+    bool verdict = folly::isPowTwo(candidate);
+    folly::doNotOptimizeAway(verdict);
+  }
+}
+
+int main(int argc, char** argv) { // parses flags, then runs the benchmarks
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  folly::runBenchmarks(); // benchmark-only binary: no gtest run here
+  return 0;
+}
+
+/*
+Benchmarks run on dual Xeon X5650's @ 2.67GHz w/hyperthreading enabled
+  (12 physical cores, 12 MB cache, 72 GB RAM)
+
+Benchmark                               Iters   Total t    t/iter iter/sec
+------------------------------------------------------------------------------
+*       nextPowTwoClz                 1000000  1.659 ms  1.659 ns  574.8 M
+*/
diff --git a/folly/test/BitsTest.cpp b/folly/test/BitsTest.cpp
index 10a8a772..b5cf43be 100644
--- a/folly/test/BitsTest.cpp
+++ b/folly/test/BitsTest.cpp
@@ -16,9 +16,8 @@
 
 // @author Tudor Bosman (tudorb@fb.com)
 
-#include <gflags/gflags.h>
 #include <folly/Bits.h>
-#include <folly/Benchmark.h>
+
 #include <gtest/gtest.h>
 
 using namespace folly;
@@ -117,13 +116,6 @@ TEST(Bits, nextPowTwoClz) {
   testPowTwo(nextPowTwo);
 }
 
-BENCHMARK(nextPowTwoClz, iters) {
-  for (unsigned long i = 0; i < iters; ++i) {
-    auto x = folly::nextPowTwo(iters);
-    folly::doNotOptimizeAway(x);
-  }
-}
-
 TEST(Bits, isPowTwo) {
   EXPECT_FALSE(isPowTwo(0u));
   EXPECT_TRUE(isPowTwo(1ul));
@@ -146,37 +138,9 @@ TEST(Bits, isPowTwo) {
   EXPECT_FALSE(isPowTwo((1ull<<63) + 1));
 }
 
-BENCHMARK_DRAW_LINE();
-BENCHMARK(isPowTwo, iters) {
-  bool b;
-  for (unsigned long i = 0; i < iters; ++i) {
-    b = folly::isPowTwo(i);
-    folly::doNotOptimizeAway(b);
-  }
-}
-
 TEST(Bits, popcount) {
   EXPECT_EQ(0, popcount(0U));
   EXPECT_EQ(1, popcount(1U));
   EXPECT_EQ(32, popcount(uint32_t(-1)));
   EXPECT_EQ(64, popcount(uint64_t(-1)));
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  auto ret = RUN_ALL_TESTS();
-  if (!ret && FLAGS_benchmark) {
-    folly::runBenchmarks();
-  }
-  return ret;
-}
-
-/*
-Benchmarks run on dual Xeon X5650's @ 2.67GHz w/hyperthreading enabled
-  (12 physical cores, 12 MB cache, 72 GB RAM)
-
-Benchmark                               Iters   Total t    t/iter iter/sec
-------------------------------------------------------------------------------
-*       nextPowTwoClz                 1000000  1.659 ms  1.659 ns  574.8 M
-*/
diff --git a/folly/test/CacheLocalityBenchmark.cpp b/folly/test/CacheLocalityBenchmark.cpp
new file mode 100644
index 00000000..4815cc24
--- /dev/null
+++ b/folly/test/CacheLocalityBenchmark.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/detail/CacheLocality.h>
+
+#include <sched.h>
+#include <memory>
+#include <thread>
+#include <type_traits>
+#include <unordered_map>
+#include <glog/logging.h>
+#include <folly/Benchmark.h>
+
+using namespace folly::detail;
+// Defines tag type `tag` whose CacheLocality is `locality` and getcpu is `func`.
+#define DECLARE_SPREADER_TAG(tag, locality, func)      \
+  namespace {                                          \
+  template <typename dummy>                            \
+  struct tag {};                                       \
+  }                                                    \
+  DECLARE_ACCESS_SPREADER_TYPE(tag)                    \
+  namespace folly {                                    \
+  namespace detail {                                   \
+  template <>                                          \
+  const CacheLocality& CacheLocality::system<tag>() {  \
+    static auto* inst = new CacheLocality(locality);   \
+    return *inst;                                      \
+  }                                                    \
+  template <>                                          \
+  Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
+    return func;                                       \
+  }                                                    \
+  }                                                    \
+  }
+// Tags that pin AccessSpreader to one FallbackGetcpu flavour each.
+DECLARE_SPREADER_TAG(
+    ThreadLocalTag,
+    CacheLocality::system<>(),
+    folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
+DECLARE_SPREADER_TAG(PthreadSelfTag,
+                     CacheLocality::system<>(),
+                     folly::detail::FallbackGetcpu<HashingThreadId>::getcpu)
+
+BENCHMARK(AccessSpreaderUse, iters) {
+  for (unsigned long left = iters; left > 0; --left) {
+    auto stripe = AccessSpreader<>::current(16); // lookup among 16 stripes
+    folly::doNotOptimizeAway(stripe);
+  }
+}
+
+// Benchmark scores here reflect the time for 32 threads to perform an
+// atomic increment on a dual-socket E5-2660 @ 2.2GHz.  Surprisingly,
+// if we don't separate the counters onto unique 128 byte stripes the
+// 1_stripe and 2_stripe results are identical, even though the L3 is
+// claimed to have 64 byte cache lines.
+//
+// Getcpu refers to the vdso getcpu implementation.  ThreadLocal refers
+// to execution using SequentialThreadId, the fallback if the vdso
+// getcpu isn't available.  PthreadSelf hashes the value returned from
+// pthread_self() as a fallback-fallback for systems that don't have
+// thread-local support.
+//
+// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
+// so since the stripe selection is 12 nanos, an atomic increment in
+// the L1 is ~17 nanos.  At width 8_stripe_0_work the line is expected
+// to ping-pong almost every operation, since the loops have the same
+// duration.  Widths 4 and 2 have the same behavior, but each tour of the
+// cache line is 4 and 8 cores long, respectively.  These all suggest a
+// lower bound of 60 nanos for intra-chip handoff and increment between
+// the L1s.
+//
+// With 420 nanos of busywork per contended increment, the system can
+// hide all of the latency of a tour of length 4, but not quite one of
+// length 8.  I was a bit surprised at how much worse the non-striped
+// version got.  It seems that the inter-chip traffic also interferes
+// with the L1-only localWork.load().  When the local work is doubled
+// to about 1 microsecond we see that the inter-chip contention is still
+// very important, but subdivisions on the same chip don't matter.
+//
+// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
+//     --benchmark --bm_min_iters=1000000
+// ============================================================================
+// folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
+// ============================================================================
+// AccessSpreaderUse                                           11.94ns   83.79M
+// ----------------------------------------------------------------------------
+// contentionAtWidthGetcpu(1_stripe_0_work)                   985.75ns    1.01M
+// contentionAtWidthGetcpu(2_stripe_0_work)                   424.02ns    2.36M
+// contentionAtWidthGetcpu(4_stripe_0_work)                   190.13ns    5.26M
+// contentionAtWidthGetcpu(8_stripe_0_work)                    91.86ns   10.89M
+// contentionAtWidthGetcpu(16_stripe_0_work)                   29.31ns   34.12M
+// contentionAtWidthGetcpu(32_stripe_0_work)                   29.53ns   33.86M
+// contentionAtWidthGetcpu(64_stripe_0_work)                   29.93ns   33.41M
+// contentionAtWidthThreadLocal(2_stripe_0_work)              609.21ns    1.64M
+// contentionAtWidthThreadLocal(4_stripe_0_work)              303.60ns    3.29M
+// contentionAtWidthThreadLocal(8_stripe_0_work)              246.57ns    4.06M
+// contentionAtWidthThreadLocal(16_stripe_0_work)             154.84ns    6.46M
+// contentionAtWidthThreadLocal(32_stripe_0_work)              24.14ns   41.43M
+// contentionAtWidthThreadLocal(64_stripe_0_work)              23.95ns   41.75M
+// contentionAtWidthPthreadSelf(2_stripe_0_work)              722.01ns    1.39M
+// contentionAtWidthPthreadSelf(4_stripe_0_work)              501.56ns    1.99M
+// contentionAtWidthPthreadSelf(8_stripe_0_work)              474.58ns    2.11M
+// contentionAtWidthPthreadSelf(16_stripe_0_work)             300.90ns    3.32M
+// contentionAtWidthPthreadSelf(32_stripe_0_work)             175.77ns    5.69M
+// contentionAtWidthPthreadSelf(64_stripe_0_work)             174.88ns    5.72M
+// atomicIncrBaseline(local_incr_0_work)                       16.81ns   59.51M
+// ----------------------------------------------------------------------------
+// contentionAtWidthGetcpu(1_stripe_500_work)                   1.82us  549.97K
+// contentionAtWidthGetcpu(2_stripe_500_work)                 533.71ns    1.87M
+// contentionAtWidthGetcpu(4_stripe_500_work)                 424.64ns    2.35M
+// contentionAtWidthGetcpu(8_stripe_500_work)                 451.85ns    2.21M
+// contentionAtWidthGetcpu(16_stripe_500_work)                425.54ns    2.35M
+// contentionAtWidthGetcpu(32_stripe_500_work)                501.66ns    1.99M
+// atomicIncrBaseline(local_incr_500_work)                    438.46ns    2.28M
+// ----------------------------------------------------------------------------
+// contentionAtWidthGetcpu(1_stripe_1000_work)                  1.88us  532.20K
+// contentionAtWidthGetcpu(2_stripe_1000_work)                824.62ns    1.21M
+// contentionAtWidthGetcpu(4_stripe_1000_work)                803.56ns    1.24M
+// contentionAtWidthGetcpu(8_stripe_1000_work)                926.65ns    1.08M
+// contentionAtWidthGetcpu(16_stripe_1000_work)               900.10ns    1.11M
+// contentionAtWidthGetcpu(32_stripe_1000_work)               890.75ns    1.12M
+// atomicIncrBaseline(local_incr_1000_work)                   774.47ns    1.29M
+// ============================================================================
+// Runs numThreads (32) workers that each perform `iters` increments on the
+// counter AccessSpreader<Tag> picks among `stripes`, doing `work` atomic loads
+// after each one.  Setup runs under a BenchmarkSuspender dismissed before go.
+template <template <typename> class Tag>
+static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
+  const size_t counterAlignment = 128;
+  const size_t numThreads = 32;
+  folly::BenchmarkSuspender braces;
+  std::atomic<size_t> ready(0);
+  std::atomic<bool> go(false);
+
+  // while in theory the cache line size is 64 bytes, experiments show
+  // that we get contention on 128 byte boundaries for Ivy Bridge.  The
+  // extra indirection adds 1 or 2 nanos
+  assert(counterAlignment >= sizeof(std::atomic<size_t>));
+  std::vector<char> raw(counterAlignment * stripes);
+  // Build the counters once up front: placement-new from every worker was a
+  // data race, and the per-thread pointer array was a VLA (not ISO C++).
+  std::vector<std::atomic<size_t>*> slots(stripes);
+  for (size_t i = 0; i < stripes; ++i) {
+    slots[i] = new (raw.data() + counterAlignment * i) std::atomic<size_t>();
+  }
+
+  // if we happen to be using the tlsRoundRobin, then sequentially
+  // assigning the thread identifiers is the unlikely best-case scenario.
+  // We don't want to unfairly benefit or penalize.  Computing the exact
+  // maximum likelihood of the probability distributions is annoying, so
+  // I approximate as 2/5 of the ids that have no threads, 2/5 that have
+  // 1, 2/15 that have 2, and 1/15 that have 3.  We accomplish this by
+  // wrapping back to slot 0 when we hit 1/15 and 1/5.
+
+  std::vector<std::thread> threads;
+  while (threads.size() < numThreads) {
+    threads.push_back(std::thread([&, iters, stripes, work]() {
+      const std::vector<std::atomic<size_t>*> counters(slots); // private copy
+      ready++;
+      while (!go.load()) {
+        sched_yield();
+      }
+      std::atomic<int> localWork(0);
+      for (size_t i = iters; i > 0; --i) {
+        ++*(counters[AccessSpreader<Tag>::current(stripes)]);
+        for (size_t j = work; j > 0; --j) {
+          localWork.load();
+        }
+      }
+    }));
+    if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
+      // create a few dummy threads to wrap back around to 0 mod numCpus
+      for (size_t i = threads.size(); i != numThreads; ++i) {
+        std::thread([&]() { AccessSpreader<Tag>::current(stripes); }).join();
+      }
+    }
+  }
+
+  while (ready < numThreads) {
+    sched_yield();
+  }
+  braces.dismiss();
+  go = true;
+  for (auto& thr : threads) {
+    thr.join();
+  }
+}
+
+// Uncontended reference point: numThreads threads each bump a counter of
+// their own `iters` times, with `work` atomic loads after every increment.
+static void atomicIncrBaseline(size_t iters,
+                               size_t work,
+                               size_t numThreads = 32) {
+  folly::BenchmarkSuspender braces;
+  std::atomic<bool> go(false);
+  auto worker = [&]() {
+    // spin until the main thread flips `go`
+    while (!go.load()) {
+      sched_yield();
+    }
+    std::atomic<size_t> localCounter(0);
+    std::atomic<int> localWork(0);
+    for (size_t remaining = iters; remaining > 0; --remaining) {
+      localCounter++;
+      for (size_t spin = work; spin > 0; --spin) {
+        localWork.load();
+      }
+    }
+  };
+  std::vector<std::thread> threads;
+  for (size_t n = 0; n < numThreads; ++n) {
+    threads.emplace_back(worker);
+  }
+  braces.dismiss();
+  go = true;
+  for (auto& thr : threads) {
+    thr.join();
+  }
+}
+// "Getcpu" rows: stripe chosen by AccessSpreader<std::atomic>.
+static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
+  contentionAtWidth<std::atomic>(iters, stripes, work);
+}
+// "ThreadLocal" rows: stripe chosen via ThreadLocalTag (SequentialThreadId).
+static void contentionAtWidthThreadLocal(size_t iters,
+                                         size_t stripes,
+                                         size_t work) {
+  contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
+}
+// "PthreadSelf" rows: stripe chosen via PthreadSelfTag (HashingThreadId).
+static void contentionAtWidthPthreadSelf(size_t iters,
+                                         size_t stripes,
+                                         size_t work) {
+  contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
+}
+
+BENCHMARK_DRAW_LINE() // trailing args are (stripes, work); work = 0 here
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
+BENCHMARK_DRAW_LINE() // Getcpu only, work = 500 loads per increment
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
+BENCHMARK_DRAW_LINE() // Getcpu only, work = 1000 loads per increment
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
+
+int main(int argc, char** argv) { // parses flags, then runs the benchmarks
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  folly::runBenchmarks(); // benchmark-only binary: no gtest run here
+  return 0;
+}
diff --git a/folly/test/CacheLocalityTest.cpp b/folly/test/CacheLocalityTest.cpp
index 4a5ee999..40732ad8 100644
--- a/folly/test/CacheLocalityTest.cpp
+++ b/folly/test/CacheLocalityTest.cpp
@@ -23,7 +23,6 @@
 #include <unordered_map>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include <folly/Benchmark.h>
 
 using namespace folly::detail;
 
@@ -425,13 +424,6 @@ TEST(AccessSpreader, Simple) {
   }
 
 DECLARE_SPREADER_TAG(ManualTag, CacheLocality::uniform(16), testingGetcpu)
-DECLARE_SPREADER_TAG(
-    ThreadLocalTag,
-    CacheLocality::system<>(),
-    folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
-DECLARE_SPREADER_TAG(PthreadSelfTag,
-                     CacheLocality::system<>(),
-                     folly::detail::FallbackGetcpu<HashingThreadId>::getcpu)
 
 TEST(AccessSpreader, Wrapping) {
   // this test won't pass unless locality.numCpus divides kMaxCpus
@@ -448,244 +440,3 @@ TEST(AccessSpreader, Wrapping) {
     }
   }
 }
-
-BENCHMARK(AccessSpreaderUse, iters) {
-  for (unsigned long i = 0; i < iters; ++i) {
-    auto x = AccessSpreader<>::current(16);
-    folly::doNotOptimizeAway(x);
-  }
-}
-
-// Benchmark scores here reflect the time for 32 threads to perform an
-// atomic increment on a dual-socket E5-2660 @ 2.2Ghz.  Surprisingly,
-// if we don't separate the counters onto unique 128 byte stripes the
-// 1_stripe and 2_stripe results are identical, even though the L3 is
-// claimed to have 64 byte cache lines.
-//
-// Getcpu refers to the vdso getcpu implementation.  ThreadLocal refers
-// to execution using SequentialThreadId, the fallback if the vdso
-// getcpu isn't available.  PthreadSelf hashes the value returned from
-// pthread_self() as a fallback-fallback for systems that don't have
-// thread-local support.
-//
-// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
-// so since the stripe selection is 12 nanos the atomic increments in
-// the L1 is ~17 nanos.  At width 8_stripe_0_work the line is expected
-// to ping-pong almost every operation, since the loops have the same
-// duration.  Widths 4 and 2 have the same behavior, but each tour of the
-// cache line is 4 and 8 cores long, respectively.  These all suggest a
-// lower bound of 60 nanos for intra-chip handoff and increment between
-// the L1s.
-//
-// With 420 nanos of busywork per contended increment, the system can
-// hide all of the latency of a tour of length 4, but not quite one of
-// length 8.  I was a bit surprised at how much worse the non-striped
-// version got.  It seems that the inter-chip traffic also interferes
-// with the L1-only localWork.load().  When the local work is doubled
-// to about 1 microsecond we see that the inter-chip contention is still
-// very important, but subdivisions on the same chip don't matter.
-//
-// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
-//     --benchmark --bm_min_iters=1000000
-// ============================================================================
-// folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
-// ============================================================================
-// AccessSpreaderUse                                           11.94ns   83.79M
-// ----------------------------------------------------------------------------
-// contentionAtWidthGetcpu(1_stripe_0_work)                   985.75ns    1.01M
-// contentionAtWidthGetcpu(2_stripe_0_work)                   424.02ns    2.36M
-// contentionAtWidthGetcpu(4_stripe_0_work)                   190.13ns    5.26M
-// contentionAtWidthGetcpu(8_stripe_0_work)                    91.86ns   10.89M
-// contentionAtWidthGetcpu(16_stripe_0_work)                   29.31ns   34.12M
-// contentionAtWidthGetcpu(32_stripe_0_work)                   29.53ns   33.86M
-// contentionAtWidthGetcpu(64_stripe_0_work)                   29.93ns   33.41M
-// contentionAtWidthThreadLocal(2_stripe_0_work)              609.21ns    1.64M
-// contentionAtWidthThreadLocal(4_stripe_0_work)              303.60ns    3.29M
-// contentionAtWidthThreadLocal(8_stripe_0_work)              246.57ns    4.06M
-// contentionAtWidthThreadLocal(16_stripe_0_work)             154.84ns    6.46M
-// contentionAtWidthThreadLocal(32_stripe_0_work)              24.14ns   41.43M
-// contentionAtWidthThreadLocal(64_stripe_0_work)              23.95ns   41.75M
-// contentionAtWidthPthreadSelf(2_stripe_0_work)              722.01ns    1.39M
-// contentionAtWidthPthreadSelf(4_stripe_0_work)              501.56ns    1.99M
-// contentionAtWidthPthreadSelf(8_stripe_0_work)              474.58ns    2.11M
-// contentionAtWidthPthreadSelf(16_stripe_0_work)             300.90ns    3.32M
-// contentionAtWidthPthreadSelf(32_stripe_0_work)             175.77ns    5.69M
-// contentionAtWidthPthreadSelf(64_stripe_0_work)             174.88ns    5.72M
-// atomicIncrBaseline(local_incr_0_work)                       16.81ns   59.51M
-// ----------------------------------------------------------------------------
-// contentionAtWidthGetcpu(1_stripe_500_work)                   1.82us  549.97K
-// contentionAtWidthGetcpu(2_stripe_500_work)                 533.71ns    1.87M
-// contentionAtWidthGetcpu(4_stripe_500_work)                 424.64ns    2.35M
-// contentionAtWidthGetcpu(8_stripe_500_work)                 451.85ns    2.21M
-// contentionAtWidthGetcpu(16_stripe_500_work)                425.54ns    2.35M
-// contentionAtWidthGetcpu(32_stripe_500_work)                501.66ns    1.99M
-// atomicIncrBaseline(local_incr_500_work)                    438.46ns    2.28M
-// ----------------------------------------------------------------------------
-// contentionAtWidthGetcpu(1_stripe_1000_work)                  1.88us  532.20K
-// contentionAtWidthGetcpu(2_stripe_1000_work)                824.62ns    1.21M
-// contentionAtWidthGetcpu(4_stripe_1000_work)                803.56ns    1.24M
-// contentionAtWidthGetcpu(8_stripe_1000_work)                926.65ns    1.08M
-// contentionAtWidthGetcpu(16_stripe_1000_work)               900.10ns    1.11M
-// contentionAtWidthGetcpu(32_stripe_1000_work)               890.75ns    1.12M
-// atomicIncrBaseline(local_incr_1000_work)                   774.47ns    1.29M
-// ============================================================================
-template <template <typename> class Tag>
-static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
-  const size_t counterAlignment = 128;
-  const size_t numThreads = 32;
-
-  folly::BenchmarkSuspender braces;
-
-  std::atomic<size_t> ready(0);
-  std::atomic<bool> go(false);
-
-  // while in theory the cache line size is 64 bytes, experiments show
-  // that we get contention on 128 byte boundaries for Ivy Bridge.  The
-  // extra indirection adds 1 or 2 nanos
-  assert(counterAlignment >= sizeof(std::atomic<size_t>));
-  std::vector<char> raw(counterAlignment * stripes);
-
-  // if we happen to be using the tlsRoundRobin, then sequentially
-  // assigning the thread identifiers is the unlikely best-case scenario.
-  // We don't want to unfairly benefit or penalize.  Computing the exact
-  // maximum likelihood of the probability distributions is annoying, so
-  // I approximate as 2/5 of the ids that have no threads, 2/5 that have
-  // 1, 2/15 that have 2, and 1/15 that have 3.  We accomplish this by
-  // wrapping back to slot 0 when we hit 1/15 and 1/5.
-
-  std::vector<std::thread> threads;
-  while (threads.size() < numThreads) {
-    threads.push_back(std::thread([&, iters, stripes, work]() {
-      std::atomic<size_t>* counters[stripes];
-      for (size_t i = 0; i < stripes; ++i) {
-        counters[i] =
-            new (raw.data() + counterAlignment * i) std::atomic<size_t>();
-      }
-
-      ready++;
-      while (!go.load()) {
-        sched_yield();
-      }
-      std::atomic<int> localWork(0);
-      for (size_t i = iters; i > 0; --i) {
-        ++*(counters[AccessSpreader<Tag>::current(stripes)]);
-        for (size_t j = work; j > 0; --j) {
-          localWork.load();
-        }
-      }
-    }));
-
-    if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
-      // create a few dummy threads to wrap back around to 0 mod numCpus
-      for (size_t i = threads.size(); i != numThreads; ++i) {
-        std::thread([&]() { AccessSpreader<Tag>::current(stripes); }).join();
-      }
-    }
-  }
-
-  while (ready < numThreads) {
-    sched_yield();
-  }
-  braces.dismiss();
-  go = true;
-
-  for (auto& thr : threads) {
-    thr.join();
-  }
-}
-
-static void atomicIncrBaseline(size_t iters,
-                               size_t work,
-                               size_t numThreads = 32) {
-  folly::BenchmarkSuspender braces;
-
-  std::atomic<bool> go(false);
-
-  std::vector<std::thread> threads;
-  while (threads.size() < numThreads) {
-    threads.push_back(std::thread([&]() {
-      while (!go.load()) {
-        sched_yield();
-      }
-      std::atomic<size_t> localCounter(0);
-      std::atomic<int> localWork(0);
-      for (size_t i = iters; i > 0; --i) {
-        localCounter++;
-        for (size_t j = work; j > 0; --j) {
-          localWork.load();
-        }
-      }
-    }));
-  }
-
-  braces.dismiss();
-  go = true;
-
-  for (auto& thr : threads) {
-    thr.join();
-  }
-}
-
-static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
-  contentionAtWidth<std::atomic>(iters, stripes, work);
-}
-
-static void contentionAtWidthThreadLocal(size_t iters,
-                                         size_t stripes,
-                                         size_t work) {
-  contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
-}
-
-static void contentionAtWidthPthreadSelf(size_t iters,
-                                         size_t stripes,
-                                         size_t work) {
-  contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
-}
-
-BENCHMARK_DRAW_LINE()
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
-BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
-BENCHMARK_DRAW_LINE()
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
-BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
-BENCHMARK_DRAW_LINE()
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
-BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  auto ret = RUN_ALL_TESTS();
-  if (!ret && FLAGS_benchmark) {
-    folly::runBenchmarks();
-  }
-  return ret;
-}
diff --git a/folly/test/CallOnceBenchmark.cpp b/folly/test/CallOnceBenchmark.cpp
new file mode 100644
index 00000000..4f8fdde5
--- /dev/null
+++ b/folly/test/CallOnceBenchmark.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/CallOnce.h>
+
+#include <deque>
+#include <mutex>
+#include <thread>
+
+#include <folly/Benchmark.h>
+
+#include <glog/logging.h>
+
+DEFINE_int32(threads, 16, "benchmark concurrency");
+
+// Invokes fn() iters times on each of FLAGS_threads threads, then joins them.
+template <typename CallOnceFunc>
+void bm_impl(CallOnceFunc&& fn, int64_t iters) {
+  std::deque<std::thread> workers;
+  auto const hammer = [&fn, iters] {
+    for (auto remaining = iters; remaining > 0; --remaining) {
+      fn();
+    }
+  };
+  for (auto launched = 0; launched < FLAGS_threads; ++launched) {
+    workers.emplace_back(hammer);
+  }
+  for (auto& worker : workers) { worker.join(); }
+}
+
+BENCHMARK(StdCallOnceBench, iters) {
+  int out = 0;
+  std::once_flag flag;
+  bm_impl([&flag, &out] { std::call_once(flag, [&out] { ++out; }); }, iters);
+  CHECK_EQ(1, out); // the once-body must have run exactly one time
+}
+
+BENCHMARK(FollyCallOnceBench, iters) {
+  int out = 0;
+  folly::once_flag flag;
+  bm_impl([&flag, &out] { folly::call_once(flag, [&out] { ++out; }); }, iters);
+  CHECK_EQ(1, out); // the once-body must have run exactly one time
+}
+
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true); // handle flags first
+  folly::runBenchmarks(); // invoked unconditionally (no --benchmark check)
+  return 0;
+}
diff --git a/folly/test/CallOnceTest.cpp b/folly/test/CallOnceTest.cpp
index f1b1d719..4a4b291d 100644
--- a/folly/test/CallOnceTest.cpp
+++ b/folly/test/CallOnceTest.cpp
@@ -18,9 +18,9 @@
 #include <mutex>
 #include <thread>
 
-#include <folly/Benchmark.h>
 #include <folly/CallOnce.h>
-#include <gflags/gflags.h>
+#include <folly/portability/GFlags.h>
+
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
@@ -41,20 +41,6 @@ void bm_impl(CallOnceFunc&& fn, int64_t iters) {
   }
 }
 
-BENCHMARK(StdCallOnceBench, iters) {
-  std::once_flag flag;
-  int out = 0;
-  bm_impl([&] { std::call_once(flag, [&] { ++out; }); }, iters);
-  ASSERT_EQ(1, out);
-}
-
-BENCHMARK(FollyCallOnceBench, iters) {
-  folly::once_flag flag;
-  int out = 0;
-  bm_impl([&] { folly::call_once(flag, [&] { ++out; }); }, iters);
-  ASSERT_EQ(1, out);
-}
-
 TEST(FollyCallOnce, Simple) {
   folly::once_flag flag;
   auto fn = [&](int* outp) { ++*outp; };
@@ -72,14 +58,3 @@ TEST(FollyCallOnce, Stress) {
     ASSERT_EQ(1, out);
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  if (FLAGS_benchmark) {
-    folly::runBenchmarksOnFlag();
-    return 0;
-  } else {
-    return RUN_ALL_TESTS();
-  }
-}
diff --git a/folly/test/ConvBenchmark.cpp b/folly/test/ConvBenchmark.cpp
new file mode 100644
index 00000000..4a89fa3b
--- /dev/null
+++ b/folly/test/ConvBenchmark.cpp
@@ -0,0 +1,389 @@
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/Conv.h>
+
+#include <boost/lexical_cast.hpp>
+
+#include <folly/Benchmark.h>
+#include <folly/Foreach.h>
+
+#include <limits>
+#include <stdexcept>
+
+using namespace std;
+using namespace folly;
+
+////////////////////////////////////////////////////////////////////////////////
+// Benchmarks for ASCII to int conversion
+////////////////////////////////////////////////////////////////////////////////
+// @author: Rajat Goel (rajat)
+
+// Hand-rolled base-10 parser over [start, end), used as a benchmark baseline.
+// Accepts optional leading whitespace and one optional sign before the digits.
+// Throws std::runtime_error for empty input, overflow, or trailing characters.
+static int64_t handwrittenAtoi(const char* start, const char* end) {
+  constexpr int64_t kMax = std::numeric_limits<int64_t>::max();
+  bool positive = true;
+  int64_t retVal = 0;
+
+  if (start == end) {
+    throw std::runtime_error("empty string");
+  }
+
+  // Cast first: handing isspace() a negative char is undefined behavior.
+  while (start < end && isspace(static_cast<unsigned char>(*start))) {
+    ++start;
+  }
+
+  // Bounds-check before reading the sign; whitespace may have consumed it all.
+  if (start < end && (*start == '-' || *start == '+')) {
+    positive = (*start++ != '-');
+  }
+
+  while (start < end && *start >= '0' && *start <= '9') {
+    int const digit = *start++ - '0';
+    // Check before multiplying: signed overflow is undefined behavior.
+    if (retVal > kMax / 10 || (retVal == kMax / 10 && digit > kMax % 10)) {
+      throw std::runtime_error("overflow");
+    }
+    retVal = retVal * 10 + digit;
+  }
+
+  if (start != end) {
+    throw std::runtime_error("extra chars at the end");
+  }
+  return positive ? retVal : -retVal;
+}
+
+static StringPiece pc1 = "1234567890123456789";
+
+void handwrittenAtoiMeasure(unsigned int n, unsigned int digits) {
+  auto const tail = pc1.subpiece(pc1.size() - digits, digits); // suffix of pc1
+  for (; n > 0; --n) {
+    doNotOptimizeAway(handwrittenAtoi(tail.begin(), tail.end()));
+  }
+}
+
+void follyAtoiMeasure(unsigned int n, unsigned int digits) {
+  auto const tail = pc1.subpiece(pc1.size() - digits, digits); // suffix of pc1
+  for (; n > 0; --n) {
+    doNotOptimizeAway(folly::to<int64_t>(tail.begin(), tail.end()));
+  }
+}
+
+void clibAtoiMeasure(unsigned int n, unsigned int digits) {
+  static_assert(sizeof(long) == 8, "64-bit long assumed");
+  auto const tail = pc1.subpiece(pc1.size() - digits, digits);
+  assert(*tail.end() == 0); // atol() is given no length, only a pointer
+  for (; n > 0; --n) { doNotOptimizeAway(atol(tail.begin())); }
+}
+
+// Baseline: strtoul() on the last `digits` characters of pc1, n times.
+void clibStrtoulMeasure(unsigned int n, unsigned int digits) {
+  auto const tail = pc1.subpiece(pc1.size() - digits, digits);
+  assert(*tail.end() == 0);
+  for (char* stop = nullptr; n > 0; --n) {
+    doNotOptimizeAway(strtoul(tail.begin(), &stop, 10));
+  }
+}
+
+void lexicalCastMeasure(unsigned int n, unsigned int digits) {
+  auto const tail = pc1.subpiece(pc1.size() - digits, digits);
+  assert(*tail.end() == 0); // the callee only receives a bare const char*
+  for (; n > 0; --n) {
+    doNotOptimizeAway(boost::lexical_cast<uint64_t>(tail.begin()));
+  }
+}
+
+// Benchmarks for unsigned to string conversion, raw
+
+// Writes the decimal digits of value to dst two at a time, using a table of
+// the pairs "00".."99". No terminator is written; returns digits10(value).
+unsigned u64ToAsciiTable(uint64_t value, char* dst) {
+  static const char kPairs[201] =
+      "00010203040506070809"
+      "10111213141516171819"
+      "20212223242526272829"
+      "30313233343536373839"
+      "40414243444546474849"
+      "50515253545556575859"
+      "60616263646566676869"
+      "70717273747576777879"
+      "80818283848586878889"
+      "90919293949596979899";
+
+  uint32_t const numDigits = digits10(value);
+  char* out = dst + numDigits; // one past the last digit; filled backwards
+  for (; value >= 100; value /= 100) {
+    char const* const pair = kPairs + (value % 100) * 2;
+    *--out = pair[1];
+    *--out = pair[0];
+  }
+  // At most two digits remain.
+  if (value >= 10) {
+    char const* const pair = kPairs + uint32_t(value) * 2;
+    *--out = pair[1];
+    *--out = pair[0];
+  } else {
+    *--out = '0' + uint32_t(value);
+  }
+  return numDigits;
+}
+
+void u64ToAsciiTableBM(unsigned int n, uint64_t value) {
+  // Runs one conversion of (value + n) per iteration, n iterations in total.
+  char buf[20];
+  FOR_EACH_RANGE(i, 0, n) {
+    doNotOptimizeAway(u64ToAsciiTable(value + n, buf));
+  }
+}
+
+// Writes the decimal digits of value to dst, most significant digit first.
+// No terminator is written; the return value is the number of digits.
+unsigned u64ToAsciiClassic(uint64_t value, char* dst) {
+  // Emit the digits least-significant first...
+  unsigned count = 0;
+  do {
+    dst[count++] = '0' + (value % 10);
+    value /= 10;
+  } while (value != 0);
+
+  // ...then swap from both ends inward to put them in reading order.
+  // At least one digit was written, so count - 1 cannot wrap.
+  unsigned lo = 0;
+  unsigned hi = count - 1;
+  while (lo < hi) {
+    char const tmp = dst[hi];
+    dst[hi--] = dst[lo];
+    dst[lo++] = tmp;
+  }
+  return count;
+}
+
+void u64ToAsciiClassicBM(unsigned int n, uint64_t value) {
+  // Runs one conversion of (value + n) per iteration, n iterations in total.
+  char buf[20];
+  FOR_EACH_RANGE(i, 0, n) {
+    doNotOptimizeAway(u64ToAsciiClassic(value + n, buf));
+  }
+}
+
+void u64ToAsciiFollyBM(unsigned int n, uint64_t value) {
+  // Runs one conversion of (value + n) per iteration, n iterations in total.
+  char buf[20];
+  FOR_EACH_RANGE(i, 0, n) {
+    doNotOptimizeAway(uint64ToBufferUnsafe(value + n, buf));
+  }
+}
+
+// Benchmark unsigned to string conversion
+
+void u64ToStringClibMeasure(unsigned int n, uint64_t value) {
+  // FOLLY_RANGE_CHECK_TO_STRING expands to std::to_string, except on Android
+  // where std::to_string is not supported
+  FOR_EACH_RANGE(i, 0, n) { FOLLY_RANGE_CHECK_TO_STRING(value + n); }
+}
+
+void u64ToStringFollyMeasure(unsigned int n, uint64_t value) {
+  FOR_EACH_RANGE(i, 0, n) { to<std::string>(value + n); }
+}
+
+// Benchmark uitoa with string append
+
+void u2aAppendClassicBM(unsigned int n, uint64_t value) {
+  string s;
+  FOR_EACH_RANGE(i, 0, n) {
+    // Convert into a stack buffer, then append the digits; s keeps growing.
+    char buffer[20];
+    s.append(buffer, u64ToAsciiClassic(value, buffer));
+    doNotOptimizeAway(s.size());
+  }
+}
+
+void u2aAppendFollyBM(unsigned int n, uint64_t value) {
+  string s;
+  FOR_EACH_RANGE(i, 0, n) {
+    // Convert into a stack buffer, then append the digits; s keeps growing.
+    char buffer[20];
+    s.append(buffer, uint64ToBufferUnsafe(value, buffer));
+    doNotOptimizeAway(s.size());
+  }
+}
+
+template <class String>
+struct StringIdenticalToBM { // functor: times to<String>(s) with s a String
+  StringIdenticalToBM() {} // user-provided so const instances need no initializer
+  void operator()(unsigned int n, size_t len) const {
+    String s;
+    BENCHMARK_SUSPEND { s.append(len, '0'); } // setup: s becomes len '0' chars
+    FOR_EACH_RANGE(i, 0, n) {
+      String result = to<String>(s);
+      doNotOptimizeAway(result.size());
+    }
+  }
+};
+
+template <class String>
+struct StringVariadicToBM { // functor: times the two-argument to<String>(s, nullptr)
+  StringVariadicToBM() {} // user-provided so const instances need no initializer
+  void operator()(unsigned int n, size_t len) const {
+    String s;
+    BENCHMARK_SUSPEND { s.append(len, '0'); } // setup: s becomes len '0' chars
+    FOR_EACH_RANGE(i, 0, n) {
+      String result = to<String>(s, nullptr); // NOTE(review): nullptr presumably only forces the variadic overload — confirm
+      doNotOptimizeAway(result.size());
+    }
+  }
+};
+
+static size_t bigInt = 11424545345345;
+static size_t smallInt = 104;
+static char someString[] = "this is some nice string";
+static char otherString[] = "this is a long string, so it's not so nice";
+static char reallyShort[] = "meh";
+static std::string stdString = "std::strings are very nice";
+static float fValue = 1.2355;
+static double dValue = 345345345.435;
+
+BENCHMARK(preallocateTestNoFloat, n) { // NOTE(review): val4 formats dValue (a double) despite the name — confirm intent
+  for (size_t i = 0; i < n; ++i) { // the resulting strings are never read
+    auto val1 = to<std::string>(bigInt, someString, stdString, otherString);
+    auto val3 = to<std::string>(reallyShort, smallInt);
+    auto val2 = to<std::string>(bigInt, stdString);
+    auto val4 = to<std::string>(bigInt, stdString, dValue, otherString);
+    auto val5 = to<std::string>(bigInt, someString, reallyShort);
+  }
+}
+
+BENCHMARK(preallocateTestFloat, n) { // every call formats fValue and/or dValue
+  for (size_t i = 0; i < n; ++i) { // the resulting strings are never read
+    auto val1 = to<std::string>(stdString, ',', fValue, dValue);
+    auto val2 = to<std::string>(stdString, ',', dValue);
+  }
+}
+BENCHMARK_DRAW_LINE();
+
+static const StringIdenticalToBM<std::string> stringIdenticalToBM;
+static const StringVariadicToBM<std::string> stringVariadicToBM;
+static const StringIdenticalToBM<fbstring> fbstringIdenticalToBM;
+static const StringVariadicToBM<fbstring> fbstringVariadicToBM;
+
+#define DEFINE_BENCHMARK_GROUP(n)                 \
+  BENCHMARK_PARAM(u64ToAsciiClassicBM, n);        \
+  BENCHMARK_RELATIVE_PARAM(u64ToAsciiTableBM, n); \
+  BENCHMARK_RELATIVE_PARAM(u64ToAsciiFollyBM, n); \
+  BENCHMARK_DRAW_LINE();
+
+DEFINE_BENCHMARK_GROUP(1);
+DEFINE_BENCHMARK_GROUP(12);
+DEFINE_BENCHMARK_GROUP(123);
+DEFINE_BENCHMARK_GROUP(1234);
+DEFINE_BENCHMARK_GROUP(12345);
+DEFINE_BENCHMARK_GROUP(123456);
+DEFINE_BENCHMARK_GROUP(1234567);
+DEFINE_BENCHMARK_GROUP(12345678);
+DEFINE_BENCHMARK_GROUP(123456789);
+DEFINE_BENCHMARK_GROUP(1234567890);
+DEFINE_BENCHMARK_GROUP(12345678901);
+DEFINE_BENCHMARK_GROUP(123456789012);
+DEFINE_BENCHMARK_GROUP(1234567890123);
+DEFINE_BENCHMARK_GROUP(12345678901234);
+DEFINE_BENCHMARK_GROUP(123456789012345);
+DEFINE_BENCHMARK_GROUP(1234567890123456);
+DEFINE_BENCHMARK_GROUP(12345678901234567);
+DEFINE_BENCHMARK_GROUP(123456789012345678);
+DEFINE_BENCHMARK_GROUP(1234567890123456789);
+DEFINE_BENCHMARK_GROUP(12345678901234567890U);
+
+#undef DEFINE_BENCHMARK_GROUP
+
+#define DEFINE_BENCHMARK_GROUP(n)                       \
+  BENCHMARK_PARAM(u64ToStringClibMeasure, n);           \
+  BENCHMARK_RELATIVE_PARAM(u64ToStringFollyMeasure, n); \
+  BENCHMARK_DRAW_LINE();
+
+DEFINE_BENCHMARK_GROUP(1);
+DEFINE_BENCHMARK_GROUP(12);
+DEFINE_BENCHMARK_GROUP(123);
+DEFINE_BENCHMARK_GROUP(1234);
+DEFINE_BENCHMARK_GROUP(12345);
+DEFINE_BENCHMARK_GROUP(123456);
+DEFINE_BENCHMARK_GROUP(1234567);
+DEFINE_BENCHMARK_GROUP(12345678);
+DEFINE_BENCHMARK_GROUP(123456789);
+DEFINE_BENCHMARK_GROUP(1234567890);
+DEFINE_BENCHMARK_GROUP(12345678901);
+DEFINE_BENCHMARK_GROUP(123456789012);
+DEFINE_BENCHMARK_GROUP(1234567890123);
+DEFINE_BENCHMARK_GROUP(12345678901234);
+DEFINE_BENCHMARK_GROUP(123456789012345);
+DEFINE_BENCHMARK_GROUP(1234567890123456);
+DEFINE_BENCHMARK_GROUP(12345678901234567);
+DEFINE_BENCHMARK_GROUP(123456789012345678);
+DEFINE_BENCHMARK_GROUP(1234567890123456789);
+DEFINE_BENCHMARK_GROUP(12345678901234567890U);
+
+#undef DEFINE_BENCHMARK_GROUP
+
+#define DEFINE_BENCHMARK_GROUP(n)                      \
+  BENCHMARK_PARAM(clibAtoiMeasure, n);                 \
+  BENCHMARK_RELATIVE_PARAM(lexicalCastMeasure, n);     \
+  BENCHMARK_RELATIVE_PARAM(handwrittenAtoiMeasure, n); \
+  BENCHMARK_RELATIVE_PARAM(follyAtoiMeasure, n);       \
+  BENCHMARK_DRAW_LINE();
+
+DEFINE_BENCHMARK_GROUP(1);
+DEFINE_BENCHMARK_GROUP(2);
+DEFINE_BENCHMARK_GROUP(3);
+DEFINE_BENCHMARK_GROUP(4);
+DEFINE_BENCHMARK_GROUP(5);
+DEFINE_BENCHMARK_GROUP(6);
+DEFINE_BENCHMARK_GROUP(7);
+DEFINE_BENCHMARK_GROUP(8);
+DEFINE_BENCHMARK_GROUP(9);
+DEFINE_BENCHMARK_GROUP(10);
+DEFINE_BENCHMARK_GROUP(11);
+DEFINE_BENCHMARK_GROUP(12);
+DEFINE_BENCHMARK_GROUP(13);
+DEFINE_BENCHMARK_GROUP(14);
+DEFINE_BENCHMARK_GROUP(15);
+DEFINE_BENCHMARK_GROUP(16);
+DEFINE_BENCHMARK_GROUP(17);
+DEFINE_BENCHMARK_GROUP(18);
+DEFINE_BENCHMARK_GROUP(19);
+
+#undef DEFINE_BENCHMARK_GROUP
+
+#define DEFINE_BENCHMARK_GROUP(T, n)             \
+  BENCHMARK_PARAM(T##VariadicToBM, n);           \
+  BENCHMARK_RELATIVE_PARAM(T##IdenticalToBM, n); \
+  BENCHMARK_DRAW_LINE();
+
+DEFINE_BENCHMARK_GROUP(string, 32);
+DEFINE_BENCHMARK_GROUP(string, 1024);
+DEFINE_BENCHMARK_GROUP(string, 32768);
+DEFINE_BENCHMARK_GROUP(fbstring, 32);
+DEFINE_BENCHMARK_GROUP(fbstring, 1024);
+DEFINE_BENCHMARK_GROUP(fbstring, 32768);
+
+#undef DEFINE_BENCHMARK_GROUP
+
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true); // handle flags first
+  folly::runBenchmarks(); // invoked unconditionally (no --benchmark check)
+  return 0;
+}
diff --git a/folly/test/ConvTest.cpp b/folly/test/ConvTest.cpp
index f8e7d807..68a40d52 100644
--- a/folly/test/ConvTest.cpp
+++ b/folly/test/ConvTest.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <folly/Benchmark.h>
 #include <folly/Conv.h>
 #include <folly/Foreach.h>
 #include <boost/lexical_cast.hpp>
@@ -26,13 +25,6 @@ using namespace std;
 using namespace folly;
 
 
-
-TEST(Conv, digits10Minimal) {
-  // Not much of a test (and it's included in the test below anyway).
-  // I just want to inspect the generated assembly for this function.
-  folly::doNotOptimizeAway(digits10(random() * random()));
-}
-
 TEST(Conv, digits10) {
   char buffer[100];
   uint64_t power;
@@ -856,373 +848,3 @@ TEST(Conv, allocate_size) {
   toAppendDelimFit(",", str1, str2, &res3);
   EXPECT_EQ(res3, str1 + "," + str2);
 }
-
-////////////////////////////////////////////////////////////////////////////////
-// Benchmarks for ASCII to int conversion
-////////////////////////////////////////////////////////////////////////////////
-// @author: Rajat Goel (rajat)
-
-static int64_t handwrittenAtoi(const char* start, const char* end) {
-
-  bool positive = true;
-  int64_t retVal = 0;
-
-  if (start == end) {
-    throw std::runtime_error("empty string");
-  }
-
-  while (start < end && isspace(*start)) {
-    ++start;
-  }
-
-  switch (*start) {
-    case '-':
-      positive = false;
-    case '+':
-      ++start;
-    default:;
-  }
-
-  while (start < end && *start >= '0' && *start <= '9') {
-    auto const newRetVal = retVal * 10 + (*start++ - '0');
-    if (newRetVal < retVal) {
-      throw std::runtime_error("overflow");
-    }
-    retVal = newRetVal;
-  }
-
-  if (start != end) {
-    throw std::runtime_error("extra chars at the end");
-  }
-
-  return positive ? retVal : -retVal;
-}
-
-static StringPiece pc1 = "1234567890123456789";
-
-void handwrittenAtoiMeasure(unsigned int n, unsigned int digits) {
-  auto p = pc1.subpiece(pc1.size() - digits, digits);
-  FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(handwrittenAtoi(p.begin(), p.end()));
-  }
-}
-
-void follyAtoiMeasure(unsigned int n, unsigned int digits) {
-  auto p = pc1.subpiece(pc1.size() - digits, digits);
-  FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(folly::to<int64_t>(p.begin(), p.end()));
-  }
-}
-
-void clibAtoiMeasure(unsigned int n, unsigned int digits) {
-  auto p = pc1.subpiece(pc1.size() - digits, digits);
-  assert(*p.end() == 0);
-  static_assert(sizeof(long) == 8, "64-bit long assumed");
-  FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(atol(p.begin()));
-  }
-}
-
-void clibStrtoulMeasure(unsigned int n, unsigned int digits) {
-  auto p = pc1.subpiece(pc1.size() - digits, digits);
-  assert(*p.end() == 0);
-  char * endptr;
-  FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(strtoul(p.begin(), &endptr, 10));
-  }
-}
-
-void lexicalCastMeasure(unsigned int n, unsigned int digits) {
-  auto p = pc1.subpiece(pc1.size() - digits, digits);
-  assert(*p.end() == 0);
-  FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(boost::lexical_cast<uint64_t>(p.begin()));
-  }
-}
-
-// Benchmarks for unsigned to string conversion, raw
-
-unsigned u64ToAsciiTable(uint64_t value, char* dst) {
-  static const char digits[201] =
-    "00010203040506070809"
-    "10111213141516171819"
-    "20212223242526272829"
-    "30313233343536373839"
-    "40414243444546474849"
-    "50515253545556575859"
-    "60616263646566676869"
-    "70717273747576777879"
-    "80818283848586878889"
-    "90919293949596979899";
-
-  uint32_t const length = digits10(value);
-  uint32_t next = length - 1;
-  while (value >= 100) {
-    auto const i = (value % 100) * 2;
-    value /= 100;
-    dst[next] = digits[i + 1];
-    dst[next - 1] = digits[i];
-    next -= 2;
-  }
-  // Handle last 1-2 digits
-  if (value < 10) {
-    dst[next] = '0' + uint32_t(value);
-  } else {
-    auto i = uint32_t(value) * 2;
-    dst[next] = digits[i + 1];
-    dst[next - 1] = digits[i];
-  }
-  return length;
-}
-
-void u64ToAsciiTableBM(unsigned int n, uint64_t value) {
-  // This is too fast, need to do 10 times per iteration
-  char buf[20];
-  FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(u64ToAsciiTable(value + n, buf));
-  }
-}
-
-unsigned u64ToAsciiClassic(uint64_t value, char* dst) {
-  // Write backwards.
-  char* next = (char*)dst;
-  char* start = next;
-  do {
-    *next++ = '0' + (value % 10);
-    value /= 10;
-  } while (value != 0);
-  unsigned length = next - start;
-
-  // Reverse in-place.
-  next--;
-  while (next > start) {
-    char swap = *next;
-    *next = *start;
-    *start = swap;
-    next--;
-    start++;
-  }
-  return length;
-}
-
-void u64ToAsciiClassicBM(unsigned int n, uint64_t value) {
-  // This is too fast, need to do 10 times per iteration
-  char buf[20];
-  FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(u64ToAsciiClassic(value + n, buf));
-  }
-}
-
-void u64ToAsciiFollyBM(unsigned int n, uint64_t value) {
-  // This is too fast, need to do 10 times per iteration
-  char buf[20];
-  FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(uint64ToBufferUnsafe(value + n, buf));
-  }
-}
-
-// Benchmark unsigned to string conversion
-
-void u64ToStringClibMeasure(unsigned int n, uint64_t value) {
-  // FOLLY_RANGE_CHECK_TO_STRING expands to std::to_string, except on Android
-  // where std::to_string is not supported
-  FOR_EACH_RANGE (i, 0, n) {
-    FOLLY_RANGE_CHECK_TO_STRING(value + n);
-  }
-}
-
-void u64ToStringFollyMeasure(unsigned int n, uint64_t value) {
-  FOR_EACH_RANGE (i, 0, n) {
-    to<std::string>(value + n);
-  }
-}
-
-// Benchmark uitoa with string append
-
-void u2aAppendClassicBM(unsigned int n, uint64_t value) {
-  string s;
-  FOR_EACH_RANGE (i, 0, n) {
-    // auto buf = &s.back() + 1;
-    char buffer[20];
-    s.append(buffer, u64ToAsciiClassic(value, buffer));
-    doNotOptimizeAway(s.size());
-  }
-}
-
-void u2aAppendFollyBM(unsigned int n, uint64_t value) {
-  string s;
-  FOR_EACH_RANGE (i, 0, n) {
-    // auto buf = &s.back() + 1;
-    char buffer[20];
-    s.append(buffer, uint64ToBufferUnsafe(value, buffer));
-    doNotOptimizeAway(s.size());
-  }
-}
-
-template <class String>
-struct StringIdenticalToBM {
-  StringIdenticalToBM() {}
-  void operator()(unsigned int n, size_t len) const {
-    String s;
-    BENCHMARK_SUSPEND { s.append(len, '0'); }
-    FOR_EACH_RANGE (i, 0, n) {
-      String result = to<String>(s);
-      doNotOptimizeAway(result.size());
-    }
-  }
-};
-
-template <class String>
-struct StringVariadicToBM {
-  StringVariadicToBM() {}
-  void operator()(unsigned int n, size_t len) const {
-    String s;
-    BENCHMARK_SUSPEND { s.append(len, '0'); }
-    FOR_EACH_RANGE (i, 0, n) {
-      String result = to<String>(s, nullptr);
-      doNotOptimizeAway(result.size());
-    }
-  }
-};
-
-static size_t bigInt = 11424545345345;
-static size_t smallInt = 104;
-static char someString[] = "this is some nice string";
-static char otherString[] = "this is a long string, so it's not so nice";
-static char reallyShort[] = "meh";
-static std::string stdString = "std::strings are very nice";
-static float fValue = 1.2355;
-static double dValue = 345345345.435;
-
-BENCHMARK(preallocateTestNoFloat, n) {
-  for (size_t i = 0; i < n; ++i) {
-    auto val1 = to<std::string>(bigInt, someString, stdString, otherString);
-    auto val3 = to<std::string>(reallyShort, smallInt);
-    auto val2 = to<std::string>(bigInt, stdString);
-    auto val4 = to<std::string>(bigInt, stdString, dValue, otherString);
-    auto val5 = to<std::string>(bigInt, someString, reallyShort);
-  }
-}
-
-BENCHMARK(preallocateTestFloat, n) {
-  for (size_t i = 0; i < n; ++i) {
-    auto val1 = to<std::string>(stdString, ',', fValue, dValue);
-    auto val2 = to<std::string>(stdString, ',', dValue);
-  }
-}
-BENCHMARK_DRAW_LINE();
-
-static const StringIdenticalToBM<std::string> stringIdenticalToBM;
-static const StringVariadicToBM<std::string> stringVariadicToBM;
-static const StringIdenticalToBM<fbstring> fbstringIdenticalToBM;
-static const StringVariadicToBM<fbstring> fbstringVariadicToBM;
-
-#define DEFINE_BENCHMARK_GROUP(n)                       \
-  BENCHMARK_PARAM(u64ToAsciiClassicBM, n);              \
-  BENCHMARK_RELATIVE_PARAM(u64ToAsciiTableBM, n);       \
-  BENCHMARK_RELATIVE_PARAM(u64ToAsciiFollyBM, n);       \
-  BENCHMARK_DRAW_LINE();
-
-DEFINE_BENCHMARK_GROUP(1);
-DEFINE_BENCHMARK_GROUP(12);
-DEFINE_BENCHMARK_GROUP(123);
-DEFINE_BENCHMARK_GROUP(1234);
-DEFINE_BENCHMARK_GROUP(12345);
-DEFINE_BENCHMARK_GROUP(123456);
-DEFINE_BENCHMARK_GROUP(1234567);
-DEFINE_BENCHMARK_GROUP(12345678);
-DEFINE_BENCHMARK_GROUP(123456789);
-DEFINE_BENCHMARK_GROUP(1234567890);
-DEFINE_BENCHMARK_GROUP(12345678901);
-DEFINE_BENCHMARK_GROUP(123456789012);
-DEFINE_BENCHMARK_GROUP(1234567890123);
-DEFINE_BENCHMARK_GROUP(12345678901234);
-DEFINE_BENCHMARK_GROUP(123456789012345);
-DEFINE_BENCHMARK_GROUP(1234567890123456);
-DEFINE_BENCHMARK_GROUP(12345678901234567);
-DEFINE_BENCHMARK_GROUP(123456789012345678);
-DEFINE_BENCHMARK_GROUP(1234567890123456789);
-DEFINE_BENCHMARK_GROUP(12345678901234567890U);
-
-#undef DEFINE_BENCHMARK_GROUP
-
-#define DEFINE_BENCHMARK_GROUP(n)                        \
-  BENCHMARK_PARAM(u64ToStringClibMeasure, n);            \
-  BENCHMARK_RELATIVE_PARAM(u64ToStringFollyMeasure, n);  \
-  BENCHMARK_DRAW_LINE();
-
-DEFINE_BENCHMARK_GROUP(1);
-DEFINE_BENCHMARK_GROUP(12);
-DEFINE_BENCHMARK_GROUP(123);
-DEFINE_BENCHMARK_GROUP(1234);
-DEFINE_BENCHMARK_GROUP(12345);
-DEFINE_BENCHMARK_GROUP(123456);
-DEFINE_BENCHMARK_GROUP(1234567);
-DEFINE_BENCHMARK_GROUP(12345678);
-DEFINE_BENCHMARK_GROUP(123456789);
-DEFINE_BENCHMARK_GROUP(1234567890);
-DEFINE_BENCHMARK_GROUP(12345678901);
-DEFINE_BENCHMARK_GROUP(123456789012);
-DEFINE_BENCHMARK_GROUP(1234567890123);
-DEFINE_BENCHMARK_GROUP(12345678901234);
-DEFINE_BENCHMARK_GROUP(123456789012345);
-DEFINE_BENCHMARK_GROUP(1234567890123456);
-DEFINE_BENCHMARK_GROUP(12345678901234567);
-DEFINE_BENCHMARK_GROUP(123456789012345678);
-DEFINE_BENCHMARK_GROUP(1234567890123456789);
-DEFINE_BENCHMARK_GROUP(12345678901234567890U);
-
-#undef DEFINE_BENCHMARK_GROUP
-
-#define DEFINE_BENCHMARK_GROUP(n)                       \
-  BENCHMARK_PARAM(clibAtoiMeasure, n);                  \
-  BENCHMARK_RELATIVE_PARAM(lexicalCastMeasure, n);      \
-  BENCHMARK_RELATIVE_PARAM(handwrittenAtoiMeasure, n);  \
-  BENCHMARK_RELATIVE_PARAM(follyAtoiMeasure, n);        \
-  BENCHMARK_DRAW_LINE();
-
-DEFINE_BENCHMARK_GROUP(1);
-DEFINE_BENCHMARK_GROUP(2);
-DEFINE_BENCHMARK_GROUP(3);
-DEFINE_BENCHMARK_GROUP(4);
-DEFINE_BENCHMARK_GROUP(5);
-DEFINE_BENCHMARK_GROUP(6);
-DEFINE_BENCHMARK_GROUP(7);
-DEFINE_BENCHMARK_GROUP(8);
-DEFINE_BENCHMARK_GROUP(9);
-DEFINE_BENCHMARK_GROUP(10);
-DEFINE_BENCHMARK_GROUP(11);
-DEFINE_BENCHMARK_GROUP(12);
-DEFINE_BENCHMARK_GROUP(13);
-DEFINE_BENCHMARK_GROUP(14);
-DEFINE_BENCHMARK_GROUP(15);
-DEFINE_BENCHMARK_GROUP(16);
-DEFINE_BENCHMARK_GROUP(17);
-DEFINE_BENCHMARK_GROUP(18);
-DEFINE_BENCHMARK_GROUP(19);
-
-#undef DEFINE_BENCHMARK_GROUP
-
-#define DEFINE_BENCHMARK_GROUP(T, n)                    \
-  BENCHMARK_PARAM(T ## VariadicToBM, n);                \
-  BENCHMARK_RELATIVE_PARAM(T ## IdenticalToBM, n);      \
-  BENCHMARK_DRAW_LINE();
-
-DEFINE_BENCHMARK_GROUP(string, 32);
-DEFINE_BENCHMARK_GROUP(string, 1024);
-DEFINE_BENCHMARK_GROUP(string, 32768);
-DEFINE_BENCHMARK_GROUP(fbstring, 32);
-DEFINE_BENCHMARK_GROUP(fbstring, 1024);
-DEFINE_BENCHMARK_GROUP(fbstring, 32768);
-
-#undef DEFINE_BENCHMARK_GROUP
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  auto ret = RUN_ALL_TESTS();
-  if (!ret && FLAGS_benchmark) {
-    folly::runBenchmarks();
-  }
-  return ret;
-}
diff --git a/folly/test/LoggingBenchmark.cpp b/folly/test/LoggingBenchmark.cpp
new file mode 100644
index 00000000..bd08b23c
--- /dev/null
+++ b/folly/test/LoggingBenchmark.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/Logging.h>
+
+#include <folly/Benchmark.h>
+
+#include <vector>
+
+BENCHMARK(skip_overhead, iter) { // cost of an FB_LOG_EVERY_MS(INFO, 1000) statement executed iter times
+  auto prev = FLAGS_minloglevel; // remember the current threshold so it can be restored below
+  FLAGS_minloglevel = 2; // 2 is glog's ERROR severity, so INFO output is filtered while measuring
+
+  for (unsigned i = 0; i < iter; ++i) {
+    FB_LOG_EVERY_MS(INFO, 1000) << "every 1s";
+  }
+
+  FLAGS_minloglevel = prev; // put the threshold back for whatever runs next
+}
+
+BENCHMARK(dev_null_log_overhead, iter) { // same loop as skip_overhead but with an interval of -1 ms
+  auto prev = FLAGS_minloglevel; // remember the current threshold so it can be restored below
+  FLAGS_minloglevel = 2; // 2 is glog's ERROR severity, so INFO output is filtered while measuring
+
+  for (unsigned i = 0; i < iter; ++i) {
+    FB_LOG_EVERY_MS(INFO, -1) << "every -1ms"; // NOTE(review): presumably a negative interval never rate-limits — confirm in folly/Logging.h
+  }
+
+  FLAGS_minloglevel = prev; // put the threshold back for whatever runs next
+}
+
+// ============================================================================
+// folly/test/LoggingTest.cpp                      relative  time/iter  iters/s
+// ============================================================================
+// skip_overhead                                               36.37ns   27.49M
+// dev_null_log_overhead                                        2.61us  382.57K
+// ============================================================================
+
+int main(int argc, char** argv) { // benchmark-only entry point: no test framework is initialised here
+  gflags::ParseCommandLineFlags(&argc, &argv, true); // consume gflags options before running anything
+  folly::runBenchmarks(); // runs unconditionally; not gated on any flag in this function
+  return 0; // exit status does not depend on the benchmark results
+}
diff --git a/folly/test/LoggingTest.cpp b/folly/test/LoggingTest.cpp
index db439453..734640de 100644
--- a/folly/test/LoggingTest.cpp
+++ b/folly/test/LoggingTest.cpp
@@ -15,9 +15,9 @@
  */
 
 #include <folly/Logging.h>
-#include <gflags/gflags.h>
+
 #include <gtest/gtest.h>
-#include <folly/Benchmark.h>
+
 #include <vector>
 
 TEST(LogEveryMs, basic) {
@@ -50,43 +50,3 @@ TEST(LogEveryMs, zero) {
 
   EXPECT_EQ(10, count);
 }
-
-BENCHMARK(skip_overhead, iter) {
-  auto prev = FLAGS_minloglevel;
-  FLAGS_minloglevel = 2;
-
-  for (unsigned i = 0; i < iter; ++i) {
-    FB_LOG_EVERY_MS(INFO, 1000) << "every 1s";
-  }
-
-  FLAGS_minloglevel = prev;
-}
-
-BENCHMARK(dev_null_log_overhead, iter) {
-  auto prev = FLAGS_minloglevel;
-  FLAGS_minloglevel = 2;
-
-  for (unsigned i = 0; i < iter; ++i) {
-    FB_LOG_EVERY_MS(INFO, -1) << "every -1ms";
-  }
-
-  FLAGS_minloglevel = prev;
-}
-
-// ============================================================================
-// folly/test/LoggingTest.cpp                      relative  time/iter  iters/s
-// ============================================================================
-// skip_overhead                                               36.37ns   27.49M
-// dev_null_log_overhead                                        2.61us  382.57K
-// ============================================================================
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  auto rv = RUN_ALL_TESTS();
-  if (!rv && FLAGS_benchmark) {
-    folly::runBenchmarks();
-  }
-  return rv;
-}
diff --git a/folly/test/MemoryIdlerBenchmark.cpp b/folly/test/MemoryIdlerBenchmark.cpp
new file mode 100644
index 00000000..99594fe1
--- /dev/null
+++ b/folly/test/MemoryIdlerBenchmark.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/detail/MemoryIdler.h>
+
+#include <folly/Benchmark.h>
+
+using namespace folly::detail;
+
+BENCHMARK(releaseStack, iters) { // times iters back-to-back calls of MemoryIdler::unmapUnusedStack()
+  for (size_t i = 0; i < iters; ++i) {
+    MemoryIdler::unmapUnusedStack(); // called with its default arguments on the benchmark thread
+  }
+}
+
+BENCHMARK(releaseMallocTLS, iters) { // times iters back-to-back calls of MemoryIdler::flushLocalMallocCaches()
+  for (size_t i = 0; i < iters; ++i) {
+    MemoryIdler::flushLocalMallocCaches(); // nothing is allocated between calls in this loop
+  }
+}
+
+int main(int argc, char** argv) { // benchmark-only entry point: no test framework is initialised here
+  gflags::ParseCommandLineFlags(&argc, &argv, true); // consume gflags options before running anything
+  folly::runBenchmarks(); // runs unconditionally; not gated on any flag in this function
+  return 0; // exit status does not depend on the benchmark results
+}
diff --git a/folly/test/MemoryIdlerTest.cpp b/folly/test/MemoryIdlerTest.cpp
index 746ab766..ed4a35fa 100644
--- a/folly/test/MemoryIdlerTest.cpp
+++ b/folly/test/MemoryIdlerTest.cpp
@@ -15,15 +15,16 @@
  */
 
 #include <folly/detail/MemoryIdler.h>
+
 #include <folly/Baton.h>
+
 #include <memory>
 #include <thread>
 #include <assert.h>
 #include <semaphore.h>
-#include <gflags/gflags.h>
+
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include <folly/Benchmark.h>
 
 using namespace folly;
 using namespace folly::detail;
@@ -182,27 +183,3 @@ TEST(MemoryIdler, futexWaitNeverFlush) {
   EXPECT_TRUE((MemoryIdler::futexWait<MockAtom, MockClock>(
       fut, 1, -1, MockClock::duration::max())));
 }
-
-
-BENCHMARK(releaseStack, iters) {
-  for (size_t i = 0; i < iters; ++i) {
-    MemoryIdler::unmapUnusedStack();
-  }
-}
-
-BENCHMARK(releaseMallocTLS, iters) {
-  for (size_t i = 0; i < iters; ++i) {
-    MemoryIdler::flushLocalMallocCaches();
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  auto rv = RUN_ALL_TESTS();
-  if (!rv && FLAGS_benchmark) {
-    folly::runBenchmarks();
-  }
-  return rv;
-}
diff --git a/folly/test/RandomBenchmark.cpp b/folly/test/RandomBenchmark.cpp
new file mode 100644
index 00000000..cc17e5ee
--- /dev/null
+++ b/folly/test/RandomBenchmark.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/Random.h>
+
+#include <folly/Benchmark.h>
+#include <folly/Foreach.h>
+#include <folly/Range.h>
+
+#include <glog/logging.h>
+
+#include <algorithm>
+#include <thread>
+#include <vector>
+#include <random>
+
+using namespace folly;
+
+BENCHMARK(minstdrand, n) { // n draws from a std::minstd_rand engine
+  BenchmarkSuspender braces; // keeps the seeding below out of the measured time
+  std::random_device rd;
+  std::minstd_rand rng(rd()); // seeded with one value from std::random_device
+
+  braces.dismiss(); // timing resumes here, so only the loop is measured
+
+  FOR_EACH_RANGE(i, 0, n) { doNotOptimizeAway(rng()); }
+}
+
+BENCHMARK(mt19937, n) { // n draws from a std::mt19937 engine
+  BenchmarkSuspender braces; // keeps the seeding below out of the measured time
+  std::random_device rd;
+  std::mt19937 rng(rd()); // seeded with one value from std::random_device
+
+  braces.dismiss(); // timing resumes here, so only the loop is measured
+
+  FOR_EACH_RANGE(i, 0, n) { doNotOptimizeAway(rng()); }
+}
+
+BENCHMARK(threadprng, n) { // n draws from folly's ThreadLocalPRNG
+  BenchmarkSuspender braces; // keeps the setup below out of the measured time
+  ThreadLocalPRNG tprng;
+  tprng(); // one untimed call first — NOTE(review): presumably forces any lazy per-thread setup; confirm
+
+  braces.dismiss(); // timing resumes here, so only the loop is measured
+
+  FOR_EACH_RANGE(i, 0, n) { doNotOptimizeAway(tprng()); }
+}
+
+BENCHMARK(RandomDouble) { doNotOptimizeAway(Random::randDouble01()); }
+BENCHMARK(Random32) { doNotOptimizeAway(Random::rand32()); }
+BENCHMARK(Random32Num) { doNotOptimizeAway(Random::rand32(100)); }
+BENCHMARK(Random64) { doNotOptimizeAway(Random::rand64()); }
+BENCHMARK(Random64Num) { doNotOptimizeAway(Random::rand64(100ul << 32)); }
+BENCHMARK(Random64OneIn) { doNotOptimizeAway(Random::oneIn(100)); }
+
+int main(int argc, char** argv) { // benchmark-only entry point: no test framework is initialised here
+  gflags::ParseCommandLineFlags(&argc, &argv, true); // consume gflags options before running anything
+  folly::runBenchmarks(); // runs unconditionally; not gated on any flag in this function
+  return 0; // exit status does not depend on the benchmark results
+}
diff --git a/folly/test/RandomTest.cpp b/folly/test/RandomTest.cpp
index b31910d2..2f3c97d5 100644
--- a/folly/test/RandomTest.cpp
+++ b/folly/test/RandomTest.cpp
@@ -15,9 +15,6 @@
  */
 
 #include <folly/Random.h>
-#include <folly/Range.h>
-#include <folly/Benchmark.h>
-#include <folly/Foreach.h>
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
@@ -67,56 +64,3 @@ TEST(Random, MultiThreaded) {
     EXPECT_LT(seeds[i], seeds[i+1]);
   }
 }
-
-BENCHMARK(minstdrand, n) {
-  BenchmarkSuspender braces;
-  std::random_device rd;
-  std::minstd_rand rng(rd());
-
-  braces.dismiss();
-
-  FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(rng());
-  }
-}
-
-BENCHMARK(mt19937, n) {
-  BenchmarkSuspender braces;
-  std::random_device rd;
-  std::mt19937 rng(rd());
-
-  braces.dismiss();
-
-  FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(rng());
-  }
-}
-
-BENCHMARK(threadprng, n) {
-  BenchmarkSuspender braces;
-  ThreadLocalPRNG tprng;
-  tprng();
-
-  braces.dismiss();
-
-  FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(tprng());
-  }
-}
-
-BENCHMARK(RandomDouble) { doNotOptimizeAway(Random::randDouble01()); }
-BENCHMARK(Random32) { doNotOptimizeAway(Random::rand32()); }
-BENCHMARK(Random32Num) { doNotOptimizeAway(Random::rand32(100)); }
-BENCHMARK(Random64) { doNotOptimizeAway(Random::rand64()); }
-BENCHMARK(Random64Num) { doNotOptimizeAway(Random::rand64(100ul << 32)); }
-BENCHMARK(Random64OneIn) { doNotOptimizeAway(Random::oneIn(100)); }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  if (FLAGS_benchmark) {
-    folly::runBenchmarks();
-  }
-  return RUN_ALL_TESTS();
-}
diff --git a/folly/test/ThreadLocalBenchmark.cpp b/folly/test/ThreadLocalBenchmark.cpp
new file mode 100644
index 00000000..dfaec0b0
--- /dev/null
+++ b/folly/test/ThreadLocalBenchmark.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/ThreadLocal.h>
+
+#include <dlfcn.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <array>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <limits.h>
+#include <map>
+#include <mutex>
+#include <set>
+#include <thread>
+#include <unordered_map>
+
+#include <boost/thread/tss.hpp>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include <folly/Benchmark.h>
+#include <folly/experimental/io/FsUtil.h>
+
+using namespace folly;
+
+// Simple reference implementation built on pthread_getspecific / pthread_setspecific
+template <typename T>
+class PThreadGetSpecific {
+ public:
+  PThreadGetSpecific() : key_(0) { pthread_key_create(&key_, OnThreadExit); } // return value is not checked
+
+  T* get() const { return static_cast<T*>(pthread_getspecific(key_)); } // calling thread's pointer; null when none was set
+
+  void reset(T* t) { // stores t for the calling thread after deleting the previously stored object
+    delete get();
+    pthread_setspecific(key_, t);
+  }
+  static void OnThreadExit(void* obj) { delete static_cast<T*>(obj); } // registered as the key's destructor in the constructor
+
+ private:
+  pthread_key_t key_; // this class never calls pthread_key_delete on it
+};
+
+DEFINE_int32(numThreads, 8, "Number simultaneous threads for benchmarks.");
+
+#define REG(var) /* defines benchmark BM_mt_<var> for the thread-local object `var` */ \
+  BENCHMARK(FB_CONCATENATE(BM_mt_, var), iters) {                                      \
+    const int itersPerThread = iters / FLAGS_numThreads; /* remainder is not run */    \
+    std::vector<std::thread> threads;                                                  \
+    for (int i = 0; i < FLAGS_numThreads; ++i) {                                       \
+      threads.emplace_back([&]() { /* thread is constructed in place */                \
+        var.reset(new int(0)); /* each worker installs its own counter */              \
+        for (int j = 0; j < itersPerThread; ++j) { /* j is distinct from outer i */    \
+          ++(*var.get());                                                              \
+        }                                                                              \
+      });                                                                              \
+    }                                                                                  \
+    for (auto& t : threads) { /* wait for every worker before the benchmark returns */ \
+      t.join();                                                                        \
+    }                                                                                  \
+  }
+
+ThreadLocalPtr<int> tlp; // folly's thread-local pointer
+REG(tlp); // defines benchmark BM_mt_tlp
+PThreadGetSpecific<int> pthread_get_specific; // the pthread-key wrapper class defined in this file
+REG(pthread_get_specific); // defines benchmark BM_mt_pthread_get_specific
+boost::thread_specific_ptr<int> boost_tsp; // Boost's thread-specific pointer
+REG(boost_tsp); // defines benchmark BM_mt_boost_tsp
+BENCHMARK_DRAW_LINE(); // separator after the three entries
+
+int main(int argc, char** argv) { // benchmark-only entry point: no test framework is initialised here
+  gflags::ParseCommandLineFlags(&argc, &argv, true); // consume gflags options before running anything
+  gflags::SetCommandLineOptionWithMode( // applied after parsing; SET_FLAG_IF_DEFAULT leaves a user-supplied bm_max_iters alone
+      "bm_max_iters", "100000000", gflags::SET_FLAG_IF_DEFAULT);
+  folly::runBenchmarks(); // runs unconditionally; not gated on any flag in this function
+  return 0; // exit status does not depend on the benchmark results
+}
+
+/*
+Ran with 24 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches
+
+Benchmark                               Iters   Total t    t/iter iter/sec
+------------------------------------------------------------------------------
+*       BM_mt_tlp                   100000000  39.88 ms  398.8 ps  2.335 G
+ +5.91% BM_mt_pthread_get_specific  100000000  42.23 ms  422.3 ps  2.205 G
+ + 295% BM_mt_boost_tsp             100000000  157.8 ms  1.578 ns  604.5 M
+------------------------------------------------------------------------------
+*/
diff --git a/folly/test/ThreadLocalTest.cpp b/folly/test/ThreadLocalTest.cpp
index 685802c9..6c2544a9 100644
--- a/folly/test/ThreadLocalTest.cpp
+++ b/folly/test/ThreadLocalTest.cpp
@@ -32,12 +32,9 @@
 #include <thread>
 #include <unordered_map>
 
-#include <boost/thread/tss.hpp>
-#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
-#include <folly/Benchmark.h>
 #include <folly/Baton.h>
 #include <folly/experimental/io/FsUtil.h>
 
@@ -596,91 +593,3 @@ TEST(ThreadLocal, UnregisterClassHasConstExprCtor) {
   // yep!
   SUCCEED();
 }
-
-// clang is unable to compile this code unless in c++14 mode.
-#if __cplusplus >= 201402L
-namespace {
-// This will fail to compile unless ThreadLocal{Ptr} has a constexpr
-// default constructor. This ensures that ThreadLocal is safe to use in
-// static constructors without worrying about initialization order
-class ConstexprThreadLocalCompile {
-  ThreadLocal<int> a_;
-  ThreadLocalPtr<int> b_;
-
-  constexpr ConstexprThreadLocalCompile() {}
-};
-}
-#endif
-
-// Simple reference implementation using pthread_get_specific
-template<typename T>
-class PThreadGetSpecific {
- public:
-  PThreadGetSpecific() : key_(0) {
-    pthread_key_create(&key_, OnThreadExit);
-  }
-
-  T* get() const {
-    return static_cast<T*>(pthread_getspecific(key_));
-  }
-
-  void reset(T* t) {
-    delete get();
-    pthread_setspecific(key_, t);
-  }
-  static void OnThreadExit(void* obj) {
-    delete static_cast<T*>(obj);
-  }
- private:
-  pthread_key_t key_;
-};
-
-DEFINE_int32(numThreads, 8, "Number simultaneous threads for benchmarks.");
-
-#define REG(var)                                                \
-  BENCHMARK(FB_CONCATENATE(BM_mt_, var), iters) {               \
-    const int itersPerThread = iters / FLAGS_numThreads;        \
-    std::vector<std::thread> threads;                           \
-    for (int i = 0; i < FLAGS_numThreads; ++i) {                \
-      threads.push_back(std::thread([&]() {                     \
-        var.reset(new int(0));                                  \
-        for (int i = 0; i < itersPerThread; ++i) {              \
-          ++(*var.get());                                       \
-        }                                                       \
-      }));                                                      \
-    }                                                           \
-    for (auto& t : threads) {                                   \
-      t.join();                                                 \
-    }                                                           \
-  }
-
-ThreadLocalPtr<int> tlp;
-REG(tlp);
-PThreadGetSpecific<int> pthread_get_specific;
-REG(pthread_get_specific);
-boost::thread_specific_ptr<int> boost_tsp;
-REG(boost_tsp);
-BENCHMARK_DRAW_LINE();
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  gflags::SetCommandLineOptionWithMode(
-    "bm_max_iters", "100000000", gflags::SET_FLAG_IF_DEFAULT
-  );
-  if (FLAGS_benchmark) {
-    folly::runBenchmarks();
-  }
-  return RUN_ALL_TESTS();
-}
-
-/*
-Ran with 24 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches
-
-Benchmark                               Iters   Total t    t/iter iter/sec
-------------------------------------------------------------------------------
-*       BM_mt_tlp                   100000000  39.88 ms  398.8 ps  2.335 G
- +5.91% BM_mt_pthread_get_specific  100000000  42.23 ms  422.3 ps  2.205 G
- + 295% BM_mt_boost_tsp             100000000  157.8 ms  1.578 ns  604.5 M
-------------------------------------------------------------------------------
-*/