folly/test/ThreadLocalBenchmark.cpp

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <folly/ThreadLocal.h>
  18
  19 #include <dlfcn.h>
  20 #include <sys/types.h>
  21 #include <sys/wait.h>
  22
  23 #include <array>
  24 #include <atomic>
  25 #include <chrono>
  26 #include <condition_variable>
  27 #include <limits.h>
  28 #include <map>
  29 #include <mutex>
  30 #include <set>
  31 #include <thread>
  32 #include <unordered_map>
  33
  34 #include <boost/thread/tss.hpp>
  35 #include <glog/logging.h>
  36
  37 #include <folly/Benchmark.h>
  38 #include <folly/experimental/io/FsUtil.h>
  39 #include <folly/portability/GFlags.h>
  40 #include <folly/portability/Unistd.h>
  41
  42 using namespace folly;
  43
  44 // Simple reference implementation using pthread_get_specific
  45 template <typename T>
  46 class PThreadGetSpecific {
  47  public:
  48   PThreadGetSpecific() : key_(0) { pthread_key_create(&key_, OnThreadExit); }
  49
  50   T* get() const { return static_cast<T*>(pthread_getspecific(key_)); }
  51
  52   void reset(T* t) {
  53     delete get();
  54     pthread_setspecific(key_, t);
  55   }
  56   static void OnThreadExit(void* obj) { delete static_cast<T*>(obj); }
  57
  58  private:
  59   pthread_key_t key_;
  60 };
  61
  62 DEFINE_int32(numThreads, 8, "Number simultaneous threads for benchmarks.");
  63
  64 #define REG(var)                                         \
  65   BENCHMARK(FB_CONCATENATE(BM_mt_, var), iters) {        \
  66     const int itersPerThread = iters / FLAGS_numThreads; \
  67     std::vector<std::thread> threads;                    \
  68     for (int i = 0; i < FLAGS_numThreads; ++i) {         \
  69       threads.push_back(std::thread([&]() {              \
  70         var.reset(new int(0));                           \
  71         for (int i = 0; i < itersPerThread; ++i) {       \
  72           ++(*var.get());                                \
  73         }                                                \
  74       }));                                               \
  75     }                                                    \
  76     for (auto& t : threads) {                            \
  77       t.join();                                          \
  78     }                                                    \
  79   }
  80
  81 ThreadLocalPtr<int> tlp;
  82 REG(tlp);
  83 PThreadGetSpecific<int> pthread_get_specific;
  84 REG(pthread_get_specific);
  85 boost::thread_specific_ptr<int> boost_tsp;
  86 REG(boost_tsp);
  87 BENCHMARK_DRAW_LINE();
  88
  89 int main(int argc, char** argv) {
  90   gflags::ParseCommandLineFlags(&argc, &argv, true);
  91   gflags::SetCommandLineOptionWithMode(
  92       "bm_max_iters", "100000000", gflags::SET_FLAG_IF_DEFAULT);
  93   folly::runBenchmarks();
  94   return 0;
  95 }
  96
  97 /*
  98 Ran with 24 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches
  99
 100 Benchmark                               Iters   Total t    t/iter iter/sec
 101 ------------------------------------------------------------------------------
 102 *       BM_mt_tlp                   100000000  39.88 ms  398.8 ps  2.335 G
 103  +5.91% BM_mt_pthread_get_specific  100000000  42.23 ms  422.3 ps  2.205 G
 104  + 295% BM_mt_boost_tsp             100000000  157.8 ms  1.578 ns  604.5 M
 105 ------------------------------------------------------------------------------
 106 */