folly/test/CacheLocalityBenchmark.cpp

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <folly/detail/CacheLocality.h>
  18
  19 #include <sched.h>
  20 #include <memory>
  21 #include <thread>
  22 #include <type_traits>
  23 #include <unordered_map>
  24 #include <glog/logging.h>
  25 #include <folly/Benchmark.h>
  26
  27 using namespace folly::detail;
  28
  29 #define DECLARE_SPREADER_TAG(tag, locality, func)      \
  30   namespace {                                          \
  31   template <typename dummy>                            \
  32   struct tag {};                                       \
  33   }                                                    \
  34   DECLARE_ACCESS_SPREADER_TYPE(tag)                    \
  35   namespace folly {                                    \
  36   namespace detail {                                   \
  37   template <>                                          \
  38   const CacheLocality& CacheLocality::system<tag>() {  \
  39     static auto* inst = new CacheLocality(locality);   \
  40     return *inst;                                      \
  41   }                                                    \
  42   template <>                                          \
  43   Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
  44     return func;                                       \
  45   }                                                    \
  46   }                                                    \
  47   }
  48
  49 DECLARE_SPREADER_TAG(
  50     ThreadLocalTag,
  51     CacheLocality::system<>(),
  52     folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
  53 DECLARE_SPREADER_TAG(PthreadSelfTag,
  54                      CacheLocality::system<>(),
  55                      folly::detail::FallbackGetcpu<HashingThreadId>::getcpu)
  56
  57 BENCHMARK(AccessSpreaderUse, iters) {
  58   for (unsigned long i = 0; i < iters; ++i) {
  59     auto x = AccessSpreader<>::current(16);
  60     folly::doNotOptimizeAway(x);
  61   }
  62 }
  63
  64 // Benchmark scores here reflect the time for 32 threads to perform an
  65 // atomic increment on a dual-socket E5-2660 @ 2.2Ghz.  Surprisingly,
  66 // if we don't separate the counters onto unique 128 byte stripes the
  67 // 1_stripe and 2_stripe results are identical, even though the L3 is
  68 // claimed to have 64 byte cache lines.
  69 //
  70 // Getcpu refers to the vdso getcpu implementation.  ThreadLocal refers
  71 // to execution using SequentialThreadId, the fallback if the vdso
  72 // getcpu isn't available.  PthreadSelf hashes the value returned from
  73 // pthread_self() as a fallback-fallback for systems that don't have
  74 // thread-local support.
  75 //
  76 // At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
  77 // so since the stripe selection is 12 nanos the atomic increments in
  78 // the L1 is ~17 nanos.  At width 8_stripe_0_work the line is expected
  79 // to ping-pong almost every operation, since the loops have the same
  80 // duration.  Widths 4 and 2 have the same behavior, but each tour of the
  81 // cache line is 4 and 8 cores long, respectively.  These all suggest a
  82 // lower bound of 60 nanos for intra-chip handoff and increment between
  83 // the L1s.
  84 //
  85 // With 420 nanos of busywork per contended increment, the system can
  86 // hide all of the latency of a tour of length 4, but not quite one of
  87 // length 8.  I was a bit surprised at how much worse the non-striped
  88 // version got.  It seems that the inter-chip traffic also interferes
  89 // with the L1-only localWork.load().  When the local work is doubled
  90 // to about 1 microsecond we see that the inter-chip contention is still
  91 // very important, but subdivisions on the same chip don't matter.
  92 //
  93 // sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
  94 //     --benchmark --bm_min_iters=1000000
  95 // ============================================================================
  96 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
  97 // ============================================================================
  98 // AccessSpreaderUse                                           11.94ns   83.79M
  99 // ----------------------------------------------------------------------------
 100 // contentionAtWidthGetcpu(1_stripe_0_work)                   985.75ns    1.01M
 101 // contentionAtWidthGetcpu(2_stripe_0_work)                   424.02ns    2.36M
 102 // contentionAtWidthGetcpu(4_stripe_0_work)                   190.13ns    5.26M
 103 // contentionAtWidthGetcpu(8_stripe_0_work)                    91.86ns   10.89M
 104 // contentionAtWidthGetcpu(16_stripe_0_work)                   29.31ns   34.12M
 105 // contentionAtWidthGetcpu(32_stripe_0_work)                   29.53ns   33.86M
 106 // contentionAtWidthGetcpu(64_stripe_0_work)                   29.93ns   33.41M
 107 // contentionAtWidthThreadLocal(2_stripe_0_work)              609.21ns    1.64M
 108 // contentionAtWidthThreadLocal(4_stripe_0_work)              303.60ns    3.29M
 109 // contentionAtWidthThreadLocal(8_stripe_0_work)              246.57ns    4.06M
 110 // contentionAtWidthThreadLocal(16_stripe_0_work)             154.84ns    6.46M
 111 // contentionAtWidthThreadLocal(32_stripe_0_work)              24.14ns   41.43M
 112 // contentionAtWidthThreadLocal(64_stripe_0_work)              23.95ns   41.75M
 113 // contentionAtWidthPthreadSelf(2_stripe_0_work)              722.01ns    1.39M
 114 // contentionAtWidthPthreadSelf(4_stripe_0_work)              501.56ns    1.99M
 115 // contentionAtWidthPthreadSelf(8_stripe_0_work)              474.58ns    2.11M
 116 // contentionAtWidthPthreadSelf(16_stripe_0_work)             300.90ns    3.32M
 117 // contentionAtWidthPthreadSelf(32_stripe_0_work)             175.77ns    5.69M
 118 // contentionAtWidthPthreadSelf(64_stripe_0_work)             174.88ns    5.72M
 119 // atomicIncrBaseline(local_incr_0_work)                       16.81ns   59.51M
 120 // ----------------------------------------------------------------------------
 121 // contentionAtWidthGetcpu(1_stripe_500_work)                   1.82us  549.97K
 122 // contentionAtWidthGetcpu(2_stripe_500_work)                 533.71ns    1.87M
 123 // contentionAtWidthGetcpu(4_stripe_500_work)                 424.64ns    2.35M
 124 // contentionAtWidthGetcpu(8_stripe_500_work)                 451.85ns    2.21M
 125 // contentionAtWidthGetcpu(16_stripe_500_work)                425.54ns    2.35M
 126 // contentionAtWidthGetcpu(32_stripe_500_work)                501.66ns    1.99M
 127 // atomicIncrBaseline(local_incr_500_work)                    438.46ns    2.28M
 128 // ----------------------------------------------------------------------------
 129 // contentionAtWidthGetcpu(1_stripe_1000_work)                  1.88us  532.20K
 130 // contentionAtWidthGetcpu(2_stripe_1000_work)                824.62ns    1.21M
 131 // contentionAtWidthGetcpu(4_stripe_1000_work)                803.56ns    1.24M
 132 // contentionAtWidthGetcpu(8_stripe_1000_work)                926.65ns    1.08M
 133 // contentionAtWidthGetcpu(16_stripe_1000_work)               900.10ns    1.11M
 134 // contentionAtWidthGetcpu(32_stripe_1000_work)               890.75ns    1.12M
 135 // atomicIncrBaseline(local_incr_1000_work)                   774.47ns    1.29M
 136 // ============================================================================
 137 template <template <typename> class Tag>
 138 static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
 139   const size_t counterAlignment = 128;
 140   const size_t numThreads = 32;
 141
 142   folly::BenchmarkSuspender braces;
 143
 144   std::atomic<size_t> ready(0);
 145   std::atomic<bool> go(false);
 146
 147   // while in theory the cache line size is 64 bytes, experiments show
 148   // that we get contention on 128 byte boundaries for Ivy Bridge.  The
 149   // extra indirection adds 1 or 2 nanos
 150   assert(counterAlignment >= sizeof(std::atomic<size_t>));
 151   std::vector<char> raw(counterAlignment * stripes);
 152
 153   // if we happen to be using the tlsRoundRobin, then sequentially
 154   // assigning the thread identifiers is the unlikely best-case scenario.
 155   // We don't want to unfairly benefit or penalize.  Computing the exact
 156   // maximum likelihood of the probability distributions is annoying, so
 157   // I approximate as 2/5 of the ids that have no threads, 2/5 that have
 158   // 1, 2/15 that have 2, and 1/15 that have 3.  We accomplish this by
 159   // wrapping back to slot 0 when we hit 1/15 and 1/5.
 160
 161   std::vector<std::thread> threads;
 162   while (threads.size() < numThreads) {
 163     threads.push_back(std::thread([&, iters, stripes, work]() {
 164       std::atomic<size_t>* counters[stripes];
 165       for (size_t i = 0; i < stripes; ++i) {
 166         counters[i] =
 167             new (raw.data() + counterAlignment * i) std::atomic<size_t>();
 168       }
 169
 170       ready++;
 171       while (!go.load()) {
 172         sched_yield();
 173       }
 174       std::atomic<int> localWork(0);
 175       for (size_t i = iters; i > 0; --i) {
 176         ++*(counters[AccessSpreader<Tag>::current(stripes)]);
 177         for (size_t j = work; j > 0; --j) {
 178           localWork.load();
 179         }
 180       }
 181     }));
 182
 183     if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
 184       // create a few dummy threads to wrap back around to 0 mod numCpus
 185       for (size_t i = threads.size(); i != numThreads; ++i) {
 186         std::thread([&]() { AccessSpreader<Tag>::current(stripes); }).join();
 187       }
 188     }
 189   }
 190
 191   while (ready < numThreads) {
 192     sched_yield();
 193   }
 194   braces.dismiss();
 195   go = true;
 196
 197   for (auto& thr : threads) {
 198     thr.join();
 199   }
 200 }
 201
 202 static void atomicIncrBaseline(size_t iters,
 203                                size_t work,
 204                                size_t numThreads = 32) {
 205   folly::BenchmarkSuspender braces;
 206
 207   std::atomic<bool> go(false);
 208
 209   std::vector<std::thread> threads;
 210   while (threads.size() < numThreads) {
 211     threads.push_back(std::thread([&]() {
 212       while (!go.load()) {
 213         sched_yield();
 214       }
 215       std::atomic<size_t> localCounter(0);
 216       std::atomic<int> localWork(0);
 217       for (size_t i = iters; i > 0; --i) {
 218         localCounter++;
 219         for (size_t j = work; j > 0; --j) {
 220           localWork.load();
 221         }
 222       }
 223     }));
 224   }
 225
 226   braces.dismiss();
 227   go = true;
 228
 229   for (auto& thr : threads) {
 230     thr.join();
 231   }
 232 }
 233
 234 static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
 235   contentionAtWidth<std::atomic>(iters, stripes, work);
 236 }
 237
 238 static void contentionAtWidthThreadLocal(size_t iters,
 239                                          size_t stripes,
 240                                          size_t work) {
 241   contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
 242 }
 243
 244 static void contentionAtWidthPthreadSelf(size_t iters,
 245                                          size_t stripes,
 246                                          size_t work) {
 247   contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
 248 }
 249
 250 BENCHMARK_DRAW_LINE()
 251 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
 252 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
 253 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
 254 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
 255 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
 256 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
 257 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
 258 BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
 259 BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
 260 BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
 261 BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
 262 BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
 263 BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
 264 BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
 265 BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
 266 BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
 267 BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
 268 BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
 269 BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
 270 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
 271 BENCHMARK_DRAW_LINE()
 272 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500)
 273 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
 274 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
 275 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
 276 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
 277 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
 278 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
 279 BENCHMARK_DRAW_LINE()
 280 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000)
 281 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
 282 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
 283 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
 284 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
 285 BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
 286 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
 287
 288 int main(int argc, char** argv) {
 289   gflags::ParseCommandLineFlags(&argc, &argv, true);
 290   folly::runBenchmarks();
 291   return 0;
 292 }