/*
 * Copyright 2016 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/Benchmark.h>
#include <folly/SmallLocks.h>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <numeric>
#include <thread>
#include <vector>

/* A "work cycle" is just an additional nop loop iteration.
 * A smaller number of work cycles will result in more contention,
 * which is what we're trying to measure.  The ratio of locked to
 * unlocked cycles simulates how large critical sections are in
 * production code.
 */
DEFINE_int32(work, 100, "Number of work cycles");
DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
DEFINE_int32(
    threads,
    std::thread::hardware_concurrency(),
    "Number of threads for fairness test");

static void burn(size_t n) {
  for (size_t i = 0; i < n; ++i) {
    folly::doNotOptimizeAway(i);
  }
}

namespace {

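// Single-use barrier: wait() blocks until count_ threads have arrived, so all
// benchmark threads start their measured loops at the same time.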
struct SimpleBarrier {
  explicit SimpleBarrier(size_t count) : lock_(), cv_(), count_(count) {}

  void wait() {
    std::unique_lock<std::mutex> lockHeld(lock_);
    if (++num_ == count_) {
      cv_.notify_all();
    } else {
      cv_.wait(lockHeld, [&]() { return num_ >= count_; });
    }
  }

 private:
  std::mutex lock_;
  std::condition_variable cv_;
  size_t num_{0};
  size_t count_;
};
} // namespace

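// Adapter that calls init() on construction, so the folly spin locks expose
// the same default-constructible lock()/unlock() interface as std::mutex.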
template <typename Lock>
class InitLock {
  Lock lock_;

 public:
  InitLock() {
    lock_.init();
  }
  void lock() {
    lock_.lock();
  }
  void unlock() {
    lock_.unlock();
  }
};

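// Contended benchmark: each lock is shared by numThreads threads (additional
// groups are created so every hardware thread stays busy). Each thread holds
// the lock for FLAGS_work burn cycles, then does FLAGS_unlocked_work cycles
// outside the lock, repeated numOps times.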
template <typename Lock>
static void runContended(size_t numOps, size_t numThreads) {
  folly::BenchmarkSuspender braces;
  size_t totalthreads = std::thread::hardware_concurrency();
  if (totalthreads < numThreads) {
    totalthreads = numThreads;
  }
  size_t threadgroups = totalthreads / numThreads;
  struct lockstruct {
    char padding1[128]; // keep neighbouring locks off the same cache lines
    Lock lock;
    char padding2[128];
    long value = 1;
  };

  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));

  char padding3[128];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t < totalthreads; ++t) {
    threads[t] = std::thread([&, t, totalthreads] {
      lockstruct* lock = &locks[t % threadgroups];
      runbarrier.wait();
      for (size_t op = 0; op < numOps; op += 1) {
        lock->lock.lock();
        burn(FLAGS_work);
        lock->value++;
        lock->lock.unlock();
        burn(FLAGS_unlocked_work);
      }
    });
  }

  runbarrier.wait();
  braces.dismissing([&] {
    for (auto& thr : threads) {
      thr.join();
    }
  });
}

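// Fairness test: threads hammer a shared lock for two seconds; afterwards we
// report how many acquisitions each thread managed (mean/stddev across
// threads) plus statistics on the individual lock-acquisition latencies.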
template <typename Lock>
static void runFairness() {
  size_t numThreads = FLAGS_threads;
  size_t totalthreads = std::thread::hardware_concurrency();
  if (totalthreads < numThreads) {
    totalthreads = numThreads;
  }
  size_t threadgroups = totalthreads / numThreads;
  struct lockstruct {
    char padding1[128];
    Lock lock;
  };

  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));

  char padding3[64];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  std::atomic<bool> stop{false};

  std::mutex rlock;
  std::vector<long> results;
  std::vector<std::chrono::microseconds> maxes;

  std::vector<std::chrono::microseconds> aqTime;
  std::vector<unsigned long> aqTimeSq;

  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t < totalthreads; ++t) {
    threads[t] = std::thread([&, t, totalthreads] {
      lockstruct* lock = &locks[t % threadgroups];
      long value = 0;
      std::chrono::microseconds max(0);
      std::chrono::microseconds time(0);
      unsigned long timeSq(0);
      runbarrier.wait();
      while (!stop) {
        std::chrono::steady_clock::time_point prelock =
            std::chrono::steady_clock::now();
        lock->lock.lock();
        std::chrono::steady_clock::time_point postlock =
            std::chrono::steady_clock::now();
        auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
            postlock - prelock);
        time += diff;
        timeSq += diff.count() * diff.count();
        if (diff > max) {
          max = diff;
        }
        burn(FLAGS_work);
        value++;
        lock->lock.unlock();
        burn(FLAGS_unlocked_work);
      }
      {
        std::lock_guard<std::mutex> g(rlock);
        results.push_back(value);
        maxes.push_back(max);
        aqTime.push_back(time);
        aqTimeSq.push_back(timeSq);
      }
    });
  }

  runbarrier.wait();
  /* sleep override */
  std::this_thread::sleep_for(std::chrono::seconds(2));
  stop = true;

  for (auto& thr : threads) {
    thr.join();
  }

  // Calculate some stats
  unsigned long sum = std::accumulate(results.begin(), results.end(), 0UL);
  double m = static_cast<double>(sum) / results.size();

  double accum = 0.0;
  std::for_each(results.begin(), results.end(), [&](const double d) {
    accum += (d - m) * (d - m);
  });
  double stdev = std::sqrt(accum / (results.size() - 1));
  std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end());
  std::chrono::microseconds agAqTime = std::accumulate(
      aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
  unsigned long agAqTimeSq =
      std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0UL);
  std::chrono::microseconds mean = agAqTime / sum;
  // Sample variance of the per-acquisition wait times, computed as
  // (n * sum(x^2) - (sum x)^2) / (n * (n - 1)) with n = total acquisitions.
  double variance =
      (static_cast<double>(sum) * agAqTimeSq -
       static_cast<double>(agAqTime.count()) * agAqTime.count()) /
      sum / (sum - 1);
  double stddev2 = std::sqrt(variance);

  printf("Sum: %lu Mean: %.0f stddev: %.0f\n", sum, m, stdev);
  printf(
      "Lock time stats in us: mean %li stddev %.0f max %li\n",
      mean.count(),
      stddev2,
      mx.count());
}

BENCHMARK(StdMutexUncontendedBenchmark, iters) {
  std::mutex lock;
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}

BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
  folly::MicroSpinLock lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}

BENCHMARK(PicoSpinLockUncontendedBenchmark, iters) {
  // uint8_t would be more fair, but PicoSpinLock needs at least two bytes
  folly::PicoSpinLock<uint16_t> lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}

BENCHMARK(MicroLockUncontendedBenchmark, iters) {
  folly::MicroLock lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}

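// Baseline: the cost of a single virtual call, for comparison with the
// uncontended lock/unlock pairs above.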
struct VirtualBase {
  virtual void foo() = 0;
  virtual ~VirtualBase() {}
};

struct VirtualImpl : VirtualBase {
  void foo() override { /* noop */ }
  virtual ~VirtualImpl() {}
};

#ifndef __clang__
// noclone is a GCC-only attribute, so this benchmark is skipped under clang.
__attribute__((noinline, noclone)) VirtualBase* makeVirtual() {
  return new VirtualImpl();
}

BENCHMARK(VirtualFunctionCall, iters) {
  VirtualBase* vb = makeVirtual();
  while (iters--) {
    vb->foo();
  }
  delete vb;
}
#endif

BENCHMARK_DRAW_LINE()

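// Shorthand for registering the contended benchmarks: BENCH_BASE emits the
// baseline entry and BENCH_REL entries that report time relative to it.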
#define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__))
#define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__))

static void std_mutex(size_t numOps, size_t numThreads) {
  runContended<std::mutex>(numOps, numThreads);
}
static void folly_microspin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::MicroSpinLock>>(numOps, numThreads);
}
static void folly_picospin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::PicoSpinLock<uint16_t>>>(numOps, numThreads);
}
static void folly_microlock(size_t numOps, size_t numThreads) {
  runContended<folly::MicroLock>(numOps, numThreads);
}

BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 1thread, 1)
BENCH_REL(folly_microspin, 1thread, 1)
BENCH_REL(folly_picospin, 1thread, 1)
BENCH_REL(folly_microlock, 1thread, 1)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 2thread, 2)
BENCH_REL(folly_microspin, 2thread, 2)
BENCH_REL(folly_picospin, 2thread, 2)
BENCH_REL(folly_microlock, 2thread, 2)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 4thread, 4)
BENCH_REL(folly_microspin, 4thread, 4)
BENCH_REL(folly_picospin, 4thread, 4)
BENCH_REL(folly_microlock, 4thread, 4)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 8thread, 8)
BENCH_REL(folly_microspin, 8thread, 8)
BENCH_REL(folly_picospin, 8thread, 8)
BENCH_REL(folly_microlock, 8thread, 8)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 16thread, 16)
BENCH_REL(folly_microspin, 16thread, 16)
BENCH_REL(folly_picospin, 16thread, 16)
BENCH_REL(folly_microlock, 16thread, 16)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 32thread, 32)
BENCH_REL(folly_microspin, 32thread, 32)
BENCH_REL(folly_picospin, 32thread, 32)
BENCH_REL(folly_microlock, 32thread, 32)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 64thread, 64)
BENCH_REL(folly_microspin, 64thread, 64)
BENCH_REL(folly_picospin, 64thread, 64)
BENCH_REL(folly_microlock, 64thread, 64)

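// Print the lock type's name, then run the fairness test for it.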
#define FairnessTest(type) \
  {                        \
    printf(#type ": \n");  \
    runFairness<type>();   \
  }

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  FairnessTest(std::mutex);
  FairnessTest(InitLock<folly::MicroSpinLock>);
  FairnessTest(InitLock<folly::PicoSpinLock<uint16_t>>);
  FairnessTest(folly::MicroLock);

  folly::runBenchmarks();

  return 0;
}

/*
locks_benchmark --bm_min_iters=1000000

std::mutex:
Sum: 2895849 Mean: 90495 stddev: 3147
Lock time stats in us: mean 11 stddev 1483 max 17823
InitLock<folly::MicroSpinLock>:
Sum: 3114365 Mean: 97323 stddev: 92604
Lock time stats in us: mean 19 stddev 1379 max 235710
InitLock<folly::PicoSpinLock<uint16_t>>:
Sum: 1189713 Mean: 37178 stddev: 23295
Lock time stats in us: mean 51 stddev 3610 max 414093
folly::MicroLock:
Sum: 1890150 Mean: 59067 stddev: 26795
Lock time stats in us: mean 31 stddev 2272 max 70996
============================================================================
folly/test/SmallLocksBenchmark.cpp              relative  time/iter  iters/s
============================================================================
StdMutexUncontendedBenchmark                                25.97ns   38.51M
MicroSpinLockUncontendedBenchmark                           16.26ns   61.49M
PicoSpinLockUncontendedBenchmark                            15.92ns   62.82M
MicroLockUncontendedBenchmark                               28.54ns   35.03M
VirtualFunctionCall                                          1.73ns  576.51M
----------------------------------------------------------------------------
----------------------------------------------------------------------------
std_mutex(1thread)                                         898.49ns    1.11M
folly_microspin(1thread)                          87.68%     1.02us  975.85K
folly_picospin(1thread)                           95.06%   945.15ns    1.06M
folly_microlock(1thread)                          87.34%     1.03us  972.05K
----------------------------------------------------------------------------
std_mutex(2thread)                                           1.29us  773.03K
folly_microspin(2thread)                          90.35%     1.43us  698.40K
folly_picospin(2thread)                          126.11%     1.03us  974.90K
folly_microlock(2thread)                         109.99%     1.18us  850.24K
----------------------------------------------------------------------------
std_mutex(4thread)                                           3.03us  330.05K
folly_microspin(4thread)                          94.03%     3.22us  310.33K
folly_picospin(4thread)                          127.16%     2.38us  419.70K
folly_microlock(4thread)                          64.39%     4.71us  212.51K
----------------------------------------------------------------------------
std_mutex(8thread)                                           4.79us  208.81K
folly_microspin(8thread)                          71.50%     6.70us  149.30K
folly_picospin(8thread)                           54.58%     8.77us  113.96K
folly_microlock(8thread)                          55.58%     8.62us  116.05K
----------------------------------------------------------------------------
std_mutex(16thread)                                         12.14us   82.39K
folly_microspin(16thread)                        102.02%    11.90us   84.05K
folly_picospin(16thread)                          62.30%    19.48us   51.33K
folly_microlock(16thread)                         64.54%    18.81us   53.17K
----------------------------------------------------------------------------
std_mutex(32thread)                                         22.52us   44.41K
folly_microspin(32thread)                         88.25%    25.52us   39.19K
folly_picospin(32thread)                          46.04%    48.91us   20.45K
folly_microlock(32thread)                         67.08%    33.57us   29.79K
----------------------------------------------------------------------------
std_mutex(64thread)                                         43.29us   23.10K
folly_microspin(64thread)                         98.01%    44.17us   22.64K
folly_picospin(64thread)                          48.62%    89.04us   11.23K
folly_microlock(64thread)                         62.68%    69.07us   14.48K
============================================================================
*/