folly/test/SmallLocksBenchmark.cpp
/*
 * Copyright 2017 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <algorithm>
#include <atomic>
#include <cmath>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <numeric>
#include <thread>
#include <vector>

#include <folly/Benchmark.h>
#include <folly/SmallLocks.h>

/* A "work cycle" is just an additional nop loop iteration.
 * A smaller number of work cycles results in more contention,
 * which is what we're trying to measure.  The relative ratio of
 * locked to unlocked cycles simulates how large critical sections
 * are in production code.
 */
DEFINE_int32(work, 100, "Number of work cycles");
DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
DEFINE_int32(
    threads,
    std::thread::hardware_concurrency(),
    "Number of threads for fairness test");

static void burn(size_t n) {
  for (size_t i = 0; i < n; ++i) {
    folly::doNotOptimizeAway(i);
  }
}
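
// folly::doNotOptimizeAway() keeps the otherwise dead loop body from being
// optimized out, so burn(n) costs roughly n iterations; FLAGS_work and
// FLAGS_unlocked_work thus control how long each thread spends inside and
// outside the critical section.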

namespace {

struct SimpleBarrier {
  explicit SimpleBarrier(size_t count) : lock_(), cv_(), count_(count) {}

  void wait() {
    std::unique_lock<std::mutex> lockHeld(lock_);
    if (++num_ == count_) {
      cv_.notify_all();
    } else {
      cv_.wait(lockHeld, [&]() { return num_ >= count_; });
    }
  }

 private:
  std::mutex lock_;
  std::condition_variable cv_;
  size_t num_{0};
  size_t count_;
};
} // namespace
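
// SimpleBarrier is a one-shot rendezvous: each caller of wait() blocks until
// count_ threads have arrived, and the last arrival wakes the rest with
// notify_all(). The predicate passed to cv_.wait() re-checks num_ >= count_,
// so spurious wakeups are harmless. The benchmarks use it to line all worker
// threads up before the timed region starts.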

template <typename Lock>
class InitLock {
  Lock lock_;

 public:
  InitLock() {
    lock_.init();
  }
  void lock() {
    lock_.lock();
  }
  void unlock() {
    lock_.unlock();
  }
};
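
// folly's small locks are plain-old-data types: construction does not
// initialize them, and each must have init() called before first use.
// InitLock performs that init() in its constructor, giving the small locks
// the same construct-and-use shape as std::mutex so the templated benchmarks
// below can treat every lock type uniformly.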

template <typename Lock>
static void runContended(size_t numOps, size_t numThreads) {
  folly::BenchmarkSuspender braces;
  size_t totalthreads = std::thread::hardware_concurrency();
  if (totalthreads < numThreads) {
    totalthreads = numThreads;
  }
  size_t threadgroups = totalthreads / numThreads;
  struct lockstruct {
    char padding1[128];
    Lock lock;
    char padding2[128];
    long value = 1;
  };

  // calloc is deliberate: folly's small locks are zero-initializable, so
  // zeroed memory behaves as an unlocked lock without running a constructor
  // (std::mutex gets no constructor here either, which works in practice on
  // the platforms this benchmark targets).
  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));

  char padding3[128];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t < totalthreads; ++t) {
    threads[t] = std::thread([&, t] {
      lockstruct* lock = &locks[t % threadgroups];
      runbarrier.wait();
      for (size_t op = 0; op < numOps; op += 1) {
        lock->lock.lock();
        burn(FLAGS_work);
        lock->value++;
        lock->lock.unlock();
        burn(FLAGS_unlocked_work);
      }
    });
  }

  runbarrier.wait();
  braces.dismissing([&] {
    for (auto& thr : threads) {
      thr.join();
    }
  });
  free(locks);
}
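
// Layout note: each lockstruct carries 128 bytes of padding on both sides of
// its lock so neighboring locks never share a cache line; the benchmark then
// measures contention on the lock itself rather than false sharing between
// thread groups. BenchmarkSuspender pauses the benchmark clock during setup,
// and braces.dismissing() resumes it only while the worker threads run.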

template <typename Lock>
static void runFairness() {
  size_t numThreads = FLAGS_threads;
  size_t totalthreads = std::thread::hardware_concurrency();
  if (totalthreads < numThreads) {
    totalthreads = numThreads;
  }
  size_t threadgroups = totalthreads / numThreads;
  struct lockstruct {
    char padding1[128];
    Lock lock;
  };

  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));

  char padding3[64];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  std::atomic<bool> stop{false};

  std::mutex rlock;
  std::vector<long> results;
  std::vector<std::chrono::microseconds> maxes;

  std::vector<std::chrono::microseconds> aqTime;
  std::vector<unsigned long> aqTimeSq;

  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t < totalthreads; ++t) {
    threads[t] = std::thread([&, t] {
      lockstruct* lock = &locks[t % threadgroups];
      long value = 0;
      std::chrono::microseconds max(0);
      std::chrono::microseconds time(0);
      unsigned long timeSq(0);
      runbarrier.wait();
      while (!stop) {
        std::chrono::steady_clock::time_point prelock =
            std::chrono::steady_clock::now();
        lock->lock.lock();
        std::chrono::steady_clock::time_point postlock =
            std::chrono::steady_clock::now();
        auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
            postlock - prelock);
        time += diff;
        timeSq += diff.count() * diff.count();
        if (diff > max) {
          max = diff;
        }
        burn(FLAGS_work);
        value++;
        lock->lock.unlock();
        burn(FLAGS_unlocked_work);
      }
      {
        std::lock_guard<std::mutex> g(rlock);
        results.push_back(value);
        maxes.push_back(max);
        aqTime.push_back(time);
        aqTimeSq.push_back(timeSq);
      }
    });
  }

  runbarrier.wait();
  /* sleep override */
  std::this_thread::sleep_for(std::chrono::seconds(2));
  stop = true;

  for (auto& thr : threads) {
    thr.join();
  }

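  // Fairness stats, computed below: per-thread op counts give a throughput
  // mean and sample stddev (a small stddev means acquisitions were handed
  // out fairly), while lock-acquisition times are kept as per-thread sums
  // and sums of squares so the stddev can be computed without storing every
  // sample, via Var = (n*sum(x^2) - (sum x)^2) / (n*(n-1)).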
  // Calculate some stats
  unsigned long sum = std::accumulate(results.begin(), results.end(), 0UL);
  double m = static_cast<double>(sum) / results.size();

  double accum = 0.0;
  std::for_each(results.begin(), results.end(), [&](const double d) {
    accum += (d - m) * (d - m);
  });
  double stdev = std::sqrt(accum / (results.size() - 1));
  std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end());
  std::chrono::microseconds agAqTime = std::accumulate(
      aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
  unsigned long agAqTimeSq =
      std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0UL);
  std::chrono::microseconds mean = agAqTime / sum;
  // Do the variance arithmetic in double: the sum-of-squares products can
  // overflow 64-bit integers, and integer division would truncate.
  double variance =
      (static_cast<double>(sum) * agAqTimeSq -
       static_cast<double>(agAqTime.count()) * agAqTime.count()) /
      sum / (sum - 1);
  double stddev2 = std::sqrt(variance);

  printf("Sum: %lu Mean: %.0f stddev: %.0f\n", sum, m, stdev);
  printf(
      "Lock time stats in us: mean %li stddev %.0f max %li\n",
      static_cast<long>(mean.count()),
      stddev2,
      static_cast<long>(mx.count()));
}

BENCHMARK(StdMutexUncontendedBenchmark, iters) {
  std::mutex lock;
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}

BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
  folly::MicroSpinLock lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}

BENCHMARK(PicoSpinLockUncontendedBenchmark, iters) {
  // uint8_t would be more fair, but PicoSpinLock needs at least two bytes
  folly::PicoSpinLock<uint16_t> lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}
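
// PicoSpinLock packs its lock bit into the integer it guards (the remaining
// bits still hold a usable value), and only integer types of two or more
// bytes are supported, hence uint16_t above rather than uint8_t.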

BENCHMARK(MicroLockUncontendedBenchmark, iters) {
  folly::MicroLock lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}

struct VirtualBase {
  virtual void foo() = 0;
  virtual ~VirtualBase() {}
};

struct VirtualImpl : VirtualBase {
  void foo() override { /* noop */ }
  ~VirtualImpl() override {}
};

#ifndef __clang__
__attribute__((noinline, noclone)) VirtualBase* makeVirtual() {
  return new VirtualImpl();
}

BENCHMARK(VirtualFunctionCall, iters) {
  VirtualBase* vb = makeVirtual();
  while (iters--) {
    vb->foo();
  }
  delete vb;
}
#endif
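
// The virtual-call benchmark provides a reference point for the cost of an
// indirect call. noclone is a GCC-specific attribute (clang does not accept
// it), so this baseline is compiled only under GCC; noinline + noclone keep
// the compiler from devirtualizing the call behind makeVirtual().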

BENCHMARK_DRAW_LINE()

#define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__))
#define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__))
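
// BENCH_BASE registers a named-parameter baseline via BENCHMARK_NAMED_PARAM;
// each following BENCH_REL uses BENCHMARK_RELATIVE_NAMED_PARAM, so its time
// is reported as a percentage of the most recent baseline (the "relative"
// column in the results below).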

static void std_mutex(size_t numOps, size_t numThreads) {
  runContended<std::mutex>(numOps, numThreads);
}
static void folly_microspin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::MicroSpinLock>>(numOps, numThreads);
}
static void folly_picospin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::PicoSpinLock<uint16_t>>>(numOps, numThreads);
}
static void folly_microlock(size_t numOps, size_t numThreads) {
  runContended<folly::MicroLock>(numOps, numThreads);
}

BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 1thread, 1)
BENCH_REL(folly_microspin, 1thread, 1)
BENCH_REL(folly_picospin, 1thread, 1)
BENCH_REL(folly_microlock, 1thread, 1)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 2thread, 2)
BENCH_REL(folly_microspin, 2thread, 2)
BENCH_REL(folly_picospin, 2thread, 2)
BENCH_REL(folly_microlock, 2thread, 2)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 4thread, 4)
BENCH_REL(folly_microspin, 4thread, 4)
BENCH_REL(folly_picospin, 4thread, 4)
BENCH_REL(folly_microlock, 4thread, 4)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 8thread, 8)
BENCH_REL(folly_microspin, 8thread, 8)
BENCH_REL(folly_picospin, 8thread, 8)
BENCH_REL(folly_microlock, 8thread, 8)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 16thread, 16)
BENCH_REL(folly_microspin, 16thread, 16)
BENCH_REL(folly_picospin, 16thread, 16)
BENCH_REL(folly_microlock, 16thread, 16)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 32thread, 32)
BENCH_REL(folly_microspin, 32thread, 32)
BENCH_REL(folly_picospin, 32thread, 32)
BENCH_REL(folly_microlock, 32thread, 32)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 64thread, 64)
BENCH_REL(folly_microspin, 64thread, 64)
BENCH_REL(folly_picospin, 64thread, 64)
BENCH_REL(folly_microlock, 64thread, 64)

#define FairnessTest(type) \
  {                        \
    printf(#type ": \n");  \
    runFairness<type>();   \
  }

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  FairnessTest(std::mutex);
  FairnessTest(InitLock<folly::MicroSpinLock>);
  FairnessTest(InitLock<folly::PicoSpinLock<uint16_t>>);
  FairnessTest(folly::MicroLock);

  folly::runBenchmarks();

  return 0;
}

/*
locks_benchmark --bm_min_iters=1000000

std::mutex:
Sum: 2895849 Mean: 90495 stddev: 3147
Lock time stats in us: mean 11 stddev 1483 max 17823
InitLock<folly::MicroSpinLock>:
Sum: 3114365 Mean: 97323 stddev: 92604
Lock time stats in us: mean 19 stddev 1379 max 235710
InitLock<folly::PicoSpinLock<uint16_t>>:
Sum: 1189713 Mean: 37178 stddev: 23295
Lock time stats in us: mean 51 stddev 3610 max 414093
folly::MicroLock:
Sum: 1890150 Mean: 59067 stddev: 26795
Lock time stats in us: mean 31 stddev 2272 max 70996
============================================================================
folly/test/SmallLocksBenchmark.cpp              relative  time/iter  iters/s
============================================================================
StdMutexUncontendedBenchmark                                25.97ns   38.51M
MicroSpinLockUncontendedBenchmark                           16.26ns   61.49M
PicoSpinLockUncontendedBenchmark                            15.92ns   62.82M
MicroLockUncontendedBenchmark                               28.54ns   35.03M
VirtualFunctionCall                                          1.73ns  576.51M
----------------------------------------------------------------------------
----------------------------------------------------------------------------
std_mutex(1thread)                                         898.49ns    1.11M
folly_microspin(1thread)                          87.68%     1.02us  975.85K
folly_picospin(1thread)                           95.06%   945.15ns    1.06M
folly_microlock(1thread)                          87.34%     1.03us  972.05K
----------------------------------------------------------------------------
std_mutex(2thread)                                           1.29us  773.03K
folly_microspin(2thread)                          90.35%     1.43us  698.40K
folly_picospin(2thread)                          126.11%     1.03us  974.90K
folly_microlock(2thread)                         109.99%     1.18us  850.24K
----------------------------------------------------------------------------
std_mutex(4thread)                                           3.03us  330.05K
folly_microspin(4thread)                          94.03%     3.22us  310.33K
folly_picospin(4thread)                          127.16%     2.38us  419.70K
folly_microlock(4thread)                          64.39%     4.71us  212.51K
----------------------------------------------------------------------------
std_mutex(8thread)                                           4.79us  208.81K
folly_microspin(8thread)                          71.50%     6.70us  149.30K
folly_picospin(8thread)                           54.58%     8.77us  113.96K
folly_microlock(8thread)                          55.58%     8.62us  116.05K
----------------------------------------------------------------------------
std_mutex(16thread)                                         12.14us   82.39K
folly_microspin(16thread)                        102.02%    11.90us   84.05K
folly_picospin(16thread)                          62.30%    19.48us   51.33K
folly_microlock(16thread)                         64.54%    18.81us   53.17K
----------------------------------------------------------------------------
std_mutex(32thread)                                         22.52us   44.41K
folly_microspin(32thread)                         88.25%    25.52us   39.19K
folly_picospin(32thread)                          46.04%    48.91us   20.45K
folly_microlock(32thread)                         67.08%    33.57us   29.79K
----------------------------------------------------------------------------
std_mutex(64thread)                                         43.29us   23.10K
folly_microspin(64thread)                         98.01%    44.17us   22.64K
folly_picospin(64thread)                          48.62%    89.04us   11.23K
folly_microlock(64thread)                         62.68%    69.07us   14.48K
============================================================================
*/