/*
 * Copyright 2016 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/Benchmark.h>
#include <folly/SmallLocks.h>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <numeric>
#include <thread>
#include <vector>

/* A "work cycle" is just an additional nop loop iteration.
 * A smaller number of work cycles will result in more contention,
 * which is what we're trying to measure.  The ratio of locked to
 * unlocked cycles simulates how large critical sections are in
 * production code.
 */
DEFINE_int32(work, 100, "Number of work cycles");
DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
DEFINE_int32(
    threads,
    std::thread::hardware_concurrency(),
    "Number of threads for fairness test");

static void burn(size_t n) {
  for (size_t i = 0; i < n; ++i) {
    folly::doNotOptimizeAway(i);
  }
}

namespace {

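// Single-use barrier: wait() blocks until count_ threads have arrived, so all
// benchmark threads start their measured loops at the same time.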
struct SimpleBarrier {
  explicit SimpleBarrier(size_t count) : lock_(), cv_(), count_(count) {}

  void wait() {
    std::unique_lock<std::mutex> lockHeld(lock_);
    if (++num_ == count_) {
      cv_.notify_all();
    } else {
      cv_.wait(lockHeld, [&]() { return num_ >= count_; });
    }
  }

 private:
  std::mutex lock_;
  std::condition_variable cv_;
  size_t num_{0};
  size_t count_;
};
} // namespace

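// Adapter that calls init() on construction, so the folly spin locks expose
// the same default-constructible lock()/unlock() interface as std::mutex.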
template <typename Lock>
class InitLock {
  Lock lock_;

 public:
  InitLock() {
    lock_.init();
  }
  void lock() {
    lock_.lock();
  }
  void unlock() {
    lock_.unlock();
  }
};

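// Contended benchmark: each lock is shared by numThreads threads (additional
// groups are created so every hardware thread stays busy). Each thread holds
// the lock for FLAGS_work burn cycles, then does FLAGS_unlocked_work cycles
// outside the lock, repeated numOps times.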
template <typename Lock>
static void runContended(size_t numOps, size_t numThreads) {
  folly::BenchmarkSuspender braces;
  size_t totalthreads = std::thread::hardware_concurrency();
  if (totalthreads < numThreads) {
    totalthreads = numThreads;
  }
  size_t threadgroups = totalthreads / numThreads;
  struct lockstruct {
    char padding1[128]; // keep neighbouring locks off the same cache lines
    Lock lock;
    char padding2[128];
    long value = 1;
  };

  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));

  char padding3[128];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t < totalthreads; ++t) {
    threads[t] = std::thread([&, t, totalthreads] {
      lockstruct* lock = &locks[t % threadgroups];
      runbarrier.wait();
      for (size_t op = 0; op < numOps; op += 1) {
        lock->lock.lock();
        burn(FLAGS_work);
        lock->value++;
        lock->lock.unlock();
        burn(FLAGS_unlocked_work);
      }
    });
  }

  runbarrier.wait();
  braces.dismissing([&] {
    for (auto& thr : threads) {
      thr.join();
    }
  });
}

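// Fairness test: threads hammer a shared lock for two seconds; afterwards we
// report how many acquisitions each thread managed (mean/stddev across
// threads) plus statistics on the individual lock-acquisition latencies.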
template <typename Lock>
static void runFairness() {
  size_t numThreads = FLAGS_threads;
  size_t totalthreads = std::thread::hardware_concurrency();
  if (totalthreads < numThreads) {
    totalthreads = numThreads;
  }
  size_t threadgroups = totalthreads / numThreads;
  struct lockstruct {
    char padding1[128];
    Lock lock;
  };

  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));

  char padding3[64];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  std::atomic<bool> stop{false};

  std::mutex rlock;
  std::vector<long> results;
  std::vector<std::chrono::microseconds> maxes;

  std::vector<std::chrono::microseconds> aqTime;
  std::vector<unsigned long> aqTimeSq;

  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t < totalthreads; ++t) {
    threads[t] = std::thread([&, t, totalthreads] {
      lockstruct* lock = &locks[t % threadgroups];
      long value = 0;
      std::chrono::microseconds max(0);
      std::chrono::microseconds time(0);
      unsigned long timeSq(0);
      runbarrier.wait();
      while (!stop) {
        std::chrono::steady_clock::time_point prelock =
            std::chrono::steady_clock::now();
        lock->lock.lock();
        std::chrono::steady_clock::time_point postlock =
            std::chrono::steady_clock::now();
        auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
            postlock - prelock);
        time += diff;
        timeSq += diff.count() * diff.count();
        if (diff > max) {
          max = diff;
        }
        burn(FLAGS_work);
        value++;
        lock->lock.unlock();
        burn(FLAGS_unlocked_work);
      }
      {
        std::lock_guard<std::mutex> g(rlock);
        results.push_back(value);
        maxes.push_back(max);
        aqTime.push_back(time);
        aqTimeSq.push_back(timeSq);
      }
    });
  }

  runbarrier.wait();
  /* sleep override */
  std::this_thread::sleep_for(std::chrono::seconds(2));
  stop = true;

  for (auto& thr : threads) {
    thr.join();
  }

  // Calculate some stats
  unsigned long sum = std::accumulate(results.begin(), results.end(), 0UL);
  double m = static_cast<double>(sum) / results.size();

  double accum = 0.0;
  std::for_each(results.begin(), results.end(), [&](const double d) {
    accum += (d - m) * (d - m);
  });
  double stdev = std::sqrt(accum / (results.size() - 1));
  std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end());
  std::chrono::microseconds agAqTime = std::accumulate(
      aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
  unsigned long agAqTimeSq =
      std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0UL);
  std::chrono::microseconds mean = agAqTime / sum;
  // Sample variance of the per-acquisition wait times, computed as
  // (n * sum(x^2) - (sum x)^2) / (n * (n - 1)) with n = total acquisitions.
  double variance =
      (static_cast<double>(sum) * agAqTimeSq -
       static_cast<double>(agAqTime.count()) * agAqTime.count()) /
      sum / (sum - 1);
  double stddev2 = std::sqrt(variance);

  printf("Sum: %lu Mean: %.0f stddev: %.0f\n", sum, m, stdev);
  printf(
      "Lock time stats in us: mean %li stddev %.0f max %li\n",
      mean.count(),
      stddev2,
      mx.count());
}

BENCHMARK(StdMutexUncontendedBenchmark, iters) {
  std::mutex lock;
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}

BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
  folly::MicroSpinLock lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}

BENCHMARK(PicoSpinLockUncontendedBenchmark, iters) {
  // uint8_t would be more fair, but PicoSpinLock needs at least two bytes
  folly::PicoSpinLock<uint16_t> lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}

BENCHMARK(MicroLockUncontendedBenchmark, iters) {
  folly::MicroLock lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}

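// Baseline: the cost of a single virtual call, for comparison with the
// uncontended lock/unlock pairs above.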
struct VirtualBase {
  virtual void foo() = 0;
  virtual ~VirtualBase() {}
};

struct VirtualImpl : VirtualBase {
  void foo() override { /* noop */ }
  virtual ~VirtualImpl() {}
};

#ifndef __clang__
// noclone is a GCC-only attribute, so this benchmark is skipped under clang.
__attribute__((noinline, noclone)) VirtualBase* makeVirtual() {
  return new VirtualImpl();
}

BENCHMARK(VirtualFunctionCall, iters) {
  VirtualBase* vb = makeVirtual();
  while (iters--) {
    vb->foo();
  }
  delete vb;
}
#endif

BENCHMARK_DRAW_LINE()

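// Shorthand for registering the contended benchmarks: BENCH_BASE emits the
// baseline entry and BENCH_REL entries that report time relative to it.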
#define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__))
#define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__))

static void std_mutex(size_t numOps, size_t numThreads) {
  runContended<std::mutex>(numOps, numThreads);
}
static void folly_microspin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::MicroSpinLock>>(numOps, numThreads);
}
static void folly_picospin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::PicoSpinLock<uint16_t>>>(numOps, numThreads);
}
static void folly_microlock(size_t numOps, size_t numThreads) {
  runContended<folly::MicroLock>(numOps, numThreads);
}

BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 1thread, 1)
BENCH_REL(folly_microspin, 1thread, 1)
BENCH_REL(folly_picospin, 1thread, 1)
BENCH_REL(folly_microlock, 1thread, 1)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 2thread, 2)
BENCH_REL(folly_microspin, 2thread, 2)
BENCH_REL(folly_picospin, 2thread, 2)
BENCH_REL(folly_microlock, 2thread, 2)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 4thread, 4)
BENCH_REL(folly_microspin, 4thread, 4)
BENCH_REL(folly_picospin, 4thread, 4)
BENCH_REL(folly_microlock, 4thread, 4)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 8thread, 8)
BENCH_REL(folly_microspin, 8thread, 8)
BENCH_REL(folly_picospin, 8thread, 8)
BENCH_REL(folly_microlock, 8thread, 8)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 16thread, 16)
BENCH_REL(folly_microspin, 16thread, 16)
BENCH_REL(folly_picospin, 16thread, 16)
BENCH_REL(folly_microlock, 16thread, 16)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 32thread, 32)
BENCH_REL(folly_microspin, 32thread, 32)
BENCH_REL(folly_picospin, 32thread, 32)
BENCH_REL(folly_microlock, 32thread, 32)
BENCHMARK_DRAW_LINE()
BENCH_BASE(std_mutex, 64thread, 64)
BENCH_REL(folly_microspin, 64thread, 64)
BENCH_REL(folly_picospin, 64thread, 64)
BENCH_REL(folly_microlock, 64thread, 64)

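// Print the lock type's name, then run the fairness test for it.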
#define FairnessTest(type) \
  {                        \
    printf(#type ": \n");  \
    runFairness<type>();   \
  }

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  FairnessTest(std::mutex);
  FairnessTest(InitLock<folly::MicroSpinLock>);
  FairnessTest(InitLock<folly::PicoSpinLock<uint16_t>>);
  FairnessTest(folly::MicroLock);

  folly::runBenchmarks();

  return 0;
}

/*
locks_benchmark --bm_min_iters=1000000

std::mutex:
Sum: 2895849 Mean: 90495 stddev: 3147
Lock time stats in us: mean 11 stddev 1483 max 17823
InitLock<folly::MicroSpinLock>:
Sum: 3114365 Mean: 97323 stddev: 92604
Lock time stats in us: mean 19 stddev 1379 max 235710
InitLock<folly::PicoSpinLock<uint16_t>>:
Sum: 1189713 Mean: 37178 stddev: 23295
Lock time stats in us: mean 51 stddev 3610 max 414093
folly::MicroLock:
Sum: 1890150 Mean: 59067 stddev: 26795
Lock time stats in us: mean 31 stddev 2272 max 70996
============================================================================
folly/test/SmallLocksBenchmark.cpp              relative  time/iter  iters/s
============================================================================
StdMutexUncontendedBenchmark                                25.97ns   38.51M
MicroSpinLockUncontendedBenchmark                           16.26ns   61.49M
PicoSpinLockUncontendedBenchmark                            15.92ns   62.82M
MicroLockUncontendedBenchmark                               28.54ns   35.03M
VirtualFunctionCall                                          1.73ns  576.51M
----------------------------------------------------------------------------
----------------------------------------------------------------------------
std_mutex(1thread)                                         898.49ns    1.11M
folly_microspin(1thread)                          87.68%     1.02us  975.85K
folly_picospin(1thread)                           95.06%   945.15ns    1.06M
folly_microlock(1thread)                          87.34%     1.03us  972.05K
----------------------------------------------------------------------------
std_mutex(2thread)                                           1.29us  773.03K
folly_microspin(2thread)                          90.35%     1.43us  698.40K
folly_picospin(2thread)                          126.11%     1.03us  974.90K
folly_microlock(2thread)                         109.99%     1.18us  850.24K
----------------------------------------------------------------------------
std_mutex(4thread)                                           3.03us  330.05K
folly_microspin(4thread)                          94.03%     3.22us  310.33K
folly_picospin(4thread)                          127.16%     2.38us  419.70K
folly_microlock(4thread)                          64.39%     4.71us  212.51K
----------------------------------------------------------------------------
std_mutex(8thread)                                           4.79us  208.81K
folly_microspin(8thread)                          71.50%     6.70us  149.30K
folly_picospin(8thread)                           54.58%     8.77us  113.96K
folly_microlock(8thread)                          55.58%     8.62us  116.05K
----------------------------------------------------------------------------
std_mutex(16thread)                                         12.14us   82.39K
folly_microspin(16thread)                        102.02%    11.90us   84.05K
folly_picospin(16thread)                          62.30%    19.48us   51.33K
folly_microlock(16thread)                         64.54%    18.81us   53.17K
----------------------------------------------------------------------------
std_mutex(32thread)                                         22.52us   44.41K
folly_microspin(32thread)                         88.25%    25.52us   39.19K
folly_picospin(32thread)                          46.04%    48.91us   20.45K
folly_microlock(32thread)                         67.08%    33.57us   29.79K
----------------------------------------------------------------------------
std_mutex(64thread)                                         43.29us   23.10K
folly_microspin(64thread)                         98.01%    44.17us   22.64K
folly_picospin(64thread)                          48.62%    89.04us   11.23K
folly_microlock(64thread)                         62.68%    69.07us   14.48K
============================================================================
*/