/*
 * Copyright 2016 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <numeric>
#include <thread>
#include <vector>

#include <folly/Benchmark.h>
#include <folly/SmallLocks.h>
/* "Work cycle" is just an additional nop loop iteration.
 * A smaller number of work cycles will result in more contention,
 * which is what we're trying to measure. The relative ratio of
 * locked to unlocked cycles will simulate how big critical sections
 * are in production code.
 */
31 DEFINE_int32(work, 100, "Number of work cycles");
32 DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
35 std::thread::hardware_concurrency(),
36 "Number of threads for fairness test");
38 static void burn(size_t n) {
39 for (size_t i = 0; i < n; ++i) {
40 folly::doNotOptimizeAway(i);
// A one-shot rendezvous barrier: wait() blocks until `count` threads have
// called it, then releases them all. Used to line all benchmark threads up
// before the timed region starts.
// NOTE(review): the listing dropped several interior lines (wait() header,
// notify branch, member declarations); they are reconstructed here — the
// visible ctor-init list (lock_, cv_, count_) and the visible wait()
// fragment (`++num_ == count_`, cv_.wait predicate) fully determine them.
struct SimpleBarrier {
  explicit SimpleBarrier(size_t count) : lock_(), cv_(), count_(count) {}

  void wait() {
    std::unique_lock<std::mutex> lockHeld(lock_);
    if (++num_ == count_) {
      // Last arriver wakes everyone already parked in cv_.wait below.
      cv_.notify_all();
    } else {
      cv_.wait(lockHeld, [&]() { return num_ >= count_; });
    }
  }

 private:
  std::mutex lock_;
  std::condition_variable cv_;
  size_t num_{0};
  size_t count_;
};
// InitLock adapts folly's spin locks — which are POD types requiring an
// explicit init() before first use — to the plain lock()/unlock() interface
// the benchmarks below expect from std::mutex.
// NOTE(review): the class body was dropped from this listing; reconstructed
// from its instantiations (runContended<InitLock<folly::MicroSpinLock>>,
// InitLock<folly::PicoSpinLock<uint16_t>>) — confirm against upstream.
template <typename Lock>
class InitLock {
  Lock lock_;

 public:
  InitLock() {
    lock_.init();
  }
  void lock() {
    lock_.lock();
  }
  void unlock() {
    lock_.unlock();
  }
};
// runContended<Lock>: spawns `totalthreads` workers split into
// `threadgroups` groups; each group contends on its own Lock instance for
// `numOps` lock/unlock cycles, burning FLAGS_work inside and
// FLAGS_unlocked_work outside the critical section. Thread setup/teardown
// is excluded from benchmark time via BenchmarkSuspender.
// NOTE(review): this is a numbered extract with lines dropped; each gap is
// flagged below rather than guessed at.
82 template <typename Lock>
83 static void runContended(size_t numOps, size_t numThreads) {
84 folly::BenchmarkSuspender braces;
// Use all hardware threads, but never fewer than the requested numThreads.
85 size_t totalthreads = std::thread::hardware_concurrency();
86 if (totalthreads < numThreads) {
87 totalthreads = numThreads;
// NOTE(review): closing brace of the if (orig. line 88) dropped here.
89 size_t threadgroups = totalthreads / numThreads;
// NOTE(review): orig. lines 90-97 dropped — presumably the cache-padded
// `struct lockstruct` definition and the start of the `locks` assignment
// that this cast/calloc line continues; confirm against upstream.
98 (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
102 std::vector<std::thread> threads(totalthreads);
// +1 so the main thread participates in the barrier and releases everyone.
104 SimpleBarrier runbarrier(totalthreads + 1);
106 for (size_t t = 0; t < totalthreads; ++t) {
107 threads[t] = std::thread([&, t, totalthreads] {
// Threads are striped across the lock groups.
108 lockstruct* lock = &locks[t % threadgroups];
// NOTE(review): orig. line 109 (likely runbarrier.wait()) dropped here.
110 for (size_t op = 0; op < numOps; op += 1) {
// NOTE(review): orig. lines 111-114 dropped — the critical section
// (lock, burn(FLAGS_work), shared increment, unlock); confirm upstream.
115 burn(FLAGS_unlocked_work);
// NOTE(review): orig. lines 116-120 dropped — loop/lambda closers and the
// main thread's own runbarrier.wait() before timing resumes.
// Joins happen with the benchmark clock running (dismissing = un-suspend).
121 braces.dismissing([&] {
122 for (auto& thr : threads) {
// NOTE(review): orig. lines 123-126 (thr.join() and closers) dropped.
// runFairness<Lock>: runs FLAGS_threads-per-group workers for a fixed 2s
// wall-clock window, counting how many critical sections each thread
// completed and tracking per-acquisition latency (sum, sum-of-squares,
// max). Prints throughput mean/stddev across threads (a fairness signal:
// high stddev means some threads starved) plus lock-acquisition stats.
// NOTE(review): numbered extract with lines dropped; gaps flagged inline.
128 template <typename Lock>
129 static void runFairness() {
130 size_t numThreads = FLAGS_threads;
131 size_t totalthreads = std::thread::hardware_concurrency();
132 if (totalthreads < numThreads) {
133 totalthreads = numThreads;
// NOTE(review): closing brace of the if (orig. line 134) dropped here.
135 long threadgroups = totalthreads / numThreads;
// NOTE(review): orig. lines 136-141 dropped — presumably the padded
// `struct lockstruct` definition and the start of the `locks` assignment.
142 (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
146 std::vector<std::thread> threads(totalthreads);
// Flag the main thread flips after the 2s measurement window.
148 std::atomic<bool> stop{false};
// Per-thread results, appended under rlock at thread exit
// (NOTE(review): the `std::mutex rlock;` declaration itself was dropped).
151 std::vector<long> results;
152 std::vector<std::chrono::microseconds> maxes;
154 std::vector<std::chrono::microseconds> aqTime;
155 std::vector<unsigned long> aqTimeSq;
157 SimpleBarrier runbarrier(totalthreads + 1);
159 for (size_t t = 0; t < totalthreads; ++t) {
160 threads[t] = std::thread([&, t, totalthreads] {
161 lockstruct* lock = &locks[t % threadgroups];
// NOTE(review): orig. line 162 dropped — likely `long value = 0;`, the
// per-thread completed-ops counter pushed into `results` below.
163 std::chrono::microseconds max(0);
164 std::chrono::microseconds time(0);
165 unsigned long timeSq(0);
// NOTE(review): orig. lines 166-167 dropped — runbarrier.wait() and the
// `while (!stop)` loop header.
168 std::chrono::steady_clock::time_point prelock =
169 std::chrono::steady_clock::now();
// NOTE(review): orig. line 170 (the lock acquisition being timed) dropped.
171 std::chrono::steady_clock::time_point postlock =
172 std::chrono::steady_clock::now();
173 auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
// NOTE(review): orig. lines 174-175 dropped — the cast's argument
// (postlock - prelock) and the `time += diff;` accumulation.
176 timeSq += diff.count() * diff.count();
// NOTE(review): orig. lines 177-182 dropped — max tracking, the critical
// section (burn(FLAGS_work), value++, unlock).
183 burn(FLAGS_unlocked_work);
// NOTE(review): orig. lines 184-185 dropped — loop closer and the scope
// opener for publishing this thread's results.
186 std::lock_guard<std::mutex> g(rlock);
187 results.push_back(value);
188 maxes.push_back(max);
189 aqTime.push_back(time);
190 aqTimeSq.push_back(timeSq);
// NOTE(review): orig. lines 191-196 dropped — lambda/loop closers and the
// main thread's runbarrier.wait() that starts the measurement window.
197 std::this_thread::sleep_for(std::chrono::seconds(2));
// NOTE(review): orig. lines 198-199 dropped — likely `stop = true;`.
200 for (auto& thr : threads) {
// NOTE(review): orig. lines 201-203 dropped — thr.join() and closers.
204 // Calculate some stats
// NOTE(review): double init (0.0) accumulated into an unsigned long —
// intentional to sum in double, but the conversion truncates; verify.
205 unsigned long sum = std::accumulate(results.begin(), results.end(), 0.0);
206 double m = sum / results.size();
// NOTE(review): orig. lines 207-208 dropped — likely `double accum = 0.0;`.
209 std::for_each(results.begin(), results.end(), [&](const double d) {
210 accum += (d - m) * (d - m);
// NOTE(review): orig. line 211 (`});`) dropped.
212 double stdev = sqrt(accum / (results.size() - 1));
213 std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end());
214 std::chrono::microseconds agAqTime = std::accumulate(
215 aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
216 unsigned long agAqTimeSq =
// NOTE(review): init value 0 is int, so the unsigned long squared-latency
// sums are folded in int arithmetic — overflow risk; should be 0UL.
217 std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0);
// NOTE(review): integer division — mean is truncated to whole microseconds.
218 std::chrono::microseconds mean = agAqTime / sum;
219 double variance = (sum * agAqTimeSq - (agAqTime.count() * agAqTime.count())) /
// NOTE(review): orig. line 220 (divisor continuation, likely
// `sum / (sum - 1);`) dropped.
221 double stddev2 = sqrt(variance);
223 printf("Sum: %li Mean: %.0f stddev: %.0f\n", sum, m, stdev);
// NOTE(review): the printf( opener (orig. line 224) and the argument lines
// (orig. 226-230) for this format string were dropped.
225 "Lock time stats in us: mean %li stddev %.0f max %li\n",
// Single-threaded lock/unlock cost of each lock type. NOTE(review): every
// benchmark body below (the lock.init() where applicable and the
// `while (iters--) { lock(); unlock(); }` loop) was dropped from this
// numbered listing; only the headers and lock declarations remain.
231 BENCHMARK(StdMutexUncontendedBenchmark, iters) {
239 BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
240 folly::MicroSpinLock lock;
248 BENCHMARK(PicoSpinLockUncontendedBenchmark, iters) {
249 // uint8_t would be more fair, but PicoSpinLock needs at least two bytes
250 folly::PicoSpinLock<uint16_t> lock;
258 BENCHMARK(MicroLockUncontendedBenchmark, iters) {
259 folly::MicroLock lock;
// NOTE(review): the enclosing `struct VirtualBase {` header (orig. line
// 267) and its closing `};` were dropped from this listing. These members
// declare the pure-virtual hook the benchmark calls, plus a virtual
// destructor so deleting through a VirtualBase* is well-defined.
268 virtual void foo() = 0;
269 virtual ~VirtualBase() {}
272 struct VirtualImpl : VirtualBase {
273 void foo() override { /* noop */
275 virtual ~VirtualImpl() {}
279 __attribute__((noinline, noclone)) VirtualBase* makeVirtual() {
280 return new VirtualImpl();
// Baseline: cost of one non-devirtualizable virtual call, for comparison
// against the lock benchmarks above.
// NOTE(review): the call loop and the `delete vb` (orig. lines 285-289)
// were dropped from this listing.
283 BENCHMARK(VirtualFunctionCall, iters) {
284 VirtualBase* vb = makeVirtual();
292 BENCHMARK_DRAW_LINE()
294 #define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__))
295 #define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__))
297 static void std_mutex(size_t numOps, size_t numThreads) {
298 runContended<std::mutex>(numOps, numThreads);
300 static void folly_microspin(size_t numOps, size_t numThreads) {
301 runContended<InitLock<folly::MicroSpinLock>>(numOps, numThreads);
303 static void folly_picospin(size_t numOps, size_t numThreads) {
304 runContended<InitLock<folly::PicoSpinLock<uint16_t>>>(numOps, numThreads);
306 static void folly_microlock(size_t numOps, size_t numThreads) {
307 runContended<folly::MicroLock>(numOps, numThreads);
310 BENCHMARK_DRAW_LINE()
311 BENCH_BASE(std_mutex, 1thread, 1)
312 BENCH_REL(folly_microspin, 1thread, 1)
313 BENCH_REL(folly_picospin, 1thread, 1)
314 BENCH_REL(folly_microlock, 1thread, 1)
315 BENCHMARK_DRAW_LINE()
316 BENCH_BASE(std_mutex, 2thread, 2)
317 BENCH_REL(folly_microspin, 2thread, 2)
318 BENCH_REL(folly_picospin, 2thread, 2)
319 BENCH_REL(folly_microlock, 2thread, 2)
320 BENCHMARK_DRAW_LINE()
321 BENCH_BASE(std_mutex, 4thread, 4)
322 BENCH_REL(folly_microspin, 4thread, 4)
323 BENCH_REL(folly_picospin, 4thread, 4)
324 BENCH_REL(folly_microlock, 4thread, 4)
325 BENCHMARK_DRAW_LINE()
326 BENCH_BASE(std_mutex, 8thread, 8)
327 BENCH_REL(folly_microspin, 8thread, 8)
328 BENCH_REL(folly_picospin, 8thread, 8)
329 BENCH_REL(folly_microlock, 8thread, 8)
330 BENCHMARK_DRAW_LINE()
331 BENCH_BASE(std_mutex, 16thread, 16)
332 BENCH_REL(folly_microspin, 16thread, 16)
333 BENCH_REL(folly_picospin, 16thread, 16)
334 BENCH_REL(folly_microlock, 16thread, 16)
335 BENCHMARK_DRAW_LINE()
336 BENCH_BASE(std_mutex, 32thread, 32)
337 BENCH_REL(folly_microspin, 32thread, 32)
338 BENCH_REL(folly_picospin, 32thread, 32)
339 BENCH_REL(folly_microlock, 32thread, 32)
340 BENCHMARK_DRAW_LINE()
341 BENCH_BASE(std_mutex, 64thread, 64)
342 BENCH_REL(folly_microspin, 64thread, 64)
343 BENCH_REL(folly_picospin, 64thread, 64)
344 BENCH_REL(folly_microlock, 64thread, 64)
// Prints the lock type's name, then runs the 2-second fairness measurement
// for it. Braced so each expansion is a single statement.
// NOTE(review): the `{` / `}` continuation lines (orig. 347, 350) were
// dropped from the listing; restored here.
#define FairnessTest(type) \
  {                        \
    printf(#type ": \n");  \
    runFairness<type>();   \
  }
352 int main(int argc, char** argv) {
353 gflags::ParseCommandLineFlags(&argc, &argv, true);
355 FairnessTest(std::mutex);
356 FairnessTest(InitLock<folly::MicroSpinLock>);
357 FairnessTest(InitLock<folly::PicoSpinLock<uint16_t>>);
358 FairnessTest(folly::MicroLock);
360 folly::runBenchmarks();
/*
locks_benchmark --bm_min_iters=1000000

std::mutex:
Sum: 2895849 Mean: 90495 stddev: 3147
Lock time stats in us: mean 11 stddev 1483 max 17823
InitLock<folly::MicroSpinLock>:
Sum: 3114365 Mean: 97323 stddev: 92604
Lock time stats in us: mean 19 stddev 1379 max 235710
InitLock<folly::PicoSpinLock<uint16_t>>:
Sum: 1189713 Mean: 37178 stddev: 23295
Lock time stats in us: mean 51 stddev 3610 max 414093
folly::MicroLock:
Sum: 1890150 Mean: 59067 stddev: 26795
Lock time stats in us: mean 31 stddev 2272 max 70996
============================================================================
folly/test/SmallLocksBenchmark.cpp relative time/iter iters/s
============================================================================
StdMutexUncontendedBenchmark 25.97ns 38.51M
MicroSpinLockUncontendedBenchmark 16.26ns 61.49M
PicoSpinLockUncontendedBenchmark 15.92ns 62.82M
MicroLockUncontendedBenchmark 28.54ns 35.03M
VirtualFunctionCall 1.73ns 576.51M
----------------------------------------------------------------------------
----------------------------------------------------------------------------
std_mutex(1thread) 898.49ns 1.11M
folly_microspin(1thread) 87.68% 1.02us 975.85K
folly_picospin(1thread) 95.06% 945.15ns 1.06M
folly_microlock(1thread) 87.34% 1.03us 972.05K
----------------------------------------------------------------------------
std_mutex(2thread) 1.29us 773.03K
folly_microspin(2thread) 90.35% 1.43us 698.40K
folly_picospin(2thread) 126.11% 1.03us 974.90K
folly_microlock(2thread) 109.99% 1.18us 850.24K
----------------------------------------------------------------------------
std_mutex(4thread) 3.03us 330.05K
folly_microspin(4thread) 94.03% 3.22us 310.33K
folly_picospin(4thread) 127.16% 2.38us 419.70K
folly_microlock(4thread) 64.39% 4.71us 212.51K
----------------------------------------------------------------------------
std_mutex(8thread) 4.79us 208.81K
folly_microspin(8thread) 71.50% 6.70us 149.30K
folly_picospin(8thread) 54.58% 8.77us 113.96K
folly_microlock(8thread) 55.58% 8.62us 116.05K
----------------------------------------------------------------------------
std_mutex(16thread) 12.14us 82.39K
folly_microspin(16thread) 102.02% 11.90us 84.05K
folly_picospin(16thread) 62.30% 19.48us 51.33K
folly_microlock(16thread) 64.54% 18.81us 53.17K
----------------------------------------------------------------------------
std_mutex(32thread) 22.52us 44.41K
folly_microspin(32thread) 88.25% 25.52us 39.19K
folly_picospin(32thread) 46.04% 48.91us 20.45K
folly_microlock(32thread) 67.08% 33.57us 29.79K
----------------------------------------------------------------------------
std_mutex(64thread) 43.29us 23.10K
folly_microspin(64thread) 98.01% 44.17us 22.64K
folly_picospin(64thread) 48.62% 89.04us 11.23K
folly_microlock(64thread) 62.68% 69.07us 14.48K
============================================================================
*/