/* "Work cycle" is just an additional nop loop iteration.
 * A smaller number of work cycles will result in more contention,
 * which is what we're trying to measure. The relative ratio of
 * locked to unlocked cycles will simulate how big critical sections
 * are in production code.
 */
DEFINE_int32(work, 100, "Number of work cycles");
DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
DEFINE_int32(
    threads,
    std::thread::hardware_concurrency(),
    "Number of threads for fairness test");
+
+static void burn(size_t n) {
+ for (size_t i = 0; i < n; ++i) {
+ folly::doNotOptimizeAway(i);
+ }
+}
+
namespace {

// One-shot rendezvous point: wait() blocks until `count` callers have
// arrived, then releases them all.
struct SimpleBarrier {
  explicit SimpleBarrier(size_t count) : lock_(), cv_(), count_(count) {}

  void wait() {
    std::unique_lock<std::mutex> held(lock_);
    ++num_;
    if (num_ == count_) {
      // Last arrival: wake everyone parked on the condvar.
      cv_.notify_all();
      return;
    }
    // Predicate re-check guards against spurious wakeups.
    cv_.wait(held, [&]() { return num_ >= count_; });
  }

 private:
  std::mutex lock_;
  std::condition_variable cv_;
  size_t num_{0};
  size_t count_;
};
} // namespace
+
// Adapts a lock type whose instances require an explicit init() call
// before use to the standard BasicLockable interface (lock()/unlock()).
template <typename Lock>
class InitLock {
  Lock lock_;

 public:
  InitLock() { lock_.init(); }
  void lock() { lock_.lock(); }
  void unlock() { lock_.unlock(); }
};
+
+template <typename Lock>
+static void runContended(size_t numOps, size_t numThreads) {
+ folly::BenchmarkSuspender braces;
+ size_t totalthreads = std::thread::hardware_concurrency();
+ if (totalthreads < numThreads) {
+ totalthreads = numThreads;
+ }
+ size_t threadgroups = totalthreads / numThreads;
+ struct lockstruct {
+ char padding1[128];
+ Lock lock;
+ char padding2[128];
+ long value = 1;
+ };
+
+ auto locks =
+ (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
+
+ char padding3[128];
+ (void)padding3;
+ std::vector<std::thread> threads(totalthreads);
+
+ SimpleBarrier runbarrier(totalthreads + 1);
+
+ for (size_t t = 0; t < totalthreads; ++t) {
+ threads[t] = std::thread([&, t] {
+ lockstruct* lock = &locks[t % threadgroups];
+ runbarrier.wait();
+ for (size_t op = 0; op < numOps; op += 1) {
+ lock->lock.lock();
+ burn(FLAGS_work);
+ lock->value++;
+ lock->lock.unlock();
+ burn(FLAGS_unlocked_work);
+ }
+ });
+ }
+
+ runbarrier.wait();
+ braces.dismissing([&] {
+ for (auto& thr : threads) {
+ thr.join();
+ }
+ });
+}
+
+template <typename Lock>
+static void runFairness() {
+ size_t numThreads = FLAGS_threads;
+ size_t totalthreads = std::thread::hardware_concurrency();
+ if (totalthreads < numThreads) {
+ totalthreads = numThreads;
+ }
+ long threadgroups = totalthreads / numThreads;
+ struct lockstruct {
+ char padding1[128];
+ Lock lock;
+ };
+
+ auto locks =
+ (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
+
+ char padding3[64];
+ (void)padding3;
+ std::vector<std::thread> threads(totalthreads);
+
+ std::atomic<bool> stop{false};
+
+ std::mutex rlock;
+ std::vector<long> results;
+ std::vector<std::chrono::microseconds> maxes;
+
+ std::vector<std::chrono::microseconds> aqTime;
+ std::vector<unsigned long> aqTimeSq;
+
+ SimpleBarrier runbarrier(totalthreads + 1);
+
+ for (size_t t = 0; t < totalthreads; ++t) {
+ threads[t] = std::thread([&, t] {
+ lockstruct* lock = &locks[t % threadgroups];
+ long value = 0;
+ std::chrono::microseconds max(0);
+ std::chrono::microseconds time(0);
+ unsigned long timeSq(0);
+ runbarrier.wait();
+ while (!stop) {
+ std::chrono::steady_clock::time_point prelock =
+ std::chrono::steady_clock::now();
+ lock->lock.lock();
+ std::chrono::steady_clock::time_point postlock =
+ std::chrono::steady_clock::now();
+ auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
+ postlock - prelock);
+ time += diff;
+ timeSq += diff.count() * diff.count();
+ if (diff > max) {
+ max = diff;
+ }
+ burn(FLAGS_work);
+ value++;
+ lock->lock.unlock();
+ burn(FLAGS_unlocked_work);
+ }
+ {
+ std::lock_guard<std::mutex> g(rlock);
+ results.push_back(value);
+ maxes.push_back(max);
+ aqTime.push_back(time);
+ aqTimeSq.push_back(timeSq);
+ }
+ });
+ }
+
+ runbarrier.wait();
+ /* sleep override */
+ std::this_thread::sleep_for(std::chrono::seconds(2));
+ stop = true;
+
+ for (auto& thr : threads) {
+ thr.join();
+ }
+
+ // Calulate some stats
+ unsigned long sum = std::accumulate(results.begin(), results.end(), 0.0);
+ double m = sum / results.size();
+
+ double accum = 0.0;
+ std::for_each(results.begin(), results.end(), [&](const double d) {
+ accum += (d - m) * (d - m);
+ });
+ double stdev = std::sqrt(accum / (results.size() - 1));
+ std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end());
+ std::chrono::microseconds agAqTime = std::accumulate(
+ aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
+ unsigned long agAqTimeSq =
+ std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0);
+ std::chrono::microseconds mean = agAqTime / sum;
+ double variance = (sum * agAqTimeSq - (agAqTime.count() * agAqTime.count())) /
+ sum / (sum - 1);
+ double stddev2 = std::sqrt(variance);
+
+ printf("Sum: %li Mean: %.0f stddev: %.0f\n", sum, m, stdev);
+ printf(
+ "Lock time stats in us: mean %li stddev %.0f max %li\n",
+ mean.count(),
+ stddev2,
+ mx.count());
+}
+
+BENCHMARK(StdMutexUncontendedBenchmark, iters) {
+ std::mutex lock;
+ while (iters--) {
+ lock.lock();
+ lock.unlock();
+ }
+}
+