From c8be994e747526c4928c69ac49dfda803cf613de Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Mon, 14 Jan 2013 11:18:16 -0800
Subject: [PATCH] Create a benchmark for ProducerConsumerQueue

Summary: This is a basic benchmark harness for testing ProducerConsumerQueue, which tests a single producer/consumer pair with and without CPU affinity set.  The change to ProducerConsumerQueue.h (not for committing) tests the effect of cache-aligning the read and write indices to reduce false sharing.

Test Plan: Run the benchmark with the cache alignment (un)commented.

Reviewed By: rituraj@fb.com

FB internal diff: D665948
---
 folly/test/ProducerConsumerQueueBenchmark.cpp | 264 ++++++++++++++++++
 1 file changed, 264 insertions(+)
 create mode 100644 folly/test/ProducerConsumerQueueBenchmark.cpp
diff --git a/folly/test/ProducerConsumerQueueBenchmark.cpp b/folly/test/ProducerConsumerQueueBenchmark.cpp
new file mode 100644
index 00000000..f4b44038
--- /dev/null
+++ b/folly/test/ProducerConsumerQueueBenchmark.cpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// @author: Bert Maher <bertrand@fb.com>
+
+#include <thread>
+#include <iostream>
+#include <stdio.h>
+#include <pthread.h>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include "folly/Benchmark.h"
+#include "folly/Histogram.h"
+#include "folly/ProducerConsumerQueue.h"
+
+namespace {
+
+using namespace folly;
+
+typedef int ThroughputType;
+typedef ProducerConsumerQueue<ThroughputType> ThroughputQueueType;
+
+typedef long LatencyType;
+typedef ProducerConsumerQueue<LatencyType> LatencyQueueType;
+
+template<class QueueType>
+struct ThroughputTest {
+  explicit ThroughputTest(size_t size, int iters, int cpu0, int cpu1)
+  : queue_(size),
+    done_(false),
+    iters_(iters),
+    cpu0_(cpu0),
+    cpu1_(cpu1)
+    { }
+
+  void producer() {
+    if (cpu0_ > -1) {
+      cpu_set_t cpuset;
+      CPU_ZERO(&cpuset);
+      CPU_SET(cpu0_, &cpuset);
+      pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+    }
+    for (int i = 0; i < iters_; ++i) {
+      ThroughputType item = i;
+      while (!queue_.write((ThroughputType) item)) {
+      }
+    }
+  }
+
+  void consumer() {
+    if (cpu1_ > -1) {
+      cpu_set_t cpuset;
+      CPU_ZERO(&cpuset);
+      CPU_SET(cpu1_, &cpuset);
+      pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+    }
+    for (int i = 0; i < iters_; ++i) {
+      ThroughputType item = 0;
+      while (!queue_.read(item)) {
+        doNotOptimizeAway(item);
+      }
+    }
+  }
+
+  QueueType queue_;
+  std::atomic<bool> done_;
+  int iters_;
+  int cpu0_;
+  int cpu1_;
+};
+
+template<class QueueType>
+struct LatencyTest {
+  explicit LatencyTest(size_t size, int iters, int cpu0, int cpu1)
+  : queue_(size),
+    done_(false),
+    iters_(iters),
+    cpu0_(cpu0),
+    cpu1_(cpu1),
+    hist_(1, 0, 30)
+    {
+      computeTimeCost();
+    }
+
+  void computeTimeCost() {
+    int iterations = 1000;
+    timespec start, end;
+    clock_gettime(CLOCK_REALTIME, &start);
+    for (int i = 0; i < iterations; ++i) {
+      timespec tv;
+      clock_gettime(CLOCK_REALTIME, &tv);
+    }
+    clock_gettime(CLOCK_REALTIME, &end);
+    time_cost_ = 2 * detail::timespecDiff(end, start) / iterations;
+  }
+
+  void producer() {
+    if (cpu0_ > -1) {
+      cpu_set_t cpuset;
+      CPU_ZERO(&cpuset);
+      CPU_SET(cpu0_, &cpuset);
+      pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+    }
+    for (int i = 0; i < iters_; ++i) {
+      timespec sleeptime, sleepstart;
+      clock_gettime(CLOCK_REALTIME, &sleepstart);
+      do {
+        clock_gettime(CLOCK_REALTIME, &sleeptime);
+      } while (detail::timespecDiff(sleeptime, sleepstart) < 1000000);
+
+      timespec tv;
+      clock_gettime(CLOCK_REALTIME, &tv);
+      while (!queue_.write((LatencyType) tv.tv_nsec)) {
+      }
+    }
+  }
+
+  void consumer() {
+    if (cpu1_ > -1) {
+      cpu_set_t cpuset;
+      CPU_ZERO(&cpuset);
+      CPU_SET(cpu1_, &cpuset);
+      pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+    }
+    for (int i = 0; i < iters_; ++i) {
+      long enqueue_nsec;
+      while (!queue_.read(enqueue_nsec)) {
+      }
+
+      timespec tv;
+      clock_gettime(CLOCK_REALTIME, &tv);
+      int diff = tv.tv_nsec - enqueue_nsec - time_cost_;
+      if (diff < 0) {
+        continue;
+      }
+
+      // Naive log-scale bucketing.
+      int bucket;
+      for (bucket = 0;
+           bucket <= 30 && (1 << bucket) <= diff;
+           ++bucket) {
+      }
+      hist_.addValue(bucket - 1);
+    }
+  }
+
+  void printHistogram() {
+    hist_.toTSV(std::cout);
+  }
+
+  QueueType queue_;
+  std::atomic<bool> done_;
+  int time_cost_;
+  int iters_;
+  int cpu0_;
+  int cpu1_;
+  Histogram<int> hist_;
+};
+
+void BM_ProducerConsumer(int iters, int size) {
+  BenchmarkSuspender susp;
+  CHECK_GT(size, 0);
+  ThroughputTest<ThroughputQueueType> *test =
+    new ThroughputTest<ThroughputQueueType>(size, iters, -1, -1);
+  susp.dismiss();
+
+  std::thread producer( [test] { test->producer(); } );
+  std::thread consumer( [test] { test->consumer(); } );
+
+  producer.join();
+  test->done_ = true;
+  consumer.join();
+  delete test;
+}
+
+void BM_ProducerConsumerAffinity(int iters, int size) {
+  BenchmarkSuspender susp;
+  CHECK_GT(size, 0);
+  ThroughputTest<ThroughputQueueType> *test =
+    new ThroughputTest<ThroughputQueueType>(size, iters, 0, 1);
+  susp.dismiss();
+
+  std::thread producer( [test] { test->producer(); } );
+  std::thread consumer( [test] { test->consumer(); } );
+
+  producer.join();
+  test->done_ = true;
+  consumer.join();
+  delete test;
+}
+
+void BM_ProducerConsumerLatency(int iters, int size) {
+  BenchmarkSuspender susp;
+  CHECK_GT(size, 0);
+  LatencyTest<LatencyQueueType> *test =
+    new LatencyTest<LatencyQueueType>(size, 100000, 0, 1);
+  susp.dismiss();
+
+  std::thread producer( [test] { test->producer(); } );
+  std::thread consumer( [test] { test->consumer(); } );
+
+  producer.join();
+  test->done_ = true;
+  consumer.join();
+  test->printHistogram();
+  delete test;
+}
+
+
+BENCHMARK_DRAW_LINE();
+
+BENCHMARK_PARAM(BM_ProducerConsumer, 1048574);
+BENCHMARK_PARAM(BM_ProducerConsumerAffinity, 1048574);
+BENCHMARK_PARAM(BM_ProducerConsumerLatency, 1048574);
+
+}
+
+int main(int argc, char** argv) {
+  google::InitGoogleLogging(argv[0]);
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  runBenchmarks();
+  return 0;
+}
+
+#if 0
+/*
+Benchmark on Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz
+Latency histogram:
+  log(nsec)
+  min  max     count
+  6    7       5124
+  7    8       4799
+  8    9       49
+  9    10      2
+  10   11      1
+  11   12      5
+  12   13      3
+  13   14      9
+  14   15      8
+============================================================================
+folly/test/ProducerConsumerQueueBenchmark.cpp   relative  time/iter  iters/s
+============================================================================
+----------------------------------------------------------------------------
+BM_ProducerConsumer(1048574)                                 7.52ns  132.90M
+BM_ProducerConsumerAffinity(1048574)                         8.28ns  120.75M
+BM_ProducerConsumerLatency(1048574)                          10.00s   99.98m
+============================================================================
+*/
+#endif
-- 
2.34.1