folly/test/ProducerConsumerQueueBenchmark.cpp

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 // @author: Bert Maher <bertrand@fb.com>
  18
  19 #include <thread>
  20 #include <iostream>
  21 #include <stdio.h>
  22 #include <pthread.h>
  23
  24 #include <folly/Benchmark.h>
  25 #include <folly/ProducerConsumerQueue.h>
  26 #include <folly/portability/GFlags.h>
  27 #include <folly/stats/Histogram.h>
  28 #include <folly/stats/Histogram-defs.h>
  29 #include <glog/logging.h>
  30
  31 namespace {
  32
  33 using namespace folly;
  34
  35 typedef unsigned int ThroughputType;
  36 typedef ProducerConsumerQueue<ThroughputType> ThroughputQueueType;
  37
  38 typedef unsigned long LatencyType;
  39 typedef ProducerConsumerQueue<LatencyType> LatencyQueueType;
  40
  41 template<class QueueType>
  42 struct ThroughputTest {
  43   explicit ThroughputTest(size_t size, int iters, int cpu0, int cpu1)
  44   : queue_(size),
  45     done_(false),
  46     iters_(iters),
  47     cpu0_(cpu0),
  48     cpu1_(cpu1)
  49     { }
  50
  51   void producer() {
  52     if (cpu0_ > -1) {
  53       cpu_set_t cpuset;
  54       CPU_ZERO(&cpuset);
  55       CPU_SET(cpu0_, &cpuset);
  56       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  57     }
  58     for (int i = 0; i < iters_; ++i) {
  59       ThroughputType item = i;
  60       while (!queue_.write((ThroughputType) item)) {
  61       }
  62     }
  63   }
  64
  65   void consumer() {
  66     if (cpu1_ > -1) {
  67       cpu_set_t cpuset;
  68       CPU_ZERO(&cpuset);
  69       CPU_SET(cpu1_, &cpuset);
  70       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  71     }
  72     for (int i = 0; i < iters_; ++i) {
  73       ThroughputType item = 0;
  74       while (!queue_.read(item)) {
  75       }
  76       doNotOptimizeAway(item);
  77     }
  78   }
  79
  80   QueueType queue_;
  81   std::atomic<bool> done_;
  82   int iters_;
  83   int cpu0_;
  84   int cpu1_;
  85 };
  86
  87 template<class QueueType>
  88 struct LatencyTest {
  89   explicit LatencyTest(size_t size, int iters, int cpu0, int cpu1)
  90   : queue_(size),
  91     done_(false),
  92     iters_(iters),
  93     cpu0_(cpu0),
  94     cpu1_(cpu1),
  95     hist_(1, 0, 30)
  96     {
  97       computeTimeCost();
  98     }
  99
 100   void computeTimeCost() {
 101     timespec start, end;
 102     clock_gettime(CLOCK_REALTIME, &start);
 103     for (int i = 0; i < iters_; ++i) {
 104       timespec tv;
 105       clock_gettime(CLOCK_REALTIME, &tv);
 106     }
 107     clock_gettime(CLOCK_REALTIME, &end);
 108     time_cost_ = 2 * detail::timespecDiff(end, start) / iters_;
 109   }
 110
 111   void producer() {
 112     if (cpu0_ > -1) {
 113       cpu_set_t cpuset;
 114       CPU_ZERO(&cpuset);
 115       CPU_SET(cpu0_, &cpuset);
 116       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
 117     }
 118     for (int i = 0; i < iters_; ++i) {
 119       timespec sleeptime, sleepstart;
 120       clock_gettime(CLOCK_REALTIME, &sleepstart);
 121       do {
 122         clock_gettime(CLOCK_REALTIME, &sleeptime);
 123       } while (detail::timespecDiff(sleeptime, sleepstart) < 1000000);
 124
 125       timespec tv;
 126       clock_gettime(CLOCK_REALTIME, &tv);
 127       while (!queue_.write((LatencyType) tv.tv_nsec)) {
 128       }
 129     }
 130   }
 131
 132   void consumer() {
 133     if (cpu1_ > -1) {
 134       cpu_set_t cpuset;
 135       CPU_ZERO(&cpuset);
 136       CPU_SET(cpu1_, &cpuset);
 137       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
 138     }
 139     for (int i = 0; i < iters_; ++i) {
 140       unsigned long enqueue_nsec;
 141       while (!queue_.read(enqueue_nsec)) {
 142       }
 143
 144       timespec tv;
 145       clock_gettime(CLOCK_REALTIME, &tv);
 146       int diff = tv.tv_nsec - enqueue_nsec - time_cost_;
 147       if (diff < 0) {
 148         continue;
 149       }
 150
 151       // Naive log-scale bucketing.
 152       int bucket;
 153       for (bucket = 0;
 154            bucket <= 30 && (1 << bucket) <= diff;
 155            ++bucket) {
 156       }
 157       hist_.addValue(bucket - 1);
 158     }
 159   }
 160
 161   void printHistogram() {
 162     hist_.toTSV(std::cout);
 163   }
 164
 165   QueueType queue_;
 166   std::atomic<bool> done_;
 167   int time_cost_;
 168   int iters_;
 169   int cpu0_;
 170   int cpu1_;
 171   Histogram<int> hist_;
 172 };
 173
 174 void BM_ProducerConsumer(int iters, int size) {
 175   BenchmarkSuspender susp;
 176   CHECK_GT(size, 0);
 177   ThroughputTest<ThroughputQueueType> *test =
 178     new ThroughputTest<ThroughputQueueType>(size, iters, -1, -1);
 179   susp.dismiss();
 180
 181   std::thread producer( [test] { test->producer(); } );
 182   std::thread consumer( [test] { test->consumer(); } );
 183
 184   producer.join();
 185   test->done_ = true;
 186   consumer.join();
 187   delete test;
 188 }
 189
 190 void BM_ProducerConsumerAffinity(int iters, int size) {
 191   BenchmarkSuspender susp;
 192   CHECK_GT(size, 0);
 193   ThroughputTest<ThroughputQueueType> *test =
 194     new ThroughputTest<ThroughputQueueType>(size, iters, 0, 1);
 195   susp.dismiss();
 196
 197   std::thread producer( [test] { test->producer(); } );
 198   std::thread consumer( [test] { test->consumer(); } );
 199
 200   producer.join();
 201   test->done_ = true;
 202   consumer.join();
 203   delete test;
 204 }
 205
 206 void BM_ProducerConsumerLatency(int /* iters */, int size) {
 207   BenchmarkSuspender susp;
 208   CHECK_GT(size, 0);
 209   LatencyTest<LatencyQueueType> *test =
 210     new LatencyTest<LatencyQueueType>(size, 100000, 0, 1);
 211   susp.dismiss();
 212
 213   std::thread producer( [test] { test->producer(); } );
 214   std::thread consumer( [test] { test->consumer(); } );
 215
 216   producer.join();
 217   test->done_ = true;
 218   consumer.join();
 219   test->printHistogram();
 220   delete test;
 221 }
 222
 223
// Separator row in the benchmark report (presumably the dashed line seen in
// the sample output at the bottom of this file).
BENCHMARK_DRAW_LINE();

// Register the three benchmarks, each with the parameter 1048574. The sample
// output labels them e.g. "BM_ProducerConsumer(1048574)"; the value is
// presumably forwarded as the `size` argument of each function.
BENCHMARK_PARAM(BM_ProducerConsumer, 1048574);
BENCHMARK_PARAM(BM_ProducerConsumerAffinity, 1048574);
BENCHMARK_PARAM(BM_ProducerConsumerLatency, 1048574);
 229
 230 }
 231
 232 int main(int argc, char** argv) {
 233   google::InitGoogleLogging(argv[0]);
 234   gflags::ParseCommandLineFlags(&argc, &argv, true);
 235
 236   runBenchmarks();
 237   return 0;
 238 }
 239
 240 #if 0
 241 /*
 242 Benchmark on Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz
 243 Latency histogram:
 244   log(nsec)
 245   min  max     count
 246   6    7       5124
 247   7    8       4799
 248   8    9       49
 249   9    10      2
 250   10   11      1
 251   11   12      5
 252   12   13      3
 253   13   14      9
 254   14   15      8
 255 ============================================================================
 256 folly/test/ProducerConsumerQueueBenchmark.cpp   relative  time/iter  iters/s
 257 ============================================================================
 258 ----------------------------------------------------------------------------
 259 BM_ProducerConsumer(1048574)                                 7.52ns  132.90M
 260 BM_ProducerConsumerAffinity(1048574)                         8.28ns  120.75M
 261 BM_ProducerConsumerLatency(1048574)                          10.00s   99.98m
 262 ============================================================================
 263 */
 264 #endif