folly/test/ProducerConsumerQueueBenchmark.cpp

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 // @author: Bert Maher <bertrand@fb.com>
  18
  19 #include <folly/ProducerConsumerQueue.h>
  20
  21 #include <pthread.h>
  22
  23 #include <cstdio>
  24 #include <iostream>
  25 #include <thread>
  26
  27 #include <glog/logging.h>
  28
  29 #include <folly/Benchmark.h>
  30 #include <folly/portability/GFlags.h>
  31 #include <folly/stats/Histogram.h>
  32 #include <folly/stats/Histogram-defs.h>
  33
  34 namespace {
  35
  36 using namespace folly;
  37
  38 typedef unsigned int ThroughputType;
  39 typedef ProducerConsumerQueue<ThroughputType> ThroughputQueueType;
  40
  41 typedef unsigned long LatencyType;
  42 typedef ProducerConsumerQueue<LatencyType> LatencyQueueType;
  43
  44 template<class QueueType>
  45 struct ThroughputTest {
  46   explicit ThroughputTest(size_t size, int iters, int cpu0, int cpu1)
  47   : queue_(size),
  48     done_(false),
  49     iters_(iters),
  50     cpu0_(cpu0),
  51     cpu1_(cpu1)
  52     { }
  53
  54   void producer() {
  55     if (cpu0_ > -1) {
  56       cpu_set_t cpuset;
  57       CPU_ZERO(&cpuset);
  58       CPU_SET(cpu0_, &cpuset);
  59       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  60     }
  61     for (int i = 0; i < iters_; ++i) {
  62       ThroughputType item = i;
  63       while (!queue_.write((ThroughputType) item)) {
  64       }
  65     }
  66   }
  67
  68   void consumer() {
  69     if (cpu1_ > -1) {
  70       cpu_set_t cpuset;
  71       CPU_ZERO(&cpuset);
  72       CPU_SET(cpu1_, &cpuset);
  73       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  74     }
  75     for (int i = 0; i < iters_; ++i) {
  76       ThroughputType item = 0;
  77       while (!queue_.read(item)) {
  78       }
  79       doNotOptimizeAway(item);
  80     }
  81   }
  82
  83   QueueType queue_;
  84   std::atomic<bool> done_;
  85   const int iters_;
  86   int cpu0_;
  87   int cpu1_;
  88 };
  89
  90 template<class QueueType>
  91 struct LatencyTest {
  92   explicit LatencyTest(size_t size, int iters, int cpu0, int cpu1)
  93   : queue_(size),
  94     done_(false),
  95     iters_(iters),
  96     cpu0_(cpu0),
  97     cpu1_(cpu1),
  98     hist_(1, 0, 30)
  99     {
 100       computeTimeCost();
 101     }
 102
 103   static uint64_t timespecDiff(timespec end, timespec start) {
 104     if (end.tv_sec == start.tv_sec) {
 105       assert(end.tv_nsec >= start.tv_nsec);
 106       return uint64_t(end.tv_nsec - start.tv_nsec);
 107     }
 108     assert(end.tv_sec > start.tv_sec);
 109     auto diff = uint64_t(end.tv_sec - start.tv_sec);
 110     assert(diff < std::numeric_limits<uint64_t>::max() / 1000000000ULL);
 111     return diff * 1000000000ULL + end.tv_nsec - start.tv_nsec;
 112   }
 113
 114   void computeTimeCost() {
 115     timespec start, end;
 116     clock_gettime(CLOCK_REALTIME, &start);
 117     for (int i = 0; i < iters_; ++i) {
 118       timespec tv;
 119       clock_gettime(CLOCK_REALTIME, &tv);
 120     }
 121     clock_gettime(CLOCK_REALTIME, &end);
 122     time_cost_ = 2 * timespecDiff(end, start) / iters_;
 123   }
 124
 125   void producer() {
 126     if (cpu0_ > -1) {
 127       cpu_set_t cpuset;
 128       CPU_ZERO(&cpuset);
 129       CPU_SET(cpu0_, &cpuset);
 130       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
 131     }
 132     for (int i = 0; i < iters_; ++i) {
 133       timespec sleeptime, sleepstart;
 134       clock_gettime(CLOCK_REALTIME, &sleepstart);
 135       do {
 136         clock_gettime(CLOCK_REALTIME, &sleeptime);
 137       } while (timespecDiff(sleeptime, sleepstart) < 1000000);
 138
 139       timespec tv;
 140       clock_gettime(CLOCK_REALTIME, &tv);
 141       while (!queue_.write((LatencyType) tv.tv_nsec)) {
 142       }
 143     }
 144   }
 145
 146   void consumer() {
 147     if (cpu1_ > -1) {
 148       cpu_set_t cpuset;
 149       CPU_ZERO(&cpuset);
 150       CPU_SET(cpu1_, &cpuset);
 151       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
 152     }
 153     for (int i = 0; i < iters_; ++i) {
 154       unsigned long enqueue_nsec;
 155       while (!queue_.read(enqueue_nsec)) {
 156       }
 157
 158       timespec tv;
 159       clock_gettime(CLOCK_REALTIME, &tv);
 160       int diff = tv.tv_nsec - enqueue_nsec - time_cost_;
 161       if (diff < 0) {
 162         continue;
 163       }
 164
 165       // Naive log-scale bucketing.
 166       int bucket;
 167       for (bucket = 0;
 168            bucket <= 30 && (1 << bucket) <= diff;
 169            ++bucket) {
 170       }
 171       hist_.addValue(bucket - 1);
 172     }
 173   }
 174
 175   void printHistogram() {
 176     hist_.toTSV(std::cout);
 177   }
 178
 179   QueueType queue_;
 180   std::atomic<bool> done_;
 181   int time_cost_;
 182   const int iters_;
 183   int cpu0_;
 184   int cpu1_;
 185   Histogram<int> hist_;
 186 };
 187
 188 void BM_ProducerConsumer(int iters, int size) {
 189   BenchmarkSuspender susp;
 190   CHECK_GT(size, 0);
 191   ThroughputTest<ThroughputQueueType> *test =
 192     new ThroughputTest<ThroughputQueueType>(size, iters, -1, -1);
 193   susp.dismiss();
 194
 195   std::thread producer( [test] { test->producer(); } );
 196   std::thread consumer( [test] { test->consumer(); } );
 197
 198   producer.join();
 199   test->done_ = true;
 200   consumer.join();
 201   delete test;
 202 }
 203
 204 void BM_ProducerConsumerAffinity(int iters, int size) {
 205   BenchmarkSuspender susp;
 206   CHECK_GT(size, 0);
 207   ThroughputTest<ThroughputQueueType> *test =
 208     new ThroughputTest<ThroughputQueueType>(size, iters, 0, 1);
 209   susp.dismiss();
 210
 211   std::thread producer( [test] { test->producer(); } );
 212   std::thread consumer( [test] { test->consumer(); } );
 213
 214   producer.join();
 215   test->done_ = true;
 216   consumer.join();
 217   delete test;
 218 }
 219
 220 void BM_ProducerConsumerLatency(int /* iters */, int size) {
 221   BenchmarkSuspender susp;
 222   CHECK_GT(size, 0);
 223   LatencyTest<LatencyQueueType> *test =
 224     new LatencyTest<LatencyQueueType>(size, 100000, 0, 1);
 225   susp.dismiss();
 226
 227   std::thread producer( [test] { test->producer(); } );
 228   std::thread consumer( [test] { test->consumer(); } );
 229
 230   producer.join();
 231   test->done_ = true;
 232   consumer.join();
 233   test->printHistogram();
 234   delete test;
 235 }
 236
 237
 238 BENCHMARK_DRAW_LINE();
 239
 240 BENCHMARK_PARAM(BM_ProducerConsumer, 1048574);
 241 BENCHMARK_PARAM(BM_ProducerConsumerAffinity, 1048574);
 242 BENCHMARK_PARAM(BM_ProducerConsumerLatency, 1048574);
 243
 244 }
 245
 246 int main(int argc, char** argv) {
 247   google::InitGoogleLogging(argv[0]);
 248   gflags::ParseCommandLineFlags(&argc, &argv, true);
 249
 250   runBenchmarks();
 251   return 0;
 252 }
 253
 254 #if 0
 255 /*
 256 Benchmark
 257
 258 $ lscpu
 259 Architecture:          x86_64
 260 CPU op-mode(s):        32-bit, 64-bit
 261 Byte Order:            Little Endian
 262 CPU(s):                24
 263 On-line CPU(s) list:   0-23
 264 Thread(s) per core:    1
 265 Core(s) per socket:    1
 266 Socket(s):             24
 267 NUMA node(s):          1
 268 Vendor ID:             GenuineIntel
 269 CPU family:            6
 270 Model:                 60
 271 Model name:            Intel Core Processor (Haswell, no TSX)
 272 Stepping:              1
 273 CPU MHz:               2494.244
 274 BogoMIPS:              4988.48
 275 Hypervisor vendor:     KVM
 276 Virtualization type:   full
 277 L1d cache:             32K
 278 L1i cache:             32K
 279 L2 cache:              4096K
 280 NUMA node0 CPU(s):     0-23
 281
 282 $ ../buck-out/gen/folly/test/producer_consumer_queue_benchmark
 283 5       6       1       5
 284 6       7       1893    11358
 285 7       8       39671   277697
 286 8       9       34921   279368
 287 9       10      17799   160191
 288 10      11      3685    36850
 289 11      12      1075    11825
 290 12      13      456     5472
 291 13      14      422     5486
 292 14      15      64      896
 293 15      16      7       105
 294 16      17      3       48
 295 17      18      3       51
 296 ============================================================================
 297 folly/test/ProducerConsumerQueueBenchmark.cpp   relative  time/iter  iters/s
 298 ============================================================================
 299 ----------------------------------------------------------------------------
 300 BM_ProducerConsumer(1048574)                                 5.82ns  171.75M
 301 BM_ProducerConsumerAffinity(1048574)                         7.36ns  135.83M
 302 BM_ProducerConsumerLatency(1048574)                         1.67min    9.99m
 303 ============================================================================
 304 */
 305 #endif