folly/test/ProducerConsumerQueueBenchmark.cpp

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 // @author: Bert Maher <bertrand@fb.com>
  18
  19 #include <thread>
  20 #include <iostream>
  21 #include <stdio.h>
  22 #include <pthread.h>
  23
  24 #include <folly/Benchmark.h>
  25 #include <folly/ProducerConsumerQueue.h>
  26 #include <folly/portability/GFlags.h>
  27 #include <folly/stats/Histogram.h>
  28 #include <folly/stats/Histogram-defs.h>
  29 #include <glog/logging.h>
  30
  31 namespace {
  32
  33 using namespace folly;
  34
  35 typedef unsigned int ThroughputType;
  36 typedef ProducerConsumerQueue<ThroughputType> ThroughputQueueType;
  37
  38 typedef unsigned long LatencyType;
  39 typedef ProducerConsumerQueue<LatencyType> LatencyQueueType;
  40
  41 template<class QueueType>
  42 struct ThroughputTest {
  43   explicit ThroughputTest(size_t size, int iters, int cpu0, int cpu1)
  44   : queue_(size),
  45     done_(false),
  46     iters_(iters),
  47     cpu0_(cpu0),
  48     cpu1_(cpu1)
  49     { }
  50
  51   void producer() {
  52     if (cpu0_ > -1) {
  53       cpu_set_t cpuset;
  54       CPU_ZERO(&cpuset);
  55       CPU_SET(cpu0_, &cpuset);
  56       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  57     }
  58     for (int i = 0; i < iters_; ++i) {
  59       ThroughputType item = i;
  60       while (!queue_.write((ThroughputType) item)) {
  61       }
  62     }
  63   }
  64
  65   void consumer() {
  66     if (cpu1_ > -1) {
  67       cpu_set_t cpuset;
  68       CPU_ZERO(&cpuset);
  69       CPU_SET(cpu1_, &cpuset);
  70       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  71     }
  72     for (int i = 0; i < iters_; ++i) {
  73       ThroughputType item = 0;
  74       while (!queue_.read(item)) {
  75       }
  76       doNotOptimizeAway(item);
  77     }
  78   }
  79
  80   QueueType queue_;
  81   std::atomic<bool> done_;
  82   const int iters_;
  83   int cpu0_;
  84   int cpu1_;
  85 };
  86
  87 template<class QueueType>
  88 struct LatencyTest {
  89   explicit LatencyTest(size_t size, int iters, int cpu0, int cpu1)
  90   : queue_(size),
  91     done_(false),
  92     iters_(iters),
  93     cpu0_(cpu0),
  94     cpu1_(cpu1),
  95     hist_(1, 0, 30)
  96     {
  97       computeTimeCost();
  98     }
  99
 100   static uint64_t timespecDiff(timespec end, timespec start) {
 101     if (end.tv_sec == start.tv_sec) {
 102       assert(end.tv_nsec >= start.tv_nsec);
 103       return uint64_t(end.tv_nsec - start.tv_nsec);
 104     }
 105     assert(end.tv_sec > start.tv_sec);
 106     auto diff = uint64_t(end.tv_sec - start.tv_sec);
 107     assert(diff < std::numeric_limits<uint64_t>::max() / 1000000000ULL);
 108     return diff * 1000000000ULL + end.tv_nsec - start.tv_nsec;
 109   }
 110
 111   void computeTimeCost() {
 112     timespec start, end;
 113     clock_gettime(CLOCK_REALTIME, &start);
 114     for (int i = 0; i < iters_; ++i) {
 115       timespec tv;
 116       clock_gettime(CLOCK_REALTIME, &tv);
 117     }
 118     clock_gettime(CLOCK_REALTIME, &end);
 119     time_cost_ = 2 * timespecDiff(end, start) / iters_;
 120   }
 121
 122   void producer() {
 123     if (cpu0_ > -1) {
 124       cpu_set_t cpuset;
 125       CPU_ZERO(&cpuset);
 126       CPU_SET(cpu0_, &cpuset);
 127       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
 128     }
 129     for (int i = 0; i < iters_; ++i) {
 130       timespec sleeptime, sleepstart;
 131       clock_gettime(CLOCK_REALTIME, &sleepstart);
 132       do {
 133         clock_gettime(CLOCK_REALTIME, &sleeptime);
 134       } while (timespecDiff(sleeptime, sleepstart) < 1000000);
 135
 136       timespec tv;
 137       clock_gettime(CLOCK_REALTIME, &tv);
 138       while (!queue_.write((LatencyType) tv.tv_nsec)) {
 139       }
 140     }
 141   }
 142
 143   void consumer() {
 144     if (cpu1_ > -1) {
 145       cpu_set_t cpuset;
 146       CPU_ZERO(&cpuset);
 147       CPU_SET(cpu1_, &cpuset);
 148       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
 149     }
 150     for (int i = 0; i < iters_; ++i) {
 151       unsigned long enqueue_nsec;
 152       while (!queue_.read(enqueue_nsec)) {
 153       }
 154
 155       timespec tv;
 156       clock_gettime(CLOCK_REALTIME, &tv);
 157       int diff = tv.tv_nsec - enqueue_nsec - time_cost_;
 158       if (diff < 0) {
 159         continue;
 160       }
 161
 162       // Naive log-scale bucketing.
 163       int bucket;
 164       for (bucket = 0;
 165            bucket <= 30 && (1 << bucket) <= diff;
 166            ++bucket) {
 167       }
 168       hist_.addValue(bucket - 1);
 169     }
 170   }
 171
 172   void printHistogram() {
 173     hist_.toTSV(std::cout);
 174   }
 175
 176   QueueType queue_;
 177   std::atomic<bool> done_;
 178   int time_cost_;
 179   const int iters_;
 180   int cpu0_;
 181   int cpu1_;
 182   Histogram<int> hist_;
 183 };
 184
 185 void BM_ProducerConsumer(int iters, int size) {
 186   BenchmarkSuspender susp;
 187   CHECK_GT(size, 0);
 188   ThroughputTest<ThroughputQueueType> *test =
 189     new ThroughputTest<ThroughputQueueType>(size, iters, -1, -1);
 190   susp.dismiss();
 191
 192   std::thread producer( [test] { test->producer(); } );
 193   std::thread consumer( [test] { test->consumer(); } );
 194
 195   producer.join();
 196   test->done_ = true;
 197   consumer.join();
 198   delete test;
 199 }
 200
 201 void BM_ProducerConsumerAffinity(int iters, int size) {
 202   BenchmarkSuspender susp;
 203   CHECK_GT(size, 0);
 204   ThroughputTest<ThroughputQueueType> *test =
 205     new ThroughputTest<ThroughputQueueType>(size, iters, 0, 1);
 206   susp.dismiss();
 207
 208   std::thread producer( [test] { test->producer(); } );
 209   std::thread consumer( [test] { test->consumer(); } );
 210
 211   producer.join();
 212   test->done_ = true;
 213   consumer.join();
 214   delete test;
 215 }
 216
 217 void BM_ProducerConsumerLatency(int /* iters */, int size) {
 218   BenchmarkSuspender susp;
 219   CHECK_GT(size, 0);
 220   LatencyTest<LatencyQueueType> *test =
 221     new LatencyTest<LatencyQueueType>(size, 100000, 0, 1);
 222   susp.dismiss();
 223
 224   std::thread producer( [test] { test->producer(); } );
 225   std::thread consumer( [test] { test->consumer(); } );
 226
 227   producer.join();
 228   test->done_ = true;
 229   consumer.join();
 230   test->printHistogram();
 231   delete test;
 232 }
 233
 234
 // Benchmark registration. Each BENCHMARK_PARAM line registers one of the
 // BM_* functions above together with the literal 1048574; presumably that
 // value arrives as the function's `size` argument (the recorded output at
 // the end of this file lists the benchmarks as e.g.
 // "BM_ProducerConsumer(1048574)") -- confirm against folly/Benchmark.h.
 235 BENCHMARK_DRAW_LINE();
 236
 237 BENCHMARK_PARAM(BM_ProducerConsumer, 1048574);
 238 BENCHMARK_PARAM(BM_ProducerConsumerAffinity, 1048574);
 239 BENCHMARK_PARAM(BM_ProducerConsumerLatency, 1048574);
 240
 241 }
 242
 243 int main(int argc, char** argv) {
 244   google::InitGoogleLogging(argv[0]);
 245   gflags::ParseCommandLineFlags(&argc, &argv, true);
 246
 247   runBenchmarks();
 248   return 0;
 249 }
 250
 251 #if 0
 252 /*
 253 Benchmark
 254
 255 $ lscpu
 256 Architecture:          x86_64
 257 CPU op-mode(s):        32-bit, 64-bit
 258 Byte Order:            Little Endian
 259 CPU(s):                24
 260 On-line CPU(s) list:   0-23
 261 Thread(s) per core:    1
 262 Core(s) per socket:    1
 263 Socket(s):             24
 264 NUMA node(s):          1
 265 Vendor ID:             GenuineIntel
 266 CPU family:            6
 267 Model:                 60
 268 Model name:            Intel Core Processor (Haswell, no TSX)
 269 Stepping:              1
 270 CPU MHz:               2494.244
 271 BogoMIPS:              4988.48
 272 Hypervisor vendor:     KVM
 273 Virtualization type:   full
 274 L1d cache:             32K
 275 L1i cache:             32K
 276 L2 cache:              4096K
 277 NUMA node0 CPU(s):     0-23
 278
 279 $ ../buck-out/gen/folly/test/producer_consumer_queue_benchmark
 280 5       6       1       5
 281 6       7       1893    11358
 282 7       8       39671   277697
 283 8       9       34921   279368
 284 9       10      17799   160191
 285 10      11      3685    36850
 286 11      12      1075    11825
 287 12      13      456     5472
 288 13      14      422     5486
 289 14      15      64      896
 290 15      16      7       105
 291 16      17      3       48
 292 17      18      3       51
 293 ============================================================================
 294 folly/test/ProducerConsumerQueueBenchmark.cpp   relative  time/iter  iters/s
 295 ============================================================================
 296 ----------------------------------------------------------------------------
 297 BM_ProducerConsumer(1048574)                                 5.82ns  171.75M
 298 BM_ProducerConsumerAffinity(1048574)                         7.36ns  135.83M
 299 BM_ProducerConsumerLatency(1048574)                         1.67min    9.99m
 300 ============================================================================
 301 */
 302 #endif