folly/test/ProducerConsumerQueueBenchmark.cpp

   1 /*
   2  * Copyright 2013-present Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 // @author: Bert Maher <bertrand@fb.com>
  18
  19 #include <folly/ProducerConsumerQueue.h>
  20
  21 #include <cstdio>
  22 #include <iostream>
  23 #include <thread>
  24
  25 #include <glog/logging.h>
  26
  27 #include <folly/Benchmark.h>
  28 #include <folly/portability/GFlags.h>
  29 #include <folly/portability/PThread.h>
  30 #include <folly/stats/Histogram.h>
  31 #include <folly/stats/Histogram-defs.h>
  32
  33 namespace {
  34
  35 using namespace folly;
  36
  37 typedef unsigned int ThroughputType;
  38 typedef ProducerConsumerQueue<ThroughputType> ThroughputQueueType;
  39
  40 typedef unsigned long LatencyType;
  41 typedef ProducerConsumerQueue<LatencyType> LatencyQueueType;
  42
  43 template <class QueueType>
  44 struct ThroughputTest {
  45   explicit ThroughputTest(size_t size, int iters, int cpu0, int cpu1)
  46   : queue_(size),
  47     done_(false),
  48     iters_(iters),
  49     cpu0_(cpu0),
  50     cpu1_(cpu1)
  51     { }
  52
  53   void producer() {
  54     if (cpu0_ > -1) {
  55       cpu_set_t cpuset;
  56       CPU_ZERO(&cpuset);
  57       CPU_SET(cpu0_, &cpuset);
  58       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  59     }
  60     for (int i = 0; i < iters_; ++i) {
  61       ThroughputType item = i;
  62       while (!queue_.write((ThroughputType) item)) {
  63       }
  64     }
  65   }
  66
  67   void consumer() {
  68     if (cpu1_ > -1) {
  69       cpu_set_t cpuset;
  70       CPU_ZERO(&cpuset);
  71       CPU_SET(cpu1_, &cpuset);
  72       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  73     }
  74     for (int i = 0; i < iters_; ++i) {
  75       ThroughputType item = 0;
  76       while (!queue_.read(item)) {
  77       }
  78       doNotOptimizeAway(item);
  79     }
  80   }
  81
  82   QueueType queue_;
  83   std::atomic<bool> done_;
  84   const int iters_;
  85   int cpu0_;
  86   int cpu1_;
  87 };
  88
  89 template <class QueueType>
  90 struct LatencyTest {
  91   explicit LatencyTest(size_t size, int iters, int cpu0, int cpu1)
  92   : queue_(size),
  93     done_(false),
  94     iters_(iters),
  95     cpu0_(cpu0),
  96     cpu1_(cpu1),
  97     hist_(1, 0, 30)
  98     {
  99       computeTimeCost();
 100     }
 101
 102   static uint64_t timespecDiff(timespec end, timespec start) {
 103     if (end.tv_sec == start.tv_sec) {
 104       assert(end.tv_nsec >= start.tv_nsec);
 105       return uint64_t(end.tv_nsec - start.tv_nsec);
 106     }
 107     assert(end.tv_sec > start.tv_sec);
 108     auto diff = uint64_t(end.tv_sec - start.tv_sec);
 109     assert(diff < std::numeric_limits<uint64_t>::max() / 1000000000ULL);
 110     return diff * 1000000000ULL + end.tv_nsec - start.tv_nsec;
 111   }
 112
 113   void computeTimeCost() {
 114     timespec start, end;
 115     clock_gettime(CLOCK_REALTIME, &start);
 116     for (int i = 0; i < iters_; ++i) {
 117       timespec tv;
 118       clock_gettime(CLOCK_REALTIME, &tv);
 119     }
 120     clock_gettime(CLOCK_REALTIME, &end);
 121     time_cost_ = 2 * timespecDiff(end, start) / iters_;
 122   }
 123
 124   void producer() {
 125     if (cpu0_ > -1) {
 126       cpu_set_t cpuset;
 127       CPU_ZERO(&cpuset);
 128       CPU_SET(cpu0_, &cpuset);
 129       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
 130     }
 131     for (int i = 0; i < iters_; ++i) {
 132       timespec sleeptime, sleepstart;
 133       clock_gettime(CLOCK_REALTIME, &sleepstart);
 134       do {
 135         clock_gettime(CLOCK_REALTIME, &sleeptime);
 136       } while (timespecDiff(sleeptime, sleepstart) < 1000000);
 137
 138       timespec tv;
 139       clock_gettime(CLOCK_REALTIME, &tv);
 140       while (!queue_.write((LatencyType) tv.tv_nsec)) {
 141       }
 142     }
 143   }
 144
 145   void consumer() {
 146     if (cpu1_ > -1) {
 147       cpu_set_t cpuset;
 148       CPU_ZERO(&cpuset);
 149       CPU_SET(cpu1_, &cpuset);
 150       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
 151     }
 152     for (int i = 0; i < iters_; ++i) {
 153       unsigned long enqueue_nsec;
 154       while (!queue_.read(enqueue_nsec)) {
 155       }
 156
 157       timespec tv;
 158       clock_gettime(CLOCK_REALTIME, &tv);
 159       int diff = tv.tv_nsec - enqueue_nsec - time_cost_;
 160       if (diff < 0) {
 161         continue;
 162       }
 163
 164       // Naive log-scale bucketing.
 165       int bucket;
 166       for (bucket = 0;
 167            bucket <= 30 && (1 << bucket) <= diff;
 168            ++bucket) {
 169       }
 170       hist_.addValue(bucket - 1);
 171     }
 172   }
 173
 174   void printHistogram() {
 175     hist_.toTSV(std::cout);
 176   }
 177
 178   QueueType queue_;
 179   std::atomic<bool> done_;
 180   int time_cost_;
 181   const int iters_;
 182   int cpu0_;
 183   int cpu1_;
 184   Histogram<int> hist_;
 185 };
 186
 187 void BM_ProducerConsumer(int iters, int size) {
 188   BenchmarkSuspender susp;
 189   CHECK_GT(size, 0);
 190   ThroughputTest<ThroughputQueueType> *test =
 191     new ThroughputTest<ThroughputQueueType>(size, iters, -1, -1);
 192   susp.dismiss();
 193
 194   std::thread producer( [test] { test->producer(); } );
 195   std::thread consumer( [test] { test->consumer(); } );
 196
 197   producer.join();
 198   test->done_ = true;
 199   consumer.join();
 200   delete test;
 201 }
 202
 203 void BM_ProducerConsumerAffinity(int iters, int size) {
 204   BenchmarkSuspender susp;
 205   CHECK_GT(size, 0);
 206   ThroughputTest<ThroughputQueueType> *test =
 207     new ThroughputTest<ThroughputQueueType>(size, iters, 0, 1);
 208   susp.dismiss();
 209
 210   std::thread producer( [test] { test->producer(); } );
 211   std::thread consumer( [test] { test->consumer(); } );
 212
 213   producer.join();
 214   test->done_ = true;
 215   consumer.join();
 216   delete test;
 217 }
 218
 219 void BM_ProducerConsumerLatency(int /* iters */, int size) {
 220   BenchmarkSuspender susp;
 221   CHECK_GT(size, 0);
 222   LatencyTest<LatencyQueueType> *test =
 223     new LatencyTest<LatencyQueueType>(size, 100000, 0, 1);
 224   susp.dismiss();
 225
 226   std::thread producer( [test] { test->producer(); } );
 227   std::thread consumer( [test] { test->consumer(); } );
 228
 229   producer.join();
 230   test->done_ = true;
 231   consumer.join();
 232   test->printHistogram();
 233   delete test;
 234 }
 235
 236
 237 BENCHMARK_DRAW_LINE();
 238
 239 BENCHMARK_PARAM(BM_ProducerConsumer, 1048574);
 240 BENCHMARK_PARAM(BM_ProducerConsumerAffinity, 1048574);
 241 BENCHMARK_PARAM(BM_ProducerConsumerLatency, 1048574);
 242
 243 } // namespace
 244
 245 int main(int argc, char** argv) {
 246   google::InitGoogleLogging(argv[0]);
 247   gflags::ParseCommandLineFlags(&argc, &argv, true);
 248
 249   runBenchmarks();
 250   return 0;
 251 }
 252
 253 #if 0
 254 /*
 255 Benchmark
 256
 257 $ lscpu
 258 Architecture:          x86_64
 259 CPU op-mode(s):        32-bit, 64-bit
 260 Byte Order:            Little Endian
 261 CPU(s):                24
 262 On-line CPU(s) list:   0-23
 263 Thread(s) per core:    1
 264 Core(s) per socket:    1
 265 Socket(s):             24
 266 NUMA node(s):          1
 267 Vendor ID:             GenuineIntel
 268 CPU family:            6
 269 Model:                 60
 270 Model name:            Intel Core Processor (Haswell, no TSX)
 271 Stepping:              1
 272 CPU MHz:               2494.244
 273 BogoMIPS:              4988.48
 274 Hypervisor vendor:     KVM
 275 Virtualization type:   full
 276 L1d cache:             32K
 277 L1i cache:             32K
 278 L2 cache:              4096K
 279 NUMA node0 CPU(s):     0-23
 280
 281 $ ../buck-out/gen/folly/test/producer_consumer_queue_benchmark
 282 5       6       1       5
 283 6       7       1893    11358
 284 7       8       39671   277697
 285 8       9       34921   279368
 286 9       10      17799   160191
 287 10      11      3685    36850
 288 11      12      1075    11825
 289 12      13      456     5472
 290 13      14      422     5486
 291 14      15      64      896
 292 15      16      7       105
 293 16      17      3       48
 294 17      18      3       51
 295 ============================================================================
 296 folly/test/ProducerConsumerQueueBenchmark.cpp   relative  time/iter  iters/s
 297 ============================================================================
 298 ----------------------------------------------------------------------------
 299 BM_ProducerConsumer(1048574)                                 5.82ns  171.75M
 300 BM_ProducerConsumerAffinity(1048574)                         7.36ns  135.83M
 301 BM_ProducerConsumerLatency(1048574)                         1.67min    9.99m
 302 ============================================================================
 303 */
 304 #endif