Fix the linking of various tests against GMock
[folly.git] / folly / test / ProducerConsumerQueueBenchmark.cpp
1 /*
2  * Copyright 2017 Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 // @author: Bert Maher <bertrand@fb.com>
18
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#include <atomic>
#include <cassert>
#include <iostream>
#include <limits>
#include <memory>
#include <thread>

#include <folly/Benchmark.h>
#include <folly/ProducerConsumerQueue.h>
#include <folly/portability/GFlags.h>
#include <folly/stats/Histogram.h>
#include <folly/stats/Histogram-defs.h>
#include <glog/logging.h>
30
31 namespace {
32
33 using namespace folly;
34
35 typedef unsigned int ThroughputType;
36 typedef ProducerConsumerQueue<ThroughputType> ThroughputQueueType;
37
38 typedef unsigned long LatencyType;
39 typedef ProducerConsumerQueue<LatencyType> LatencyQueueType;
40
41 template<class QueueType>
42 struct ThroughputTest {
43   explicit ThroughputTest(size_t size, int iters, int cpu0, int cpu1)
44   : queue_(size),
45     done_(false),
46     iters_(iters),
47     cpu0_(cpu0),
48     cpu1_(cpu1)
49     { }
50
51   void producer() {
52     if (cpu0_ > -1) {
53       cpu_set_t cpuset;
54       CPU_ZERO(&cpuset);
55       CPU_SET(cpu0_, &cpuset);
56       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
57     }
58     for (int i = 0; i < iters_; ++i) {
59       ThroughputType item = i;
60       while (!queue_.write((ThroughputType) item)) {
61       }
62     }
63   }
64
65   void consumer() {
66     if (cpu1_ > -1) {
67       cpu_set_t cpuset;
68       CPU_ZERO(&cpuset);
69       CPU_SET(cpu1_, &cpuset);
70       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
71     }
72     for (int i = 0; i < iters_; ++i) {
73       ThroughputType item = 0;
74       while (!queue_.read(item)) {
75       }
76       doNotOptimizeAway(item);
77     }
78   }
79
80   QueueType queue_;
81   std::atomic<bool> done_;
82   const int iters_;
83   int cpu0_;
84   int cpu1_;
85 };
86
87 template<class QueueType>
88 struct LatencyTest {
89   explicit LatencyTest(size_t size, int iters, int cpu0, int cpu1)
90   : queue_(size),
91     done_(false),
92     iters_(iters),
93     cpu0_(cpu0),
94     cpu1_(cpu1),
95     hist_(1, 0, 30)
96     {
97       computeTimeCost();
98     }
99
100   static uint64_t timespecDiff(timespec end, timespec start) {
101     if (end.tv_sec == start.tv_sec) {
102       assert(end.tv_nsec >= start.tv_nsec);
103       return uint64_t(end.tv_nsec - start.tv_nsec);
104     }
105     assert(end.tv_sec > start.tv_sec);
106     auto diff = uint64_t(end.tv_sec - start.tv_sec);
107     assert(diff < std::numeric_limits<uint64_t>::max() / 1000000000ULL);
108     return diff * 1000000000ULL + end.tv_nsec - start.tv_nsec;
109   }
110
111   void computeTimeCost() {
112     timespec start, end;
113     clock_gettime(CLOCK_REALTIME, &start);
114     for (int i = 0; i < iters_; ++i) {
115       timespec tv;
116       clock_gettime(CLOCK_REALTIME, &tv);
117     }
118     clock_gettime(CLOCK_REALTIME, &end);
119     time_cost_ = 2 * timespecDiff(end, start) / iters_;
120   }
121
122   void producer() {
123     if (cpu0_ > -1) {
124       cpu_set_t cpuset;
125       CPU_ZERO(&cpuset);
126       CPU_SET(cpu0_, &cpuset);
127       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
128     }
129     for (int i = 0; i < iters_; ++i) {
130       timespec sleeptime, sleepstart;
131       clock_gettime(CLOCK_REALTIME, &sleepstart);
132       do {
133         clock_gettime(CLOCK_REALTIME, &sleeptime);
134       } while (timespecDiff(sleeptime, sleepstart) < 1000000);
135
136       timespec tv;
137       clock_gettime(CLOCK_REALTIME, &tv);
138       while (!queue_.write((LatencyType) tv.tv_nsec)) {
139       }
140     }
141   }
142
143   void consumer() {
144     if (cpu1_ > -1) {
145       cpu_set_t cpuset;
146       CPU_ZERO(&cpuset);
147       CPU_SET(cpu1_, &cpuset);
148       pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
149     }
150     for (int i = 0; i < iters_; ++i) {
151       unsigned long enqueue_nsec;
152       while (!queue_.read(enqueue_nsec)) {
153       }
154
155       timespec tv;
156       clock_gettime(CLOCK_REALTIME, &tv);
157       int diff = tv.tv_nsec - enqueue_nsec - time_cost_;
158       if (diff < 0) {
159         continue;
160       }
161
162       // Naive log-scale bucketing.
163       int bucket;
164       for (bucket = 0;
165            bucket <= 30 && (1 << bucket) <= diff;
166            ++bucket) {
167       }
168       hist_.addValue(bucket - 1);
169     }
170   }
171
172   void printHistogram() {
173     hist_.toTSV(std::cout);
174   }
175
176   QueueType queue_;
177   std::atomic<bool> done_;
178   int time_cost_;
179   const int iters_;
180   int cpu0_;
181   int cpu1_;
182   Histogram<int> hist_;
183 };
184
185 void BM_ProducerConsumer(int iters, int size) {
186   BenchmarkSuspender susp;
187   CHECK_GT(size, 0);
188   ThroughputTest<ThroughputQueueType> *test =
189     new ThroughputTest<ThroughputQueueType>(size, iters, -1, -1);
190   susp.dismiss();
191
192   std::thread producer( [test] { test->producer(); } );
193   std::thread consumer( [test] { test->consumer(); } );
194
195   producer.join();
196   test->done_ = true;
197   consumer.join();
198   delete test;
199 }
200
201 void BM_ProducerConsumerAffinity(int iters, int size) {
202   BenchmarkSuspender susp;
203   CHECK_GT(size, 0);
204   ThroughputTest<ThroughputQueueType> *test =
205     new ThroughputTest<ThroughputQueueType>(size, iters, 0, 1);
206   susp.dismiss();
207
208   std::thread producer( [test] { test->producer(); } );
209   std::thread consumer( [test] { test->consumer(); } );
210
211   producer.join();
212   test->done_ = true;
213   consumer.join();
214   delete test;
215 }
216
217 void BM_ProducerConsumerLatency(int /* iters */, int size) {
218   BenchmarkSuspender susp;
219   CHECK_GT(size, 0);
220   LatencyTest<LatencyQueueType> *test =
221     new LatencyTest<LatencyQueueType>(size, 100000, 0, 1);
222   susp.dismiss();
223
224   std::thread producer( [test] { test->producer(); } );
225   std::thread consumer( [test] { test->consumer(); } );
226
227   producer.join();
228   test->done_ = true;
229   consumer.join();
230   test->printHistogram();
231   delete test;
232 }
233
234
// Benchmark registration.
// NOTE(review): BENCHMARK_PARAM(fn, 1048574) presumably registers `fn` with
// 1048574 passed as its second (`size`) argument -- confirm against
// folly/Benchmark.h.
BENCHMARK_DRAW_LINE();

BENCHMARK_PARAM(BM_ProducerConsumer, 1048574);
BENCHMARK_PARAM(BM_ProducerConsumerAffinity, 1048574);
BENCHMARK_PARAM(BM_ProducerConsumerLatency, 1048574);
240
241 }
242
243 int main(int argc, char** argv) {
244   google::InitGoogleLogging(argv[0]);
245   gflags::ParseCommandLineFlags(&argc, &argv, true);
246
247   runBenchmarks();
248   return 0;
249 }
250
251 #if 0
252 /*
253 Benchmark
254
255 $ lscpu
256 Architecture:          x86_64
257 CPU op-mode(s):        32-bit, 64-bit
258 Byte Order:            Little Endian
259 CPU(s):                24
260 On-line CPU(s) list:   0-23
261 Thread(s) per core:    1
262 Core(s) per socket:    1
263 Socket(s):             24
264 NUMA node(s):          1
265 Vendor ID:             GenuineIntel
266 CPU family:            6
267 Model:                 60
268 Model name:            Intel Core Processor (Haswell, no TSX)
269 Stepping:              1
270 CPU MHz:               2494.244
271 BogoMIPS:              4988.48
272 Hypervisor vendor:     KVM
273 Virtualization type:   full
274 L1d cache:             32K
275 L1i cache:             32K
276 L2 cache:              4096K
277 NUMA node0 CPU(s):     0-23
278
279 $ ../buck-out/gen/folly/test/producer_consumer_queue_benchmark
280 5       6       1       5
281 6       7       1893    11358
282 7       8       39671   277697
283 8       9       34921   279368
284 9       10      17799   160191
285 10      11      3685    36850
286 11      12      1075    11825
287 12      13      456     5472
288 13      14      422     5486
289 14      15      64      896
290 15      16      7       105
291 16      17      3       48
292 17      18      3       51
293 ============================================================================
294 folly/test/ProducerConsumerQueueBenchmark.cpp   relative  time/iter  iters/s
295 ============================================================================
296 ----------------------------------------------------------------------------
297 BM_ProducerConsumer(1048574)                                 5.82ns  171.75M
298 BM_ProducerConsumerAffinity(1048574)                         7.36ns  135.83M
299 BM_ProducerConsumerLatency(1048574)                         1.67min    9.99m
300 ============================================================================
301 */
302 #endif