2 * Copyright 2013-present Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 // @author: Bert Maher <bertrand@fb.com>
19 #include <folly/ProducerConsumerQueue.h>
25 #include <glog/logging.h>
27 #include <folly/Benchmark.h>
28 #include <folly/portability/GFlags.h>
29 #include <folly/portability/PThread.h>
30 #include <folly/stats/Histogram.h>
31 #include <folly/stats/Histogram-defs.h>
35 using namespace folly;
// Element type and queue type used by the throughput benchmarks
// (BM_ProducerConsumer and BM_ProducerConsumerAffinity instantiate
// ThroughputTest<ThroughputQueueType>).
37 typedef unsigned int ThroughputType;
38 typedef ProducerConsumerQueue<ThroughputType> ThroughputQueueType;
// Element type and queue type used by the latency benchmark. The latency
// producer enqueues a timespec tv_nsec value cast to LatencyType.
40 typedef unsigned long LatencyType;
41 typedef ProducerConsumerQueue<LatencyType> LatencyQueueType;
// Throughput harness, parameterized on the queue type. One side writes iters_
// items into queue_ and the other side reads iters_ items back out.
// NOTE(review): only fragments of this struct are visible in this excerpt
// (initializer list, method headers and loop bodies are missing); the comments
// below describe the visible lines only.
43 template <class QueueType>
44 struct ThroughputTest {
// size and iters configure the test; cpu0 / cpu1 are the CPU indices that the
// two sides add to their affinity masks below (as cpu0_ / cpu1_).
// NOTE(review): presumably size is the queue capacity -- the initializer list
// is not visible here; confirm.
45 explicit ThroughputTest(size_t size, int iters, int cpu0, int cpu1)
// Writer side: add cpu0_ to the affinity mask and apply the mask to the
// calling thread. The return value of pthread_setaffinity_np is discarded.
57 CPU_SET(cpu0_, &cpuset);
58 pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
// Enqueue the values 0 .. iters_-1, one per iteration. Each write is repeated
// for as long as write() reports failure.
60 for (int i = 0; i < iters_; ++i) {
61 ThroughputType item = i;
62 while (!queue_.write((ThroughputType) item)) {
// Reader side: same affinity setup, using cpu1_.
71 CPU_SET(cpu1_, &cpuset);
72 pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
// Dequeue iters_ items. Each read is repeated for as long as read() reports
// failure.
74 for (int i = 0; i < iters_; ++i) {
75 ThroughputType item = 0;
76 while (!queue_.read(item)) {
// Hand the dequeued value to doNotOptimizeAway -- presumably so the compiler
// cannot discard the read as dead code.
78 doNotOptimizeAway(item);
// Atomic flag. NOTE(review): its readers and writers are not visible in this
// excerpt.
83 std::atomic<bool> done_;
// Latency harness, parameterized on the queue type. The writer side enqueues
// the tv_nsec field of a timestamp; the reader side dequeues it, takes its own
// timestamp, and records the difference in a log-scale histogram.
// NOTE(review): the struct header, several method headers and some loop bodies
// are not visible in this excerpt; comments describe the visible lines only.
89 template <class QueueType>
// size and iters configure the test; cpu0 / cpu1 are the CPU indices used for
// the affinity masks below (as cpu0_ / cpu1_).
// NOTE(review): the initializer list and constructor body are not visible.
91 explicit LatencyTest(size_t size, int iters, int cpu0, int cpu1)
// Returns (end - start) in nanoseconds.
// Precondition (checked with assert only): end is not earlier than start.
102 static uint64_t timespecDiff(timespec end, timespec start) {
// Same second: the result is just the nanosecond difference.
103 if (end.tv_sec == start.tv_sec) {
104 assert(end.tv_nsec >= start.tv_nsec);
105 return uint64_t(end.tv_nsec - start.tv_nsec);
// Different seconds: convert the whole-second difference to nanoseconds, then
// add the (possibly negative) nanosecond-field difference.
107 assert(end.tv_sec > start.tv_sec);
108 auto diff = uint64_t(end.tv_sec - start.tv_sec);
// Guard against overflow of the multiplication by 1e9 below.
109 assert(diff < std::numeric_limits<uint64_t>::max() / 1000000000ULL);
110 return diff * 1000000000ULL + end.tv_nsec - start.tv_nsec;
// Measures the cost of calling clock_gettime: times iters_ calls and stores
// twice the integer average per-call cost in time_cost_.
// NOTE(review): the factor 2 presumably accounts for the two clock_gettime
// calls per latency sample (one on each side) -- confirm.
// NOTE(review): divides by iters_; assumes iters_ != 0.
113 void computeTimeCost() {
115 clock_gettime(CLOCK_REALTIME, &start);
116 for (int i = 0; i < iters_; ++i) {
118 clock_gettime(CLOCK_REALTIME, &tv);
120 clock_gettime(CLOCK_REALTIME, &end);
121 time_cost_ = 2 * timespecDiff(end, start) / iters_;
// Writer side: add cpu0_ to the affinity mask and apply it to the calling
// thread. The return value of pthread_setaffinity_np is discarded.
128 CPU_SET(cpu0_, &cpuset);
129 pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
131 for (int i = 0; i < iters_; ++i) {
// Busy-wait by polling the clock until at least 1000000 ns (1 ms) have passed
// since sleepstart, spacing out successive samples.
132 timespec sleeptime, sleepstart;
133 clock_gettime(CLOCK_REALTIME, &sleepstart);
135 clock_gettime(CLOCK_REALTIME, &sleeptime);
136 } while (timespecDiff(sleeptime, sleepstart) < 1000000);
// Timestamp the sample and enqueue only its tv_nsec field, repeating the write
// for as long as write() reports failure.
139 clock_gettime(CLOCK_REALTIME, &tv);
140 while (!queue_.write((LatencyType) tv.tv_nsec)) {
// Reader side: same affinity setup, using cpu1_.
149 CPU_SET(cpu1_, &cpuset);
150 pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
152 for (int i = 0; i < iters_; ++i) {
// Dequeue the writer's tv_nsec value, repeating the read for as long as
// read() reports failure.
153 unsigned long enqueue_nsec;
154 while (!queue_.read(enqueue_nsec)) {
// Latency sample = reader's tv_nsec - writer's tv_nsec - clock overhead.
// enqueue_nsec is unsigned long, so the subtraction is carried out in unsigned
// arithmetic and then narrowed to int.
// NOTE(review): only the tv_nsec fields are compared, so a sample that crosses
// a one-second boundary yields a negative diff; how that case is handled is
// not visible in this excerpt -- confirm.
// NOTE(review): CLOCK_REALTIME is wall-clock time and may be adjusted while
// the benchmark runs; consider whether CLOCK_MONOTONIC is more appropriate.
158 clock_gettime(CLOCK_REALTIME, &tv);
159 int diff = tv.tv_nsec - enqueue_nsec - time_cost_;
164 // Naive log-scale bucketing.
// Loop condition: keep going while 2^bucket <= diff, with bucket capped at 30.
// NOTE(review): the cap presumably keeps (1 << bucket) within a 32-bit int;
// the loop's init and increment are not visible here.
167 bucket <= 30 && (1 << bucket) <= diff;
// Record the index of the last power of two that did not exceed diff.
170 hist_.addValue(bucket - 1);
// Writes the histogram to std::cout via Histogram::toTSV.
174 void printHistogram() {
175 hist_.toTSV(std::cout);
// Atomic flag. NOTE(review): its readers and writers are not visible in this
// excerpt.
179 std::atomic<bool> done_;
// Histogram of log2 latency buckets filled in by the reader side.
184 Histogram<int> hist_;
// Throughput benchmark: constructs a ThroughputTest with the given size and
// iteration count and runs producer() and consumer() on two threads.
// NOTE(review): passes -1 for both CPU indices -- presumably "no pinning";
// the check that interprets -1 is not visible in this excerpt.
187 void BM_ProducerConsumer(int iters, int size) {
// Declared before the setup work below.
// NOTE(review): the point where timing resumes is not visible here.
188 BenchmarkSuspender susp;
// NOTE(review): allocated with raw new; the matching delete is not visible in
// this excerpt -- confirm the object is freed on every path.
190 ThroughputTest<ThroughputQueueType> *test =
191 new ThroughputTest<ThroughputQueueType>(size, iters, -1, -1);
// Both lambdas capture the raw pointer by value, so test must outlive both
// threads. NOTE(review): the join calls are not visible in this excerpt.
194 std::thread producer( [test] { test->producer(); } );
195 std::thread consumer( [test] { test->consumer(); } );
// Throughput benchmark with CPU indices 0 and 1 passed for the two sides;
// otherwise set up the same way as BM_ProducerConsumer.
203 void BM_ProducerConsumerAffinity(int iters, int size) {
// Declared before the setup work below.
// NOTE(review): the point where timing resumes is not visible here.
204 BenchmarkSuspender susp;
// NOTE(review): allocated with raw new; the matching delete is not visible in
// this excerpt -- confirm the object is freed on every path.
206 ThroughputTest<ThroughputQueueType> *test =
207 new ThroughputTest<ThroughputQueueType>(size, iters, 0, 1);
// Both lambdas capture the raw pointer by value, so test must outlive both
// threads. NOTE(review): the join calls are not visible in this excerpt.
210 std::thread producer( [test] { test->producer(); } );
211 std::thread consumer( [test] { test->consumer(); } );
// Latency benchmark: runs a LatencyTest on two threads and prints the
// resulting histogram. The iteration count supplied by the benchmark framework
// is ignored (its parameter name is commented out); a fixed 100000 iterations
// is used instead, with CPU indices 0 and 1 for the two sides.
219 void BM_ProducerConsumerLatency(int /* iters */, int size) {
// Declared before the setup work below.
// NOTE(review): the point where timing resumes is not visible here.
220 BenchmarkSuspender susp;
// NOTE(review): allocated with raw new; the matching delete is not visible in
// this excerpt -- confirm the object is freed on every path.
222 LatencyTest<LatencyQueueType> *test =
223 new LatencyTest<LatencyQueueType>(size, 100000, 0, 1);
// Both lambdas capture the raw pointer by value, so test must outlive both
// threads. NOTE(review): the join calls are not visible in this excerpt.
226 std::thread producer( [test] { test->producer(); } );
227 std::thread consumer( [test] { test->consumer(); } );
// Dump the latency histogram (TSV on std::cout, see printHistogram).
232 test->printHistogram();
// Benchmark registrations. Each benchmark function takes (iters, size) and is
// registered with the parameter 1048574.
// NOTE(review): presumably BENCHMARK_PARAM forwards that value as the size
// argument -- confirm against folly/Benchmark.h.
237 BENCHMARK_DRAW_LINE();
239 BENCHMARK_PARAM(BM_ProducerConsumer, 1048574);
240 BENCHMARK_PARAM(BM_ProducerConsumerAffinity, 1048574);
241 BENCHMARK_PARAM(BM_ProducerConsumerLatency, 1048574);
// Entry point: initializes glog with the program name and parses the gflags
// command line.
// NOTE(review): the remainder of main (presumably running the registered
// benchmarks) is not visible in this excerpt.
245 int main(int argc, char** argv) {
246 google::InitGoogleLogging(argv[0]);
247 gflags::ParseCommandLineFlags(&argc, &argv, true);
259 CPU op-mode(s): 32-bit, 64-bit
260 Byte Order: Little Endian
262 On-line CPU(s) list: 0-23
263 Thread(s) per core: 1
264 Core(s) per socket: 1
267 Vendor ID: GenuineIntel
270 Model name: Intel Core Processor (Haswell, no TSX)
274 Hypervisor vendor: KVM
275 Virtualization type: full
279 NUMA node0 CPU(s): 0-23
281 $ ../buck-out/gen/folly/test/producer_consumer_queue_benchmark
295 ============================================================================
296 folly/test/ProducerConsumerQueueBenchmark.cpp relative time/iter iters/s
297 ============================================================================
298 ----------------------------------------------------------------------------
299 BM_ProducerConsumer(1048574) 5.82ns 171.75M
300 BM_ProducerConsumerAffinity(1048574) 7.36ns 135.83M
301 BM_ProducerConsumerLatency(1048574) 1.67min 9.99m
302 ============================================================================