2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 // @author: Bert Maher <bertrand@fb.com>
24 #include <folly/Benchmark.h>
25 #include <folly/ProducerConsumerQueue.h>
26 #include <folly/portability/GFlags.h>
27 #include <folly/stats/Histogram.h>
28 #include <folly/stats/Histogram-defs.h>
29 #include <glog/logging.h>
33 using namespace folly;
// Element and queue types for the two kinds of test in this file.
// The throughput test moves unsigned ints through the queue; the latency
// test moves unsigned longs (its producer enqueues a timespec tv_nsec value
// cast to LatencyType).
35 typedef unsigned int ThroughputType;
36 typedef ProducerConsumerQueue<ThroughputType> ThroughputQueueType;
38 typedef unsigned long LatencyType;
39 typedef ProducerConsumerQueue<LatencyType> LatencyQueueType;
// Throughput harness: one loop writes iters_ items into queue_ and another
// loop reads iters_ items back out. The benchmark functions in this file run
// them on separate threads through test->producer() and test->consumer().
// NOTE(review): parts of this struct (member-initializer list, method
// signatures, cpu_set_t declarations, closing braces) are not visible in this
// excerpt; the comments below describe only the statements that are shown.
41 template<class QueueType>
42 struct ThroughputTest {
// Takes the queue size, the iteration count, and one CPU index per thread.
// Callers visible in this file pass either (-1, -1) or (0, 1) for the CPUs.
43 explicit ThroughputTest(size_t size, int iters, int cpu0, int cpu1)
// Writing side (presumably producer(); signature not visible here).
// Add cpu0_ to the affinity mask and apply the mask to the calling thread.
// NOTE(review): presumably guarded so that -1 means "no pinning" -- confirm.
55 CPU_SET(cpu0_, &cpuset);
56 pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
// Enqueue the values 0 .. iters_-1 in order.
58 for (int i = 0; i < iters_; ++i) {
59 ThroughputType item = i;
// Keep retrying until write() reports success.
60 while (!queue_.write((ThroughputType) item)) {
// Reading side (presumably consumer(); signature not visible here).
// Same affinity setup as above, using cpu1_.
69 CPU_SET(cpu1_, &cpuset);
70 pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
// Dequeue exactly iters_ items.
72 for (int i = 0; i < iters_; ++i) {
73 ThroughputType item = 0;
// Keep retrying until read() reports success and fills `item`.
74 while (!queue_.read(item)) {
// Pass the value to doNotOptimizeAway (folly/Benchmark.h), presumably so the
// compiler cannot discard the read as dead code.
76 doNotOptimizeAway(item);
// Atomic flag; no reader or writer of it is visible in this excerpt.
81 std::atomic<bool> done_;
// Latency harness: the writing side enqueues a clock reading roughly once per
// millisecond, and the reading side subtracts it from its own clock reading
// and records the result in a log-scale histogram.
// NOTE(review): the struct declaration line, several method signatures, loop
// headers and closing braces are not visible in this excerpt; the comments
// below describe only the statements that are shown.
87 template<class QueueType>
// Takes the queue size, the iteration count, and one CPU index per thread.
89 explicit LatencyTest(size_t size, int iters, int cpu0, int cpu1)
// Returns end - start in nanoseconds.
// The asserts (active only when assertions are enabled) require that `end` is
// not earlier than `start` and that the whole-second difference can be
// multiplied by 1e9 without overflowing uint64_t.
100 static uint64_t timespecDiff(timespec end, timespec start) {
// Same second: the answer is just the nanosecond difference.
101 if (end.tv_sec == start.tv_sec) {
102 assert(end.tv_nsec >= start.tv_nsec);
103 return uint64_t(end.tv_nsec - start.tv_nsec);
// Different seconds: scale the second difference to nanoseconds and then
// apply the (possibly negative) nanosecond-field difference.
105 assert(end.tv_sec > start.tv_sec);
106 auto diff = uint64_t(end.tv_sec - start.tv_sec);
107 assert(diff < std::numeric_limits<uint64_t>::max() / 1000000000ULL);
108 return diff * 1000000000ULL + end.tv_nsec - start.tv_nsec;
// Estimates the overhead of the clock reads themselves: times a loop of
// iters_ clock_gettime calls and stores twice the per-iteration cost in
// time_cost_. The consumer loop subtracts time_cost_ from every sample.
// NOTE(review): the factor 2 presumably accounts for one clock read on each
// side of the queue -- confirm.
111 void computeTimeCost() {
113 clock_gettime(CLOCK_REALTIME, &start);
114 for (int i = 0; i < iters_; ++i) {
116 clock_gettime(CLOCK_REALTIME, &tv);
118 clock_gettime(CLOCK_REALTIME, &end);
119 time_cost_ = 2 * timespecDiff(end, start) / iters_;
// Writing side (presumably producer(); signature not visible here).
// Add cpu0_ to the affinity mask and apply the mask to the calling thread.
126 CPU_SET(cpu0_, &cpuset);
127 pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
129 for (int i = 0; i < iters_; ++i) {
// Busy-wait, re-reading the clock, until at least 1,000,000 ns (1 ms) have
// passed since sleepstart, which spaces the samples out.
130 timespec sleeptime, sleepstart;
131 clock_gettime(CLOCK_REALTIME, &sleepstart);
133 clock_gettime(CLOCK_REALTIME, &sleeptime);
134 } while (timespecDiff(sleeptime, sleepstart) < 1000000);
// Timestamp the enqueue. Only the nanosecond field is sent; the seconds
// field is dropped. Retry until write() reports success.
137 clock_gettime(CLOCK_REALTIME, &tv);
138 while (!queue_.write((LatencyType) tv.tv_nsec)) {
// Reading side (presumably consumer(); signature not visible here).
// Same affinity setup as above, using cpu1_.
147 CPU_SET(cpu1_, &cpuset);
148 pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
150 for (int i = 0; i < iters_; ++i) {
// Keep retrying until read() reports success and fills enqueue_nsec.
151 unsigned long enqueue_nsec;
152 while (!queue_.read(enqueue_nsec)) {
// Sample = (dequeue tv_nsec) - (enqueue tv_nsec) - clock overhead, stored in
// an int.
// NOTE(review): only nanosecond fields are compared, so a sample that crosses
// a one-second boundary does not yield the true elapsed time; how such
// values are handled is not visible in this excerpt -- confirm.
156 clock_gettime(CLOCK_REALTIME, &tv);
157 int diff = tv.tv_nsec - enqueue_nsec - time_cost_;
162 // Naive log-scale bucketing.
// The visible loop condition continues while bucket is at most 30 and
// 2^bucket does not exceed diff; the value recorded is bucket - 1.
// NOTE(review): assumes the loop starts at bucket 0 and increments by one
// (init/increment lines not visible), making the recorded value the floor of
// log2(diff) for diff >= 1.
165 bucket <= 30 && (1 << bucket) <= diff;
168 hist_.addValue(bucket - 1);
// Writes the histogram to std::cout using Histogram::toTSV.
172 void printHistogram() {
173 hist_.toTSV(std::cout);
// Atomic flag; no reader or writer of it is visible in this excerpt.
177 std::atomic<bool> done_;
// Histogram of the bucket indices recorded by the reading side.
182 Histogram<int> hist_;
// Throughput benchmark without CPU pinning: builds a ThroughputTest with queue
// size `size`, `iters` items, and -1 for both CPU indices, then starts one
// thread running producer() and one running consumer().
// NOTE(review): where the BenchmarkSuspender is dismissed, the thread joins,
// and the release of `test` are not visible in this excerpt -- confirm they
// exist.
185 void BM_ProducerConsumer(int iters, int size) {
186 BenchmarkSuspender susp;
188 ThroughputTest<ThroughputQueueType> *test =
189 new ThroughputTest<ThroughputQueueType>(size, iters, -1, -1);
// The lambdas capture the raw pointer by value, so `test` must outlive both
// threads.
192 std::thread producer( [test] { test->producer(); } );
193 std::thread consumer( [test] { test->consumer(); } );
// Throughput benchmark with CPU pinning: identical to BM_ProducerConsumer
// except that the test is constructed with CPU indices 0 and 1, which the
// test passes to CPU_SET before calling pthread_setaffinity_np.
// NOTE(review): where the BenchmarkSuspender is dismissed, the thread joins,
// and the release of `test` are not visible in this excerpt -- confirm they
// exist.
201 void BM_ProducerConsumerAffinity(int iters, int size) {
202 BenchmarkSuspender susp;
204 ThroughputTest<ThroughputQueueType> *test =
205 new ThroughputTest<ThroughputQueueType>(size, iters, 0, 1);
// The lambdas capture the raw pointer by value, so `test` must outlive both
// threads.
208 std::thread producer( [test] { test->producer(); } );
209 std::thread consumer( [test] { test->consumer(); } );
// Latency benchmark: ignores the framework-supplied iteration count and always
// runs a LatencyTest with 100000 iterations on CPU indices 0 and 1, then
// prints the resulting histogram.
// NOTE(review): the thread joins and the release of `test` are not visible in
// this excerpt -- confirm they happen before printHistogram() is called.
217 void BM_ProducerConsumerLatency(int /* iters */, int size) {
218 BenchmarkSuspender susp;
220 LatencyTest<LatencyQueueType> *test =
221 new LatencyTest<LatencyQueueType>(size, 100000, 0, 1);
// The lambdas capture the raw pointer by value, so `test` must outlive both
// threads.
224 std::thread producer( [test] { test->producer(); } );
225 std::thread consumer( [test] { test->consumer(); } );
// Dump the collected latency histogram (TSV on std::cout).
230 test->printHistogram();
// Register the three benchmarks, each with the parameter 1048574.
// NOTE(review): presumably the parameter arrives as the `size` argument of
// each function (the queue size) -- confirm against folly/Benchmark.h.
235 BENCHMARK_DRAW_LINE();
237 BENCHMARK_PARAM(BM_ProducerConsumer, 1048574);
238 BENCHMARK_PARAM(BM_ProducerConsumerAffinity, 1048574);
239 BENCHMARK_PARAM(BM_ProducerConsumerLatency, 1048574);
// Program entry point: initializes glog with the program name and parses the
// command-line flags with gflags.
// NOTE(review): the rest of the body (presumably the call that runs the
// registered benchmarks, and the return) is not visible in this excerpt.
243 int main(int argc, char** argv) {
244 google::InitGoogleLogging(argv[0]);
245 gflags::ParseCommandLineFlags(&argc, &argv, true);
253 Benchmark on Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz
266 ============================================================================
267 folly/test/ProducerConsumerQueueBenchmark.cpp relative time/iter iters/s
268 ============================================================================
269 ----------------------------------------------------------------------------
270 BM_ProducerConsumer(1048574) 7.52ns 132.90M
271 BM_ProducerConsumerAffinity(1048574) 8.28ns 120.75M
272 BM_ProducerConsumerLatency(1048574) 10.00s 99.98m
273 ============================================================================