/*
 * Copyright 2016-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/concurrency/CacheLocality.h>

#include <atomic>
#include <cassert>
#include <memory>
#include <thread>
#include <unordered_map>
#include <vector>

#include <glog/logging.h>

#include <folly/Benchmark.h>

using namespace folly;

#define DECLARE_SPREADER_TAG(tag, locality, func)      \
  namespace {                                          \
  template <typename dummy>                            \
  struct tag {};                                       \
  }                                                    \
  namespace folly {                                    \
  template <>                                          \
  const CacheLocality& CacheLocality::system<tag>() {  \
    static auto* inst = new CacheLocality(locality);   \
    return *inst;                                      \
  }                                                    \
  template <>                                          \
  Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
    return func;                                       \
  }                                                    \
  }

DECLARE_SPREADER_TAG(
    ThreadLocalTag,
    CacheLocality::system<>(),
    folly::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
DECLARE_SPREADER_TAG(
    PthreadSelfTag,
    CacheLocality::system<>(),
    folly::FallbackGetcpu<HashingThreadId>::getcpu)
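
// Illustrative sketch, not folly's implementation: with no vdso getcpu and
// no usable thread-local storage, the PthreadSelf fallback declared above
// boils down to hashing a stable per-thread value into a stripe index. The
// hypothetical helper below shows that idea using only standard C++.
inline size_t pickStripeByThreadHash(size_t numStripes) {
  // The hash of std::this_thread::get_id() is fixed for a thread's lifetime,
  // so repeated calls from one thread always pick the same stripe.
  return std::hash<std::thread::id>()(std::this_thread::get_id()) %
      numStripes;
}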

BENCHMARK(AccessSpreaderUse, iters) {
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::current(16);
    folly::doNotOptimizeAway(x);
  }
}

// Benchmark scores here reflect the time for 32 threads to perform an
// atomic increment on a dual-socket E5-2660 @ 2.2GHz. Surprisingly,
// if we don't separate the counters onto unique 128 byte stripes the
// 1_stripe and 2_stripe results are identical, even though the L3 is
// claimed to have 64 byte cache lines.
//
// Getcpu refers to the vdso getcpu implementation. ThreadLocal refers
// to execution using SequentialThreadId, the fallback if the vdso
// getcpu isn't available. PthreadSelf hashes the value returned from
// pthread_self() as a fallback-fallback for systems that don't have
// thread-local support.
//
// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
// so since the stripe selection takes ~12 nanos the atomic increment
// in the L1 is ~17 nanos. At width 8_stripe_0_work the line is expected
// to ping-pong almost every operation, since the loops have the same
// duration. Widths 4 and 2 have the same behavior, but each tour of the
// cache line is 4 and 8 cores long, respectively. These all suggest a
// lower bound of 60 nanos for intra-chip handoff and increment between
// cores.
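//
// (Explicitly: the 29.31ns measured at width 16, minus the 11.94ns
// AccessSpreaderUse cost, leaves ~17ns for the uncontended L1 increment.)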
//
// With 420 nanos of busywork per contended increment, the system can
// hide all of the latency of a tour of length 4, but not quite one of
// length 8. I was a bit surprised at how much worse the non-striped
// version got. It seems that the inter-chip traffic also interferes
// with the L1-only localWork.load(). When the local work is doubled
// to about 1 microsecond we see that the inter-chip contention is still
// very important, but subdivisions on the same chip don't matter.
//
// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
//     --benchmark --bm_min_iters=1000000
// ============================================================================
// folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
// ============================================================================
// AccessSpreaderUse                                           11.94ns   83.79M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_0_work)                   985.75ns    1.01M
// contentionAtWidthGetcpu(2_stripe_0_work)                   424.02ns    2.36M
// contentionAtWidthGetcpu(4_stripe_0_work)                   190.13ns    5.26M
// contentionAtWidthGetcpu(8_stripe_0_work)                    91.86ns   10.89M
// contentionAtWidthGetcpu(16_stripe_0_work)                   29.31ns   34.12M
// contentionAtWidthGetcpu(32_stripe_0_work)                   29.53ns   33.86M
// contentionAtWidthGetcpu(64_stripe_0_work)                   29.93ns   33.41M
// contentionAtWidthThreadLocal(2_stripe_0_work)              609.21ns    1.64M
// contentionAtWidthThreadLocal(4_stripe_0_work)              303.60ns    3.29M
// contentionAtWidthThreadLocal(8_stripe_0_work)              246.57ns    4.06M
// contentionAtWidthThreadLocal(16_stripe_0_work)             154.84ns    6.46M
// contentionAtWidthThreadLocal(32_stripe_0_work)              24.14ns   41.43M
// contentionAtWidthThreadLocal(64_stripe_0_work)              23.95ns   41.75M
// contentionAtWidthPthreadSelf(2_stripe_0_work)              722.01ns    1.39M
// contentionAtWidthPthreadSelf(4_stripe_0_work)              501.56ns    1.99M
// contentionAtWidthPthreadSelf(8_stripe_0_work)              474.58ns    2.11M
// contentionAtWidthPthreadSelf(16_stripe_0_work)             300.90ns    3.32M
// contentionAtWidthPthreadSelf(32_stripe_0_work)             175.77ns    5.69M
// contentionAtWidthPthreadSelf(64_stripe_0_work)             174.88ns    5.72M
// atomicIncrBaseline(local_incr_0_work)                       16.81ns   59.51M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_500_work)                   1.82us  549.97K
// contentionAtWidthGetcpu(2_stripe_500_work)                 533.71ns    1.87M
// contentionAtWidthGetcpu(4_stripe_500_work)                 424.64ns    2.35M
// contentionAtWidthGetcpu(8_stripe_500_work)                 451.85ns    2.21M
// contentionAtWidthGetcpu(16_stripe_500_work)                425.54ns    2.35M
// contentionAtWidthGetcpu(32_stripe_500_work)                501.66ns    1.99M
// atomicIncrBaseline(local_incr_500_work)                    438.46ns    2.28M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_1000_work)                  1.88us  532.20K
// contentionAtWidthGetcpu(2_stripe_1000_work)                824.62ns    1.21M
// contentionAtWidthGetcpu(4_stripe_1000_work)                803.56ns    1.24M
// contentionAtWidthGetcpu(8_stripe_1000_work)                926.65ns    1.08M
// contentionAtWidthGetcpu(16_stripe_1000_work)               900.10ns    1.11M
// contentionAtWidthGetcpu(32_stripe_1000_work)               890.75ns    1.12M
// atomicIncrBaseline(local_incr_1000_work)                   774.47ns    1.29M
// ============================================================================
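
// Illustrative sketch, not used by the benchmark: the 128 byte stripes
// described above can also be expressed declaratively with an over-aligned
// wrapper type. PaddedCounter is hypothetical; the benchmark below instead
// carves stripes out of a raw byte buffer so the alignment stays an
// ordinary runtime constant.
struct alignas(128) PaddedCounter {
  std::atomic<size_t> value{0};
};
static_assert(sizeof(PaddedCounter) == 128, "one counter per 128 byte stripe");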

template <template <typename> class Tag>
static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
  const size_t counterAlignment = 128;
  const size_t numThreads = 32;

  folly::BenchmarkSuspender braces;

  std::atomic<size_t> ready(0);
  std::atomic<bool> go(false);

  // While in theory the cache line size is 64 bytes, experiments show
  // that we get contention on 128 byte boundaries for Ivy Bridge. The
  // extra indirection adds 1 or 2 nanos.
  assert(counterAlignment >= sizeof(std::atomic<size_t>));
  std::vector<char> raw(counterAlignment * stripes);

  // If we happen to be using the tlsRoundRobin, then sequentially
  // assigning the thread identifiers is the unlikely best-case scenario.
  // We don't want to unfairly benefit or penalize. Computing the exact
  // maximum likelihood of the probability distributions is annoying, so
  // I approximate as 2/5 of the ids that have no threads, 2/5 that have
  // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
  // wrapping back to slot 0 when we hit 1/15 and 1/5.
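  // Concretely, with numThreads == 32 the wrap-around below fires after the
  // 2nd real thread is created (32 / 15 == 2) and again after the 6th
  // (32 / 5 == 6).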

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&, iters, stripes, work]() {
      auto counters = std::vector<std::atomic<size_t>*>(stripes);
      for (size_t i = 0; i < stripes; ++i) {
        counters[i] =
            new (raw.data() + counterAlignment * i) std::atomic<size_t>();
      }

      ready++;
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        ++*(counters[AccessSpreader<Tag>::current(stripes)]);
        for (size_t j = work; j > 0; --j) {
          localWork.load();
        }
      }
    }));

    if (threads.size() == numThreads / 15 ||
        threads.size() == numThreads / 5) {
      // create a few dummy threads to wrap back around to 0 mod numCpus
      for (size_t i = threads.size(); i != numThreads; ++i) {
        std::thread([&]() { AccessSpreader<Tag>::current(stripes); }).join();
      }
    }
  }

  while (ready < numThreads) {
    std::this_thread::yield();
  }
  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}

static void
atomicIncrBaseline(size_t iters, size_t work, size_t numThreads = 32) {
  folly::BenchmarkSuspender braces;

  std::atomic<bool> go(false);

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&]() {
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<size_t> localCounter(0);
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        localCounter++;
        for (size_t j = work; j > 0; --j) {
          localWork.load();
        }
      }
    }));
  }

  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}

static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<std::atomic>(iters, stripes, work);
}

static void
contentionAtWidthThreadLocal(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
}

static void
contentionAtWidthPthreadSelf(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
}

BENCHMARK_DRAW_LINE()
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
BENCHMARK_DRAW_LINE()
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
BENCHMARK_DRAW_LINE()
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  folly::runBenchmarks();
  return 0;
}