/*
 * Copyright 2017 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/detail/CacheLocality.h>

#include <atomic>
#include <cassert>
#include <memory>
#include <thread>
#include <unordered_map>
#include <vector>

#include <glog/logging.h>

#include <folly/Benchmark.h>

using namespace folly::detail;

#define DECLARE_SPREADER_TAG(tag, locality, func)      \
  namespace {                                          \
  template <typename dummy>                            \
  struct tag {};                                       \
  }                                                    \
  namespace folly {                                    \
  namespace detail {                                   \
  template <>                                          \
  const CacheLocality& CacheLocality::system<tag>() {  \
    static auto* inst = new CacheLocality(locality);   \
    return *inst;                                      \
  }                                                    \
  template <>                                          \
  Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
    return func;                                       \
  }                                                    \
  }                                                    \
  }

DECLARE_SPREADER_TAG(
    ThreadLocalTag,
    CacheLocality::system<>(),
    folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
DECLARE_SPREADER_TAG(PthreadSelfTag,
                     CacheLocality::system<>(),
                     folly::detail::FallbackGetcpu<HashingThreadId>::getcpu)
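
// For reference, the first DECLARE_SPREADER_TAG above expands to roughly the
// following: each tag keeps the real machine topology (CacheLocality::system<>())
// and only swaps in a different getcpu implementation, which is the variable
// under test in the benchmarks below.
//
//   namespace { template <typename dummy> struct ThreadLocalTag {}; }
//   namespace folly { namespace detail {
//   template <>
//   const CacheLocality& CacheLocality::system<ThreadLocalTag>() {
//     static auto* inst = new CacheLocality(CacheLocality::system<>());
//     return *inst;
//   }
//   template <>
//   Getcpu::Func AccessSpreader<ThreadLocalTag>::pickGetcpuFunc() {
//     return FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu;
//   }
//   }} // namespace folly::detail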

BENCHMARK(AccessSpreaderUse, iters) {
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::current(16);
    folly::doNotOptimizeAway(x);
  }
}
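
// A minimal sketch (hypothetical, not part of the benchmark) of the striping
// pattern whose selection cost is measured above: AccessSpreader<>::current(n)
// maps the calling thread to one of n stripes, so that contended updates land
// on different cache lines for sufficiently distant cores.
//
//   struct alignas(128) PaddedCounter {
//     std::atomic<size_t> value{0};
//   };
//   PaddedCounter stripedCounters[16];
//
//   void stripedIncrement() {
//     // pick the stripe that is cheapest for the current CPU to own
//     stripedCounters[AccessSpreader<>::current(16)].value++;
//   }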

// Benchmark scores here reflect the time for 32 threads to perform an
// atomic increment on a dual-socket E5-2660 @ 2.2GHz.  Surprisingly,
// if we don't separate the counters onto unique 128-byte stripes the
// 1_stripe and 2_stripe results are identical, even though the L3 is
// claimed to have 64-byte cache lines.
//
// Getcpu refers to the vdso getcpu implementation.  ThreadLocal refers
// to execution using SequentialThreadId, the fallback if the vdso
// getcpu isn't available.  PthreadSelf hashes the value returned from
// pthread_self() as a fallback-fallback for systems that don't have
// thread-local support.
//
// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
// so since stripe selection takes ~12 nanos, the atomic increment in
// the L1 takes ~17 nanos.  At width 8_stripe_0_work the line is expected
// to ping-pong on almost every operation, since the loops have the same
// duration.  Widths 4 and 2 have the same behavior, but each tour of the
// cache line is 4 and 8 cores long, respectively.  These all suggest a
// lower bound of 60 nanos for intra-chip handoff and increment between
// the L1s.
//
// With 420 nanos of busywork per contended increment, the system can
// hide all of the latency of a tour of length 4, but not quite one of
// length 8.  I was a bit surprised at how much worse the non-striped
// version got.  It seems that the inter-chip traffic also interferes
// with the L1-only localWork.load().  When the local work is doubled
// to about 1 microsecond we see that the inter-chip contention is still
// very important, but subdivisions on the same chip don't matter.
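//
// As a rough consistency check of that arithmetic: at 16 stripes the
// Getcpu variant costs 29.31ns per increment while stripe selection alone
// (AccessSpreaderUse) costs 11.94ns, leaving ~17ns for the increment
// itself -- which lines up with the 16.81ns
// atomicIncrBaseline(local_incr_0_work) measurement below.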
//
// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
//     --benchmark --bm_min_iters=1000000
// ============================================================================
// folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
// ============================================================================
// AccessSpreaderUse                                           11.94ns   83.79M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_0_work)                   985.75ns    1.01M
// contentionAtWidthGetcpu(2_stripe_0_work)                   424.02ns    2.36M
// contentionAtWidthGetcpu(4_stripe_0_work)                   190.13ns    5.26M
// contentionAtWidthGetcpu(8_stripe_0_work)                    91.86ns   10.89M
// contentionAtWidthGetcpu(16_stripe_0_work)                   29.31ns   34.12M
// contentionAtWidthGetcpu(32_stripe_0_work)                   29.53ns   33.86M
// contentionAtWidthGetcpu(64_stripe_0_work)                   29.93ns   33.41M
// contentionAtWidthThreadLocal(2_stripe_0_work)              609.21ns    1.64M
// contentionAtWidthThreadLocal(4_stripe_0_work)              303.60ns    3.29M
// contentionAtWidthThreadLocal(8_stripe_0_work)              246.57ns    4.06M
// contentionAtWidthThreadLocal(16_stripe_0_work)             154.84ns    6.46M
// contentionAtWidthThreadLocal(32_stripe_0_work)              24.14ns   41.43M
// contentionAtWidthThreadLocal(64_stripe_0_work)              23.95ns   41.75M
// contentionAtWidthPthreadSelf(2_stripe_0_work)              722.01ns    1.39M
// contentionAtWidthPthreadSelf(4_stripe_0_work)              501.56ns    1.99M
// contentionAtWidthPthreadSelf(8_stripe_0_work)              474.58ns    2.11M
// contentionAtWidthPthreadSelf(16_stripe_0_work)             300.90ns    3.32M
// contentionAtWidthPthreadSelf(32_stripe_0_work)             175.77ns    5.69M
// contentionAtWidthPthreadSelf(64_stripe_0_work)             174.88ns    5.72M
// atomicIncrBaseline(local_incr_0_work)                       16.81ns   59.51M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_500_work)                   1.82us  549.97K
// contentionAtWidthGetcpu(2_stripe_500_work)                 533.71ns    1.87M
// contentionAtWidthGetcpu(4_stripe_500_work)                 424.64ns    2.35M
// contentionAtWidthGetcpu(8_stripe_500_work)                 451.85ns    2.21M
// contentionAtWidthGetcpu(16_stripe_500_work)                425.54ns    2.35M
// contentionAtWidthGetcpu(32_stripe_500_work)                501.66ns    1.99M
// atomicIncrBaseline(local_incr_500_work)                    438.46ns    2.28M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_1000_work)                  1.88us  532.20K
// contentionAtWidthGetcpu(2_stripe_1000_work)                824.62ns    1.21M
// contentionAtWidthGetcpu(4_stripe_1000_work)                803.56ns    1.24M
// contentionAtWidthGetcpu(8_stripe_1000_work)                926.65ns    1.08M
// contentionAtWidthGetcpu(16_stripe_1000_work)               900.10ns    1.11M
// contentionAtWidthGetcpu(32_stripe_1000_work)               890.75ns    1.12M
// atomicIncrBaseline(local_incr_1000_work)                   774.47ns    1.29M
// ============================================================================
template <template <typename> class Tag>
static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
  const size_t counterAlignment = 128;
  const size_t numThreads = 32;

  folly::BenchmarkSuspender braces;

  std::atomic<size_t> ready(0);
  std::atomic<bool> go(false);

  // while in theory the cache line size is 64 bytes, experiments show
  // that we get contention on 128-byte boundaries for Ivy Bridge.  The
  // extra indirection adds 1 or 2 nanos.
  assert(counterAlignment >= sizeof(std::atomic<size_t>));
  std::vector<char> raw(counterAlignment * stripes);

  // if we happen to be using the tlsRoundRobin, then sequentially
  // assigning the thread identifiers is the unlikely best-case scenario.
  // We don't want to unfairly benefit or penalize.  Computing the exact
  // maximum likelihood of the probability distributions is annoying, so
  // I approximate as 2/5 of the ids that have no threads, 2/5 that have
  // 1, 2/15 that have 2, and 1/15 that have 3.  We accomplish this by
  // wrapping back to slot 0 when we hit 1/15 and 1/5.
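  // (Concretely: with numThreads == 32 the dummy-thread batches below fire
  // after the 2nd and 6th real threads are created, since 32 / 15 == 2 and
  // 32 / 5 == 6 in integer division.)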

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&, iters, stripes, work]() {
      auto counters = std::vector<std::atomic<size_t>*>(stripes);
      for (size_t i = 0; i < stripes; ++i) {
        counters[i] =
            new (raw.data() + counterAlignment * i) std::atomic<size_t>();
      }

      ready++;
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        ++*(counters[AccessSpreader<Tag>::current(stripes)]);
        for (size_t j = work; j > 0; --j) {
          localWork.load();
        }
      }
    }));

    if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
      // create a few dummy threads to wrap back around to 0 mod numCpus
      for (size_t i = threads.size(); i != numThreads; ++i) {
        std::thread([&]() { AccessSpreader<Tag>::current(stripes); }).join();
      }
    }
  }

  while (ready < numThreads) {
    std::this_thread::yield();
  }
  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}

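// Baseline for the busywork loop: the same increment-plus-load pattern as
// contentionAtWidth, but each thread increments a function-local counter,
// so there is no cross-core cache-line traffic at all.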
static void atomicIncrBaseline(size_t iters,
                               size_t work,
                               size_t numThreads = 32) {
  folly::BenchmarkSuspender braces;

  std::atomic<bool> go(false);

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&]() {
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<size_t> localCounter(0);
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        localCounter++;
        for (size_t j = work; j > 0; --j) {
          localWork.load();
        }
      }
    }));
  }

  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}

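// Monomorphic wrappers so each spreader tag can be handed to the
// BENCHMARK_NAMED_PARAM macros below.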
static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<std::atomic>(iters, stripes, work);
}

static void contentionAtWidthThreadLocal(size_t iters,
                                         size_t stripes,
                                         size_t work) {
  contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
}

static void contentionAtWidthPthreadSelf(size_t iters,
                                         size_t stripes,
                                         size_t work) {
  contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
}

BENCHMARK_DRAW_LINE()
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
BENCHMARK_DRAW_LINE()
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
BENCHMARK_DRAW_LINE()
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  folly::runBenchmarks();
  return 0;
}