folly/test/ThreadCachedIntTest.cpp
/*
 * Copyright 2012 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "folly/ThreadCachedInt.h"
#include "folly/Hash.h"

#include <pthread.h>  // pthread_self
#include <unistd.h>   // usleep
#include <atomic>
#include <thread>
#include <vector>
#include <gtest/gtest.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include "folly/Benchmark.h"
#include "folly/Foreach.h"      // FOR_EACH_RANGE
#include "folly/ThreadLocal.h"  // folly::ThreadLocal baselines

using namespace folly;

TEST(ThreadCachedInt, SingleThreadedNotCached) {
  ThreadCachedInt<int64_t> val(0, 0);
  EXPECT_EQ(0, val.readFast());
  ++val;
  EXPECT_EQ(1, val.readFast());
  for (int i = 0; i < 41; ++i) {
    val.increment(1);
  }
  EXPECT_EQ(42, val.readFast());
  --val;
  EXPECT_EQ(41, val.readFast());
}

// Note: with a cache size of 10, increments stay in the per-thread cache
// until the flush threshold is hit, so readFast() only sees what has been
// flushed while readFull() aggregates the caches.  This is somewhat fragile
// to the implementation; if it causes problems, feel free to remove it.
TEST(ThreadCachedInt, SingleThreadedCached) {
  ThreadCachedInt<int64_t> val(0, 10);
  EXPECT_EQ(0, val.readFast());
  ++val;
  EXPECT_EQ(0, val.readFast());
  for (int i = 0; i < 7; ++i) {
    val.increment(1);
  }
  EXPECT_EQ(0, val.readFast());
  EXPECT_EQ(0, val.readFastAndReset());
  EXPECT_EQ(8, val.readFull());
  EXPECT_EQ(8, val.readFullAndReset());
  EXPECT_EQ(0, val.readFull());
  EXPECT_EQ(0, val.readFast());
}

ThreadCachedInt<int32_t> globalInt32(0, 11);
ThreadCachedInt<int64_t> globalInt64(0, 11);
int kNumInserts = 100000;
DEFINE_int32(numThreads, 8, "Number of simultaneous threads for benchmarks.");
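// Generates incFunc64()/incFunc32(): each performs
// kNumInserts / FLAGS_numThreads increments on the corresponding global
// counter.  Used by the multi-threaded cache-size benchmarks below.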
#define CREATE_INC_FUNC(size)                                       \
  void incFunc ## size () {                                         \
    const int num = kNumInserts / FLAGS_numThreads;                 \
    for (int i = 0; i < num; ++i) {                                 \
      ++globalInt ## size ;                                         \
    }                                                               \
  }
CREATE_INC_FUNC(64);
CREATE_INC_FUNC(32);

// Confirms counts are accurate with competing threads
TEST(ThreadCachedInt, MultiThreadedCached) {
  kNumInserts = 100000;
  CHECK_EQ(0, kNumInserts % FLAGS_numThreads) <<
    "FLAGS_numThreads must evenly divide kNumInserts (" << kNumInserts << ").";
  const int numPerThread = kNumInserts / FLAGS_numThreads;
  ThreadCachedInt<int64_t> TCInt64(0, numPerThread - 2);
  {
    std::atomic<bool> run(true);
    std::atomic<int> threadsDone(0);
    std::vector<std::thread> threads;
    for (int i = 0; i < FLAGS_numThreads; ++i) {
      threads.push_back(std::thread([&] {
        FOR_EACH_RANGE(k, 0, numPerThread) {
          ++TCInt64;
        }
        std::atomic_fetch_add(&threadsDone, 1);
        while (run.load()) { usleep(100); }
      }));
    }

    // We create and increment another ThreadCachedInt here to make sure it
    // doesn't interact with the other instances
    ThreadCachedInt<int64_t> otherTCInt64(0, 10);
    otherTCInt64.set(33);
    ++otherTCInt64;

    while (threadsDone.load() < FLAGS_numThreads) { usleep(100); }

    ++otherTCInt64;

    // Threads are done incrementing, but caches have not been flushed yet, so
    // we have to readFull.
    EXPECT_NE(kNumInserts, TCInt64.readFast());
    EXPECT_EQ(kNumInserts, TCInt64.readFull());

    run.store(false);
    for (auto& t : threads) {
      t.join();
    }

  }  // Caches are flushed when threads finish
  EXPECT_EQ(kNumInserts, TCInt64.readFast());
}

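// Generates BM_mt_cache_size64/BM_mt_cache_size32: sets the cache size on the
// global counter, then spawns FLAGS_numThreads threads that together perform
// roughly `iters` increments via the incFunc helpers above.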
#define MAKE_MT_CACHE_SIZE_BM(size)                             \
  void BM_mt_cache_size ## size (int iters, int cacheSize) {    \
    kNumInserts = iters;                                        \
    globalInt ## size.set(0);                                   \
    globalInt ## size.setCacheSize(cacheSize);                  \
    std::vector<std::thread> threads;                           \
    for (int i = 0; i < FLAGS_numThreads; ++i) {                \
      threads.push_back(std::thread(incFunc ## size));          \
    }                                                           \
    for (auto& t : threads) {                                   \
      t.join();                                                 \
    }                                                           \
  }
MAKE_MT_CACHE_SIZE_BM(64);
MAKE_MT_CACHE_SIZE_BM(32);

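// REG_BASELINE(name, inc_stmt) registers BM_mt_baseline_<name>: each of
// FLAGS_numThreads threads runs inc_stmt iters / FLAGS_numThreads times,
// giving a baseline to compare against the ThreadCachedInt benchmarks.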
#define REG_BASELINE(name, inc_stmt)                            \
  BENCHMARK(FB_CONCATENATE(BM_mt_baseline_, name), iters) {     \
    const int iterPerThread = iters / FLAGS_numThreads;         \
    std::vector<std::thread> threads;                           \
    for (int i = 0; i < FLAGS_numThreads; ++i) {                \
      threads.push_back(std::thread([&]() {                     \
            for (int i = 0; i < iterPerThread; ++i) {           \
              inc_stmt;                                         \
            }                                                   \
          }));                                                  \
    }                                                           \
    for (auto& t : threads) {                                   \
      t.join();                                                 \
    }                                                           \
  }

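// Baseline counters: folly::ThreadLocal wrappers, shared atomics, and raw
// __thread locals, plus the sharded-atomic alternative defined below.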
ThreadLocal<int64_t> globalTL64Baseline;
ThreadLocal<int32_t> globalTL32Baseline;
std::atomic<int64_t> globalInt64Baseline(0);
std::atomic<int32_t> globalInt32Baseline(0);
__thread int64_t global__thread64;
__thread int32_t global__thread32;

// Alternate lock-free implementation.  Achieves about the same performance,
// but uses about 20x more memory than ThreadCachedInt with 24 threads.
struct ShardedAtomicInt {
  static const int64_t kBuckets_ = 2048;
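  // kBuckets_ atomics occupy kBuckets_ * sizeof(int64_t) = 16 KB per
  // instance.  The buckets are not cache-line padded, so adjacent buckets
  // can still share a cache line and contend under heavy write traffic.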
  std::atomic<int64_t> ints_[kBuckets_];

  inline void inc(int64_t val = 1) {
    int bucket = hash::twang_mix64(pthread_self()) & (kBuckets_ - 1);
    std::atomic_fetch_add(&ints_[bucket], val);
  }

  // Read the first few buckets and extrapolate, assuming thread ids hash
  // roughly uniformly across buckets.
  int64_t readFast() {
    int64_t ret = 0;
    static const int numToRead = 8;
    FOR_EACH_RANGE(i, 0, numToRead) {
      ret += ints_[i].load(std::memory_order_relaxed);
    }
    return ret * (kBuckets_ / numToRead);
  }

  // readFull is lock-free, but has to do thousands of loads...
  int64_t readFull() {
    int64_t ret = 0;
    for (auto& i : ints_) {
      // Fun fact - using memory_order_consume below reduces perf 30-40% in
      // high contention benchmarks.
      ret += i.load(std::memory_order_relaxed);
    }
    return ret;
  }
};
ShardedAtomicInt shd_int64;

REG_BASELINE(_thread64, global__thread64 += 1);
REG_BASELINE(_thread32, global__thread32 += 1);
REG_BASELINE(ThreadLocal64, *globalTL64Baseline += 1);
REG_BASELINE(ThreadLocal32, *globalTL32Baseline += 1);
REG_BASELINE(atomic_inc64, std::atomic_fetch_add(&globalInt64Baseline, 1L));
REG_BASELINE(atomic_inc32, std::atomic_fetch_add(&globalInt32Baseline, 1));
REG_BASELINE(ShardedAtm64, shd_int64.inc());

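// Increment throughput as a function of cache size (with a cache size of 0,
// every increment goes straight to the shared value).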
BENCHMARK_PARAM(BM_mt_cache_size64, 0);
BENCHMARK_PARAM(BM_mt_cache_size64, 10);
BENCHMARK_PARAM(BM_mt_cache_size64, 100);
BENCHMARK_PARAM(BM_mt_cache_size64, 1000);
BENCHMARK_PARAM(BM_mt_cache_size32, 0);
BENCHMARK_PARAM(BM_mt_cache_size32, 10);
BENCHMARK_PARAM(BM_mt_cache_size32, 100);
BENCHMARK_PARAM(BM_mt_cache_size32, 1000);
BENCHMARK_DRAW_LINE();

// Single-threaded read benchmarks
BENCHMARK(Atomic_readFull) {
  doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed));
}
BENCHMARK(ThrCache_readFull) {
  doNotOptimizeAway(globalInt64.readFull());
}
BENCHMARK(Sharded_readFull) {
  doNotOptimizeAway(shd_int64.readFull());
}
BENCHMARK(ThrCache_readFast) {
  doNotOptimizeAway(globalInt64.readFast());
}
BENCHMARK(Sharded_readFast) {
  doNotOptimizeAway(shd_int64.readFast());
}
BENCHMARK_DRAW_LINE();

// Multi-threaded read benchmarks
REG_BASELINE(Atomic_readFull,
      doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed)));
REG_BASELINE(ThrCache_readFull, doNotOptimizeAway(globalInt64.readFull()));
REG_BASELINE(Sharded_readFull, doNotOptimizeAway(shd_int64.readFull()));
REG_BASELINE(ThrCache_readFast, doNotOptimizeAway(globalInt64.readFast()));
REG_BASELINE(Sharded_readFast, doNotOptimizeAway(shd_int64.readFast()));
BENCHMARK_DRAW_LINE();

int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  google::ParseCommandLineFlags(&argc, &argv, true);
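  // Run each benchmark for at least 10ms unless --bm_min_usec was set
  // explicitly on the command line.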
  google::SetCommandLineOptionWithMode(
    "bm_min_usec", "10000", google::SET_FLAG_IF_DEFAULT
  );
  if (FLAGS_benchmark) {
    folly::runBenchmarks();
  }
  return RUN_ALL_TESTS();
}

/*
 Ran with 20 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches

 Benchmark                                 Iters   Total t    t/iter iter/sec
 ------------------------------------------------------------------------------
 + 103% BM_mt_baseline__thread64         10000000  13.54 ms  1.354 ns  704.4 M
*       BM_mt_baseline__thread32         10000000  6.651 ms  665.1 ps    1.4 G
 +50.3% BM_mt_baseline_ThreadLocal64     10000000  9.994 ms  999.4 ps  954.2 M
 +49.9% BM_mt_baseline_ThreadLocal32     10000000  9.972 ms  997.2 ps  956.4 M
 +2650% BM_mt_baseline_atomic_inc64      10000000  182.9 ms  18.29 ns  52.13 M
 +2665% BM_mt_baseline_atomic_inc32      10000000  183.9 ms  18.39 ns  51.85 M
 +75.3% BM_mt_baseline_ShardedAtm64      10000000  11.66 ms  1.166 ns  817.8 M
 +6670% BM_mt_cache_size64/0             10000000  450.3 ms  45.03 ns  21.18 M
 +1644% BM_mt_cache_size64/10            10000000    116 ms   11.6 ns   82.2 M
 + 381% BM_mt_cache_size64/100           10000000  32.04 ms  3.204 ns  297.7 M
 + 129% BM_mt_cache_size64/1000          10000000  15.24 ms  1.524 ns  625.8 M
 +6052% BM_mt_cache_size32/0             10000000  409.2 ms  40.92 ns  23.31 M
 +1304% BM_mt_cache_size32/10            10000000  93.39 ms  9.339 ns  102.1 M
 + 298% BM_mt_cache_size32/100           10000000  26.52 ms  2.651 ns  359.7 M
 +68.1% BM_mt_cache_size32/1000          10000000  11.18 ms  1.118 ns  852.9 M
 ------------------------------------------------------------------------------
 +10.4% Atomic_readFull                  10000000  36.05 ms  3.605 ns  264.5 M
 + 619% ThrCache_readFull                10000000  235.1 ms  23.51 ns  40.57 M
 SLOW   Sharded_readFull                  1981093      2 s    1.01 us  967.3 k
*       ThrCache_readFast                10000000  32.65 ms  3.265 ns  292.1 M
 +10.0% Sharded_readFast                 10000000  35.92 ms  3.592 ns  265.5 M
 ------------------------------------------------------------------------------
 +4.54% BM_mt_baseline_Atomic_readFull   10000000  8.672 ms  867.2 ps  1.074 G
 SLOW   BM_mt_baseline_ThrCache_readFull 10000000  996.9 ms  99.69 ns  9.567 M
 SLOW   BM_mt_baseline_Sharded_readFull  10000000  891.5 ms  89.15 ns   10.7 M
*       BM_mt_baseline_ThrCache_readFast 10000000  8.295 ms  829.5 ps  1.123 G
 +12.7% BM_mt_baseline_Sharded_readFast  10000000  9.348 ms  934.8 ps   1020 M
 ------------------------------------------------------------------------------
*/