TLS cache for AccessSpreader
[folly.git] / folly / test / CacheLocalityTest.cpp
1 /*
2  * Copyright 2014 Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <folly/detail/CacheLocality.h>
18
19 #include <sched.h>
20 #include <memory>
21 #include <thread>
22 #include <type_traits>
23 #include <unordered_map>
24 #include <glog/logging.h>
25 #include <gtest/gtest.h>
26 #include <folly/Benchmark.h>
27
28 using namespace folly::detail;
29
30 /// These are the relevant nodes from a production box's sysfs tree.  If you
31 /// think this map is ugly you should see the version of this test that
32 /// used a real directory tree.  To reduce the chance of testing error
33 /// I haven't tried to remove the common prefix.
34 static std::unordered_map<std::string,std::string> fakeSysfsTree = {
35   { "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list", "0,17" },
36   { "/sys/devices/system/cpu/cpu0/cache/index0/type", "Data" },
37   { "/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list", "0,17" },
38   { "/sys/devices/system/cpu/cpu0/cache/index1/type", "Instruction" },
39   { "/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list", "0,17" },
40   { "/sys/devices/system/cpu/cpu0/cache/index2/type", "Unified" },
41   { "/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list", "0-8,17-23" },
42   { "/sys/devices/system/cpu/cpu0/cache/index3/type", "Unified" },
43   { "/sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_list", "1,18" },
44   { "/sys/devices/system/cpu/cpu1/cache/index0/type", "Data" },
45   { "/sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_list", "1,18" },
46   { "/sys/devices/system/cpu/cpu1/cache/index1/type", "Instruction" },
47   { "/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list", "1,18" },
48   { "/sys/devices/system/cpu/cpu1/cache/index2/type", "Unified" },
49   { "/sys/devices/system/cpu/cpu1/cache/index3/shared_cpu_list", "0-8,17-23" },
50   { "/sys/devices/system/cpu/cpu1/cache/index3/type", "Unified" },
51   { "/sys/devices/system/cpu/cpu2/cache/index0/shared_cpu_list", "2,19" },
52   { "/sys/devices/system/cpu/cpu2/cache/index0/type", "Data" },
53   { "/sys/devices/system/cpu/cpu2/cache/index1/shared_cpu_list", "2,19" },
54   { "/sys/devices/system/cpu/cpu2/cache/index1/type", "Instruction" },
55   { "/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list", "2,19" },
56   { "/sys/devices/system/cpu/cpu2/cache/index2/type", "Unified" },
57   { "/sys/devices/system/cpu/cpu2/cache/index3/shared_cpu_list", "0-8,17-23" },
58   { "/sys/devices/system/cpu/cpu2/cache/index3/type", "Unified" },
59   { "/sys/devices/system/cpu/cpu3/cache/index0/shared_cpu_list", "3,20" },
60   { "/sys/devices/system/cpu/cpu3/cache/index0/type", "Data" },
61   { "/sys/devices/system/cpu/cpu3/cache/index1/shared_cpu_list", "3,20" },
62   { "/sys/devices/system/cpu/cpu3/cache/index1/type", "Instruction" },
63   { "/sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_list", "3,20" },
64   { "/sys/devices/system/cpu/cpu3/cache/index2/type", "Unified" },
65   { "/sys/devices/system/cpu/cpu3/cache/index3/shared_cpu_list", "0-8,17-23" },
66   { "/sys/devices/system/cpu/cpu3/cache/index3/type", "Unified" },
67   { "/sys/devices/system/cpu/cpu4/cache/index0/shared_cpu_list", "4,21" },
68   { "/sys/devices/system/cpu/cpu4/cache/index0/type", "Data" },
69   { "/sys/devices/system/cpu/cpu4/cache/index1/shared_cpu_list", "4,21" },
70   { "/sys/devices/system/cpu/cpu4/cache/index1/type", "Instruction" },
71   { "/sys/devices/system/cpu/cpu4/cache/index2/shared_cpu_list", "4,21" },
72   { "/sys/devices/system/cpu/cpu4/cache/index2/type", "Unified" },
73   { "/sys/devices/system/cpu/cpu4/cache/index3/shared_cpu_list", "0-8,17-23" },
74   { "/sys/devices/system/cpu/cpu4/cache/index3/type", "Unified" },
75   { "/sys/devices/system/cpu/cpu5/cache/index0/shared_cpu_list", "5-6" },
76   { "/sys/devices/system/cpu/cpu5/cache/index0/type", "Data" },
77   { "/sys/devices/system/cpu/cpu5/cache/index1/shared_cpu_list", "5-6" },
78   { "/sys/devices/system/cpu/cpu5/cache/index1/type", "Instruction" },
79   { "/sys/devices/system/cpu/cpu5/cache/index2/shared_cpu_list", "5-6" },
80   { "/sys/devices/system/cpu/cpu5/cache/index2/type", "Unified" },
81   { "/sys/devices/system/cpu/cpu5/cache/index3/shared_cpu_list", "0-8,17-23" },
82   { "/sys/devices/system/cpu/cpu5/cache/index3/type", "Unified" },
83   { "/sys/devices/system/cpu/cpu6/cache/index0/shared_cpu_list", "5-6" },
84   { "/sys/devices/system/cpu/cpu6/cache/index0/type", "Data" },
85   { "/sys/devices/system/cpu/cpu6/cache/index1/shared_cpu_list", "5-6" },
86   { "/sys/devices/system/cpu/cpu6/cache/index1/type", "Instruction" },
87   { "/sys/devices/system/cpu/cpu6/cache/index2/shared_cpu_list", "5-6" },
88   { "/sys/devices/system/cpu/cpu6/cache/index2/type", "Unified" },
89   { "/sys/devices/system/cpu/cpu6/cache/index3/shared_cpu_list", "0-8,17-23" },
90   { "/sys/devices/system/cpu/cpu6/cache/index3/type", "Unified" },
91   { "/sys/devices/system/cpu/cpu7/cache/index0/shared_cpu_list", "7,22" },
92   { "/sys/devices/system/cpu/cpu7/cache/index0/type", "Data" },
93   { "/sys/devices/system/cpu/cpu7/cache/index1/shared_cpu_list", "7,22" },
94   { "/sys/devices/system/cpu/cpu7/cache/index1/type", "Instruction" },
95   { "/sys/devices/system/cpu/cpu7/cache/index2/shared_cpu_list", "7,22" },
96   { "/sys/devices/system/cpu/cpu7/cache/index2/type", "Unified" },
97   { "/sys/devices/system/cpu/cpu7/cache/index3/shared_cpu_list", "0-8,17-23" },
98   { "/sys/devices/system/cpu/cpu7/cache/index3/type", "Unified" },
99   { "/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list", "8,23" },
100   { "/sys/devices/system/cpu/cpu8/cache/index0/type", "Data" },
101   { "/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list", "8,23" },
102   { "/sys/devices/system/cpu/cpu8/cache/index1/type", "Instruction" },
103   { "/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list", "8,23" },
104   { "/sys/devices/system/cpu/cpu8/cache/index2/type", "Unified" },
105   { "/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list", "0-8,17-23" },
106   { "/sys/devices/system/cpu/cpu8/cache/index3/type", "Unified" },
107   { "/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list", "9,24" },
108   { "/sys/devices/system/cpu/cpu9/cache/index0/type", "Data" },
109   { "/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list", "9,24" },
110   { "/sys/devices/system/cpu/cpu9/cache/index1/type", "Instruction" },
111   { "/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list", "9,24" },
112   { "/sys/devices/system/cpu/cpu9/cache/index2/type", "Unified" },
113   { "/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list", "9-16,24-31" },
114   { "/sys/devices/system/cpu/cpu9/cache/index3/type", "Unified" },
115   { "/sys/devices/system/cpu/cpu10/cache/index0/shared_cpu_list", "10,25" },
116   { "/sys/devices/system/cpu/cpu10/cache/index0/type", "Data" },
117   { "/sys/devices/system/cpu/cpu10/cache/index1/shared_cpu_list", "10,25" },
118   { "/sys/devices/system/cpu/cpu10/cache/index1/type", "Instruction" },
119   { "/sys/devices/system/cpu/cpu10/cache/index2/shared_cpu_list", "10,25" },
120   { "/sys/devices/system/cpu/cpu10/cache/index2/type", "Unified" },
121   { "/sys/devices/system/cpu/cpu10/cache/index3/shared_cpu_list", "9-16,24-31"},
122   { "/sys/devices/system/cpu/cpu10/cache/index3/type", "Unified" },
123   { "/sys/devices/system/cpu/cpu11/cache/index0/shared_cpu_list", "11,26" },
124   { "/sys/devices/system/cpu/cpu11/cache/index0/type", "Data" },
125   { "/sys/devices/system/cpu/cpu11/cache/index1/shared_cpu_list", "11,26" },
126   { "/sys/devices/system/cpu/cpu11/cache/index1/type", "Instruction" },
127   { "/sys/devices/system/cpu/cpu11/cache/index2/shared_cpu_list", "11,26" },
128   { "/sys/devices/system/cpu/cpu11/cache/index2/type", "Unified" },
129   { "/sys/devices/system/cpu/cpu11/cache/index3/shared_cpu_list", "9-16,24-31"},
130   { "/sys/devices/system/cpu/cpu11/cache/index3/type", "Unified" },
131   { "/sys/devices/system/cpu/cpu12/cache/index0/shared_cpu_list", "12,27" },
132   { "/sys/devices/system/cpu/cpu12/cache/index0/type", "Data" },
133   { "/sys/devices/system/cpu/cpu12/cache/index1/shared_cpu_list", "12,27" },
134   { "/sys/devices/system/cpu/cpu12/cache/index1/type", "Instruction" },
135   { "/sys/devices/system/cpu/cpu12/cache/index2/shared_cpu_list", "12,27" },
136   { "/sys/devices/system/cpu/cpu12/cache/index2/type", "Unified" },
137   { "/sys/devices/system/cpu/cpu12/cache/index3/shared_cpu_list", "9-16,24-31"},
138   { "/sys/devices/system/cpu/cpu12/cache/index3/type", "Unified" },
139   { "/sys/devices/system/cpu/cpu13/cache/index0/shared_cpu_list", "13,28" },
140   { "/sys/devices/system/cpu/cpu13/cache/index0/type", "Data" },
141   { "/sys/devices/system/cpu/cpu13/cache/index1/shared_cpu_list", "13,28" },
142   { "/sys/devices/system/cpu/cpu13/cache/index1/type", "Instruction" },
143   { "/sys/devices/system/cpu/cpu13/cache/index2/shared_cpu_list", "13,28" },
144   { "/sys/devices/system/cpu/cpu13/cache/index2/type", "Unified" },
145   { "/sys/devices/system/cpu/cpu13/cache/index3/shared_cpu_list", "9-16,24-31"},
146   { "/sys/devices/system/cpu/cpu13/cache/index3/type", "Unified" },
147   { "/sys/devices/system/cpu/cpu14/cache/index0/shared_cpu_list", "14,29" },
148   { "/sys/devices/system/cpu/cpu14/cache/index0/type", "Data" },
149   { "/sys/devices/system/cpu/cpu14/cache/index1/shared_cpu_list", "14,29" },
150   { "/sys/devices/system/cpu/cpu14/cache/index1/type", "Instruction" },
151   { "/sys/devices/system/cpu/cpu14/cache/index2/shared_cpu_list", "14,29" },
152   { "/sys/devices/system/cpu/cpu14/cache/index2/type", "Unified" },
153   { "/sys/devices/system/cpu/cpu14/cache/index3/shared_cpu_list", "9-16,24-31"},
154   { "/sys/devices/system/cpu/cpu14/cache/index3/type", "Unified" },
155   { "/sys/devices/system/cpu/cpu15/cache/index0/shared_cpu_list", "15,30" },
156   { "/sys/devices/system/cpu/cpu15/cache/index0/type", "Data" },
157   { "/sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_list", "15,30" },
158   { "/sys/devices/system/cpu/cpu15/cache/index1/type", "Instruction" },
159   { "/sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_list", "15,30" },
160   { "/sys/devices/system/cpu/cpu15/cache/index2/type", "Unified" },
161   { "/sys/devices/system/cpu/cpu15/cache/index3/shared_cpu_list", "9-16,24-31"},
162   { "/sys/devices/system/cpu/cpu15/cache/index3/type", "Unified" },
163   { "/sys/devices/system/cpu/cpu16/cache/index0/shared_cpu_list", "16,31" },
164   { "/sys/devices/system/cpu/cpu16/cache/index0/type", "Data" },
165   { "/sys/devices/system/cpu/cpu16/cache/index1/shared_cpu_list", "16,31" },
166   { "/sys/devices/system/cpu/cpu16/cache/index1/type", "Instruction" },
167   { "/sys/devices/system/cpu/cpu16/cache/index2/shared_cpu_list", "16,31" },
168   { "/sys/devices/system/cpu/cpu16/cache/index2/type", "Unified" },
169   { "/sys/devices/system/cpu/cpu16/cache/index3/shared_cpu_list", "9-16,24-31"},
170   { "/sys/devices/system/cpu/cpu16/cache/index3/type", "Unified" },
171   { "/sys/devices/system/cpu/cpu17/cache/index0/shared_cpu_list", "0,17" },
172   { "/sys/devices/system/cpu/cpu17/cache/index0/type", "Data" },
173   { "/sys/devices/system/cpu/cpu17/cache/index1/shared_cpu_list", "0,17" },
174   { "/sys/devices/system/cpu/cpu17/cache/index1/type", "Instruction" },
175   { "/sys/devices/system/cpu/cpu17/cache/index2/shared_cpu_list", "0,17" },
176   { "/sys/devices/system/cpu/cpu17/cache/index2/type", "Unified" },
177   { "/sys/devices/system/cpu/cpu17/cache/index3/shared_cpu_list", "0-8,17-23" },
178   { "/sys/devices/system/cpu/cpu17/cache/index3/type", "Unified" },
179   { "/sys/devices/system/cpu/cpu18/cache/index0/shared_cpu_list", "1,18" },
180   { "/sys/devices/system/cpu/cpu18/cache/index0/type", "Data" },
181   { "/sys/devices/system/cpu/cpu18/cache/index1/shared_cpu_list", "1,18" },
182   { "/sys/devices/system/cpu/cpu18/cache/index1/type", "Instruction" },
183   { "/sys/devices/system/cpu/cpu18/cache/index2/shared_cpu_list", "1,18" },
184   { "/sys/devices/system/cpu/cpu18/cache/index2/type", "Unified" },
185   { "/sys/devices/system/cpu/cpu18/cache/index3/shared_cpu_list", "0-8,17-23" },
186   { "/sys/devices/system/cpu/cpu18/cache/index3/type", "Unified" },
187   { "/sys/devices/system/cpu/cpu19/cache/index0/shared_cpu_list", "2,19" },
188   { "/sys/devices/system/cpu/cpu19/cache/index0/type", "Data" },
189   { "/sys/devices/system/cpu/cpu19/cache/index1/shared_cpu_list", "2,19" },
190   { "/sys/devices/system/cpu/cpu19/cache/index1/type", "Instruction" },
191   { "/sys/devices/system/cpu/cpu19/cache/index2/shared_cpu_list", "2,19" },
192   { "/sys/devices/system/cpu/cpu19/cache/index2/type", "Unified" },
193   { "/sys/devices/system/cpu/cpu19/cache/index3/shared_cpu_list", "0-8,17-23" },
194   { "/sys/devices/system/cpu/cpu19/cache/index3/type", "Unified" },
195   { "/sys/devices/system/cpu/cpu20/cache/index0/shared_cpu_list", "3,20" },
196   { "/sys/devices/system/cpu/cpu20/cache/index0/type", "Data" },
197   { "/sys/devices/system/cpu/cpu20/cache/index1/shared_cpu_list", "3,20" },
198   { "/sys/devices/system/cpu/cpu20/cache/index1/type", "Instruction" },
199   { "/sys/devices/system/cpu/cpu20/cache/index2/shared_cpu_list", "3,20" },
200   { "/sys/devices/system/cpu/cpu20/cache/index2/type", "Unified" },
201   { "/sys/devices/system/cpu/cpu20/cache/index3/shared_cpu_list", "0-8,17-23" },
202   { "/sys/devices/system/cpu/cpu20/cache/index3/type", "Unified" },
203   { "/sys/devices/system/cpu/cpu21/cache/index0/shared_cpu_list", "4,21" },
204   { "/sys/devices/system/cpu/cpu21/cache/index0/type", "Data" },
205   { "/sys/devices/system/cpu/cpu21/cache/index1/shared_cpu_list", "4,21" },
206   { "/sys/devices/system/cpu/cpu21/cache/index1/type", "Instruction" },
207   { "/sys/devices/system/cpu/cpu21/cache/index2/shared_cpu_list", "4,21" },
208   { "/sys/devices/system/cpu/cpu21/cache/index2/type", "Unified" },
209   { "/sys/devices/system/cpu/cpu21/cache/index3/shared_cpu_list", "0-8,17-23" },
210   { "/sys/devices/system/cpu/cpu21/cache/index3/type", "Unified" },
211   { "/sys/devices/system/cpu/cpu22/cache/index0/shared_cpu_list", "7,22" },
212   { "/sys/devices/system/cpu/cpu22/cache/index0/type", "Data" },
213   { "/sys/devices/system/cpu/cpu22/cache/index1/shared_cpu_list", "7,22" },
214   { "/sys/devices/system/cpu/cpu22/cache/index1/type", "Instruction" },
215   { "/sys/devices/system/cpu/cpu22/cache/index2/shared_cpu_list", "7,22" },
216   { "/sys/devices/system/cpu/cpu22/cache/index2/type", "Unified" },
217   { "/sys/devices/system/cpu/cpu22/cache/index3/shared_cpu_list", "0-8,17-23" },
218   { "/sys/devices/system/cpu/cpu22/cache/index3/type", "Unified" },
219   { "/sys/devices/system/cpu/cpu23/cache/index0/shared_cpu_list", "8,23" },
220   { "/sys/devices/system/cpu/cpu23/cache/index0/type", "Data" },
221   { "/sys/devices/system/cpu/cpu23/cache/index1/shared_cpu_list", "8,23" },
222   { "/sys/devices/system/cpu/cpu23/cache/index1/type", "Instruction" },
223   { "/sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_list", "8,23" },
224   { "/sys/devices/system/cpu/cpu23/cache/index2/type", "Unified" },
225   { "/sys/devices/system/cpu/cpu23/cache/index3/shared_cpu_list", "0-8,17-23" },
226   { "/sys/devices/system/cpu/cpu23/cache/index3/type", "Unified" },
227   { "/sys/devices/system/cpu/cpu24/cache/index0/shared_cpu_list", "9,24" },
228   { "/sys/devices/system/cpu/cpu24/cache/index0/type", "Data" },
229   { "/sys/devices/system/cpu/cpu24/cache/index1/shared_cpu_list", "9,24" },
230   { "/sys/devices/system/cpu/cpu24/cache/index1/type", "Instruction" },
231   { "/sys/devices/system/cpu/cpu24/cache/index2/shared_cpu_list", "9,24" },
232   { "/sys/devices/system/cpu/cpu24/cache/index2/type", "Unified" },
233   { "/sys/devices/system/cpu/cpu24/cache/index3/shared_cpu_list", "9-16,24-31"},
234   { "/sys/devices/system/cpu/cpu24/cache/index3/type", "Unified" },
235   { "/sys/devices/system/cpu/cpu25/cache/index0/shared_cpu_list", "10,25" },
236   { "/sys/devices/system/cpu/cpu25/cache/index0/type", "Data" },
237   { "/sys/devices/system/cpu/cpu25/cache/index1/shared_cpu_list", "10,25" },
238   { "/sys/devices/system/cpu/cpu25/cache/index1/type", "Instruction" },
239   { "/sys/devices/system/cpu/cpu25/cache/index2/shared_cpu_list", "10,25" },
240   { "/sys/devices/system/cpu/cpu25/cache/index2/type", "Unified" },
241   { "/sys/devices/system/cpu/cpu25/cache/index3/shared_cpu_list", "9-16,24-31"},
242   { "/sys/devices/system/cpu/cpu25/cache/index3/type", "Unified" },
243   { "/sys/devices/system/cpu/cpu26/cache/index0/shared_cpu_list", "11,26" },
244   { "/sys/devices/system/cpu/cpu26/cache/index0/type", "Data" },
245   { "/sys/devices/system/cpu/cpu26/cache/index1/shared_cpu_list", "11,26" },
246   { "/sys/devices/system/cpu/cpu26/cache/index1/type", "Instruction" },
247   { "/sys/devices/system/cpu/cpu26/cache/index2/shared_cpu_list", "11,26" },
248   { "/sys/devices/system/cpu/cpu26/cache/index2/type", "Unified" },
249   { "/sys/devices/system/cpu/cpu26/cache/index3/shared_cpu_list", "9-16,24-31"},
250   { "/sys/devices/system/cpu/cpu26/cache/index3/type", "Unified" },
251   { "/sys/devices/system/cpu/cpu27/cache/index0/shared_cpu_list", "12,27" },
252   { "/sys/devices/system/cpu/cpu27/cache/index0/type", "Data" },
253   { "/sys/devices/system/cpu/cpu27/cache/index1/shared_cpu_list", "12,27" },
254   { "/sys/devices/system/cpu/cpu27/cache/index1/type", "Instruction" },
255   { "/sys/devices/system/cpu/cpu27/cache/index2/shared_cpu_list", "12,27" },
256   { "/sys/devices/system/cpu/cpu27/cache/index2/type", "Unified" },
257   { "/sys/devices/system/cpu/cpu27/cache/index3/shared_cpu_list", "9-16,24-31"},
258   { "/sys/devices/system/cpu/cpu27/cache/index3/type", "Unified" },
259   { "/sys/devices/system/cpu/cpu28/cache/index0/shared_cpu_list", "13,28" },
260   { "/sys/devices/system/cpu/cpu28/cache/index0/type", "Data" },
261   { "/sys/devices/system/cpu/cpu28/cache/index1/shared_cpu_list", "13,28" },
262   { "/sys/devices/system/cpu/cpu28/cache/index1/type", "Instruction" },
263   { "/sys/devices/system/cpu/cpu28/cache/index2/shared_cpu_list", "13,28" },
264   { "/sys/devices/system/cpu/cpu28/cache/index2/type", "Unified" },
265   { "/sys/devices/system/cpu/cpu28/cache/index3/shared_cpu_list", "9-16,24-31"},
266   { "/sys/devices/system/cpu/cpu28/cache/index3/type", "Unified" },
267   { "/sys/devices/system/cpu/cpu29/cache/index0/shared_cpu_list", "14,29" },
268   { "/sys/devices/system/cpu/cpu29/cache/index0/type", "Data" },
269   { "/sys/devices/system/cpu/cpu29/cache/index1/shared_cpu_list", "14,29" },
270   { "/sys/devices/system/cpu/cpu29/cache/index1/type", "Instruction" },
271   { "/sys/devices/system/cpu/cpu29/cache/index2/shared_cpu_list", "14,29" },
272   { "/sys/devices/system/cpu/cpu29/cache/index2/type", "Unified" },
273   { "/sys/devices/system/cpu/cpu29/cache/index3/shared_cpu_list", "9-16,24-31"},
274   { "/sys/devices/system/cpu/cpu29/cache/index3/type", "Unified" },
275   { "/sys/devices/system/cpu/cpu30/cache/index0/shared_cpu_list", "15,30" },
276   { "/sys/devices/system/cpu/cpu30/cache/index0/type", "Data" },
277   { "/sys/devices/system/cpu/cpu30/cache/index1/shared_cpu_list", "15,30" },
278   { "/sys/devices/system/cpu/cpu30/cache/index1/type", "Instruction" },
279   { "/sys/devices/system/cpu/cpu30/cache/index2/shared_cpu_list", "15,30" },
280   { "/sys/devices/system/cpu/cpu30/cache/index2/type", "Unified" },
281   { "/sys/devices/system/cpu/cpu30/cache/index3/shared_cpu_list", "9-16,24-31"},
282   { "/sys/devices/system/cpu/cpu30/cache/index3/type", "Unified" },
283   { "/sys/devices/system/cpu/cpu31/cache/index0/shared_cpu_list", "16,31" },
284   { "/sys/devices/system/cpu/cpu31/cache/index0/type", "Data" },
285   { "/sys/devices/system/cpu/cpu31/cache/index1/shared_cpu_list", "16,31" },
286   { "/sys/devices/system/cpu/cpu31/cache/index1/type", "Instruction" },
287   { "/sys/devices/system/cpu/cpu31/cache/index2/shared_cpu_list", "16,31" },
288   { "/sys/devices/system/cpu/cpu31/cache/index2/type", "Unified" },
289   { "/sys/devices/system/cpu/cpu31/cache/index3/shared_cpu_list", "9-16,24-31"},
290   { "/sys/devices/system/cpu/cpu31/cache/index3/type", "Unified" }
291 };
292
293 /// This is the expected CacheLocality structure for fakeSysfsTree
294 static const CacheLocality nonUniformExampleLocality = {
295   32,
296   { 16, 16, 2 },
297   { 0, 2, 4, 6, 8, 10, 11, 12, 14, 16, 18, 20, 22, 24, 26, 28,
298     30, 1, 3, 5, 7, 9, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 }
299 };
300
301 TEST(CacheLocality, FakeSysfs) {
302   auto parsed = CacheLocality::readFromSysfsTree([](std::string name) {
303     auto iter = fakeSysfsTree.find(name);
304     return iter == fakeSysfsTree.end() ? std::string() : iter->second;
305   });
306
307   auto& expected = nonUniformExampleLocality;
308   EXPECT_EQ(expected.numCpus, parsed.numCpus);
309   EXPECT_EQ(expected.numCachesByLevel, parsed.numCachesByLevel);
310   EXPECT_EQ(expected.localityIndexByCpu, parsed.localityIndexByCpu);
311 }
312
313 TEST(Getcpu, VdsoGetcpu) {
314   unsigned cpu;
315   Getcpu::vdsoFunc()(&cpu, nullptr, nullptr);
316
317   EXPECT_TRUE(cpu < CPU_SETSIZE);
318 }
319
320 TEST(SequentialThreadId, Simple) {
321   unsigned cpu = 0;
322   auto rv = SequentialThreadId<std::atomic>::getcpu(&cpu, nullptr, nullptr);
323   EXPECT_EQ(rv, 0);
324   EXPECT_TRUE(cpu > 0);
325   unsigned again;
326   SequentialThreadId<std::atomic>::getcpu(&again, nullptr, nullptr);
327   EXPECT_EQ(cpu, again);
328 }
329
330 static FOLLY_TLS unsigned testingCpu = 0;
331
332 static int testingGetcpu(unsigned* cpu, unsigned* node, void* unused) {
333   if (cpu != nullptr) {
334     *cpu = testingCpu;
335   }
336   if (node != nullptr) {
337     *node = testingCpu;
338   }
339   return 0;
340 }
341
342 TEST(AccessSpreader, Stubbed) {
343   std::vector<std::unique_ptr<AccessSpreader<>>> spreaders(100);
344   for (size_t s = 1; s < spreaders.size(); ++s) {
345     spreaders[s].reset(new AccessSpreader<>(
346         s, nonUniformExampleLocality, &testingGetcpu));
347   }
348   std::vector<size_t> cpusInLocalityOrder = {
349       0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 6, 7, 22, 8, 23, 9, 24, 10, 25,
350       11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31 };
351   for (size_t i = 0; i < 32; ++i) {
352     // extra i * 32 is to check wrapping behavior of impl
353     testingCpu = cpusInLocalityOrder[i] + i * 64;
354     for (size_t s = 1; s < spreaders.size(); ++s) {
355       EXPECT_EQ((i * s) / 32, spreaders[s]->current())
356           << "i=" << i << ", cpu=" << testingCpu << ", s=" << s;
357     }
358   }
359 }
360
361 TEST(AccessSpreader, Default) {
362   AccessSpreader<> spreader(16);
363   EXPECT_LT(spreader.current(), 16);
364 }
365
366 TEST(AccessSpreader, Shared) {
367   for (size_t s = 1; s < 200; ++s) {
368     EXPECT_LT(AccessSpreader<>::shared(s).current(), s);
369   }
370 }
371
372 TEST(AccessSpreader, Statics) {
373   LOG(INFO) << "stripeByCore.numStripes() = "
374             << AccessSpreader<>::stripeByCore.numStripes();
375   LOG(INFO) << "stripeByChip.numStripes() = "
376             << AccessSpreader<>::stripeByChip.numStripes();
377   for (size_t s = 1; s < 200; ++s) {
378     EXPECT_LT(AccessSpreader<>::current(s), s);
379   }
380 }
381
382 TEST(AccessSpreader, Wrapping) {
383   // this test won't pass unless locality.numCpus divides kMaxCpus
384   auto numCpus = 16;
385   auto locality = CacheLocality::uniform(numCpus);
386   for (size_t s = 1; s < 200; ++s) {
387     AccessSpreader<> spreader(s, locality, &testingGetcpu);
388     for (size_t c = 0; c < 400; ++c) {
389       testingCpu = c;
390       auto observed = spreader.current();
391       testingCpu = c % numCpus;
392       auto expected = spreader.current();
393       EXPECT_EQ(expected, observed)
394           << "numCpus=" << numCpus << ", s=" << s << ", c=" << c;
395     }
396   }
397 }
398
399 // Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with
400 // a 2.2GHz Xeon
401 // ============================================================================
402 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
403 // ============================================================================
404 // LocalAccessSpreaderUse                                      20.77ns   48.16M
405 // SharedAccessSpreaderUse                                     21.95ns   45.55M
406 // AccessSpreaderConstruction                                 466.56ns    2.14M
407 // ============================================================================
408
409 BENCHMARK(LocalAccessSpreaderUse, iters) {
410   folly::BenchmarkSuspender braces;
411   AccessSpreader<> spreader(16);
412   braces.dismiss();
413
414   for (unsigned long i = 0; i < iters; ++i) {
415     auto x = spreader.current();
416     folly::doNotOptimizeAway(x);
417   }
418 }
419
420 BENCHMARK(SharedAccessSpreaderUse, iters) {
421   for (unsigned long i = 0; i < iters; ++i) {
422     auto x = AccessSpreader<>::current(16);
423     folly::doNotOptimizeAway(x);
424   }
425 }
426
427 BENCHMARK(AccessSpreaderConstruction, iters) {
428   std::aligned_storage<sizeof(AccessSpreader<>),
429                        std::alignment_of<AccessSpreader<>>::value>::type raw;
430   for (unsigned long i = 0; i < iters; ++i) {
431     auto x = new (&raw) AccessSpreader<>(16);
432     folly::doNotOptimizeAway(x);
433     x->~AccessSpreader();
434   }
435 }
436
/// Selects how the contention benchmark picks a stripe in its hot loop.
enum class SpreaderType {
  // vdso getcpu with a locally constructed AccessSpreader
  GETCPU,
  // AccessSpreader<>::current(numStripes) called inside the hot loop
  SHARED,
  // SequentialThreadId, the fallback used when vdso getcpu is unavailable
  TLS_RR
};
438
439 // Benchmark scores here reflect the time for 32 threads to perform an
440 // atomic increment on a dual-socket E5-2660 @ 2.2GHz.  Surprisingly,
441 // if we don't separate the counters onto unique 128 byte stripes the
442 // 1_stripe and 2_stripe results are identical, even though the L3 is
443 // claimed to have 64 byte cache lines.
444 //
445 // _stub means there was no call to getcpu or the tls round-robin
446 // implementation, because for a single stripe the cpu doesn't matter.
447 // _getcpu refers to the vdso getcpu implementation with a locally
448 // constructed AccessSpreader.  _tls_rr refers to execution using
449 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
450 // _shared refers to calling AccessSpreader<>::current(numStripes) inside
451 // the hot loop.
452 //
453 // At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic, so
454 // since the stripe selection is 6 nanos the atomic increment in the L1 is
455 // ~15 nanos.  At width 8_stripe_0_work the line is expected to ping-pong
456 // almost every operation, since the loops have the same duration.
457 // Widths 4 and 2 have the same behavior, but each tour of the cache line
458 // is 4 and 8 cores long, respectively.  These all suggest a lower bound
459 // of ~60 nanos for intra-chip handoff and increment between the L1s.
460 //
461 // With 396 nanos (500 std::memory_order_seq_cst loads) of busywork per
462 // contended increment, the system can hide all of the latency of a tour
463 // of length 4, but not quite one of length 8.  I was a bit surprised
464 // at how much worse the non-striped version got.  It seems that the
465 // inter-chip traffic also interferes with the L1-only localWork.load().
466 // When the local work is doubled to 776 nanoseconds we see that the
467 // inter-chip contention is still very important, but subdivisions on
468 // the same chip don't matter.
469 //
470 // sudo nice -n -20
471 //   _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
472 // ============================================================================
473 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
474 // ============================================================================
475 // LocalAccessSpreaderUse                                       6.34ns  157.75M
476 // SharedAccessSpreaderUse                                      6.34ns  157.75M
477 // AccessSpreaderConstruction                                 328.19ns    3.05M
478 // ----------------------------------------------------------------------------
479 // contentionAtWidth(1_stripe_0_work_stub)                    909.99ns    1.10M
480 // contentionAtWidth(2_stripe_0_work_getcpu)                  527.54ns    1.90M
481 // contentionAtWidth(4_stripe_0_work_getcpu)                  260.28ns    3.84M
482 // contentionAtWidth(8_stripe_0_work_getcpu)                  131.82ns    7.59M
483 // contentionAtWidth(16_stripe_0_work_getcpu)                  25.92ns   38.58M
484 // contentionAtWidth(32_stripe_0_work_getcpu)                  21.80ns   45.88M
485 // contentionAtWidth(64_stripe_0_work_getcpu)                  20.06ns   49.85M
486 // contentionAtWidth(2_stripe_0_work_tls_rr)                  759.21ns    1.32M
487 // contentionAtWidth(4_stripe_0_work_tls_rr)                  607.46ns    1.65M
488 // contentionAtWidth(8_stripe_0_work_tls_rr)                  403.79ns    2.48M
489 // contentionAtWidth(16_stripe_0_work_tls_rr)                 188.14ns    5.32M
490 // contentionAtWidth(32_stripe_0_work_tls_rr)                 131.59ns    7.60M
491 // contentionAtWidth(64_stripe_0_work_tls_rr)                 103.56ns    9.66M
492 // contentionAtWidth(2_stripe_0_work_shared)                  553.07ns    1.81M
493 // contentionAtWidth(4_stripe_0_work_shared)                  274.23ns    3.65M
494 // contentionAtWidth(8_stripe_0_work_shared)                  137.43ns    7.28M
495 // contentionAtWidth(16_stripe_0_work_shared)                  24.52ns   40.78M
496 // contentionAtWidth(32_stripe_0_work_shared)                  21.80ns   45.86M
497 // contentionAtWidth(64_stripe_0_work_shared)                  21.66ns   46.17M
498 // atomicIncrBaseline(local_incr_0_work)                       16.73ns   59.78M
499 // ----------------------------------------------------------------------------
500 // contentionAtWidth(1_stripe_500_work_stub)                    1.75us  571.14K
501 // contentionAtWidth(2_stripe_500_work_getcpu)                500.79ns    2.00M
502 // contentionAtWidth(4_stripe_500_work_getcpu)                410.45ns    2.44M
503 // contentionAtWidth(8_stripe_500_work_getcpu)                411.41ns    2.43M
504 // contentionAtWidth(16_stripe_500_work_getcpu)               400.12ns    2.50M
505 // contentionAtWidth(32_stripe_500_work_getcpu)               397.37ns    2.52M
506 // atomicIncrBaseline(local_incr_500_work)                    396.53ns    2.52M
507 // ----------------------------------------------------------------------------
508 // contentionAtWidth(1_stripe_1000_work_stub)                   1.88us  530.59K
509 // contentionAtWidth(2_stripe_1000_work_getcpu)               778.77ns    1.28M
510 // contentionAtWidth(4_stripe_1000_work_getcpu)               779.56ns    1.28M
511 // contentionAtWidth(8_stripe_1000_work_getcpu)               795.62ns    1.26M
512 // contentionAtWidth(16_stripe_1000_work_getcpu)              778.81ns    1.28M
513 // contentionAtWidth(32_stripe_1000_work_getcpu)              780.26ns    1.28M
514 // atomicIncrBaseline(local_incr_1000_work)                   776.39ns    1.29M
515 // ============================================================================
516 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
517                               SpreaderType spreaderType,
518                               size_t counterAlignment = 128,
519                               size_t numThreads = 32) {
520   folly::BenchmarkSuspender braces;
521
522   AccessSpreader<> spreader(
523       stripes,
524       CacheLocality::system<std::atomic>(),
525       spreaderType == SpreaderType::TLS_RR
526           ? SequentialThreadId<std::atomic>::getcpu : nullptr);
527
528   std::atomic<size_t> ready(0);
529   std::atomic<bool> go(false);
530
531   // while in theory the cache line size is 64 bytes, experiments show
532   // that we get contention on 128 byte boundaries for Ivy Bridge.  The
533   // extra indirection adds 1 or 2 nanos
534   assert(counterAlignment >= sizeof(std::atomic<size_t>));
535   std::vector<char> raw(counterAlignment * stripes);
536
537   // if we happen to be using the tlsRoundRobin, then sequentially
538   // assigning the thread identifiers is the unlikely best-case scenario.
539   // We don't want to unfairly benefit or penalize.  Computing the exact
540   // maximum likelihood of the probability distributions is annoying, so
541   // I approximate as 2/5 of the ids that have no threads, 2/5 that have
542   // 1, 2/15 that have 2, and 1/15 that have 3.  We accomplish this by
543   // wrapping back to slot 0 when we hit 1/15 and 1/5.
544
545   std::vector<std::thread> threads;
546   while (threads.size() < numThreads) {
547     threads.push_back(std::thread([&,iters,stripes,work]() {
548       std::atomic<size_t>* counters[stripes];
549       for (size_t i = 0; i < stripes; ++i) {
550         counters[i]
551           = new (raw.data() + counterAlignment * i) std::atomic<size_t>();
552       }
553
554       spreader.current();
555       ready++;
556       while (!go.load()) {
557         sched_yield();
558       }
559       std::atomic<int> localWork(0);
560       if (spreaderType == SpreaderType::SHARED) {
561         for (size_t i = iters; i > 0; --i) {
562           ++*(counters[AccessSpreader<>::current(stripes)]);
563           for (size_t j = work; j > 0; --j) {
564             localWork.load();
565           }
566         }
567       } else {
568         for (size_t i = iters; i > 0; --i) {
569           ++*(counters[spreader.current()]);
570           for (size_t j = work; j > 0; --j) {
571             localWork.load();
572           }
573         }
574       }
575     }));
576
577     if (threads.size() == numThreads / 15 ||
578         threads.size() == numThreads / 5) {
579       // create a few dummy threads to wrap back around to 0 mod numCpus
580       for (size_t i = threads.size(); i != numThreads; ++i) {
581         std::thread([&]() {
582           spreader.current();
583         }).join();
584       }
585     }
586   }
587
588   while (ready < numThreads) {
589     sched_yield();
590   }
591   braces.dismiss();
592   go = true;
593
594   for (auto& thr : threads) {
595     thr.join();
596   }
597 }
598
599 static void atomicIncrBaseline(size_t iters, size_t work,
600                                size_t numThreads = 32) {
601   folly::BenchmarkSuspender braces;
602
603   std::atomic<bool> go(false);
604
605   std::vector<std::thread> threads;
606   while (threads.size() < numThreads) {
607     threads.push_back(std::thread([&]() {
608       while (!go.load()) {
609         sched_yield();
610       }
611       std::atomic<size_t> localCounter(0);
612       std::atomic<int> localWork(0);
613       for (size_t i = iters; i > 0; --i) {
614         localCounter++;
615         for (size_t j = work; j > 0; --j) {
616           localWork.load();
617         }
618       }
619     }));
620   }
621
622   braces.dismiss();
623   go = true;
624
625   for (auto& thr : threads) {
626     thr.join();
627   }
628 }
629
630 BENCHMARK_DRAW_LINE()
631
632 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_0_work_stub,
633                       1, 0, SpreaderType::GETCPU)
634 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_getcpu,
635                       2, 0, SpreaderType::GETCPU)
636 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_getcpu,
637                       4, 0, SpreaderType::GETCPU)
638 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_getcpu,
639                       8, 0, SpreaderType::GETCPU)
640 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_getcpu,
641                       16, 0, SpreaderType::GETCPU)
642 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_getcpu,
643                       32, 0, SpreaderType::GETCPU)
644 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_getcpu,
645                       64, 0, SpreaderType::GETCPU)
646 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_tls_rr,
647                       2, 0, SpreaderType::TLS_RR)
648 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_tls_rr,
649                       4, 0, SpreaderType::TLS_RR)
650 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_tls_rr,
651                       8, 0, SpreaderType::TLS_RR)
652 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_tls_rr,
653                       16, 0, SpreaderType::TLS_RR)
654 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr,
655                       32, 0, SpreaderType::TLS_RR)
656 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
657                       64, 0, SpreaderType::TLS_RR)
658 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
659                       2, 0, SpreaderType::SHARED)
660 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
661                       4, 0, SpreaderType::SHARED)
662 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_shared,
663                       8, 0, SpreaderType::SHARED)
664 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_shared,
665                       16, 0, SpreaderType::SHARED)
666 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_shared,
667                       32, 0, SpreaderType::SHARED)
668 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_shared,
669                       64, 0, SpreaderType::SHARED)
670 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
671 BENCHMARK_DRAW_LINE()
672 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_500_work_stub,
673                       1, 500, SpreaderType::GETCPU)
674 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_500_work_getcpu,
675                       2, 500, SpreaderType::GETCPU)
676 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_500_work_getcpu,
677                       4, 500, SpreaderType::GETCPU)
678 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_500_work_getcpu,
679                       8, 500, SpreaderType::GETCPU)
680 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_500_work_getcpu,
681                       16, 500, SpreaderType::GETCPU)
682 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_500_work_getcpu,
683                       32, 500, SpreaderType::GETCPU)
684 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
685 BENCHMARK_DRAW_LINE()
686 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_1000_work_stub,
687                       1, 1000, SpreaderType::GETCPU)
688 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_1000_work_getcpu,
689                       2, 1000, SpreaderType::GETCPU)
690 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_1000_work_getcpu,
691                       4, 1000, SpreaderType::GETCPU)
692 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_1000_work_getcpu,
693                       8, 1000, SpreaderType::GETCPU)
694 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_1000_work_getcpu,
695                       16, 1000, SpreaderType::GETCPU)
696 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_1000_work_getcpu,
697                       32, 1000, SpreaderType::GETCPU)
698 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
699
700
701 int main(int argc, char** argv) {
702   testing::InitGoogleTest(&argc, argv);
703   gflags::ParseCommandLineFlags(&argc, &argv, true);
704   auto ret = RUN_ALL_TESTS();
705   if (!ret && FLAGS_benchmark) {
706     folly::runBenchmarks();
707   }
708   return ret;
709 }