338c9c4751533e4c622655c7a65cb1a35c84f589
[folly.git] / folly / test / CacheLocalityTest.cpp
1 /*
2  * Copyright 2014 Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "folly/detail/CacheLocality.h"
18
19 #include <sched.h>
20 #include <memory>
21 #include <thread>
22 #include <type_traits>
23 #include <unordered_map>
24 #include <glog/logging.h>
25 #include <gtest/gtest.h>
26 #include "folly/Benchmark.h"
27
28 using namespace folly::detail;
29
/// These are the relevant nodes from a production box's sysfs tree, stored
/// as a map from absolute sysfs path to the contents of that file.  If you
/// think this map is ugly you should see the version of this test that
/// used a real directory tree.  To reduce the chance of testing error
/// the common path prefix has deliberately not been factored out.
///
/// The tree describes 32 cpus.  Every cpu has four cache indexes (Data,
/// Instruction, Unified, Unified).  index0..index2 are shared by a pair
/// of cpus and index3 is shared by half of the machine ("0-8,17-23" or
/// "9-16,24-31").  The pairing is not uniform: cpus 5 and 6 share with
/// each other ("5-6") while every other cpu N is paired with a cpu from
/// the 17..31 range.
///
/// The map is const because it is a read-only fixture; lookups must use
/// find() or at() so that a miss can never insert a bogus empty entry.
static const std::unordered_map<std::string,std::string> fakeSysfsTree = {
  { "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu0/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu0/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu0/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu0/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu1/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu1/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu1/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu1/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu1/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu2/cache/index0/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu2/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu2/cache/index1/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu2/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu2/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu2/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu2/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu3/cache/index0/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu3/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu3/cache/index1/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu3/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu3/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu3/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu3/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu4/cache/index0/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu4/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu4/cache/index1/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu4/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu4/cache/index2/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu4/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu4/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu4/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu5/cache/index0/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu5/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu5/cache/index1/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu5/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu5/cache/index2/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu5/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu5/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu5/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu6/cache/index0/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu6/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu6/cache/index1/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu6/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu6/cache/index2/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu6/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu6/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu6/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu7/cache/index0/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu7/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu7/cache/index1/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu7/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu7/cache/index2/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu7/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu7/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu7/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu8/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu8/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu8/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu8/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu9/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu9/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu9/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list", "9-16,24-31" },
  { "/sys/devices/system/cpu/cpu9/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu10/cache/index0/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu10/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu10/cache/index1/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu10/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu10/cache/index2/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu10/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu10/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu10/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu11/cache/index0/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu11/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu11/cache/index1/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu11/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu11/cache/index2/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu11/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu11/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu11/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu12/cache/index0/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu12/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu12/cache/index1/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu12/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu12/cache/index2/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu12/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu12/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu12/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu13/cache/index0/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu13/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu13/cache/index1/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu13/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu13/cache/index2/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu13/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu13/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu13/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu14/cache/index0/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu14/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu14/cache/index1/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu14/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu14/cache/index2/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu14/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu14/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu14/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu15/cache/index0/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu15/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu15/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu15/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu15/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu15/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu16/cache/index0/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu16/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu16/cache/index1/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu16/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu16/cache/index2/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu16/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu16/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu16/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu17/cache/index0/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu17/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu17/cache/index1/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu17/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu17/cache/index2/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu17/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu17/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu17/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu18/cache/index0/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu18/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu18/cache/index1/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu18/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu18/cache/index2/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu18/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu18/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu18/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu19/cache/index0/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu19/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu19/cache/index1/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu19/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu19/cache/index2/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu19/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu19/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu19/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu20/cache/index0/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu20/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu20/cache/index1/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu20/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu20/cache/index2/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu20/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu20/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu20/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu21/cache/index0/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu21/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu21/cache/index1/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu21/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu21/cache/index2/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu21/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu21/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu21/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu22/cache/index0/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu22/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu22/cache/index1/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu22/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu22/cache/index2/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu22/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu22/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu22/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu23/cache/index0/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu23/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu23/cache/index1/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu23/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu23/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu23/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu23/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu24/cache/index0/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu24/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu24/cache/index1/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu24/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu24/cache/index2/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu24/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu24/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu24/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu25/cache/index0/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu25/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu25/cache/index1/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu25/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu25/cache/index2/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu25/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu25/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu25/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu26/cache/index0/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu26/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu26/cache/index1/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu26/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu26/cache/index2/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu26/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu26/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu26/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu27/cache/index0/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu27/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu27/cache/index1/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu27/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu27/cache/index2/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu27/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu27/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu27/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu28/cache/index0/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu28/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu28/cache/index1/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu28/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu28/cache/index2/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu28/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu28/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu28/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu29/cache/index0/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu29/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu29/cache/index1/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu29/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu29/cache/index2/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu29/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu29/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu29/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu30/cache/index0/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu30/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu30/cache/index1/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu30/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu30/cache/index2/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu30/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu30/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu30/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu31/cache/index0/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu31/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu31/cache/index1/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu31/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu31/cache/index2/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu31/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu31/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu31/cache/index3/type", "Unified" }
};
292
293 /// This is the expected CacheLocality structure for fakeSysfsTree
294 static const CacheLocality nonUniformExampleLocality = {
295   32,
296   { 16, 16, 2 },
297   { 0, 2, 4, 6, 8, 10, 11, 12, 14, 16, 18, 20, 22, 24, 26, 28,
298     30, 1, 3, 5, 7, 9, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 }
299 };
300
301 TEST(CacheLocality, FakeSysfs) {
302   auto parsed = CacheLocality::readFromSysfsTree([](std::string name) {
303     auto iter = fakeSysfsTree.find(name);
304     return iter == fakeSysfsTree.end() ? std::string() : iter->second;
305   });
306
307   auto& expected = nonUniformExampleLocality;
308   EXPECT_EQ(expected.numCpus, parsed.numCpus);
309   EXPECT_EQ(expected.numCachesByLevel, parsed.numCachesByLevel);
310   EXPECT_EQ(expected.localityIndexByCpu, parsed.localityIndexByCpu);
311 }
312
313 TEST(Getcpu, VdsoGetcpu) {
314   unsigned cpu;
315   Getcpu::vdsoFunc()(&cpu, nullptr, nullptr);
316
317   EXPECT_TRUE(cpu < CPU_SETSIZE);
318 }
319
320 TEST(SequentialThreadId, Simple) {
321   unsigned cpu = 0;
322   auto rv = SequentialThreadId<std::atomic>::getcpu(&cpu, nullptr, nullptr);
323   EXPECT_EQ(rv, 0);
324   EXPECT_TRUE(cpu > 0);
325   unsigned again;
326   SequentialThreadId<std::atomic>::getcpu(&again, nullptr, nullptr);
327   EXPECT_EQ(cpu, again);
328 }
329
330 static FOLLY_TLS unsigned testingCpu = 0;
331
332 static int testingGetcpu(unsigned* cpu, unsigned* node, void* unused) {
333   if (cpu != nullptr) {
334     *cpu = testingCpu;
335   }
336   if (node != nullptr) {
337     *node = testingCpu;
338   }
339   return 0;
340 }
341
342 TEST(AccessSpreader, Stubbed) {
343   std::vector<std::unique_ptr<AccessSpreader<>>> spreaders(100);
344   for (size_t s = 1; s < spreaders.size(); ++s) {
345     spreaders[s].reset(new AccessSpreader<>(
346         s, nonUniformExampleLocality, &testingGetcpu));
347   }
348   std::vector<size_t> cpusInLocalityOrder = {
349       0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 6, 7, 22, 8, 23, 9, 24, 10, 25,
350       11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31 };
351   for (size_t i = 0; i < 32; ++i) {
352     // extra i * 32 is to check wrapping behavior of impl
353     testingCpu = cpusInLocalityOrder[i] + i * 64;
354     for (size_t s = 1; s < spreaders.size(); ++s) {
355       EXPECT_EQ((i * s) / 32, spreaders[s]->current())
356           << "i=" << i << ", cpu=" << testingCpu << ", s=" << s;
357     }
358   }
359 }
360
361 TEST(AccessSpreader, Default) {
362   AccessSpreader<> spreader(16);
363   EXPECT_LT(spreader.current(), 16);
364 }
365
366 TEST(AccessSpreader, Shared) {
367   for (size_t s = 1; s < 200; ++s) {
368     EXPECT_LT(AccessSpreader<>::shared(s).current(), s);
369   }
370 }
371
372 TEST(AccessSpreader, Statics) {
373   LOG(INFO) << "stripeByCore.numStripes() = "
374             << AccessSpreader<>::stripeByCore.numStripes();
375   LOG(INFO) << "stripeByChip.numStripes() = "
376             << AccessSpreader<>::stripeByChip.numStripes();
377   for (size_t s = 1; s < 200; ++s) {
378     EXPECT_LT(AccessSpreader<>::current(s), s);
379   }
380 }
381
382 TEST(AccessSpreader, Wrapping) {
383   // this test won't pass unless locality.numCpus divides kMaxCpus
384   auto numCpus = 16;
385   auto locality = CacheLocality::uniform(numCpus);
386   for (size_t s = 1; s < 200; ++s) {
387     AccessSpreader<> spreader(s, locality, &testingGetcpu);
388     for (size_t c = 0; c < 400; ++c) {
389       testingCpu = c;
390       auto observed = spreader.current();
391       testingCpu = c % numCpus;
392       auto expected = spreader.current();
393       EXPECT_EQ(expected, observed)
394           << "numCpus=" << numCpus << ", s=" << s << ", c=" << c;
395     }
396   }
397 }
398
399 // Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with
400 // a 2.2Ghz Xeon
401 // ============================================================================
402 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
403 // ============================================================================
404 // LocalAccessSpreaderUse                                      20.77ns   48.16M
405 // SharedAccessSpreaderUse                                     21.95ns   45.55M
406 // AccessSpreaderConstruction                                 466.56ns    2.14M
407 // ============================================================================
408
409 BENCHMARK(LocalAccessSpreaderUse, iters) {
410   folly::BenchmarkSuspender braces;
411   AccessSpreader<> spreader(16);
412   braces.dismiss();
413
414   for (unsigned long i = 0; i < iters; ++i) {
415     auto x = spreader.current();
416     folly::doNotOptimizeAway(x);
417   }
418 }
419
420 BENCHMARK(SharedAccessSpreaderUse, iters) {
421   for (unsigned long i = 0; i < iters; ++i) {
422     auto x = AccessSpreader<>::current(16);
423     folly::doNotOptimizeAway(x);
424   }
425 }
426
427 BENCHMARK(AccessSpreaderConstruction, iters) {
428   std::aligned_storage<sizeof(AccessSpreader<>),
429                        std::alignment_of<AccessSpreader<>>::value>::type raw;
430   for (unsigned long i = 0; i < iters; ++i) {
431     auto x = new (&raw) AccessSpreader<>(16);
432     folly::doNotOptimizeAway(x);
433     x->~AccessSpreader();
434   }
435 }
436
/// Selects how the contention benchmark picks a stripe.
enum class SpreaderType {
  /// locally constructed AccessSpreader given a null getcpu function
  /// (the "_getcpu" rows of the benchmark table)
  GETCPU,
  /// AccessSpreader<>::current(numStripes) called inside the hot loop
  /// (the "_shared" rows)
  SHARED,
  /// locally constructed AccessSpreader driven by
  /// SequentialThreadId<std::atomic>::getcpu (the "_tls_rr" rows)
  TLS_RR
};
438
439 // Benchmark scores here reflect the time for 32 threads to perform an
440 // atomic increment on a dual-socket E5-2660 @ 2.2Ghz.  Surprisingly,
441 // if we don't separate the counters onto unique 128 byte stripes the
442 // 1_stripe and 2_stripe results are identical, even though the L3 is
443 // claimed to have 64 byte cache lines.
444 //
445 // _stub means there was no call to getcpu or the tls round-robin
446 // implementation, because for a single stripe the cpu doesn't matter.
447 // _getcpu refers to the vdso getcpu implementation with a locally
448 // constructed AccessSpreader.  _tls_rr refers to execution using
449 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
450 // _shared refers to calling AccessSpreader<>::current(numStripes)
451 // inside the hot loop.
452 //
// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
// so since the stripe selection takes 21 nanos, the atomic increment in
// the L1 costs ~15 nanos.  At width 8_stripe_0_work the line is expected
456 // to ping-pong almost every operation, since the loops have the same
457 // duration.  Widths 4 and 2 have the same behavior, but each tour of the
458 // cache line is 4 and 8 cores long, respectively.  These all suggest a
459 // lower bound of 60 nanos for intra-chip handoff and increment between
460 // the L1s.
461 //
462 // With 455 nanos (1K cycles) of busywork per contended increment, the
463 // system can hide all of the latency of a tour of length 4, but not
464 // quite one of length 8.  I was a bit surprised at how much worse the
465 // non-striped version got.  It seems that the inter-chip traffic also
466 // interferes with the L1-only localWork.load().  When the local work is
467 // doubled to about 1 microsecond we see that the inter-chip contention
468 // is still very important, but subdivisions on the same chip don't matter.
469 //
470 // sudo nice -n -20
471 //   _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
472 // ============================================================================
473 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
474 // ============================================================================
475 // contentionAtWidth(1_stripe_0_work_stub)                      1.14us  873.64K
476 // contentionAtWidth(2_stripe_0_work_getcpu)                  495.58ns    2.02M
477 // contentionAtWidth(4_stripe_0_work_getcpu)                  232.99ns    4.29M
478 // contentionAtWidth(8_stripe_0_work_getcpu)                  101.16ns    9.88M
479 // contentionAtWidth(16_stripe_0_work_getcpu)                  41.93ns   23.85M
480 // contentionAtWidth(32_stripe_0_work_getcpu)                  42.04ns   23.79M
481 // contentionAtWidth(64_stripe_0_work_getcpu)                  41.94ns   23.84M
482 // contentionAtWidth(2_stripe_0_work_tls_rr)                    1.00us  997.41K
483 // contentionAtWidth(4_stripe_0_work_tls_rr)                  694.41ns    1.44M
484 // contentionAtWidth(8_stripe_0_work_tls_rr)                  590.27ns    1.69M
485 // contentionAtWidth(16_stripe_0_work_tls_rr)                 222.13ns    4.50M
486 // contentionAtWidth(32_stripe_0_work_tls_rr)                 169.49ns    5.90M
487 // contentionAtWidth(64_stripe_0_work_tls_rr)                 162.20ns    6.17M
488 // contentionAtWidth(2_stripe_0_work_shared)                  495.54ns    2.02M
489 // contentionAtWidth(4_stripe_0_work_shared)                  236.27ns    4.23M
490 // contentionAtWidth(8_stripe_0_work_shared)                  114.81ns    8.71M
491 // contentionAtWidth(16_stripe_0_work_shared)                  44.65ns   22.40M
492 // contentionAtWidth(32_stripe_0_work_shared)                  41.76ns   23.94M
493 // contentionAtWidth(64_stripe_0_work_shared)                  43.47ns   23.00M
494 // atomicIncrBaseline(local_incr_0_work)                       20.39ns   49.06M
495 // ----------------------------------------------------------------------------
496 // contentionAtWidth(1_stripe_500_work_stub)                    2.04us  491.13K
497 // contentionAtWidth(2_stripe_500_work_getcpu)                610.98ns    1.64M
498 // contentionAtWidth(4_stripe_500_work_getcpu)                507.72ns    1.97M
499 // contentionAtWidth(8_stripe_500_work_getcpu)                542.53ns    1.84M
500 // contentionAtWidth(16_stripe_500_work_getcpu)               496.55ns    2.01M
501 // contentionAtWidth(32_stripe_500_work_getcpu)               500.67ns    2.00M
502 // atomicIncrBaseline(local_incr_500_work)                    484.69ns    2.06M
503 // ----------------------------------------------------------------------------
504 // contentionAtWidth(1_stripe_1000_work_stub)                   2.11us  473.78K
505 // contentionAtWidth(2_stripe_1000_work_getcpu)               970.64ns    1.03M
506 // contentionAtWidth(4_stripe_1000_work_getcpu)               987.31ns    1.01M
507 // contentionAtWidth(8_stripe_1000_work_getcpu)                 1.01us  985.52K
508 // contentionAtWidth(16_stripe_1000_work_getcpu)              986.09ns    1.01M
509 // contentionAtWidth(32_stripe_1000_work_getcpu)              960.23ns    1.04M
510 // atomicIncrBaseline(local_incr_1000_work)                   950.63ns    1.05M
511 // ============================================================================
512 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
513                               SpreaderType spreaderType,
514                               size_t counterAlignment = 128,
515                               size_t numThreads = 32) {
516   folly::BenchmarkSuspender braces;
517
518   AccessSpreader<> spreader(
519       stripes,
520       CacheLocality::system<std::atomic>(),
521       spreaderType == SpreaderType::TLS_RR
522           ? SequentialThreadId<std::atomic>::getcpu : nullptr);
523
524   std::atomic<size_t> ready(0);
525   std::atomic<bool> go(false);
526
527   // while in theory the cache line size is 64 bytes, experiments show
528   // that we get contention on 128 byte boundaries for Ivy Bridge.  The
529   // extra indirection adds 1 or 2 nanos
530   assert(counterAlignment >= sizeof(std::atomic<size_t>));
531   std::vector<char> raw(counterAlignment * stripes);
532
533   // if we happen to be using the tlsRoundRobin, then sequentially
534   // assigning the thread identifiers is the unlikely best-case scenario.
535   // We don't want to unfairly benefit or penalize.  Computing the exact
536   // maximum likelihood of the probability distributions is annoying, so
537   // I approximate as 2/5 of the ids that have no threads, 2/5 that have
538   // 1, 2/15 that have 2, and 1/15 that have 3.  We accomplish this by
539   // wrapping back to slot 0 when we hit 1/15 and 1/5.
540
541   std::vector<std::thread> threads;
542   while (threads.size() < numThreads) {
543     threads.push_back(std::thread([&,iters,stripes,work]() {
544       std::atomic<size_t>* counters[stripes];
545       for (size_t i = 0; i < stripes; ++i) {
546         counters[i]
547           = new (raw.data() + counterAlignment * i) std::atomic<size_t>();
548       }
549
550       spreader.current();
551       ready++;
552       while (!go.load()) {
553         sched_yield();
554       }
555       std::atomic<int> localWork;
556       if (spreaderType == SpreaderType::SHARED) {
557         for (size_t i = iters; i > 0; --i) {
558           ++*(counters[AccessSpreader<>::current(stripes)]);
559           for (size_t j = work; j > 0; --j) {
560             localWork.load();
561           }
562         }
563       } else {
564         for (size_t i = iters; i > 0; --i) {
565           ++*(counters[spreader.current()]);
566           for (size_t j = work; j > 0; --j) {
567             localWork.load();
568           }
569         }
570       }
571     }));
572
573     if (threads.size() == numThreads / 15 ||
574         threads.size() == numThreads / 5) {
575       // create a few dummy threads to wrap back around to 0 mod numCpus
576       for (size_t i = threads.size(); i != numThreads; ++i) {
577         std::thread([&]() {
578           spreader.current();
579         }).join();
580       }
581     }
582   }
583
584   while (ready < numThreads) {
585     sched_yield();
586   }
587   braces.dismiss();
588   go = true;
589
590   for (auto& thr : threads) {
591     thr.join();
592   }
593 }
594
595 static void atomicIncrBaseline(size_t iters, size_t work,
596                                size_t numThreads = 32) {
597   folly::BenchmarkSuspender braces;
598
599   std::atomic<bool> go(false);
600
601   std::vector<std::thread> threads;
602   while (threads.size() < numThreads) {
603     threads.push_back(std::thread([&]() {
604       while (!go.load()) {
605         sched_yield();
606       }
607       std::atomic<size_t> localCounter;
608       std::atomic<int> localWork;
609       for (size_t i = iters; i > 0; --i) {
610         localCounter++;
611         for (size_t j = work; j > 0; --j) {
612           localWork.load();
613         }
614       }
615     }));
616   }
617
618   braces.dismiss();
619   go = true;
620
621   for (auto& thr : threads) {
622     thr.join();
623   }
624 }
625
626 BENCHMARK_DRAW_LINE()
627
628 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_0_work_stub,
629                       1, 0, SpreaderType::GETCPU)
630 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_getcpu,
631                       2, 0, SpreaderType::GETCPU)
632 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_getcpu,
633                       4, 0, SpreaderType::GETCPU)
634 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_getcpu,
635                       8, 0, SpreaderType::GETCPU)
636 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_getcpu,
637                       16, 0, SpreaderType::GETCPU)
638 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_getcpu,
639                       32, 0, SpreaderType::GETCPU)
640 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_getcpu,
641                       64, 0, SpreaderType::GETCPU)
642 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_tls_rr,
643                       2, 0, SpreaderType::TLS_RR)
644 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_tls_rr,
645                       4, 0, SpreaderType::TLS_RR)
646 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_tls_rr,
647                       8, 0, SpreaderType::TLS_RR)
648 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_tls_rr,
649                       16, 0, SpreaderType::TLS_RR)
650 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr,
651                       32, 0, SpreaderType::TLS_RR)
652 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
653                       64, 0, SpreaderType::TLS_RR)
654 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
655                       2, 0, SpreaderType::SHARED)
656 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
657                       4, 0, SpreaderType::SHARED)
658 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_shared,
659                       8, 0, SpreaderType::SHARED)
660 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_shared,
661                       16, 0, SpreaderType::SHARED)
662 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_shared,
663                       32, 0, SpreaderType::SHARED)
664 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_shared,
665                       64, 0, SpreaderType::SHARED)
666 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
667 BENCHMARK_DRAW_LINE()
668 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_500_work_stub,
669                       1, 500, SpreaderType::GETCPU)
670 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_500_work_getcpu,
671                       2, 500, SpreaderType::GETCPU)
672 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_500_work_getcpu,
673                       4, 500, SpreaderType::GETCPU)
674 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_500_work_getcpu,
675                       8, 500, SpreaderType::GETCPU)
676 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_500_work_getcpu,
677                       16, 500, SpreaderType::GETCPU)
678 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_500_work_getcpu,
679                       32, 500, SpreaderType::GETCPU)
680 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
681 BENCHMARK_DRAW_LINE()
682 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_1000_work_stub,
683                       1, 1000, SpreaderType::GETCPU)
684 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_1000_work_getcpu,
685                       2, 1000, SpreaderType::GETCPU)
686 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_1000_work_getcpu,
687                       4, 1000, SpreaderType::GETCPU)
688 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_1000_work_getcpu,
689                       8, 1000, SpreaderType::GETCPU)
690 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_1000_work_getcpu,
691                       16, 1000, SpreaderType::GETCPU)
692 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_1000_work_getcpu,
693                       32, 1000, SpreaderType::GETCPU)
694 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
695
696
697 int main(int argc, char** argv) {
698   testing::InitGoogleTest(&argc, argv);
699   google::ParseCommandLineFlags(&argc, &argv, true);
700   auto ret = RUN_ALL_TESTS();
701   if (!ret && FLAGS_benchmark) {
702     folly::runBenchmarks();
703   }
704   return ret;
705 }