Copyright 2013 -> 2014
[folly.git] / folly / test / CacheLocalityTest.cpp
1 /*
2  * Copyright 2014 Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "folly/detail/CacheLocality.h"
18
19 #include <sched.h>
20 #include <memory>
21 #include <thread>
22 #include <type_traits>
23 #include <unordered_map>
24 #include <glog/logging.h>
25 #include <gtest/gtest.h>
26 #include "folly/Benchmark.h"
27
28 using namespace folly::detail;
29
/// These are the relevant nodes from a production box's sysfs tree, keyed
/// by absolute path with the file contents as the value.  If you think
/// this map is ugly you should see the version of this test that used a
/// real directory tree.  The common path prefix is deliberately left in
/// place on every entry to reduce the chance of testing error.
static std::unordered_map<std::string, std::string> fakeSysfsTree = {
  { "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu0/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu0/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu0/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu0/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu1/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu1/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu1/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu1/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu1/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu2/cache/index0/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu2/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu2/cache/index1/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu2/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu2/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu2/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu2/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu3/cache/index0/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu3/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu3/cache/index1/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu3/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu3/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu3/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu3/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu4/cache/index0/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu4/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu4/cache/index1/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu4/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu4/cache/index2/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu4/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu4/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu4/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu5/cache/index0/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu5/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu5/cache/index1/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu5/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu5/cache/index2/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu5/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu5/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu5/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu6/cache/index0/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu6/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu6/cache/index1/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu6/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu6/cache/index2/shared_cpu_list", "5-6" },
  { "/sys/devices/system/cpu/cpu6/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu6/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu6/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu7/cache/index0/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu7/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu7/cache/index1/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu7/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu7/cache/index2/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu7/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu7/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu7/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu8/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu8/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu8/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu8/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu9/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu9/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu9/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list", "9-16,24-31" },
  { "/sys/devices/system/cpu/cpu9/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu10/cache/index0/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu10/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu10/cache/index1/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu10/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu10/cache/index2/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu10/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu10/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu10/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu11/cache/index0/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu11/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu11/cache/index1/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu11/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu11/cache/index2/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu11/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu11/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu11/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu12/cache/index0/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu12/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu12/cache/index1/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu12/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu12/cache/index2/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu12/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu12/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu12/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu13/cache/index0/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu13/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu13/cache/index1/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu13/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu13/cache/index2/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu13/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu13/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu13/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu14/cache/index0/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu14/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu14/cache/index1/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu14/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu14/cache/index2/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu14/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu14/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu14/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu15/cache/index0/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu15/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu15/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu15/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu15/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu15/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu16/cache/index0/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu16/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu16/cache/index1/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu16/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu16/cache/index2/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu16/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu16/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu16/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu17/cache/index0/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu17/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu17/cache/index1/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu17/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu17/cache/index2/shared_cpu_list", "0,17" },
  { "/sys/devices/system/cpu/cpu17/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu17/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu17/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu18/cache/index0/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu18/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu18/cache/index1/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu18/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu18/cache/index2/shared_cpu_list", "1,18" },
  { "/sys/devices/system/cpu/cpu18/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu18/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu18/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu19/cache/index0/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu19/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu19/cache/index1/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu19/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu19/cache/index2/shared_cpu_list", "2,19" },
  { "/sys/devices/system/cpu/cpu19/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu19/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu19/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu20/cache/index0/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu20/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu20/cache/index1/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu20/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu20/cache/index2/shared_cpu_list", "3,20" },
  { "/sys/devices/system/cpu/cpu20/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu20/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu20/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu21/cache/index0/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu21/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu21/cache/index1/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu21/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu21/cache/index2/shared_cpu_list", "4,21" },
  { "/sys/devices/system/cpu/cpu21/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu21/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu21/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu22/cache/index0/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu22/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu22/cache/index1/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu22/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu22/cache/index2/shared_cpu_list", "7,22" },
  { "/sys/devices/system/cpu/cpu22/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu22/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu22/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu23/cache/index0/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu23/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu23/cache/index1/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu23/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_list", "8,23" },
  { "/sys/devices/system/cpu/cpu23/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu23/cache/index3/shared_cpu_list", "0-8,17-23" },
  { "/sys/devices/system/cpu/cpu23/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu24/cache/index0/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu24/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu24/cache/index1/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu24/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu24/cache/index2/shared_cpu_list", "9,24" },
  { "/sys/devices/system/cpu/cpu24/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu24/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu24/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu25/cache/index0/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu25/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu25/cache/index1/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu25/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu25/cache/index2/shared_cpu_list", "10,25" },
  { "/sys/devices/system/cpu/cpu25/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu25/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu25/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu26/cache/index0/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu26/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu26/cache/index1/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu26/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu26/cache/index2/shared_cpu_list", "11,26" },
  { "/sys/devices/system/cpu/cpu26/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu26/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu26/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu27/cache/index0/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu27/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu27/cache/index1/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu27/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu27/cache/index2/shared_cpu_list", "12,27" },
  { "/sys/devices/system/cpu/cpu27/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu27/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu27/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu28/cache/index0/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu28/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu28/cache/index1/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu28/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu28/cache/index2/shared_cpu_list", "13,28" },
  { "/sys/devices/system/cpu/cpu28/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu28/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu28/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu29/cache/index0/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu29/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu29/cache/index1/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu29/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu29/cache/index2/shared_cpu_list", "14,29" },
  { "/sys/devices/system/cpu/cpu29/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu29/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu29/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu30/cache/index0/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu30/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu30/cache/index1/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu30/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu30/cache/index2/shared_cpu_list", "15,30" },
  { "/sys/devices/system/cpu/cpu30/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu30/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu30/cache/index3/type", "Unified" },
  { "/sys/devices/system/cpu/cpu31/cache/index0/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu31/cache/index0/type", "Data" },
  { "/sys/devices/system/cpu/cpu31/cache/index1/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu31/cache/index1/type", "Instruction" },
  { "/sys/devices/system/cpu/cpu31/cache/index2/shared_cpu_list", "16,31" },
  { "/sys/devices/system/cpu/cpu31/cache/index2/type", "Unified" },
  { "/sys/devices/system/cpu/cpu31/cache/index3/shared_cpu_list", "9-16,24-31"},
  { "/sys/devices/system/cpu/cpu31/cache/index3/type", "Unified" }
};
292
293 /// This is the expected CacheLocality structure for fakeSysfsTree
294 static const CacheLocality nonUniformExampleLocality = {
295   32,
296   { 16, 16, 2 },
297   { 0, 2, 4, 6, 8, 10, 11, 12, 14, 16, 18, 20, 22, 24, 26, 28,
298     30, 1, 3, 5, 7, 9, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 }
299 };
300
301 TEST(CacheLocality, FakeSysfs) {
302   auto parsed = CacheLocality::readFromSysfsTree([](std::string name) {
303     auto iter = fakeSysfsTree.find(name);
304     return iter == fakeSysfsTree.end() ? std::string() : iter->second;
305   });
306
307   auto& expected = nonUniformExampleLocality;
308   EXPECT_EQ(expected.numCpus, parsed.numCpus);
309   EXPECT_EQ(expected.numCachesByLevel, parsed.numCachesByLevel);
310   EXPECT_EQ(expected.localityIndexByCpu, parsed.localityIndexByCpu);
311 }
312
313 TEST(Getcpu, VdsoGetcpu) {
314   unsigned cpu;
315   Getcpu::vdsoFunc()(&cpu, nullptr, nullptr);
316
317   EXPECT_TRUE(cpu < CPU_SETSIZE);
318 }
319
320 TEST(SequentialThreadId, Simple) {
321   unsigned cpu = 0;
322   auto rv = SequentialThreadId<std::atomic>::getcpu(&cpu, nullptr, nullptr);
323   EXPECT_EQ(rv, 0);
324   EXPECT_TRUE(cpu > 0);
325   unsigned again;
326   SequentialThreadId<std::atomic>::getcpu(&again, nullptr, nullptr);
327   EXPECT_EQ(cpu, again);
328 }
329
/// Per-thread value that testingGetcpu reports as both the cpu and the node.
static __thread unsigned testingCpu = 0;

/// getcpu replacement for tests: stores the calling thread's testingCpu
/// into each non-null output pointer and always returns 0.  The third
/// argument is ignored.
static int testingGetcpu(unsigned* cpu, unsigned* node, void* unused) {
  const unsigned reported = testingCpu;
  if (cpu) {
    *cpu = reported;
  }
  if (node) {
    *node = reported;
  }
  return 0;
}
341
342 TEST(AccessSpreader, Stubbed) {
343   std::vector<std::unique_ptr<AccessSpreader<>>> spreaders(100);
344   for (size_t s = 1; s < spreaders.size(); ++s) {
345     spreaders[s].reset(new AccessSpreader<>(
346         s, nonUniformExampleLocality, &testingGetcpu));
347   }
348   std::vector<size_t> cpusInLocalityOrder = {
349       0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 6, 7, 22, 8, 23, 9, 24, 10, 25,
350       11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31 };
351   for (size_t i = 0; i < 32; ++i) {
352     // extra i * 32 is to check wrapping behavior of impl
353     testingCpu = cpusInLocalityOrder[i] + i * 64;
354     for (size_t s = 1; s < spreaders.size(); ++s) {
355       EXPECT_EQ((i * s) / 32, spreaders[s]->current())
356           << "i=" << i << ", cpu=" << testingCpu << ", s=" << s;
357     }
358   }
359 }
360
361 TEST(AccessSpreader, Default) {
362   AccessSpreader<> spreader(16);
363   EXPECT_LT(spreader.current(), 16);
364 }
365
366 TEST(AccessSpreader, Shared) {
367   for (size_t s = 1; s < 200; ++s) {
368     EXPECT_LT(AccessSpreader<>::shared(s).current(), s);
369   }
370 }
371
372 TEST(AccessSpreader, Statics) {
373   LOG(INFO) << "stripeByCore.numStripes() = "
374             << AccessSpreader<>::stripeByCore.numStripes();
375   LOG(INFO) << "stripeByChip.numStripes() = "
376             << AccessSpreader<>::stripeByChip.numStripes();
377   for (size_t s = 1; s < 200; ++s) {
378     EXPECT_LT(AccessSpreader<>::current(s), s);
379   }
380 }
381
382 TEST(AccessSpreader, Wrapping) {
383   // this test won't pass unless locality.numCpus divides kMaxCpus
384   auto numCpus = 16;
385   auto locality = CacheLocality::uniform(numCpus);
386   for (size_t s = 1; s < 200; ++s) {
387     AccessSpreader<> spreader(s, locality, &testingGetcpu);
388     for (size_t c = 0; c < 400; ++c) {
389       testingCpu = c;
390       auto observed = spreader.current();
391       testingCpu = c % numCpus;
392       auto expected = spreader.current();
393       EXPECT_EQ(expected, observed)
394           << "numCpus=" << numCpus << ", s=" << s << ", c=" << c;
395     }
396   }
397 }
398
399 // Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with
400 // a 2.2Ghz Xeon
401 // ============================================================================
402 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
403 // ============================================================================
404 // LocalAccessSpreaderUse                                      20.77ns   48.16M
405 // SharedAccessSpreaderUse                                     21.95ns   45.55M
406 // AccessSpreaderConstruction                                 466.56ns    2.14M
407 // ============================================================================
408
409 BENCHMARK(LocalAccessSpreaderUse, iters) {
410   folly::BenchmarkSuspender braces;
411   AccessSpreader<> spreader(16);
412   braces.dismiss();
413
414   for (unsigned long i = 0; i < iters; ++i) {
415     auto x = spreader.current();
416     folly::doNotOptimizeAway(x);
417   }
418 }
419
420 BENCHMARK(SharedAccessSpreaderUse, iters) {
421   for (unsigned long i = 0; i < iters; ++i) {
422     auto x = AccessSpreader<>::current(16);
423     folly::doNotOptimizeAway(x);
424   }
425 }
426
427 BENCHMARK(AccessSpreaderConstruction, iters) {
428   std::aligned_storage<sizeof(AccessSpreader<>),
429                        std::alignment_of<AccessSpreader<>>::value>::type raw;
430   for (unsigned long i = 0; i < iters; ++i) {
431     auto x = new (&raw) AccessSpreader<>(16);
432     folly::doNotOptimizeAway(x);
433     x->~AccessSpreader();
434   }
435 }
436
/// Stripe-selection strategy exercised by the contention benchmarks.
enum class SpreaderType {
  GETCPU,  ///< locally constructed AccessSpreader with its default getcpu
  SHARED,  ///< AccessSpreader<>::current(numStripes) inside the hot loop
  TLS_RR   ///< locally constructed AccessSpreader using SequentialThreadId
};
438
439 // Benchmark scores here reflect the time for 32 threads to perform an
440 // atomic increment on a dual-socket E5-2660 @ 2.2Ghz.  Surprisingly,
441 // if we don't separate the counters onto unique 128 byte stripes the
442 // 1_stripe and 2_stripe results are identical, even though the L3 is
443 // claimed to have 64 byte cache lines.
444 //
445 // _stub means there was no call to getcpu or the tls round-robin
446 // implementation, because for a single stripe the cpu doesn't matter.
447 // _getcpu refers to the vdso getcpu implementation with a locally
448 // constructed AccessSpreader.  _tls_rr refers to execution using
449 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
450 // _shared refers to calling AccessSpreader<>::current(numStripes)
451 // inside the hot loop.
452 //
453 // At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
454 // so since the stripe selection is 21 nanos the atomic increments in
455 // the L1 is ~15 nanos.  At width 8_stripe_0_work the line is expected
456 // to ping-pong almost every operation, since the loops have the same
457 // duration.  Widths 4 and 2 have the same behavior, but each tour of the
458 // cache line is 4 and 8 cores long, respectively.  These all suggest a
459 // lower bound of 60 nanos for intra-chip handoff and increment between
460 // the L1s.
461 //
462 // With 455 nanos (1K cycles) of busywork per contended increment, the
463 // system can hide all of the latency of a tour of length 4, but not
464 // quite one of length 8.  I was a bit surprised at how much worse the
465 // non-striped version got.  It seems that the inter-chip traffic also
466 // interferes with the L1-only localWork.load().  When the local work is
467 // doubled to about 1 microsecond we see that the inter-chip contention
468 // is still very important, but subdivisions on the same chip don't matter.
469 //
470 // sudo nice -n -20
471 //   _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
472 // ============================================================================
473 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
474 // ============================================================================
475 // contentionAtWidth(1_stripe_0_work_stub)                      1.14us  873.64K
476 // contentionAtWidth(2_stripe_0_work_getcpu)                  495.58ns    2.02M
477 // contentionAtWidth(4_stripe_0_work_getcpu)                  232.99ns    4.29M
478 // contentionAtWidth(8_stripe_0_work_getcpu)                  101.16ns    9.88M
479 // contentionAtWidth(16_stripe_0_work_getcpu)                  41.93ns   23.85M
480 // contentionAtWidth(32_stripe_0_work_getcpu)                  42.04ns   23.79M
481 // contentionAtWidth(64_stripe_0_work_getcpu)                  41.94ns   23.84M
482 // contentionAtWidth(2_stripe_0_work_tls_rr)                    1.00us  997.41K
483 // contentionAtWidth(4_stripe_0_work_tls_rr)                  694.41ns    1.44M
484 // contentionAtWidth(8_stripe_0_work_tls_rr)                  590.27ns    1.69M
485 // contentionAtWidth(16_stripe_0_work_tls_rr)                 222.13ns    4.50M
486 // contentionAtWidth(32_stripe_0_work_tls_rr)                 169.49ns    5.90M
487 // contentionAtWidth(64_stripe_0_work_tls_rr)                 162.20ns    6.17M
488 // contentionAtWidth(2_stripe_0_work_shared)                  495.54ns    2.02M
489 // contentionAtWidth(4_stripe_0_work_shared)                  236.27ns    4.23M
490 // contentionAtWidth(8_stripe_0_work_shared)                  114.81ns    8.71M
491 // contentionAtWidth(16_stripe_0_work_shared)                  44.65ns   22.40M
492 // contentionAtWidth(32_stripe_0_work_shared)                  41.76ns   23.94M
493 // contentionAtWidth(64_stripe_0_work_shared)                  43.47ns   23.00M
494 // atomicIncrBaseline(local_incr_0_work)                       20.39ns   49.06M
495 // ----------------------------------------------------------------------------
496 // contentionAtWidth(1_stripe_500_work_stub)                    2.04us  491.13K
497 // contentionAtWidth(2_stripe_500_work_getcpu)                610.98ns    1.64M
498 // contentionAtWidth(4_stripe_500_work_getcpu)                507.72ns    1.97M
499 // contentionAtWidth(8_stripe_500_work_getcpu)                542.53ns    1.84M
500 // contentionAtWidth(16_stripe_500_work_getcpu)               496.55ns    2.01M
501 // contentionAtWidth(32_stripe_500_work_getcpu)               500.67ns    2.00M
502 // atomicIncrBaseline(local_incr_500_work)                    484.69ns    2.06M
503 // ----------------------------------------------------------------------------
504 // contentionAtWidth(1_stripe_1000_work_stub)                   2.11us  473.78K
505 // contentionAtWidth(2_stripe_1000_work_getcpu)               970.64ns    1.03M
506 // contentionAtWidth(4_stripe_1000_work_getcpu)               987.31ns    1.01M
507 // contentionAtWidth(8_stripe_1000_work_getcpu)                 1.01us  985.52K
508 // contentionAtWidth(16_stripe_1000_work_getcpu)              986.09ns    1.01M
509 // contentionAtWidth(32_stripe_1000_work_getcpu)              960.23ns    1.04M
510 // atomicIncrBaseline(local_incr_1000_work)                   950.63ns    1.05M
511 // ============================================================================
512 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
513                               SpreaderType spreaderType,
514                               size_t counterAlignment = 128,
515                               size_t numThreads = 32) {
516   folly::BenchmarkSuspender braces;
517
518   AccessSpreader<> spreader(
519       stripes,
520       CacheLocality::system<std::atomic>(),
521       spreaderType == SpreaderType::TLS_RR
522           ? SequentialThreadId<std::atomic>::getcpu : nullptr);
523
524   std::atomic<size_t> ready(0);
525   std::atomic<bool> go(false);
526
527   // while in theory the cache line size is 64 bytes, experiments show
528   // that we get contention on 128 byte boundaries for Ivy Bridge.  The
529   // extra indirection adds 1 or 2 nanos
530   assert(counterAlignment >= sizeof(std::atomic<size_t>));
531   char raw[counterAlignment * stripes];
532
533   // if we happen to be using the tlsRoundRobin, then sequentially
534   // assigning the thread identifiers is the unlikely best-case scenario.
535   // We don't want to unfairly benefit or penalize.  Computing the exact
536   // maximum likelihood of the probability distributions is annoying, so
537   // I approximate as 2/5 of the ids that have no threads, 2/5 that have
538   // 1, 2/15 that have 2, and 1/15 that have 3.  We accomplish this by
539   // wrapping back to slot 0 when we hit 1/15 and 1/5.
540
541   std::vector<std::thread> threads;
542   while (threads.size() < numThreads) {
543     threads.push_back(std::thread([&,iters,stripes,work]() {
544       std::atomic<size_t>* counters[stripes];
545       for (size_t i = 0; i < stripes; ++i) {
546         counters[i] = new (raw + counterAlignment * i) std::atomic<size_t>();
547       }
548
549       spreader.current();
550       ready++;
551       while (!go.load()) {
552         sched_yield();
553       }
554       std::atomic<int> localWork;
555       if (spreaderType == SpreaderType::SHARED) {
556         for (size_t i = iters; i > 0; --i) {
557           ++*(counters[AccessSpreader<>::current(stripes)]);
558           for (size_t j = work; j > 0; --j) {
559             localWork.load();
560           }
561         }
562       } else {
563         for (size_t i = iters; i > 0; --i) {
564           ++*(counters[spreader.current()]);
565           for (size_t j = work; j > 0; --j) {
566             localWork.load();
567           }
568         }
569       }
570     }));
571
572     if (threads.size() == numThreads / 15 ||
573         threads.size() == numThreads / 5) {
574       // create a few dummy threads to wrap back around to 0 mod numCpus
575       for (size_t i = threads.size(); i != numThreads; ++i) {
576         std::thread([&]() {
577           spreader.current();
578         }).join();
579       }
580     }
581   }
582
583   while (ready < numThreads) {
584     sched_yield();
585   }
586   braces.dismiss();
587   go = true;
588
589   for (auto& thr : threads) {
590     thr.join();
591   }
592 }
593
594 static void atomicIncrBaseline(size_t iters, size_t work,
595                                size_t numThreads = 32) {
596   folly::BenchmarkSuspender braces;
597
598   std::atomic<bool> go(false);
599
600   std::vector<std::thread> threads;
601   while (threads.size() < numThreads) {
602     threads.push_back(std::thread([&]() {
603       while (!go.load()) {
604         sched_yield();
605       }
606       std::atomic<size_t> localCounter;
607       std::atomic<int> localWork;
608       for (size_t i = iters; i > 0; --i) {
609         localCounter++;
610         for (size_t j = work; j > 0; --j) {
611           localWork.load();
612         }
613       }
614     }));
615   }
616
617   braces.dismiss();
618   go = true;
619
620   for (auto& thr : threads) {
621     thr.join();
622   }
623 }
624
625 BENCHMARK_DRAW_LINE()
626
627 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_0_work_stub,
628                       1, 0, SpreaderType::GETCPU)
629 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_getcpu,
630                       2, 0, SpreaderType::GETCPU)
631 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_getcpu,
632                       4, 0, SpreaderType::GETCPU)
633 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_getcpu,
634                       8, 0, SpreaderType::GETCPU)
635 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_getcpu,
636                       16, 0, SpreaderType::GETCPU)
637 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_getcpu,
638                       32, 0, SpreaderType::GETCPU)
639 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_getcpu,
640                       64, 0, SpreaderType::GETCPU)
641 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_tls_rr,
642                       2, 0, SpreaderType::TLS_RR)
643 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_tls_rr,
644                       4, 0, SpreaderType::TLS_RR)
645 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_tls_rr,
646                       8, 0, SpreaderType::TLS_RR)
647 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_tls_rr,
648                       16, 0, SpreaderType::TLS_RR)
649 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr,
650                       32, 0, SpreaderType::TLS_RR)
651 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
652                       64, 0, SpreaderType::TLS_RR)
653 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
654                       2, 0, SpreaderType::SHARED)
655 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
656                       4, 0, SpreaderType::SHARED)
657 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_shared,
658                       8, 0, SpreaderType::SHARED)
659 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_shared,
660                       16, 0, SpreaderType::SHARED)
661 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_shared,
662                       32, 0, SpreaderType::SHARED)
663 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_shared,
664                       64, 0, SpreaderType::SHARED)
665 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
666 BENCHMARK_DRAW_LINE()
667 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_500_work_stub,
668                       1, 500, SpreaderType::GETCPU)
669 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_500_work_getcpu,
670                       2, 500, SpreaderType::GETCPU)
671 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_500_work_getcpu,
672                       4, 500, SpreaderType::GETCPU)
673 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_500_work_getcpu,
674                       8, 500, SpreaderType::GETCPU)
675 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_500_work_getcpu,
676                       16, 500, SpreaderType::GETCPU)
677 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_500_work_getcpu,
678                       32, 500, SpreaderType::GETCPU)
679 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
680 BENCHMARK_DRAW_LINE()
681 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_1000_work_stub,
682                       1, 1000, SpreaderType::GETCPU)
683 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_1000_work_getcpu,
684                       2, 1000, SpreaderType::GETCPU)
685 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_1000_work_getcpu,
686                       4, 1000, SpreaderType::GETCPU)
687 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_1000_work_getcpu,
688                       8, 1000, SpreaderType::GETCPU)
689 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_1000_work_getcpu,
690                       16, 1000, SpreaderType::GETCPU)
691 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_1000_work_getcpu,
692                       32, 1000, SpreaderType::GETCPU)
693 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
694
695
696 int main(int argc, char** argv) {
697   testing::InitGoogleTest(&argc, argv);
698   google::ParseCommandLineFlags(&argc, &argv, true);
699   auto ret = RUN_ALL_TESTS();
700   if (!ret && FLAGS_benchmark) {
701     folly::runBenchmarks();
702   }
703   return ret;
704 }
705