2 * Copyright 2015 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/detail/CacheLocality.h>
22 #include <type_traits>
23 #include <unordered_map>
24 #include <glog/logging.h>
25 #include <gtest/gtest.h>
26 #include <folly/Benchmark.h>
28 using namespace folly::detail;
30 /// These are the relevant nodes from a production box's sysfs tree. If you
31 /// think this map is ugly you should see the version of this test that
32 /// used a real directory tree. To reduce the chance of testing error
33 /// I haven't tried to remove the common prefix.
34 static std::unordered_map<std::string,std::string> fakeSysfsTree = {
35 { "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list", "0,17" },
36 { "/sys/devices/system/cpu/cpu0/cache/index0/type", "Data" },
37 { "/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list", "0,17" },
38 { "/sys/devices/system/cpu/cpu0/cache/index1/type", "Instruction" },
39 { "/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list", "0,17" },
40 { "/sys/devices/system/cpu/cpu0/cache/index2/type", "Unified" },
41 { "/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list", "0-8,17-23" },
42 { "/sys/devices/system/cpu/cpu0/cache/index3/type", "Unified" },
43 { "/sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_list", "1,18" },
44 { "/sys/devices/system/cpu/cpu1/cache/index0/type", "Data" },
45 { "/sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_list", "1,18" },
46 { "/sys/devices/system/cpu/cpu1/cache/index1/type", "Instruction" },
47 { "/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list", "1,18" },
48 { "/sys/devices/system/cpu/cpu1/cache/index2/type", "Unified" },
49 { "/sys/devices/system/cpu/cpu1/cache/index3/shared_cpu_list", "0-8,17-23" },
50 { "/sys/devices/system/cpu/cpu1/cache/index3/type", "Unified" },
51 { "/sys/devices/system/cpu/cpu2/cache/index0/shared_cpu_list", "2,19" },
52 { "/sys/devices/system/cpu/cpu2/cache/index0/type", "Data" },
53 { "/sys/devices/system/cpu/cpu2/cache/index1/shared_cpu_list", "2,19" },
54 { "/sys/devices/system/cpu/cpu2/cache/index1/type", "Instruction" },
55 { "/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list", "2,19" },
56 { "/sys/devices/system/cpu/cpu2/cache/index2/type", "Unified" },
57 { "/sys/devices/system/cpu/cpu2/cache/index3/shared_cpu_list", "0-8,17-23" },
58 { "/sys/devices/system/cpu/cpu2/cache/index3/type", "Unified" },
59 { "/sys/devices/system/cpu/cpu3/cache/index0/shared_cpu_list", "3,20" },
60 { "/sys/devices/system/cpu/cpu3/cache/index0/type", "Data" },
61 { "/sys/devices/system/cpu/cpu3/cache/index1/shared_cpu_list", "3,20" },
62 { "/sys/devices/system/cpu/cpu3/cache/index1/type", "Instruction" },
63 { "/sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_list", "3,20" },
64 { "/sys/devices/system/cpu/cpu3/cache/index2/type", "Unified" },
65 { "/sys/devices/system/cpu/cpu3/cache/index3/shared_cpu_list", "0-8,17-23" },
66 { "/sys/devices/system/cpu/cpu3/cache/index3/type", "Unified" },
67 { "/sys/devices/system/cpu/cpu4/cache/index0/shared_cpu_list", "4,21" },
68 { "/sys/devices/system/cpu/cpu4/cache/index0/type", "Data" },
69 { "/sys/devices/system/cpu/cpu4/cache/index1/shared_cpu_list", "4,21" },
70 { "/sys/devices/system/cpu/cpu4/cache/index1/type", "Instruction" },
71 { "/sys/devices/system/cpu/cpu4/cache/index2/shared_cpu_list", "4,21" },
72 { "/sys/devices/system/cpu/cpu4/cache/index2/type", "Unified" },
73 { "/sys/devices/system/cpu/cpu4/cache/index3/shared_cpu_list", "0-8,17-23" },
74 { "/sys/devices/system/cpu/cpu4/cache/index3/type", "Unified" },
75 { "/sys/devices/system/cpu/cpu5/cache/index0/shared_cpu_list", "5-6" },
76 { "/sys/devices/system/cpu/cpu5/cache/index0/type", "Data" },
77 { "/sys/devices/system/cpu/cpu5/cache/index1/shared_cpu_list", "5-6" },
78 { "/sys/devices/system/cpu/cpu5/cache/index1/type", "Instruction" },
79 { "/sys/devices/system/cpu/cpu5/cache/index2/shared_cpu_list", "5-6" },
80 { "/sys/devices/system/cpu/cpu5/cache/index2/type", "Unified" },
81 { "/sys/devices/system/cpu/cpu5/cache/index3/shared_cpu_list", "0-8,17-23" },
82 { "/sys/devices/system/cpu/cpu5/cache/index3/type", "Unified" },
83 { "/sys/devices/system/cpu/cpu6/cache/index0/shared_cpu_list", "5-6" },
84 { "/sys/devices/system/cpu/cpu6/cache/index0/type", "Data" },
85 { "/sys/devices/system/cpu/cpu6/cache/index1/shared_cpu_list", "5-6" },
86 { "/sys/devices/system/cpu/cpu6/cache/index1/type", "Instruction" },
87 { "/sys/devices/system/cpu/cpu6/cache/index2/shared_cpu_list", "5-6" },
88 { "/sys/devices/system/cpu/cpu6/cache/index2/type", "Unified" },
89 { "/sys/devices/system/cpu/cpu6/cache/index3/shared_cpu_list", "0-8,17-23" },
90 { "/sys/devices/system/cpu/cpu6/cache/index3/type", "Unified" },
91 { "/sys/devices/system/cpu/cpu7/cache/index0/shared_cpu_list", "7,22" },
92 { "/sys/devices/system/cpu/cpu7/cache/index0/type", "Data" },
93 { "/sys/devices/system/cpu/cpu7/cache/index1/shared_cpu_list", "7,22" },
94 { "/sys/devices/system/cpu/cpu7/cache/index1/type", "Instruction" },
95 { "/sys/devices/system/cpu/cpu7/cache/index2/shared_cpu_list", "7,22" },
96 { "/sys/devices/system/cpu/cpu7/cache/index2/type", "Unified" },
97 { "/sys/devices/system/cpu/cpu7/cache/index3/shared_cpu_list", "0-8,17-23" },
98 { "/sys/devices/system/cpu/cpu7/cache/index3/type", "Unified" },
99 { "/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list", "8,23" },
100 { "/sys/devices/system/cpu/cpu8/cache/index0/type", "Data" },
101 { "/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list", "8,23" },
102 { "/sys/devices/system/cpu/cpu8/cache/index1/type", "Instruction" },
103 { "/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list", "8,23" },
104 { "/sys/devices/system/cpu/cpu8/cache/index2/type", "Unified" },
105 { "/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list", "0-8,17-23" },
106 { "/sys/devices/system/cpu/cpu8/cache/index3/type", "Unified" },
107 { "/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list", "9,24" },
108 { "/sys/devices/system/cpu/cpu9/cache/index0/type", "Data" },
109 { "/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list", "9,24" },
110 { "/sys/devices/system/cpu/cpu9/cache/index1/type", "Instruction" },
111 { "/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list", "9,24" },
112 { "/sys/devices/system/cpu/cpu9/cache/index2/type", "Unified" },
113 { "/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list", "9-16,24-31" },
114 { "/sys/devices/system/cpu/cpu9/cache/index3/type", "Unified" },
115 { "/sys/devices/system/cpu/cpu10/cache/index0/shared_cpu_list", "10,25" },
116 { "/sys/devices/system/cpu/cpu10/cache/index0/type", "Data" },
117 { "/sys/devices/system/cpu/cpu10/cache/index1/shared_cpu_list", "10,25" },
118 { "/sys/devices/system/cpu/cpu10/cache/index1/type", "Instruction" },
119 { "/sys/devices/system/cpu/cpu10/cache/index2/shared_cpu_list", "10,25" },
120 { "/sys/devices/system/cpu/cpu10/cache/index2/type", "Unified" },
121 { "/sys/devices/system/cpu/cpu10/cache/index3/shared_cpu_list", "9-16,24-31"},
122 { "/sys/devices/system/cpu/cpu10/cache/index3/type", "Unified" },
123 { "/sys/devices/system/cpu/cpu11/cache/index0/shared_cpu_list", "11,26" },
124 { "/sys/devices/system/cpu/cpu11/cache/index0/type", "Data" },
125 { "/sys/devices/system/cpu/cpu11/cache/index1/shared_cpu_list", "11,26" },
126 { "/sys/devices/system/cpu/cpu11/cache/index1/type", "Instruction" },
127 { "/sys/devices/system/cpu/cpu11/cache/index2/shared_cpu_list", "11,26" },
128 { "/sys/devices/system/cpu/cpu11/cache/index2/type", "Unified" },
129 { "/sys/devices/system/cpu/cpu11/cache/index3/shared_cpu_list", "9-16,24-31"},
130 { "/sys/devices/system/cpu/cpu11/cache/index3/type", "Unified" },
131 { "/sys/devices/system/cpu/cpu12/cache/index0/shared_cpu_list", "12,27" },
132 { "/sys/devices/system/cpu/cpu12/cache/index0/type", "Data" },
133 { "/sys/devices/system/cpu/cpu12/cache/index1/shared_cpu_list", "12,27" },
134 { "/sys/devices/system/cpu/cpu12/cache/index1/type", "Instruction" },
135 { "/sys/devices/system/cpu/cpu12/cache/index2/shared_cpu_list", "12,27" },
136 { "/sys/devices/system/cpu/cpu12/cache/index2/type", "Unified" },
137 { "/sys/devices/system/cpu/cpu12/cache/index3/shared_cpu_list", "9-16,24-31"},
138 { "/sys/devices/system/cpu/cpu12/cache/index3/type", "Unified" },
139 { "/sys/devices/system/cpu/cpu13/cache/index0/shared_cpu_list", "13,28" },
140 { "/sys/devices/system/cpu/cpu13/cache/index0/type", "Data" },
141 { "/sys/devices/system/cpu/cpu13/cache/index1/shared_cpu_list", "13,28" },
142 { "/sys/devices/system/cpu/cpu13/cache/index1/type", "Instruction" },
143 { "/sys/devices/system/cpu/cpu13/cache/index2/shared_cpu_list", "13,28" },
144 { "/sys/devices/system/cpu/cpu13/cache/index2/type", "Unified" },
145 { "/sys/devices/system/cpu/cpu13/cache/index3/shared_cpu_list", "9-16,24-31"},
146 { "/sys/devices/system/cpu/cpu13/cache/index3/type", "Unified" },
147 { "/sys/devices/system/cpu/cpu14/cache/index0/shared_cpu_list", "14,29" },
148 { "/sys/devices/system/cpu/cpu14/cache/index0/type", "Data" },
149 { "/sys/devices/system/cpu/cpu14/cache/index1/shared_cpu_list", "14,29" },
150 { "/sys/devices/system/cpu/cpu14/cache/index1/type", "Instruction" },
151 { "/sys/devices/system/cpu/cpu14/cache/index2/shared_cpu_list", "14,29" },
152 { "/sys/devices/system/cpu/cpu14/cache/index2/type", "Unified" },
153 { "/sys/devices/system/cpu/cpu14/cache/index3/shared_cpu_list", "9-16,24-31"},
154 { "/sys/devices/system/cpu/cpu14/cache/index3/type", "Unified" },
155 { "/sys/devices/system/cpu/cpu15/cache/index0/shared_cpu_list", "15,30" },
156 { "/sys/devices/system/cpu/cpu15/cache/index0/type", "Data" },
157 { "/sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_list", "15,30" },
158 { "/sys/devices/system/cpu/cpu15/cache/index1/type", "Instruction" },
159 { "/sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_list", "15,30" },
160 { "/sys/devices/system/cpu/cpu15/cache/index2/type", "Unified" },
161 { "/sys/devices/system/cpu/cpu15/cache/index3/shared_cpu_list", "9-16,24-31"},
162 { "/sys/devices/system/cpu/cpu15/cache/index3/type", "Unified" },
163 { "/sys/devices/system/cpu/cpu16/cache/index0/shared_cpu_list", "16,31" },
164 { "/sys/devices/system/cpu/cpu16/cache/index0/type", "Data" },
165 { "/sys/devices/system/cpu/cpu16/cache/index1/shared_cpu_list", "16,31" },
166 { "/sys/devices/system/cpu/cpu16/cache/index1/type", "Instruction" },
167 { "/sys/devices/system/cpu/cpu16/cache/index2/shared_cpu_list", "16,31" },
168 { "/sys/devices/system/cpu/cpu16/cache/index2/type", "Unified" },
169 { "/sys/devices/system/cpu/cpu16/cache/index3/shared_cpu_list", "9-16,24-31"},
170 { "/sys/devices/system/cpu/cpu16/cache/index3/type", "Unified" },
171 { "/sys/devices/system/cpu/cpu17/cache/index0/shared_cpu_list", "0,17" },
172 { "/sys/devices/system/cpu/cpu17/cache/index0/type", "Data" },
173 { "/sys/devices/system/cpu/cpu17/cache/index1/shared_cpu_list", "0,17" },
174 { "/sys/devices/system/cpu/cpu17/cache/index1/type", "Instruction" },
175 { "/sys/devices/system/cpu/cpu17/cache/index2/shared_cpu_list", "0,17" },
176 { "/sys/devices/system/cpu/cpu17/cache/index2/type", "Unified" },
177 { "/sys/devices/system/cpu/cpu17/cache/index3/shared_cpu_list", "0-8,17-23" },
178 { "/sys/devices/system/cpu/cpu17/cache/index3/type", "Unified" },
179 { "/sys/devices/system/cpu/cpu18/cache/index0/shared_cpu_list", "1,18" },
180 { "/sys/devices/system/cpu/cpu18/cache/index0/type", "Data" },
181 { "/sys/devices/system/cpu/cpu18/cache/index1/shared_cpu_list", "1,18" },
182 { "/sys/devices/system/cpu/cpu18/cache/index1/type", "Instruction" },
183 { "/sys/devices/system/cpu/cpu18/cache/index2/shared_cpu_list", "1,18" },
184 { "/sys/devices/system/cpu/cpu18/cache/index2/type", "Unified" },
185 { "/sys/devices/system/cpu/cpu18/cache/index3/shared_cpu_list", "0-8,17-23" },
186 { "/sys/devices/system/cpu/cpu18/cache/index3/type", "Unified" },
187 { "/sys/devices/system/cpu/cpu19/cache/index0/shared_cpu_list", "2,19" },
188 { "/sys/devices/system/cpu/cpu19/cache/index0/type", "Data" },
189 { "/sys/devices/system/cpu/cpu19/cache/index1/shared_cpu_list", "2,19" },
190 { "/sys/devices/system/cpu/cpu19/cache/index1/type", "Instruction" },
191 { "/sys/devices/system/cpu/cpu19/cache/index2/shared_cpu_list", "2,19" },
192 { "/sys/devices/system/cpu/cpu19/cache/index2/type", "Unified" },
193 { "/sys/devices/system/cpu/cpu19/cache/index3/shared_cpu_list", "0-8,17-23" },
194 { "/sys/devices/system/cpu/cpu19/cache/index3/type", "Unified" },
195 { "/sys/devices/system/cpu/cpu20/cache/index0/shared_cpu_list", "3,20" },
196 { "/sys/devices/system/cpu/cpu20/cache/index0/type", "Data" },
197 { "/sys/devices/system/cpu/cpu20/cache/index1/shared_cpu_list", "3,20" },
198 { "/sys/devices/system/cpu/cpu20/cache/index1/type", "Instruction" },
199 { "/sys/devices/system/cpu/cpu20/cache/index2/shared_cpu_list", "3,20" },
200 { "/sys/devices/system/cpu/cpu20/cache/index2/type", "Unified" },
201 { "/sys/devices/system/cpu/cpu20/cache/index3/shared_cpu_list", "0-8,17-23" },
202 { "/sys/devices/system/cpu/cpu20/cache/index3/type", "Unified" },
203 { "/sys/devices/system/cpu/cpu21/cache/index0/shared_cpu_list", "4,21" },
204 { "/sys/devices/system/cpu/cpu21/cache/index0/type", "Data" },
205 { "/sys/devices/system/cpu/cpu21/cache/index1/shared_cpu_list", "4,21" },
206 { "/sys/devices/system/cpu/cpu21/cache/index1/type", "Instruction" },
207 { "/sys/devices/system/cpu/cpu21/cache/index2/shared_cpu_list", "4,21" },
208 { "/sys/devices/system/cpu/cpu21/cache/index2/type", "Unified" },
209 { "/sys/devices/system/cpu/cpu21/cache/index3/shared_cpu_list", "0-8,17-23" },
210 { "/sys/devices/system/cpu/cpu21/cache/index3/type", "Unified" },
211 { "/sys/devices/system/cpu/cpu22/cache/index0/shared_cpu_list", "7,22" },
212 { "/sys/devices/system/cpu/cpu22/cache/index0/type", "Data" },
213 { "/sys/devices/system/cpu/cpu22/cache/index1/shared_cpu_list", "7,22" },
214 { "/sys/devices/system/cpu/cpu22/cache/index1/type", "Instruction" },
215 { "/sys/devices/system/cpu/cpu22/cache/index2/shared_cpu_list", "7,22" },
216 { "/sys/devices/system/cpu/cpu22/cache/index2/type", "Unified" },
217 { "/sys/devices/system/cpu/cpu22/cache/index3/shared_cpu_list", "0-8,17-23" },
218 { "/sys/devices/system/cpu/cpu22/cache/index3/type", "Unified" },
219 { "/sys/devices/system/cpu/cpu23/cache/index0/shared_cpu_list", "8,23" },
220 { "/sys/devices/system/cpu/cpu23/cache/index0/type", "Data" },
221 { "/sys/devices/system/cpu/cpu23/cache/index1/shared_cpu_list", "8,23" },
222 { "/sys/devices/system/cpu/cpu23/cache/index1/type", "Instruction" },
223 { "/sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_list", "8,23" },
224 { "/sys/devices/system/cpu/cpu23/cache/index2/type", "Unified" },
225 { "/sys/devices/system/cpu/cpu23/cache/index3/shared_cpu_list", "0-8,17-23" },
226 { "/sys/devices/system/cpu/cpu23/cache/index3/type", "Unified" },
227 { "/sys/devices/system/cpu/cpu24/cache/index0/shared_cpu_list", "9,24" },
228 { "/sys/devices/system/cpu/cpu24/cache/index0/type", "Data" },
229 { "/sys/devices/system/cpu/cpu24/cache/index1/shared_cpu_list", "9,24" },
230 { "/sys/devices/system/cpu/cpu24/cache/index1/type", "Instruction" },
231 { "/sys/devices/system/cpu/cpu24/cache/index2/shared_cpu_list", "9,24" },
232 { "/sys/devices/system/cpu/cpu24/cache/index2/type", "Unified" },
233 { "/sys/devices/system/cpu/cpu24/cache/index3/shared_cpu_list", "9-16,24-31"},
234 { "/sys/devices/system/cpu/cpu24/cache/index3/type", "Unified" },
235 { "/sys/devices/system/cpu/cpu25/cache/index0/shared_cpu_list", "10,25" },
236 { "/sys/devices/system/cpu/cpu25/cache/index0/type", "Data" },
237 { "/sys/devices/system/cpu/cpu25/cache/index1/shared_cpu_list", "10,25" },
238 { "/sys/devices/system/cpu/cpu25/cache/index1/type", "Instruction" },
239 { "/sys/devices/system/cpu/cpu25/cache/index2/shared_cpu_list", "10,25" },
240 { "/sys/devices/system/cpu/cpu25/cache/index2/type", "Unified" },
241 { "/sys/devices/system/cpu/cpu25/cache/index3/shared_cpu_list", "9-16,24-31"},
242 { "/sys/devices/system/cpu/cpu25/cache/index3/type", "Unified" },
243 { "/sys/devices/system/cpu/cpu26/cache/index0/shared_cpu_list", "11,26" },
244 { "/sys/devices/system/cpu/cpu26/cache/index0/type", "Data" },
245 { "/sys/devices/system/cpu/cpu26/cache/index1/shared_cpu_list", "11,26" },
246 { "/sys/devices/system/cpu/cpu26/cache/index1/type", "Instruction" },
247 { "/sys/devices/system/cpu/cpu26/cache/index2/shared_cpu_list", "11,26" },
248 { "/sys/devices/system/cpu/cpu26/cache/index2/type", "Unified" },
249 { "/sys/devices/system/cpu/cpu26/cache/index3/shared_cpu_list", "9-16,24-31"},
250 { "/sys/devices/system/cpu/cpu26/cache/index3/type", "Unified" },
251 { "/sys/devices/system/cpu/cpu27/cache/index0/shared_cpu_list", "12,27" },
252 { "/sys/devices/system/cpu/cpu27/cache/index0/type", "Data" },
253 { "/sys/devices/system/cpu/cpu27/cache/index1/shared_cpu_list", "12,27" },
254 { "/sys/devices/system/cpu/cpu27/cache/index1/type", "Instruction" },
255 { "/sys/devices/system/cpu/cpu27/cache/index2/shared_cpu_list", "12,27" },
256 { "/sys/devices/system/cpu/cpu27/cache/index2/type", "Unified" },
257 { "/sys/devices/system/cpu/cpu27/cache/index3/shared_cpu_list", "9-16,24-31"},
258 { "/sys/devices/system/cpu/cpu27/cache/index3/type", "Unified" },
259 { "/sys/devices/system/cpu/cpu28/cache/index0/shared_cpu_list", "13,28" },
260 { "/sys/devices/system/cpu/cpu28/cache/index0/type", "Data" },
261 { "/sys/devices/system/cpu/cpu28/cache/index1/shared_cpu_list", "13,28" },
262 { "/sys/devices/system/cpu/cpu28/cache/index1/type", "Instruction" },
263 { "/sys/devices/system/cpu/cpu28/cache/index2/shared_cpu_list", "13,28" },
264 { "/sys/devices/system/cpu/cpu28/cache/index2/type", "Unified" },
265 { "/sys/devices/system/cpu/cpu28/cache/index3/shared_cpu_list", "9-16,24-31"},
266 { "/sys/devices/system/cpu/cpu28/cache/index3/type", "Unified" },
267 { "/sys/devices/system/cpu/cpu29/cache/index0/shared_cpu_list", "14,29" },
268 { "/sys/devices/system/cpu/cpu29/cache/index0/type", "Data" },
269 { "/sys/devices/system/cpu/cpu29/cache/index1/shared_cpu_list", "14,29" },
270 { "/sys/devices/system/cpu/cpu29/cache/index1/type", "Instruction" },
271 { "/sys/devices/system/cpu/cpu29/cache/index2/shared_cpu_list", "14,29" },
272 { "/sys/devices/system/cpu/cpu29/cache/index2/type", "Unified" },
273 { "/sys/devices/system/cpu/cpu29/cache/index3/shared_cpu_list", "9-16,24-31"},
274 { "/sys/devices/system/cpu/cpu29/cache/index3/type", "Unified" },
275 { "/sys/devices/system/cpu/cpu30/cache/index0/shared_cpu_list", "15,30" },
276 { "/sys/devices/system/cpu/cpu30/cache/index0/type", "Data" },
277 { "/sys/devices/system/cpu/cpu30/cache/index1/shared_cpu_list", "15,30" },
278 { "/sys/devices/system/cpu/cpu30/cache/index1/type", "Instruction" },
279 { "/sys/devices/system/cpu/cpu30/cache/index2/shared_cpu_list", "15,30" },
280 { "/sys/devices/system/cpu/cpu30/cache/index2/type", "Unified" },
281 { "/sys/devices/system/cpu/cpu30/cache/index3/shared_cpu_list", "9-16,24-31"},
282 { "/sys/devices/system/cpu/cpu30/cache/index3/type", "Unified" },
283 { "/sys/devices/system/cpu/cpu31/cache/index0/shared_cpu_list", "16,31" },
284 { "/sys/devices/system/cpu/cpu31/cache/index0/type", "Data" },
285 { "/sys/devices/system/cpu/cpu31/cache/index1/shared_cpu_list", "16,31" },
286 { "/sys/devices/system/cpu/cpu31/cache/index1/type", "Instruction" },
287 { "/sys/devices/system/cpu/cpu31/cache/index2/shared_cpu_list", "16,31" },
288 { "/sys/devices/system/cpu/cpu31/cache/index2/type", "Unified" },
289 { "/sys/devices/system/cpu/cpu31/cache/index3/shared_cpu_list", "9-16,24-31"},
290 { "/sys/devices/system/cpu/cpu31/cache/index3/type", "Unified" }
293 /// This is the expected CacheLocality structure for fakeSysfsTree.
/// The FakeSysfs test compares its numCpus, numCachesByLevel and
/// localityIndexByCpu fields against the parsed result, and the Stubbed
/// test feeds it to AccessSpreader.
294 static const CacheLocality nonUniformExampleLocality = {
// NOTE(review): this 32-entry list looks like localityIndexByCpu, indexed
// by cpu number. It is the inverse of cpusInLocalityOrder in the Stubbed
// test (entry 17 is 1, and cpusInLocalityOrder[1] is 17). Confirm against
// the CacheLocality field order.
297 { 0, 2, 4, 6, 8, 10, 11, 12, 14, 16, 18, 20, 22, 24, 26, 28,
298 30, 1, 3, 5, 7, 9, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 }
// Runs CacheLocality::readFromSysfsTree against the in-memory fakeSysfsTree
// and checks the parsed result field by field against
// nonUniformExampleLocality.
301 TEST(CacheLocality, FakeSysfs) {
// The lambda stands in for reading a sysfs file: it returns the mapped
// contents, or an empty string when the path is not in fakeSysfsTree.
302 auto parsed = CacheLocality::readFromSysfsTree([](std::string name) {
303 auto iter = fakeSysfsTree.find(name);
304 return iter == fakeSysfsTree.end() ? std::string() : iter->second;
307 auto& expected = nonUniformExampleLocality;
308 EXPECT_EQ(expected.numCpus, parsed.numCpus);
309 EXPECT_EQ(expected.numCachesByLevel, parsed.numCachesByLevel);
310 EXPECT_EQ(expected.localityIndexByCpu, parsed.localityIndexByCpu);
// Calls the function returned by Getcpu::vdsoFunc() and checks that the cpu
// it reports is below CPU_SETSIZE.
313 TEST(Getcpu, VdsoGetcpu) {
// NOTE(review): the result of vdsoFunc() is invoked without a null check -
// confirm it cannot be null on every platform this test runs on.
315 Getcpu::vdsoFunc()(&cpu, nullptr, nullptr);
317 EXPECT_TRUE(cpu < CPU_SETSIZE);
// Exercises FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu: the
// reported cpu must be greater than zero, and a second call from this test
// body must report the same value.
321 TEST(ThreadId, SimpleTls) {
324 folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
325 &cpu, nullptr, nullptr);
327 EXPECT_TRUE(cpu > 0);
// Second query; the value is expected to be stable between calls.
329 folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
330 &again, nullptr, nullptr);
331 EXPECT_EQ(cpu, again);
// Same checks as the SimpleTls test, but for
// FallbackGetcpu<HashingThreadId>::getcpu: the reported cpu must be greater
// than zero and must not change between two calls from this test body.
335 TEST(ThreadId, SimplePthread) {
337 auto rv = folly::detail::FallbackGetcpu<HashingThreadId>::getcpu(
338 &cpu, nullptr, nullptr);
340 EXPECT_TRUE(cpu > 0);
// Second query; the value is expected to be stable between calls.
342 folly::detail::FallbackGetcpu<HashingThreadId>::getcpu(
343 &again, nullptr, nullptr);
344 EXPECT_EQ(cpu, again);
// Thread-local cpu number that the tests below assign before calling
// AccessSpreader::current().
347 static FOLLY_TLS unsigned testingCpu = 0;
// Stub with a getcpu-style signature. The Stubbed and Wrapping tests pass it
// to AccessSpreader in place of a real getcpu function. Both out-parameters
// are optional and are only touched when non-null.
// NOTE(review): presumably *cpu is filled from testingCpu - confirm.
349 static int testingGetcpu(unsigned* cpu, unsigned* node, void* unused) {
350 if (cpu != nullptr) {
353 if (node != nullptr) {
// Checks stripe assignment with a stubbed getcpu. For every stripe count s
// in 1..99, the i-th cpu in locality order must map to stripe (i * s) / 32.
359 TEST(AccessSpreader, Stubbed) {
// Slot 0 is left empty so that spreaders[s] is the spreader with s stripes.
360 std::vector<std::unique_ptr<AccessSpreader<>>> spreaders(100);
361 for (size_t s = 1; s < spreaders.size(); ++s) {
362 spreaders[s].reset(new AccessSpreader<>(
363 s, nonUniformExampleLocality, &testingGetcpu));
// The 32 cpus of nonUniformExampleLocality, listed in locality order.
365 std::vector<size_t> cpusInLocalityOrder = {
366 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 6, 7, 22, 8, 23, 9, 24, 10, 25,
367 11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31 };
368 for (size_t i = 0; i < 32; ++i) {
369 // the extra i * 64 is to check wrapping behavior of impl
370 testingCpu = cpusInLocalityOrder[i] + i * 64;
371 for (size_t s = 1; s < spreaders.size(); ++s) {
372 EXPECT_EQ((i * s) / 32, spreaders[s]->current())
373 << "i=" << i << ", cpu=" << testingCpu << ", s=" << s;
// A spreader built with only a stripe count must report a stripe below
// that count.
378 TEST(AccessSpreader, Default) {
379 AccessSpreader<> spreader(16);
380 EXPECT_LT(spreader.current(), 16);
// For every stripe count s in 1..199, the instance returned by
// AccessSpreader<>::shared(s) must report a stripe below s.
383 TEST(AccessSpreader, Shared) {
384 for (size_t s = 1; s < 200; ++s) {
385 EXPECT_LT(AccessSpreader<>::shared(s).current(), s);
// Logs the stripe counts of the static stripeByCore and stripeByChip
// spreaders, then checks that the static AccessSpreader<>::current(s)
// returns a stripe below s for every s in 1..199.
389 TEST(AccessSpreader, Statics) {
390 LOG(INFO) << "stripeByCore.numStripes() = "
391 << AccessSpreader<>::stripeByCore.numStripes();
392 LOG(INFO) << "stripeByChip.numStripes() = "
393 << AccessSpreader<>::stripeByChip.numStripes();
394 for (size_t s = 1; s < 200; ++s) {
395 EXPECT_LT(AccessSpreader<>::current(s), s);
// Checks that cpu ids at or above numCpus wrap: for stripe counts 1..199
// and c in 0..399, the stripe observed for c must equal the stripe reported
// once testingCpu is reduced to c % numCpus.
399 TEST(AccessSpreader, Wrapping) {
400 // this test won't pass unless locality.numCpus divides kMaxCpus
402 auto locality = CacheLocality::uniform(numCpus);
403 for (size_t s = 1; s < 200; ++s) {
404 AccessSpreader<> spreader(s, locality, &testingGetcpu);
405 for (size_t c = 0; c < 400; ++c) {
// NOTE(review): presumably testingCpu holds the unreduced c when
// `observed` is taken - confirm.
407 auto observed = spreader.current();
408 testingCpu = c % numCpus;
409 auto expected = spreader.current();
410 EXPECT_EQ(expected, observed)
411 << "numCpus=" << numCpus << ", s=" << s << ", c=" << c;
416 // Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with
418 // ============================================================================
419 // folly/test/CacheLocalityTest.cpp relative time/iter iters/s
420 // ============================================================================
421 // LocalAccessSpreaderUse 20.77ns 48.16M
422 // SharedAccessSpreaderUse 21.95ns 45.55M
423 // AccessSpreaderConstruction 466.56ns 2.14M
424 // ============================================================================
// Times current() on a locally constructed 16-stripe AccessSpreader.
426 BENCHMARK(LocalAccessSpreaderUse, iters) {
// NOTE(review): the suspender presumably keeps the spreader construction
// out of the timed region - confirm it is dismissed before the loop.
427 folly::BenchmarkSuspender braces;
428 AccessSpreader<> spreader(16);
431 for (unsigned long i = 0; i < iters; ++i) {
432 auto x = spreader.current();
// Keep the result live so the call is not optimized away.
433 folly::doNotOptimizeAway(x);
// Times the static AccessSpreader<>::current(16) entry point, which needs
// no locally constructed spreader.
437 BENCHMARK(SharedAccessSpreaderUse, iters) {
438 for (unsigned long i = 0; i < iters; ++i) {
439 auto x = AccessSpreader<>::current(16);
// Keep the result live so the call is not optimized away.
440 folly::doNotOptimizeAway(x);
// Times construction plus destruction of a 16-stripe AccessSpreader.
444 BENCHMARK(AccessSpreaderConstruction, iters) {
// Suitably sized and aligned raw storage, reused on every iteration so
// the loop performs no heap allocation.
445 std::aligned_storage<sizeof(AccessSpreader<>),
446 std::alignment_of<AccessSpreader<>>::value>::type raw;
447 for (unsigned long i = 0; i < iters; ++i) {
// Placement-new into the storage, then destroy explicitly before reuse.
448 auto x = new (&raw) AccessSpreader<>(16);
449 folly::doNotOptimizeAway(x);
450 x->~AccessSpreader();
// Selects how contentionAtWidth picks a stripe for each increment:
//   GETCPU       - getcpuFunc is left as nullptr when constructing the
//                  spreader (presumably selecting the default getcpu
//                  implementation - confirm)
//   SHARED       - calls AccessSpreader<>::current(stripes) inside the loop
//   TLS_RR       - FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu
//   PTHREAD_SELF - FallbackGetcpu<HashingThreadId>::getcpu
454 enum class SpreaderType { GETCPU, SHARED, TLS_RR, PTHREAD_SELF };
456 // Benchmark scores here reflect the time for 32 threads to perform an
457 // atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly,
458 // if we don't separate the counters onto unique 128 byte stripes the
459 // 1_stripe and 2_stripe results are identical, even though the L3 is
460 // claimed to have 64 byte cache lines.
462 // _stub means there was no call to getcpu or the tls round-robin
463 // implementation, because for a single stripe the cpu doesn't matter.
464 // _getcpu refers to the vdso getcpu implementation with a locally
465 // constructed AccessSpreader. _tls_rr refers to execution using
466 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
467 // _shared refers to calling AccessSpreader<>::current(numStripes)
468 // inside the hot loop.
470 // At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
471 // so since the stripe selection is 21 nanos each atomic increment in
472 // the L1 is ~15 nanos. At width 8_stripe_0_work the line is expected
473 // to ping-pong almost every operation, since the loops have the same
474 // duration. Widths 4 and 2 have the same behavior, but each tour of the
475 // cache line is 4 and 8 cores long, respectively. These all suggest a
476 // lower bound of 60 nanos for intra-chip handoff and increment between
479 // With 455 nanos (1K cycles) of busywork per contended increment, the
480 // system can hide all of the latency of a tour of length 4, but not
481 // quite one of length 8. I was a bit surprised at how much worse the
482 // non-striped version got. It seems that the inter-chip traffic also
483 // interferes with the L1-only localWork.load(). When the local work is
484 // doubled to about 1 microsecond we see that the inter-chip contention
485 // is still very important, but subdivisions on the same chip don't matter.
488 // _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
489 // ============================================================================
490 // folly/test/CacheLocalityTest.cpp relative time/iter iters/s
491 // ============================================================================
492 // LocalAccessSpreaderUse 13.00ns 76.94M
493 // SharedAccessSpreaderUse 13.04ns 76.66M
494 // AccessSpreaderConstruction 366.00ns 2.73M
495 // ----------------------------------------------------------------------------
496 // contentionAtWidth(1_stripe_0_work_stub) 891.04ns 1.12M
497 // contentionAtWidth(2_stripe_0_work_getcpu) 403.45ns 2.48M
498 // contentionAtWidth(4_stripe_0_work_getcpu) 198.02ns 5.05M
499 // contentionAtWidth(8_stripe_0_work_getcpu) 90.54ns 11.04M
500 // contentionAtWidth(16_stripe_0_work_getcpu) 31.21ns 32.04M
501 // contentionAtWidth(32_stripe_0_work_getcpu) 29.15ns 34.31M
502 // contentionAtWidth(64_stripe_0_work_getcpu) 32.41ns 30.86M
503 // contentionAtWidth(2_stripe_0_work_tls_rr) 958.06ns 1.04M
504 // contentionAtWidth(4_stripe_0_work_tls_rr) 494.31ns 2.02M
505 // contentionAtWidth(8_stripe_0_work_tls_rr) 362.34ns 2.76M
506 // contentionAtWidth(16_stripe_0_work_tls_rr) 231.37ns 4.32M
507 // contentionAtWidth(32_stripe_0_work_tls_rr) 128.26ns 7.80M
508 // contentionAtWidth(64_stripe_0_work_tls_rr) 115.08ns 8.69M
509 // contentionAtWidth(2_stripe_0_work_pthread_self) 856.63ns 1.17M
510 // contentionAtWidth(4_stripe_0_work_pthread_self) 623.43ns 1.60M
511 // contentionAtWidth(8_stripe_0_work_pthread_self) 419.69ns 2.38M
512 // contentionAtWidth(16_stripe_0_work_pthread_self 217.32ns 4.60M
513 // contentionAtWidth(32_stripe_0_work_pthread_self 157.69ns 6.34M
514 // contentionAtWidth(64_stripe_0_work_pthread_self 140.94ns 7.10M
515 // contentionAtWidth(2_stripe_0_work_shared) 406.55ns 2.46M
516 // contentionAtWidth(4_stripe_0_work_shared) 198.28ns 5.04M
517 // contentionAtWidth(8_stripe_0_work_shared) 90.11ns 11.10M
518 // contentionAtWidth(16_stripe_0_work_shared) 34.53ns 28.96M
519 // contentionAtWidth(32_stripe_0_work_shared) 30.08ns 33.25M
520 // contentionAtWidth(64_stripe_0_work_shared) 34.60ns 28.90M
521 // atomicIncrBaseline(local_incr_0_work) 17.51ns 57.12M
522 // ----------------------------------------------------------------------------
523 // contentionAtWidth(1_stripe_500_work_stub) 1.87us 534.36K
524 // contentionAtWidth(2_stripe_500_work_getcpu) 542.31ns 1.84M
525 // contentionAtWidth(4_stripe_500_work_getcpu) 409.18ns 2.44M
526 // contentionAtWidth(8_stripe_500_work_getcpu) 511.05ns 1.96M
527 // contentionAtWidth(16_stripe_500_work_getcpu) 399.14ns 2.51M
528 // contentionAtWidth(32_stripe_500_work_getcpu) 399.05ns 2.51M
529 // atomicIncrBaseline(local_incr_500_work) 399.41ns 2.50M
530 // ----------------------------------------------------------------------------
531 // contentionAtWidth(1_stripe_1000_work_stub) 1.90us 525.73K
532 // contentionAtWidth(2_stripe_1000_work_getcpu) 792.91ns 1.26M
533 // contentionAtWidth(4_stripe_1000_work_getcpu) 788.14ns 1.27M
534 // contentionAtWidth(8_stripe_1000_work_getcpu) 794.16ns 1.26M
535 // contentionAtWidth(16_stripe_1000_work_getcpu) 785.33ns 1.27M
536 // contentionAtWidth(32_stripe_1000_work_getcpu) 786.56ns 1.27M
537 // atomicIncrBaseline(local_incr_1000_work) 784.69ns 1.27M
538 // ============================================================================
// Benchmark body: numThreads threads each perform iters atomic increments
// on one of `stripes` counters, choosing the counter through an
// AccessSpreader.
//
//   iters            - increments performed by each thread
//   stripes          - number of counters (stripes) to spread across
//   work             - iterations of the inner loop run after each increment
//   spreaderType     - how the stripe is chosen (see SpreaderType)
//   counterAlignment - byte distance between consecutive counters in `raw`
//   numThreads       - number of worker threads
539 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
540 SpreaderType spreaderType,
541 size_t counterAlignment = 128,
542 size_t numThreads = 32) {
543 folly::BenchmarkSuspender braces;
// Stays nullptr unless one of the fallback implementations is requested.
545 folly::detail::Getcpu::Func getcpuFunc = nullptr;
547 if (spreaderType == SpreaderType::TLS_RR) {
549 folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu;
551 if (spreaderType == SpreaderType::PTHREAD_SELF) {
552 getcpuFunc = folly::detail::FallbackGetcpu<HashingThreadId>::getcpu;
555 AccessSpreader<> spreader(
556 stripes, CacheLocality::system<std::atomic>(), getcpuFunc);
// Start-up handshake state shared with the worker threads.
558 std::atomic<size_t> ready(0);
559 std::atomic<bool> go(false);
561 // while in theory the cache line size is 64 bytes, experiments show
562 // that we get contention on 128 byte boundaries for Ivy Bridge. The
563 // extra indirection adds 1 or 2 nanos
564 assert(counterAlignment >= sizeof(std::atomic<size_t>));
// NOTE(review): only the spacing between counters is controlled here;
// nothing visible aligns raw.data() itself to counterAlignment - confirm
// whether that matters for the measurement.
565 std::vector<char> raw(counterAlignment * stripes);
567 // if we happen to be using the tlsRoundRobin, then sequentially
568 // assigning the thread identifiers is the unlikely best-case scenario.
569 // We don't want to unfairly benefit or penalize. Computing the exact
570 // maximum likelihood of the probability distributions is annoying, so
571 // I approximate as 2/5 of the ids that have no threads, 2/5 that have
572 // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
573 // wrapping back to slot 0 when we hit 1/15 and 1/5.
575 std::vector<std::thread> threads;
576 while (threads.size() < numThreads) {
577 threads.push_back(std::thread([&,iters,stripes,work]() {
// NOTE(review): `stripes` is a runtime value, so this is a
// variable-length array (a compiler extension, not standard C++).
578 std::atomic<size_t>* counters[stripes];
579 for (size_t i = 0; i < stripes; ++i) {
// Every thread placement-constructs the counters at the same addresses
// inside `raw`.
// NOTE(review): presumably safe because increments start only after
// the go flag - confirm.
581 = new (raw.data() + counterAlignment * i) std::atomic<size_t>();
589 std::atomic<int> localWork(0);
// SHARED uses the static entry point; every other type goes through the
// locally constructed `spreader`.
590 if (spreaderType == SpreaderType::SHARED) {
591 for (size_t i = iters; i > 0; --i) {
592 ++*(counters[AccessSpreader<>::current(stripes)]);
593 for (size_t j = work; j > 0; --j) {
598 for (size_t i = iters; i > 0; --i) {
599 ++*(counters[spreader.current()]);
600 for (size_t j = work; j > 0; --j) {
607 if (threads.size() == numThreads / 15 ||
608 threads.size() == numThreads / 5) {
609 // create a few dummy threads to wrap back around to 0 mod numCpus
610 for (size_t i = threads.size(); i != numThreads; ++i) {
// Wait until every worker has checked in.
618 while (ready < numThreads) {
624 for (auto& thr : threads) {
/// Baseline counterpart to contentionAtWidth: numThreads threads each run
/// the same iters-by-work nested loop shape, but every thread declares its
/// own std::atomic<size_t> counter instead of using shared striped
/// counters.
///
/// @param iters       outer-loop trip count for each thread
/// @param work        inner-loop trip count per outer iteration
/// @param numThreads  number of threads to spawn
629 static void atomicIncrBaseline(size_t iters, size_t work,
630 size_t numThreads = 32) {
// Created before the threads are spawned; presumably keeps start-up out
// of the measured time -- TODO confirm where it is dismissed.
631 folly::BenchmarkSuspender braces;
// Presumably the start flag the workers wait on -- TODO confirm.
633 std::atomic<bool> go(false);
635 std::vector<std::thread> threads;
636 while (threads.size() < numThreads) {
// Unlike contentionAtWidth, iters and work are captured by reference here.
637 threads.push_back(std::thread([&]() {
// Both atomics are local to the thread, so no other thread touches them.
// Presumably localCounter is what the outer loop increments.
641 std::atomic<size_t> localCounter(0);
642 std::atomic<int> localWork(0);
643 for (size_t i = iters; i > 0; --i) {
645 for (size_t j = work; j > 0; --j) {
// Visit every spawned thread (presumably to join it -- TODO confirm).
655 for (auto& thr : threads) {
// Zero-work benchmarks. For contentionAtWidth the arguments after the
// benchmark name are (stripes, work, spreaderType); counterAlignment and
// numThreads keep their defaults (128 and 32).
660 BENCHMARK_DRAW_LINE()
// Single-stripe case, registered under the "stub" name with GETCPU.
662 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_0_work_stub,
663 1, 0, SpreaderType::GETCPU)
// GETCPU spreader, 2 through 64 stripes.
664 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_getcpu,
665 2, 0, SpreaderType::GETCPU)
666 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_getcpu,
667 4, 0, SpreaderType::GETCPU)
668 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_getcpu,
669 8, 0, SpreaderType::GETCPU)
670 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_getcpu,
671 16, 0, SpreaderType::GETCPU)
672 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_getcpu,
673 32, 0, SpreaderType::GETCPU)
674 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_getcpu,
675 64, 0, SpreaderType::GETCPU)
// TLS_RR spreader, 2 through 64 stripes.
676 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_tls_rr,
677 2, 0, SpreaderType::TLS_RR)
678 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_tls_rr,
679 4, 0, SpreaderType::TLS_RR)
680 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_tls_rr,
681 8, 0, SpreaderType::TLS_RR)
682 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_tls_rr,
683 16, 0, SpreaderType::TLS_RR)
684 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr,
685 32, 0, SpreaderType::TLS_RR)
686 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
687 64, 0, SpreaderType::TLS_RR)
// PTHREAD_SELF spreader, 2 through 64 stripes.
688 BENCHMARK_NAMED_PARAM(contentionAtWidth,
689 2_stripe_0_work_pthread_self,
692 SpreaderType::PTHREAD_SELF)
693 BENCHMARK_NAMED_PARAM(contentionAtWidth,
694 4_stripe_0_work_pthread_self,
697 SpreaderType::PTHREAD_SELF)
698 BENCHMARK_NAMED_PARAM(contentionAtWidth,
699 8_stripe_0_work_pthread_self,
702 SpreaderType::PTHREAD_SELF)
703 BENCHMARK_NAMED_PARAM(contentionAtWidth,
704 16_stripe_0_work_pthread_self,
707 SpreaderType::PTHREAD_SELF)
708 BENCHMARK_NAMED_PARAM(contentionAtWidth,
709 32_stripe_0_work_pthread_self,
712 SpreaderType::PTHREAD_SELF)
713 BENCHMARK_NAMED_PARAM(contentionAtWidth,
714 64_stripe_0_work_pthread_self,
717 SpreaderType::PTHREAD_SELF)
// SHARED spreader (the static AccessSpreader<>::current path), 2 through
// 64 stripes.
718 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
719 2, 0, SpreaderType::SHARED)
720 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
721 4, 0, SpreaderType::SHARED)
722 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_shared,
723 8, 0, SpreaderType::SHARED)
724 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_shared,
725 16, 0, SpreaderType::SHARED)
726 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_shared,
727 32, 0, SpreaderType::SHARED)
728 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_shared,
729 64, 0, SpreaderType::SHARED)
// Thread-local-counter baseline with work = 0.
730 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
// work = 500 benchmarks: GETCPU spreader only, 1 through 32 stripes,
// followed by the thread-local-counter baseline at the same work value.
731 BENCHMARK_DRAW_LINE()
732 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_500_work_stub,
733 1, 500, SpreaderType::GETCPU)
734 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_500_work_getcpu,
735 2, 500, SpreaderType::GETCPU)
736 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_500_work_getcpu,
737 4, 500, SpreaderType::GETCPU)
738 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_500_work_getcpu,
739 8, 500, SpreaderType::GETCPU)
740 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_500_work_getcpu,
741 16, 500, SpreaderType::GETCPU)
742 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_500_work_getcpu,
743 32, 500, SpreaderType::GETCPU)
744 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
// work = 1000 benchmarks: GETCPU spreader only, 1 through 32 stripes,
// followed by the thread-local-counter baseline at the same work value.
745 BENCHMARK_DRAW_LINE()
746 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_1000_work_stub,
747 1, 1000, SpreaderType::GETCPU)
748 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_1000_work_getcpu,
749 2, 1000, SpreaderType::GETCPU)
750 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_1000_work_getcpu,
751 4, 1000, SpreaderType::GETCPU)
752 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_1000_work_getcpu,
753 8, 1000, SpreaderType::GETCPU)
754 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_1000_work_getcpu,
755 16, 1000, SpreaderType::GETCPU)
756 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_1000_work_getcpu,
757 32, 1000, SpreaderType::GETCPU)
758 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
761 int main(int argc, char** argv) {
762 testing::InitGoogleTest(&argc, argv);
763 gflags::ParseCommandLineFlags(&argc, &argv, true);
764 auto ret = RUN_ALL_TESTS();
765 if (!ret && FLAGS_benchmark) {
766 folly::runBenchmarks();