2 * Copyright 2014 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/detail/CacheLocality.h>
22 #include <type_traits>
23 #include <unordered_map>
24 #include <glog/logging.h>
25 #include <gtest/gtest.h>
26 #include <folly/Benchmark.h>
28 using namespace folly::detail;
30 /// These are the relevant nodes from a production box's sysfs tree. If you
31 /// think this map is ugly you should see the version of this test that
32 /// used a real directory tree. To reduce the chance of testing error
33 /// I haven't tried to remove the common prefix.
///
/// Each key is an absolute path of the form
///   /sys/devices/system/cpu/cpu<N>/cache/index<M>/shared_cpu_list
///   /sys/devices/system/cpu/cpu<N>/cache/index<M>/type
/// and each value is the text recorded for that node. Entries are present
/// for cpu0 through cpu31, each with index0 through index3. The
/// shared_cpu_list values mix comma-separated ids ("0,17") and dash
/// ranges ("5-6", "0-8,17-23").
///
/// The layout is deliberately irregular: cpu5 and cpu6 share their
/// index0..index2 caches with each other, whereas every other cpu is paired
/// with a cpu from the 17..31 range. index3 is shared by "0-8,17-23" for
/// one group of cpus and by "9-16,24-31" for the other.
34 static std::unordered_map<std::string,std::string> fakeSysfsTree = {
35 { "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list", "0,17" },
36 { "/sys/devices/system/cpu/cpu0/cache/index0/type", "Data" },
37 { "/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list", "0,17" },
38 { "/sys/devices/system/cpu/cpu0/cache/index1/type", "Instruction" },
39 { "/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list", "0,17" },
40 { "/sys/devices/system/cpu/cpu0/cache/index2/type", "Unified" },
41 { "/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list", "0-8,17-23" },
42 { "/sys/devices/system/cpu/cpu0/cache/index3/type", "Unified" },
43 { "/sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_list", "1,18" },
44 { "/sys/devices/system/cpu/cpu1/cache/index0/type", "Data" },
45 { "/sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_list", "1,18" },
46 { "/sys/devices/system/cpu/cpu1/cache/index1/type", "Instruction" },
47 { "/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list", "1,18" },
48 { "/sys/devices/system/cpu/cpu1/cache/index2/type", "Unified" },
49 { "/sys/devices/system/cpu/cpu1/cache/index3/shared_cpu_list", "0-8,17-23" },
50 { "/sys/devices/system/cpu/cpu1/cache/index3/type", "Unified" },
51 { "/sys/devices/system/cpu/cpu2/cache/index0/shared_cpu_list", "2,19" },
52 { "/sys/devices/system/cpu/cpu2/cache/index0/type", "Data" },
53 { "/sys/devices/system/cpu/cpu2/cache/index1/shared_cpu_list", "2,19" },
54 { "/sys/devices/system/cpu/cpu2/cache/index1/type", "Instruction" },
55 { "/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list", "2,19" },
56 { "/sys/devices/system/cpu/cpu2/cache/index2/type", "Unified" },
57 { "/sys/devices/system/cpu/cpu2/cache/index3/shared_cpu_list", "0-8,17-23" },
58 { "/sys/devices/system/cpu/cpu2/cache/index3/type", "Unified" },
59 { "/sys/devices/system/cpu/cpu3/cache/index0/shared_cpu_list", "3,20" },
60 { "/sys/devices/system/cpu/cpu3/cache/index0/type", "Data" },
61 { "/sys/devices/system/cpu/cpu3/cache/index1/shared_cpu_list", "3,20" },
62 { "/sys/devices/system/cpu/cpu3/cache/index1/type", "Instruction" },
63 { "/sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_list", "3,20" },
64 { "/sys/devices/system/cpu/cpu3/cache/index2/type", "Unified" },
65 { "/sys/devices/system/cpu/cpu3/cache/index3/shared_cpu_list", "0-8,17-23" },
66 { "/sys/devices/system/cpu/cpu3/cache/index3/type", "Unified" },
67 { "/sys/devices/system/cpu/cpu4/cache/index0/shared_cpu_list", "4,21" },
68 { "/sys/devices/system/cpu/cpu4/cache/index0/type", "Data" },
69 { "/sys/devices/system/cpu/cpu4/cache/index1/shared_cpu_list", "4,21" },
70 { "/sys/devices/system/cpu/cpu4/cache/index1/type", "Instruction" },
71 { "/sys/devices/system/cpu/cpu4/cache/index2/shared_cpu_list", "4,21" },
72 { "/sys/devices/system/cpu/cpu4/cache/index2/type", "Unified" },
73 { "/sys/devices/system/cpu/cpu4/cache/index3/shared_cpu_list", "0-8,17-23" },
74 { "/sys/devices/system/cpu/cpu4/cache/index3/type", "Unified" },
75 { "/sys/devices/system/cpu/cpu5/cache/index0/shared_cpu_list", "5-6" },
76 { "/sys/devices/system/cpu/cpu5/cache/index0/type", "Data" },
77 { "/sys/devices/system/cpu/cpu5/cache/index1/shared_cpu_list", "5-6" },
78 { "/sys/devices/system/cpu/cpu5/cache/index1/type", "Instruction" },
79 { "/sys/devices/system/cpu/cpu5/cache/index2/shared_cpu_list", "5-6" },
80 { "/sys/devices/system/cpu/cpu5/cache/index2/type", "Unified" },
81 { "/sys/devices/system/cpu/cpu5/cache/index3/shared_cpu_list", "0-8,17-23" },
82 { "/sys/devices/system/cpu/cpu5/cache/index3/type", "Unified" },
83 { "/sys/devices/system/cpu/cpu6/cache/index0/shared_cpu_list", "5-6" },
84 { "/sys/devices/system/cpu/cpu6/cache/index0/type", "Data" },
85 { "/sys/devices/system/cpu/cpu6/cache/index1/shared_cpu_list", "5-6" },
86 { "/sys/devices/system/cpu/cpu6/cache/index1/type", "Instruction" },
87 { "/sys/devices/system/cpu/cpu6/cache/index2/shared_cpu_list", "5-6" },
88 { "/sys/devices/system/cpu/cpu6/cache/index2/type", "Unified" },
89 { "/sys/devices/system/cpu/cpu6/cache/index3/shared_cpu_list", "0-8,17-23" },
90 { "/sys/devices/system/cpu/cpu6/cache/index3/type", "Unified" },
91 { "/sys/devices/system/cpu/cpu7/cache/index0/shared_cpu_list", "7,22" },
92 { "/sys/devices/system/cpu/cpu7/cache/index0/type", "Data" },
93 { "/sys/devices/system/cpu/cpu7/cache/index1/shared_cpu_list", "7,22" },
94 { "/sys/devices/system/cpu/cpu7/cache/index1/type", "Instruction" },
95 { "/sys/devices/system/cpu/cpu7/cache/index2/shared_cpu_list", "7,22" },
96 { "/sys/devices/system/cpu/cpu7/cache/index2/type", "Unified" },
97 { "/sys/devices/system/cpu/cpu7/cache/index3/shared_cpu_list", "0-8,17-23" },
98 { "/sys/devices/system/cpu/cpu7/cache/index3/type", "Unified" },
99 { "/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list", "8,23" },
100 { "/sys/devices/system/cpu/cpu8/cache/index0/type", "Data" },
101 { "/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list", "8,23" },
102 { "/sys/devices/system/cpu/cpu8/cache/index1/type", "Instruction" },
103 { "/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list", "8,23" },
104 { "/sys/devices/system/cpu/cpu8/cache/index2/type", "Unified" },
105 { "/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list", "0-8,17-23" },
106 { "/sys/devices/system/cpu/cpu8/cache/index3/type", "Unified" },
107 { "/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list", "9,24" },
108 { "/sys/devices/system/cpu/cpu9/cache/index0/type", "Data" },
109 { "/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list", "9,24" },
110 { "/sys/devices/system/cpu/cpu9/cache/index1/type", "Instruction" },
111 { "/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list", "9,24" },
112 { "/sys/devices/system/cpu/cpu9/cache/index2/type", "Unified" },
113 { "/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list", "9-16,24-31" },
114 { "/sys/devices/system/cpu/cpu9/cache/index3/type", "Unified" },
115 { "/sys/devices/system/cpu/cpu10/cache/index0/shared_cpu_list", "10,25" },
116 { "/sys/devices/system/cpu/cpu10/cache/index0/type", "Data" },
117 { "/sys/devices/system/cpu/cpu10/cache/index1/shared_cpu_list", "10,25" },
118 { "/sys/devices/system/cpu/cpu10/cache/index1/type", "Instruction" },
119 { "/sys/devices/system/cpu/cpu10/cache/index2/shared_cpu_list", "10,25" },
120 { "/sys/devices/system/cpu/cpu10/cache/index2/type", "Unified" },
121 { "/sys/devices/system/cpu/cpu10/cache/index3/shared_cpu_list", "9-16,24-31"},
122 { "/sys/devices/system/cpu/cpu10/cache/index3/type", "Unified" },
123 { "/sys/devices/system/cpu/cpu11/cache/index0/shared_cpu_list", "11,26" },
124 { "/sys/devices/system/cpu/cpu11/cache/index0/type", "Data" },
125 { "/sys/devices/system/cpu/cpu11/cache/index1/shared_cpu_list", "11,26" },
126 { "/sys/devices/system/cpu/cpu11/cache/index1/type", "Instruction" },
127 { "/sys/devices/system/cpu/cpu11/cache/index2/shared_cpu_list", "11,26" },
128 { "/sys/devices/system/cpu/cpu11/cache/index2/type", "Unified" },
129 { "/sys/devices/system/cpu/cpu11/cache/index3/shared_cpu_list", "9-16,24-31"},
130 { "/sys/devices/system/cpu/cpu11/cache/index3/type", "Unified" },
131 { "/sys/devices/system/cpu/cpu12/cache/index0/shared_cpu_list", "12,27" },
132 { "/sys/devices/system/cpu/cpu12/cache/index0/type", "Data" },
133 { "/sys/devices/system/cpu/cpu12/cache/index1/shared_cpu_list", "12,27" },
134 { "/sys/devices/system/cpu/cpu12/cache/index1/type", "Instruction" },
135 { "/sys/devices/system/cpu/cpu12/cache/index2/shared_cpu_list", "12,27" },
136 { "/sys/devices/system/cpu/cpu12/cache/index2/type", "Unified" },
137 { "/sys/devices/system/cpu/cpu12/cache/index3/shared_cpu_list", "9-16,24-31"},
138 { "/sys/devices/system/cpu/cpu12/cache/index3/type", "Unified" },
139 { "/sys/devices/system/cpu/cpu13/cache/index0/shared_cpu_list", "13,28" },
140 { "/sys/devices/system/cpu/cpu13/cache/index0/type", "Data" },
141 { "/sys/devices/system/cpu/cpu13/cache/index1/shared_cpu_list", "13,28" },
142 { "/sys/devices/system/cpu/cpu13/cache/index1/type", "Instruction" },
143 { "/sys/devices/system/cpu/cpu13/cache/index2/shared_cpu_list", "13,28" },
144 { "/sys/devices/system/cpu/cpu13/cache/index2/type", "Unified" },
145 { "/sys/devices/system/cpu/cpu13/cache/index3/shared_cpu_list", "9-16,24-31"},
146 { "/sys/devices/system/cpu/cpu13/cache/index3/type", "Unified" },
147 { "/sys/devices/system/cpu/cpu14/cache/index0/shared_cpu_list", "14,29" },
148 { "/sys/devices/system/cpu/cpu14/cache/index0/type", "Data" },
149 { "/sys/devices/system/cpu/cpu14/cache/index1/shared_cpu_list", "14,29" },
150 { "/sys/devices/system/cpu/cpu14/cache/index1/type", "Instruction" },
151 { "/sys/devices/system/cpu/cpu14/cache/index2/shared_cpu_list", "14,29" },
152 { "/sys/devices/system/cpu/cpu14/cache/index2/type", "Unified" },
153 { "/sys/devices/system/cpu/cpu14/cache/index3/shared_cpu_list", "9-16,24-31"},
154 { "/sys/devices/system/cpu/cpu14/cache/index3/type", "Unified" },
155 { "/sys/devices/system/cpu/cpu15/cache/index0/shared_cpu_list", "15,30" },
156 { "/sys/devices/system/cpu/cpu15/cache/index0/type", "Data" },
157 { "/sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_list", "15,30" },
158 { "/sys/devices/system/cpu/cpu15/cache/index1/type", "Instruction" },
159 { "/sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_list", "15,30" },
160 { "/sys/devices/system/cpu/cpu15/cache/index2/type", "Unified" },
161 { "/sys/devices/system/cpu/cpu15/cache/index3/shared_cpu_list", "9-16,24-31"},
162 { "/sys/devices/system/cpu/cpu15/cache/index3/type", "Unified" },
163 { "/sys/devices/system/cpu/cpu16/cache/index0/shared_cpu_list", "16,31" },
164 { "/sys/devices/system/cpu/cpu16/cache/index0/type", "Data" },
165 { "/sys/devices/system/cpu/cpu16/cache/index1/shared_cpu_list", "16,31" },
166 { "/sys/devices/system/cpu/cpu16/cache/index1/type", "Instruction" },
167 { "/sys/devices/system/cpu/cpu16/cache/index2/shared_cpu_list", "16,31" },
168 { "/sys/devices/system/cpu/cpu16/cache/index2/type", "Unified" },
169 { "/sys/devices/system/cpu/cpu16/cache/index3/shared_cpu_list", "9-16,24-31"},
170 { "/sys/devices/system/cpu/cpu16/cache/index3/type", "Unified" },
171 { "/sys/devices/system/cpu/cpu17/cache/index0/shared_cpu_list", "0,17" },
172 { "/sys/devices/system/cpu/cpu17/cache/index0/type", "Data" },
173 { "/sys/devices/system/cpu/cpu17/cache/index1/shared_cpu_list", "0,17" },
174 { "/sys/devices/system/cpu/cpu17/cache/index1/type", "Instruction" },
175 { "/sys/devices/system/cpu/cpu17/cache/index2/shared_cpu_list", "0,17" },
176 { "/sys/devices/system/cpu/cpu17/cache/index2/type", "Unified" },
177 { "/sys/devices/system/cpu/cpu17/cache/index3/shared_cpu_list", "0-8,17-23" },
178 { "/sys/devices/system/cpu/cpu17/cache/index3/type", "Unified" },
179 { "/sys/devices/system/cpu/cpu18/cache/index0/shared_cpu_list", "1,18" },
180 { "/sys/devices/system/cpu/cpu18/cache/index0/type", "Data" },
181 { "/sys/devices/system/cpu/cpu18/cache/index1/shared_cpu_list", "1,18" },
182 { "/sys/devices/system/cpu/cpu18/cache/index1/type", "Instruction" },
183 { "/sys/devices/system/cpu/cpu18/cache/index2/shared_cpu_list", "1,18" },
184 { "/sys/devices/system/cpu/cpu18/cache/index2/type", "Unified" },
185 { "/sys/devices/system/cpu/cpu18/cache/index3/shared_cpu_list", "0-8,17-23" },
186 { "/sys/devices/system/cpu/cpu18/cache/index3/type", "Unified" },
187 { "/sys/devices/system/cpu/cpu19/cache/index0/shared_cpu_list", "2,19" },
188 { "/sys/devices/system/cpu/cpu19/cache/index0/type", "Data" },
189 { "/sys/devices/system/cpu/cpu19/cache/index1/shared_cpu_list", "2,19" },
190 { "/sys/devices/system/cpu/cpu19/cache/index1/type", "Instruction" },
191 { "/sys/devices/system/cpu/cpu19/cache/index2/shared_cpu_list", "2,19" },
192 { "/sys/devices/system/cpu/cpu19/cache/index2/type", "Unified" },
193 { "/sys/devices/system/cpu/cpu19/cache/index3/shared_cpu_list", "0-8,17-23" },
194 { "/sys/devices/system/cpu/cpu19/cache/index3/type", "Unified" },
195 { "/sys/devices/system/cpu/cpu20/cache/index0/shared_cpu_list", "3,20" },
196 { "/sys/devices/system/cpu/cpu20/cache/index0/type", "Data" },
197 { "/sys/devices/system/cpu/cpu20/cache/index1/shared_cpu_list", "3,20" },
198 { "/sys/devices/system/cpu/cpu20/cache/index1/type", "Instruction" },
199 { "/sys/devices/system/cpu/cpu20/cache/index2/shared_cpu_list", "3,20" },
200 { "/sys/devices/system/cpu/cpu20/cache/index2/type", "Unified" },
201 { "/sys/devices/system/cpu/cpu20/cache/index3/shared_cpu_list", "0-8,17-23" },
202 { "/sys/devices/system/cpu/cpu20/cache/index3/type", "Unified" },
203 { "/sys/devices/system/cpu/cpu21/cache/index0/shared_cpu_list", "4,21" },
204 { "/sys/devices/system/cpu/cpu21/cache/index0/type", "Data" },
205 { "/sys/devices/system/cpu/cpu21/cache/index1/shared_cpu_list", "4,21" },
206 { "/sys/devices/system/cpu/cpu21/cache/index1/type", "Instruction" },
207 { "/sys/devices/system/cpu/cpu21/cache/index2/shared_cpu_list", "4,21" },
208 { "/sys/devices/system/cpu/cpu21/cache/index2/type", "Unified" },
209 { "/sys/devices/system/cpu/cpu21/cache/index3/shared_cpu_list", "0-8,17-23" },
210 { "/sys/devices/system/cpu/cpu21/cache/index3/type", "Unified" },
211 { "/sys/devices/system/cpu/cpu22/cache/index0/shared_cpu_list", "7,22" },
212 { "/sys/devices/system/cpu/cpu22/cache/index0/type", "Data" },
213 { "/sys/devices/system/cpu/cpu22/cache/index1/shared_cpu_list", "7,22" },
214 { "/sys/devices/system/cpu/cpu22/cache/index1/type", "Instruction" },
215 { "/sys/devices/system/cpu/cpu22/cache/index2/shared_cpu_list", "7,22" },
216 { "/sys/devices/system/cpu/cpu22/cache/index2/type", "Unified" },
217 { "/sys/devices/system/cpu/cpu22/cache/index3/shared_cpu_list", "0-8,17-23" },
218 { "/sys/devices/system/cpu/cpu22/cache/index3/type", "Unified" },
219 { "/sys/devices/system/cpu/cpu23/cache/index0/shared_cpu_list", "8,23" },
220 { "/sys/devices/system/cpu/cpu23/cache/index0/type", "Data" },
221 { "/sys/devices/system/cpu/cpu23/cache/index1/shared_cpu_list", "8,23" },
222 { "/sys/devices/system/cpu/cpu23/cache/index1/type", "Instruction" },
223 { "/sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_list", "8,23" },
224 { "/sys/devices/system/cpu/cpu23/cache/index2/type", "Unified" },
225 { "/sys/devices/system/cpu/cpu23/cache/index3/shared_cpu_list", "0-8,17-23" },
226 { "/sys/devices/system/cpu/cpu23/cache/index3/type", "Unified" },
227 { "/sys/devices/system/cpu/cpu24/cache/index0/shared_cpu_list", "9,24" },
228 { "/sys/devices/system/cpu/cpu24/cache/index0/type", "Data" },
229 { "/sys/devices/system/cpu/cpu24/cache/index1/shared_cpu_list", "9,24" },
230 { "/sys/devices/system/cpu/cpu24/cache/index1/type", "Instruction" },
231 { "/sys/devices/system/cpu/cpu24/cache/index2/shared_cpu_list", "9,24" },
232 { "/sys/devices/system/cpu/cpu24/cache/index2/type", "Unified" },
233 { "/sys/devices/system/cpu/cpu24/cache/index3/shared_cpu_list", "9-16,24-31"},
234 { "/sys/devices/system/cpu/cpu24/cache/index3/type", "Unified" },
235 { "/sys/devices/system/cpu/cpu25/cache/index0/shared_cpu_list", "10,25" },
236 { "/sys/devices/system/cpu/cpu25/cache/index0/type", "Data" },
237 { "/sys/devices/system/cpu/cpu25/cache/index1/shared_cpu_list", "10,25" },
238 { "/sys/devices/system/cpu/cpu25/cache/index1/type", "Instruction" },
239 { "/sys/devices/system/cpu/cpu25/cache/index2/shared_cpu_list", "10,25" },
240 { "/sys/devices/system/cpu/cpu25/cache/index2/type", "Unified" },
241 { "/sys/devices/system/cpu/cpu25/cache/index3/shared_cpu_list", "9-16,24-31"},
242 { "/sys/devices/system/cpu/cpu25/cache/index3/type", "Unified" },
243 { "/sys/devices/system/cpu/cpu26/cache/index0/shared_cpu_list", "11,26" },
244 { "/sys/devices/system/cpu/cpu26/cache/index0/type", "Data" },
245 { "/sys/devices/system/cpu/cpu26/cache/index1/shared_cpu_list", "11,26" },
246 { "/sys/devices/system/cpu/cpu26/cache/index1/type", "Instruction" },
247 { "/sys/devices/system/cpu/cpu26/cache/index2/shared_cpu_list", "11,26" },
248 { "/sys/devices/system/cpu/cpu26/cache/index2/type", "Unified" },
249 { "/sys/devices/system/cpu/cpu26/cache/index3/shared_cpu_list", "9-16,24-31"},
250 { "/sys/devices/system/cpu/cpu26/cache/index3/type", "Unified" },
251 { "/sys/devices/system/cpu/cpu27/cache/index0/shared_cpu_list", "12,27" },
252 { "/sys/devices/system/cpu/cpu27/cache/index0/type", "Data" },
253 { "/sys/devices/system/cpu/cpu27/cache/index1/shared_cpu_list", "12,27" },
254 { "/sys/devices/system/cpu/cpu27/cache/index1/type", "Instruction" },
255 { "/sys/devices/system/cpu/cpu27/cache/index2/shared_cpu_list", "12,27" },
256 { "/sys/devices/system/cpu/cpu27/cache/index2/type", "Unified" },
257 { "/sys/devices/system/cpu/cpu27/cache/index3/shared_cpu_list", "9-16,24-31"},
258 { "/sys/devices/system/cpu/cpu27/cache/index3/type", "Unified" },
259 { "/sys/devices/system/cpu/cpu28/cache/index0/shared_cpu_list", "13,28" },
260 { "/sys/devices/system/cpu/cpu28/cache/index0/type", "Data" },
261 { "/sys/devices/system/cpu/cpu28/cache/index1/shared_cpu_list", "13,28" },
262 { "/sys/devices/system/cpu/cpu28/cache/index1/type", "Instruction" },
263 { "/sys/devices/system/cpu/cpu28/cache/index2/shared_cpu_list", "13,28" },
264 { "/sys/devices/system/cpu/cpu28/cache/index2/type", "Unified" },
265 { "/sys/devices/system/cpu/cpu28/cache/index3/shared_cpu_list", "9-16,24-31"},
266 { "/sys/devices/system/cpu/cpu28/cache/index3/type", "Unified" },
267 { "/sys/devices/system/cpu/cpu29/cache/index0/shared_cpu_list", "14,29" },
268 { "/sys/devices/system/cpu/cpu29/cache/index0/type", "Data" },
269 { "/sys/devices/system/cpu/cpu29/cache/index1/shared_cpu_list", "14,29" },
270 { "/sys/devices/system/cpu/cpu29/cache/index1/type", "Instruction" },
271 { "/sys/devices/system/cpu/cpu29/cache/index2/shared_cpu_list", "14,29" },
272 { "/sys/devices/system/cpu/cpu29/cache/index2/type", "Unified" },
273 { "/sys/devices/system/cpu/cpu29/cache/index3/shared_cpu_list", "9-16,24-31"},
274 { "/sys/devices/system/cpu/cpu29/cache/index3/type", "Unified" },
275 { "/sys/devices/system/cpu/cpu30/cache/index0/shared_cpu_list", "15,30" },
276 { "/sys/devices/system/cpu/cpu30/cache/index0/type", "Data" },
277 { "/sys/devices/system/cpu/cpu30/cache/index1/shared_cpu_list", "15,30" },
278 { "/sys/devices/system/cpu/cpu30/cache/index1/type", "Instruction" },
279 { "/sys/devices/system/cpu/cpu30/cache/index2/shared_cpu_list", "15,30" },
280 { "/sys/devices/system/cpu/cpu30/cache/index2/type", "Unified" },
281 { "/sys/devices/system/cpu/cpu30/cache/index3/shared_cpu_list", "9-16,24-31"},
282 { "/sys/devices/system/cpu/cpu30/cache/index3/type", "Unified" },
283 { "/sys/devices/system/cpu/cpu31/cache/index0/shared_cpu_list", "16,31" },
284 { "/sys/devices/system/cpu/cpu31/cache/index0/type", "Data" },
285 { "/sys/devices/system/cpu/cpu31/cache/index1/shared_cpu_list", "16,31" },
286 { "/sys/devices/system/cpu/cpu31/cache/index1/type", "Instruction" },
287 { "/sys/devices/system/cpu/cpu31/cache/index2/shared_cpu_list", "16,31" },
288 { "/sys/devices/system/cpu/cpu31/cache/index2/type", "Unified" },
289 { "/sys/devices/system/cpu/cpu31/cache/index3/shared_cpu_list", "9-16,24-31"},
290 { "/sys/devices/system/cpu/cpu31/cache/index3/type", "Unified" }
293 /// This is the expected CacheLocality structure for fakeSysfsTree
///
/// The tests compare its numCpus, numCachesByLevel and localityIndexByCpu
/// members against the parsed result, and also hand it to AccessSpreader
/// as the locality to spread over.
/// NOTE(review): the initializers for the leading members are not visible
/// here; the 32-entry list below is presumably localityIndexByCpu - confirm.
294 static const CacheLocality nonUniformExampleLocality = {
// Entry c is the position of cpu c in locality order: cpu 17 maps to 1
// (it shares caches with cpu 0), and cpus 5 and 6 map to the adjacent
// values 10 and 11.
297 { 0, 2, 4, 6, 8, 10, 11, 12, 14, 16, 18, 20, 22, 24, 26, 28,
298 30, 1, 3, 5, 7, 9, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 }
// Parses fakeSysfsTree through CacheLocality::readFromSysfsTree and checks
// the result, member by member, against nonUniformExampleLocality.
301 TEST(CacheLocality, FakeSysfs) {
// The callback stands in for reading a sysfs file: it looks the path up in
// the map and yields an empty string for any path that is not present.
302 auto parsed = CacheLocality::readFromSysfsTree([](std::string name) {
303 auto iter = fakeSysfsTree.find(name);
304 return iter == fakeSysfsTree.end() ? std::string() : iter->second;
307 auto& expected = nonUniformExampleLocality;
308 EXPECT_EQ(expected.numCpus, parsed.numCpus);
309 EXPECT_EQ(expected.numCachesByLevel, parsed.numCachesByLevel);
310 EXPECT_EQ(expected.localityIndexByCpu, parsed.localityIndexByCpu);
// Calls the function returned by Getcpu::vdsoFunc() and checks that the cpu
// id it reports is below CPU_SETSIZE.
// NOTE(review): the declaration of `cpu` is not visible here. The result of
// vdsoFunc() is invoked unconditionally, while the benchmark notes in this
// file mention a fallback for when the vdso getcpu isn't available - confirm
// the returned pointer cannot be null on the platforms this test runs on.
313 TEST(Getcpu, VdsoGetcpu) {
// Only the cpu output is requested; the other two arguments are null.
315 Getcpu::vdsoFunc()(&cpu, nullptr, nullptr);
317 EXPECT_TRUE(cpu < CPU_SETSIZE);
// Exercises SequentialThreadId<std::atomic>::getcpu: the id it reports must
// be greater than zero, and a second call from this test must report the
// same id again.
// NOTE(review): the declarations of `cpu` and `again`, and any check on
// `rv`, are not visible here.
320 TEST(SequentialThreadId, Simple) {
322 auto rv = SequentialThreadId<std::atomic>::getcpu(&cpu, nullptr, nullptr);
324 EXPECT_TRUE(cpu > 0);
// Ask again and require a stable answer.
326 SequentialThreadId<std::atomic>::getcpu(&again, nullptr, nullptr);
327 EXPECT_EQ(cpu, again);
// Cpu number the tests want the stubbed getcpu to report. Declared with
// FOLLY_TLS and initialized to 0; the Stubbed and Wrapping tests assign it
// before calling AccessSpreader::current().
330 static FOLLY_TLS unsigned testingCpu = 0;
// Stub with a getcpu-style signature; the tests pass its address to the
// AccessSpreader constructor so that they control which cpu is reported.
// Either output pointer may be null, and each is checked before use.
// NOTE(review): the statements inside the two branches and the return value
// are not visible here; presumably both outputs receive testingCpu - confirm.
332 static int testingGetcpu(unsigned* cpu, unsigned* node, void* unused) {
333 if (cpu != nullptr) {
336 if (node != nullptr) {
// Drives AccessSpreader with the stubbed testingGetcpu and checks which
// stripe is selected for every cpu of nonUniformExampleLocality, for each
// stripe count from 1 to 99.
342 TEST(AccessSpreader, Stubbed) {
// Slot s holds a spreader with s stripes; slot 0 is never filled or used.
343 std::vector<std::unique_ptr<AccessSpreader<>>> spreaders(100);
344 for (size_t s = 1; s < spreaders.size(); ++s) {
345 spreaders[s].reset(new AccessSpreader<>(
346 s, nonUniformExampleLocality, &testingGetcpu));
// The 32 cpus listed so that cpus sharing caches are adjacent; the position
// of a cpu here equals its entry in nonUniformExampleLocality.
348 std::vector<size_t> cpusInLocalityOrder = {
349 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 6, 7, 22, 8, 23, 9, 24, 10, 25,
350 11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31 };
351 for (size_t i = 0; i < 32; ++i) {
352 // extra i * 64 is to check wrapping behavior of impl
353 testingCpu = cpusInLocalityOrder[i] + i * 64;
// With s stripes, the i-th cpu in locality order (out of 32) is expected to
// land on stripe (i * s) / 32.
354 for (size_t s = 1; s < spreaders.size(); ++s) {
355 EXPECT_EQ((i * s) / 32, spreaders[s]->current())
356 << "i=" << i << ", cpu=" << testingCpu << ", s=" << s;
// A spreader built with only a stripe count (16) must report a current
// stripe below that count.
361 TEST(AccessSpreader, Default) {
362 AccessSpreader<> spreader(16);
363 EXPECT_LT(spreader.current(), 16);
// For every stripe count from 1 to 199, the instance returned by
// AccessSpreader<>::shared(s) must report a current stripe below s.
366 TEST(AccessSpreader, Shared) {
367 for (size_t s = 1; s < 200; ++s) {
368 EXPECT_LT(AccessSpreader<>::shared(s).current(), s);
// Logs the stripe counts of the static stripeByCore and stripeByChip
// spreaders (informational only, nothing is asserted about them), then
// checks that the static AccessSpreader<>::current(s) stays below s for
// every s from 1 to 199.
372 TEST(AccessSpreader, Statics) {
373 LOG(INFO) << "stripeByCore.numStripes() = "
374 << AccessSpreader<>::stripeByCore.numStripes();
375 LOG(INFO) << "stripeByChip.numStripes() = "
376 << AccessSpreader<>::stripeByChip.numStripes();
377 for (size_t s = 1; s < 200; ++s) {
378 EXPECT_LT(AccessSpreader<>::current(s), s);
// Checks that cpu numbers at or above numCpus are treated like the cpu
// number reduced modulo numCpus, using a uniform locality and the stubbed
// testingGetcpu, for stripe counts 1..199 and cpu values 0..399.
// NOTE(review): the definition of `numCpus`, and the assignment that
// presumably sets testingCpu to the unreduced `c` before `observed` is read,
// are not visible here.
382 TEST(AccessSpreader, Wrapping) {
383 // this test won't pass unless locality.numCpus divides kMaxCpus
385 auto locality = CacheLocality::uniform(numCpus);
386 for (size_t s = 1; s < 200; ++s) {
387 AccessSpreader<> spreader(s, locality, &testingGetcpu);
388 for (size_t c = 0; c < 400; ++c) {
// `observed` is read first; `expected` is read after testingCpu has been
// reduced modulo numCpus. The two must agree.
390 auto observed = spreader.current();
391 testingCpu = c % numCpus;
392 auto expected = spreader.current();
393 EXPECT_EQ(expected, observed)
394 << "numCpus=" << numCpus << ", s=" << s << ", c=" << c;
399 // Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with
401 // ============================================================================
402 // folly/test/CacheLocalityTest.cpp relative time/iter iters/s
403 // ============================================================================
404 // LocalAccessSpreaderUse 20.77ns 48.16M
405 // SharedAccessSpreaderUse 21.95ns 45.55M
406 // AccessSpreaderConstruction 466.56ns 2.14M
407 // ============================================================================
// Measures current() on a locally constructed 16-stripe AccessSpreader,
// called `iters` times.
409 BENCHMARK(LocalAccessSpreaderUse, iters) {
// The spreader is built while the BenchmarkSuspender is held, presumably so
// that construction is excluded from the measured time.
// NOTE(review): the point where the suspender is released is not visible
// here.
410 folly::BenchmarkSuspender braces;
411 AccessSpreader<> spreader(16);
414 for (unsigned long i = 0; i < iters; ++i) {
415 auto x = spreader.current();
// Pass the result to doNotOptimizeAway so the call is not elided.
416 folly::doNotOptimizeAway(x);
// Measures the static AccessSpreader<>::current(16) entry point, called
// `iters` times; no local spreader is constructed.
420 BENCHMARK(SharedAccessSpreaderUse, iters) {
421 for (unsigned long i = 0; i < iters; ++i) {
422 auto x = AccessSpreader<>::current(16);
// Pass the result to doNotOptimizeAway so the call is not elided.
423 folly::doNotOptimizeAway(x);
// Measures constructing and destroying a 16-stripe AccessSpreader, `iters`
// times.
427 BENCHMARK(AccessSpreaderConstruction, iters) {
// Raw storage sized and aligned for one AccessSpreader<>; it is reused by
// every iteration.
428 std::aligned_storage<sizeof(AccessSpreader<>),
429 std::alignment_of<AccessSpreader<>>::value>::type raw;
430 for (unsigned long i = 0; i < iters; ++i) {
// Construct in place with placement new, then run the destructor by hand
// because nothing else owns the object.
431 auto x = new (&raw) AccessSpreader<>(16);
432 folly::doNotOptimizeAway(x);
433 x->~AccessSpreader();
// Selects how contentionAtWidth picks a stripe for each increment:
//   GETCPU - a local AccessSpreader constructed with a null getcpu argument
//   SHARED - the static AccessSpreader<>::current(stripes)
//   TLS_RR - a local AccessSpreader constructed with
//            SequentialThreadId<std::atomic>::getcpu
437 enum class SpreaderType { GETCPU, SHARED, TLS_RR };
439 // Benchmark scores here reflect the time for 32 threads to perform an
440 // atomic increment on a dual-socket E5-2660 @ 2.2GHz. Surprisingly,
441 // if we don't separate the counters onto unique 128 byte stripes the
442 // 1_stripe and 2_stripe results are identical, even though the L3 is
443 // claimed to have 64 byte cache lines.
445 // _stub means there was no call to getcpu or the tls round-robin
446 // implementation, because for a single stripe the cpu doesn't matter.
447 // _getcpu refers to the vdso getcpu implementation with a locally
448 // constructed AccessSpreader. _tls_rr refers to execution using
449 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
450 // _shared refers to calling AccessSpreader<>::current(numStripes) inside
453 // At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic, so
454 // since the stripe selection is 6 nanos the atomic increment in the L1 is
455 // ~15 nanos. At width 8_stripe_0_work the line is expected to ping-pong
456 // almost every operation, since the loops have the same duration.
457 // Widths 4 and 2 have the same behavior, but each tour of the cache line
458 // is 4 and 8 cores long, respectively. These all suggest a lower bound
459 // of ~60 nanos for intra-chip handoff and increment between the L1s.
461 // With 396 nanos (500 std::memory_order_seq_cst loads) of busywork per
462 // contended increment, the system can hide all of the latency of a tour
463 // of length 4, but not quite one of length 8. I was a bit surprised
464 // at how much worse the non-striped version got. It seems that the
465 // inter-chip traffic also interferes with the L1-only localWork.load().
466 // When the local work is doubled to 776 nanoseconds we see that the
467 // inter-chip contention is still very important, but subdivisions on
468 // the same chip don't matter.
471 // _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
472 // ============================================================================
473 // folly/test/CacheLocalityTest.cpp relative time/iter iters/s
474 // ============================================================================
475 // LocalAccessSpreaderUse 6.34ns 157.75M
476 // SharedAccessSpreaderUse 6.34ns 157.75M
477 // AccessSpreaderConstruction 328.19ns 3.05M
478 // ----------------------------------------------------------------------------
479 // contentionAtWidth(1_stripe_0_work_stub) 909.99ns 1.10M
480 // contentionAtWidth(2_stripe_0_work_getcpu) 527.54ns 1.90M
481 // contentionAtWidth(4_stripe_0_work_getcpu) 260.28ns 3.84M
482 // contentionAtWidth(8_stripe_0_work_getcpu) 131.82ns 7.59M
483 // contentionAtWidth(16_stripe_0_work_getcpu) 25.92ns 38.58M
484 // contentionAtWidth(32_stripe_0_work_getcpu) 21.80ns 45.88M
485 // contentionAtWidth(64_stripe_0_work_getcpu) 20.06ns 49.85M
486 // contentionAtWidth(2_stripe_0_work_tls_rr) 759.21ns 1.32M
487 // contentionAtWidth(4_stripe_0_work_tls_rr) 607.46ns 1.65M
488 // contentionAtWidth(8_stripe_0_work_tls_rr) 403.79ns 2.48M
489 // contentionAtWidth(16_stripe_0_work_tls_rr) 188.14ns 5.32M
490 // contentionAtWidth(32_stripe_0_work_tls_rr) 131.59ns 7.60M
491 // contentionAtWidth(64_stripe_0_work_tls_rr) 103.56ns 9.66M
492 // contentionAtWidth(2_stripe_0_work_shared) 553.07ns 1.81M
493 // contentionAtWidth(4_stripe_0_work_shared) 274.23ns 3.65M
494 // contentionAtWidth(8_stripe_0_work_shared) 137.43ns 7.28M
495 // contentionAtWidth(16_stripe_0_work_shared) 24.52ns 40.78M
496 // contentionAtWidth(32_stripe_0_work_shared) 21.80ns 45.86M
497 // contentionAtWidth(64_stripe_0_work_shared) 21.66ns 46.17M
498 // atomicIncrBaseline(local_incr_0_work) 16.73ns 59.78M
499 // ----------------------------------------------------------------------------
500 // contentionAtWidth(1_stripe_500_work_stub) 1.75us 571.14K
501 // contentionAtWidth(2_stripe_500_work_getcpu) 500.79ns 2.00M
502 // contentionAtWidth(4_stripe_500_work_getcpu) 410.45ns 2.44M
503 // contentionAtWidth(8_stripe_500_work_getcpu) 411.41ns 2.43M
504 // contentionAtWidth(16_stripe_500_work_getcpu) 400.12ns 2.50M
505 // contentionAtWidth(32_stripe_500_work_getcpu) 397.37ns 2.52M
506 // atomicIncrBaseline(local_incr_500_work) 396.53ns 2.52M
507 // ----------------------------------------------------------------------------
508 // contentionAtWidth(1_stripe_1000_work_stub) 1.88us 530.59K
509 // contentionAtWidth(2_stripe_1000_work_getcpu) 778.77ns 1.28M
510 // contentionAtWidth(4_stripe_1000_work_getcpu) 779.56ns 1.28M
511 // contentionAtWidth(8_stripe_1000_work_getcpu) 795.62ns 1.26M
512 // contentionAtWidth(16_stripe_1000_work_getcpu) 778.81ns 1.28M
513 // contentionAtWidth(32_stripe_1000_work_getcpu) 780.26ns 1.28M
514 // atomicIncrBaseline(local_incr_1000_work) 776.39ns 1.29M
515 // ============================================================================
// Benchmark body: starts numThreads worker threads that each perform
// `iters` atomic increments, picking one of `stripes` counters for every
// increment via an AccessSpreader, with an inner busywork loop of `work`
// iterations after each increment.
//
//   iters            - increments performed by each worker
//   stripes          - number of counters, and the spreader width
//   work             - iterations of the inner busywork loop per increment
//   spreaderType     - SHARED uses AccessSpreader<>::current(stripes); any
//                      other value uses the local `spreader`, which is given
//                      SequentialThreadId<std::atomic>::getcpu for TLS_RR and
//                      a null getcpu argument otherwise
//   counterAlignment - byte distance between consecutive counters; must be at
//                      least sizeof(std::atomic<size_t>)
//   numThreads       - number of worker threads
//
// NOTE(review): several statements are not visible here, including one
// spreader constructor argument (presumably `stripes`), the bodies of the
// busywork and wait loops, and the point where timing is resumed.
516 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
517 SpreaderType spreaderType,
518 size_t counterAlignment = 128,
519 size_t numThreads = 32) {
520 folly::BenchmarkSuspender braces;
522 AccessSpreader<> spreader(
524 CacheLocality::system<std::atomic>(),
525 spreaderType == SpreaderType::TLS_RR
526 ? SequentialThreadId<std::atomic>::getcpu : nullptr);
// `ready` is polled against numThreads further down; `go` starts false.
// Presumably workers bump `ready` and then wait on `go` - confirm.
528 std::atomic<size_t> ready(0);
529 std::atomic<bool> go(false);
531 // while in theory the cache line size is 64 bytes, experiments show
532 // that we get contention on 128 byte boundaries for Ivy Bridge. The
533 // extra indirection adds 1 or 2 nanos
534 assert(counterAlignment >= sizeof(std::atomic<size_t>));
// One slot of counterAlignment bytes per stripe; the counters are
// constructed inside this buffer by the worker threads.
535 std::vector<char> raw(counterAlignment * stripes);
537 // if we happen to be using the tlsRoundRobin, then sequentially
538 // assigning the thread identifiers is the unlikely best-case scenario.
539 // We don't want to unfairly benefit or penalize. Computing the exact
540 // maximum likelihood of the probability distributions is annoying, so
541 // I approximate as 2/5 of the ids that have no threads, 2/5 that have
542 // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
543 // wrapping back to slot 0 when we hit 1/15 and 1/5.
545 std::vector<std::thread> threads;
546 while (threads.size() < numThreads) {
// Everything is captured by reference except iters, stripes and work, which
// are copied into the closure.
547 threads.push_back(std::thread([&,iters,stripes,work]() {
// NOTE(review): `stripes` is a runtime value, so this is a variable-length
// array, which is a compiler extension rather than standard C++.
548 std::atomic<size_t>* counters[stripes];
549 for (size_t i = 0; i < stripes; ++i) {
// Counter i lives at byte offset counterAlignment * i in the shared buffer.
// NOTE(review): every worker runs this placement new over the same
// addresses - confirm that re-constructing the counters from several
// threads is intended and benign.
551 = new (raw.data() + counterAlignment * i) std::atomic<size_t>();
559 std::atomic<int> localWork(0);
// The two branches differ only in how the stripe index is obtained.
560 if (spreaderType == SpreaderType::SHARED) {
561 for (size_t i = iters; i > 0; --i) {
562 ++*(counters[AccessSpreader<>::current(stripes)]);
563 for (size_t j = work; j > 0; --j) {
568 for (size_t i = iters; i > 0; --i) {
569 ++*(counters[spreader.current()]);
570 for (size_t j = work; j > 0; --j) {
// Applies the id-skewing scheme described above, once numThreads / 15 and
// once numThreads / 5 workers exist.
577 if (threads.size() == numThreads / 15 ||
578 threads.size() == numThreads / 5) {
579 // create a few dummy threads to wrap back around to 0 mod numCpus
580 for (size_t i = threads.size(); i != numThreads; ++i) {
// Loops until `ready` has reached numThreads.
588 while (ready < numThreads) {
594 for (auto& thr : threads) {
// Baseline for contentionAtWidth: numThreads threads each run `iters` outer
// iterations with an inner busywork loop of `work` iterations, using
// counters that are local to each thread rather than shared.
//
//   iters      - outer iterations per thread
//   work       - inner busywork iterations per outer iteration
//   numThreads - number of threads
//
// NOTE(review): the loop bodies, the start signalling through `go` and the
// join statements are not visible here.
599 static void atomicIncrBaseline(size_t iters, size_t work,
600 size_t numThreads = 32) {
601 folly::BenchmarkSuspender braces;
603 std::atomic<bool> go(false);
605 std::vector<std::thread> threads;
606 while (threads.size() < numThreads) {
// The closure captures by reference; iters, work and go belong to the
// enclosing call.
607 threads.push_back(std::thread([&]() {
// Both atomics are locals of the thread function, so no other thread
// touches them.
611 std::atomic<size_t> localCounter(0);
612 std::atomic<int> localWork(0);
613 for (size_t i = iters; i > 0; --i) {
615 for (size_t j = work; j > 0; --j) {
625 for (auto& thr : threads) {
// Benchmark registrations. Each BENCHMARK_NAMED_PARAM names the function to
// run, a label, and then the arguments that follow `iters`:
// (stripes, work, spreaderType) for contentionAtWidth and (work) for
// atomicIncrBaseline. counterAlignment and numThreads keep their defaults.
// The labels follow the <stripes>_stripe_<work>_work_<variant> scheme used
// in the results table earlier in this file.
// BENCHMARK_DRAW_LINE presumably emits the dashed separator rows seen in
// that table - confirm.
630 BENCHMARK_DRAW_LINE()
// --- work = 0 ---
// The single-stripe "_stub" case is registered with SpreaderType::GETCPU.
632 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_0_work_stub,
633 1, 0, SpreaderType::GETCPU)
634 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_getcpu,
635 2, 0, SpreaderType::GETCPU)
636 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_getcpu,
637 4, 0, SpreaderType::GETCPU)
638 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_getcpu,
639 8, 0, SpreaderType::GETCPU)
640 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_getcpu,
641 16, 0, SpreaderType::GETCPU)
642 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_getcpu,
643 32, 0, SpreaderType::GETCPU)
644 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_getcpu,
645 64, 0, SpreaderType::GETCPU)
// Same widths using the SequentialThreadId-based spreader.
646 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_tls_rr,
647 2, 0, SpreaderType::TLS_RR)
648 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_tls_rr,
649 4, 0, SpreaderType::TLS_RR)
650 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_tls_rr,
651 8, 0, SpreaderType::TLS_RR)
652 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_tls_rr,
653 16, 0, SpreaderType::TLS_RR)
654 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr,
655 32, 0, SpreaderType::TLS_RR)
656 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
657 64, 0, SpreaderType::TLS_RR)
// Same widths using the static AccessSpreader<>::current(stripes).
658 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
659 2, 0, SpreaderType::SHARED)
660 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
661 4, 0, SpreaderType::SHARED)
662 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_shared,
663 8, 0, SpreaderType::SHARED)
664 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_shared,
665 16, 0, SpreaderType::SHARED)
666 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_shared,
667 32, 0, SpreaderType::SHARED)
668 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_shared,
669 64, 0, SpreaderType::SHARED)
670 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
671 BENCHMARK_DRAW_LINE()
// --- work = 500 ---
672 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_500_work_stub,
673 1, 500, SpreaderType::GETCPU)
674 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_500_work_getcpu,
675 2, 500, SpreaderType::GETCPU)
676 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_500_work_getcpu,
677 4, 500, SpreaderType::GETCPU)
678 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_500_work_getcpu,
679 8, 500, SpreaderType::GETCPU)
680 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_500_work_getcpu,
681 16, 500, SpreaderType::GETCPU)
682 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_500_work_getcpu,
683 32, 500, SpreaderType::GETCPU)
684 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
685 BENCHMARK_DRAW_LINE()
// --- work = 1000 ---
686 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_1000_work_stub,
687 1, 1000, SpreaderType::GETCPU)
688 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_1000_work_getcpu,
689 2, 1000, SpreaderType::GETCPU)
690 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_1000_work_getcpu,
691 4, 1000, SpreaderType::GETCPU)
692 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_1000_work_getcpu,
693 8, 1000, SpreaderType::GETCPU)
694 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_1000_work_getcpu,
695 16, 1000, SpreaderType::GETCPU)
696 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_1000_work_getcpu,
697 32, 1000, SpreaderType::GETCPU)
698 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
// Entry point: initializes googletest, parses the remaining command-line
// flags with gflags, and runs every test. The benchmarks run only when the
// tests returned 0 and the `benchmark` flag is set.
// NOTE(review): the return statement is not visible here; presumably `ret`
// is returned - confirm.
701 int main(int argc, char** argv) {
// googletest sees argc/argv first, then gflags parses what is left.
702 testing::InitGoogleTest(&argc, argv);
703 gflags::ParseCommandLineFlags(&argc, &argv, true);
704 auto ret = RUN_ALL_TESTS();
705 if (!ret && FLAGS_benchmark) {
706 folly::runBenchmarks();