2 * Copyright 2013 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include "folly/detail/CacheLocality.h"
22 #include <type_traits>
23 #include <unordered_map>
24 #include <glog/logging.h>
25 #include <gtest/gtest.h>
26 #include "folly/Benchmark.h"
28 using namespace folly::detail;
/// Fake sysfs contents taken from a production box with 32 logical cpus,
/// keyed by absolute sysfs path.  For every cpu N in [0, 32) and every
/// cache index I in [0, 4) the map holds
///
///   /sys/devices/system/cpu/cpuN/cache/indexI/shared_cpu_list
///   /sys/devices/system/cpu/cpuN/cache/indexI/type
///
/// The entries are generated rather than spelled out, but every key and
/// value is the same as in the 256-line literal snapshot this replaces.
static std::unordered_map<std::string,std::string> fakeSysfsTree = [] {
  // shared_cpu_list reported by index0, index1 and index2, by cpu number
  static const char* const kCoreSiblings[] = {
    "0,17",  "1,18",  "2,19",  "3,20",  "4,21",  "5-6",   "5-6",   "7,22",
    "8,23",  "9,24",  "10,25", "11,26", "12,27", "13,28", "14,29", "15,30",
    "16,31", "0,17",  "1,18",  "2,19",  "3,20",  "4,21",  "7,22",  "8,23",
    "9,24",  "10,25", "11,26", "12,27", "13,28", "14,29", "15,30", "16,31"
  };
  // contents of the "type" node, by cache index
  static const char* const kTypeByIndex[] = {
    "Data", "Instruction", "Unified", "Unified"
  };
  const size_t numCpus = sizeof(kCoreSiblings) / sizeof(kCoreSiblings[0]);
  const size_t numIndexes = sizeof(kTypeByIndex) / sizeof(kTypeByIndex[0]);

  std::unordered_map<std::string,std::string> tree;
  for (size_t cpu = 0; cpu < numCpus; ++cpu) {
    // index3 is shared by one of two groups of cpus
    const bool inFirstGroup = cpu <= 8 || (cpu >= 17 && cpu <= 23);
    const std::string groupSiblings =
        inFirstGroup ? "0-8,17-23" : "9-16,24-31";

    for (size_t index = 0; index < numIndexes; ++index) {
      const std::string dir = "/sys/devices/system/cpu/cpu" +
          std::to_string(cpu) + "/cache/index" + std::to_string(index) + "/";
      tree[dir + "shared_cpu_list"] =
          index == 3 ? groupSiblings : std::string(kCoreSiblings[cpu]);
      tree[dir + "type"] = kTypeByIndex[index];
    }
  }
  return tree;
}();
293 /// This is the expected CacheLocality structure for fakeSysfsTree
294 static const CacheLocality nonUniformExampleLocality = {
297 { 0, 2, 4, 6, 8, 10, 11, 12, 14, 16, 18, 20, 22, 24, 26, 28,
298 30, 1, 3, 5, 7, 9, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 }
301 TEST(CacheLocality, FakeSysfs) {
302 auto parsed = CacheLocality::readFromSysfsTree([](std::string name) {
303 auto iter = fakeSysfsTree.find(name);
304 return iter == fakeSysfsTree.end() ? std::string() : iter->second;
307 auto& expected = nonUniformExampleLocality;
308 EXPECT_EQ(expected.numCpus, parsed.numCpus);
309 EXPECT_EQ(expected.numCachesByLevel, parsed.numCachesByLevel);
310 EXPECT_EQ(expected.localityIndexByCpu, parsed.localityIndexByCpu);
313 TEST(Getcpu, VdsoGetcpu) {
315 Getcpu::vdsoFunc()(&cpu, nullptr, nullptr);
317 EXPECT_TRUE(cpu < CPU_SETSIZE);
320 TEST(SequentialThreadId, Simple) {
322 auto rv = SequentialThreadId<std::atomic>::getcpu(&cpu, nullptr, nullptr);
324 EXPECT_TRUE(cpu > 0);
326 SequentialThreadId<std::atomic>::getcpu(&again, nullptr, nullptr);
327 EXPECT_EQ(cpu, again);
330 static __thread unsigned testingCpu = 0;
332 static int testingGetcpu(unsigned* cpu, unsigned* node, void* unused) {
333 if (cpu != nullptr) {
336 if (node != nullptr) {
342 TEST(AccessSpreader, Stubbed) {
343 std::vector<std::unique_ptr<AccessSpreader<>>> spreaders(100);
344 for (size_t s = 1; s < spreaders.size(); ++s) {
345 spreaders[s].reset(new AccessSpreader<>(
346 s, nonUniformExampleLocality, &testingGetcpu));
348 std::vector<size_t> cpusInLocalityOrder = {
349 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 6, 7, 22, 8, 23, 9, 24, 10, 25,
350 11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31 };
351 for (size_t i = 0; i < 32; ++i) {
352 // extra i * 32 is to check wrapping behavior of impl
353 testingCpu = cpusInLocalityOrder[i] + i * 64;
354 for (size_t s = 1; s < spreaders.size(); ++s) {
355 EXPECT_EQ((i * s) / 32, spreaders[s]->current())
356 << "i=" << i << ", cpu=" << testingCpu << ", s=" << s;
361 TEST(AccessSpreader, Default) {
362 AccessSpreader<> spreader(16);
363 EXPECT_LT(spreader.current(), 16);
366 TEST(AccessSpreader, Shared) {
367 for (size_t s = 1; s < 200; ++s) {
368 EXPECT_LT(AccessSpreader<>::shared(s).current(), s);
372 TEST(AccessSpreader, Statics) {
373 LOG(INFO) << "stripeByCore.numStripes() = "
374 << AccessSpreader<>::stripeByCore.numStripes();
375 LOG(INFO) << "stripeByChip.numStripes() = "
376 << AccessSpreader<>::stripeByChip.numStripes();
377 for (size_t s = 1; s < 200; ++s) {
378 EXPECT_LT(AccessSpreader<>::current(s), s);
382 TEST(AccessSpreader, Wrapping) {
383 // this test won't pass unless locality.numCpus divides kMaxCpus
385 auto locality = CacheLocality::uniform(numCpus);
386 for (size_t s = 1; s < 200; ++s) {
387 AccessSpreader<> spreader(s, locality, &testingGetcpu);
388 for (size_t c = 0; c < 400; ++c) {
390 auto observed = spreader.current();
391 testingCpu = c % numCpus;
392 auto expected = spreader.current();
393 EXPECT_EQ(expected, observed)
394 << "numCpus=" << numCpus << ", s=" << s << ", c=" << c;
399 // Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with
401 // ============================================================================
402 // folly/test/CacheLocalityTest.cpp relative time/iter iters/s
403 // ============================================================================
404 // LocalAccessSpreaderUse 20.77ns 48.16M
405 // SharedAccessSpreaderUse 21.95ns 45.55M
406 // AccessSpreaderConstruction 466.56ns 2.14M
407 // ============================================================================
409 BENCHMARK(LocalAccessSpreaderUse, iters) {
410 folly::BenchmarkSuspender braces;
411 AccessSpreader<> spreader(16);
414 for (unsigned long i = 0; i < iters; ++i) {
415 auto x = spreader.current();
416 folly::doNotOptimizeAway(x);
420 BENCHMARK(SharedAccessSpreaderUse, iters) {
421 for (unsigned long i = 0; i < iters; ++i) {
422 auto x = AccessSpreader<>::current(16);
423 folly::doNotOptimizeAway(x);
427 BENCHMARK(AccessSpreaderConstruction, iters) {
428 std::aligned_storage<sizeof(AccessSpreader<>),
429 std::alignment_of<AccessSpreader<>>::value>::type raw;
430 for (unsigned long i = 0; i < iters; ++i) {
431 auto x = new (&raw) AccessSpreader<>(16);
432 folly::doNotOptimizeAway(x);
433 x->~AccessSpreader();
/// Selects how a contentionAtWidth run picks the stripe to increment.
enum class SpreaderType {
  GETCPU,  ///< locally constructed AccessSpreader, default getcpu function
  SHARED,  ///< AccessSpreader<>::current(stripes) called in the hot loop
  TLS_RR,  ///< locally constructed AccessSpreader using SequentialThreadId
};
439 // Benchmark scores here reflect the time for 32 threads to perform an
440 // atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly,
441 // if we don't separate the counters onto unique 128 byte stripes the
442 // 1_stripe and 2_stripe results are identical, even though the L3 is
443 // claimed to have 64 byte cache lines.
445 // _stub means there was no call to getcpu or the tls round-robin
446 // implementation, because for a single stripe the cpu doesn't matter.
447 // _getcpu refers to the vdso getcpu implementation with a locally
448 // constructed AccessSpreader. _tls_rr refers to execution using
449 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
450 // _shared refers to calling AccessSpreader<>::current(numStripes)
451 // inside the hot loop.
// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
// so since the stripe selection is 21 nanos the atomic increment in
// the L1 is ~15 nanos. At width 8_stripe_0_work the line is expected
456 // to ping-pong almost every operation, since the loops have the same
457 // duration. Widths 4 and 2 have the same behavior, but each tour of the
458 // cache line is 4 and 8 cores long, respectively. These all suggest a
459 // lower bound of 60 nanos for intra-chip handoff and increment between
462 // With 455 nanos (1K cycles) of busywork per contended increment, the
463 // system can hide all of the latency of a tour of length 4, but not
464 // quite one of length 8. I was a bit surprised at how much worse the
465 // non-striped version got. It seems that the inter-chip traffic also
466 // interferes with the L1-only localWork.load(). When the local work is
467 // doubled to about 1 microsecond we see that the inter-chip contention
468 // is still very important, but subdivisions on the same chip don't matter.
471 // _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
472 // ============================================================================
473 // folly/test/CacheLocalityTest.cpp relative time/iter iters/s
474 // ============================================================================
475 // contentionAtWidth(1_stripe_0_work_stub) 1.14us 873.64K
476 // contentionAtWidth(2_stripe_0_work_getcpu) 495.58ns 2.02M
477 // contentionAtWidth(4_stripe_0_work_getcpu) 232.99ns 4.29M
478 // contentionAtWidth(8_stripe_0_work_getcpu) 101.16ns 9.88M
479 // contentionAtWidth(16_stripe_0_work_getcpu) 41.93ns 23.85M
480 // contentionAtWidth(32_stripe_0_work_getcpu) 42.04ns 23.79M
481 // contentionAtWidth(64_stripe_0_work_getcpu) 41.94ns 23.84M
482 // contentionAtWidth(2_stripe_0_work_tls_rr) 1.00us 997.41K
483 // contentionAtWidth(4_stripe_0_work_tls_rr) 694.41ns 1.44M
484 // contentionAtWidth(8_stripe_0_work_tls_rr) 590.27ns 1.69M
485 // contentionAtWidth(16_stripe_0_work_tls_rr) 222.13ns 4.50M
486 // contentionAtWidth(32_stripe_0_work_tls_rr) 169.49ns 5.90M
487 // contentionAtWidth(64_stripe_0_work_tls_rr) 162.20ns 6.17M
488 // contentionAtWidth(2_stripe_0_work_shared) 495.54ns 2.02M
489 // contentionAtWidth(4_stripe_0_work_shared) 236.27ns 4.23M
490 // contentionAtWidth(8_stripe_0_work_shared) 114.81ns 8.71M
491 // contentionAtWidth(16_stripe_0_work_shared) 44.65ns 22.40M
492 // contentionAtWidth(32_stripe_0_work_shared) 41.76ns 23.94M
493 // contentionAtWidth(64_stripe_0_work_shared) 43.47ns 23.00M
494 // atomicIncrBaseline(local_incr_0_work) 20.39ns 49.06M
495 // ----------------------------------------------------------------------------
496 // contentionAtWidth(1_stripe_500_work_stub) 2.04us 491.13K
497 // contentionAtWidth(2_stripe_500_work_getcpu) 610.98ns 1.64M
498 // contentionAtWidth(4_stripe_500_work_getcpu) 507.72ns 1.97M
499 // contentionAtWidth(8_stripe_500_work_getcpu) 542.53ns 1.84M
500 // contentionAtWidth(16_stripe_500_work_getcpu) 496.55ns 2.01M
501 // contentionAtWidth(32_stripe_500_work_getcpu) 500.67ns 2.00M
502 // atomicIncrBaseline(local_incr_500_work) 484.69ns 2.06M
503 // ----------------------------------------------------------------------------
504 // contentionAtWidth(1_stripe_1000_work_stub) 2.11us 473.78K
505 // contentionAtWidth(2_stripe_1000_work_getcpu) 970.64ns 1.03M
506 // contentionAtWidth(4_stripe_1000_work_getcpu) 987.31ns 1.01M
507 // contentionAtWidth(8_stripe_1000_work_getcpu) 1.01us 985.52K
508 // contentionAtWidth(16_stripe_1000_work_getcpu) 986.09ns 1.01M
509 // contentionAtWidth(32_stripe_1000_work_getcpu) 960.23ns 1.04M
510 // atomicIncrBaseline(local_incr_1000_work) 950.63ns 1.05M
511 // ============================================================================
512 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
513 SpreaderType spreaderType,
514 size_t counterAlignment = 128,
515 size_t numThreads = 32) {
516 folly::BenchmarkSuspender braces;
518 AccessSpreader<> spreader(
520 CacheLocality::system<std::atomic>(),
521 spreaderType == SpreaderType::TLS_RR
522 ? SequentialThreadId<std::atomic>::getcpu : nullptr);
524 std::atomic<size_t> ready(0);
525 std::atomic<bool> go(false);
527 // while in theory the cache line size is 64 bytes, experiments show
528 // that we get contention on 128 byte boundaries for Ivy Bridge. The
529 // extra indirection adds 1 or 2 nanos
530 assert(counterAlignment >= sizeof(std::atomic<size_t>));
531 char raw[counterAlignment * stripes];
533 // if we happen to be using the tlsRoundRobin, then sequentially
534 // assigning the thread identifiers is the unlikely best-case scenario.
535 // We don't want to unfairly benefit or penalize. Computing the exact
536 // maximum likelihood of the probability distributions is annoying, so
537 // I approximate as 2/5 of the ids that have no threads, 2/5 that have
538 // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
539 // wrapping back to slot 0 when we hit 1/15 and 1/5.
541 std::vector<std::thread> threads;
542 while (threads.size() < numThreads) {
543 threads.push_back(std::thread([&,iters,stripes,work]() {
544 std::atomic<size_t>* counters[stripes];
545 for (size_t i = 0; i < stripes; ++i) {
546 counters[i] = new (raw + counterAlignment * i) std::atomic<size_t>();
554 std::atomic<int> localWork;
555 if (spreaderType == SpreaderType::SHARED) {
556 for (size_t i = iters; i > 0; --i) {
557 ++*(counters[AccessSpreader<>::current(stripes)]);
558 for (size_t j = work; j > 0; --j) {
563 for (size_t i = iters; i > 0; --i) {
564 ++*(counters[spreader.current()]);
565 for (size_t j = work; j > 0; --j) {
572 if (threads.size() == numThreads / 15 ||
573 threads.size() == numThreads / 5) {
574 // create a few dummy threads to wrap back around to 0 mod numCpus
575 for (size_t i = threads.size(); i != numThreads; ++i) {
583 while (ready < numThreads) {
589 for (auto& thr : threads) {
594 static void atomicIncrBaseline(size_t iters, size_t work,
595 size_t numThreads = 32) {
596 folly::BenchmarkSuspender braces;
598 std::atomic<bool> go(false);
600 std::vector<std::thread> threads;
601 while (threads.size() < numThreads) {
602 threads.push_back(std::thread([&]() {
606 std::atomic<size_t> localCounter;
607 std::atomic<int> localWork;
608 for (size_t i = iters; i > 0; --i) {
610 for (size_t j = work; j > 0; --j) {
620 for (auto& thr : threads) {
625 BENCHMARK_DRAW_LINE()
627 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_0_work_stub,
628 1, 0, SpreaderType::GETCPU)
629 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_getcpu,
630 2, 0, SpreaderType::GETCPU)
631 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_getcpu,
632 4, 0, SpreaderType::GETCPU)
633 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_getcpu,
634 8, 0, SpreaderType::GETCPU)
635 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_getcpu,
636 16, 0, SpreaderType::GETCPU)
637 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_getcpu,
638 32, 0, SpreaderType::GETCPU)
639 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_getcpu,
640 64, 0, SpreaderType::GETCPU)
641 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_tls_rr,
642 2, 0, SpreaderType::TLS_RR)
643 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_tls_rr,
644 4, 0, SpreaderType::TLS_RR)
645 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_tls_rr,
646 8, 0, SpreaderType::TLS_RR)
647 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_tls_rr,
648 16, 0, SpreaderType::TLS_RR)
649 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr,
650 32, 0, SpreaderType::TLS_RR)
651 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
652 64, 0, SpreaderType::TLS_RR)
653 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
654 2, 0, SpreaderType::SHARED)
655 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
656 4, 0, SpreaderType::SHARED)
657 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_shared,
658 8, 0, SpreaderType::SHARED)
659 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_shared,
660 16, 0, SpreaderType::SHARED)
661 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_shared,
662 32, 0, SpreaderType::SHARED)
663 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_shared,
664 64, 0, SpreaderType::SHARED)
665 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
666 BENCHMARK_DRAW_LINE()
667 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_500_work_stub,
668 1, 500, SpreaderType::GETCPU)
669 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_500_work_getcpu,
670 2, 500, SpreaderType::GETCPU)
671 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_500_work_getcpu,
672 4, 500, SpreaderType::GETCPU)
673 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_500_work_getcpu,
674 8, 500, SpreaderType::GETCPU)
675 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_500_work_getcpu,
676 16, 500, SpreaderType::GETCPU)
677 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_500_work_getcpu,
678 32, 500, SpreaderType::GETCPU)
679 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
680 BENCHMARK_DRAW_LINE()
681 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_1000_work_stub,
682 1, 1000, SpreaderType::GETCPU)
683 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_1000_work_getcpu,
684 2, 1000, SpreaderType::GETCPU)
685 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_1000_work_getcpu,
686 4, 1000, SpreaderType::GETCPU)
687 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_1000_work_getcpu,
688 8, 1000, SpreaderType::GETCPU)
689 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_1000_work_getcpu,
690 16, 1000, SpreaderType::GETCPU)
691 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_1000_work_getcpu,
692 32, 1000, SpreaderType::GETCPU)
693 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
696 int main(int argc, char** argv) {
697 testing::InitGoogleTest(&argc, argv);
698 google::ParseCommandLineFlags(&argc, &argv, true);
699 auto ret = RUN_ALL_TESTS();
700 if (!ret && FLAGS_benchmark) {
701 folly::runBenchmarks();