Baton::ready, a const variant of try_wait
[folly.git] / folly / test / MathBenchmark.cpp
1 /*
2  * Copyright 2017 Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <folly/Math.h>
18
19 #include <algorithm>
20 #include <random>
21
22 #include <folly/Benchmark.h>
23
24 namespace {
// Baseline for comparison only: the common "add denom - 1, then divide"
// idiom for rounding a quotient up.  It is not a correct ceiling in general:
// the biased sum can overflow (or wrap) for large operands, and the result is
// wrong for negative denominators (e.g. 7 / -2 yields -2 rather than -3).
template <typename T>
T brokenButWidespreadDivCeil(T num, T denom) {
  // Deliberately `auto`: for types narrower than int the sum is computed in
  // int (integral promotion) and must not be truncated back to T before the
  // division.
  const auto biasedNum = num + denom - 1;
  return biasedNum / denom;
}
29
// Ceiling division done in single-precision floating point: num is converted
// to float, divided by denom, rounded up with ceilf and cast back to T.
// float cannot represent every value of the wider integer types, so for those
// the result is only an approximation.
template <typename T>
T viaFloatDivCeil(T num, T denom) {
  const float quotient = static_cast<float>(num) / denom;
  return static_cast<T>(ceilf(quotient));
}
34
// Ceiling division done in double precision: num is converted to double,
// divided by denom, rounded up with ceil and cast back to T.
template <typename T>
T viaDoubleDivCeil(T num, T denom) {
  const double quotient = static_cast<double>(num) / denom;
  return static_cast<T>(ceil(quotient));
}
39
// Ceiling division done in long double precision: num is converted to
// long double, divided by denom, rounded up with ceill and cast back to T.
template <typename T>
T viaLongDoubleDivCeil(T num, T denom) {
  const long double quotient = static_cast<long double>(num) / denom;
  return static_cast<T>(ceill(quotient));
}
44
// Builds the pool of operands used by the benchmarks.  For every step in
// 1..1000 (stopping earlier if the step reaches T's maximum) it appends, in
// order: step, -step, max(T) / step and, when it is non-zero, min(T) / step.
// For unsigned T, -step wraps around and min(T) / step is always 0, so only
// three values are appended per step.  No appended value is ever 0.
template <typename T>
std::vector<T> divValues() {
  using Limits = std::numeric_limits<T>;
  constexpr T kLargest = Limits::max();
  constexpr T kSmallest = Limits::min();

  std::vector<T> values;
  T step = 1;
  while (step < kLargest && step <= 1000) {
    values.push_back(step);
    values.push_back(-step);
    values.push_back(kLargest / step);
    const auto scaledMin = kSmallest / step;
    if (scaledMin != 0) {
      values.push_back(scaledMin);
    }
    ++step;
  }
  return values;
}
59
60 template <typename T, typename F>
61 void runDivTests(const F& func, size_t iters) {
62   std::vector<T> denoms;
63   std::vector<T> numers;
64   BENCHMARK_SUSPEND {
65     denoms = divValues<T>();
66     numers = denoms;
67     numers.push_back(0);
68     std::mt19937 rnd(1234);
69     std::shuffle(denoms.begin(), denoms.end(), rnd);
70     std::shuffle(numers.begin(), numers.end(), rnd);
71   }
72   T dep = 0;
73   while (true) {
74     for (T d : denoms) {
75       for (T n : numers) {
76         n ^= dep;
77         if (std::is_signed<T>::value && n == std::numeric_limits<T>::min() &&
78             d == -1) {
79           // min / -1 overflows in two's complement
80           d = -2;
81         }
82         dep = func(n, d);
83
84         if (--iters == 0) {
85           folly::doNotOptimizeAway(dep);
86           return;
87         }
88       }
89     }
90   }
91 }
92 } // namespace
93
// Benchmarks for int8_t operands.  divTrunc, divFloor, divCeil and
// divRoundAway are registered with BENCHMARK and report absolute timings.
94 BENCHMARK_DRAW_LINE();
95 BENCHMARK(divTruncInt8, iters) {
96   runDivTests<int8_t>(&folly::divTrunc<int8_t, int8_t>, iters);
97 }
98 BENCHMARK(divFloorInt8, iters) {
99   runDivTests<int8_t>(&folly::divFloor<int8_t, int8_t>, iters);
100 }
101 BENCHMARK(divCeilInt8, iters) {
102   runDivTests<int8_t>(&folly::divCeil<int8_t, int8_t>, iters);
103 }
// Alternative ceiling-division implementations.  They are registered with
// BENCHMARK_RELATIVE, so their "relative" column in the results table is
// measured against divCeilInt8, the BENCHMARK immediately preceding them.
104 BENCHMARK_RELATIVE(branchlessDivCeilInt8, iters) {
105   runDivTests<int8_t>(&folly::detail::divCeilBranchless<int8_t>, iters);
106 }
107 BENCHMARK_RELATIVE(branchfulDivCeilInt8, iters) {
108   runDivTests<int8_t>(&folly::detail::divCeilBranchful<int8_t>, iters);
109 }
110 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt8, iters) {
111   runDivTests<int8_t>(&brokenButWidespreadDivCeil<int8_t>, iters);
112 }
113 BENCHMARK_RELATIVE(viaFloatDivCeilInt8, iters) {
114   runDivTests<int8_t>(&viaFloatDivCeil<int8_t>, iters);
115 }
116 BENCHMARK_RELATIVE(viaDoubleDivCeilInt8, iters) {
117   runDivTests<int8_t>(&viaDoubleDivCeil<int8_t>, iters);
118 }
119 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt8, iters) {
120   runDivTests<int8_t>(&viaLongDoubleDivCeil<int8_t>, iters);
121 }
122 BENCHMARK(divRoundAwayInt8, iters) {
123   runDivTests<int8_t>(&folly::divRoundAway<int8_t, int8_t>, iters);
124 }
125
// Benchmarks for int16_t operands.  divTrunc, divFloor, divCeil and
// divRoundAway are registered with BENCHMARK and report absolute timings.
126 BENCHMARK_DRAW_LINE();
127 BENCHMARK(divTruncInt16, iters) {
128   runDivTests<int16_t>(&folly::divTrunc<int16_t, int16_t>, iters);
129 }
130 BENCHMARK(divFloorInt16, iters) {
131   runDivTests<int16_t>(&folly::divFloor<int16_t, int16_t>, iters);
132 }
133 BENCHMARK(divCeilInt16, iters) {
134   runDivTests<int16_t>(&folly::divCeil<int16_t, int16_t>, iters);
135 }
// Alternative ceiling-division implementations.  They are registered with
// BENCHMARK_RELATIVE, so their "relative" column in the results table is
// measured against divCeilInt16, the BENCHMARK immediately preceding them.
136 BENCHMARK_RELATIVE(branchlessDivCeilInt16, iters) {
137   runDivTests<int16_t>(&folly::detail::divCeilBranchless<int16_t>, iters);
138 }
139 BENCHMARK_RELATIVE(branchfulDivCeilInt16, iters) {
140   runDivTests<int16_t>(&folly::detail::divCeilBranchful<int16_t>, iters);
141 }
142 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt16, iters) {
143   runDivTests<int16_t>(&brokenButWidespreadDivCeil<int16_t>, iters);
144 }
145 BENCHMARK_RELATIVE(viaFloatDivCeilInt16, iters) {
146   runDivTests<int16_t>(&viaFloatDivCeil<int16_t>, iters);
147 }
148 BENCHMARK_RELATIVE(viaDoubleDivCeilInt16, iters) {
149   runDivTests<int16_t>(&viaDoubleDivCeil<int16_t>, iters);
150 }
151 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt16, iters) {
152   runDivTests<int16_t>(&viaLongDoubleDivCeil<int16_t>, iters);
153 }
154 BENCHMARK(divRoundAwayInt16, iters) {
155   runDivTests<int16_t>(&folly::divRoundAway<int16_t, int16_t>, iters);
156 }
157
// Benchmarks for int32_t operands.  divTrunc, divFloor, divCeil and
// divRoundAway are registered with BENCHMARK and report absolute timings.
158 BENCHMARK_DRAW_LINE();
159 BENCHMARK(divTruncInt32, iters) {
160   runDivTests<int32_t>(&folly::divTrunc<int32_t, int32_t>, iters);
161 }
162 BENCHMARK(divFloorInt32, iters) {
163   runDivTests<int32_t>(&folly::divFloor<int32_t, int32_t>, iters);
164 }
165 BENCHMARK(divCeilInt32, iters) {
166   runDivTests<int32_t>(&folly::divCeil<int32_t, int32_t>, iters);
167 }
// Alternative ceiling-division implementations.  They are registered with
// BENCHMARK_RELATIVE, so their "relative" column in the results table is
// measured against divCeilInt32, the BENCHMARK immediately preceding them.
168 BENCHMARK_RELATIVE(branchlessDivCeilInt32, iters) {
169   runDivTests<int32_t>(&folly::detail::divCeilBranchless<int32_t>, iters);
170 }
171 BENCHMARK_RELATIVE(branchfulDivCeilInt32, iters) {
172   runDivTests<int32_t>(&folly::detail::divCeilBranchful<int32_t>, iters);
173 }
174 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt32, iters) {
175   runDivTests<int32_t>(&brokenButWidespreadDivCeil<int32_t>, iters);
176 }
// The float variant carries an "approx" prefix here: per the notes at the end
// of this file, float + ceil is only an approximation for the 32-bit case.
177 BENCHMARK_RELATIVE(approxViaFloatDivCeilInt32, iters) {
178   runDivTests<int32_t>(&viaFloatDivCeil<int32_t>, iters);
179 }
180 BENCHMARK_RELATIVE(viaDoubleDivCeilInt32, iters) {
181   runDivTests<int32_t>(&viaDoubleDivCeil<int32_t>, iters);
182 }
183 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt32, iters) {
184   runDivTests<int32_t>(&viaLongDoubleDivCeil<int32_t>, iters);
185 }
186 BENCHMARK(divRoundAwayInt32, iters) {
187   runDivTests<int32_t>(&folly::divRoundAway<int32_t, int32_t>, iters);
188 }
189
// Benchmarks for int64_t operands.  divTrunc, divFloor, divCeil and
// divRoundAway are registered with BENCHMARK and report absolute timings.
190 BENCHMARK_DRAW_LINE();
191 BENCHMARK(divTruncInt64, iters) {
192   runDivTests<int64_t>(&folly::divTrunc<int64_t, int64_t>, iters);
193 }
194 BENCHMARK(divFloorInt64, iters) {
195   runDivTests<int64_t>(&folly::divFloor<int64_t, int64_t>, iters);
196 }
197 BENCHMARK(divCeilInt64, iters) {
198   runDivTests<int64_t>(&folly::divCeil<int64_t, int64_t>, iters);
199 }
// Alternative ceiling-division implementations.  They are registered with
// BENCHMARK_RELATIVE, so their "relative" column in the results table is
// measured against divCeilInt64, the BENCHMARK immediately preceding them.
200 BENCHMARK_RELATIVE(branchlessDivCeilInt64, iters) {
201   runDivTests<int64_t>(&folly::detail::divCeilBranchless<int64_t>, iters);
202 }
203 BENCHMARK_RELATIVE(branchfulDivCeilInt64, iters) {
204   runDivTests<int64_t>(&folly::detail::divCeilBranchful<int64_t>, iters);
205 }
206 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt64, iters) {
207   runDivTests<int64_t>(&brokenButWidespreadDivCeil<int64_t>, iters);
208 }
// Both the float and the double variants carry an "approx" prefix here: the
// notes at the end of this file state that the double-based version does not
// have full accuracy for Int64 or Uint64.
209 BENCHMARK_RELATIVE(approxViaFloatDivCeilInt64, iters) {
210   runDivTests<int64_t>(&viaFloatDivCeil<int64_t>, iters);
211 }
212 BENCHMARK_RELATIVE(approxViaDoubleDivCeilInt64, iters) {
213   runDivTests<int64_t>(&viaDoubleDivCeil<int64_t>, iters);
214 }
215 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt64, iters) {
216   runDivTests<int64_t>(&viaLongDoubleDivCeil<int64_t>, iters);
217 }
218 BENCHMARK(divRoundAwayInt64, iters) {
219   runDivTests<int64_t>(&folly::divRoundAway<int64_t, int64_t>, iters);
220 }
221
// Benchmarks for uint8_t operands.  divTrunc, divFloor, divCeil and
// divRoundAway are registered with BENCHMARK and report absolute timings.
222 BENCHMARK_DRAW_LINE();
223 BENCHMARK(divTruncUint8, iters) {
224   runDivTests<uint8_t>(&folly::divTrunc<uint8_t, uint8_t>, iters);
225 }
226 BENCHMARK(divFloorUint8, iters) {
227   runDivTests<uint8_t>(&folly::divFloor<uint8_t, uint8_t>, iters);
228 }
229 BENCHMARK(divCeilUint8, iters) {
230   runDivTests<uint8_t>(&folly::divCeil<uint8_t, uint8_t>, iters);
231 }
// Alternative ceiling-division implementations.  They are registered with
// BENCHMARK_RELATIVE, so their "relative" column in the results table is
// measured against divCeilUint8, the BENCHMARK immediately preceding them.
232 BENCHMARK_RELATIVE(branchlessDivCeilUint8, iters) {
233   runDivTests<uint8_t>(&folly::detail::divCeilBranchless<uint8_t>, iters);
234 }
235 BENCHMARK_RELATIVE(branchfulDivCeilUint8, iters) {
236   runDivTests<uint8_t>(&folly::detail::divCeilBranchful<uint8_t>, iters);
237 }
238 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint8, iters) {
239   runDivTests<uint8_t>(&brokenButWidespreadDivCeil<uint8_t>, iters);
240 }
241 BENCHMARK_RELATIVE(viaFloatDivCeilUint8, iters) {
242   runDivTests<uint8_t>(&viaFloatDivCeil<uint8_t>, iters);
243 }
244 BENCHMARK_RELATIVE(viaDoubleDivCeilUint8, iters) {
245   runDivTests<uint8_t>(&viaDoubleDivCeil<uint8_t>, iters);
246 }
247 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint8, iters) {
248   runDivTests<uint8_t>(&viaLongDoubleDivCeil<uint8_t>, iters);
249 }
250 BENCHMARK(divRoundAwayUint8, iters) {
251   runDivTests<uint8_t>(&folly::divRoundAway<uint8_t, uint8_t>, iters);
252 }
253
// Benchmarks for uint16_t operands.  divTrunc, divFloor, divCeil and
// divRoundAway are registered with BENCHMARK and report absolute timings.
254 BENCHMARK_DRAW_LINE();
255 BENCHMARK(divTruncUint16, iters) {
256   runDivTests<uint16_t>(&folly::divTrunc<uint16_t, uint16_t>, iters);
257 }
258 BENCHMARK(divFloorUint16, iters) {
259   runDivTests<uint16_t>(&folly::divFloor<uint16_t, uint16_t>, iters);
260 }
261 BENCHMARK(divCeilUint16, iters) {
262   runDivTests<uint16_t>(&folly::divCeil<uint16_t, uint16_t>, iters);
263 }
// Alternative ceiling-division implementations.  They are registered with
// BENCHMARK_RELATIVE, so their "relative" column in the results table is
// measured against divCeilUint16, the BENCHMARK immediately preceding them.
264 BENCHMARK_RELATIVE(branchlessDivCeilUint16, iters) {
265   runDivTests<uint16_t>(&folly::detail::divCeilBranchless<uint16_t>, iters);
266 }
267 BENCHMARK_RELATIVE(branchfulDivCeilUint16, iters) {
268   runDivTests<uint16_t>(&folly::detail::divCeilBranchful<uint16_t>, iters);
269 }
270 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint16, iters) {
271   runDivTests<uint16_t>(&brokenButWidespreadDivCeil<uint16_t>, iters);
272 }
273 BENCHMARK_RELATIVE(viaFloatDivCeilUint16, iters) {
274   runDivTests<uint16_t>(&viaFloatDivCeil<uint16_t>, iters);
275 }
276 BENCHMARK_RELATIVE(viaDoubleDivCeilUint16, iters) {
277   runDivTests<uint16_t>(&viaDoubleDivCeil<uint16_t>, iters);
278 }
279 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint16, iters) {
280   runDivTests<uint16_t>(&viaLongDoubleDivCeil<uint16_t>, iters);
281 }
282 BENCHMARK(divRoundAwayUint16, iters) {
283   runDivTests<uint16_t>(&folly::divRoundAway<uint16_t, uint16_t>, iters);
284 }
285
// Benchmarks for uint32_t operands.  divTrunc, divFloor, divCeil and
// divRoundAway are registered with BENCHMARK and report absolute timings.
286 BENCHMARK_DRAW_LINE();
287 BENCHMARK(divTruncUint32, iters) {
288   runDivTests<uint32_t>(&folly::divTrunc<uint32_t, uint32_t>, iters);
289 }
290 BENCHMARK(divFloorUint32, iters) {
291   runDivTests<uint32_t>(&folly::divFloor<uint32_t, uint32_t>, iters);
292 }
293 BENCHMARK(divCeilUint32, iters) {
294   runDivTests<uint32_t>(&folly::divCeil<uint32_t, uint32_t>, iters);
295 }
// Alternative ceiling-division implementations.  They are registered with
// BENCHMARK_RELATIVE, so their "relative" column in the results table is
// measured against divCeilUint32, the BENCHMARK immediately preceding them.
296 BENCHMARK_RELATIVE(branchlessDivCeilUint32, iters) {
297   runDivTests<uint32_t>(&folly::detail::divCeilBranchless<uint32_t>, iters);
298 }
299 BENCHMARK_RELATIVE(branchfulDivCeilUint32, iters) {
300   runDivTests<uint32_t>(&folly::detail::divCeilBranchful<uint32_t>, iters);
301 }
302 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint32, iters) {
303   runDivTests<uint32_t>(&brokenButWidespreadDivCeil<uint32_t>, iters);
304 }
// The float variant carries an "approx" prefix here: per the notes at the end
// of this file, float + ceil is only an approximation for the 32-bit case.
305 BENCHMARK_RELATIVE(approxViaFloatDivCeilUint32, iters) {
306   runDivTests<uint32_t>(&viaFloatDivCeil<uint32_t>, iters);
307 }
308 BENCHMARK_RELATIVE(viaDoubleDivCeilUint32, iters) {
309   runDivTests<uint32_t>(&viaDoubleDivCeil<uint32_t>, iters);
310 }
311 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint32, iters) {
312   runDivTests<uint32_t>(&viaLongDoubleDivCeil<uint32_t>, iters);
313 }
314 BENCHMARK(divRoundAwayUint32, iters) {
315   runDivTests<uint32_t>(&folly::divRoundAway<uint32_t, uint32_t>, iters);
316 }
317
// Benchmarks for uint64_t operands.  divTrunc, divFloor, divCeil and
// divRoundAway are registered with BENCHMARK and report absolute timings.
318 BENCHMARK_DRAW_LINE();
319 BENCHMARK(divTruncUint64, iters) {
320   runDivTests<uint64_t>(&folly::divTrunc<uint64_t, uint64_t>, iters);
321 }
322 BENCHMARK(divFloorUint64, iters) {
323   runDivTests<uint64_t>(&folly::divFloor<uint64_t, uint64_t>, iters);
324 }
325 BENCHMARK(divCeilUint64, iters) {
326   runDivTests<uint64_t>(&folly::divCeil<uint64_t, uint64_t>, iters);
327 }
// Alternative ceiling-division implementations.  They are registered with
// BENCHMARK_RELATIVE, so their "relative" column in the results table is
// measured against divCeilUint64, the BENCHMARK immediately preceding them.
328 BENCHMARK_RELATIVE(branchlessDivCeilUint64, iters) {
329   runDivTests<uint64_t>(&folly::detail::divCeilBranchless<uint64_t>, iters);
330 }
331 BENCHMARK_RELATIVE(branchfulDivCeilUint64, iters) {
332   runDivTests<uint64_t>(&folly::detail::divCeilBranchful<uint64_t>, iters);
333 }
334 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint64, iters) {
335   runDivTests<uint64_t>(&brokenButWidespreadDivCeil<uint64_t>, iters);
336 }
// Both the float and the double variants carry an "approx" prefix here: the
// notes at the end of this file state that the double-based version does not
// have full accuracy for Int64 or Uint64.
337 BENCHMARK_RELATIVE(approxViaFloatDivCeilUint64, iters) {
338   runDivTests<uint64_t>(&viaFloatDivCeil<uint64_t>, iters);
339 }
340 BENCHMARK_RELATIVE(approxViaDoubleDivCeilUint64, iters) {
341   runDivTests<uint64_t>(&viaDoubleDivCeil<uint64_t>, iters);
342 }
343 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint64, iters) {
344   runDivTests<uint64_t>(&viaLongDoubleDivCeil<uint64_t>, iters);
345 }
346 BENCHMARK(divRoundAwayUint64, iters) {
347   runDivTests<uint64_t>(&folly::divRoundAway<uint64_t, uint64_t>, iters);
348 }
349
350 int main(int argc, char** argv) {
351   gflags::ParseCommandLineFlags(&argc, &argv, true);
352   folly::runBenchmarks();
353   return 0;
354 }
355
356 /*
357 Benchmarks run single-threaded on a dual Xeon E5-2660 @ 2.2 GHz with
358 hyperthreading (16 physical cores, 20 MB cache per socket, 256 GB RAM)
359
360 Benchmarks used --bm_min_iters=10000000.
361
362 divTrunc is just a native integral division.  viaDoubleDivCeil doesn't
363 have full accuracy for Int64 or Uint64.  There is a loop-carried
364 dependency for all of the div* tests, but there is a bit of extra slack
365 (a predictable call, a load that should be from the L1, and a predictable
366 not-taken branch in addition to the loop's branch) in the driving loop,
367 so the benchmark driver's attempt to subtract the overhead of the loop
368 might mean that the latency numbers here are slightly too low or too high.
369
370 The branchful implementation's branch is very predictable in this
371 microbenchmark for unsigned types, since it only needs to predict a
372 zero numerator.  That's likely to be true in real life as well, so we
373 make this the default.
374
375 I was surprised at the speed of float and double division, but
376 the only case where it actually wins by much and is correct is for
377 int16_t.  (float + ceil is faster for the 32-bit case, but is only
378 an approximation.)  I ran a similar benchmark setup for ARM and ARM64.
379 On ARM the conditional versions win by quite a bit.  32-bit ARM doesn't
380 have a native integer divide, so getting the remainder after a division
381 (to see if truncation occurred) is more work than preconditioning the
382 numerator to make truncation go in the correct direction.  64-bit ARM
383 had the same winners and losers as x86_64, at least on the two physical
384 instances I tested.
385
386 ============================================================================
387 folly/test/MathBenchmark.cpp                    relative  time/iter  iters/s
388 ============================================================================
389 ----------------------------------------------------------------------------
390 divTruncInt8                                                 8.89ns  112.44M
391 divFloorInt8                                                10.99ns   91.00M
392 divCeilInt8                                                 10.95ns   91.33M
393 branchlessDivCeilInt8                            100.40%    10.91ns   91.69M
394 branchfulDivCeilInt8                              88.87%    12.32ns   81.16M
395 brokenButWidespreadDivCeilInt8                   109.20%    10.03ns   99.73M
396 viaFloatDivCeilInt8                              109.68%     9.98ns  100.17M
397 viaDoubleDivCeilInt8                              95.47%    11.47ns   87.19M
398 viaLongDoubleDivCeilInt8                          31.65%    34.59ns   28.91M
399 divRoundAwayInt8                                            10.42ns   95.97M
400 ----------------------------------------------------------------------------
401 divTruncInt16                                                8.68ns  115.17M
402 divFloorInt16                                               10.94ns   91.38M
403 divCeilInt16                                                10.91ns   91.70M
404 branchlessDivCeilInt16                            99.44%    10.97ns   91.18M
405 branchfulDivCeilInt16                             81.68%    13.35ns   74.90M
406 brokenButWidespreadDivCeilInt16                  109.50%     9.96ns  100.40M
407 viaFloatDivCeilInt16                             108.04%    10.09ns   99.07M
408 viaDoubleDivCeilInt16                             85.38%    12.77ns   78.29M
409 viaLongDoubleDivCeilInt16                         29.99%    36.36ns   27.50M
410 divRoundAwayInt16                                           10.59ns   94.46M
411 ----------------------------------------------------------------------------
412 divTruncInt32                                                8.38ns  119.29M
413 divFloorInt32                                               11.01ns   90.84M
414 divCeilInt32                                                11.12ns   89.91M
415 branchlessDivCeilInt32                           101.94%    10.91ns   91.66M
416 branchfulDivCeilInt32                             84.67%    13.14ns   76.12M
417 brokenButWidespreadDivCeilInt32                  117.61%     9.46ns  105.75M
418 approxViaFloatDivCeilInt32                       115.98%     9.59ns  104.28M
419 viaDoubleDivCeilInt32                             89.86%    12.38ns   80.79M
420 viaLongDoubleDivCeilInt32                         30.84%    36.06ns   27.73M
421 divRoundAwayInt32                                           11.30ns   88.50M
422 ----------------------------------------------------------------------------
423 divTruncInt64                                               16.07ns   62.21M
424 divFloorInt64                                               18.37ns   54.45M
425 divCeilInt64                                                18.61ns   53.74M
426 branchlessDivCeilInt64                           100.43%    18.53ns   53.97M
427 branchfulDivCeilInt64                             84.65%    21.98ns   45.49M
428 brokenButWidespreadDivCeilInt64                  108.47%    17.16ns   58.29M
429 approxViaFloatDivCeilInt64                       190.99%     9.74ns  102.64M
430 approxViaDoubleDivCeilInt64                      148.64%    12.52ns   79.88M
431 viaLongDoubleDivCeilInt64                         52.01%    35.77ns   27.95M
432 divRoundAwayInt64                                           18.79ns   53.21M
433 ----------------------------------------------------------------------------
434 divTruncUint8                                                7.76ns  128.89M
435 divFloorUint8                                                8.29ns  120.61M
436 divCeilUint8                                                 9.61ns  104.09M
437 branchlessDivCeilUint8                           112.00%     8.58ns  116.58M
438 branchfulDivCeilUint8                            114.01%     8.43ns  118.67M
439 brokenButWidespreadDivCeilUint8                  100.48%     9.56ns  104.58M
440 viaFloatDivCeilUint8                             103.53%     9.28ns  107.76M
441 viaDoubleDivCeilUint8                             85.75%    11.20ns   89.26M
442 viaLongDoubleDivCeilUint8                         27.72%    34.65ns   28.86M
443 divRoundAwayUint8                                            9.60ns  104.11M
444 ----------------------------------------------------------------------------
445 divTruncUint16                                               8.39ns  119.19M
446 divFloorUint16                                               8.28ns  120.82M
447 divCeilUint16                                                9.90ns  100.96M
448 branchlessDivCeilUint16                          100.23%     9.88ns  101.19M
449 branchfulDivCeilUint16                           107.83%     9.19ns  108.87M
450 brokenButWidespreadDivCeilUint16                  99.89%     9.92ns  100.85M
451 viaFloatDivCeilUint16                            100.54%     9.85ns  101.50M
452 viaDoubleDivCeilUint16                            77.38%    12.80ns   78.13M
453 viaLongDoubleDivCeilUint16                        27.30%    36.28ns   27.56M
454 divRoundAwayUint16                                           9.82ns  101.85M
455 ----------------------------------------------------------------------------
456 divTruncUint32                                               8.12ns  123.20M
457 divFloorUint32                                               8.09ns  123.58M
458 divCeilUint32                                                8.44ns  118.55M
459 branchlessDivCeilUint32                           88.27%     9.56ns  104.64M
460 branchfulDivCeilUint32                            98.91%     8.53ns  117.25M
461 brokenButWidespreadDivCeilUint32                  93.48%     9.02ns  110.82M
462 approxViaFloatDivCeilUint32                       86.29%     9.78ns  102.30M
463 viaDoubleDivCeilUint32                            66.76%    12.63ns   79.15M
464 viaLongDoubleDivCeilUint32                        23.35%    36.13ns   27.68M
465 divRoundAwayUint32                                           8.47ns  118.03M
466 ----------------------------------------------------------------------------
467 divTruncUint64                                              12.38ns   80.79M
468 divFloorUint64                                              12.27ns   81.47M
469 divCeilUint64                                               12.66ns   78.99M
470 branchlessDivCeilUint64                           93.46%    13.55ns   73.83M
471 branchfulDivCeilUint64                           100.30%    12.62ns   79.23M
472 brokenButWidespreadDivCeilUint64                  99.41%    12.73ns   78.53M
473 approxViaFloatDivCeilUint64                      106.59%    11.88ns   84.19M
474 approxViaDoubleDivCeilUint64                      92.14%    13.74ns   72.78M
475 viaLongDoubleDivCeilUint64                        33.51%    37.78ns   26.47M
476 divRoundAwayUint64                                          12.34ns   81.02M
477 ============================================================================
478 */