integer division with controlled rounding in Math.h
[folly.git] / folly / test / MathBenchmark.cpp
1 /*
2  * Copyright 2016 Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <folly/Math.h>
18
19 #include <algorithm>
20
21 #include <folly/Benchmark.h>
22
23 namespace {
24 template <typename T>
25 T brokenButWidespreadDivCeil(T num, T denom) {
26   return (num + denom - 1) / denom;
27 }
28
29 template <typename T>
30 T viaFloatDivCeil(T num, T denom) {
31   return static_cast<T>(ceilf(static_cast<float>(num) / denom));
32 }
33
34 template <typename T>
35 T viaDoubleDivCeil(T num, T denom) {
36   return static_cast<T>(ceil(static_cast<double>(num) / denom));
37 }
38
39 template <typename T>
40 T viaLongDoubleDivCeil(T num, T denom) {
41   return static_cast<T>(ceill(static_cast<long double>(num) / denom));
42 }
43
44 template <typename T>
45 std::vector<T> divValues() {
46   std::vector<T> rv;
47   for (T i = 1; i < std::numeric_limits<T>::max() && i <= 1000; ++i) {
48     rv.push_back(i);
49     rv.push_back(-i);
50     rv.push_back(std::numeric_limits<T>::max() / i);
51     auto x = std::numeric_limits<T>::min() / i;
52     if (x != 0) {
53       rv.push_back(x);
54     }
55   }
56   return rv;
57 }
58
59 template <typename T, typename F>
60 void runDivTests(const F& func, size_t iters) {
61   std::vector<T> denoms;
62   std::vector<T> numers;
63   BENCHMARK_SUSPEND {
64     denoms = divValues<T>();
65     numers = denoms;
66     numers.push_back(0);
67     std::mt19937 rnd(1234);
68     std::shuffle(denoms.begin(), denoms.end(), rnd);
69     std::shuffle(numers.begin(), numers.end(), rnd);
70   }
71   T dep = 0;
72   while (true) {
73     for (T d : denoms) {
74       for (T n : numers) {
75         n ^= dep;
76         if (std::is_signed<T>::value && n == std::numeric_limits<T>::min() &&
77             d == -1) {
78           // min / -1 overflows in two's complement
79           d = -2;
80         }
81         dep = func(n, d);
82
83         if (--iters == 0) {
84           folly::doNotOptimizeAway(dep);
85           return;
86         }
87       }
88     }
89   }
90 }
91 }
92
93 BENCHMARK_DRAW_LINE();
94 BENCHMARK(divTruncInt8, iters) {
95   runDivTests<int8_t>(&folly::divTrunc<int8_t, int8_t>, iters);
96 }
97 BENCHMARK(divFloorInt8, iters) {
98   runDivTests<int8_t>(&folly::divFloor<int8_t, int8_t>, iters);
99 }
100 BENCHMARK(divCeilInt8, iters) {
101   runDivTests<int8_t>(&folly::divCeil<int8_t, int8_t>, iters);
102 }
103 BENCHMARK_RELATIVE(branchlessDivCeilInt8, iters) {
104   runDivTests<int8_t>(&folly::detail::divCeilBranchless<int8_t>, iters);
105 }
106 BENCHMARK_RELATIVE(branchfulDivCeilInt8, iters) {
107   runDivTests<int8_t>(&folly::detail::divCeilBranchful<int8_t>, iters);
108 }
109 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt8, iters) {
110   runDivTests<int8_t>(&brokenButWidespreadDivCeil<int8_t>, iters);
111 }
112 BENCHMARK_RELATIVE(viaFloatDivCeilInt8, iters) {
113   runDivTests<int8_t>(&viaFloatDivCeil<int8_t>, iters);
114 }
115 BENCHMARK_RELATIVE(viaDoubleDivCeilInt8, iters) {
116   runDivTests<int8_t>(&viaDoubleDivCeil<int8_t>, iters);
117 }
118 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt8, iters) {
119   runDivTests<int8_t>(&viaLongDoubleDivCeil<int8_t>, iters);
120 }
121 BENCHMARK(divRoundAwayInt8, iters) {
122   runDivTests<int8_t>(&folly::divRoundAway<int8_t, int8_t>, iters);
123 }
124
125 BENCHMARK_DRAW_LINE();
126 BENCHMARK(divTruncInt16, iters) {
127   runDivTests<int16_t>(&folly::divTrunc<int16_t, int16_t>, iters);
128 }
129 BENCHMARK(divFloorInt16, iters) {
130   runDivTests<int16_t>(&folly::divFloor<int16_t, int16_t>, iters);
131 }
132 BENCHMARK(divCeilInt16, iters) {
133   runDivTests<int16_t>(&folly::divCeil<int16_t, int16_t>, iters);
134 }
135 BENCHMARK_RELATIVE(branchlessDivCeilInt16, iters) {
136   runDivTests<int16_t>(&folly::detail::divCeilBranchless<int16_t>, iters);
137 }
138 BENCHMARK_RELATIVE(branchfulDivCeilInt16, iters) {
139   runDivTests<int16_t>(&folly::detail::divCeilBranchful<int16_t>, iters);
140 }
141 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt16, iters) {
142   runDivTests<int16_t>(&brokenButWidespreadDivCeil<int16_t>, iters);
143 }
144 BENCHMARK_RELATIVE(viaFloatDivCeilInt16, iters) {
145   runDivTests<int16_t>(&viaFloatDivCeil<int16_t>, iters);
146 }
147 BENCHMARK_RELATIVE(viaDoubleDivCeilInt16, iters) {
148   runDivTests<int16_t>(&viaDoubleDivCeil<int16_t>, iters);
149 }
150 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt16, iters) {
151   runDivTests<int16_t>(&viaLongDoubleDivCeil<int16_t>, iters);
152 }
153 BENCHMARK(divRoundAwayInt16, iters) {
154   runDivTests<int16_t>(&folly::divRoundAway<int16_t, int16_t>, iters);
155 }
156
157 BENCHMARK_DRAW_LINE();
158 BENCHMARK(divTruncInt32, iters) {
159   runDivTests<int32_t>(&folly::divTrunc<int32_t, int32_t>, iters);
160 }
161 BENCHMARK(divFloorInt32, iters) {
162   runDivTests<int32_t>(&folly::divFloor<int32_t, int32_t>, iters);
163 }
164 BENCHMARK(divCeilInt32, iters) {
165   runDivTests<int32_t>(&folly::divCeil<int32_t, int32_t>, iters);
166 }
167 BENCHMARK_RELATIVE(branchlessDivCeilInt32, iters) {
168   runDivTests<int32_t>(&folly::detail::divCeilBranchless<int32_t>, iters);
169 }
170 BENCHMARK_RELATIVE(branchfulDivCeilInt32, iters) {
171   runDivTests<int32_t>(&folly::detail::divCeilBranchful<int32_t>, iters);
172 }
173 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt32, iters) {
174   runDivTests<int32_t>(&brokenButWidespreadDivCeil<int32_t>, iters);
175 }
176 BENCHMARK_RELATIVE(approxViaFloatDivCeilInt32, iters) {
177   runDivTests<int32_t>(&viaFloatDivCeil<int32_t>, iters);
178 }
179 BENCHMARK_RELATIVE(viaDoubleDivCeilInt32, iters) {
180   runDivTests<int32_t>(&viaDoubleDivCeil<int32_t>, iters);
181 }
182 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt32, iters) {
183   runDivTests<int32_t>(&viaLongDoubleDivCeil<int32_t>, iters);
184 }
185 BENCHMARK(divRoundAwayInt32, iters) {
186   runDivTests<int32_t>(&folly::divRoundAway<int32_t, int32_t>, iters);
187 }
188
189 BENCHMARK_DRAW_LINE();
190 BENCHMARK(divTruncInt64, iters) {
191   runDivTests<int64_t>(&folly::divTrunc<int64_t, int64_t>, iters);
192 }
193 BENCHMARK(divFloorInt64, iters) {
194   runDivTests<int64_t>(&folly::divFloor<int64_t, int64_t>, iters);
195 }
196 BENCHMARK(divCeilInt64, iters) {
197   runDivTests<int64_t>(&folly::divCeil<int64_t, int64_t>, iters);
198 }
199 BENCHMARK_RELATIVE(branchlessDivCeilInt64, iters) {
200   runDivTests<int64_t>(&folly::detail::divCeilBranchless<int64_t>, iters);
201 }
202 BENCHMARK_RELATIVE(branchfulDivCeilInt64, iters) {
203   runDivTests<int64_t>(&folly::detail::divCeilBranchful<int64_t>, iters);
204 }
205 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt64, iters) {
206   runDivTests<int64_t>(&brokenButWidespreadDivCeil<int64_t>, iters);
207 }
208 BENCHMARK_RELATIVE(approxViaFloatDivCeilInt64, iters) {
209   runDivTests<int64_t>(&viaFloatDivCeil<int64_t>, iters);
210 }
211 BENCHMARK_RELATIVE(approxViaDoubleDivCeilInt64, iters) {
212   runDivTests<int64_t>(&viaDoubleDivCeil<int64_t>, iters);
213 }
214 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt64, iters) {
215   runDivTests<int64_t>(&viaLongDoubleDivCeil<int64_t>, iters);
216 }
217 BENCHMARK(divRoundAwayInt64, iters) {
218   runDivTests<int64_t>(&folly::divRoundAway<int64_t, int64_t>, iters);
219 }
220
221 BENCHMARK_DRAW_LINE();
222 BENCHMARK(divTruncUint8, iters) {
223   runDivTests<uint8_t>(&folly::divTrunc<uint8_t, uint8_t>, iters);
224 }
225 BENCHMARK(divFloorUint8, iters) {
226   runDivTests<uint8_t>(&folly::divFloor<uint8_t, uint8_t>, iters);
227 }
228 BENCHMARK(divCeilUint8, iters) {
229   runDivTests<uint8_t>(&folly::divCeil<uint8_t, uint8_t>, iters);
230 }
231 BENCHMARK_RELATIVE(branchlessDivCeilUint8, iters) {
232   runDivTests<uint8_t>(&folly::detail::divCeilBranchless<uint8_t>, iters);
233 }
234 BENCHMARK_RELATIVE(branchfulDivCeilUint8, iters) {
235   runDivTests<uint8_t>(&folly::detail::divCeilBranchful<uint8_t>, iters);
236 }
237 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint8, iters) {
238   runDivTests<uint8_t>(&brokenButWidespreadDivCeil<uint8_t>, iters);
239 }
240 BENCHMARK_RELATIVE(viaFloatDivCeilUint8, iters) {
241   runDivTests<uint8_t>(&viaFloatDivCeil<uint8_t>, iters);
242 }
243 BENCHMARK_RELATIVE(viaDoubleDivCeilUint8, iters) {
244   runDivTests<uint8_t>(&viaDoubleDivCeil<uint8_t>, iters);
245 }
246 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint8, iters) {
247   runDivTests<uint8_t>(&viaLongDoubleDivCeil<uint8_t>, iters);
248 }
249 BENCHMARK(divRoundAwayUint8, iters) {
250   runDivTests<uint8_t>(&folly::divRoundAway<uint8_t, uint8_t>, iters);
251 }
252
253 BENCHMARK_DRAW_LINE();
254 BENCHMARK(divTruncUint16, iters) {
255   runDivTests<uint16_t>(&folly::divTrunc<uint16_t, uint16_t>, iters);
256 }
257 BENCHMARK(divFloorUint16, iters) {
258   runDivTests<uint16_t>(&folly::divFloor<uint16_t, uint16_t>, iters);
259 }
260 BENCHMARK(divCeilUint16, iters) {
261   runDivTests<uint16_t>(&folly::divCeil<uint16_t, uint16_t>, iters);
262 }
263 BENCHMARK_RELATIVE(branchlessDivCeilUint16, iters) {
264   runDivTests<uint16_t>(&folly::detail::divCeilBranchless<uint16_t>, iters);
265 }
266 BENCHMARK_RELATIVE(branchfulDivCeilUint16, iters) {
267   runDivTests<uint16_t>(&folly::detail::divCeilBranchful<uint16_t>, iters);
268 }
269 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint16, iters) {
270   runDivTests<uint16_t>(&brokenButWidespreadDivCeil<uint16_t>, iters);
271 }
272 BENCHMARK_RELATIVE(viaFloatDivCeilUint16, iters) {
273   runDivTests<uint16_t>(&viaFloatDivCeil<uint16_t>, iters);
274 }
275 BENCHMARK_RELATIVE(viaDoubleDivCeilUint16, iters) {
276   runDivTests<uint16_t>(&viaDoubleDivCeil<uint16_t>, iters);
277 }
278 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint16, iters) {
279   runDivTests<uint16_t>(&viaLongDoubleDivCeil<uint16_t>, iters);
280 }
281 BENCHMARK(divRoundAwayUint16, iters) {
282   runDivTests<uint16_t>(&folly::divRoundAway<uint16_t, uint16_t>, iters);
283 }
284
285 BENCHMARK_DRAW_LINE();
286 BENCHMARK(divTruncUint32, iters) {
287   runDivTests<uint32_t>(&folly::divTrunc<uint32_t, uint32_t>, iters);
288 }
289 BENCHMARK(divFloorUint32, iters) {
290   runDivTests<uint32_t>(&folly::divFloor<uint32_t, uint32_t>, iters);
291 }
292 BENCHMARK(divCeilUint32, iters) {
293   runDivTests<uint32_t>(&folly::divCeil<uint32_t, uint32_t>, iters);
294 }
295 BENCHMARK_RELATIVE(branchlessDivCeilUint32, iters) {
296   runDivTests<uint32_t>(&folly::detail::divCeilBranchless<uint32_t>, iters);
297 }
298 BENCHMARK_RELATIVE(branchfulDivCeilUint32, iters) {
299   runDivTests<uint32_t>(&folly::detail::divCeilBranchful<uint32_t>, iters);
300 }
301 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint32, iters) {
302   runDivTests<uint32_t>(&brokenButWidespreadDivCeil<uint32_t>, iters);
303 }
304 BENCHMARK_RELATIVE(approxViaFloatDivCeilUint32, iters) {
305   runDivTests<uint32_t>(&viaFloatDivCeil<uint32_t>, iters);
306 }
307 BENCHMARK_RELATIVE(viaDoubleDivCeilUint32, iters) {
308   runDivTests<uint32_t>(&viaDoubleDivCeil<uint32_t>, iters);
309 }
310 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint32, iters) {
311   runDivTests<uint32_t>(&viaLongDoubleDivCeil<uint32_t>, iters);
312 }
313 BENCHMARK(divRoundAwayUint32, iters) {
314   runDivTests<uint32_t>(&folly::divRoundAway<uint32_t, uint32_t>, iters);
315 }
316
317 BENCHMARK_DRAW_LINE();
318 BENCHMARK(divTruncUint64, iters) {
319   runDivTests<uint64_t>(&folly::divTrunc<uint64_t, uint64_t>, iters);
320 }
321 BENCHMARK(divFloorUint64, iters) {
322   runDivTests<uint64_t>(&folly::divFloor<uint64_t, uint64_t>, iters);
323 }
324 BENCHMARK(divCeilUint64, iters) {
325   runDivTests<uint64_t>(&folly::divCeil<uint64_t, uint64_t>, iters);
326 }
327 BENCHMARK_RELATIVE(branchlessDivCeilUint64, iters) {
328   runDivTests<uint64_t>(&folly::detail::divCeilBranchless<uint64_t>, iters);
329 }
330 BENCHMARK_RELATIVE(branchfulDivCeilUint64, iters) {
331   runDivTests<uint64_t>(&folly::detail::divCeilBranchful<uint64_t>, iters);
332 }
333 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint64, iters) {
334   runDivTests<uint64_t>(&brokenButWidespreadDivCeil<uint64_t>, iters);
335 }
336 BENCHMARK_RELATIVE(approxViaFloatDivCeilUint64, iters) {
337   runDivTests<uint64_t>(&viaFloatDivCeil<uint64_t>, iters);
338 }
339 BENCHMARK_RELATIVE(approxViaDoubleDivCeilUint64, iters) {
340   runDivTests<uint64_t>(&viaDoubleDivCeil<uint64_t>, iters);
341 }
342 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint64, iters) {
343   runDivTests<uint64_t>(&viaLongDoubleDivCeil<uint64_t>, iters);
344 }
345 BENCHMARK(divRoundAwayUint64, iters) {
346   runDivTests<uint64_t>(&folly::divRoundAway<uint64_t, uint64_t>, iters);
347 }
348
349 int main(int argc, char** argv) {
350   gflags::ParseCommandLineFlags(&argc, &argv, true);
351   folly::runBenchmarks();
352   return 0;
353 }
354
355 /*
356 Benchmarks run single-threaded on a dual Xeon E5-2660 @ 2.2 Ghz with
357 hyperthreading (16 physical cores, 20 MB cache per socket, 256 GB RAM)
358
359 Benchmarks used --bm_min_iters=10000000.
360
361 divTrunc is just a native integral division.  viaDoubleViaCeil doesn't
362 have full accuracy for Int64 or Uint64.  There is a loop-carried
363 dependency for all of the div* tests, but there is a bit of extra slack
364 (a predictable call, a load that should be from the L1, and a predictable
365 not-taken branch in addition to the loop's branch) in the driving loop,
366 so the benchmark driver's attempt to subtract the overhead of the loop
367 might mean that the latency numbers here are slightly too low or too high.
368
369 The branchful implementation's branch is very predictable in this
370 microbenchmark for unsigned types, since it only needs to predict a
371 zero numerator.  That's likely to be true in real life as well, so we
372 make this the default.
373
374 I was surprised at the speed of float and double division, but
375 the only case where it actually wins by much and is correct is for
376 int16_t.  (float + ceil is faster for the 32-bit case, but is only
377 an approximation.)  I ran a similar benchmark setup for ARM and ARM64.
378 On ARM the conditional versions win by quite a bit.  32-bit ARM doesn't
379 have a native integer divide, so getting the remainder after a division
380 (to see if truncation occurred) is more work than preconditioning the
381 numerator to make truncation go in the correct direction.  64-bit ARM
382 had the same winners and losers as x86_64, at least on the two physical
383 instances I tested.
384
385 ============================================================================
386 folly/test/MathBenchmark.cpp                    relative  time/iter  iters/s
387 ============================================================================
388 ----------------------------------------------------------------------------
389 divTruncInt8                                                 8.89ns  112.44M
390 divFloorInt8                                                10.99ns   91.00M
391 divCeilInt8                                                 10.95ns   91.33M
392 branchlessDivCeilInt8                            100.40%    10.91ns   91.69M
393 branchfulDivCeilInt8                              88.87%    12.32ns   81.16M
394 brokenButWidespreadDivCeilInt8                   109.20%    10.03ns   99.73M
395 viaFloatDivCeilInt8                              109.68%     9.98ns  100.17M
396 viaDoubleDivCeilInt8                              95.47%    11.47ns   87.19M
397 viaLongDoubleDivCeilInt8                          31.65%    34.59ns   28.91M
398 divRoundAwayInt8                                            10.42ns   95.97M
399 ----------------------------------------------------------------------------
400 divTruncInt16                                                8.68ns  115.17M
401 divFloorInt16                                               10.94ns   91.38M
402 divCeilInt16                                                10.91ns   91.70M
403 branchlessDivCeilInt16                            99.44%    10.97ns   91.18M
404 branchfulDivCeilInt16                             81.68%    13.35ns   74.90M
405 brokenButWidespreadDivCeilInt16                  109.50%     9.96ns  100.40M
406 viaFloatDivCeilInt16                             108.04%    10.09ns   99.07M
407 viaDoubleDivCeilInt16                             85.38%    12.77ns   78.29M
408 viaLongDoubleDivCeilInt16                         29.99%    36.36ns   27.50M
409 divRoundAwayInt16                                           10.59ns   94.46M
410 ----------------------------------------------------------------------------
411 divTruncInt32                                                8.38ns  119.29M
412 divFloorInt32                                               11.01ns   90.84M
413 divCeilInt32                                                11.12ns   89.91M
414 branchlessDivCeilInt32                           101.94%    10.91ns   91.66M
415 branchfulDivCeilInt32                             84.67%    13.14ns   76.12M
416 brokenButWidespreadDivCeilInt32                  117.61%     9.46ns  105.75M
417 approxViaFloatDivCeilInt32                       115.98%     9.59ns  104.28M
418 viaDoubleDivCeilInt32                             89.86%    12.38ns   80.79M
419 viaLongDoubleDivCeilInt32                         30.84%    36.06ns   27.73M
420 divRoundAwayInt32                                           11.30ns   88.50M
421 ----------------------------------------------------------------------------
422 divTruncInt64                                               16.07ns   62.21M
423 divFloorInt64                                               18.37ns   54.45M
424 divCeilInt64                                                18.61ns   53.74M
425 branchlessDivCeilInt64                           100.43%    18.53ns   53.97M
426 branchfulDivCeilInt64                             84.65%    21.98ns   45.49M
427 brokenButWidespreadDivCeilInt64                  108.47%    17.16ns   58.29M
428 approxViaFloatDivCeilInt64                       190.99%     9.74ns  102.64M
429 approxViaDoubleDivCeilInt64                      148.64%    12.52ns   79.88M
430 viaLongDoubleDivCeilInt64                         52.01%    35.77ns   27.95M
431 divRoundAwayInt64                                           18.79ns   53.21M
432 ----------------------------------------------------------------------------
433 divTruncUint8                                                7.76ns  128.89M
434 divFloorUint8                                                8.29ns  120.61M
435 divCeilUint8                                                 9.61ns  104.09M
436 branchlessDivCeilUint8                           112.00%     8.58ns  116.58M
437 branchfulDivCeilUint8                            114.01%     8.43ns  118.67M
438 brokenButWidespreadDivCeilUint8                  100.48%     9.56ns  104.58M
439 viaFloatDivCeilUint8                             103.53%     9.28ns  107.76M
440 viaDoubleDivCeilUint8                             85.75%    11.20ns   89.26M
441 viaLongDoubleDivCeilUint8                         27.72%    34.65ns   28.86M
442 divRoundAwayUint8                                            9.60ns  104.11M
443 ----------------------------------------------------------------------------
444 divTruncUint16                                               8.39ns  119.19M
445 divFloorUint16                                               8.28ns  120.82M
446 divCeilUint16                                                9.90ns  100.96M
447 branchlessDivCeilUint16                          100.23%     9.88ns  101.19M
448 branchfulDivCeilUint16                           107.83%     9.19ns  108.87M
449 brokenButWidespreadDivCeilUint16                  99.89%     9.92ns  100.85M
450 viaFloatDivCeilUint16                            100.54%     9.85ns  101.50M
451 viaDoubleDivCeilUint16                            77.38%    12.80ns   78.13M
452 viaLongDoubleDivCeilUint16                        27.30%    36.28ns   27.56M
453 divRoundAwayUint16                                           9.82ns  101.85M
454 ----------------------------------------------------------------------------
455 divTruncUint32                                               8.12ns  123.20M
456 divFloorUint32                                               8.09ns  123.58M
457 divCeilUint32                                                8.44ns  118.55M
458 branchlessDivCeilUint32                           88.27%     9.56ns  104.64M
459 branchfulDivCeilUint32                            98.91%     8.53ns  117.25M
460 brokenButWidespreadDivCeilUint32                  93.48%     9.02ns  110.82M
461 approxViaFloatDivCeilUint32                       86.29%     9.78ns  102.30M
462 viaDoubleDivCeilUint32                            66.76%    12.63ns   79.15M
463 viaLongDoubleDivCeilUint32                        23.35%    36.13ns   27.68M
464 divRoundAwayUint32                                           8.47ns  118.03M
465 ----------------------------------------------------------------------------
466 divTruncUint64                                              12.38ns   80.79M
467 divFloorUint64                                              12.27ns   81.47M
468 divCeilUint64                                               12.66ns   78.99M
469 branchlessDivCeilUint64                           93.46%    13.55ns   73.83M
470 branchfulDivCeilUint64                           100.30%    12.62ns   79.23M
471 brokenButWidespreadDivCeilUint64                  99.41%    12.73ns   78.53M
472 approxViaFloatDivCeilUint64                      106.59%    11.88ns   84.19M
473 approxViaDoubleDivCeilUint64                      92.14%    13.74ns   72.78M
474 viaLongDoubleDivCeilUint64                        33.51%    37.78ns   26.47M
475 divRoundAwayUint64                                          12.34ns   81.02M
476 ============================================================================
477 */