#define _GNU_SOURCE 1 // for RTLD_NOLOAD
#include <dlfcn.h>
#include <fstream>
+#include <mutex>
#include <folly/Conv.h>
#include <folly/Exception.h>
try {
return CacheLocality::readFromSysfs();
} catch (...) {
- // keep trying
+ // fall through to below if something goes wrong
}
#endif
////////////// Getcpu
-/// Resolves the dynamically loaded symbol __vdso_getcpu, returning null
-/// on failure
-static Getcpu::Func loadVdsoGetcpu() {
- void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
- if (h == nullptr) {
- return nullptr;
- }
+#ifdef CLOCK_REALTIME_COARSE
- auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
- if (func == nullptr) {
- // technically a null result could either be a failure or a successful
- // lookup of a symbol with the null value, but the second can't actually
- // happen for this symbol. No point holding the handle forever if
- // we don't need the code
- dlclose(h);
- }
+static std::once_flag gVdsoInitOnce;
+static Getcpu::Func gVdsoGetcpuFunc;
+static int64_t (*gVdsoGettimeNsFunc)(clockid_t);
+/// Getcpu-compatible wrapper: caches the cpu per thread and re-queries gVdsoGetcpuFunc only when the coarse clock reading changes.
+static int cachingVdsoGetcpu(unsigned* cpu, unsigned* unused_node,
+ void* unused_tcache) {
+ static __thread unsigned tls_cpu; // per-thread result of the last gVdsoGetcpuFunc call
+ static __thread int64_t tls_lastContextSwitchNanos; // coarse clock reading stored at the last successful refresh
- return func;
+ auto lastContextSwitchNanos = gVdsoGettimeNsFunc(CLOCK_REALTIME_COARSE);
+ if (tls_lastContextSwitchNanos != lastContextSwitchNanos) { // reading changed: cached cpu is treated as stale
+ int rv = gVdsoGetcpuFunc(&tls_cpu, nullptr, nullptr);
+ if (rv != 0) {
+ return rv; // propagate the failure; the stored reading is left unchanged so the next call retries
+ }
+ tls_lastContextSwitchNanos = lastContextSwitchNanos;
+ }
+ *cpu = tls_cpu; // only *cpu is written; unused_node and unused_tcache are never touched
+ return 0;
}
+#endif
+/// Resolves the dynamically loaded symbols __vdso_getcpu and
+/// __vdso_clock_gettime_ns and returns cachingVdsoGetcpu when both are
+/// usable. Returns nullptr if either lookup fails, or if a little bit of
+/// probing shows that __vdso_clock_gettime_ns is not behaving as a coarse
+/// clock (i.e. it is using the slow fallback path).
Getcpu::Func Getcpu::vdsoFunc() {
- static Func func = loadVdsoGetcpu();
- return func;
+#ifdef CLOCK_REALTIME_COARSE
+ std::call_once(gVdsoInitOnce, []{ // probe body is guarded by gVdsoInitOnce; results land in the two globals
+ void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+ // RTLD_NOLOAD (see dlopen(3)): only yields a handle if the object is already resident; h may be null
+ typedef int64_t (*GettimeNsFunc)(clockid_t);
+ // Look up both symbols; with a null handle dlsym is skipped and both pointers are nullptr
+ auto getcpuFunc = Getcpu::Func(
+ !h ? nullptr : dlsym(h, "__vdso_getcpu"));
+ auto gettimeNsFunc = GettimeNsFunc(
+ !h ? nullptr : dlsym(h, "__vdso_clock_gettime_ns"));
+
+ bool coarseGettimeDetected = false;
+ if (gettimeNsFunc != nullptr) {
+ // The TLS cache of getcpu results only is an optimization if the
+ // __vdso_clock_gettime_ns implementation is fast and actually
+ // coarse. The slow fallback implementation is not coarse, so if
+ // we detect a coarse clock we are set. If CLOCK_REALTIME_COARSE
+ // has the right properties, then so long as there is no context
+ // switch between two calls the returned time will be identical.
+ // Dynamically verify this. An unlikely context switch while we're
+ // testing can lead to a false negative, but not a false positive,
+ // so we just run the test multiple times. This ensures that we
+ // will get two calls to gettimeNsFunc in a row with no intervening
+ // context switch.
+ auto prev = gettimeNsFunc(CLOCK_REALTIME_COARSE);
+ for (int i = 0; i < 10 && !coarseGettimeDetected; ++i) { // at most 10 comparisons, stops at the first match
+ auto next = gettimeNsFunc(CLOCK_REALTIME_COARSE);
+ coarseGettimeDetected = next == prev; // two consecutive identical readings => coarse clock
+ prev = next;
+ }
+ }
+ // A null gettimeNsFunc leaves coarseGettimeDetected false, so the check below covers both symbols
+ if (getcpuFunc == nullptr || !coarseGettimeDetected) {
+ // technically a null getcpuFunc could either be a failure or
+ // a successful lookup of a symbol with the null value, but the
+ // second can't actually happen for this symbol. No point holding
+ // the handle forever if we don't need the code
+ if (h) {
+ dlclose(h);
+ }
+ } else {
+ gVdsoGetcpuFunc = getcpuFunc; // publish both pointers for cachingVdsoGetcpu; the handle is kept open
+ gVdsoGettimeNsFunc = gettimeNsFunc;
+ }
+ });
+ // gVdsoGetcpuFunc is assigned only in the success branch above; otherwise the static stays null
+ if (gVdsoGetcpuFunc != nullptr) {
+ return cachingVdsoGetcpu;
+ }
+#endif
+ // Reached when the probe did not succeed, or when CLOCK_REALTIME_COARSE is not defined at compile time
+ return nullptr;
}
/////////////// SequentialThreadId
// _getcpu refers to the vdso getcpu implementation with a locally
// constructed AccessSpreader. _tls_rr refers to execution using
// SequentialThreadId, the fallback if the vdso getcpu isn't available.
-// _shared refers to calling AccessSpreader<>::current(numStripes)
-// inside the hot loop.
+// _shared refers to calling AccessSpreader<>::current(numStripes) inside
+// the hot loop.
//
-// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
-// so since the stripe selection is 21 nanos the atomic increments in
-// the L1 is ~15 nanos. At width 8_stripe_0_work the line is expected
-// to ping-pong almost every operation, since the loops have the same
-// duration. Widths 4 and 2 have the same behavior, but each tour of the
-// cache line is 4 and 8 cores long, respectively. These all suggest a
-// lower bound of 60 nanos for intra-chip handoff and increment between
-// the L1s.
+// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic, so
+// since the stripe selection is 6 nanos the atomic increment in the L1 is
+// ~15 nanos. At width 8_stripe_0_work the line is expected to ping-pong
+// almost every operation, since the loops have the same duration.
+// Widths 4 and 2 have the same behavior, but each tour of the cache line
+// is 4 and 8 cores long, respectively. These all suggest a lower bound
+// of ~60 nanos for intra-chip handoff and increment between the L1s.
//
-// With 455 nanos (1K cycles) of busywork per contended increment, the
-// system can hide all of the latency of a tour of length 4, but not
-// quite one of length 8. I was a bit surprised at how much worse the
-// non-striped version got. It seems that the inter-chip traffic also
-// interferes with the L1-only localWork.load(). When the local work is
-// doubled to about 1 microsecond we see that the inter-chip contention
-// is still very important, but subdivisions on the same chip don't matter.
+// With 396 nanos (500 std::memory_order_seq_cst loads) of busywork per
+// contended increment, the system can hide all of the latency of a tour
+// of length 4, but not quite one of length 8. I was a bit surprised
+// at how much worse the non-striped version got. It seems that the
+// inter-chip traffic also interferes with the L1-only localWork.load().
+// When the local work is doubled to 776 nanoseconds we see that the
+// inter-chip contention is still very important, but subdivisions on
+// the same chip don't matter.
//
// sudo nice -n -20
// _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
// ============================================================================
// folly/test/CacheLocalityTest.cpp relative time/iter iters/s
// ============================================================================
-// contentionAtWidth(1_stripe_0_work_stub) 1.14us 873.64K
-// contentionAtWidth(2_stripe_0_work_getcpu) 495.58ns 2.02M
-// contentionAtWidth(4_stripe_0_work_getcpu) 232.99ns 4.29M
-// contentionAtWidth(8_stripe_0_work_getcpu) 101.16ns 9.88M
-// contentionAtWidth(16_stripe_0_work_getcpu) 41.93ns 23.85M
-// contentionAtWidth(32_stripe_0_work_getcpu) 42.04ns 23.79M
-// contentionAtWidth(64_stripe_0_work_getcpu) 41.94ns 23.84M
-// contentionAtWidth(2_stripe_0_work_tls_rr) 1.00us 997.41K
-// contentionAtWidth(4_stripe_0_work_tls_rr) 694.41ns 1.44M
-// contentionAtWidth(8_stripe_0_work_tls_rr) 590.27ns 1.69M
-// contentionAtWidth(16_stripe_0_work_tls_rr) 222.13ns 4.50M
-// contentionAtWidth(32_stripe_0_work_tls_rr) 169.49ns 5.90M
-// contentionAtWidth(64_stripe_0_work_tls_rr) 162.20ns 6.17M
-// contentionAtWidth(2_stripe_0_work_shared) 495.54ns 2.02M
-// contentionAtWidth(4_stripe_0_work_shared) 236.27ns 4.23M
-// contentionAtWidth(8_stripe_0_work_shared) 114.81ns 8.71M
-// contentionAtWidth(16_stripe_0_work_shared) 44.65ns 22.40M
-// contentionAtWidth(32_stripe_0_work_shared) 41.76ns 23.94M
-// contentionAtWidth(64_stripe_0_work_shared) 43.47ns 23.00M
-// atomicIncrBaseline(local_incr_0_work) 20.39ns 49.06M
+// LocalAccessSpreaderUse 6.34ns 157.75M
+// SharedAccessSpreaderUse 6.34ns 157.75M
+// AccessSpreaderConstruction 328.19ns 3.05M
// ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_500_work_stub) 2.04us 491.13K
-// contentionAtWidth(2_stripe_500_work_getcpu) 610.98ns 1.64M
-// contentionAtWidth(4_stripe_500_work_getcpu) 507.72ns 1.97M
-// contentionAtWidth(8_stripe_500_work_getcpu) 542.53ns 1.84M
-// contentionAtWidth(16_stripe_500_work_getcpu) 496.55ns 2.01M
-// contentionAtWidth(32_stripe_500_work_getcpu) 500.67ns 2.00M
-// atomicIncrBaseline(local_incr_500_work) 484.69ns 2.06M
+// contentionAtWidth(1_stripe_0_work_stub) 909.99ns 1.10M
+// contentionAtWidth(2_stripe_0_work_getcpu) 527.54ns 1.90M
+// contentionAtWidth(4_stripe_0_work_getcpu) 260.28ns 3.84M
+// contentionAtWidth(8_stripe_0_work_getcpu) 131.82ns 7.59M
+// contentionAtWidth(16_stripe_0_work_getcpu) 25.92ns 38.58M
+// contentionAtWidth(32_stripe_0_work_getcpu) 21.80ns 45.88M
+// contentionAtWidth(64_stripe_0_work_getcpu) 20.06ns 49.85M
+// contentionAtWidth(2_stripe_0_work_tls_rr) 759.21ns 1.32M
+// contentionAtWidth(4_stripe_0_work_tls_rr) 607.46ns 1.65M
+// contentionAtWidth(8_stripe_0_work_tls_rr) 403.79ns 2.48M
+// contentionAtWidth(16_stripe_0_work_tls_rr) 188.14ns 5.32M
+// contentionAtWidth(32_stripe_0_work_tls_rr) 131.59ns 7.60M
+// contentionAtWidth(64_stripe_0_work_tls_rr) 103.56ns 9.66M
+// contentionAtWidth(2_stripe_0_work_shared) 553.07ns 1.81M
+// contentionAtWidth(4_stripe_0_work_shared) 274.23ns 3.65M
+// contentionAtWidth(8_stripe_0_work_shared) 137.43ns 7.28M
+// contentionAtWidth(16_stripe_0_work_shared) 24.52ns 40.78M
+// contentionAtWidth(32_stripe_0_work_shared) 21.80ns 45.86M
+// contentionAtWidth(64_stripe_0_work_shared) 21.66ns 46.17M
+// atomicIncrBaseline(local_incr_0_work) 16.73ns 59.78M
// ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_1000_work_stub) 2.11us 473.78K
-// contentionAtWidth(2_stripe_1000_work_getcpu) 970.64ns 1.03M
-// contentionAtWidth(4_stripe_1000_work_getcpu) 987.31ns 1.01M
-// contentionAtWidth(8_stripe_1000_work_getcpu) 1.01us 985.52K
-// contentionAtWidth(16_stripe_1000_work_getcpu) 986.09ns 1.01M
-// contentionAtWidth(32_stripe_1000_work_getcpu) 960.23ns 1.04M
-// atomicIncrBaseline(local_incr_1000_work) 950.63ns 1.05M
+// contentionAtWidth(1_stripe_500_work_stub) 1.75us 571.14K
+// contentionAtWidth(2_stripe_500_work_getcpu) 500.79ns 2.00M
+// contentionAtWidth(4_stripe_500_work_getcpu) 410.45ns 2.44M
+// contentionAtWidth(8_stripe_500_work_getcpu) 411.41ns 2.43M
+// contentionAtWidth(16_stripe_500_work_getcpu) 400.12ns 2.50M
+// contentionAtWidth(32_stripe_500_work_getcpu) 397.37ns 2.52M
+// atomicIncrBaseline(local_incr_500_work) 396.53ns 2.52M
+// ----------------------------------------------------------------------------
+// contentionAtWidth(1_stripe_1000_work_stub) 1.88us 530.59K
+// contentionAtWidth(2_stripe_1000_work_getcpu) 778.77ns 1.28M
+// contentionAtWidth(4_stripe_1000_work_getcpu) 779.56ns 1.28M
+// contentionAtWidth(8_stripe_1000_work_getcpu) 795.62ns 1.26M
+// contentionAtWidth(16_stripe_1000_work_getcpu) 778.81ns 1.28M
+// contentionAtWidth(32_stripe_1000_work_getcpu) 780.26ns 1.28M
+// atomicIncrBaseline(local_incr_1000_work) 776.39ns 1.29M
// ============================================================================
static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
SpreaderType spreaderType,