From: Nathan Bronson
Date: Fri, 6 Feb 2015 23:16:53 +0000 (-0800)
Subject: TLS cache for AccessSpreader
X-Git-Tag: v0.25.0~11
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=5569e6fa6061fdeb6c3c95ac4fe6d2fc9e4df7b6;p=folly.git

TLS cache for AccessSpreader

Summary: Under Linux the process gtod page doesn't contain the current
cpu, so __vdso_getcpu falls back to RDTSCP, which makes it cost about
20 nanos.  The gtod page _does_, however, contain a nanosecond timestamp
(for CLOCK_REALTIME_COARSE) that is updated during context switches.
This diff adds a TLS cache of the getcpu result, using
__vdso_clock_gettime_ns(CLOCK_REALTIME_COARSE) to detect context
switches and invalidate the cache.  The end result is that
AccessSpreader goes from ~20 nanos to ~6.
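In sketch form, the cached lookup works like this (a minimal standalone
illustration that substitutes the glibc wrappers sched_getcpu(3) and
clock_gettime(2) for the dlsym-resolved vdso symbols; cachedGetcpu and
the tls names are hypothetical):

  #define _GNU_SOURCE
  #include <sched.h>
  #include <time.h>
  #include <cstdint>

  // Return the current cpu, re-querying only when the coarse clock has
  // advanced.  Per the summary above, the gtod timestamp is refreshed
  // during context switches, so an unchanged reading means the thread
  // hasn't been switched out and the cached cpu is still current.
  int cachedGetcpu() {
    static __thread int tlsCpu = -1;
    static __thread int64_t tlsNanos = -1;

    timespec ts;
    clock_gettime(CLOCK_REALTIME_COARSE, &ts);
    int64_t nowNanos = int64_t(ts.tv_sec) * 1000000000 + ts.tv_nsec;

    if (tlsCpu < 0 || tlsNanos != nowNanos) {
      tlsCpu = sched_getcpu();  // the expensive (~20 nano) query
      tlsNanos = nowNanos;
    }
    return tlsCpu;
  }

On the cached path this is just a coarse clock read and a comparison,
which is where the ~6 nano figure comes from.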
Test Plan: unit tests

Reviewed By: davejwatson@fb.com

Subscribers: yfeldblum, trunkagent, folly-diffs@

FB internal diff: D1798922

Signature: t1:1798922:1423264298:32312a5e9bddb3b8aa630c146ef708164a6a4651
---

diff --git a/folly/detail/CacheLocality.cpp b/folly/detail/CacheLocality.cpp
index 7af5962f..ef562fa9 100644
--- a/folly/detail/CacheLocality.cpp
+++ b/folly/detail/CacheLocality.cpp
@@ -19,6 +19,7 @@
 #define _GNU_SOURCE 1 // for RTLD_NOLOAD
 #include <dlfcn.h>
 #include <fstream>
+#include <mutex>
 
 #include <folly/Conv.h>
 #include <folly/Exception.h>
@@ -36,7 +37,7 @@ static CacheLocality getSystemLocalityInfo() {
   try {
     return CacheLocality::readFromSysfs();
   } catch (...) {
-    // keep trying
+    // fall through to below if something goes wrong
   }
 #endif
 
@@ -201,29 +202,87 @@ CacheLocality CacheLocality::uniform(size_t numCpus) {
 
 ////////////// Getcpu
 
-/// Resolves the dynamically loaded symbol __vdso_getcpu, returning null
-/// on failure
-static Getcpu::Func loadVdsoGetcpu() {
-  void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
-  if (h == nullptr) {
-    return nullptr;
-  }
+#ifdef CLOCK_REALTIME_COARSE
 
-  auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
-  if (func == nullptr) {
-    // technically a null result could either be a failure or a successful
-    // lookup of a symbol with the null value, but the second can't actually
-    // happen for this symbol.  No point holding the handle forever if
-    // we don't need the code
-    dlclose(h);
-  }
+static std::once_flag gVdsoInitOnce;
+static Getcpu::Func gVdsoGetcpuFunc;
+static int64_t (*gVdsoGettimeNsFunc)(clockid_t);
+
+static int cachingVdsoGetcpu(unsigned* cpu, unsigned* unused_node,
+                             void* unused_tcache) {
+  static __thread unsigned tls_cpu;
+  static __thread int64_t tls_lastContextSwitchNanos;
 
-  return func;
+  auto lastContextSwitchNanos = gVdsoGettimeNsFunc(CLOCK_REALTIME_COARSE);
+  if (tls_lastContextSwitchNanos != lastContextSwitchNanos) {
+    int rv = gVdsoGetcpuFunc(&tls_cpu, nullptr, nullptr);
+    if (rv != 0) {
+      return rv;
+    }
+    tls_lastContextSwitchNanos = lastContextSwitchNanos;
+  }
+  *cpu = tls_cpu;
+  return 0;
 }
+#endif
 
+/// Resolves the dynamically loaded symbols __vdso_getcpu and
+/// __vdso_clock_gettime_ns, leaving both null on failure.  Does a
+/// little bit of probing to make sure that the __vdso_clock_gettime_ns
+/// function isn't using the slow fallback path.
 Getcpu::Func Getcpu::vdsoFunc() {
-  static Func func = loadVdsoGetcpu();
-  return func;
+#ifdef CLOCK_REALTIME_COARSE
+  std::call_once(gVdsoInitOnce, []{
+    void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+
+    typedef int64_t (*GettimeNsFunc)(clockid_t);
+
+    auto getcpuFunc = Getcpu::Func(
+        !h ? nullptr : dlsym(h, "__vdso_getcpu"));
+    auto gettimeNsFunc = GettimeNsFunc(
+        !h ? nullptr : dlsym(h, "__vdso_clock_gettime_ns"));
+
+    bool coarseGettimeDetected = false;
+    if (gettimeNsFunc != nullptr) {
+      // The TLS cache of getcpu results is only an optimization if the
+      // __vdso_clock_gettime_ns implementation is fast and actually
+      // coarse.  The slow fallback implementation is not coarse, so if
+      // we detect a coarse clock we are set.  If CLOCK_REALTIME_COARSE
+      // has the right properties, then so long as there is no context
+      // switch between two calls the returned time will be identical.
+      // Dynamically verify this.  An unlikely context switch while we're
+      // testing can lead to a false negative, but not a false positive,
+      // so we just run the test multiple times, which makes it very
+      // likely that at least one pair of calls to gettimeNsFunc has no
+      // intervening context switch.
+      auto prev = gettimeNsFunc(CLOCK_REALTIME_COARSE);
+      for (int i = 0; i < 10 && !coarseGettimeDetected; ++i) {
+        auto next = gettimeNsFunc(CLOCK_REALTIME_COARSE);
+        coarseGettimeDetected = next == prev;
+        prev = next;
+      }
+    }
+
+    if (getcpuFunc == nullptr || !coarseGettimeDetected) {
+      // technically a null getcpuFunc could either be a failure or
+      // a successful lookup of a symbol with the null value, but the
+      // second can't actually happen for this symbol.  No point holding
+      // the handle forever if we don't need the code
+      if (h) {
+        dlclose(h);
+      }
+    } else {
+      gVdsoGetcpuFunc = getcpuFunc;
+      gVdsoGettimeNsFunc = gettimeNsFunc;
+    }
+  });
+
+  if (gVdsoGetcpuFunc != nullptr) {
+    return cachingVdsoGetcpu;
+  }
+#endif
+
+  return nullptr;
 }
 
 /////////////// SequentialThreadId
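For reference, the coarse-clock probe above can be exercised in
isolation.  This standalone sketch substitutes the glibc clock_gettime
wrapper for the dlsym-resolved __vdso_clock_gettime_ns (so it
demonstrates only the detection logic, not the vdso fallback case);
detectCoarseClock is a made-up name:

  #define _GNU_SOURCE
  #include <time.h>
  #include <cstdio>

  // Two back-to-back reads of a genuinely coarse (tick-granularity)
  // clock usually return identical values, while a fine-grained
  // implementation essentially never does.  A context switch between
  // the two reads can cause a false negative but never a false
  // positive, hence the retries.
  bool detectCoarseClock() {
    timespec prev;
    clock_gettime(CLOCK_REALTIME_COARSE, &prev);
    for (int i = 0; i < 10; ++i) {
      timespec next;
      clock_gettime(CLOCK_REALTIME_COARSE, &next);
      if (next.tv_sec == prev.tv_sec && next.tv_nsec == prev.tv_nsec) {
        return true;  // identical consecutive reads: coarse confirmed
      }
      prev = next;
    }
    return false;
  }

  int main() {
    printf("coarse: %s\n", detectCoarseClock() ? "yes" : "no");
    return 0;
  }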
diff --git a/folly/detail/CacheLocality.h b/folly/detail/CacheLocality.h
index 107cf757..47adca8f 100644
--- a/folly/detail/CacheLocality.h
+++ b/folly/detail/CacheLocality.h
@@ -306,8 +306,7 @@ struct AccessSpreader {
 
   /// Points to the getcpu-like function we are using to obtain the
   /// current cpu.  It should not be assumed that the returned cpu value
-  /// is in range.  We use a member for this instead of a static so that
-  /// this fetch preloads a prefix the stripeByCpu array
+  /// is in range.
   Getcpu::Func getcpuFunc_;
 
   /// A precomputed map from cpu to stripe.  Rather than add a layer of
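To make the caller's side concrete: the _shared variant of the benchmark
in the test diff below boils down to a pattern like this sketch
(kStripes, PaddedCounter, and stripedIncrement are illustrative names;
the folly::detail namespace is assumed from the header path):

  #include <atomic>
  #include <cstddef>
  #include <cstdint>

  #include <folly/detail/CacheLocality.h>

  constexpr size_t kStripes = 16;

  // One counter per stripe, padded to a cache line so that distinct
  // stripes never share a line (which would defeat the striping).
  struct alignas(64) PaddedCounter {
    std::atomic<uint64_t> value{0};
  };
  PaddedCounter gCounters[kStripes];

  void stripedIncrement() {
    // Map the current cpu to one of kStripes stripes via the
    // precomputed cpu-to-stripe table described above.
    auto stripe = folly::detail::AccessSpreader<>::current(kStripes);
    gCounters[stripe].value.fetch_add(1, std::memory_order_relaxed);
  }

Threads that share a core or socket tend to land on the same stripe, so
contention stays within a cache domain instead of bouncing across chips.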
diff --git a/folly/test/CacheLocalityTest.cpp b/folly/test/CacheLocalityTest.cpp
index 82026c29..a062d9ec 100644
--- a/folly/test/CacheLocalityTest.cpp
+++ b/folly/test/CacheLocalityTest.cpp
@@ -447,67 +447,71 @@ enum class SpreaderType { GETCPU, SHARED, TLS_RR };
 // _getcpu refers to the vdso getcpu implementation with a locally
 // constructed AccessSpreader.  _tls_rr refers to execution using
 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
-// _shared refers to calling AccessSpreader<>::current(numStripes)
-// inside the hot loop.
+// _shared refers to calling AccessSpreader<>::current(numStripes) inside
+// the hot loop.
 //
-// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
-// so since the stripe selection is 21 nanos the atomic increments in
-// the L1 is ~15 nanos.  At width 8_stripe_0_work the line is expected
-// to ping-pong almost every operation, since the loops have the same
-// duration.  Widths 4 and 2 have the same behavior, but each tour of the
-// cache line is 4 and 8 cores long, respectively.  These all suggest a
-// lower bound of 60 nanos for intra-chip handoff and increment between
-// the L1s.
+// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic, so
+// since the stripe selection takes 6 nanos the atomic increment in the
+// L1 is ~15 nanos.  At width 8_stripe_0_work the line is expected to
+// ping-pong almost every operation, since the loops have the same
+// duration.  Widths 4 and 2 have the same behavior, but each tour of
+// the cache line is 4 and 8 cores long, respectively.  These all
+// suggest a lower bound of ~60 nanos for intra-chip handoff and
+// increment between the L1s.
 //
-// With 455 nanos (1K cycles) of busywork per contended increment, the
-// system can hide all of the latency of a tour of length 4, but not
-// quite one of length 8.  I was a bit surprised at how much worse the
-// non-striped version got.  It seems that the inter-chip traffic also
-// interferes with the L1-only localWork.load().  When the local work is
-// doubled to about 1 microsecond we see that the inter-chip contention
-// is still very important, but subdivisions on the same chip don't matter.
+// With 396 nanos (500 std::memory_order_seq_cst loads) of busywork per
+// contended increment, the system can hide all of the latency of a tour
+// of length 4, but not quite one of length 8.  I was a bit surprised
+// at how much worse the non-striped version got.  It seems that the
+// inter-chip traffic also interferes with the L1-only localWork.load().
+// When the local work is roughly doubled to 776 nanos we see that the
+// inter-chip contention is still very important, but subdivisions on
+// the same chip don't matter.
 //
 // sudo nice -n -20
 // _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
 // ============================================================================
 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
 // ============================================================================
-// contentionAtWidth(1_stripe_0_work_stub)                      1.14us  873.64K
-// contentionAtWidth(2_stripe_0_work_getcpu)                  495.58ns    2.02M
-// contentionAtWidth(4_stripe_0_work_getcpu)                  232.99ns    4.29M
-// contentionAtWidth(8_stripe_0_work_getcpu)                  101.16ns    9.88M
-// contentionAtWidth(16_stripe_0_work_getcpu)                  41.93ns   23.85M
-// contentionAtWidth(32_stripe_0_work_getcpu)                  42.04ns   23.79M
-// contentionAtWidth(64_stripe_0_work_getcpu)                  41.94ns   23.84M
-// contentionAtWidth(2_stripe_0_work_tls_rr)                    1.00us  997.41K
-// contentionAtWidth(4_stripe_0_work_tls_rr)                  694.41ns    1.44M
-// contentionAtWidth(8_stripe_0_work_tls_rr)                  590.27ns    1.69M
-// contentionAtWidth(16_stripe_0_work_tls_rr)                 222.13ns    4.50M
-// contentionAtWidth(32_stripe_0_work_tls_rr)                 169.49ns    5.90M
-// contentionAtWidth(64_stripe_0_work_tls_rr)                 162.20ns    6.17M
-// contentionAtWidth(2_stripe_0_work_shared)                  495.54ns    2.02M
-// contentionAtWidth(4_stripe_0_work_shared)                  236.27ns    4.23M
-// contentionAtWidth(8_stripe_0_work_shared)                  114.81ns    8.71M
-// contentionAtWidth(16_stripe_0_work_shared)                  44.65ns   22.40M
-// contentionAtWidth(32_stripe_0_work_shared)                  41.76ns   23.94M
-// contentionAtWidth(64_stripe_0_work_shared)                  43.47ns   23.00M
-// atomicIncrBaseline(local_incr_0_work)                       20.39ns   49.06M
+// LocalAccessSpreaderUse                                               6.34ns  157.75M
+// SharedAccessSpreaderUse                                              6.34ns  157.75M
+// AccessSpreaderConstruction                                 328.19ns    3.05M
 // ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_500_work_stub)                    2.04us  491.13K
-// contentionAtWidth(2_stripe_500_work_getcpu)                610.98ns    1.64M
-// contentionAtWidth(4_stripe_500_work_getcpu)                507.72ns    1.97M
-// contentionAtWidth(8_stripe_500_work_getcpu)                542.53ns    1.84M
-// contentionAtWidth(16_stripe_500_work_getcpu)               496.55ns    2.01M
-// contentionAtWidth(32_stripe_500_work_getcpu)               500.67ns    2.00M
-// atomicIncrBaseline(local_incr_500_work)                    484.69ns    2.06M
+// contentionAtWidth(1_stripe_0_work_stub)                    909.99ns    1.10M
+// contentionAtWidth(2_stripe_0_work_getcpu)                  527.54ns    1.90M
+// contentionAtWidth(4_stripe_0_work_getcpu)                  260.28ns    3.84M
+// contentionAtWidth(8_stripe_0_work_getcpu)                  131.82ns    7.59M
+// contentionAtWidth(16_stripe_0_work_getcpu)                  25.92ns   38.58M
+// contentionAtWidth(32_stripe_0_work_getcpu)                  21.80ns   45.88M
+// contentionAtWidth(64_stripe_0_work_getcpu)                  20.06ns   49.85M
+// contentionAtWidth(2_stripe_0_work_tls_rr)                  759.21ns    1.32M
+// contentionAtWidth(4_stripe_0_work_tls_rr)                  607.46ns    1.65M
+// contentionAtWidth(8_stripe_0_work_tls_rr)                  403.79ns    2.48M
+// contentionAtWidth(16_stripe_0_work_tls_rr)                 188.14ns    5.32M
+// contentionAtWidth(32_stripe_0_work_tls_rr)                 131.59ns    7.60M
+// contentionAtWidth(64_stripe_0_work_tls_rr)                 103.56ns    9.66M
+// contentionAtWidth(2_stripe_0_work_shared)                  553.07ns    1.81M
+// contentionAtWidth(4_stripe_0_work_shared)                  274.23ns    3.65M
+// contentionAtWidth(8_stripe_0_work_shared)                  137.43ns    7.28M
+// contentionAtWidth(16_stripe_0_work_shared)                  24.52ns   40.78M
+// contentionAtWidth(32_stripe_0_work_shared)                  21.80ns   45.86M
+// contentionAtWidth(64_stripe_0_work_shared)                  21.66ns   46.17M
+// atomicIncrBaseline(local_incr_0_work)                       16.73ns   59.78M
 // ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_1000_work_stub)                   2.11us  473.78K
-// contentionAtWidth(2_stripe_1000_work_getcpu)               970.64ns    1.03M
-// contentionAtWidth(4_stripe_1000_work_getcpu)               987.31ns    1.01M
-// contentionAtWidth(8_stripe_1000_work_getcpu)                 1.01us  985.52K
-// contentionAtWidth(16_stripe_1000_work_getcpu)              986.09ns    1.01M
-// contentionAtWidth(32_stripe_1000_work_getcpu)              960.23ns    1.04M
-// atomicIncrBaseline(local_incr_1000_work)                   950.63ns    1.05M
+// contentionAtWidth(1_stripe_500_work_stub)                    1.75us  571.14K
+// contentionAtWidth(2_stripe_500_work_getcpu)                500.79ns    2.00M
+// contentionAtWidth(4_stripe_500_work_getcpu)                410.45ns    2.44M
+// contentionAtWidth(8_stripe_500_work_getcpu)                411.41ns    2.43M
+// contentionAtWidth(16_stripe_500_work_getcpu)               400.12ns    2.50M
+// contentionAtWidth(32_stripe_500_work_getcpu)               397.37ns    2.52M
+// atomicIncrBaseline(local_incr_500_work)                    396.53ns    2.52M
+// ----------------------------------------------------------------------------
+// contentionAtWidth(1_stripe_1000_work_stub)                   1.88us  530.59K
+// contentionAtWidth(2_stripe_1000_work_getcpu)               778.77ns    1.28M
+// contentionAtWidth(4_stripe_1000_work_getcpu)               779.56ns    1.28M
+// contentionAtWidth(8_stripe_1000_work_getcpu)               795.62ns    1.26M
+// contentionAtWidth(16_stripe_1000_work_getcpu)              778.81ns    1.28M
+// contentionAtWidth(32_stripe_1000_work_getcpu)              780.26ns    1.28M
+// atomicIncrBaseline(local_incr_1000_work)                   776.39ns    1.29M
 // ============================================================================
 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
                               SpreaderType spreaderType,