TLS cache for AccessSpreader
author Nathan Bronson <ngbronson@fb.com>
Fri, 6 Feb 2015 23:16:53 +0000 (15:16 -0800)
committer Sara Golemon <sgolemon@fb.com>
Wed, 11 Feb 2015 02:02:00 +0000 (18:02 -0800)
Summary:
Under Linux the process gtod (the vDSO's gettimeofday data page) doesn't
contain the current cpu, so __vdso_getcpu uses RDTSCP, which makes it
cost about 20 nanos.  The gtod _does_, however, contain a nanosecond
timestamp (for CLOCK_REALTIME_COARSE) that is updated during context
switches.  This diff adds a TLS cache that uses
__vdso_clock_gettime_ns(CLOCK_REALTIME_COARSE) to detect context
switches, re-querying the real getcpu only when one may have occurred.
The end result is that AccessSpreader's cpu lookup goes from ~20 nanos
to ~6.
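
In essence (a simplified sketch of the idea; sched_getcpu() and
clock_gettime() stand in here for the raw __vdso_* symbols that the
real code below resolves, and the function name is illustrative):

  #include <cstdint>
  #include <sched.h>  // sched_getcpu (glibc)
  #include <time.h>   // clock_gettime, CLOCK_REALTIME_COARSE

  static int cachedGetcpu(unsigned* cpu) {
    static __thread unsigned tls_cpu;
    static __thread int64_t tls_lastNanos = -1;

    // Cheap coarse clock read.  If the coarse time is unchanged since
    // our last read then no context switch has occurred in between, so
    // the cached cpu is still valid.
    timespec ts;
    clock_gettime(CLOCK_REALTIME_COARSE, &ts);
    auto nanos = int64_t(ts.tv_sec) * 1000000000 + ts.tv_nsec;

    if (nanos != tls_lastNanos) {
      int rv = sched_getcpu();  // the expensive ~20 nano RDTSCP path
      if (rv < 0) {
        return -1;
      }
      tls_cpu = unsigned(rv);
      tls_lastNanos = nanos;
    }
    *cpu = tls_cpu;  // the common ~6 nano cached path
    return 0;
  }

The win comes from replacing an RDTSCP on every call with a load from
the shared time page on most calls.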

Test Plan: unit tests

Reviewed By: davejwatson@fb.com

Subscribers: yfeldblum, trunkagent, folly-diffs@

FB internal diff: D1798922

Signature: t1:1798922:1423264298:32312a5e9bddb3b8aa630c146ef708164a6a4651

folly/detail/CacheLocality.cpp
folly/detail/CacheLocality.h
folly/test/CacheLocalityTest.cpp

index 7af5962fb575c44892fa571787cc466a09a90158..ef562fa99f584e34ee98a7021df6ef8bba696cc5 100644 (file)
@@ -19,6 +19,7 @@
 #define _GNU_SOURCE 1 // for RTLD_NOLOAD
 #include <dlfcn.h>
 #include <fstream>
+#include <mutex>
 
 #include <folly/Conv.h>
 #include <folly/Exception.h>
@@ -36,7 +37,7 @@ static CacheLocality getSystemLocalityInfo() {
   try {
     return CacheLocality::readFromSysfs();
   } catch (...) {
-    // keep trying
+    // fall through to below if something goes wrong
   }
 #endif
 
@@ -201,29 +202,87 @@ CacheLocality CacheLocality::uniform(size_t numCpus) {
 
 ////////////// Getcpu
 
-/// Resolves the dynamically loaded symbol __vdso_getcpu, returning null
-/// on failure
-static Getcpu::Func loadVdsoGetcpu() {
-  void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
-  if (h == nullptr) {
-    return nullptr;
-  }
+#ifdef CLOCK_REALTIME_COARSE
 
-  auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
-  if (func == nullptr) {
-    // technically a null result could either be a failure or a successful
-    // lookup of a symbol with the null value, but the second can't actually
-    // happen for this symbol.  No point holding the handle forever if
-    // we don't need the code
-    dlclose(h);
-  }
+static std::once_flag gVdsoInitOnce;
+static Getcpu::Func gVdsoGetcpuFunc;
+static int64_t (*gVdsoGettimeNsFunc)(clockid_t);
+
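+// getcpu-like function that caches its result per-thread, re-querying
+// the real __vdso_getcpu only when the coarse realtime clock has
+// advanced (i.e. when a context switch might have moved this thread)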
+static int cachingVdsoGetcpu(unsigned* cpu, unsigned* unused_node,
+                             void* unused_tcache) {
+  static __thread unsigned tls_cpu;
+  static __thread int64_t tls_lastContextSwitchNanos;
 
-  return func;
+  auto lastContextSwitchNanos = gVdsoGettimeNsFunc(CLOCK_REALTIME_COARSE);
+  if (tls_lastContextSwitchNanos != lastContextSwitchNanos) {
+    int rv = gVdsoGetcpuFunc(&tls_cpu, nullptr, nullptr);
+    if (rv != 0) {
+      return rv;
+    }
+    tls_lastContextSwitchNanos = lastContextSwitchNanos;
+  }
+  *cpu = tls_cpu;
+  return 0;
 }
+#endif
 
+/// Resolves the dynamically loaded symbols __vdso_getcpu and
+/// __vdso_clock_gettime_ns, returning nullptr if either is unavailable.
+/// Does a little bit of probing to make sure that the
+/// __vdso_clock_gettime_ns function isn't using the slow fallback path.
 Getcpu::Func Getcpu::vdsoFunc() {
-  static Func func = loadVdsoGetcpu();
-  return func;
+#ifdef CLOCK_REALTIME_COARSE
+  std::call_once(gVdsoInitOnce, []{
+    void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+
+    typedef int64_t (*GettimeNsFunc)(clockid_t);
+
+    auto getcpuFunc = Getcpu::Func(
+        !h ? nullptr : dlsym(h, "__vdso_getcpu"));
+    auto gettimeNsFunc = GettimeNsFunc(
+        !h ? nullptr : dlsym(h, "__vdso_clock_gettime_ns"));
+
+    bool coarseGettimeDetected = false;
+    if (gettimeNsFunc != nullptr) {
+      // The TLS cache of getcpu results is only an optimization if the
+      // __vdso_clock_gettime_ns implementation is fast and actually
+      // coarse.  The slow fallback implementation is not coarse, so if
+      // we detect a coarse clock we are set.  If CLOCK_REALTIME_COARSE
+      // has the right properties, then so long as there is no context
+      // switch between two calls the returned time will be identical.
+      // Dynamically verify this.  An unlikely context switch (or clock
+      // tick) while we're testing can lead to a false negative, but not
+      // a false positive, so we just run the test multiple times.  This
+      // makes it overwhelmingly likely that we will observe two calls
+      // to gettimeNsFunc in a row with no intervening context switch.
+      auto prev = gettimeNsFunc(CLOCK_REALTIME_COARSE);
+      for (int i = 0; i < 10 && !coarseGettimeDetected; ++i) {
+        auto next = gettimeNsFunc(CLOCK_REALTIME_COARSE);
+        coarseGettimeDetected = next == prev;
+        prev = next;
+      }
+    }
+
+    if (getcpuFunc == nullptr || !coarseGettimeDetected) {
+      // technically a null getcpuFunc could either be a failure or
+      // a successful lookup of a symbol with the null value, but the
+      // second can't actually happen for this symbol.  No point holding
+      // the handle forever if we don't need the code
+      if (h) {
+        dlclose(h);
+      }
+    } else {
+      gVdsoGetcpuFunc = getcpuFunc;
+      gVdsoGettimeNsFunc = gettimeNsFunc;
+    }
+  });
+
+  if (gVdsoGetcpuFunc != nullptr) {
+    return cachingVdsoGetcpu;
+  }
+#endif
+
+  return nullptr;
 }
 
 /////////////// SequentialThreadId
index 107cf7577b449f4dc154b71018dafc67e00a55c4..47adca8fbb64a526302f1799121567c2445f7977 100644 (file)
@@ -306,8 +306,7 @@ struct AccessSpreader {
 
   /// Points to the getcpu-like function we are using to obtain the
   /// current cpu.  It should not be assumed that the returned cpu value
-  /// is in range.  We use a member for this instead of a static so that
-  /// this fetch preloads a prefix the stripeByCpu array
+  /// is in range.
   Getcpu::Func getcpuFunc_;
 
   /// A precomputed map from cpu to stripe.  Rather than add a layer of
index 82026c29d45a96acea3fd38a0cb62aa022cd39f3..a062d9ec64492ed9fb2a2c9d7e4745c67acc3c81 100644 (file)
@@ -447,67 +447,71 @@ enum class SpreaderType { GETCPU, SHARED, TLS_RR };
 // _getcpu refers to the vdso getcpu implementation with a locally
 // constructed AccessSpreader.  _tls_rr refers to execution using
 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
-// _shared refers to calling AccessSpreader<>::current(numStripes)
-// inside the hot loop.
+// _shared refers to calling AccessSpreader<>::current(numStripes) inside
+// the hot loop.
 //
-// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
-// so since the stripe selection is 21 nanos the atomic increments in
-// the L1 is ~15 nanos.  At width 8_stripe_0_work the line is expected
-// to ping-pong almost every operation, since the loops have the same
-// duration.  Widths 4 and 2 have the same behavior, but each tour of the
-// cache line is 4 and 8 cores long, respectively.  These all suggest a
-// lower bound of 60 nanos for intra-chip handoff and increment between
-// the L1s.
+// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic, so
+// since the stripe selection takes ~6 nanos the atomic increment in the
+// L1 costs ~15 nanos.  At width 8_stripe_0_work the line is expected to
+// ping-pong on almost every operation, since the loops have the same
+// duration.  Widths 4 and 2 have the same behavior, but each tour of the
+// cache line is 4 and 8 cores long, respectively.  These all suggest a
+// lower bound of ~60 nanos for intra-chip handoff and increment between
+// the L1s.
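+// (For example, at width 32 with no work: 21.80ns per iteration minus
+// ~6ns of stripe selection leaves ~15ns for the L1-resident increment.)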
 //
-// With 455 nanos (1K cycles) of busywork per contended increment, the
-// system can hide all of the latency of a tour of length 4, but not
-// quite one of length 8.  I was a bit surprised at how much worse the
-// non-striped version got.  It seems that the inter-chip traffic also
-// interferes with the L1-only localWork.load().  When the local work is
-// doubled to about 1 microsecond we see that the inter-chip contention
-// is still very important, but subdivisions on the same chip don't matter.
+// With 396 nanos (500 std::memory_order_seq_cst loads) of busywork per
+// contended increment, the system can hide all of the latency of a tour
+// of length 4, but not quite one of length 8.  I was a bit surprised
+// at how much worse the non-striped version got.  It seems that the
+// inter-chip traffic also interferes with the L1-only localWork.load().
+// When the local work is doubled to 776 nanoseconds we see that the
+// inter-chip contention is still very important, but subdivisions on
+// the same chip don't matter.
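+// (The busywork is presumably a simple loop of seq_cst loads, e.g.
+// "for (size_t i = 0; i < work; ++i) { localWork.load(); }", which at
+// 500 iterations per 396 nanos is roughly 0.8 nanos per load.)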
 //
 // sudo nice -n -20
 //   _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
 // ============================================================================
 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
 // ============================================================================
-// contentionAtWidth(1_stripe_0_work_stub)                      1.14us  873.64K
-// contentionAtWidth(2_stripe_0_work_getcpu)                  495.58ns    2.02M
-// contentionAtWidth(4_stripe_0_work_getcpu)                  232.99ns    4.29M
-// contentionAtWidth(8_stripe_0_work_getcpu)                  101.16ns    9.88M
-// contentionAtWidth(16_stripe_0_work_getcpu)                  41.93ns   23.85M
-// contentionAtWidth(32_stripe_0_work_getcpu)                  42.04ns   23.79M
-// contentionAtWidth(64_stripe_0_work_getcpu)                  41.94ns   23.84M
-// contentionAtWidth(2_stripe_0_work_tls_rr)                    1.00us  997.41K
-// contentionAtWidth(4_stripe_0_work_tls_rr)                  694.41ns    1.44M
-// contentionAtWidth(8_stripe_0_work_tls_rr)                  590.27ns    1.69M
-// contentionAtWidth(16_stripe_0_work_tls_rr)                 222.13ns    4.50M
-// contentionAtWidth(32_stripe_0_work_tls_rr)                 169.49ns    5.90M
-// contentionAtWidth(64_stripe_0_work_tls_rr)                 162.20ns    6.17M
-// contentionAtWidth(2_stripe_0_work_shared)                  495.54ns    2.02M
-// contentionAtWidth(4_stripe_0_work_shared)                  236.27ns    4.23M
-// contentionAtWidth(8_stripe_0_work_shared)                  114.81ns    8.71M
-// contentionAtWidth(16_stripe_0_work_shared)                  44.65ns   22.40M
-// contentionAtWidth(32_stripe_0_work_shared)                  41.76ns   23.94M
-// contentionAtWidth(64_stripe_0_work_shared)                  43.47ns   23.00M
-// atomicIncrBaseline(local_incr_0_work)                       20.39ns   49.06M
+// LocalAccessSpreaderUse                                       6.34ns  157.75M
+// SharedAccessSpreaderUse                                      6.34ns  157.75M
+// AccessSpreaderConstruction                                 328.19ns    3.05M
 // ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_500_work_stub)                    2.04us  491.13K
-// contentionAtWidth(2_stripe_500_work_getcpu)                610.98ns    1.64M
-// contentionAtWidth(4_stripe_500_work_getcpu)                507.72ns    1.97M
-// contentionAtWidth(8_stripe_500_work_getcpu)                542.53ns    1.84M
-// contentionAtWidth(16_stripe_500_work_getcpu)               496.55ns    2.01M
-// contentionAtWidth(32_stripe_500_work_getcpu)               500.67ns    2.00M
-// atomicIncrBaseline(local_incr_500_work)                    484.69ns    2.06M
+// contentionAtWidth(1_stripe_0_work_stub)                    909.99ns    1.10M
+// contentionAtWidth(2_stripe_0_work_getcpu)                  527.54ns    1.90M
+// contentionAtWidth(4_stripe_0_work_getcpu)                  260.28ns    3.84M
+// contentionAtWidth(8_stripe_0_work_getcpu)                  131.82ns    7.59M
+// contentionAtWidth(16_stripe_0_work_getcpu)                  25.92ns   38.58M
+// contentionAtWidth(32_stripe_0_work_getcpu)                  21.80ns   45.88M
+// contentionAtWidth(64_stripe_0_work_getcpu)                  20.06ns   49.85M
+// contentionAtWidth(2_stripe_0_work_tls_rr)                  759.21ns    1.32M
+// contentionAtWidth(4_stripe_0_work_tls_rr)                  607.46ns    1.65M
+// contentionAtWidth(8_stripe_0_work_tls_rr)                  403.79ns    2.48M
+// contentionAtWidth(16_stripe_0_work_tls_rr)                 188.14ns    5.32M
+// contentionAtWidth(32_stripe_0_work_tls_rr)                 131.59ns    7.60M
+// contentionAtWidth(64_stripe_0_work_tls_rr)                 103.56ns    9.66M
+// contentionAtWidth(2_stripe_0_work_shared)                  553.07ns    1.81M
+// contentionAtWidth(4_stripe_0_work_shared)                  274.23ns    3.65M
+// contentionAtWidth(8_stripe_0_work_shared)                  137.43ns    7.28M
+// contentionAtWidth(16_stripe_0_work_shared)                  24.52ns   40.78M
+// contentionAtWidth(32_stripe_0_work_shared)                  21.80ns   45.86M
+// contentionAtWidth(64_stripe_0_work_shared)                  21.66ns   46.17M
+// atomicIncrBaseline(local_incr_0_work)                       16.73ns   59.78M
 // ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_1000_work_stub)                   2.11us  473.78K
-// contentionAtWidth(2_stripe_1000_work_getcpu)               970.64ns    1.03M
-// contentionAtWidth(4_stripe_1000_work_getcpu)               987.31ns    1.01M
-// contentionAtWidth(8_stripe_1000_work_getcpu)                 1.01us  985.52K
-// contentionAtWidth(16_stripe_1000_work_getcpu)              986.09ns    1.01M
-// contentionAtWidth(32_stripe_1000_work_getcpu)              960.23ns    1.04M
-// atomicIncrBaseline(local_incr_1000_work)                   950.63ns    1.05M
+// contentionAtWidth(1_stripe_500_work_stub)                    1.75us  571.14K
+// contentionAtWidth(2_stripe_500_work_getcpu)                500.79ns    2.00M
+// contentionAtWidth(4_stripe_500_work_getcpu)                410.45ns    2.44M
+// contentionAtWidth(8_stripe_500_work_getcpu)                411.41ns    2.43M
+// contentionAtWidth(16_stripe_500_work_getcpu)               400.12ns    2.50M
+// contentionAtWidth(32_stripe_500_work_getcpu)               397.37ns    2.52M
+// atomicIncrBaseline(local_incr_500_work)                    396.53ns    2.52M
+// ----------------------------------------------------------------------------
+// contentionAtWidth(1_stripe_1000_work_stub)                   1.88us  530.59K
+// contentionAtWidth(2_stripe_1000_work_getcpu)               778.77ns    1.28M
+// contentionAtWidth(4_stripe_1000_work_getcpu)               779.56ns    1.28M
+// contentionAtWidth(8_stripe_1000_work_getcpu)               795.62ns    1.26M
+// contentionAtWidth(16_stripe_1000_work_getcpu)              778.81ns    1.28M
+// contentionAtWidth(32_stripe_1000_work_getcpu)              780.26ns    1.28M
+// atomicIncrBaseline(local_incr_1000_work)                   776.39ns    1.29M
 // ============================================================================
 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
                               SpreaderType spreaderType,