From 6a9fa4cf60c50dbc935051ab2e5273771e5010bb Mon Sep 17 00:00:00 2001
From: Nathan Bronson
Date: Wed, 11 Feb 2015 14:23:06 -0800
Subject: [PATCH] Revert "[folly] TLS cache for AccessSpreader"

Summary: This reverts commit 4ebb2303bbcf343d7c2bcc95d55557c0a3b444f3.

The caching mechanism was based on a misunderstanding of the
implementation of CLOCK_MONOTONIC_COARSE, and is not correct.  On the
plus side, the upstream kernel patch e76b027e6408 "x86,vdso: Use LSL
unconditionally for vgetcpu" gets the vdso getcpu from 16 nanos down
to 10 (half of which is the inherent indirect call).

Test Plan: unit tests

Reviewed By: delong.j@fb.com

Subscribers: trunkagent, bmaurer, tudorb, folly-diffs@, yfeldblum, jdelong

FB internal diff: D1840690

Signature: t1:1840690:1423693026:33801341ec0b83bf47e050be6528c5dd05021ed5
---
 folly/detail/CacheLocality.cpp   |  97 ++++++----------------------
 folly/detail/CacheLocality.h     |   3 +-
 folly/test/CacheLocalityTest.cpp | 106 +++++++++++++++----------------
 3 files changed, 72 insertions(+), 134 deletions(-)

diff --git a/folly/detail/CacheLocality.cpp b/folly/detail/CacheLocality.cpp
index ef562fa9..7af5962f 100644
--- a/folly/detail/CacheLocality.cpp
+++ b/folly/detail/CacheLocality.cpp
@@ -19,7 +19,6 @@
 #define _GNU_SOURCE 1 // for RTLD_NOLOAD
 #include <dlfcn.h>
 #include <fstream>
-#include <mutex>
 #include
 #include
@@ -37,7 +36,7 @@ static CacheLocality getSystemLocalityInfo() {
   try {
     return CacheLocality::readFromSysfs();
   } catch (...) {
-    // fall through to below if something goes wrong
+    // keep trying
   }
 #endif
@@ -202,87 +201,29 @@ CacheLocality CacheLocality::uniform(size_t numCpus) {
 
 ////////////// Getcpu
 
-#ifdef CLOCK_REALTIME_COARSE
-
-static std::once_flag gVdsoInitOnce;
-static Getcpu::Func gVdsoGetcpuFunc;
-static int64_t (*gVdsoGettimeNsFunc)(clockid_t);
-
-static int cachingVdsoGetcpu(unsigned* cpu, unsigned* unused_node,
-                             void* unused_tcache) {
-  static __thread unsigned tls_cpu;
-  static __thread int64_t tls_lastContextSwitchNanos;
+/// Resolves the dynamically loaded symbol __vdso_getcpu, returning null
+/// on failure
+static Getcpu::Func loadVdsoGetcpu() {
+  void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+  if (h == nullptr) {
+    return nullptr;
+  }
 
-  auto lastContextSwitchNanos = gVdsoGettimeNsFunc(CLOCK_REALTIME_COARSE);
-  if (tls_lastContextSwitchNanos != lastContextSwitchNanos) {
-    int rv = gVdsoGetcpuFunc(&tls_cpu, nullptr, nullptr);
-    if (rv != 0) {
-      return rv;
-    }
-    tls_lastContextSwitchNanos = lastContextSwitchNanos;
+  auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
+  if (func == nullptr) {
+    // technically a null result could either be a failure or a successful
+    // lookup of a symbol with the null value, but the second can't actually
+    // happen for this symbol.  No point holding the handle forever if
+    // we don't need the code
+    dlclose(h);
   }
-  *cpu = tls_cpu;
-  return 0;
+
+  return func;
 }
-#endif
 
-/// Resolves the dynamically loaded symbol __vdso_getcpu and
-/// __vdso_clock_gettime_ns, returning a pair of nulls on failure.  Does a
-/// little bit of probing to make sure that the __vdso_clock_gettime_ns
-/// function isn't using the slow fallback path.
 Getcpu::Func Getcpu::vdsoFunc() {
-#ifdef CLOCK_REALTIME_COARSE
-  std::call_once(gVdsoInitOnce, []{
-    void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
-
-    typedef int64_t (*GettimeNsFunc)(clockid_t);
-
-    auto getcpuFunc = Getcpu::Func(
-        !h ? nullptr : dlsym(h, "__vdso_getcpu"));
-    auto gettimeNsFunc = GettimeNsFunc(
-        !h ? nullptr : dlsym(h, "__vdso_clock_gettime_ns"));
-
-    bool coarseGettimeDetected = false;
-    if (gettimeNsFunc != nullptr) {
-      // The TLS cache of getcpu results only is an optimization if the
-      // __vdso_clock_gettime_ns implementation is fast and actually
-      // coarse.  The slow fallback implementation is not coarse, so if
-      // we detect a coarse clock we are set.  If CLOCK_REALTIME_COARSE
-      // has the right properties, then so long as there is no context
-      // switch between two calls the returned time will be identical.
-      // Dynamically verify this.  An unlikely context switch while we're
-      // testing can lead to a false negative, but not a false positive,
-      // so we just run the test multiple times.  This ensures that we
-      // will get two calls to gettimeNsFunc in a row with no intervening
-      // context switch.
-      auto prev = gettimeNsFunc(CLOCK_REALTIME_COARSE);
-      for (int i = 0; i < 10 && !coarseGettimeDetected; ++i) {
-        auto next = gettimeNsFunc(CLOCK_REALTIME_COARSE);
-        coarseGettimeDetected = next == prev;
-        prev = next;
-      }
-    }
-
-    if (getcpuFunc == nullptr || !coarseGettimeDetected) {
-      // technically a null getcpuFunc could either be a failure or
-      // a successful lookup of a symbol with the null value, but the
-      // second can't actually happen for this symbol.  No point holding
-      // the handle forever if we don't need the code
-      if (h) {
-        dlclose(h);
-      }
-    } else {
-      gVdsoGetcpuFunc = getcpuFunc;
-      gVdsoGettimeNsFunc = gettimeNsFunc;
-    }
-  });
-
-  if (gVdsoGetcpuFunc != nullptr) {
-    return cachingVdsoGetcpu;
-  }
-#endif
-
-  return nullptr;
+  static Func func = loadVdsoGetcpu();
+  return func;
 }
 
 /////////////// SequentialThreadId

diff --git a/folly/detail/CacheLocality.h b/folly/detail/CacheLocality.h
index 47adca8f..107cf757 100644
--- a/folly/detail/CacheLocality.h
+++ b/folly/detail/CacheLocality.h
@@ -306,7 +306,8 @@ struct AccessSpreader {
 
   /// Points to the getcpu-like function we are using to obtain the
   /// current cpu.  It should not be assumed that the returned cpu value
-  /// is in range.
+  /// is in range.  We use a member for this instead of a static so that
+  /// this fetch preloads a prefix of the stripeByCpu array.
   Getcpu::Func getcpuFunc_;
 
   /// A precomputed map from cpu to stripe.  Rather than add a layer of

diff --git a/folly/test/CacheLocalityTest.cpp b/folly/test/CacheLocalityTest.cpp
index a062d9ec..82026c29 100644
--- a/folly/test/CacheLocalityTest.cpp
+++ b/folly/test/CacheLocalityTest.cpp
@@ -447,71 +447,67 @@ enum class SpreaderType { GETCPU, SHARED, TLS_RR };
 // _getcpu refers to the vdso getcpu implementation with a locally
 // constructed AccessSpreader.  _tls_rr refers to execution using
 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
-// _shared refers to calling AccessSpreader<>::current(numStripes) inside
-// the hot loop.
+// _shared refers to calling AccessSpreader<>::current(numStripes)
+// inside the hot loop.
 //
-// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic, so
-// since the stripe selection is 6 nanos the atomic increments in the L1 is
-// ~15 nanos.  At width 8_stripe_0_work the line is expected to ping-pong
-// almost every operation, since the loops have the same duration.
-// Widths 4 and 2 have the same behavior, but each tour of the cache line
-// is 4 and 8 cores long, respectively.  These all suggest a lower bound
-// of ~60 nanos for intra-chip handoff and increment between the L1s.
+// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
+// so since the stripe selection is 21 nanos the atomic increment in
+// the L1 is ~15 nanos.  At width 8_stripe_0_work the line is expected
+// to ping-pong almost every operation, since the loops have the same
+// duration.  Widths 4 and 2 have the same behavior, but each tour of the
+// cache line is 4 and 8 cores long, respectively.  These all suggest a
+// lower bound of 60 nanos for intra-chip handoff and increment between
+// the L1s.
 //
-// With 396 nanos (500 std::memory_order_seq_cst loads) of busywork per
-// contended increment, the system can hide all of the latency of a tour
-// of length 4, but not quite one of length 8.  I was a bit surprised
-// at how much worse the non-striped version got.  It seems that the
-// inter-chip traffic also interferes with the L1-only localWork.load().
-// When the local work is doubled to 776 nanoseconds we see that the
-// inter-chip contention is still very important, but subdivisions on
-// the same chip don't matter.
+// With 455 nanos (1K cycles) of busywork per contended increment, the
+// system can hide all of the latency of a tour of length 4, but not
+// quite one of length 8.  I was a bit surprised at how much worse the
+// non-striped version got.  It seems that the inter-chip traffic also
+// interferes with the L1-only localWork.load().  When the local work is
+// doubled to about 1 microsecond we see that the inter-chip contention
+// is still very important, but subdivisions on the same chip don't matter.
 //
 // sudo nice -n -20
 //   _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
 // ============================================================================
 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
 // ============================================================================
-// LocalAccessSpreaderUse                                       6.34ns  157.75M
-// SharedAccessSpreaderUse                                      6.34ns  157.75M
-// AccessSpreaderConstruction                                 328.19ns    3.05M
+// contentionAtWidth(1_stripe_0_work_stub)                      1.14us  873.64K
+// contentionAtWidth(2_stripe_0_work_getcpu)                  495.58ns    2.02M
+// contentionAtWidth(4_stripe_0_work_getcpu)                  232.99ns    4.29M
+// contentionAtWidth(8_stripe_0_work_getcpu)                  101.16ns    9.88M
+// contentionAtWidth(16_stripe_0_work_getcpu)                  41.93ns   23.85M
+// contentionAtWidth(32_stripe_0_work_getcpu)                  42.04ns   23.79M
+// contentionAtWidth(64_stripe_0_work_getcpu)                  41.94ns   23.84M
+// contentionAtWidth(2_stripe_0_work_tls_rr)                    1.00us  997.41K
+// contentionAtWidth(4_stripe_0_work_tls_rr)                  694.41ns    1.44M
+// contentionAtWidth(8_stripe_0_work_tls_rr)                  590.27ns    1.69M
+// contentionAtWidth(16_stripe_0_work_tls_rr)                 222.13ns    4.50M
+// contentionAtWidth(32_stripe_0_work_tls_rr)                 169.49ns    5.90M
+// contentionAtWidth(64_stripe_0_work_tls_rr)                 162.20ns    6.17M
+// contentionAtWidth(2_stripe_0_work_shared)                  495.54ns    2.02M
+// contentionAtWidth(4_stripe_0_work_shared)                  236.27ns    4.23M
+// contentionAtWidth(8_stripe_0_work_shared)                  114.81ns    8.71M
+// contentionAtWidth(16_stripe_0_work_shared)                  44.65ns   22.40M
+// contentionAtWidth(32_stripe_0_work_shared)                  41.76ns   23.94M
+// contentionAtWidth(64_stripe_0_work_shared)                  43.47ns   23.00M
+// atomicIncrBaseline(local_incr_0_work)                       20.39ns   49.06M
 // ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_0_work_stub)                    909.99ns    1.10M
-// contentionAtWidth(2_stripe_0_work_getcpu)                  527.54ns    1.90M
-// contentionAtWidth(4_stripe_0_work_getcpu)                  260.28ns    3.84M
-// contentionAtWidth(8_stripe_0_work_getcpu)                  131.82ns    7.59M
-// contentionAtWidth(16_stripe_0_work_getcpu)                  25.92ns   38.58M
-// contentionAtWidth(32_stripe_0_work_getcpu)                  21.80ns   45.88M
-// contentionAtWidth(64_stripe_0_work_getcpu)                  20.06ns   49.85M
-// contentionAtWidth(2_stripe_0_work_tls_rr)                  759.21ns    1.32M
-// contentionAtWidth(4_stripe_0_work_tls_rr)                  607.46ns    1.65M
-// contentionAtWidth(8_stripe_0_work_tls_rr)                  403.79ns    2.48M
-// contentionAtWidth(16_stripe_0_work_tls_rr)                 188.14ns    5.32M
-// contentionAtWidth(32_stripe_0_work_tls_rr)                 131.59ns    7.60M
-// contentionAtWidth(64_stripe_0_work_tls_rr)                 103.56ns    9.66M
-// contentionAtWidth(2_stripe_0_work_shared)                  553.07ns    1.81M
-// contentionAtWidth(4_stripe_0_work_shared)                  274.23ns    3.65M
-// contentionAtWidth(8_stripe_0_work_shared)                  137.43ns    7.28M
-// contentionAtWidth(16_stripe_0_work_shared)                  24.52ns   40.78M
-// contentionAtWidth(32_stripe_0_work_shared)                  21.80ns   45.86M
-// contentionAtWidth(64_stripe_0_work_shared)                  21.66ns   46.17M
-// atomicIncrBaseline(local_incr_0_work)                       16.73ns   59.78M
+// contentionAtWidth(1_stripe_500_work_stub)                    2.04us  491.13K
+// contentionAtWidth(2_stripe_500_work_getcpu)                610.98ns    1.64M
+// contentionAtWidth(4_stripe_500_work_getcpu)                507.72ns    1.97M
+// contentionAtWidth(8_stripe_500_work_getcpu)                542.53ns    1.84M
+// contentionAtWidth(16_stripe_500_work_getcpu)               496.55ns    2.01M
+// contentionAtWidth(32_stripe_500_work_getcpu)               500.67ns    2.00M
+// atomicIncrBaseline(local_incr_500_work)                    484.69ns    2.06M
 // ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_500_work_stub)                    1.75us  571.14K
-// contentionAtWidth(2_stripe_500_work_getcpu)                500.79ns    2.00M
-// contentionAtWidth(4_stripe_500_work_getcpu)                410.45ns    2.44M
-// contentionAtWidth(8_stripe_500_work_getcpu)                411.41ns    2.43M
-// contentionAtWidth(16_stripe_500_work_getcpu)               400.12ns    2.50M
-// contentionAtWidth(32_stripe_500_work_getcpu)               397.37ns    2.52M
-// atomicIncrBaseline(local_incr_500_work)                    396.53ns    2.52M
-// ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_1000_work_stub)                   1.88us  530.59K
-// contentionAtWidth(2_stripe_1000_work_getcpu)               778.77ns    1.28M
-// contentionAtWidth(4_stripe_1000_work_getcpu)               779.56ns    1.28M
-// contentionAtWidth(8_stripe_1000_work_getcpu)               795.62ns    1.26M
-// contentionAtWidth(16_stripe_1000_work_getcpu)              778.81ns    1.28M
-// contentionAtWidth(32_stripe_1000_work_getcpu)              780.26ns    1.28M
-// atomicIncrBaseline(local_incr_1000_work)                   776.39ns    1.29M
+// contentionAtWidth(1_stripe_1000_work_stub)                   2.11us  473.78K
+// contentionAtWidth(2_stripe_1000_work_getcpu)               970.64ns    1.03M
+// contentionAtWidth(4_stripe_1000_work_getcpu)               987.31ns    1.01M
+// contentionAtWidth(8_stripe_1000_work_getcpu)                 1.01us  985.52K
+// contentionAtWidth(16_stripe_1000_work_getcpu)              986.09ns    1.01M
+// contentionAtWidth(32_stripe_1000_work_getcpu)              960.23ns    1.04M
+// atomicIncrBaseline(local_incr_1000_work)                   950.63ns    1.05M
 // ============================================================================
 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
                               SpreaderType spreaderType,
-- 
2.34.1
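
Note on why the cache was unsound: the reverted code treated an unchanged
CLOCK_REALTIME_COARSE reading as proof that no context switch had occurred, so
the thread-local cpu id could be reused.  The coarse clocks only advance once
per kernel tick (typically 1-4 ms), independent of scheduling, so an unchanged
reading means "same tick", not "same cpu"; a thread that migrates within a
tick keeps serving the stale tls_cpu, which appears to be the incorrectness
the summary refers to.  A minimal standalone sketch of that tick granularity,
using plain clock_gettime rather than the vdso symbols above (the printed
text is illustrative):

  #include <time.h>  // clock_gettime, CLOCK_MONOTONIC_COARSE (POSIX/Linux)
  #include <cstdio>

  int main() {
    timespec a{};
    timespec b{};
    clock_gettime(CLOCK_MONOTONIC_COARSE, &a);
    clock_gettime(CLOCK_MONOTONIC_COARSE, &b);
    // Back-to-back reads almost always land in the same tick, even though
    // the thread could in principle have migrated cores between them.
    bool sameTick = a.tv_sec == b.tv_sec && a.tv_nsec == b.tv_nsec;
    std::printf("same tick: %s\n", sameTick ? "yes" : "no");
    return 0;
  }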
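
For reference, the vdso lookup that the restored loadVdsoGetcpu() performs,
as a self-contained program (Linux only; link with -ldl on older glibc; the
GetcpuFunc alias and the printf are local to this sketch):

  #include <dlfcn.h>
  #include <cstdio>

  // Same shape as folly's Getcpu::Func: returns 0 on success.
  using GetcpuFunc = int (*)(unsigned* cpu, unsigned* node, void* unused);

  int main() {
    // RTLD_NOLOAD: succeed only if the vdso is already mapped, which it is
    // for any normal Linux process; never load anything from disk.
    void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
    if (h == nullptr) {
      return 1;
    }
    auto func = reinterpret_cast<GetcpuFunc>(dlsym(h, "__vdso_getcpu"));
    if (func == nullptr) {
      dlclose(h);  // symbol not exported; no point keeping the handle
      return 1;
    }
    unsigned cpu = 0;
    if (func(&cpu, nullptr, nullptr) == 0) {
      std::printf("running on cpu %u\n", cpu);
    }
    return 0;
  }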
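
And the consumer-side pattern the benchmark comment measures: pick a stripe
with AccessSpreader<>::current(numStripes) so that threads on nearby cores
share a counter while distant cores avoid contending on a single cache line.
A hedged sketch only: kStripes, PaddedCounter, and the folly::detail
namespace qualification are assumptions of this example, not taken from the
diff.

  #include <array>
  #include <atomic>
  #include <cstddef>
  #include <cstdint>
  #include <folly/detail/CacheLocality.h>

  constexpr size_t kStripes = 16;

  // One counter per stripe, each padded to its own cache line so that the
  // stripes do not false-share.
  struct alignas(64) PaddedCounter {
    std::atomic<uint64_t> value{0};
  };
  static std::array<PaddedCounter, kStripes> counters;

  void increment() {
    // current() maps the calling thread's cpu (via getcpu or a fallback)
    // to a stripe, so threads on nearby cores tend to hit the same line.
    auto stripe = folly::detail::AccessSpreader<>::current(kStripes);
    counters[stripe].value.fetch_add(1, std::memory_order_relaxed);
  }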