fix SIOF in CacheLocality.h's AccessSpreader
author Nathan Bronson <ngbronson@fb.com>
Mon, 22 Feb 2016 21:08:30 +0000 (13:08 -0800)
committer facebook-github-bot-4 <folly-bot@fb.com>
Mon, 22 Feb 2016 21:20:26 +0000 (13:20 -0800)
Summary: This diff moves all data accessed during
AccessSpreader<>::current(x) into the .data segment, avoiding the static
initialization order fiasco (SIOF) without the indirection or dynamic
gating that normal singleton-like constructs would require.  The diff also
trims the AccessSpreader API down to the methods that people actually use.
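
The shape of the pattern is roughly the following (a minimal, hypothetical
sketch with illustrative names, not the folly code itself): every static
member gets a constant or zero initializer that the linker can place
directly in .data/.bss, so current() is valid even if it runs before the
dynamic initializer that installs the real getcpu function and locality
table; early callers simply observe stripe 0.

// Sketch of the SIOF-avoidance pattern (illustrative names only).
#include <cstddef>

struct SpreaderSketch {
  static std::size_t current(std::size_t numStripes) {
    // Reads only constant/zero-initialized statics, so it is safe to call
    // even before initialize() has run -- it just reports stripe 0.
    std::size_t width = numStripes <= kMaxCpus ? numStripes : kMaxCpus;
    return widthAndCpuToStripe[width][getcpuFunc() % kMaxCpus];
  }

 private:
  static constexpr std::size_t kMaxCpus = 8;

  static unsigned degenerateGetcpu() { return 0; }  // "always CPU zero"

  // Constant-initialized function pointer and zero-initialized table: both
  // live in .data/.bss and are valid before any dynamic initializer runs.
  static unsigned (*getcpuFunc)();
  static unsigned char widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus];
  static bool initialized;  // forces initialize() at static-init time

  static bool initialize() {
    getcpuFunc = &degenerateGetcpu;  // real code resolves the vdso getcpu
    // real code fills widthAndCpuToStripe from CacheLocality here
    return true;
  }
};

unsigned (*SpreaderSketch::getcpuFunc)() = &SpreaderSketch::degenerateGetcpu;
unsigned char
    SpreaderSketch::widthAndCpuToStripe[SpreaderSketch::kMaxCpus + 1]
                                       [SpreaderSketch::kMaxCpus] = {};
bool SpreaderSketch::initialized = SpreaderSketch::initialize();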

Reviewed By: djwatson

Differential Revision: D2945205

fb-gh-sync-id: 847e31adc4450217f4ed0575686be261fb504d7c
shipit-source-id: 847e31adc4450217f4ed0575686be261fb504d7c

folly/detail/CacheLocality.cpp
folly/detail/CacheLocality.h
folly/test/CacheLocalityTest.cpp
folly/test/DeterministicSchedule.cpp
folly/test/DeterministicSchedule.h

index 6f7657f5a6b64a42ef9d9f75eb72f322388af331..d174b6f5928e37dfcc60ad15da00c8ea8b211b96 100644 (file)
@@ -28,6 +28,8 @@
 #include <folly/Format.h>
 #include <folly/ScopeGuard.h>
 
+DECLARE_ACCESS_SPREADER_TYPE(std::atomic)
+
 namespace folly {
 namespace detail {
 
@@ -60,8 +62,8 @@ static CacheLocality getSystemLocalityInfo() {
 
 template <>
 const CacheLocality& CacheLocality::system<std::atomic>() {
-  static CacheLocality cache(getSystemLocalityInfo());
-  return cache;
+  static auto* cache = new CacheLocality(getSystemLocalityInfo());
+  return *cache;
 }
 
 // Each level of cache has sharing sets, which are the set of cpus
@@ -110,8 +112,7 @@ CacheLocality CacheLocality::readFromSysfsTree(
     std::vector<size_t> levels;
     for (size_t index = 0;; ++index) {
       auto dir =
-          format("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index)
-              .str();
+          sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
       auto cacheType = mapping(dir + "type");
       auto equivStr = mapping(dir + "shared_cpu_list");
       if (cacheType.size() == 0 || equivStr.size() == 0) {
@@ -208,9 +209,7 @@ CacheLocality CacheLocality::uniform(size_t numCpus) {
 
 ////////////// Getcpu
 
-/// Resolves the dynamically loaded symbol __vdso_getcpu, returning null
-/// on failure
-static Getcpu::Func loadVdsoGetcpu() {
+Getcpu::Func Getcpu::resolveVdsoFunc() {
 #if defined(_MSC_VER) || defined(__BIONIC__)
   return nullptr;
 #else
@@ -232,11 +231,6 @@ static Getcpu::Func loadVdsoGetcpu() {
 #endif
 }
 
-Getcpu::Func Getcpu::vdsoFunc() {
-  static Func func = loadVdsoGetcpu();
-  return func;
-}
-
 #ifdef FOLLY_TLS
 /////////////// SequentialThreadId
 
@@ -250,40 +244,10 @@ FOLLY_TLS size_t SequentialThreadId<std::atomic>::currentId(0);
 /////////////// AccessSpreader
 
 template <>
-const AccessSpreader<std::atomic> AccessSpreader<std::atomic>::stripeByCore(
-    CacheLocality::system<>().numCachesByLevel.front());
-
-template <>
-const AccessSpreader<std::atomic> AccessSpreader<std::atomic>::stripeByChip(
-    CacheLocality::system<>().numCachesByLevel.back());
-
-template <>
-AccessSpreaderArray<std::atomic, 128>
-    AccessSpreaderArray<std::atomic, 128>::sharedInstance = {};
-
-/// Always claims to be on CPU zero, node zero
-static int degenerateGetcpu(unsigned* cpu, unsigned* node, void* /* unused */) {
-  if (cpu != nullptr) {
-    *cpu = 0;
-  }
-  if (node != nullptr) {
-    *node = 0;
-  }
-  return 0;
+Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc() {
+  auto best = Getcpu::resolveVdsoFunc();
+  return best ? best : &FallbackGetcpuType::getcpu;
 }
 
-template <>
-Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t numStripes) {
-  if (numStripes == 1) {
-    // there's no need to call getcpu if there is only one stripe.
-    // This should not be common, so we don't want to waste a test and
-    // branch in the main code path, but we might as well use a faster
-    // function pointer
-    return &degenerateGetcpu;
-  } else {
-    auto best = Getcpu::vdsoFunc();
-    return best ? best : &FallbackGetcpuType::getcpu;
-  }
-}
-}
-} // namespace folly::detail
+} // namespace detail
+} // namespace folly
index ac9de657cc0694c0902565a6855ed1dd5b0b0e5e..c58d21c57709c5053bde830e01f491713975488d 100644 (file)
@@ -131,15 +131,16 @@ struct CacheLocality {
 /// it doesn't have false sharing with anything at a smaller memory address.
 #define FOLLY_ALIGN_TO_AVOID_FALSE_SHARING FOLLY_ALIGNED(128)
 
-/// Holds a function pointer to the VDSO implementation of getcpu(2),
-/// if available
+/// Knows how to derive a function pointer to the VDSO implementation of
+/// getcpu(2), if available
 struct Getcpu {
   /// Function pointer to a function with the same signature as getcpu(2).
   typedef int (*Func)(unsigned* cpu, unsigned* node, void* unused);
 
   /// Returns a pointer to the VDSO implementation of getcpu(2), if
-  /// available, or nullptr otherwise
-  static Func vdsoFunc();
+  /// available, or nullptr otherwise.  This function may be quite
+  /// expensive, be sure to cache the result.
+  static Func resolveVdsoFunc();
 };
 
 #ifdef FOLLY_TLS
@@ -197,23 +198,14 @@ typedef FallbackGetcpu<SequentialThreadId<std::atomic>> FallbackGetcpuType;
 typedef FallbackGetcpu<HashingThreadId> FallbackGetcpuType;
 #endif
 
-template <template <typename> class Atom, size_t kMaxCpus>
-struct AccessSpreaderArray;
-
 /// AccessSpreader arranges access to a striped data structure in such a
 /// way that concurrently executing threads are likely to be accessing
 /// different stripes.  It does NOT guarantee uncontended access.
 /// Your underlying algorithm must be thread-safe without spreading, this
 /// is merely an optimization.  AccessSpreader::current(n) is typically
-/// much faster than a cache miss (22 nanos on my dev box, tested fast
+/// much faster than a cache miss (12 nanos on my dev box, tested fast
 /// in both 2.6 and 3.2 kernels).
 ///
-/// You are free to create your own AccessSpreader-s or to cache the
-/// results of AccessSpreader<>::shared(n), but you will probably want
-/// to use one of the system-wide shared ones.  Calling .current() on
-/// a particular AccessSpreader instance only saves about 1 nanosecond
-/// over calling AccessSpreader<>::shared(n).
-///
 /// If available (and not using the deterministic testing implementation)
 /// AccessSpreader uses the getcpu system call via VDSO and the
 /// precise locality information retrieved from sysfs by CacheLocality.
@@ -228,10 +220,11 @@ struct AccessSpreaderArray;
 /// own stripe and there will be no cache sharing at all.
 ///
 /// AccessSpreader has a fallback mechanism for when __vdso_getcpu can't be
-/// loaded, or for use during deterministic testing.  Using sched_getcpu or
-/// the getcpu syscall would negate the performance advantages of access
-/// spreading, so we use a thread-local value and a shared atomic counter
-/// to spread access out.
+/// loaded, or for use during deterministic testing.  Using sched_getcpu
+/// or the getcpu syscall would negate the performance advantages of
+/// access spreading, so we use a thread-local value and a shared atomic
+/// counter to spread access out.  On systems lacking both a fast getcpu()
+/// and TLS, we hash the thread id to spread accesses.
 ///
 /// AccessSpreader is templated on the template type that is used
 /// to implement atomics, as a way to instantiate the underlying
@@ -242,70 +235,17 @@ struct AccessSpreaderArray;
 template <template <typename> class Atom = std::atomic>
 struct AccessSpreader {
 
-  /// Returns a never-destructed shared AccessSpreader instance.
-  /// numStripes should be > 0.
-  static const AccessSpreader& shared(size_t numStripes) {
-    // sharedInstances[0] actually has numStripes == 1
-    assert(numStripes > 0);
-
-    // the last shared element handles all large sizes
-    return AccessSpreaderArray<Atom, kMaxCpus>::sharedInstance[std::min(
-        size_t(kMaxCpus), numStripes)];
-  }
-
-  /// Returns the stripe associated with the current CPU, assuming
-  /// that there are numStripes (non-zero) stripes.  Equivalent to
-  /// AccessSpreader::shared(numStripes)->current.
+  /// Returns the stripe associated with the current CPU.  The returned
+  /// value will be < numStripes.
   static size_t current(size_t numStripes) {
-    return shared(numStripes).current();
-  }
-
-  /// stripeByCore uses 1 stripe per L1 cache, according to
-  /// CacheLocality::system<>().  Use stripeByCore.numStripes() to see
-  /// its width, or stripeByCore.current() to get the current stripe
-  static const AccessSpreader stripeByCore;
-
-  /// stripeByChip uses 1 stripe per last-level cache, which is the fewest
-  /// number of stripes for which off-chip communication can be avoided
-  /// (assuming all caches are on-chip).  Use stripeByChip.numStripes()
-  /// to see its width, or stripeByChip.current() to get the current stripe
-  static const AccessSpreader stripeByChip;
-
-  /// Constructs an AccessSpreader that will return values from
-  /// 0 to numStripes-1 (inclusive), precomputing the mapping
-  /// from CPU to stripe.  There is no use in having more than
-  /// CacheLocality::system<Atom>().localityIndexByCpu.size() stripes or
-  /// kMaxCpus stripes
-  explicit AccessSpreader(
-      size_t spreaderNumStripes,
-      const CacheLocality& cacheLocality = CacheLocality::system<Atom>(),
-      Getcpu::Func getcpuFunc = nullptr)
-      : getcpuFunc_(getcpuFunc ? getcpuFunc
-                               : pickGetcpuFunc(spreaderNumStripes)),
-        numStripes_(spreaderNumStripes) {
-    auto n = cacheLocality.numCpus;
-    for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
-      auto index = cacheLocality.localityIndexByCpu[cpu];
-      assert(index < n);
-      // as index goes from 0..n, post-transform value goes from
-      // 0..numStripes
-      stripeByCpu[cpu] = (index * numStripes_) / n;
-      assert(stripeByCpu[cpu] < numStripes_);
-    }
-    for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
-      stripeByCpu[cpu] = stripeByCpu[cpu - n];
-    }
-  }
-
-  /// Returns 1 more than the maximum value that can be returned from
-  /// current()
-  size_t numStripes() const { return numStripes_; }
+    // widthAndCpuToStripe[0] will actually work okay (all zeros), but
+    // something's wrong with the caller
+    assert(numStripes > 0);
 
-  /// Returns the stripe associated with the current CPU
-  size_t current() const {
     unsigned cpu;
-    getcpuFunc_(&cpu, nullptr, nullptr);
-    return stripeByCpu[cpu % kMaxCpus];
+    getcpuFunc(&cpu, nullptr, nullptr);
+    return widthAndCpuToStripe[std::min(size_t(kMaxCpus),
+                                        numStripes)][cpu % kMaxCpus];
   }
 
  private:
@@ -322,61 +262,88 @@ struct AccessSpreader {
 
   /// Points to the getcpu-like function we are using to obtain the
   /// current cpu.  It should not be assumed that the returned cpu value
-  /// is in range.  We use a member for this instead of a static so that
-  /// this fetch preloads a prefix the stripeByCpu array
-  Getcpu::Func getcpuFunc_;
-
-  /// A precomputed map from cpu to stripe.  Rather than add a layer of
-  /// indirection requiring a dynamic bounds check and another cache miss,
-  /// we always precompute the whole array
-  CompactStripe stripeByCpu[kMaxCpus];
-
-  size_t numStripes_;
-
-  /// Returns the best getcpu implementation for this type and width
-  /// of AccessSpreader
-  static Getcpu::Func pickGetcpuFunc(size_t numStripes);
-};
-
-template <>
-Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t);
-
-/// An array of kMaxCpus+1 AccessSpreader<Atom> instances constructed
-/// with default params, with the zero-th element having 1 stripe
-template <template <typename> class Atom, size_t kMaxStripe>
-struct AccessSpreaderArray {
-
-  AccessSpreaderArray() {
-    for (size_t i = 0; i <= kMaxStripe; ++i) {
-      new (raw + i) AccessSpreader<Atom>(std::max(size_t(1), i));
+  /// is in range.  We use a static for this so that we can prearrange a
+  /// valid value in the pre-constructed state and avoid the need for a
+  /// conditional on every subsequent invocation (not normally a big win,
+  /// but 20% on some inner loops here).
+  static Getcpu::Func getcpuFunc;
+
+  /// For each level of splitting up to kMaxCpus, maps the cpu (mod
+  /// kMaxCpus) to the stripe.  Rather than performing any inequalities
+  /// or modulo on the actual number of cpus, we just fill in the entire
+  /// array.
+  static CompactStripe widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus];
+
+  static bool initialized;
+
+  /// Returns the best getcpu implementation for Atom
+  static Getcpu::Func pickGetcpuFunc();
+
+  /// Always claims to be on CPU zero, node zero
+  static int degenerateGetcpu(unsigned* cpu, unsigned* node, void*) {
+    if (cpu != nullptr) {
+      *cpu = 0;
     }
+    if (node != nullptr) {
+      *node = 0;
+    }
+    return 0;
   }
 
-  ~AccessSpreaderArray() {
-    for (size_t i = 0; i <= kMaxStripe; ++i) {
-      auto p = static_cast<AccessSpreader<Atom>*>(static_cast<void*>(raw + i));
-      p->~AccessSpreader();
+  // The function to call for fast lookup of getcpu is a singleton, as
+  // is the precomputed table of locality information.  AccessSpreader
+  // is used in very tight loops, however (we're trying to race an L1
+  // cache miss!), so the normal singleton mechanisms are noticeably
+  // expensive.  Even a not-taken branch guarding access to getcpuFunc
+  // slows AccessSpreader::current from 12 nanos to 14.  As a result, we
+  // populate the static members with simple (but valid) values that can
+  // be filled in by the linker, and then follow up with a normal static
+  // initializer call that puts in the proper version.  This means that
+  // when there are initialization order issues we will just observe a
+  // zero stripe.  Once a sanitizer gets smart enough to detect this as
+  // a race or undefined behavior, we can annotate it.
+
+  static bool initialize() {
+    getcpuFunc = pickGetcpuFunc();
+
+    auto& cacheLocality = CacheLocality::system<Atom>();
+    auto n = cacheLocality.numCpus;
+    for (size_t width = 0; width <= kMaxCpus; ++width) {
+      auto numStripes = std::max(size_t{1}, width);
+      for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
+        auto index = cacheLocality.localityIndexByCpu[cpu];
+        assert(index < n);
+        // as index goes from 0..n, post-transform value goes from
+        // 0..numStripes
+        widthAndCpuToStripe[width][cpu] = (index * numStripes) / n;
+        assert(widthAndCpuToStripe[width][cpu] < numStripes);
+      }
+      for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
+        widthAndCpuToStripe[width][cpu] = widthAndCpuToStripe[width][cpu - n];
+      }
     }
+    return true;
   }
+};
 
-  AccessSpreader<Atom> const& operator[](size_t index) const {
-    return *static_cast<AccessSpreader<Atom> const*>(
-               static_cast<void const*>(raw + index));
+template <>
+Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc();
+
+#define DECLARE_ACCESS_SPREADER_TYPE(Atom)                                     \
+  namespace folly {                                                            \
+  namespace detail {                                                           \
+  template <>                                                                  \
+  Getcpu::Func AccessSpreader<Atom>::getcpuFunc =                              \
+      AccessSpreader<Atom>::degenerateGetcpu;                                  \
+  template <>                                                                  \
+  typename AccessSpreader<Atom>::CompactStripe                                 \
+      AccessSpreader<Atom>::widthAndCpuToStripe[129][128] = {};                \
+  template <>                                                                  \
+  bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize(); \
+  }                                                                            \
   }
 
- private:
-  // AccessSpreader uses sharedInstance
-  friend AccessSpreader<Atom>;
-
-  static AccessSpreaderArray<Atom, kMaxStripe> sharedInstance;
-
-  /// aligned_storage is uninitialized, we use placement new since there
-  /// is no AccessSpreader default constructor
-  typename std::aligned_storage<sizeof(AccessSpreader<Atom>),
-                                CacheLocality::kFalseSharingRange>::type
-      raw[kMaxStripe + 1];
-};
-}
-}
+} // namespace detail
+} // namespace folly
 
 #endif /* FOLLY_DETAIL_CacheLocality_H_ */
index 9eb9facff0272f5d3dd5b079a0440f1ded6210ea..4a5ee9990abc3d9950348f74aa71adcc103fed8c 100644 (file)
@@ -354,7 +354,7 @@ TEST(CacheLocality, FakeSysfs) {
 
 TEST(Getcpu, VdsoGetcpu) {
   unsigned cpu;
-  Getcpu::vdsoFunc()(&cpu, nullptr, nullptr);
+  Getcpu::resolveVdsoFunc()(&cpu, nullptr, nullptr);
 
   EXPECT_TRUE(cpu < CPU_SETSIZE);
 }
@@ -398,206 +398,143 @@ static int testingGetcpu(unsigned* cpu, unsigned* node, void* /* unused */) {
   return 0;
 }
 
-TEST(AccessSpreader, Stubbed) {
-  std::vector<std::unique_ptr<AccessSpreader<>>> spreaders(100);
-  for (size_t s = 1; s < spreaders.size(); ++s) {
-    spreaders[s].reset(
-        new AccessSpreader<>(s, nonUniformExampleLocality, &testingGetcpu));
-  }
-  std::vector<size_t> cpusInLocalityOrder = {
-      0, 17, 1,  18, 2,  19, 3,  20, 4,  21, 5,  6,  7,  22, 8,  23,
-      9, 24, 10, 25, 11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31};
-  for (size_t i = 0; i < 32; ++i) {
-    // extra i * 32 is to check wrapping behavior of impl
-    testingCpu = cpusInLocalityOrder[i] + i * 64;
-    for (size_t s = 1; s < spreaders.size(); ++s) {
-      EXPECT_EQ((i * s) / 32, spreaders[s]->current())
-          << "i=" << i << ", cpu=" << testingCpu << ", s=" << s;
-    }
-  }
-}
-
-TEST(AccessSpreader, Default) {
-  AccessSpreader<> spreader(16);
-  EXPECT_LT(spreader.current(), 16);
-}
-
-TEST(AccessSpreader, Shared) {
+TEST(AccessSpreader, Simple) {
   for (size_t s = 1; s < 200; ++s) {
-    EXPECT_LT(AccessSpreader<>::shared(s).current(), s);
+    EXPECT_LT(AccessSpreader<>::current(s), s);
   }
 }
 
-TEST(AccessSpreader, Statics) {
-  LOG(INFO) << "stripeByCore.numStripes() = "
-            << AccessSpreader<>::stripeByCore.numStripes();
-  LOG(INFO) << "stripeByChip.numStripes() = "
-            << AccessSpreader<>::stripeByChip.numStripes();
-  for (size_t s = 1; s < 200; ++s) {
-    EXPECT_LT(AccessSpreader<>::current(s), s);
+#define DECLARE_SPREADER_TAG(tag, locality, func)      \
+  namespace {                                          \
+  template <typename dummy>                            \
+  struct tag {};                                       \
+  }                                                    \
+  DECLARE_ACCESS_SPREADER_TYPE(tag)                    \
+  namespace folly {                                    \
+  namespace detail {                                   \
+  template <>                                          \
+  const CacheLocality& CacheLocality::system<tag>() {  \
+    static auto* inst = new CacheLocality(locality);   \
+    return *inst;                                      \
+  }                                                    \
+  template <>                                          \
+  Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
+    return func;                                       \
+  }                                                    \
+  }                                                    \
   }
-}
+
+DECLARE_SPREADER_TAG(ManualTag, CacheLocality::uniform(16), testingGetcpu)
+DECLARE_SPREADER_TAG(
+    ThreadLocalTag,
+    CacheLocality::system<>(),
+    folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
+DECLARE_SPREADER_TAG(PthreadSelfTag,
+                     CacheLocality::system<>(),
+                     folly::detail::FallbackGetcpu<HashingThreadId>::getcpu)
 
 TEST(AccessSpreader, Wrapping) {
   // this test won't pass unless locality.numCpus divides kMaxCpus
-  auto numCpus = 16;
-  auto locality = CacheLocality::uniform(numCpus);
+  auto numCpus = CacheLocality::system<ManualTag>().numCpus;
+  EXPECT_EQ(0, 128 % numCpus);
   for (size_t s = 1; s < 200; ++s) {
-    AccessSpreader<> spreader(s, locality, &testingGetcpu);
     for (size_t c = 0; c < 400; ++c) {
       testingCpu = c;
-      auto observed = spreader.current();
+      auto observed = AccessSpreader<ManualTag>::current(s);
       testingCpu = c % numCpus;
-      auto expected = spreader.current();
+      auto expected = AccessSpreader<ManualTag>::current(s);
       EXPECT_EQ(expected, observed) << "numCpus=" << numCpus << ", s=" << s
                                     << ", c=" << c;
     }
   }
 }
 
-// Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with
-// a 2.2Ghz Xeon
-// ============================================================================
-// folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
-// ============================================================================
-// LocalAccessSpreaderUse                                      20.77ns   48.16M
-// SharedAccessSpreaderUse                                     21.95ns   45.55M
-// AccessSpreaderConstruction                                 466.56ns    2.14M
-// ============================================================================
-
-BENCHMARK(LocalAccessSpreaderUse, iters) {
-  folly::BenchmarkSuspender braces;
-  AccessSpreader<> spreader(16);
-  braces.dismiss();
-
-  for (unsigned long i = 0; i < iters; ++i) {
-    auto x = spreader.current();
-    folly::doNotOptimizeAway(x);
-  }
-}
-
-BENCHMARK(SharedAccessSpreaderUse, iters) {
+BENCHMARK(AccessSpreaderUse, iters) {
   for (unsigned long i = 0; i < iters; ++i) {
     auto x = AccessSpreader<>::current(16);
     folly::doNotOptimizeAway(x);
   }
 }
 
-BENCHMARK(AccessSpreaderConstruction, iters) {
-  std::aligned_storage<sizeof(AccessSpreader<>),
-                       std::alignment_of<AccessSpreader<>>::value>::type raw;
-  for (unsigned long i = 0; i < iters; ++i) {
-    auto x = new (&raw) AccessSpreader<>(16);
-    folly::doNotOptimizeAway(x);
-    x->~AccessSpreader();
-  }
-}
-
-enum class SpreaderType { GETCPU, SHARED, TLS_RR, PTHREAD_SELF };
-
 // Benchmark scores here reflect the time for 32 threads to perform an
 // atomic increment on a dual-socket E5-2660 @ 2.2Ghz.  Surprisingly,
 // if we don't separate the counters onto unique 128 byte stripes the
 // 1_stripe and 2_stripe results are identical, even though the L3 is
 // claimed to have 64 byte cache lines.
 //
-// _stub means there was no call to getcpu or the tls round-robin
-// implementation, because for a single stripe the cpu doesn't matter.
-// _getcpu refers to the vdso getcpu implementation with a locally
-// constructed AccessSpreader.  _tls_rr refers to execution using
-// SequentialThreadId, the fallback if the vdso getcpu isn't available.
-// _shared refers to calling AccessSpreader<>::current(numStripes)
-// inside the hot loop.
+// Getcpu refers to the vdso getcpu implementation.  ThreadLocal refers
+// to execution using SequentialThreadId, the fallback if the vdso
+// getcpu isn't available.  PthreadSelf hashes the value returned from
+// pthread_self() as a fallback-fallback for systems that don't have
+// thread-local support.
 //
 // At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
-// so since the stripe selection is 21 nanos the atomic increments in
-// the L1 is ~15 nanos.  At width 8_stripe_0_work the line is expected
+// so since the stripe selection is 12 nanos the atomic increments in
+// the L1 is ~17 nanos.  At width 8_stripe_0_work the line is expected
 // to ping-pong almost every operation, since the loops have the same
 // duration.  Widths 4 and 2 have the same behavior, but each tour of the
 // cache line is 4 and 8 cores long, respectively.  These all suggest a
 // lower bound of 60 nanos for intra-chip handoff and increment between
 // the L1s.
 //
-// With 455 nanos (1K cycles) of busywork per contended increment, the
-// system can hide all of the latency of a tour of length 4, but not
-// quite one of length 8.  I was a bit surprised at how much worse the
-// non-striped version got.  It seems that the inter-chip traffic also
-// interferes with the L1-only localWork.load().  When the local work is
-// doubled to about 1 microsecond we see that the inter-chip contention
-// is still very important, but subdivisions on the same chip don't matter.
+// With 420 nanos of busywork per contended increment, the system can
+// hide all of the latency of a tour of length 4, but not quite one of
+// length 8.  I was a bit surprised at how much worse the non-striped
+// version got.  It seems that the inter-chip traffic also interferes
+// with the L1-only localWork.load().  When the local work is doubled
+// to about 1 microsecond we see that the inter-chip contention is still
+// very important, but subdivisions on the same chip don't matter.
 //
-// sudo nice -n -20
-//   _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
+// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
+//     --benchmark --bm_min_iters=1000000
 // ============================================================================
 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
 // ============================================================================
-// LocalAccessSpreaderUse                                      13.00ns   76.94M
-// SharedAccessSpreaderUse                                     13.04ns   76.66M
-// AccessSpreaderConstruction                                 366.00ns    2.73M
+// AccessSpreaderUse                                           11.94ns   83.79M
 // ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_0_work_stub)                    891.04ns    1.12M
-// contentionAtWidth(2_stripe_0_work_getcpu)                  403.45ns    2.48M
-// contentionAtWidth(4_stripe_0_work_getcpu)                  198.02ns    5.05M
-// contentionAtWidth(8_stripe_0_work_getcpu)                   90.54ns   11.04M
-// contentionAtWidth(16_stripe_0_work_getcpu)                  31.21ns   32.04M
-// contentionAtWidth(32_stripe_0_work_getcpu)                  29.15ns   34.31M
-// contentionAtWidth(64_stripe_0_work_getcpu)                  32.41ns   30.86M
-// contentionAtWidth(2_stripe_0_work_tls_rr)                  958.06ns    1.04M
-// contentionAtWidth(4_stripe_0_work_tls_rr)                  494.31ns    2.02M
-// contentionAtWidth(8_stripe_0_work_tls_rr)                  362.34ns    2.76M
-// contentionAtWidth(16_stripe_0_work_tls_rr)                 231.37ns    4.32M
-// contentionAtWidth(32_stripe_0_work_tls_rr)                 128.26ns    7.80M
-// contentionAtWidth(64_stripe_0_work_tls_rr)                 115.08ns    8.69M
-// contentionAtWidth(2_stripe_0_work_pthread_self)            856.63ns    1.17M
-// contentionAtWidth(4_stripe_0_work_pthread_self)            623.43ns    1.60M
-// contentionAtWidth(8_stripe_0_work_pthread_self)            419.69ns    2.38M
-// contentionAtWidth(16_stripe_0_work_pthread_self            217.32ns    4.60M
-// contentionAtWidth(32_stripe_0_work_pthread_self            157.69ns    6.34M
-// contentionAtWidth(64_stripe_0_work_pthread_self            140.94ns    7.10M
-// contentionAtWidth(2_stripe_0_work_shared)                  406.55ns    2.46M
-// contentionAtWidth(4_stripe_0_work_shared)                  198.28ns    5.04M
-// contentionAtWidth(8_stripe_0_work_shared)                   90.11ns   11.10M
-// contentionAtWidth(16_stripe_0_work_shared)                  34.53ns   28.96M
-// contentionAtWidth(32_stripe_0_work_shared)                  30.08ns   33.25M
-// contentionAtWidth(64_stripe_0_work_shared)                  34.60ns   28.90M
-// atomicIncrBaseline(local_incr_0_work)                       17.51ns   57.12M
+// contentionAtWidthGetcpu(1_stripe_0_work)                   985.75ns    1.01M
+// contentionAtWidthGetcpu(2_stripe_0_work)                   424.02ns    2.36M
+// contentionAtWidthGetcpu(4_stripe_0_work)                   190.13ns    5.26M
+// contentionAtWidthGetcpu(8_stripe_0_work)                    91.86ns   10.89M
+// contentionAtWidthGetcpu(16_stripe_0_work)                   29.31ns   34.12M
+// contentionAtWidthGetcpu(32_stripe_0_work)                   29.53ns   33.86M
+// contentionAtWidthGetcpu(64_stripe_0_work)                   29.93ns   33.41M
+// contentionAtWidthThreadLocal(2_stripe_0_work)              609.21ns    1.64M
+// contentionAtWidthThreadLocal(4_stripe_0_work)              303.60ns    3.29M
+// contentionAtWidthThreadLocal(8_stripe_0_work)              246.57ns    4.06M
+// contentionAtWidthThreadLocal(16_stripe_0_work)             154.84ns    6.46M
+// contentionAtWidthThreadLocal(32_stripe_0_work)              24.14ns   41.43M
+// contentionAtWidthThreadLocal(64_stripe_0_work)              23.95ns   41.75M
+// contentionAtWidthPthreadSelf(2_stripe_0_work)              722.01ns    1.39M
+// contentionAtWidthPthreadSelf(4_stripe_0_work)              501.56ns    1.99M
+// contentionAtWidthPthreadSelf(8_stripe_0_work)              474.58ns    2.11M
+// contentionAtWidthPthreadSelf(16_stripe_0_work)             300.90ns    3.32M
+// contentionAtWidthPthreadSelf(32_stripe_0_work)             175.77ns    5.69M
+// contentionAtWidthPthreadSelf(64_stripe_0_work)             174.88ns    5.72M
+// atomicIncrBaseline(local_incr_0_work)                       16.81ns   59.51M
 // ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_500_work_stub)                    1.87us  534.36K
-// contentionAtWidth(2_stripe_500_work_getcpu)                542.31ns    1.84M
-// contentionAtWidth(4_stripe_500_work_getcpu)                409.18ns    2.44M
-// contentionAtWidth(8_stripe_500_work_getcpu)                511.05ns    1.96M
-// contentionAtWidth(16_stripe_500_work_getcpu)               399.14ns    2.51M
-// contentionAtWidth(32_stripe_500_work_getcpu)               399.05ns    2.51M
-// atomicIncrBaseline(local_incr_500_work)                    399.41ns    2.50M
+// contentionAtWidthGetcpu(1_stripe_500_work)                   1.82us  549.97K
+// contentionAtWidthGetcpu(2_stripe_500_work)                 533.71ns    1.87M
+// contentionAtWidthGetcpu(4_stripe_500_work)                 424.64ns    2.35M
+// contentionAtWidthGetcpu(8_stripe_500_work)                 451.85ns    2.21M
+// contentionAtWidthGetcpu(16_stripe_500_work)                425.54ns    2.35M
+// contentionAtWidthGetcpu(32_stripe_500_work)                501.66ns    1.99M
+// atomicIncrBaseline(local_incr_500_work)                    438.46ns    2.28M
 // ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_1000_work_stub)                   1.90us  525.73K
-// contentionAtWidth(2_stripe_1000_work_getcpu)               792.91ns    1.26M
-// contentionAtWidth(4_stripe_1000_work_getcpu)               788.14ns    1.27M
-// contentionAtWidth(8_stripe_1000_work_getcpu)               794.16ns    1.26M
-// contentionAtWidth(16_stripe_1000_work_getcpu)              785.33ns    1.27M
-// contentionAtWidth(32_stripe_1000_work_getcpu)              786.56ns    1.27M
-// atomicIncrBaseline(local_incr_1000_work)                   784.69ns    1.27M
+// contentionAtWidthGetcpu(1_stripe_1000_work)                  1.88us  532.20K
+// contentionAtWidthGetcpu(2_stripe_1000_work)                824.62ns    1.21M
+// contentionAtWidthGetcpu(4_stripe_1000_work)                803.56ns    1.24M
+// contentionAtWidthGetcpu(8_stripe_1000_work)                926.65ns    1.08M
+// contentionAtWidthGetcpu(16_stripe_1000_work)               900.10ns    1.11M
+// contentionAtWidthGetcpu(32_stripe_1000_work)               890.75ns    1.12M
+// atomicIncrBaseline(local_incr_1000_work)                   774.47ns    1.29M
 // ============================================================================
-static void contentionAtWidth(size_t iters,
-                              size_t stripes,
-                              size_t work,
-                              SpreaderType spreaderType,
-                              size_t counterAlignment = 128,
-                              size_t numThreads = 32) {
-  folly::BenchmarkSuspender braces;
-
-  folly::detail::Getcpu::Func getcpuFunc = nullptr;
-
-  if (spreaderType == SpreaderType::TLS_RR) {
-    getcpuFunc =
-        folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu;
-  }
-  if (spreaderType == SpreaderType::PTHREAD_SELF) {
-    getcpuFunc = folly::detail::FallbackGetcpu<HashingThreadId>::getcpu;
-  }
+template <template <typename> class Tag>
+static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
+  const size_t counterAlignment = 128;
+  const size_t numThreads = 32;
 
-  AccessSpreader<> spreader(
-      stripes, CacheLocality::system<std::atomic>(), getcpuFunc);
+  folly::BenchmarkSuspender braces;
 
   std::atomic<size_t> ready(0);
   std::atomic<bool> go(false);
@@ -625,25 +562,15 @@ static void contentionAtWidth(size_t iters,
             new (raw.data() + counterAlignment * i) std::atomic<size_t>();
       }
 
-      spreader.current();
       ready++;
       while (!go.load()) {
         sched_yield();
       }
       std::atomic<int> localWork(0);
-      if (spreaderType == SpreaderType::SHARED) {
-        for (size_t i = iters; i > 0; --i) {
-          ++*(counters[AccessSpreader<>::current(stripes)]);
-          for (size_t j = work; j > 0; --j) {
-            localWork.load();
-          }
-        }
-      } else {
-        for (size_t i = iters; i > 0; --i) {
-          ++*(counters[spreader.current()]);
-          for (size_t j = work; j > 0; --j) {
-            localWork.load();
-          }
+      for (size_t i = iters; i > 0; --i) {
+        ++*(counters[AccessSpreader<Tag>::current(stripes)]);
+        for (size_t j = work; j > 0; --j) {
+          localWork.load();
         }
       }
     }));
@@ -651,7 +578,7 @@ static void contentionAtWidth(size_t iters,
     if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
       // create a few dummy threads to wrap back around to 0 mod numCpus
       for (size_t i = threads.size(); i != numThreads; ++i) {
-        std::thread([&]() { spreader.current(); }).join();
+        std::thread([&]() { AccessSpreader<Tag>::current(stripes); }).join();
       }
     }
   }
@@ -699,110 +626,58 @@ static void atomicIncrBaseline(size_t iters,
   }
 }
 
-BENCHMARK_DRAW_LINE()
+static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
+  contentionAtWidth<std::atomic>(iters, stripes, work);
+}
+
+static void contentionAtWidthThreadLocal(size_t iters,
+                                         size_t stripes,
+                                         size_t work) {
+  contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
+}
 
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 1_stripe_0_work_stub, 1, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 2_stripe_0_work_getcpu, 2, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 4_stripe_0_work_getcpu, 4, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 8_stripe_0_work_getcpu, 8, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 16_stripe_0_work_getcpu, 16, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 32_stripe_0_work_getcpu, 32, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 64_stripe_0_work_getcpu, 64, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 2_stripe_0_work_tls_rr, 2, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 4_stripe_0_work_tls_rr, 4, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 8_stripe_0_work_tls_rr, 8, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 16_stripe_0_work_tls_rr, 16, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 32_stripe_0_work_tls_rr, 32, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 64_stripe_0_work_tls_rr, 64, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
-                      2_stripe_0_work_pthread_self,
-                      2,
-                      0,
-                      SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
-                      4_stripe_0_work_pthread_self,
-                      4,
-                      0,
-                      SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
-                      8_stripe_0_work_pthread_self,
-                      8,
-                      0,
-                      SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
-                      16_stripe_0_work_pthread_self,
-                      16,
-                      0,
-                      SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
-                      32_stripe_0_work_pthread_self,
-                      32,
-                      0,
-                      SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
-                      64_stripe_0_work_pthread_self,
-                      64,
-                      0,
-                      SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 2_stripe_0_work_shared, 2, 0, SpreaderType::SHARED)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 4_stripe_0_work_shared, 4, 0, SpreaderType::SHARED)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 8_stripe_0_work_shared, 8, 0, SpreaderType::SHARED)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 16_stripe_0_work_shared, 16, 0, SpreaderType::SHARED)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 32_stripe_0_work_shared, 32, 0, SpreaderType::SHARED)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 64_stripe_0_work_shared, 64, 0, SpreaderType::SHARED)
+static void contentionAtWidthPthreadSelf(size_t iters,
+                                         size_t stripes,
+                                         size_t work) {
+  contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
+}
+
+BENCHMARK_DRAW_LINE()
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
 BENCHMARK_DRAW_LINE()
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 1_stripe_500_work_stub, 1, 500, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 2_stripe_500_work_getcpu, 2, 500, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 4_stripe_500_work_getcpu, 4, 500, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 8_stripe_500_work_getcpu, 8, 500, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 16_stripe_500_work_getcpu, 16, 500, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 32_stripe_500_work_getcpu, 32, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
 BENCHMARK_DRAW_LINE()
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 1_stripe_1000_work_stub, 1, 1000, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 2_stripe_1000_work_getcpu, 2, 1000, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 4_stripe_1000_work_getcpu, 4, 1000, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
-    contentionAtWidth, 8_stripe_1000_work_getcpu, 8, 1000, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
-                      16_stripe_1000_work_getcpu,
-                      16,
-                      1000,
-                      SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
-                      32_stripe_1000_work_getcpu,
-                      32,
-                      1000,
-                      SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
 
 int main(int argc, char** argv) {
index 74d4dcdf4089270c249bc5a11ebdb9bdc01d8e83..fe59cc02145c47357fb68158a16914119a9cb634 100644 (file)
@@ -23,6 +23,8 @@
 #include <unordered_map>
 #include <assert.h>
 
+DECLARE_ACCESS_SPREADER_TYPE(folly::test::DeterministicAtomic)
+
 namespace folly {
 namespace test {
 
@@ -352,22 +354,7 @@ CacheLocality const& CacheLocality::system<test::DeterministicAtomic>() {
 }
 
 template <>
-const AccessSpreader<test::DeterministicAtomic>
-    AccessSpreader<test::DeterministicAtomic>::stripeByCore(
-        CacheLocality::system<>().numCachesByLevel.front());
-
-template <>
-const AccessSpreader<test::DeterministicAtomic>
-    AccessSpreader<test::DeterministicAtomic>::stripeByChip(
-        CacheLocality::system<>().numCachesByLevel.back());
-
-template <>
-AccessSpreaderArray<test::DeterministicAtomic, 128>
-    AccessSpreaderArray<test::DeterministicAtomic, 128>::sharedInstance = {};
-
-template <>
-Getcpu::Func AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc(
-    size_t /* numStripes */) {
+Getcpu::Func AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc() {
   return &DeterministicSchedule::getcpu;
 }
 }
index 627305a7ffdf4f966a0b4359a8fe6cd163b68975..4f7f2fe9ee9489844c651a6d0f1edd5f21e9feef 100644 (file)
@@ -400,7 +400,6 @@ FutexResult Futex<test::DeterministicAtomic>::futexWaitImpl(
     uint32_t waitMask);
 
 template <>
-Getcpu::Func AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc(
-    size_t numStripes);
+Getcpu::Func AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc();
 }
 } // namespace folly::detail