From 3d0c8f284dd1ce098db0fffb3b72c9396b6b50d2 Mon Sep 17 00:00:00 2001 From: Nathan Bronson Date: Thu, 21 Nov 2013 17:15:06 -0800 Subject: [PATCH 1/1] detail/CacheLocality.h - utilities for dynamic cache optimizations Summary: CacheLocality reads cache sharing information from sysfs to determine how CPUs should be grouped to minimize contention, Getcpu provides fast access to the current CPU via __vdso_getcpu, and AccessSpreader uses these two to optimally spread accesses among a predetermined number of stripes. AccessSpreader<>::current(n) microbenchmarks at 22 nanos, which is substantially less than the cost of a cache miss. This means that we can effectively use it to reduce cache line ping-pong on striped data structures such as IndexedMemPool or statistics counters. Because CacheLocality looks at all of the cache levels, it can be used for different levels of optimization. AccessSpreader<>::stripeByChip.current() uses as few stripes as possible to avoid off-chip communication, AccessSpreader<>::stripeByCore.current() uses as many stripes as necessary to get the optimal speedup, but no more. @override-unit-failures Test Plan: new unit tests Reviewed By: davejwatson@fb.com FB internal diff: D1076718 --- folly/Makefile.am | 2 + folly/detail/CacheLocality.cpp | 276 +++++++++++ folly/detail/CacheLocality.h | 353 ++++++++++++++ folly/test/CacheLocalityTest.cpp | 705 +++++++++++++++++++++++++++ folly/test/DeterministicSchedule.cpp | 35 ++ folly/test/DeterministicSchedule.h | 1 + 6 files changed, 1372 insertions(+) create mode 100644 folly/detail/CacheLocality.cpp create mode 100644 folly/detail/CacheLocality.h create mode 100644 folly/test/CacheLocalityTest.cpp diff --git a/folly/Makefile.am b/folly/Makefile.am index 91a993bf..a9647533 100644 --- a/folly/Makefile.am +++ b/folly/Makefile.am @@ -36,6 +36,7 @@ nobase_follyinclude_HEADERS = \ detail/AtomicHashUtils.h \ detail/BitIteratorDetail.h \ detail/BitsDetail.h \ + detail/CacheLocality.h \ detail/DiscriminatedPtrDetail.h \ detail/FileUtilDetail.h \ detail/FingerprintPolynomial.h \ @@ -137,6 +138,7 @@ libfolly_la_SOURCES = \ Benchmark.cpp \ Bits.cpp \ Conv.cpp \ + detail/CacheLocality.cpp \ dynamic.cpp \ EscapeTables.cpp \ File.cpp \ diff --git a/folly/detail/CacheLocality.cpp b/folly/detail/CacheLocality.cpp new file mode 100644 index 00000000..7a13a9dd --- /dev/null +++ b/folly/detail/CacheLocality.cpp @@ -0,0 +1,276 @@ +/* + * Copyright 2013 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "folly/detail/CacheLocality.h" + +#define _GNU_SOURCE 1 // for RTLD_NOLOAD +#include +#include + +#include "folly/Conv.h" +#include "folly/Exception.h" +#include "folly/FileUtil.h" +#include "folly/Format.h" +#include "folly/ScopeGuard.h" + +namespace folly { namespace detail { + +///////////// CacheLocality + +/// Returns the best real CacheLocality information available +static CacheLocality getSystemLocalityInfo() { + try { + return CacheLocality::readFromSysfs(); + } catch (...) 
{ + // keep trying + } + + long numCpus = sysconf(_SC_NPROCESSORS_CONF); + if (numCpus <= 0) { + // This shouldn't happen, but if it does we should try to keep + // going. We are probably not going to be able to parse /sys on + // this box either (although we will try), which means we are going + // to fall back to the SequentialThreadId splitter. On my 16 core + // (x hyperthreading) dev box 16 stripes is enough to get pretty good + // contention avoidance with SequentialThreadId, and there is little + // improvement from going from 32 to 64. This default gives us some + // wiggle room + numCpus = 32; + } + return CacheLocality::uniform(numCpus); +} + +template <> +const CacheLocality& CacheLocality::system() { + static CacheLocality cache(getSystemLocalityInfo()); + return cache; +} + +// Each level of cache has sharing sets, which are the set of cpus +// that share a common cache at that level. These are available in a +// hex bitset form (/sys/devices/system/cpu/cpu0/index0/shared_cpu_map, +// for example). They are also available in a human-readable list form, +// as in /sys/devices/system/cpu/cpu0/index0/shared_cpu_list. The list +// is a comma-separated list of numbers and ranges, where the ranges are +// a pair of decimal numbers separated by a '-'. +// +// To sort the cpus for optimum locality we don't really need to parse +// the sharing sets, we just need a unique representative from the +// equivalence class. The smallest value works fine, and happens to be +// the first decimal number in the file. We load all of the equivalence +// class information from all of the cpu*/index* directories, order the +// cpus first by increasing last-level cache equivalence class, then by +// the smaller caches. Finally, we break ties with the cpu number itself. + +/// Returns the first decimal number in the string, or throws an exception +/// if the string does not start with a number terminated by ',', '-', +/// '\n', or eos. +static ssize_t parseLeadingNumber(const std::string& line) { + auto raw = line.c_str(); + char *end; + unsigned val = strtoul(raw, &end, 10); + if (end == raw || (*end != ',' && *end != '-' && *end != '\n')) { + throw std::runtime_error(to( + "error parsing list '", line, "'").c_str()); + } + return val; +} + +CacheLocality CacheLocality::readFromSysfsTree( + const std::function& mapping) { + // number of equivalence classes per level + std::vector numCachesByLevel; + + // the list of cache equivalence classes, where equivalance classes + // are named by the smallest cpu in the class + std::vector> equivClassesByCpu; + + std::vector cpus; + + while (true) { + auto cpu = cpus.size(); + std::vector levels; + for (size_t index = 0; ; ++index) { + auto dir = format("/sys/devices/system/cpu/cpu{}/cache/index{}/", + cpu, index).str(); + auto cacheType = mapping(dir + "type"); + auto equivStr = mapping(dir + "shared_cpu_list"); + if (cacheType.size() == 0 || equivStr.size() == 0) { + // no more caches + break; + } + if (cacheType[0] == 'I') { + // cacheType in { "Data", "Instruction", "Unified" }. 
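// (worked examples for parseLeadingNumber above, derived from the
// sysfs list format: parseLeadingNumber("0-8,17-23") returns 0 and
// parseLeadingNumber("5-6") returns 5. Note that as written the
// terminator check rejects end-of-string even though the doc comment
// allows "eos" -- std::getline strips the '\n', so a single-cpu list
// such as "3" would throw here.)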
skip icache + continue; + } + auto equiv = parseLeadingNumber(equivStr); + auto level = levels.size(); + levels.push_back(equiv); + + if (equiv == cpu) { + // we only want to count the equiv classes once, so we do it when + // we first encounter them + while (numCachesByLevel.size() <= level) { + numCachesByLevel.push_back(0); + } + numCachesByLevel[level]++; + } + } + + if (levels.size() == 0) { + // no levels at all for this cpu, we must be done + break; + } + equivClassesByCpu.emplace_back(std::move(levels)); + cpus.push_back(cpu); + } + + if (cpus.size() == 0) { + throw std::runtime_error("unable to load cache sharing info"); + } + + std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool { + // sort first by equiv class of cache with highest index, direction + // doesn't matter. If different cpus have different numbers of + // caches then this code might produce a sub-optimal ordering, but + // it won't crash + auto& lhsEquiv = equivClassesByCpu[lhs]; + auto& rhsEquiv = equivClassesByCpu[rhs]; + for (int i = std::min(lhsEquiv.size(), rhsEquiv.size()) - 1; i >= 0; --i) { + if (lhsEquiv[i] != rhsEquiv[i]) { + return lhsEquiv[i] < rhsEquiv[i]; + } + } + + // break ties deterministically by cpu + return lhs < rhs; + }); + + // the cpus are now sorted by locality, with neighboring entries closer + // to each other than entries that are far away. For striping we want + // the inverse map, since we are starting with the cpu + std::vector indexes(cpus.size()); + for (int i = 0; i < cpus.size(); ++i) { + indexes[cpus[i]] = i; + } + + return CacheLocality{ + cpus.size(), std::move(numCachesByLevel), std::move(indexes) }; +} + +CacheLocality CacheLocality::readFromSysfs() { + return readFromSysfsTree([](std::string name) { + std::ifstream xi(name.c_str()); + std::string rv; + std::getline(xi, rv); + return rv; + }); +} + + +CacheLocality CacheLocality::uniform(size_t numCpus) { + CacheLocality rv; + + rv.numCpus = numCpus; + + // one cache shared by all cpus + rv.numCachesByLevel.push_back(numCpus); + + // no permutations in locality index mapping + for (size_t cpu = 0; cpu < numCpus; ++cpu) { + rv.localityIndexByCpu.push_back(cpu); + } + + return rv; +} + +////////////// Getcpu + +/// Resolves the dynamically loaded symbol __vdso_getcpu, returning null +/// on failure +static Getcpu::Func loadVdsoGetcpu() { + void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (h == nullptr) { + return nullptr; + } + + auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu")); + if (func == nullptr) { + // technically a null result could either be a failure or a successful + // lookup of a symbol with the null value, but the second can't actually + // happen for this symbol. 
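// A minimal sketch of how callers use the result (the nullptr check
// is required, since the vDSO lookup can fail):
//
//   unsigned cpu;
//   if (auto getcpu = Getcpu::vdsoFunc()) {
//     getcpu(&cpu, nullptr, nullptr);  // cheap: no syscall trap
//   }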
No point holding the handle forever if + // we don't need the code + dlclose(h); + } + + return func; +} + +Getcpu::Func Getcpu::vdsoFunc() { + static Func func = loadVdsoGetcpu(); + return func; +} + +/////////////// SequentialThreadId + +template<> +std::atomic SequentialThreadId::prevId(0); + +template<> +__thread size_t SequentialThreadId::currentId(0); + +/////////////// AccessSpreader + +template<> +const AccessSpreader +AccessSpreader::stripeByCore( + CacheLocality::system<>().numCachesByLevel.front()); + +template<> +const AccessSpreader +AccessSpreader::stripeByChip( + CacheLocality::system<>().numCachesByLevel.back()); + +template<> +AccessSpreaderArray +AccessSpreaderArray::sharedInstance = {}; + +/// Always claims to be on CPU zero, node zero +static int degenerateGetcpu(unsigned* cpu, unsigned* node, void* unused) { + if (cpu != nullptr) { + *cpu = 0; + } + if (node != nullptr) { + *node = 0; + } + return 0; +} + +template<> +Getcpu::Func AccessSpreader::pickGetcpuFunc(size_t numStripes) { + if (numStripes == 1) { + // there's no need to call getcpu if there is only one stripe. + // This should not be common, so we don't want to waste a test and + // branch in the main code path, but we might as well use a faster + // function pointer + return °enerateGetcpu; + } else { + auto best = Getcpu::vdsoFunc(); + return best ? best : &SequentialThreadId::getcpu; + } +} + +} } // namespace folly::detail diff --git a/folly/detail/CacheLocality.h b/folly/detail/CacheLocality.h new file mode 100644 index 00000000..0d839bec --- /dev/null +++ b/folly/detail/CacheLocality.h @@ -0,0 +1,353 @@ +/* + * Copyright 2013 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FOLLY_DETAIL_CACHELOCALITY_H_ +#define FOLLY_DETAIL_CACHELOCALITY_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "folly/Likely.h" + +namespace folly { namespace detail { + +// This file contains several classes that might be useful if you are +// trying to dynamically optimize cache locality: CacheLocality reads +// cache sharing information from sysfs to determine how CPUs should be +// grouped to minimize contention, Getcpu provides fast access to the +// current CPU via __vdso_getcpu, and AccessSpreader uses these two to +// optimally spread accesses among a predetermined number of stripes. +// +// AccessSpreader<>::current(n) microbenchmarks at 22 nanos, which is +// substantially less than the cost of a cache miss. This means that we +// can effectively use it to reduce cache line ping-pong on striped data +// structures such as IndexedMemPool or statistics counters. +// +// Because CacheLocality looks at all of the cache levels, it can be +// used for different levels of optimization. AccessSpreader(2) does +// per-chip spreading on a dual socket system. AccessSpreader(numCpus) +// does perfect per-cpu spreading. AccessSpreader(numCpus / 2) does +// perfect L1 spreading in a system with hyperthreading enabled. 
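//
// As a usage sketch (StripedCounter is illustrative, not part of this
// diff; assumes <algorithm> and <atomic>): a counter striped with one
// slot per last-level cache, using 128-byte slots to match the
// contention granularity observed in the benchmarks in
// CacheLocalityTest.cpp.
//
//   class StripedCounter {
//    public:
//     StripedCounter()
//       : numStripes_(std::min<size_t>(
//             64, CacheLocality::system<>().numCachesByLevel.back())) {}
//
//     void incr() {
//       slots_[AccessSpreader<>::current(numStripes_)].value.fetch_add(
//           1, std::memory_order_relaxed);
//     }
//
//     size_t read() const {  // reads pay for every stripe; incrs don't
//       size_t sum = 0;
//       for (size_t i = 0; i < numStripes_; ++i) {
//         sum += slots_[i].value.load(std::memory_order_relaxed);
//       }
//       return sum;
//     }
//
//    private:
//     struct alignas(128) Slot { std::atomic<size_t> value{0}; };
//     size_t numStripes_;
//     Slot slots_[64];
//   };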
+ +struct CacheLocality { + + /// 1 more than the maximum value that can be returned from sched_getcpu + /// or getcpu. This is the number of hardware thread contexts provided + /// by the processors + size_t numCpus; + + /// Holds the number of caches present at each cache level (0 is + /// the closest to the cpu). This is the number of AccessSpreader + /// stripes needed to avoid cross-cache communication at the specified + /// layer. numCachesByLevel.front() is the number of L1 caches and + /// numCachesByLevel.back() is the number of last-level caches. + std::vector numCachesByLevel; + + /// A map from cpu (from sched_getcpu or getcpu) to an index in the + /// range 0..numCpus-1, where neighboring locality indices are more + /// likely to share caches then indices far away. All of the members + /// of a particular cache level be contiguous in their locality index. + /// For example, if numCpus is 32 and numCachesByLevel.back() is 2, + /// then cpus with a locality index < 16 will share one last-level + /// cache and cpus with a locality index >= 16 will share the other. + std::vector localityIndexByCpu; + + + /// Returns the best CacheLocality information available for the current + /// system, cached for fast access. This will be loaded from sysfs if + /// possible, otherwise it will be correct in the number of CPUs but + /// not in their sharing structure. + /// + /// If you are into yo dawgs, this is a shared cache of the local + /// locality of the shared caches. + /// + /// The template parameter here is used to allow injection of a + /// repeatable CacheLocality structure during testing. Rather than + /// inject the type of the CacheLocality provider into every data type + /// that transitively uses it, all components select between the default + /// sysfs implementation and a deterministic implementation by keying + /// off the type of the underlying atomic. See DeterministicScheduler. + template class Atom = std::atomic> + static const CacheLocality& system(); + + + /// Reads CacheLocality information from a tree structured like + /// the sysfs filesystem. The provided function will be evaluated + /// for each sysfs file that needs to be queried. The function + /// should return a string containing the first line of the file + /// (not including the newline), or an empty string if the file does + /// not exist. The function will be called with paths of the form + /// /sys/devices/system/cpu/cpu*/cache/index*/{type,shared_cpu_list} . + /// Throws an exception if no caches can be parsed at all. + static CacheLocality readFromSysfsTree( + const std::function& mapping); + + /// Reads CacheLocality information from the real sysfs filesystem. + /// Throws an exception if no cache information can be loaded. + static CacheLocality readFromSysfs(); + + /// Returns a usable (but probably not reflective of reality) + /// CacheLocality structure with the specified number of cpus and a + /// single cache level that associates one cpu per cache. + static CacheLocality uniform(size_t numCpus); +}; + +/// Holds a function pointer to the VDSO implementation of getcpu(2), +/// if available +struct Getcpu { + /// Function pointer to a function with the same signature as getcpu(2). + typedef int (*Func)(unsigned* cpu, unsigned* node, void* unused); + + /// Returns a pointer to the VDSO implementation of getcpu(2), if + /// available, or nullptr otherwise + static Func vdsoFunc(); +}; + +/// A class that lazily binds a unique (for each implementation of Atom) +/// identifier to a thread. 
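/// The first call to get() on each thread hands out 1, 2, 3, ... in
/// first-use order, and every later call on that thread returns the
/// same value, so getcpu() below reports a stable pseudo-cpu per
/// thread.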
This is a fallback mechanism for the access +/// spreader if we are in testing (using DeterministicAtomic) or if +/// __vdso_getcpu can't be dynamically loaded +template class Atom> +struct SequentialThreadId { + + /// Returns the thread id assigned to the current thread + static size_t get() { + auto rv = currentId; + if (UNLIKELY(rv == 0)) { + rv = currentId = ++prevId; + } + return rv; + } + + /// Fills the thread id into the cpu and node out params (if they + /// are non-null). This method is intended to act like getcpu when a + /// fast-enough form of getcpu isn't available or isn't desired + static int getcpu(unsigned* cpu, unsigned* node, void* unused) { + auto id = get(); + if (cpu) { + *cpu = id; + } + if (node) { + *node = id; + } + return 0; + } + + private: + static Atom prevId; + + // TODO: switch to thread_local + static __thread size_t currentId; +}; + +template class Atom, size_t kMaxCpus> +struct AccessSpreaderArray; + +/// AccessSpreader arranges access to a striped data structure in such a +/// way that concurrently executing threads are likely to be accessing +/// different stripes. It does NOT guarantee uncontended access. +/// Your underlying algorithm must be thread-safe without spreading, this +/// is merely an optimization. AccessSpreader::current(n) is typically +/// much faster than a cache miss (22 nanos on my dev box, tested fast +/// in both 2.6 and 3.2 kernels). +/// +/// You are free to create your own AccessSpreader-s or to cache the +/// results of AccessSpreader<>::shared(n), but you will probably want +/// to use one of the system-wide shared ones. Calling .current() on +/// a particular AccessSpreader instance only saves about 1 nanosecond +/// over calling AccessSpreader<>::shared(n). +/// +/// If available (and not using the deterministic testing implementation) +/// AccessSpreader uses the getcpu system call via VDSO and the +/// precise locality information retrieved from sysfs by CacheLocality. +/// This provides optimal anti-sharing at a fraction of the cost of a +/// cache miss. +/// +/// When there are not as many stripes as processors, we try to optimally +/// place the cache sharing boundaries. This means that if you have 2 +/// stripes and run on a dual-socket system, your 2 stripes will each get +/// all of the cores from a single socket. If you have 16 stripes on a +/// 16 core system plus hyperthreading (32 cpus), each core will get its +/// own stripe and there will be no cache sharing at all. +/// +/// AccessSpreader has a fallback mechanism for when __vdso_getcpu can't be +/// loaded, or for use during deterministic testing. Using sched_getcpu or +/// the getcpu syscall would negate the performance advantages of access +/// spreading, so we use a thread-local value and a shared atomic counter +/// to spread access out. +/// +/// AccessSpreader is templated on the template type that is used +/// to implement atomics, as a way to instantiate the underlying +/// heuristics differently for production use and deterministic unit +/// testing. See DeterministicScheduler for more. If you aren't using +/// DeterministicScheduler, you can just use the default template parameter +/// all of the time. +template class Atom = std::atomic> +struct AccessSpreader { + + /// Returns a never-destructed shared AccessSpreader instance. + /// numStripes should be > 0. 
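/// For example, AccessSpreader<>::shared(8).current() yields a stripe
/// in [0, 8), chosen so that threads running far apart in the cache
/// hierarchy tend to receive different stripes.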
+ static const AccessSpreader& shared(size_t numStripes) { + // sharedInstances[0] actually has numStripes == 1 + assert(numStripes > 0); + + // the last shared element handles all large sizes + return AccessSpreaderArray::sharedInstance[ + std::min(size_t(kMaxCpus), numStripes)]; + } + + /// Returns the stripe associated with the current CPU, assuming + /// that there are numStripes (non-zero) stripes. Equivalent to + /// AccessSpreader::shared(numStripes)->current. + static size_t current(size_t numStripes) { + return shared(numStripes).current(); + } + + /// stripeByCore uses 1 stripe per L1 cache, according to + /// CacheLocality::system<>(). Use stripeByCore.numStripes() to see + /// its width, or stripeByCore.current() to get the current stripe + static const AccessSpreader stripeByCore; + + /// stripeByChip uses 1 stripe per last-level cache, which is the fewest + /// number of stripes for which off-chip communication can be avoided + /// (assuming all caches are on-chip). Use stripeByChip.numStripes() + /// to see its width, or stripeByChip.current() to get the current stripe + static const AccessSpreader stripeByChip; + + + /// Constructs an AccessSpreader that will return values from + /// 0 to numStripes-1 (inclusive), precomputing the mapping + /// from CPU to stripe. There is no use in having more than + /// CacheLocality::system().localityIndexByCpu.size() stripes or + /// kMaxCpus stripes + explicit AccessSpreader(size_t spreaderNumStripes, + const CacheLocality& cacheLocality = + CacheLocality::system(), + Getcpu::Func getcpuFunc = nullptr) + : getcpuFunc_(getcpuFunc ? getcpuFunc : pickGetcpuFunc(spreaderNumStripes)) + , numStripes_(spreaderNumStripes) + { + auto n = cacheLocality.numCpus; + for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) { + auto index = cacheLocality.localityIndexByCpu[cpu]; + assert(index < n); + // as index goes from 0..n, post-transform value goes from + // 0..numStripes + stripeByCpu[cpu] = (index * numStripes_) / n; + assert(stripeByCpu[cpu] < numStripes_); + } + for (size_t cpu = n; cpu < kMaxCpus; ++cpu) { + stripeByCpu[cpu] = stripeByCpu[cpu - n]; + } + } + + /// Returns 1 more than the maximum value that can be returned from + /// current() + size_t numStripes() const { + return numStripes_; + } + + /// Returns the stripe associated with the current CPU + size_t current() const { + unsigned cpu; + getcpuFunc_(&cpu, nullptr, nullptr); + return stripeByCpu[cpu % kMaxCpus]; + } + + private: + + /// If there are more cpus than this nothing will crash, but there + /// might be unnecessary sharing + enum { kMaxCpus = 128 }; + + typedef uint8_t CompactStripe; + + static_assert((kMaxCpus & (kMaxCpus - 1)) == 0, + "kMaxCpus should be a power of two so modulo is fast"); + static_assert(kMaxCpus - 1 <= std::numeric_limits::max(), + "stripeByCpu element type isn't wide enough"); + + + /// Points to the getcpu-like function we are using to obtain the + /// current cpu. It should not be assumed that the returned cpu value + /// is in range. We use a member for this instead of a static so that + /// this fetch preloads a prefix the stripeByCpu array + Getcpu::Func getcpuFunc_; + + /// A precomputed map from cpu to stripe. 
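/// (for example, with numCpus = 32 and numStripes_ = 8 the constructor
/// above maps locality indices 0-3 to stripe 0, 4-7 to stripe 1, and
/// so on, via (index * 8) / 32)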
Rather than add a layer of + /// indirection requiring a dynamic bounds check and another cache miss, + /// we always precompute the whole array + CompactStripe stripeByCpu[kMaxCpus]; + + size_t numStripes_; + + /// Returns the best getcpu implementation for this type and width + /// of AccessSpreader + static Getcpu::Func pickGetcpuFunc(size_t numStripes); +}; + +/// An array of kMaxCpus+1 AccessSpreader instances constructed +/// with default params, with the zero-th element having 1 stripe +template class Atom, size_t kMaxStripe> +struct AccessSpreaderArray { + + AccessSpreaderArray() { + for (size_t i = 0; i <= kMaxStripe; ++i) { + new (raw + i) AccessSpreader(std::max(size_t(1), i)); + } + } + + ~AccessSpreaderArray() { + for (size_t i = 0; i <= kMaxStripe; ++i) { + auto p = static_cast*>(static_cast(raw + i)); + p->~AccessSpreader(); + } + } + + AccessSpreader const& operator[] (size_t index) const { + return *static_cast const*>( + static_cast(raw + index)); + } + + private: + + /// If we align the access spreaders at the beginning of a cache line + /// then getcpuFunc_ and the first 56 bytes of stripeByCpu will be on + /// the same cache line + enum { kAlignment = 64 }; + + + // AccessSpreader uses sharedInstance + friend AccessSpreader; + + static AccessSpreaderArray sharedInstance; + + + /// aligned_storage is uninitialized, we use placement new since there + /// is no AccessSpreader default constructor + typename std::aligned_storage),kAlignment>::type + raw[kMaxStripe + 1]; +}; + +} } + +#endif /* FOLLY_DETAIL_CacheLocality_H_ */ + diff --git a/folly/test/CacheLocalityTest.cpp b/folly/test/CacheLocalityTest.cpp new file mode 100644 index 00000000..056ec5f6 --- /dev/null +++ b/folly/test/CacheLocalityTest.cpp @@ -0,0 +1,705 @@ +/* + * Copyright 2013 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "folly/detail/CacheLocality.h" + +#include +#include +#include +#include +#include +#include +#include +#include "folly/Benchmark.h" + +using namespace folly::detail; + +/// This is the relevant nodes from a production box's sysfs tree. If you +/// think this map is ugly you should see the version of this test that +/// used a real directory tree. 
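/// As a worked trace of the parser against this map: for cpu0 it
/// visits index0..index3 in order, skips the Instruction entry, and
/// records the equivalence classes [0, 0, 0] -- the leading number of
/// "0,17" for the L1 and L2 rows and of "0-8,17-23" for the L3 row.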
To reduce the chance of testing error +/// I haven't tried to remove the common prefix +static std::unordered_map fakeSysfsTree = { + { "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu0/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu0/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu0/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu0/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu1/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu1/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu1/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu1/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu1/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu2/cache/index0/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu2/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu2/cache/index1/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu2/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu2/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu2/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu2/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu3/cache/index0/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu3/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu3/cache/index1/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu3/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu3/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu3/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu3/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu4/cache/index0/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu4/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu4/cache/index1/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu4/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu4/cache/index2/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu4/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu4/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu4/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu5/cache/index0/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu5/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu5/cache/index1/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu5/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu5/cache/index2/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu5/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu5/cache/index3/shared_cpu_list", "0-8,17-23" }, + { 
"/sys/devices/system/cpu/cpu5/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu6/cache/index0/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu6/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu6/cache/index1/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu6/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu6/cache/index2/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu6/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu6/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu6/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu7/cache/index0/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu7/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu7/cache/index1/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu7/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu7/cache/index2/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu7/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu7/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu7/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu8/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu8/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu8/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu8/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list", "9,24" }, + { "/sys/devices/system/cpu/cpu9/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list", "9,24" }, + { "/sys/devices/system/cpu/cpu9/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list", "9,24" }, + { "/sys/devices/system/cpu/cpu9/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list", "9-16,24-31" }, + { "/sys/devices/system/cpu/cpu9/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu10/cache/index0/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu10/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu10/cache/index1/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu10/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu10/cache/index2/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu10/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu10/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu10/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu11/cache/index0/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu11/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu11/cache/index1/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu11/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu11/cache/index2/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu11/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu11/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu11/cache/index3/type", "Unified" }, + { 
"/sys/devices/system/cpu/cpu12/cache/index0/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu12/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu12/cache/index1/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu12/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu12/cache/index2/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu12/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu12/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu12/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu13/cache/index0/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu13/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu13/cache/index1/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu13/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu13/cache/index2/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu13/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu13/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu13/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu14/cache/index0/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu14/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu14/cache/index1/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu14/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu14/cache/index2/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu14/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu14/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu14/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu15/cache/index0/shared_cpu_list", "15,30" }, + { "/sys/devices/system/cpu/cpu15/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_list", "15,30" }, + { "/sys/devices/system/cpu/cpu15/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_list", "15,30" }, + { "/sys/devices/system/cpu/cpu15/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu15/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu15/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu16/cache/index0/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu16/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu16/cache/index1/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu16/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu16/cache/index2/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu16/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu16/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu16/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu17/cache/index0/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu17/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu17/cache/index1/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu17/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu17/cache/index2/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu17/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu17/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu17/cache/index3/type", "Unified" }, + { 
"/sys/devices/system/cpu/cpu18/cache/index0/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu18/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu18/cache/index1/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu18/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu18/cache/index2/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu18/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu18/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu18/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu19/cache/index0/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu19/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu19/cache/index1/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu19/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu19/cache/index2/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu19/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu19/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu19/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu20/cache/index0/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu20/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu20/cache/index1/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu20/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu20/cache/index2/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu20/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu20/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu20/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu21/cache/index0/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu21/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu21/cache/index1/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu21/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu21/cache/index2/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu21/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu21/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu21/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu22/cache/index0/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu22/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu22/cache/index1/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu22/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu22/cache/index2/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu22/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu22/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu22/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu23/cache/index0/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu23/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu23/cache/index1/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu23/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu23/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu23/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu23/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu24/cache/index0/shared_cpu_list", 
"9,24" }, + { "/sys/devices/system/cpu/cpu24/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu24/cache/index1/shared_cpu_list", "9,24" }, + { "/sys/devices/system/cpu/cpu24/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu24/cache/index2/shared_cpu_list", "9,24" }, + { "/sys/devices/system/cpu/cpu24/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu24/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu24/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu25/cache/index0/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu25/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu25/cache/index1/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu25/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu25/cache/index2/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu25/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu25/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu25/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu26/cache/index0/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu26/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu26/cache/index1/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu26/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu26/cache/index2/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu26/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu26/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu26/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu27/cache/index0/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu27/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu27/cache/index1/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu27/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu27/cache/index2/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu27/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu27/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu27/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu28/cache/index0/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu28/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu28/cache/index1/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu28/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu28/cache/index2/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu28/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu28/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu28/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu29/cache/index0/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu29/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu29/cache/index1/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu29/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu29/cache/index2/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu29/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu29/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu29/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu30/cache/index0/shared_cpu_list", "15,30" }, + { 
"/sys/devices/system/cpu/cpu30/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu30/cache/index1/shared_cpu_list", "15,30" }, + { "/sys/devices/system/cpu/cpu30/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu30/cache/index2/shared_cpu_list", "15,30" }, + { "/sys/devices/system/cpu/cpu30/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu30/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu30/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu31/cache/index0/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu31/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu31/cache/index1/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu31/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu31/cache/index2/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu31/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu31/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu31/cache/index3/type", "Unified" } +}; + +/// This is the expected CacheLocality structure for fakeSysfsTree +static const CacheLocality nonUniformExampleLocality = { + 32, + { 16, 16, 2 }, + { 0, 2, 4, 6, 8, 10, 11, 12, 14, 16, 18, 20, 22, 24, 26, 28, + 30, 1, 3, 5, 7, 9, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } +}; + +TEST(CacheLocality, FakeSysfs) { + auto parsed = CacheLocality::readFromSysfsTree([](std::string name) { + auto iter = fakeSysfsTree.find(name); + return iter == fakeSysfsTree.end() ? std::string() : iter->second; + }); + + auto& expected = nonUniformExampleLocality; + EXPECT_EQ(expected.numCpus, parsed.numCpus); + EXPECT_EQ(expected.numCachesByLevel, parsed.numCachesByLevel); + EXPECT_EQ(expected.localityIndexByCpu, parsed.localityIndexByCpu); +} + +TEST(Getcpu, VdsoGetcpu) { + unsigned cpu; + Getcpu::vdsoFunc()(&cpu, nullptr, nullptr); + + EXPECT_TRUE(cpu < CPU_SETSIZE); +} + +TEST(SequentialThreadId, Simple) { + unsigned cpu = 0; + auto rv = SequentialThreadId::getcpu(&cpu, nullptr, nullptr); + EXPECT_EQ(rv, 0); + EXPECT_TRUE(cpu > 0); + unsigned again; + SequentialThreadId::getcpu(&again, nullptr, nullptr); + EXPECT_EQ(cpu, again); +} + +static __thread unsigned testingCpu = 0; + +static int testingGetcpu(unsigned* cpu, unsigned* node, void* unused) { + if (cpu != nullptr) { + *cpu = testingCpu; + } + if (node != nullptr) { + *node = testingCpu; + } + return 0; +} + +TEST(AccessSpreader, Stubbed) { + std::vector>> spreaders(100); + for (size_t s = 1; s < spreaders.size(); ++s) { + spreaders[s].reset(new AccessSpreader<>( + s, nonUniformExampleLocality, &testingGetcpu)); + } + std::vector cpusInLocalityOrder = { + 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 6, 7, 22, 8, 23, 9, 24, 10, 25, + 11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31 }; + for (size_t i = 0; i < 32; ++i) { + // extra i * 32 is to check wrapping behavior of impl + testingCpu = cpusInLocalityOrder[i] + i * 64; + for (size_t s = 1; s < spreaders.size(); ++s) { + EXPECT_EQ((i * s) / 32, spreaders[s]->current()) + << "i=" << i << ", cpu=" << testingCpu << ", s=" << s; + } + } +} + +TEST(AccessSpreader, Default) { + AccessSpreader<> spreader(16); + EXPECT_LT(spreader.current(), 16); +} + +TEST(AccessSpreader, Shared) { + for (size_t s = 1; s < 200; ++s) { + EXPECT_LT(AccessSpreader<>::shared(s).current(), s); + } +} + +TEST(AccessSpreader, Statics) { + LOG(INFO) << "stripeByCore.numStripes() = " + << AccessSpreader<>::stripeByCore.numStripes(); + LOG(INFO) << 
"stripeByChip.numStripes() = " + << AccessSpreader<>::stripeByChip.numStripes(); + for (size_t s = 1; s < 200; ++s) { + EXPECT_LT(AccessSpreader<>::current(s), s); + } +} + +TEST(AccessSpreader, Wrapping) { + // this test won't pass unless locality.numCpus divides kMaxCpus + auto numCpus = 16; + auto locality = CacheLocality::uniform(numCpus); + for (size_t s = 1; s < 200; ++s) { + AccessSpreader<> spreader(s, locality, &testingGetcpu); + for (size_t c = 0; c < 400; ++c) { + testingCpu = c; + auto observed = spreader.current(); + testingCpu = c % numCpus; + auto expected = spreader.current(); + EXPECT_EQ(expected, observed) + << "numCpus=" << numCpus << ", s=" << s << ", c=" << c; + } + } +} + +// Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with +// a 2.2Ghz Xeon +// ============================================================================ +// folly/test/CacheLocalityTest.cpp relative time/iter iters/s +// ============================================================================ +// LocalAccessSpreaderUse 20.77ns 48.16M +// SharedAccessSpreaderUse 21.95ns 45.55M +// AccessSpreaderConstruction 466.56ns 2.14M +// ============================================================================ + +BENCHMARK(LocalAccessSpreaderUse, iters) { + folly::BenchmarkSuspender braces; + AccessSpreader<> spreader(16); + braces.dismiss(); + + for (unsigned long i = 0; i < iters; ++i) { + auto x = spreader.current(); + folly::doNotOptimizeAway(x); + } +} + +BENCHMARK(SharedAccessSpreaderUse, iters) { + for (unsigned long i = 0; i < iters; ++i) { + auto x = AccessSpreader<>::current(16); + folly::doNotOptimizeAway(x); + } +} + +BENCHMARK(AccessSpreaderConstruction, iters) { + std::aligned_storage), + std::alignment_of>::value>::type raw; + for (unsigned long i = 0; i < iters; ++i) { + auto x = new (&raw) AccessSpreader<>(16); + folly::doNotOptimizeAway(x); + x->~AccessSpreader(); + } +} + +enum class SpreaderType { GETCPU, SHARED, TLS_RR }; + +// Benchmark scores here reflect the time for 32 threads to perform an +// atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly, +// if we don't separate the counters onto unique 128 byte stripes the +// 1_stripe and 2_stripe results are identical, even though the L3 is +// claimed to have 64 byte cache lines. +// +// _stub means there was no call to getcpu or the tls round-robin +// implementation, because for a single stripe the cpu doesn't matter. +// _getcpu refers to the vdso getcpu implementation with a locally +// constructed AccessSpreader. _tls_rr refers to execution using +// SequentialThreadId, the fallback if the vdso getcpu isn't available. +// _shared refers to calling AccessSpreader<>::current(numStripes) +// inside the hot loop. +// +// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic, +// so since the stripe selection is 21 nanos the atomic increments in +// the L1 is ~15 nanos. At width 8_stripe_0_work the line is expected +// to ping-pong almost every operation, since the loops have the same +// duration. Widths 4 and 2 have the same behavior, but each tour of the +// cache line is 4 and 8 cores long, respectively. These all suggest a +// lower bound of 60 nanos for intra-chip handoff and increment between +// the L1s. +// +// With 455 nanos (1K cycles) of busywork per contended increment, the +// system can hide all of the latency of a tour of length 4, but not +// quite one of length 8. I was a bit surprised at how much worse the +// non-striped version got. 
It seems that the inter-chip traffic also +// interferes with the L1-only localWork.load(). When the local work is +// doubled to about 1 microsecond we see that the inter-chip contention +// is still very important, but subdivisions on the same chip don't matter. +// +// sudo nice -n -20 +// _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000 +// ============================================================================ +// folly/test/CacheLocalityTest.cpp relative time/iter iters/s +// ============================================================================ +// contentionAtWidth(1_stripe_0_work_stub) 1.14us 873.64K +// contentionAtWidth(2_stripe_0_work_getcpu) 495.58ns 2.02M +// contentionAtWidth(4_stripe_0_work_getcpu) 232.99ns 4.29M +// contentionAtWidth(8_stripe_0_work_getcpu) 101.16ns 9.88M +// contentionAtWidth(16_stripe_0_work_getcpu) 41.93ns 23.85M +// contentionAtWidth(32_stripe_0_work_getcpu) 42.04ns 23.79M +// contentionAtWidth(64_stripe_0_work_getcpu) 41.94ns 23.84M +// contentionAtWidth(2_stripe_0_work_tls_rr) 1.00us 997.41K +// contentionAtWidth(4_stripe_0_work_tls_rr) 694.41ns 1.44M +// contentionAtWidth(8_stripe_0_work_tls_rr) 590.27ns 1.69M +// contentionAtWidth(16_stripe_0_work_tls_rr) 222.13ns 4.50M +// contentionAtWidth(32_stripe_0_work_tls_rr) 169.49ns 5.90M +// contentionAtWidth(64_stripe_0_work_tls_rr) 162.20ns 6.17M +// contentionAtWidth(2_stripe_0_work_shared) 495.54ns 2.02M +// contentionAtWidth(4_stripe_0_work_shared) 236.27ns 4.23M +// contentionAtWidth(8_stripe_0_work_shared) 114.81ns 8.71M +// contentionAtWidth(16_stripe_0_work_shared) 44.65ns 22.40M +// contentionAtWidth(32_stripe_0_work_shared) 41.76ns 23.94M +// contentionAtWidth(64_stripe_0_work_shared) 43.47ns 23.00M +// atomicIncrBaseline(local_incr_0_work) 20.39ns 49.06M +// ---------------------------------------------------------------------------- +// contentionAtWidth(1_stripe_500_work_stub) 2.04us 491.13K +// contentionAtWidth(2_stripe_500_work_getcpu) 610.98ns 1.64M +// contentionAtWidth(4_stripe_500_work_getcpu) 507.72ns 1.97M +// contentionAtWidth(8_stripe_500_work_getcpu) 542.53ns 1.84M +// contentionAtWidth(16_stripe_500_work_getcpu) 496.55ns 2.01M +// contentionAtWidth(32_stripe_500_work_getcpu) 500.67ns 2.00M +// atomicIncrBaseline(local_incr_500_work) 484.69ns 2.06M +// ---------------------------------------------------------------------------- +// contentionAtWidth(1_stripe_1000_work_stub) 2.11us 473.78K +// contentionAtWidth(2_stripe_1000_work_getcpu) 970.64ns 1.03M +// contentionAtWidth(4_stripe_1000_work_getcpu) 987.31ns 1.01M +// contentionAtWidth(8_stripe_1000_work_getcpu) 1.01us 985.52K +// contentionAtWidth(16_stripe_1000_work_getcpu) 986.09ns 1.01M +// contentionAtWidth(32_stripe_1000_work_getcpu) 960.23ns 1.04M +// atomicIncrBaseline(local_incr_1000_work) 950.63ns 1.05M +// ============================================================================ +static void contentionAtWidth(size_t iters, size_t stripes, size_t work, + SpreaderType spreaderType, + size_t counterAlignment = 128, + size_t numThreads = 32) { + folly::BenchmarkSuspender braces; + + AccessSpreader<> spreader( + stripes, + CacheLocality::system(), + spreaderType == SpreaderType::TLS_RR + ? SequentialThreadId::getcpu : nullptr); + + std::atomic ready(0); + std::atomic go(false); + + // while in theory the cache line size is 64 bytes, experiments show + // that we get contention on 128 byte boundaries for Ivy Bridge. 
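// One plausible explanation is the adjacent-line prefetcher on these
// chips, which pulls in 128-byte-aligned pairs of 64-byte lines, so
// counters one line apart can still ping-pong.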
The + // extra indirection adds 1 or 2 nanos + assert(counterAlignment >= sizeof(std::atomic)); + char raw[counterAlignment * stripes]; + + // if we happen to be using the tlsRoundRobin, then sequentially + // assigning the thread identifiers is the unlikely best-case scenario. + // We don't want to unfairly benefit or penalize. Computing the exact + // maximum likelihood of the probability distributions is annoying, so + // I approximate as 2/5 of the ids that have no threads, 2/5 that have + // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by + // wrapping back to slot 0 when we hit 1/15 and 1/5. + + std::vector threads; + while (threads.size() < numThreads) { + threads.push_back(std::thread([&,iters,stripes,work]() { + std::atomic* counters[stripes]; + for (size_t i = 0; i < stripes; ++i) { + counters[i] = new (raw + counterAlignment * i) std::atomic(); + } + + spreader.current(); + ready++; + while (!go.load()) { + sched_yield(); + } + std::atomic localWork; + if (spreaderType == SpreaderType::SHARED) { + for (size_t i = iters; i > 0; --i) { + ++*(counters[AccessSpreader<>::current(stripes)]); + for (size_t j = work; j > 0; --j) { + localWork.load(); + } + } + } else { + for (size_t i = iters; i > 0; --i) { + ++*(counters[spreader.current()]); + for (size_t j = work; j > 0; --j) { + localWork.load(); + } + } + } + })); + + if (threads.size() == numThreads / 15 || + threads.size() == numThreads / 5) { + // create a few dummy threads to wrap back around to 0 mod numCpus + for (size_t i = threads.size(); i != numThreads; ++i) { + std::thread([&]() { + spreader.current(); + }).join(); + } + } + } + + while (ready < numThreads) { + sched_yield(); + } + braces.dismiss(); + go = true; + + for (auto& thr : threads) { + thr.join(); + } +} + +static void atomicIncrBaseline(size_t iters, size_t work, + size_t numThreads = 32) { + folly::BenchmarkSuspender braces; + + std::atomic go(false); + + std::vector threads; + while (threads.size() < numThreads) { + threads.push_back(std::thread([&]() { + while (!go.load()) { + sched_yield(); + } + std::atomic localCounter; + std::atomic localWork; + for (size_t i = iters; i > 0; --i) { + localCounter++; + for (size_t j = work; j > 0; --j) { + localWork.load(); + } + } + })); + } + + braces.dismiss(); + go = true; + + for (auto& thr : threads) { + thr.join(); + } +} + +BENCHMARK_DRAW_LINE() + +BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_0_work_stub, + 1, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_getcpu, + 2, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_getcpu, + 4, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_getcpu, + 8, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_getcpu, + 16, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_getcpu, + 32, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_getcpu, + 64, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_tls_rr, + 2, 0, SpreaderType::TLS_RR) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_tls_rr, + 4, 0, SpreaderType::TLS_RR) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_tls_rr, + 8, 0, SpreaderType::TLS_RR) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_tls_rr, + 16, 0, SpreaderType::TLS_RR) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr, + 32, 0, SpreaderType::TLS_RR) 
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
+                      64, 0, SpreaderType::TLS_RR)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
+                      2, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
+                      4, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_shared,
+                      8, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_shared,
+                      16, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_shared,
+                      32, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_shared,
+                      64, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
+BENCHMARK_DRAW_LINE()
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_500_work_stub,
+                      1, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_500_work_getcpu,
+                      2, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_500_work_getcpu,
+                      4, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_500_work_getcpu,
+                      8, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_500_work_getcpu,
+                      16, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_500_work_getcpu,
+                      32, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
+BENCHMARK_DRAW_LINE()
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_1000_work_stub,
+                      1, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_1000_work_getcpu,
+                      2, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_1000_work_getcpu,
+                      4, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_1000_work_getcpu,
+                      8, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_1000_work_getcpu,
+                      16, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_1000_work_getcpu,
+                      32, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
+
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  auto ret = RUN_ALL_TESTS();
+  if (!ret && FLAGS_benchmark) {
+    folly::runBenchmarks();
+  }
+  return ret;
+}
+
diff --git a/folly/test/DeterministicSchedule.cpp b/folly/test/DeterministicSchedule.cpp
index 7f77262c..873bcf78 100644
--- a/folly/test/DeterministicSchedule.cpp
+++ b/folly/test/DeterministicSchedule.cpp
@@ -272,4 +272,39 @@ int Futex<DeterministicAtomic>::futexWake(int count, uint32_t wakeMask) {
   return rv;
 }
+
+template<>
+CacheLocality const& CacheLocality::system<test::DeterministicAtomic>() {
+  static CacheLocality cache(CacheLocality::uniform(16));
+  return cache;
+}
+
+template<>
+test::DeterministicAtomic<size_t>
+    SequentialThreadId<test::DeterministicAtomic>::prevId(0);
+
+template<>
+__thread size_t SequentialThreadId<test::DeterministicAtomic>::currentId(0);
+
+template<>
+const AccessSpreader<test::DeterministicAtomic>
+AccessSpreader<test::DeterministicAtomic>::stripeByCore(
+    CacheLocality::system<>().numCachesByLevel.front());
+
+template<>
+const AccessSpreader<test::DeterministicAtomic>
+AccessSpreader<test::DeterministicAtomic>::stripeByChip(
+    CacheLocality::system<>().numCachesByLevel.back());
+
+template<>
+AccessSpreaderArray<test::DeterministicAtomic,128>
+AccessSpreaderArray<test::DeterministicAtomic,128>::sharedInstance = {};
+
+
+template<>
+Getcpu::Func
+AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc(size_t numStripes) {
+  return &SequentialThreadId<test::DeterministicAtomic>::getcpu;
+}
+
 }}
diff --git a/folly/test/DeterministicSchedule.h b/folly/test/DeterministicSchedule.h
index 3e7ccb3a..71cb1037 100644
--- a/folly/test/DeterministicSchedule.h
+++ b/folly/test/DeterministicSchedule.h
@@ -27,6 +27,7 @@
 #include
 #include
+#include "folly/detail/CacheLocality.h"
 #include

 namespace folly { namespace test {
-- 
2.34.1