From 3d0c8f284dd1ce098db0fffb3b72c9396b6b50d2 Mon Sep 17 00:00:00 2001 From: Nathan Bronson Date: Thu, 21 Nov 2013 17:15:06 -0800 Subject: [PATCH 1/1] detail/CacheLocality.h - utilities for dynamic cache optimizations Summary: CacheLocality reads cache sharing information from sysfs to determine how CPUs should be grouped to minimize contention, Getcpu provides fast access to the current CPU via __vdso_getcpu, and AccessSpreader uses these two to optimally spread accesses among a predetermined number of stripes. AccessSpreader<>::current(n) microbenchmarks at 22 nanos, which is substantially less than the cost of a cache miss. This means that we can effectively use it to reduce cache line ping-pong on striped data structures such as IndexedMemPool or statistics counters. Because CacheLocality looks at all of the cache levels, it can be used for different levels of optimization. AccessSpreader<>::stripeByChip.current() uses as few stripes as possible to avoid off-chip communication, AccessSpreader<>::stripeByCore.current() uses as many stripes as necessary to get the optimal speedup, but no more. @override-unit-failures Test Plan: new unit tests Reviewed By: davejwatson@fb.com FB internal diff: D1076718 --- folly/Makefile.am | 2 + folly/detail/CacheLocality.cpp | 276 +++++++++++ folly/detail/CacheLocality.h | 353 ++++++++++++++ folly/test/CacheLocalityTest.cpp | 705 +++++++++++++++++++++++++++ folly/test/DeterministicSchedule.cpp | 35 ++ folly/test/DeterministicSchedule.h | 1 + 6 files changed, 1372 insertions(+) create mode 100644 folly/detail/CacheLocality.cpp create mode 100644 folly/detail/CacheLocality.h create mode 100644 folly/test/CacheLocalityTest.cpp diff --git a/folly/Makefile.am b/folly/Makefile.am index 91a993bf..a9647533 100644 --- a/folly/Makefile.am +++ b/folly/Makefile.am @@ -36,6 +36,7 @@ nobase_follyinclude_HEADERS = \ detail/AtomicHashUtils.h \ detail/BitIteratorDetail.h \ detail/BitsDetail.h \ + detail/CacheLocality.h \ detail/DiscriminatedPtrDetail.h \ detail/FileUtilDetail.h \ detail/FingerprintPolynomial.h \ @@ -137,6 +138,7 @@ libfolly_la_SOURCES = \ Benchmark.cpp \ Bits.cpp \ Conv.cpp \ + detail/CacheLocality.cpp \ dynamic.cpp \ EscapeTables.cpp \ File.cpp \ diff --git a/folly/detail/CacheLocality.cpp b/folly/detail/CacheLocality.cpp new file mode 100644 index 00000000..7a13a9dd --- /dev/null +++ b/folly/detail/CacheLocality.cpp @@ -0,0 +1,276 @@ +/* + * Copyright 2013 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "folly/detail/CacheLocality.h" + +#define _GNU_SOURCE 1 // for RTLD_NOLOAD +#include +#include + +#include "folly/Conv.h" +#include "folly/Exception.h" +#include "folly/FileUtil.h" +#include "folly/Format.h" +#include "folly/ScopeGuard.h" + +namespace folly { namespace detail { + +///////////// CacheLocality + +/// Returns the best real CacheLocality information available +static CacheLocality getSystemLocalityInfo() { + try { + return CacheLocality::readFromSysfs(); + } catch (...) 
{ + // keep trying + } + + long numCpus = sysconf(_SC_NPROCESSORS_CONF); + if (numCpus <= 0) { + // This shouldn't happen, but if it does we should try to keep + // going. We are probably not going to be able to parse /sys on + // this box either (although we will try), which means we are going + // to fall back to the SequentialThreadId splitter. On my 16 core + // (x hyperthreading) dev box 16 stripes is enough to get pretty good + // contention avoidance with SequentialThreadId, and there is little + // improvement from going from 32 to 64. This default gives us some + // wiggle room + numCpus = 32; + } + return CacheLocality::uniform(numCpus); +} + +template <> +const CacheLocality& CacheLocality::system() { + static CacheLocality cache(getSystemLocalityInfo()); + return cache; +} + +// Each level of cache has sharing sets, which are the set of cpus +// that share a common cache at that level. These are available in a +// hex bitset form (/sys/devices/system/cpu/cpu0/index0/shared_cpu_map, +// for example). They are also available in a human-readable list form, +// as in /sys/devices/system/cpu/cpu0/index0/shared_cpu_list. The list +// is a comma-separated list of numbers and ranges, where the ranges are +// a pair of decimal numbers separated by a '-'. +// +// To sort the cpus for optimum locality we don't really need to parse +// the sharing sets, we just need a unique representative from the +// equivalence class. The smallest value works fine, and happens to be +// the first decimal number in the file. We load all of the equivalence +// class information from all of the cpu*/index* directories, order the +// cpus first by increasing last-level cache equivalence class, then by +// the smaller caches. Finally, we break ties with the cpu number itself. + +/// Returns the first decimal number in the string, or throws an exception +/// if the string does not start with a number terminated by ',', '-', +/// '\n', or eos. +static ssize_t parseLeadingNumber(const std::string& line) { + auto raw = line.c_str(); + char *end; + unsigned val = strtoul(raw, &end, 10); + if (end == raw || (*end != ',' && *end != '-' && *end != '\n')) { + throw std::runtime_error(to( + "error parsing list '", line, "'").c_str()); + } + return val; +} + +CacheLocality CacheLocality::readFromSysfsTree( + const std::function& mapping) { + // number of equivalence classes per level + std::vector numCachesByLevel; + + // the list of cache equivalence classes, where equivalance classes + // are named by the smallest cpu in the class + std::vector> equivClassesByCpu; + + std::vector cpus; + + while (true) { + auto cpu = cpus.size(); + std::vector levels; + for (size_t index = 0; ; ++index) { + auto dir = format("/sys/devices/system/cpu/cpu{}/cache/index{}/", + cpu, index).str(); + auto cacheType = mapping(dir + "type"); + auto equivStr = mapping(dir + "shared_cpu_list"); + if (cacheType.size() == 0 || equivStr.size() == 0) { + // no more caches + break; + } + if (cacheType[0] == 'I') { + // cacheType in { "Data", "Instruction", "Unified" }. 
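// (worked examples for parseLeadingNumber above, derived from the
// sysfs list format: parseLeadingNumber("0-8,17-23") returns 0 and
// parseLeadingNumber("5-6") returns 5. Note that as written the
// terminator check rejects end-of-string even though the doc comment
// allows "eos" -- std::getline strips the '\n', so a single-cpu list
// such as "3" would throw here.)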
skip icache + continue; + } + auto equiv = parseLeadingNumber(equivStr); + auto level = levels.size(); + levels.push_back(equiv); + + if (equiv == cpu) { + // we only want to count the equiv classes once, so we do it when + // we first encounter them + while (numCachesByLevel.size() <= level) { + numCachesByLevel.push_back(0); + } + numCachesByLevel[level]++; + } + } + + if (levels.size() == 0) { + // no levels at all for this cpu, we must be done + break; + } + equivClassesByCpu.emplace_back(std::move(levels)); + cpus.push_back(cpu); + } + + if (cpus.size() == 0) { + throw std::runtime_error("unable to load cache sharing info"); + } + + std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool { + // sort first by equiv class of cache with highest index, direction + // doesn't matter. If different cpus have different numbers of + // caches then this code might produce a sub-optimal ordering, but + // it won't crash + auto& lhsEquiv = equivClassesByCpu[lhs]; + auto& rhsEquiv = equivClassesByCpu[rhs]; + for (int i = std::min(lhsEquiv.size(), rhsEquiv.size()) - 1; i >= 0; --i) { + if (lhsEquiv[i] != rhsEquiv[i]) { + return lhsEquiv[i] < rhsEquiv[i]; + } + } + + // break ties deterministically by cpu + return lhs < rhs; + }); + + // the cpus are now sorted by locality, with neighboring entries closer + // to each other than entries that are far away. For striping we want + // the inverse map, since we are starting with the cpu + std::vector indexes(cpus.size()); + for (int i = 0; i < cpus.size(); ++i) { + indexes[cpus[i]] = i; + } + + return CacheLocality{ + cpus.size(), std::move(numCachesByLevel), std::move(indexes) }; +} + +CacheLocality CacheLocality::readFromSysfs() { + return readFromSysfsTree([](std::string name) { + std::ifstream xi(name.c_str()); + std::string rv; + std::getline(xi, rv); + return rv; + }); +} + + +CacheLocality CacheLocality::uniform(size_t numCpus) { + CacheLocality rv; + + rv.numCpus = numCpus; + + // one cache shared by all cpus + rv.numCachesByLevel.push_back(numCpus); + + // no permutations in locality index mapping + for (size_t cpu = 0; cpu < numCpus; ++cpu) { + rv.localityIndexByCpu.push_back(cpu); + } + + return rv; +} + +////////////// Getcpu + +/// Resolves the dynamically loaded symbol __vdso_getcpu, returning null +/// on failure +static Getcpu::Func loadVdsoGetcpu() { + void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (h == nullptr) { + return nullptr; + } + + auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu")); + if (func == nullptr) { + // technically a null result could either be a failure or a successful + // lookup of a symbol with the null value, but the second can't actually + // happen for this symbol. 
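// A minimal sketch of how callers use the result (the nullptr check
// is required, since the vDSO lookup can fail):
//
//   unsigned cpu;
//   if (auto getcpu = Getcpu::vdsoFunc()) {
//     getcpu(&cpu, nullptr, nullptr);  // cheap: no syscall trap
//   }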
No point holding the handle forever if + // we don't need the code + dlclose(h); + } + + return func; +} + +Getcpu::Func Getcpu::vdsoFunc() { + static Func func = loadVdsoGetcpu(); + return func; +} + +/////////////// SequentialThreadId + +template<> +std::atomic SequentialThreadId::prevId(0); + +template<> +__thread size_t SequentialThreadId::currentId(0); + +/////////////// AccessSpreader + +template<> +const AccessSpreader +AccessSpreader::stripeByCore( + CacheLocality::system<>().numCachesByLevel.front()); + +template<> +const AccessSpreader +AccessSpreader::stripeByChip( + CacheLocality::system<>().numCachesByLevel.back()); + +template<> +AccessSpreaderArray +AccessSpreaderArray::sharedInstance = {}; + +/// Always claims to be on CPU zero, node zero +static int degenerateGetcpu(unsigned* cpu, unsigned* node, void* unused) { + if (cpu != nullptr) { + *cpu = 0; + } + if (node != nullptr) { + *node = 0; + } + return 0; +} + +template<> +Getcpu::Func AccessSpreader::pickGetcpuFunc(size_t numStripes) { + if (numStripes == 1) { + // there's no need to call getcpu if there is only one stripe. + // This should not be common, so we don't want to waste a test and + // branch in the main code path, but we might as well use a faster + // function pointer + return °enerateGetcpu; + } else { + auto best = Getcpu::vdsoFunc(); + return best ? best : &SequentialThreadId::getcpu; + } +} + +} } // namespace folly::detail diff --git a/folly/detail/CacheLocality.h b/folly/detail/CacheLocality.h new file mode 100644 index 00000000..0d839bec --- /dev/null +++ b/folly/detail/CacheLocality.h @@ -0,0 +1,353 @@ +/* + * Copyright 2013 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FOLLY_DETAIL_CACHELOCALITY_H_ +#define FOLLY_DETAIL_CACHELOCALITY_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "folly/Likely.h" + +namespace folly { namespace detail { + +// This file contains several classes that might be useful if you are +// trying to dynamically optimize cache locality: CacheLocality reads +// cache sharing information from sysfs to determine how CPUs should be +// grouped to minimize contention, Getcpu provides fast access to the +// current CPU via __vdso_getcpu, and AccessSpreader uses these two to +// optimally spread accesses among a predetermined number of stripes. +// +// AccessSpreader<>::current(n) microbenchmarks at 22 nanos, which is +// substantially less than the cost of a cache miss. This means that we +// can effectively use it to reduce cache line ping-pong on striped data +// structures such as IndexedMemPool or statistics counters. +// +// Because CacheLocality looks at all of the cache levels, it can be +// used for different levels of optimization. AccessSpreader(2) does +// per-chip spreading on a dual socket system. AccessSpreader(numCpus) +// does perfect per-cpu spreading. AccessSpreader(numCpus / 2) does +// perfect L1 spreading in a system with hyperthreading enabled. 
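//
// As a usage sketch (StripedCounter is illustrative, not part of this
// diff; assumes <algorithm> and <atomic>): a counter striped with one
// slot per last-level cache, using 128-byte slots to match the
// contention granularity observed in the benchmarks in
// CacheLocalityTest.cpp.
//
//   class StripedCounter {
//    public:
//     StripedCounter()
//       : numStripes_(std::min<size_t>(
//             64, CacheLocality::system<>().numCachesByLevel.back())) {}
//
//     void incr() {
//       slots_[AccessSpreader<>::current(numStripes_)].value.fetch_add(
//           1, std::memory_order_relaxed);
//     }
//
//     size_t read() const {  // reads pay for every stripe; incrs don't
//       size_t sum = 0;
//       for (size_t i = 0; i < numStripes_; ++i) {
//         sum += slots_[i].value.load(std::memory_order_relaxed);
//       }
//       return sum;
//     }
//
//    private:
//     struct alignas(128) Slot { std::atomic<size_t> value{0}; };
//     size_t numStripes_;
//     Slot slots_[64];
//   };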
+ +struct CacheLocality { + + /// 1 more than the maximum value that can be returned from sched_getcpu + /// or getcpu. This is the number of hardware thread contexts provided + /// by the processors + size_t numCpus; + + /// Holds the number of caches present at each cache level (0 is + /// the closest to the cpu). This is the number of AccessSpreader + /// stripes needed to avoid cross-cache communication at the specified + /// layer. numCachesByLevel.front() is the number of L1 caches and + /// numCachesByLevel.back() is the number of last-level caches. + std::vector numCachesByLevel; + + /// A map from cpu (from sched_getcpu or getcpu) to an index in the + /// range 0..numCpus-1, where neighboring locality indices are more + /// likely to share caches then indices far away. All of the members + /// of a particular cache level be contiguous in their locality index. + /// For example, if numCpus is 32 and numCachesByLevel.back() is 2, + /// then cpus with a locality index < 16 will share one last-level + /// cache and cpus with a locality index >= 16 will share the other. + std::vector localityIndexByCpu; + + + /// Returns the best CacheLocality information available for the current + /// system, cached for fast access. This will be loaded from sysfs if + /// possible, otherwise it will be correct in the number of CPUs but + /// not in their sharing structure. + /// + /// If you are into yo dawgs, this is a shared cache of the local + /// locality of the shared caches. + /// + /// The template parameter here is used to allow injection of a + /// repeatable CacheLocality structure during testing. Rather than + /// inject the type of the CacheLocality provider into every data type + /// that transitively uses it, all components select between the default + /// sysfs implementation and a deterministic implementation by keying + /// off the type of the underlying atomic. See DeterministicScheduler. + template class Atom = std::atomic> + static const CacheLocality& system(); + + + /// Reads CacheLocality information from a tree structured like + /// the sysfs filesystem. The provided function will be evaluated + /// for each sysfs file that needs to be queried. The function + /// should return a string containing the first line of the file + /// (not including the newline), or an empty string if the file does + /// not exist. The function will be called with paths of the form + /// /sys/devices/system/cpu/cpu*/cache/index*/{type,shared_cpu_list} . + /// Throws an exception if no caches can be parsed at all. + static CacheLocality readFromSysfsTree( + const std::function& mapping); + + /// Reads CacheLocality information from the real sysfs filesystem. + /// Throws an exception if no cache information can be loaded. + static CacheLocality readFromSysfs(); + + /// Returns a usable (but probably not reflective of reality) + /// CacheLocality structure with the specified number of cpus and a + /// single cache level that associates one cpu per cache. + static CacheLocality uniform(size_t numCpus); +}; + +/// Holds a function pointer to the VDSO implementation of getcpu(2), +/// if available +struct Getcpu { + /// Function pointer to a function with the same signature as getcpu(2). + typedef int (*Func)(unsigned* cpu, unsigned* node, void* unused); + + /// Returns a pointer to the VDSO implementation of getcpu(2), if + /// available, or nullptr otherwise + static Func vdsoFunc(); +}; + +/// A class that lazily binds a unique (for each implementation of Atom) +/// identifier to a thread. 
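/// The first call to get() on each thread hands out 1, 2, 3, ... in
/// first-use order, and every later call on that thread returns the
/// same value, so getcpu() below reports a stable pseudo-cpu per
/// thread.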
This is a fallback mechanism for the access +/// spreader if we are in testing (using DeterministicAtomic) or if +/// __vdso_getcpu can't be dynamically loaded +template class Atom> +struct SequentialThreadId { + + /// Returns the thread id assigned to the current thread + static size_t get() { + auto rv = currentId; + if (UNLIKELY(rv == 0)) { + rv = currentId = ++prevId; + } + return rv; + } + + /// Fills the thread id into the cpu and node out params (if they + /// are non-null). This method is intended to act like getcpu when a + /// fast-enough form of getcpu isn't available or isn't desired + static int getcpu(unsigned* cpu, unsigned* node, void* unused) { + auto id = get(); + if (cpu) { + *cpu = id; + } + if (node) { + *node = id; + } + return 0; + } + + private: + static Atom prevId; + + // TODO: switch to thread_local + static __thread size_t currentId; +}; + +template class Atom, size_t kMaxCpus> +struct AccessSpreaderArray; + +/// AccessSpreader arranges access to a striped data structure in such a +/// way that concurrently executing threads are likely to be accessing +/// different stripes. It does NOT guarantee uncontended access. +/// Your underlying algorithm must be thread-safe without spreading, this +/// is merely an optimization. AccessSpreader::current(n) is typically +/// much faster than a cache miss (22 nanos on my dev box, tested fast +/// in both 2.6 and 3.2 kernels). +/// +/// You are free to create your own AccessSpreader-s or to cache the +/// results of AccessSpreader<>::shared(n), but you will probably want +/// to use one of the system-wide shared ones. Calling .current() on +/// a particular AccessSpreader instance only saves about 1 nanosecond +/// over calling AccessSpreader<>::shared(n). +/// +/// If available (and not using the deterministic testing implementation) +/// AccessSpreader uses the getcpu system call via VDSO and the +/// precise locality information retrieved from sysfs by CacheLocality. +/// This provides optimal anti-sharing at a fraction of the cost of a +/// cache miss. +/// +/// When there are not as many stripes as processors, we try to optimally +/// place the cache sharing boundaries. This means that if you have 2 +/// stripes and run on a dual-socket system, your 2 stripes will each get +/// all of the cores from a single socket. If you have 16 stripes on a +/// 16 core system plus hyperthreading (32 cpus), each core will get its +/// own stripe and there will be no cache sharing at all. +/// +/// AccessSpreader has a fallback mechanism for when __vdso_getcpu can't be +/// loaded, or for use during deterministic testing. Using sched_getcpu or +/// the getcpu syscall would negate the performance advantages of access +/// spreading, so we use a thread-local value and a shared atomic counter +/// to spread access out. +/// +/// AccessSpreader is templated on the template type that is used +/// to implement atomics, as a way to instantiate the underlying +/// heuristics differently for production use and deterministic unit +/// testing. See DeterministicScheduler for more. If you aren't using +/// DeterministicScheduler, you can just use the default template parameter +/// all of the time. +template class Atom = std::atomic> +struct AccessSpreader { + + /// Returns a never-destructed shared AccessSpreader instance. + /// numStripes should be > 0. 
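/// For example, AccessSpreader<>::shared(8).current() yields a stripe
/// in [0, 8), chosen so that threads running far apart in the cache
/// hierarchy tend to receive different stripes.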
+ static const AccessSpreader& shared(size_t numStripes) { + // sharedInstances[0] actually has numStripes == 1 + assert(numStripes > 0); + + // the last shared element handles all large sizes + return AccessSpreaderArray::sharedInstance[ + std::min(size_t(kMaxCpus), numStripes)]; + } + + /// Returns the stripe associated with the current CPU, assuming + /// that there are numStripes (non-zero) stripes. Equivalent to + /// AccessSpreader::shared(numStripes)->current. + static size_t current(size_t numStripes) { + return shared(numStripes).current(); + } + + /// stripeByCore uses 1 stripe per L1 cache, according to + /// CacheLocality::system<>(). Use stripeByCore.numStripes() to see + /// its width, or stripeByCore.current() to get the current stripe + static const AccessSpreader stripeByCore; + + /// stripeByChip uses 1 stripe per last-level cache, which is the fewest + /// number of stripes for which off-chip communication can be avoided + /// (assuming all caches are on-chip). Use stripeByChip.numStripes() + /// to see its width, or stripeByChip.current() to get the current stripe + static const AccessSpreader stripeByChip; + + + /// Constructs an AccessSpreader that will return values from + /// 0 to numStripes-1 (inclusive), precomputing the mapping + /// from CPU to stripe. There is no use in having more than + /// CacheLocality::system().localityIndexByCpu.size() stripes or + /// kMaxCpus stripes + explicit AccessSpreader(size_t spreaderNumStripes, + const CacheLocality& cacheLocality = + CacheLocality::system(), + Getcpu::Func getcpuFunc = nullptr) + : getcpuFunc_(getcpuFunc ? getcpuFunc : pickGetcpuFunc(spreaderNumStripes)) + , numStripes_(spreaderNumStripes) + { + auto n = cacheLocality.numCpus; + for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) { + auto index = cacheLocality.localityIndexByCpu[cpu]; + assert(index < n); + // as index goes from 0..n, post-transform value goes from + // 0..numStripes + stripeByCpu[cpu] = (index * numStripes_) / n; + assert(stripeByCpu[cpu] < numStripes_); + } + for (size_t cpu = n; cpu < kMaxCpus; ++cpu) { + stripeByCpu[cpu] = stripeByCpu[cpu - n]; + } + } + + /// Returns 1 more than the maximum value that can be returned from + /// current() + size_t numStripes() const { + return numStripes_; + } + + /// Returns the stripe associated with the current CPU + size_t current() const { + unsigned cpu; + getcpuFunc_(&cpu, nullptr, nullptr); + return stripeByCpu[cpu % kMaxCpus]; + } + + private: + + /// If there are more cpus than this nothing will crash, but there + /// might be unnecessary sharing + enum { kMaxCpus = 128 }; + + typedef uint8_t CompactStripe; + + static_assert((kMaxCpus & (kMaxCpus - 1)) == 0, + "kMaxCpus should be a power of two so modulo is fast"); + static_assert(kMaxCpus - 1 <= std::numeric_limits::max(), + "stripeByCpu element type isn't wide enough"); + + + /// Points to the getcpu-like function we are using to obtain the + /// current cpu. It should not be assumed that the returned cpu value + /// is in range. We use a member for this instead of a static so that + /// this fetch preloads a prefix the stripeByCpu array + Getcpu::Func getcpuFunc_; + + /// A precomputed map from cpu to stripe. 
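/// (for example, with numCpus = 32 and numStripes_ = 8 the constructor
/// above maps locality indices 0-3 to stripe 0, 4-7 to stripe 1, and
/// so on, via (index * 8) / 32)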
Rather than add a layer of + /// indirection requiring a dynamic bounds check and another cache miss, + /// we always precompute the whole array + CompactStripe stripeByCpu[kMaxCpus]; + + size_t numStripes_; + + /// Returns the best getcpu implementation for this type and width + /// of AccessSpreader + static Getcpu::Func pickGetcpuFunc(size_t numStripes); +}; + +/// An array of kMaxCpus+1 AccessSpreader instances constructed +/// with default params, with the zero-th element having 1 stripe +template class Atom, size_t kMaxStripe> +struct AccessSpreaderArray { + + AccessSpreaderArray() { + for (size_t i = 0; i <= kMaxStripe; ++i) { + new (raw + i) AccessSpreader(std::max(size_t(1), i)); + } + } + + ~AccessSpreaderArray() { + for (size_t i = 0; i <= kMaxStripe; ++i) { + auto p = static_cast*>(static_cast(raw + i)); + p->~AccessSpreader(); + } + } + + AccessSpreader const& operator[] (size_t index) const { + return *static_cast const*>( + static_cast(raw + index)); + } + + private: + + /// If we align the access spreaders at the beginning of a cache line + /// then getcpuFunc_ and the first 56 bytes of stripeByCpu will be on + /// the same cache line + enum { kAlignment = 64 }; + + + // AccessSpreader uses sharedInstance + friend AccessSpreader; + + static AccessSpreaderArray sharedInstance; + + + /// aligned_storage is uninitialized, we use placement new since there + /// is no AccessSpreader default constructor + typename std::aligned_storage),kAlignment>::type + raw[kMaxStripe + 1]; +}; + +} } + +#endif /* FOLLY_DETAIL_CacheLocality_H_ */ + diff --git a/folly/test/CacheLocalityTest.cpp b/folly/test/CacheLocalityTest.cpp new file mode 100644 index 00000000..056ec5f6 --- /dev/null +++ b/folly/test/CacheLocalityTest.cpp @@ -0,0 +1,705 @@ +/* + * Copyright 2013 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "folly/detail/CacheLocality.h" + +#include +#include +#include +#include +#include +#include +#include +#include "folly/Benchmark.h" + +using namespace folly::detail; + +/// This is the relevant nodes from a production box's sysfs tree. If you +/// think this map is ugly you should see the version of this test that +/// used a real directory tree. 
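/// As a worked trace of the parser against this map: for cpu0 it
/// visits index0..index3 in order, skips the Instruction entry, and
/// records the equivalence classes [0, 0, 0] -- the leading number of
/// "0,17" for the L1 and L2 rows and of "0-8,17-23" for the L3 row.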
To reduce the chance of testing error +/// I haven't tried to remove the common prefix +static std::unordered_map fakeSysfsTree = { + { "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu0/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu0/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu0/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu0/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu1/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu1/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu1/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu1/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu1/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu2/cache/index0/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu2/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu2/cache/index1/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu2/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu2/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu2/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu2/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu3/cache/index0/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu3/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu3/cache/index1/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu3/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu3/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu3/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu3/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu4/cache/index0/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu4/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu4/cache/index1/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu4/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu4/cache/index2/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu4/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu4/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu4/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu5/cache/index0/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu5/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu5/cache/index1/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu5/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu5/cache/index2/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu5/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu5/cache/index3/shared_cpu_list", "0-8,17-23" }, + { 
"/sys/devices/system/cpu/cpu5/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu6/cache/index0/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu6/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu6/cache/index1/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu6/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu6/cache/index2/shared_cpu_list", "5-6" }, + { "/sys/devices/system/cpu/cpu6/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu6/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu6/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu7/cache/index0/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu7/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu7/cache/index1/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu7/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu7/cache/index2/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu7/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu7/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu7/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu8/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu8/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu8/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu8/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list", "9,24" }, + { "/sys/devices/system/cpu/cpu9/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list", "9,24" }, + { "/sys/devices/system/cpu/cpu9/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list", "9,24" }, + { "/sys/devices/system/cpu/cpu9/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list", "9-16,24-31" }, + { "/sys/devices/system/cpu/cpu9/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu10/cache/index0/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu10/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu10/cache/index1/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu10/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu10/cache/index2/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu10/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu10/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu10/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu11/cache/index0/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu11/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu11/cache/index1/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu11/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu11/cache/index2/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu11/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu11/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu11/cache/index3/type", "Unified" }, + { 
"/sys/devices/system/cpu/cpu12/cache/index0/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu12/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu12/cache/index1/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu12/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu12/cache/index2/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu12/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu12/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu12/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu13/cache/index0/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu13/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu13/cache/index1/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu13/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu13/cache/index2/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu13/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu13/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu13/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu14/cache/index0/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu14/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu14/cache/index1/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu14/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu14/cache/index2/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu14/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu14/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu14/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu15/cache/index0/shared_cpu_list", "15,30" }, + { "/sys/devices/system/cpu/cpu15/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_list", "15,30" }, + { "/sys/devices/system/cpu/cpu15/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_list", "15,30" }, + { "/sys/devices/system/cpu/cpu15/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu15/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu15/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu16/cache/index0/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu16/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu16/cache/index1/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu16/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu16/cache/index2/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu16/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu16/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu16/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu17/cache/index0/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu17/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu17/cache/index1/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu17/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu17/cache/index2/shared_cpu_list", "0,17" }, + { "/sys/devices/system/cpu/cpu17/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu17/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu17/cache/index3/type", "Unified" }, + { 
"/sys/devices/system/cpu/cpu18/cache/index0/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu18/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu18/cache/index1/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu18/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu18/cache/index2/shared_cpu_list", "1,18" }, + { "/sys/devices/system/cpu/cpu18/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu18/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu18/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu19/cache/index0/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu19/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu19/cache/index1/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu19/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu19/cache/index2/shared_cpu_list", "2,19" }, + { "/sys/devices/system/cpu/cpu19/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu19/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu19/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu20/cache/index0/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu20/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu20/cache/index1/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu20/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu20/cache/index2/shared_cpu_list", "3,20" }, + { "/sys/devices/system/cpu/cpu20/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu20/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu20/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu21/cache/index0/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu21/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu21/cache/index1/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu21/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu21/cache/index2/shared_cpu_list", "4,21" }, + { "/sys/devices/system/cpu/cpu21/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu21/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu21/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu22/cache/index0/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu22/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu22/cache/index1/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu22/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu22/cache/index2/shared_cpu_list", "7,22" }, + { "/sys/devices/system/cpu/cpu22/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu22/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu22/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu23/cache/index0/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu23/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu23/cache/index1/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu23/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_list", "8,23" }, + { "/sys/devices/system/cpu/cpu23/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu23/cache/index3/shared_cpu_list", "0-8,17-23" }, + { "/sys/devices/system/cpu/cpu23/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu24/cache/index0/shared_cpu_list", 
"9,24" }, + { "/sys/devices/system/cpu/cpu24/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu24/cache/index1/shared_cpu_list", "9,24" }, + { "/sys/devices/system/cpu/cpu24/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu24/cache/index2/shared_cpu_list", "9,24" }, + { "/sys/devices/system/cpu/cpu24/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu24/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu24/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu25/cache/index0/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu25/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu25/cache/index1/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu25/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu25/cache/index2/shared_cpu_list", "10,25" }, + { "/sys/devices/system/cpu/cpu25/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu25/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu25/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu26/cache/index0/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu26/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu26/cache/index1/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu26/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu26/cache/index2/shared_cpu_list", "11,26" }, + { "/sys/devices/system/cpu/cpu26/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu26/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu26/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu27/cache/index0/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu27/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu27/cache/index1/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu27/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu27/cache/index2/shared_cpu_list", "12,27" }, + { "/sys/devices/system/cpu/cpu27/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu27/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu27/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu28/cache/index0/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu28/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu28/cache/index1/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu28/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu28/cache/index2/shared_cpu_list", "13,28" }, + { "/sys/devices/system/cpu/cpu28/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu28/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu28/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu29/cache/index0/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu29/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu29/cache/index1/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu29/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu29/cache/index2/shared_cpu_list", "14,29" }, + { "/sys/devices/system/cpu/cpu29/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu29/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu29/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu30/cache/index0/shared_cpu_list", "15,30" }, + { 
"/sys/devices/system/cpu/cpu30/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu30/cache/index1/shared_cpu_list", "15,30" }, + { "/sys/devices/system/cpu/cpu30/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu30/cache/index2/shared_cpu_list", "15,30" }, + { "/sys/devices/system/cpu/cpu30/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu30/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu30/cache/index3/type", "Unified" }, + { "/sys/devices/system/cpu/cpu31/cache/index0/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu31/cache/index0/type", "Data" }, + { "/sys/devices/system/cpu/cpu31/cache/index1/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu31/cache/index1/type", "Instruction" }, + { "/sys/devices/system/cpu/cpu31/cache/index2/shared_cpu_list", "16,31" }, + { "/sys/devices/system/cpu/cpu31/cache/index2/type", "Unified" }, + { "/sys/devices/system/cpu/cpu31/cache/index3/shared_cpu_list", "9-16,24-31"}, + { "/sys/devices/system/cpu/cpu31/cache/index3/type", "Unified" } +}; + +/// This is the expected CacheLocality structure for fakeSysfsTree +static const CacheLocality nonUniformExampleLocality = { + 32, + { 16, 16, 2 }, + { 0, 2, 4, 6, 8, 10, 11, 12, 14, 16, 18, 20, 22, 24, 26, 28, + 30, 1, 3, 5, 7, 9, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } +}; + +TEST(CacheLocality, FakeSysfs) { + auto parsed = CacheLocality::readFromSysfsTree([](std::string name) { + auto iter = fakeSysfsTree.find(name); + return iter == fakeSysfsTree.end() ? std::string() : iter->second; + }); + + auto& expected = nonUniformExampleLocality; + EXPECT_EQ(expected.numCpus, parsed.numCpus); + EXPECT_EQ(expected.numCachesByLevel, parsed.numCachesByLevel); + EXPECT_EQ(expected.localityIndexByCpu, parsed.localityIndexByCpu); +} + +TEST(Getcpu, VdsoGetcpu) { + unsigned cpu; + Getcpu::vdsoFunc()(&cpu, nullptr, nullptr); + + EXPECT_TRUE(cpu < CPU_SETSIZE); +} + +TEST(SequentialThreadId, Simple) { + unsigned cpu = 0; + auto rv = SequentialThreadId::getcpu(&cpu, nullptr, nullptr); + EXPECT_EQ(rv, 0); + EXPECT_TRUE(cpu > 0); + unsigned again; + SequentialThreadId::getcpu(&again, nullptr, nullptr); + EXPECT_EQ(cpu, again); +} + +static __thread unsigned testingCpu = 0; + +static int testingGetcpu(unsigned* cpu, unsigned* node, void* unused) { + if (cpu != nullptr) { + *cpu = testingCpu; + } + if (node != nullptr) { + *node = testingCpu; + } + return 0; +} + +TEST(AccessSpreader, Stubbed) { + std::vector>> spreaders(100); + for (size_t s = 1; s < spreaders.size(); ++s) { + spreaders[s].reset(new AccessSpreader<>( + s, nonUniformExampleLocality, &testingGetcpu)); + } + std::vector cpusInLocalityOrder = { + 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 6, 7, 22, 8, 23, 9, 24, 10, 25, + 11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31 }; + for (size_t i = 0; i < 32; ++i) { + // extra i * 32 is to check wrapping behavior of impl + testingCpu = cpusInLocalityOrder[i] + i * 64; + for (size_t s = 1; s < spreaders.size(); ++s) { + EXPECT_EQ((i * s) / 32, spreaders[s]->current()) + << "i=" << i << ", cpu=" << testingCpu << ", s=" << s; + } + } +} + +TEST(AccessSpreader, Default) { + AccessSpreader<> spreader(16); + EXPECT_LT(spreader.current(), 16); +} + +TEST(AccessSpreader, Shared) { + for (size_t s = 1; s < 200; ++s) { + EXPECT_LT(AccessSpreader<>::shared(s).current(), s); + } +} + +TEST(AccessSpreader, Statics) { + LOG(INFO) << "stripeByCore.numStripes() = " + << AccessSpreader<>::stripeByCore.numStripes(); + LOG(INFO) << 
"stripeByChip.numStripes() = " + << AccessSpreader<>::stripeByChip.numStripes(); + for (size_t s = 1; s < 200; ++s) { + EXPECT_LT(AccessSpreader<>::current(s), s); + } +} + +TEST(AccessSpreader, Wrapping) { + // this test won't pass unless locality.numCpus divides kMaxCpus + auto numCpus = 16; + auto locality = CacheLocality::uniform(numCpus); + for (size_t s = 1; s < 200; ++s) { + AccessSpreader<> spreader(s, locality, &testingGetcpu); + for (size_t c = 0; c < 400; ++c) { + testingCpu = c; + auto observed = spreader.current(); + testingCpu = c % numCpus; + auto expected = spreader.current(); + EXPECT_EQ(expected, observed) + << "numCpus=" << numCpus << ", s=" << s << ", c=" << c; + } + } +} + +// Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with +// a 2.2Ghz Xeon +// ============================================================================ +// folly/test/CacheLocalityTest.cpp relative time/iter iters/s +// ============================================================================ +// LocalAccessSpreaderUse 20.77ns 48.16M +// SharedAccessSpreaderUse 21.95ns 45.55M +// AccessSpreaderConstruction 466.56ns 2.14M +// ============================================================================ + +BENCHMARK(LocalAccessSpreaderUse, iters) { + folly::BenchmarkSuspender braces; + AccessSpreader<> spreader(16); + braces.dismiss(); + + for (unsigned long i = 0; i < iters; ++i) { + auto x = spreader.current(); + folly::doNotOptimizeAway(x); + } +} + +BENCHMARK(SharedAccessSpreaderUse, iters) { + for (unsigned long i = 0; i < iters; ++i) { + auto x = AccessSpreader<>::current(16); + folly::doNotOptimizeAway(x); + } +} + +BENCHMARK(AccessSpreaderConstruction, iters) { + std::aligned_storage), + std::alignment_of>::value>::type raw; + for (unsigned long i = 0; i < iters; ++i) { + auto x = new (&raw) AccessSpreader<>(16); + folly::doNotOptimizeAway(x); + x->~AccessSpreader(); + } +} + +enum class SpreaderType { GETCPU, SHARED, TLS_RR }; + +// Benchmark scores here reflect the time for 32 threads to perform an +// atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly, +// if we don't separate the counters onto unique 128 byte stripes the +// 1_stripe and 2_stripe results are identical, even though the L3 is +// claimed to have 64 byte cache lines. +// +// _stub means there was no call to getcpu or the tls round-robin +// implementation, because for a single stripe the cpu doesn't matter. +// _getcpu refers to the vdso getcpu implementation with a locally +// constructed AccessSpreader. _tls_rr refers to execution using +// SequentialThreadId, the fallback if the vdso getcpu isn't available. +// _shared refers to calling AccessSpreader<>::current(numStripes) +// inside the hot loop. +// +// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic, +// so since the stripe selection is 21 nanos the atomic increments in +// the L1 is ~15 nanos. At width 8_stripe_0_work the line is expected +// to ping-pong almost every operation, since the loops have the same +// duration. Widths 4 and 2 have the same behavior, but each tour of the +// cache line is 4 and 8 cores long, respectively. These all suggest a +// lower bound of 60 nanos for intra-chip handoff and increment between +// the L1s. +// +// With 455 nanos (1K cycles) of busywork per contended increment, the +// system can hide all of the latency of a tour of length 4, but not +// quite one of length 8. I was a bit surprised at how much worse the +// non-striped version got. 
It seems that the inter-chip traffic also +// interferes with the L1-only localWork.load(). When the local work is +// doubled to about 1 microsecond we see that the inter-chip contention +// is still very important, but subdivisions on the same chip don't matter. +// +// sudo nice -n -20 +// _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000 +// ============================================================================ +// folly/test/CacheLocalityTest.cpp relative time/iter iters/s +// ============================================================================ +// contentionAtWidth(1_stripe_0_work_stub) 1.14us 873.64K +// contentionAtWidth(2_stripe_0_work_getcpu) 495.58ns 2.02M +// contentionAtWidth(4_stripe_0_work_getcpu) 232.99ns 4.29M +// contentionAtWidth(8_stripe_0_work_getcpu) 101.16ns 9.88M +// contentionAtWidth(16_stripe_0_work_getcpu) 41.93ns 23.85M +// contentionAtWidth(32_stripe_0_work_getcpu) 42.04ns 23.79M +// contentionAtWidth(64_stripe_0_work_getcpu) 41.94ns 23.84M +// contentionAtWidth(2_stripe_0_work_tls_rr) 1.00us 997.41K +// contentionAtWidth(4_stripe_0_work_tls_rr) 694.41ns 1.44M +// contentionAtWidth(8_stripe_0_work_tls_rr) 590.27ns 1.69M +// contentionAtWidth(16_stripe_0_work_tls_rr) 222.13ns 4.50M +// contentionAtWidth(32_stripe_0_work_tls_rr) 169.49ns 5.90M +// contentionAtWidth(64_stripe_0_work_tls_rr) 162.20ns 6.17M +// contentionAtWidth(2_stripe_0_work_shared) 495.54ns 2.02M +// contentionAtWidth(4_stripe_0_work_shared) 236.27ns 4.23M +// contentionAtWidth(8_stripe_0_work_shared) 114.81ns 8.71M +// contentionAtWidth(16_stripe_0_work_shared) 44.65ns 22.40M +// contentionAtWidth(32_stripe_0_work_shared) 41.76ns 23.94M +// contentionAtWidth(64_stripe_0_work_shared) 43.47ns 23.00M +// atomicIncrBaseline(local_incr_0_work) 20.39ns 49.06M +// ---------------------------------------------------------------------------- +// contentionAtWidth(1_stripe_500_work_stub) 2.04us 491.13K +// contentionAtWidth(2_stripe_500_work_getcpu) 610.98ns 1.64M +// contentionAtWidth(4_stripe_500_work_getcpu) 507.72ns 1.97M +// contentionAtWidth(8_stripe_500_work_getcpu) 542.53ns 1.84M +// contentionAtWidth(16_stripe_500_work_getcpu) 496.55ns 2.01M +// contentionAtWidth(32_stripe_500_work_getcpu) 500.67ns 2.00M +// atomicIncrBaseline(local_incr_500_work) 484.69ns 2.06M +// ---------------------------------------------------------------------------- +// contentionAtWidth(1_stripe_1000_work_stub) 2.11us 473.78K +// contentionAtWidth(2_stripe_1000_work_getcpu) 970.64ns 1.03M +// contentionAtWidth(4_stripe_1000_work_getcpu) 987.31ns 1.01M +// contentionAtWidth(8_stripe_1000_work_getcpu) 1.01us 985.52K +// contentionAtWidth(16_stripe_1000_work_getcpu) 986.09ns 1.01M +// contentionAtWidth(32_stripe_1000_work_getcpu) 960.23ns 1.04M +// atomicIncrBaseline(local_incr_1000_work) 950.63ns 1.05M +// ============================================================================ +static void contentionAtWidth(size_t iters, size_t stripes, size_t work, + SpreaderType spreaderType, + size_t counterAlignment = 128, + size_t numThreads = 32) { + folly::BenchmarkSuspender braces; + + AccessSpreader<> spreader( + stripes, + CacheLocality::system(), + spreaderType == SpreaderType::TLS_RR + ? SequentialThreadId::getcpu : nullptr); + + std::atomic ready(0); + std::atomic go(false); + + // while in theory the cache line size is 64 bytes, experiments show + // that we get contention on 128 byte boundaries for Ivy Bridge. 
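// One plausible explanation is the adjacent-line prefetcher on these
// chips, which pulls in 128-byte-aligned pairs of 64-byte lines, so
// counters one line apart can still ping-pong.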
The + // extra indirection adds 1 or 2 nanos + assert(counterAlignment >= sizeof(std::atomic)); + char raw[counterAlignment * stripes]; + + // if we happen to be using the tlsRoundRobin, then sequentially + // assigning the thread identifiers is the unlikely best-case scenario. + // We don't want to unfairly benefit or penalize. Computing the exact + // maximum likelihood of the probability distributions is annoying, so + // I approximate as 2/5 of the ids that have no threads, 2/5 that have + // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by + // wrapping back to slot 0 when we hit 1/15 and 1/5. + + std::vector threads; + while (threads.size() < numThreads) { + threads.push_back(std::thread([&,iters,stripes,work]() { + std::atomic* counters[stripes]; + for (size_t i = 0; i < stripes; ++i) { + counters[i] = new (raw + counterAlignment * i) std::atomic(); + } + + spreader.current(); + ready++; + while (!go.load()) { + sched_yield(); + } + std::atomic localWork; + if (spreaderType == SpreaderType::SHARED) { + for (size_t i = iters; i > 0; --i) { + ++*(counters[AccessSpreader<>::current(stripes)]); + for (size_t j = work; j > 0; --j) { + localWork.load(); + } + } + } else { + for (size_t i = iters; i > 0; --i) { + ++*(counters[spreader.current()]); + for (size_t j = work; j > 0; --j) { + localWork.load(); + } + } + } + })); + + if (threads.size() == numThreads / 15 || + threads.size() == numThreads / 5) { + // create a few dummy threads to wrap back around to 0 mod numCpus + for (size_t i = threads.size(); i != numThreads; ++i) { + std::thread([&]() { + spreader.current(); + }).join(); + } + } + } + + while (ready < numThreads) { + sched_yield(); + } + braces.dismiss(); + go = true; + + for (auto& thr : threads) { + thr.join(); + } +} + +static void atomicIncrBaseline(size_t iters, size_t work, + size_t numThreads = 32) { + folly::BenchmarkSuspender braces; + + std::atomic go(false); + + std::vector threads; + while (threads.size() < numThreads) { + threads.push_back(std::thread([&]() { + while (!go.load()) { + sched_yield(); + } + std::atomic localCounter; + std::atomic localWork; + for (size_t i = iters; i > 0; --i) { + localCounter++; + for (size_t j = work; j > 0; --j) { + localWork.load(); + } + } + })); + } + + braces.dismiss(); + go = true; + + for (auto& thr : threads) { + thr.join(); + } +} + +BENCHMARK_DRAW_LINE() + +BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_0_work_stub, + 1, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_getcpu, + 2, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_getcpu, + 4, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_getcpu, + 8, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_getcpu, + 16, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_getcpu, + 32, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_getcpu, + 64, 0, SpreaderType::GETCPU) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_tls_rr, + 2, 0, SpreaderType::TLS_RR) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_tls_rr, + 4, 0, SpreaderType::TLS_RR) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_tls_rr, + 8, 0, SpreaderType::TLS_RR) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_tls_rr, + 16, 0, SpreaderType::TLS_RR) +BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr, + 32, 0, SpreaderType::TLS_RR) 
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
+                      64, 0, SpreaderType::TLS_RR)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
+                      2, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
+                      4, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_shared,
+                      8, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_shared,
+                      16, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_shared,
+                      32, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_shared,
+                      64, 0, SpreaderType::SHARED)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
+BENCHMARK_DRAW_LINE()
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_500_work_stub,
+                      1, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_500_work_getcpu,
+                      2, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_500_work_getcpu,
+                      4, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_500_work_getcpu,
+                      8, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_500_work_getcpu,
+                      16, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_500_work_getcpu,
+                      32, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
+BENCHMARK_DRAW_LINE()
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_1000_work_stub,
+                      1, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_1000_work_getcpu,
+                      2, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_1000_work_getcpu,
+                      4, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_1000_work_getcpu,
+                      8, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_1000_work_getcpu,
+                      16, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_1000_work_getcpu,
+                      32, 1000, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
+
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  auto ret = RUN_ALL_TESTS();
+  if (!ret && FLAGS_benchmark) {
+    folly::runBenchmarks();
+  }
+  return ret;
+}
+
diff --git a/folly/test/DeterministicSchedule.cpp b/folly/test/DeterministicSchedule.cpp
index 7f77262c..873bcf78 100644
--- a/folly/test/DeterministicSchedule.cpp
+++ b/folly/test/DeterministicSchedule.cpp
@@ -272,4 +272,39 @@ int Futex<DeterministicAtomic>::futexWake(int count, uint32_t wakeMask) {
   return rv;
 }
+
+template<>
+CacheLocality const& CacheLocality::system<test::DeterministicAtomic>() {
+  static CacheLocality cache(CacheLocality::uniform(16));
+  return cache;
+}
+
+template<>
+test::DeterministicAtomic<size_t>
+    SequentialThreadId<test::DeterministicAtomic>::prevId(0);
+
+template<>
+__thread size_t SequentialThreadId<test::DeterministicAtomic>::currentId(0);
+
+template<>
+const AccessSpreader<test::DeterministicAtomic>
+AccessSpreader<test::DeterministicAtomic>::stripeByCore(
+    CacheLocality::system<>().numCachesByLevel.front());
+
+template<>
+const AccessSpreader<test::DeterministicAtomic>
+AccessSpreader<test::DeterministicAtomic>::stripeByChip(
+    CacheLocality::system<>().numCachesByLevel.back());
+
+template<>
+AccessSpreaderArray<test::DeterministicAtomic,128>
+AccessSpreaderArray<test::DeterministicAtomic,128>::sharedInstance = {};
+
+
+template<>
+Getcpu::Func
+AccessSpreader<test::DeterministicAtomic>::pickGetcpuFunc(size_t numStripes) {
+  return &SequentialThreadId<test::DeterministicAtomic>::getcpu;
+}
+
 }}
diff --git a/folly/test/DeterministicSchedule.h b/folly/test/DeterministicSchedule.h
index 3e7ccb3a..71cb1037 100644
--- a/folly/test/DeterministicSchedule.h
+++ b/folly/test/DeterministicSchedule.h
@@ -27,6 +27,7 @@
 #include
 #include
+#include "folly/detail/CacheLocality.h"
 #include

 namespace folly { namespace test {
-- 
2.34.1