/*
- * Copyright 2014 Facebook, Inc.
+ * Copyright 2016 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* limitations under the License.
*/
-#ifndef FOLLY_DETAIL_CACHELOCALITY_H_
-#define FOLLY_DETAIL_CACHELOCALITY_H_
+#pragma once
#include <sched.h>
+#include <algorithm>
#include <atomic>
#include <cassert>
+#include <cstring>
#include <functional>
#include <string>
#include <type_traits>
#include <vector>
+#include <pthread.h>
+#include <folly/Hash.h>
#include <folly/Likely.h>
#include <folly/Portability.h>
-namespace folly { namespace detail {
+namespace folly {
+namespace detail {
// This file contains several classes that might be useful if you are
// trying to dynamically optimize cache locality: CacheLocality reads
/// cache and cpus with a locality index >= 16 will share the other.
std::vector<size_t> localityIndexByCpu;
-
/// Returns the best CacheLocality information available for the current
/// system, cached for fast access. This will be loaded from sysfs if
/// possible, otherwise it will be correct in the number of CPUs but
/// that transitively uses it, all components select between the default
/// sysfs implementation and a deterministic implementation by keying
/// off the type of the underlying atomic. See DeterministicScheduler.
- template <template<typename> class Atom = std::atomic>
+ template <template <typename> class Atom = std::atomic>
static const CacheLocality& system();
-
/// Reads CacheLocality information from a tree structured like
/// the sysfs filesystem. The provided function will be evaluated
/// for each sysfs file that needs to be queried. The function
kFalseSharingRange = 128
};
- static_assert(kFalseSharingRange == 128,
+ static_assert(
+ kFalseSharingRange == 128,
"FOLLY_ALIGN_TO_AVOID_FALSE_SHARING should track kFalseSharingRange");
};
/// An attribute that will cause a variable or field to be aligned so that
/// it doesn't have false sharing with anything at a smaller memory address.
-#define FOLLY_ALIGN_TO_AVOID_FALSE_SHARING __attribute__((__aligned__(128)))
+#define FOLLY_ALIGN_TO_AVOID_FALSE_SHARING FOLLY_ALIGNED(128)
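// A minimal usage sketch (illustrative only; "ExamplePaddedCounter" is a
// hypothetical name, not part of folly): aligning a per-stripe counter so
// that writers on adjacent stripes do not share a cache line.
struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING ExamplePaddedCounter {
  std::atomic<size_t> value{0};
};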
-/// Holds a function pointer to the VDSO implementation of getcpu(2),
-/// if available
+/// Knows how to derive a function pointer to the VDSO implementation of
+/// getcpu(2), if available
struct Getcpu {
/// Function pointer to a function with the same signature as getcpu(2).
typedef int (*Func)(unsigned* cpu, unsigned* node, void* unused);
/// Returns a pointer to the VDSO implementation of getcpu(2), if
- /// available, or nullptr otherwise
- static Func vdsoFunc();
+ /// available, or nullptr otherwise. This function may be quite
+ /// expensive; be sure to cache the result.
+ static Func resolveVdsoFunc();
};
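// A minimal sketch of the intended usage (the helper name is hypothetical,
// not declared by this header): resolve the VDSO symbol once, cache the
// result, and tolerate a null pointer when the symbol is unavailable.
inline unsigned exampleQueryCurrentCpu() {
  static const Getcpu::Func vdso = Getcpu::resolveVdsoFunc();
  unsigned cpu = 0;
  if (vdso != nullptr) {
    vdso(&cpu, nullptr, nullptr); // fills in the calling thread's current cpu
  }
  return cpu;
}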
-/// A class that lazily binds a unique (for each implementation of Atom)
-/// identifier to a thread. This is a fallback mechanism for the access
-/// spreader if we are in testing (using DeterministicAtomic) or if
-/// __vdso_getcpu can't be dynamically loaded
-template <template<typename> class Atom>
+#ifdef FOLLY_TLS
+template <template <typename> class Atom>
struct SequentialThreadId {
/// Returns the thread id assigned to the current thread
  static size_t get() {
    auto rv = currentId;
    if (UNLIKELY(rv == 0)) {
      rv = currentId = ++prevId;
    }
    return rv;
}
+ private:
+ static Atom<size_t> prevId;
+
+ static FOLLY_TLS size_t currentId;
+};
+#endif
+
+struct HashingThreadId {
+ static size_t get() {
+ pthread_t pid = pthread_self();
+ uint64_t id = 0;
+ memcpy(&id, &pid, std::min(sizeof(pid), sizeof(id)));
+ return hash::twang_32from64(id);
+ }
+};
+
+/// A class that lazily binds a unique (for each implementation of Atom)
+/// identifier to a thread. This is a fallback mechanism for the access
+/// spreader if __vdso_getcpu can't be loaded
+template <typename ThreadId>
+struct FallbackGetcpu {
/// Fills the thread id into the cpu and node out params (if they
/// are non-null). This method is intended to act like getcpu when a
/// fast-enough form of getcpu isn't available or isn't desired
- static int getcpu(unsigned* cpu, unsigned* node, void* unused) {
- auto id = get();
+ static int getcpu(unsigned* cpu, unsigned* node, void* /* unused */) {
+ auto id = ThreadId::get();
if (cpu) {
*cpu = id;
}
    if (node) {
      *node = id;
    }
return 0;
}
-
- private:
- static Atom<size_t> prevId;
-
- static FOLLY_TLS size_t currentId;
};
-template <template<typename> class Atom, size_t kMaxCpus>
-struct AccessSpreaderArray;
+#ifdef FOLLY_TLS
+typedef FallbackGetcpu<SequentialThreadId<std::atomic>> FallbackGetcpuType;
+#else
+typedef FallbackGetcpu<HashingThreadId> FallbackGetcpuType;
+#endif
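// A plausible sketch of how the two mechanisms compose (the function name is
// hypothetical; the real selection logic is not shown in this diff): prefer
// the VDSO getcpu and substitute the fallback, which has the same signature,
// when the VDSO symbol can't be resolved.
inline Getcpu::Func exampleChooseGetcpuImpl() {
  Getcpu::Func best = Getcpu::resolveVdsoFunc();
  return best != nullptr ? best : &FallbackGetcpuType::getcpu;
}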
/// AccessSpreader arranges access to a striped data structure in such a
/// way that concurrently executing threads are likely to be accessing
/// different stripes. It does NOT guarantee uncontended access.
/// Your underlying algorithm must be thread-safe without spreading, this
/// is merely an optimization. AccessSpreader::current(n) is typically
-/// much faster than a cache miss (22 nanos on my dev box, tested fast
+/// much faster than a cache miss (12 nanos on my dev box, tested fast
/// in both 2.6 and 3.2 kernels).
///
-/// You are free to create your own AccessSpreader-s or to cache the
-/// results of AccessSpreader<>::shared(n), but you will probably want
-/// to use one of the system-wide shared ones. Calling .current() on
-/// a particular AccessSpreader instance only saves about 1 nanosecond
-/// over calling AccessSpreader<>::shared(n).
-///
/// If available (and not using the deterministic testing implementation)
/// AccessSpreader uses the getcpu system call via VDSO and the
/// precise locality information retrieved from sysfs by CacheLocality.
/// own stripe and there will be no cache sharing at all.
///
/// AccessSpreader has a fallback mechanism for when __vdso_getcpu can't be
-/// loaded, or for use during deterministic testing. Using sched_getcpu or
-/// the getcpu syscall would negate the performance advantages of access
-/// spreading, so we use a thread-local value and a shared atomic counter
-/// to spread access out.
+/// loaded, or for use during deterministic testing. Using sched_getcpu
+/// or the getcpu syscall would negate the performance advantages of
+/// access spreading, so we use a thread-local value and a shared atomic
+/// counter to spread access out. On systems lacking both a fast getcpu()
+/// and TLS, we hash the thread id to spread accesses.
///
/// AccessSpreader is templated on the template type that is used
/// to implement atomics, as a way to instantiate the underlying
/// heuristics differently for production use and for deterministic unit
/// testing. See DeterministicScheduler for more. If you aren't using
/// DeterministicScheduler, you can just use the default template parameter
/// all of the time.
-template <template<typename> class Atom = std::atomic>
+template <template <typename> class Atom = std::atomic>
struct AccessSpreader {
- /// Returns a never-destructed shared AccessSpreader instance.
- /// numStripes should be > 0.
- static const AccessSpreader& shared(size_t numStripes) {
- // sharedInstances[0] actually has numStripes == 1
- assert(numStripes > 0);
-
- // the last shared element handles all large sizes
- return AccessSpreaderArray<Atom,kMaxCpus>::sharedInstance[
- std::min(size_t(kMaxCpus), numStripes)];
- }
-
- /// Returns the stripe associated with the current CPU, assuming
- /// that there are numStripes (non-zero) stripes. Equivalent to
- /// AccessSpreader::shared(numStripes)->current.
+ /// Returns the stripe associated with the current CPU. The returned
+ /// value will be < numStripes.
static size_t current(size_t numStripes) {
- return shared(numStripes).current();
- }
-
- /// stripeByCore uses 1 stripe per L1 cache, according to
- /// CacheLocality::system<>(). Use stripeByCore.numStripes() to see
- /// its width, or stripeByCore.current() to get the current stripe
- static const AccessSpreader stripeByCore;
-
- /// stripeByChip uses 1 stripe per last-level cache, which is the fewest
- /// number of stripes for which off-chip communication can be avoided
- /// (assuming all caches are on-chip). Use stripeByChip.numStripes()
- /// to see its width, or stripeByChip.current() to get the current stripe
- static const AccessSpreader stripeByChip;
-
-
- /// Constructs an AccessSpreader that will return values from
- /// 0 to numStripes-1 (inclusive), precomputing the mapping
- /// from CPU to stripe. There is no use in having more than
- /// CacheLocality::system<Atom>().localityIndexByCpu.size() stripes or
- /// kMaxCpus stripes
- explicit AccessSpreader(size_t spreaderNumStripes,
- const CacheLocality& cacheLocality =
- CacheLocality::system<Atom>(),
- Getcpu::Func getcpuFunc = nullptr)
- : getcpuFunc_(getcpuFunc ? getcpuFunc : pickGetcpuFunc(spreaderNumStripes))
- , numStripes_(spreaderNumStripes)
- {
- auto n = cacheLocality.numCpus;
- for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
- auto index = cacheLocality.localityIndexByCpu[cpu];
- assert(index < n);
- // as index goes from 0..n, post-transform value goes from
- // 0..numStripes
- stripeByCpu[cpu] = (index * numStripes_) / n;
- assert(stripeByCpu[cpu] < numStripes_);
- }
- for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
- stripeByCpu[cpu] = stripeByCpu[cpu - n];
- }
- }
-
- /// Returns 1 more than the maximum value that can be returned from
- /// current()
- size_t numStripes() const {
- return numStripes_;
- }
+ // widthAndCpuToStripe[0] will actually work okay (all zeros), but
+ // something's wrong with the caller
+ assert(numStripes > 0);
- /// Returns the stripe associated with the current CPU
- size_t current() const {
unsigned cpu;
- getcpuFunc_(&cpu, nullptr, nullptr);
- return stripeByCpu[cpu % kMaxCpus];
+ getcpuFunc(&cpu, nullptr, nullptr);
+ return widthAndCpuToStripe[std::min(size_t(kMaxCpus),
+ numStripes)][cpu % kMaxCpus];
}
private:
-
/// If there are more cpus than this, nothing will crash, but there
/// might be unnecessary sharing
enum { kMaxCpus = 128 };
typedef uint8_t CompactStripe;
static_assert((kMaxCpus & (kMaxCpus - 1)) == 0,
- "kMaxCpus should be a power of two so modulo is fast");
+ "kMaxCpus should be a power of two so modulo is fast");
static_assert(kMaxCpus - 1 <= std::numeric_limits<CompactStripe>::max(),
- "stripeByCpu element type isn't wide enough");
-
+ "stripeByCpu element type isn't wide enough");
/// Points to the getcpu-like function we are using to obtain the
/// current cpu. It should not be assumed that the returned cpu value
- /// is in range. We use a member for this instead of a static so that
- /// this fetch preloads a prefix the stripeByCpu array
- Getcpu::Func getcpuFunc_;
-
- /// A precomputed map from cpu to stripe. Rather than add a layer of
- /// indirection requiring a dynamic bounds check and another cache miss,
- /// we always precompute the whole array
- CompactStripe stripeByCpu[kMaxCpus];
-
- size_t numStripes_;
-
- /// Returns the best getcpu implementation for this type and width
- /// of AccessSpreader
- static Getcpu::Func pickGetcpuFunc(size_t numStripes);
-};
-
-template<>
-Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t);
-
-
-/// An array of kMaxCpus+1 AccessSpreader<Atom> instances constructed
-/// with default params, with the zero-th element having 1 stripe
-template <template<typename> class Atom, size_t kMaxStripe>
-struct AccessSpreaderArray {
-
- AccessSpreaderArray() {
- for (size_t i = 0; i <= kMaxStripe; ++i) {
- new (raw + i) AccessSpreader<Atom>(std::max(size_t(1), i));
+ /// is in range. We use a static for this so that we can prearrange a
+ /// valid value in the pre-constructed state and avoid the need for a
+ /// conditional on every subsequent invocation (not normally a big win,
+ /// but 20% on some inner loops here).
+ static Getcpu::Func getcpuFunc;
+
+ /// For each level of splitting up to kMaxCpus, maps the cpu (mod
+ /// kMaxCpus) to the stripe. Rather than performing any inequalities
+ /// or modulo on the actual number of cpus, we just fill in the entire
+ /// array.
+ static CompactStripe widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus];
+
+ static bool initialized;
+
+ /// Returns the best getcpu implementation for Atom
+ static Getcpu::Func pickGetcpuFunc();
+
+ /// Always claims to be on CPU zero, node zero
+ static int degenerateGetcpu(unsigned* cpu, unsigned* node, void*) {
+ if (cpu != nullptr) {
+ *cpu = 0;
}
- }
-
- ~AccessSpreaderArray() {
- for (size_t i = 0; i <= kMaxStripe; ++i) {
- auto p = static_cast<AccessSpreader<Atom>*>(static_cast<void*>(raw + i));
- p->~AccessSpreader();
+ if (node != nullptr) {
+ *node = 0;
}
+ return 0;
}
- AccessSpreader<Atom> const& operator[] (size_t index) const {
- return *static_cast<AccessSpreader<Atom> const*>(
- static_cast<void const*>(raw + index));
+ // The function to call for fast lookup of getcpu is a singleton, as
+ // is the precomputed table of locality information. AccessSpreader
+ // is used in very tight loops, however (we're trying to race an L1
+ // cache miss!), so the normal singleton mechanisms are noticeably
+ // expensive. Even a not-taken branch guarding access to getcpuFunc
+ // slows AccessSpreader::current from 12 nanos to 14. As a result, we
+ // populate the static members with simple (but valid) values that can
+ // be filled in by the linker, and then follow up with a normal static
+ // initializer call that puts in the proper version. This means that
+ // when there are initialization order issues we will just observe a
+ // zero stripe. Once a sanitizer gets smart enough to detect this as
+ // a race or undefined behavior, we can annotate it.
+
+ static bool initialize() {
+ getcpuFunc = pickGetcpuFunc();
+
+ auto& cacheLocality = CacheLocality::system<Atom>();
+ auto n = cacheLocality.numCpus;
+ for (size_t width = 0; width <= kMaxCpus; ++width) {
+ auto numStripes = std::max(size_t{1}, width);
+ for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
+ auto index = cacheLocality.localityIndexByCpu[cpu];
+ assert(index < n);
+ // as index goes from 0..n, post-transform value goes from
+ // 0..numStripes
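+ // (e.g. with n = 8 cpus and numStripes = 4, locality indices 0..7
+ // map to stripes 0, 0, 1, 1, 2, 2, 3, 3)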
+ widthAndCpuToStripe[width][cpu] = (index * numStripes) / n;
+ assert(widthAndCpuToStripe[width][cpu] < numStripes);
+ }
+ for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
+ widthAndCpuToStripe[width][cpu] = widthAndCpuToStripe[width][cpu - n];
+ }
+ }
+ return true;
}
-
- private:
-
- // AccessSpreader uses sharedInstance
- friend AccessSpreader<Atom>;
-
- static AccessSpreaderArray<Atom,kMaxStripe> sharedInstance;
-
-
- /// aligned_storage is uninitialized, we use placement new since there
- /// is no AccessSpreader default constructor
- typename std::aligned_storage<sizeof(AccessSpreader<Atom>),
- CacheLocality::kFalseSharingRange>::type
- raw[kMaxStripe + 1];
};
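// A usage sketch (hypothetical type, not part of folly): a striped counter
// where each increment lands on the stripe suggested by AccessSpreader.
// current() is only a locality hint, so reads must still sum every stripe
// and each stripe must be thread-safe on its own.
struct ExampleStripedCounter {
  enum { kStripes = 8 }; // more stripes than cpus adds no benefit

  void increment() {
    auto stripe = AccessSpreader<>::current(kStripes);
    slots_[stripe].value.fetch_add(1, std::memory_order_relaxed);
  }

  size_t read() const {
    size_t sum = 0;
    for (const Slot& s : slots_) {
      sum += s.value.load(std::memory_order_relaxed);
    }
    return sum;
  }

 private:
  struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING Slot {
    std::atomic<size_t> value{0};
  };
  Slot slots_[kStripes];
};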
-} }
+template <>
+Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc();
+
+#define DECLARE_ACCESS_SPREADER_TYPE(Atom) \
+ namespace folly { \
+ namespace detail { \
+ template <> \
+ Getcpu::Func AccessSpreader<Atom>::getcpuFunc = \
+ AccessSpreader<Atom>::degenerateGetcpu; \
+ template <> \
+ typename AccessSpreader<Atom>::CompactStripe \
+ AccessSpreader<Atom>::widthAndCpuToStripe[129][128] = {}; \
+ template <> \
+ bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize(); \
+ } \
+ }
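// Each Atom type used with AccessSpreader needs this macro to be invoked
// exactly once, in one translation unit and at global scope (the macro opens
// and closes namespace folly::detail itself), e.g. (illustrative):
//
//   DECLARE_ACCESS_SPREADER_TYPE(std::atomic)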
-#endif /* FOLLY_DETAIL_CacheLocality_H_ */
+} // namespace detail
+} // namespace folly