/*
- * Copyright 2014 Facebook, Inc.
+ * Copyright 2016 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* limitations under the License.
*/
-#ifndef FOLLY_DETAIL_CACHELOCALITY_H_
-#define FOLLY_DETAIL_CACHELOCALITY_H_
+#pragma once
#include <sched.h>
+#include <algorithm>
#include <atomic>
#include <cassert>
+#include <cstring>
#include <functional>
#include <string>
#include <type_traits>
#include <vector>
+#include <pthread.h>
+#include <folly/Hash.h>
#include <folly/Likely.h>
#include <folly/Portability.h>
-namespace folly { namespace detail {
+namespace folly {
+namespace detail {
// This file contains several classes that might be useful if you are
// trying to dynamically optimize cache locality: CacheLocality reads
/// cache and cpus with a locality index >= 16 will share the other.
std::vector<size_t> localityIndexByCpu;
-
/// Returns the best CacheLocality information available for the current
/// system, cached for fast access. This will be loaded from sysfs if
/// possible, otherwise it will be correct in the number of CPUs but
/// that transitively uses it, all components select between the default
/// sysfs implementation and a deterministic implementation by keying
/// off the type of the underlying atomic. See DeterministicScheduler.
- template <template<typename> class Atom = std::atomic>
+ template <template <typename> class Atom = std::atomic>
static const CacheLocality& system();
-
/// Reads CacheLocality information from a tree structured like
/// the sysfs filesystem. The provided function will be evaluated
/// for each sysfs file that needs to be queried. The function
kFalseSharingRange = 128
};
- static_assert(kFalseSharingRange == 128,
+ static_assert(
+ kFalseSharingRange == 128,
"FOLLY_ALIGN_TO_AVOID_FALSE_SHARING should track kFalseSharingRange");
};
/// An attribute that will cause a variable or field to be aligned so that
/// it doesn't have false sharing with anything at a smaller memory address.
-#define FOLLY_ALIGN_TO_AVOID_FALSE_SHARING __attribute__((__aligned__(128)))
+#define FOLLY_ALIGN_TO_AVOID_FALSE_SHARING FOLLY_ALIGNED(128)
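// A minimal usage sketch (illustrative only; "ExamplePaddedCounter" is a
// hypothetical name, not part of folly): aligning a per-stripe counter so
// that writers on adjacent stripes do not share a cache line.
struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING ExamplePaddedCounter {
  std::atomic<size_t> value{0};
};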
-/// Holds a function pointer to the VDSO implementation of getcpu(2),
-/// if available
+/// Knows how to derive a function pointer to the VDSO implementation of
+/// getcpu(2), if available
struct Getcpu {
/// Function pointer to a function with the same signature as getcpu(2).
typedef int (*Func)(unsigned* cpu, unsigned* node, void* unused);
/// Returns a pointer to the VDSO implementation of getcpu(2), if
- /// available, or nullptr otherwise
- static Func vdsoFunc();
+ /// available, or nullptr otherwise. This function may be quite
+ /// expensive; be sure to cache the result.
+ static Func resolveVdsoFunc();
};
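// A minimal sketch of the intended usage (the helper name is hypothetical,
// not declared by this header): resolve the VDSO symbol once, cache the
// result, and tolerate a null pointer when the symbol is unavailable.
inline unsigned exampleQueryCurrentCpu() {
  static const Getcpu::Func vdso = Getcpu::resolveVdsoFunc();
  unsigned cpu = 0;
  if (vdso != nullptr) {
    vdso(&cpu, nullptr, nullptr); // fills in the calling thread's current cpu
  }
  return cpu;
}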
-/// A class that lazily binds a unique (for each implementation of Atom)
-/// identifier to a thread. This is a fallback mechanism for the access
-/// spreader if we are in testing (using DeterministicAtomic) or if
-/// __vdso_getcpu can't be dynamically loaded
-template <template<typename> class Atom>
+#ifdef FOLLY_TLS
+template <template <typename> class Atom>
struct SequentialThreadId {
/// Returns the thread id assigned to the current thread
  static size_t get() {
    auto rv = currentId;
    if (UNLIKELY(rv == 0)) {
      rv = currentId = ++prevId;
    }
    return rv;
}
+ private:
+ static Atom<size_t> prevId;
+
+ static FOLLY_TLS size_t currentId;
+};
+#endif
+
+struct HashingThreadId {
+ static size_t get() {
+ pthread_t pid = pthread_self();
+ uint64_t id = 0;
+ memcpy(&id, &pid, std::min(sizeof(pid), sizeof(id)));
+ return hash::twang_32from64(id);
+ }
+};
+
+/// A class that lazily binds a unique (for each implementation of Atom)
+/// identifier to a thread. This is a fallback mechanism for the access
+/// spreader if __vdso_getcpu can't be loaded
+template <typename ThreadId>
+struct FallbackGetcpu {
/// Fills the thread id into the cpu and node out params (if they
/// are non-null). This method is intended to act like getcpu when a
/// fast-enough form of getcpu isn't available or isn't desired
- static int getcpu(unsigned* cpu, unsigned* node, void* unused) {
- auto id = get();
+ static int getcpu(unsigned* cpu, unsigned* node, void* /* unused */) {
+ auto id = ThreadId::get();
if (cpu) {
*cpu = id;
}
    if (node) {
      *node = id;
    }
return 0;
}
-
- private:
- static Atom<size_t> prevId;
-
- static FOLLY_TLS size_t currentId;
};
-template <template<typename> class Atom, size_t kMaxCpus>
-struct AccessSpreaderArray;
+#ifdef FOLLY_TLS
+typedef FallbackGetcpu<SequentialThreadId<std::atomic>> FallbackGetcpuType;
+#else
+typedef FallbackGetcpu<HashingThreadId> FallbackGetcpuType;
+#endif
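// A plausible sketch of how the two mechanisms compose (the function name is
// hypothetical; the real selection logic is not shown in this diff): prefer
// the VDSO getcpu and substitute the fallback, which has the same signature,
// when the VDSO symbol can't be resolved.
inline Getcpu::Func exampleChooseGetcpuImpl() {
  Getcpu::Func best = Getcpu::resolveVdsoFunc();
  return best != nullptr ? best : &FallbackGetcpuType::getcpu;
}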
/// AccessSpreader arranges access to a striped data structure in such a
/// way that concurrently executing threads are likely to be accessing
/// different stripes. It does NOT guarantee uncontended access.
/// Your underlying algorithm must be thread-safe without spreading, this
/// is merely an optimization. AccessSpreader::current(n) is typically
-/// much faster than a cache miss (22 nanos on my dev box, tested fast
+/// much faster than a cache miss (12 nanos on my dev box, tested fast
/// in both 2.6 and 3.2 kernels).
///
-/// You are free to create your own AccessSpreader-s or to cache the
-/// results of AccessSpreader<>::shared(n), but you will probably want
-/// to use one of the system-wide shared ones. Calling .current() on
-/// a particular AccessSpreader instance only saves about 1 nanosecond
-/// over calling AccessSpreader<>::shared(n).
-///
/// If available (and not using the deterministic testing implementation)
/// AccessSpreader uses the getcpu system call via VDSO and the
/// precise locality information retrieved from sysfs by CacheLocality.
/// own stripe and there will be no cache sharing at all.
///
/// AccessSpreader has a fallback mechanism for when __vdso_getcpu can't be
-/// loaded, or for use during deterministic testing. Using sched_getcpu or
-/// the getcpu syscall would negate the performance advantages of access
-/// spreading, so we use a thread-local value and a shared atomic counter
-/// to spread access out.
+/// loaded, or for use during deterministic testing. Using sched_getcpu
+/// or the getcpu syscall would negate the performance advantages of
+/// access spreading, so we use a thread-local value and a shared atomic
+/// counter to spread access out. On systems lacking both a fast getcpu()
+/// and TLS, we hash the thread id to spread accesses.
///
/// AccessSpreader is templated on the template type that is used
/// to implement atomics, as a way to instantiate the underlying
/// heuristics differently for production use and for deterministic unit
/// testing. See DeterministicScheduler for more. If you aren't using
/// DeterministicScheduler, you can just use the default template parameter
/// all of the time.
-template <template<typename> class Atom = std::atomic>
+template <template <typename> class Atom = std::atomic>
struct AccessSpreader {
- /// Returns a never-destructed shared AccessSpreader instance.
- /// numStripes should be > 0.
- static const AccessSpreader& shared(size_t numStripes) {
- // sharedInstances[0] actually has numStripes == 1
- assert(numStripes > 0);
-
- // the last shared element handles all large sizes
- return AccessSpreaderArray<Atom,kMaxCpus>::sharedInstance[
- std::min(size_t(kMaxCpus), numStripes)];
- }
-
- /// Returns the stripe associated with the current CPU, assuming
- /// that there are numStripes (non-zero) stripes. Equivalent to
- /// AccessSpreader::shared(numStripes)->current.
+ /// Returns the stripe associated with the current CPU. The returned
+ /// value will be < numStripes.
static size_t current(size_t numStripes) {
- return shared(numStripes).current();
- }
-
- /// stripeByCore uses 1 stripe per L1 cache, according to
- /// CacheLocality::system<>(). Use stripeByCore.numStripes() to see
- /// its width, or stripeByCore.current() to get the current stripe
- static const AccessSpreader stripeByCore;
-
- /// stripeByChip uses 1 stripe per last-level cache, which is the fewest
- /// number of stripes for which off-chip communication can be avoided
- /// (assuming all caches are on-chip). Use stripeByChip.numStripes()
- /// to see its width, or stripeByChip.current() to get the current stripe
- static const AccessSpreader stripeByChip;
-
-
- /// Constructs an AccessSpreader that will return values from
- /// 0 to numStripes-1 (inclusive), precomputing the mapping
- /// from CPU to stripe. There is no use in having more than
- /// CacheLocality::system<Atom>().localityIndexByCpu.size() stripes or
- /// kMaxCpus stripes
- explicit AccessSpreader(size_t spreaderNumStripes,
- const CacheLocality& cacheLocality =
- CacheLocality::system<Atom>(),
- Getcpu::Func getcpuFunc = nullptr)
- : getcpuFunc_(getcpuFunc ? getcpuFunc : pickGetcpuFunc(spreaderNumStripes))
- , numStripes_(spreaderNumStripes)
- {
- auto n = cacheLocality.numCpus;
- for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
- auto index = cacheLocality.localityIndexByCpu[cpu];
- assert(index < n);
- // as index goes from 0..n, post-transform value goes from
- // 0..numStripes
- stripeByCpu[cpu] = (index * numStripes_) / n;
- assert(stripeByCpu[cpu] < numStripes_);
- }
- for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
- stripeByCpu[cpu] = stripeByCpu[cpu - n];
- }
- }
-
- /// Returns 1 more than the maximum value that can be returned from
- /// current()
- size_t numStripes() const {
- return numStripes_;
- }
+ // widthAndCpuToStripe[0] will actually work okay (all zeros), but
+ // something's wrong with the caller
+ assert(numStripes > 0);
- /// Returns the stripe associated with the current CPU
- size_t current() const {
unsigned cpu;
- getcpuFunc_(&cpu, nullptr, nullptr);
- return stripeByCpu[cpu % kMaxCpus];
+ getcpuFunc(&cpu, nullptr, nullptr);
+ return widthAndCpuToStripe[std::min(size_t(kMaxCpus),
+ numStripes)][cpu % kMaxCpus];
}
private:
-
/// If there are more cpus than this, nothing will crash, but there
/// might be unnecessary sharing
enum { kMaxCpus = 128 };
typedef uint8_t CompactStripe;
static_assert((kMaxCpus & (kMaxCpus - 1)) == 0,
- "kMaxCpus should be a power of two so modulo is fast");
+ "kMaxCpus should be a power of two so modulo is fast");
static_assert(kMaxCpus - 1 <= std::numeric_limits<CompactStripe>::max(),
- "stripeByCpu element type isn't wide enough");
-
+ "stripeByCpu element type isn't wide enough");
/// Points to the getcpu-like function we are using to obtain the
/// current cpu. It should not be assumed that the returned cpu value
- /// is in range. We use a member for this instead of a static so that
- /// this fetch preloads a prefix the stripeByCpu array
- Getcpu::Func getcpuFunc_;
-
- /// A precomputed map from cpu to stripe. Rather than add a layer of
- /// indirection requiring a dynamic bounds check and another cache miss,
- /// we always precompute the whole array
- CompactStripe stripeByCpu[kMaxCpus];
-
- size_t numStripes_;
-
- /// Returns the best getcpu implementation for this type and width
- /// of AccessSpreader
- static Getcpu::Func pickGetcpuFunc(size_t numStripes);
-};
-
-template<>
-Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t);
-
-
-/// An array of kMaxCpus+1 AccessSpreader<Atom> instances constructed
-/// with default params, with the zero-th element having 1 stripe
-template <template<typename> class Atom, size_t kMaxStripe>
-struct AccessSpreaderArray {
-
- AccessSpreaderArray() {
- for (size_t i = 0; i <= kMaxStripe; ++i) {
- new (raw + i) AccessSpreader<Atom>(std::max(size_t(1), i));
+ /// is in range. We use a static for this so that we can prearrange a
+ /// valid value in the pre-constructed state and avoid the need for a
+ /// conditional on every subsequent invocation (not normally a big win,
+ /// but 20% on some inner loops here).
+ static Getcpu::Func getcpuFunc;
+
+ /// For each level of splitting up to kMaxCpus, maps the cpu (mod
+ /// kMaxCpus) to the stripe. Rather than performing any inequalities
+ /// or modulo on the actual number of cpus, we just fill in the entire
+ /// array.
+ static CompactStripe widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus];
+
+ static bool initialized;
+
+ /// Returns the best getcpu implementation for Atom
+ static Getcpu::Func pickGetcpuFunc();
+
+ /// Always claims to be on CPU zero, node zero
+ static int degenerateGetcpu(unsigned* cpu, unsigned* node, void*) {
+ if (cpu != nullptr) {
+ *cpu = 0;
}
- }
-
- ~AccessSpreaderArray() {
- for (size_t i = 0; i <= kMaxStripe; ++i) {
- auto p = static_cast<AccessSpreader<Atom>*>(static_cast<void*>(raw + i));
- p->~AccessSpreader();
+ if (node != nullptr) {
+ *node = 0;
}
+ return 0;
}
- AccessSpreader<Atom> const& operator[] (size_t index) const {
- return *static_cast<AccessSpreader<Atom> const*>(
- static_cast<void const*>(raw + index));
+ // The function to call for fast lookup of getcpu is a singleton, as
+ // is the precomputed table of locality information. AccessSpreader
+ // is used in very tight loops, however (we're trying to race an L1
+ // cache miss!), so the normal singleton mechanisms are noticeably
+ // expensive. Even a not-taken branch guarding access to getcpuFunc
+ // slows AccessSpreader::current from 12 nanos to 14. As a result, we
+ // populate the static members with simple (but valid) values that can
+ // be filled in by the linker, and then follow up with a normal static
+ // initializer call that puts in the proper version. This means that
+ // when there are initialization order issues we will just observe a
+ // zero stripe. Once a sanitizer gets smart enough to detect this as
+ // a race or undefined behavior, we can annotate it.
+
+ static bool initialize() {
+ getcpuFunc = pickGetcpuFunc();
+
+ auto& cacheLocality = CacheLocality::system<Atom>();
+ auto n = cacheLocality.numCpus;
+ for (size_t width = 0; width <= kMaxCpus; ++width) {
+ auto numStripes = std::max(size_t{1}, width);
+ for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
+ auto index = cacheLocality.localityIndexByCpu[cpu];
+ assert(index < n);
+ // as index goes from 0..n, post-transform value goes from
+ // 0..numStripes
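+ // (e.g. with n = 8 cpus and numStripes = 4, locality indices 0..7
+ // map to stripes 0, 0, 1, 1, 2, 2, 3, 3)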
+ widthAndCpuToStripe[width][cpu] = (index * numStripes) / n;
+ assert(widthAndCpuToStripe[width][cpu] < numStripes);
+ }
+ for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
+ widthAndCpuToStripe[width][cpu] = widthAndCpuToStripe[width][cpu - n];
+ }
+ }
+ return true;
}
-
- private:
-
- // AccessSpreader uses sharedInstance
- friend AccessSpreader<Atom>;
-
- static AccessSpreaderArray<Atom,kMaxStripe> sharedInstance;
-
-
- /// aligned_storage is uninitialized, we use placement new since there
- /// is no AccessSpreader default constructor
- typename std::aligned_storage<sizeof(AccessSpreader<Atom>),
- CacheLocality::kFalseSharingRange>::type
- raw[kMaxStripe + 1];
};
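// A usage sketch (hypothetical type, not part of folly): a striped counter
// where each increment lands on the stripe suggested by AccessSpreader.
// current() is only a locality hint, so reads must still sum every stripe
// and each stripe must be thread-safe on its own.
struct ExampleStripedCounter {
  enum { kStripes = 8 }; // more stripes than cpus adds no benefit

  void increment() {
    auto stripe = AccessSpreader<>::current(kStripes);
    slots_[stripe].value.fetch_add(1, std::memory_order_relaxed);
  }

  size_t read() const {
    size_t sum = 0;
    for (const Slot& s : slots_) {
      sum += s.value.load(std::memory_order_relaxed);
    }
    return sum;
  }

 private:
  struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING Slot {
    std::atomic<size_t> value{0};
  };
  Slot slots_[kStripes];
};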
-} }
+template <>
+Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc();
+
+#define DECLARE_ACCESS_SPREADER_TYPE(Atom) \
+ namespace folly { \
+ namespace detail { \
+ template <> \
+ Getcpu::Func AccessSpreader<Atom>::getcpuFunc = \
+ AccessSpreader<Atom>::degenerateGetcpu; \
+ template <> \
+ typename AccessSpreader<Atom>::CompactStripe \
+ AccessSpreader<Atom>::widthAndCpuToStripe[129][128] = {}; \
+ template <> \
+ bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize(); \
+ } \
+ }
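// Each Atom type used with AccessSpreader needs this macro to be invoked
// exactly once, in one translation unit and at global scope (the macro opens
// and closes namespace folly::detail itself), e.g. (illustrative):
//
//   DECLARE_ACCESS_SPREADER_TYPE(std::atomic)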
-#endif /* FOLLY_DETAIL_CacheLocality_H_ */
+} // namespace detail
+} // namespace folly