Make folly::detail::CacheLocality portable on Apple
author    Francis Ma <fma@fb.com>
          Wed, 27 Jan 2016 22:37:05 +0000 (14:37 -0800)
committer facebook-github-bot-0 <folly-bot@fb.com>
          Wed, 27 Jan 2016 23:20:28 +0000 (15:20 -0800)
Summary:
This is one of a series of steps to port folly::future to iOS. Apple doesn't
support __thread there, so add a HashingThreadId as a fallback on Apple
platforms.
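
In outline, the new fallback looks like this (a condensed, self-contained
sketch of the code in this diff; the real definitions live in
folly/detail/CacheLocality.h inside folly::detail, and the main() harness at
the end is only for illustration):

#include <pthread.h>

#include <algorithm>
#include <cstdint>
#include <cstring>

#include <folly/Hash.h>

// Derives a stable per-thread id by hashing pthread_self(), so no
// thread-local storage is required.
struct HashingThreadId {
  static size_t get() {
    // pthread_t is opaque; copy however many of its bytes fit into a
    // fixed-width integer, then mix them with twang_32from64 so the
    // resulting ids are well distributed across stripes.
    pthread_t pid = pthread_self();
    uint64_t id = 0;
    std::memcpy(&id, &pid, std::min(sizeof(pid), sizeof(id)));
    return folly::hash::twang_32from64(id);
  }
};

// Adapts any ThreadId policy to the getcpu(2) signature: the thread id
// stands in for both cpu and node, which is all AccessSpreader needs in
// order to pick a stripe.
template <typename ThreadId>
struct FallbackGetcpu {
  static int getcpu(unsigned* cpu, unsigned* node, void* /* unused */) {
    auto id = ThreadId::get();
    if (cpu) {
      *cpu = static_cast<unsigned>(id);
    }
    if (node) {
      *node = static_cast<unsigned>(id);
    }
    return 0;
  }
};

int main() {
  // A thread's synthetic "cpu" is stable for the thread's lifetime.
  unsigned first = 0;
  unsigned second = 0;
  FallbackGetcpu<HashingThreadId>::getcpu(&first, nullptr, nullptr);
  FallbackGetcpu<HashingThreadId>::getcpu(&second, nullptr, nullptr);
  return first == second ? 0 : 1;
}

When FOLLY_TLS is available, FallbackGetcpuType remains the round-robin
SequentialThreadId; on Apple it becomes FallbackGetcpu<HashingThreadId>.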

Reviewed By: nbronson

Differential Revision: D2832068

fb-gh-sync-id: c3389245f3c0bbd36de6260680f7ac6110b3206c

folly/Portability.h
folly/detail/CacheLocality.cpp
folly/detail/CacheLocality.h
folly/test/CacheLocalityTest.cpp

folly/Portability.h
index 03717262bb74b0234b6904fe8a6828993b8f0da1..f4893a543c3a06a82933f768c73c62fc356d5d83 100644
@@ -223,7 +223,9 @@ namespace std { typedef ::max_align_t max_align_t; }
  * the semantics are the same
  * (but remember __thread has different semantics when using emutls (ex. apple))
  */
-#if defined(_MSC_VER)
+#if defined(__APPLE__)
+#undef FOLLY_TLS
+#elif defined(_MSC_VER)
 # define FOLLY_TLS __declspec(thread)
 #elif defined(__GNUC__) || defined(__clang__)
 # define FOLLY_TLS __thread

folly/detail/CacheLocality.cpp
index 87f85928d17433b0f5089f345ec30762009138ea..a6abec2ae1aabb6f454e601b34e22b1d1050dccc 100644
@@ -232,6 +232,7 @@ Getcpu::Func Getcpu::vdsoFunc() {
   return func;
 }
 
+#ifdef FOLLY_TLS
 /////////////// SequentialThreadId
 
 template<>
@@ -239,6 +240,7 @@ std::atomic<size_t> SequentialThreadId<std::atomic>::prevId(0);
 
 template<>
 FOLLY_TLS size_t SequentialThreadId<std::atomic>::currentId(0);
+#endif
 
 /////////////// AccessSpreader
 
@@ -277,7 +279,7 @@ Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t numStripes) {
     return &degenerateGetcpu;
   } else {
     auto best = Getcpu::vdsoFunc();
-    return best ? best : &SequentialThreadId<std::atomic>::getcpu;
+    return best ? best : &FallbackGetcpuType::getcpu;
   }
 }
 

folly/detail/CacheLocality.h
index 8731d485aafbcc6f8067bea95ae9e341ed277c3d..b632888725ab1770ba4e2bc5978eb97ed2812622 100644
@@ -26,6 +26,7 @@
 #include <string>
 #include <type_traits>
 #include <vector>
+#include <folly/Hash.h>
 #include <folly/Likely.h>
 #include <folly/Portability.h>
 
@@ -141,10 +142,7 @@ struct Getcpu {
   static Func vdsoFunc();
 };
 
-/// A class that lazily binds a unique (for each implementation of Atom)
-/// identifier to a thread.  This is a fallback mechanism for the access
-/// spreader if we are in testing (using DeterministicAtomic) or if
-/// __vdso_getcpu can't be dynamically loaded
+#ifdef FOLLY_TLS
 template <template<typename> class Atom>
 struct SequentialThreadId {
 
@@ -157,11 +155,32 @@ struct SequentialThreadId {
     return rv;
   }
 
+ private:
+  static Atom<size_t> prevId;
+
+  static FOLLY_TLS size_t currentId;
+};
+#endif
+
+struct HashingThreadId {
+  static size_t get() {
+    pthread_t pid = pthread_self();
+    uint64_t id = 0;
+    memcpy(&id, &pid, std::min(sizeof(pid), sizeof(id)));
+    return hash::twang_32from64(id);
+  }
+};
+
+/// Simulates getcpu using the per-thread identifier supplied by the
+/// ThreadId policy.  This is a fallback mechanism for the access
+/// spreader if __vdso_getcpu can't be loaded
+template <typename ThreadId>
+struct FallbackGetcpu {
   /// Fills the thread id into the cpu and node out params (if they
   /// are non-null).  This method is intended to act like getcpu when a
   /// fast-enough form of getcpu isn't available or isn't desired
   static int getcpu(unsigned* cpu, unsigned* node, void* unused) {
-    auto id = get();
+    auto id = ThreadId::get();
     if (cpu) {
       *cpu = id;
     }
@@ -170,13 +189,14 @@ struct SequentialThreadId {
     }
     return 0;
   }
-
- private:
-  static Atom<size_t> prevId;
-
-  static FOLLY_TLS size_t currentId;
 };
 
+#ifdef FOLLY_TLS
+typedef FallbackGetcpu<SequentialThreadId<std::atomic>> FallbackGetcpuType;
+#else
+typedef FallbackGetcpu<HashingThreadId> FallbackGetcpuType;
+#endif
+
 template <template<typename> class Atom, size_t kMaxCpus>
 struct AccessSpreaderArray;
 

folly/test/CacheLocalityTest.cpp
index fb400849fc5349ca785cd3f5a862c11f496e65b6..2899b48a4619f7546685d0009d60e3731c456449 100644
@@ -317,13 +317,30 @@ TEST(Getcpu, VdsoGetcpu) {
   EXPECT_TRUE(cpu < CPU_SETSIZE);
 }
 
-TEST(SequentialThreadId, Simple) {
+#ifdef FOLLY_TLS
+TEST(ThreadId, SimpleTls) {
   unsigned cpu = 0;
-  auto rv = SequentialThreadId<std::atomic>::getcpu(&cpu, nullptr, nullptr);
+  auto rv =
+      folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
+          &cpu, nullptr, nullptr);
   EXPECT_EQ(rv, 0);
   EXPECT_TRUE(cpu > 0);
   unsigned again;
-  SequentialThreadId<std::atomic>::getcpu(&again, nullptr, nullptr);
+  folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu(
+      &again, nullptr, nullptr);
+  EXPECT_EQ(cpu, again);
+}
+#endif
+
+TEST(ThreadId, SimplePthread) {
+  unsigned cpu = 0;
+  auto rv = folly::detail::FallbackGetcpu<HashingThreadId>::getcpu(
+      &cpu, nullptr, nullptr);
+  EXPECT_EQ(rv, 0);
+  EXPECT_TRUE(cpu > 0);
+  unsigned again;
+  folly::detail::FallbackGetcpu<HashingThreadId>::getcpu(
+      &again, nullptr, nullptr);
   EXPECT_EQ(cpu, again);
 }
 
@@ -434,7 +451,7 @@ BENCHMARK(AccessSpreaderConstruction, iters) {
   }
 }
 
-enum class SpreaderType { GETCPU, SHARED, TLS_RR };
+enum class SpreaderType { GETCPU, SHARED, TLS_RR, PTHREAD_SELF };
 
 // Benchmark scores here reflect the time for 32 threads to perform an
 // atomic increment on a dual-socket E5-2660 @ 2.2Ghz.  Surprisingly,
@@ -472,42 +489,52 @@ enum class SpreaderType { GETCPU, SHARED, TLS_RR };
 // ============================================================================
 // folly/test/CacheLocalityTest.cpp                relative  time/iter  iters/s
 // ============================================================================
-// contentionAtWidth(1_stripe_0_work_stub)                      1.14us  873.64K
-// contentionAtWidth(2_stripe_0_work_getcpu)                  495.58ns    2.02M
-// contentionAtWidth(4_stripe_0_work_getcpu)                  232.99ns    4.29M
-// contentionAtWidth(8_stripe_0_work_getcpu)                  101.16ns    9.88M
-// contentionAtWidth(16_stripe_0_work_getcpu)                  41.93ns   23.85M
-// contentionAtWidth(32_stripe_0_work_getcpu)                  42.04ns   23.79M
-// contentionAtWidth(64_stripe_0_work_getcpu)                  41.94ns   23.84M
-// contentionAtWidth(2_stripe_0_work_tls_rr)                    1.00us  997.41K
-// contentionAtWidth(4_stripe_0_work_tls_rr)                  694.41ns    1.44M
-// contentionAtWidth(8_stripe_0_work_tls_rr)                  590.27ns    1.69M
-// contentionAtWidth(16_stripe_0_work_tls_rr)                 222.13ns    4.50M
-// contentionAtWidth(32_stripe_0_work_tls_rr)                 169.49ns    5.90M
-// contentionAtWidth(64_stripe_0_work_tls_rr)                 162.20ns    6.17M
-// contentionAtWidth(2_stripe_0_work_shared)                  495.54ns    2.02M
-// contentionAtWidth(4_stripe_0_work_shared)                  236.27ns    4.23M
-// contentionAtWidth(8_stripe_0_work_shared)                  114.81ns    8.71M
-// contentionAtWidth(16_stripe_0_work_shared)                  44.65ns   22.40M
-// contentionAtWidth(32_stripe_0_work_shared)                  41.76ns   23.94M
-// contentionAtWidth(64_stripe_0_work_shared)                  43.47ns   23.00M
-// atomicIncrBaseline(local_incr_0_work)                       20.39ns   49.06M
+// LocalAccessSpreaderUse                                      13.00ns   76.94M
+// SharedAccessSpreaderUse                                     13.04ns   76.66M
+// AccessSpreaderConstruction                                 366.00ns    2.73M
+// ----------------------------------------------------------------------------
+// contentionAtWidth(1_stripe_0_work_stub)                    891.04ns    1.12M
+// contentionAtWidth(2_stripe_0_work_getcpu)                  403.45ns    2.48M
+// contentionAtWidth(4_stripe_0_work_getcpu)                  198.02ns    5.05M
+// contentionAtWidth(8_stripe_0_work_getcpu)                   90.54ns   11.04M
+// contentionAtWidth(16_stripe_0_work_getcpu)                  31.21ns   32.04M
+// contentionAtWidth(32_stripe_0_work_getcpu)                  29.15ns   34.31M
+// contentionAtWidth(64_stripe_0_work_getcpu)                  32.41ns   30.86M
+// contentionAtWidth(2_stripe_0_work_tls_rr)                  958.06ns    1.04M
+// contentionAtWidth(4_stripe_0_work_tls_rr)                  494.31ns    2.02M
+// contentionAtWidth(8_stripe_0_work_tls_rr)                  362.34ns    2.76M
+// contentionAtWidth(16_stripe_0_work_tls_rr)                 231.37ns    4.32M
+// contentionAtWidth(32_stripe_0_work_tls_rr)                 128.26ns    7.80M
+// contentionAtWidth(64_stripe_0_work_tls_rr)                 115.08ns    8.69M
+// contentionAtWidth(2_stripe_0_work_pthread_self)            856.63ns    1.17M
+// contentionAtWidth(4_stripe_0_work_pthread_self)            623.43ns    1.60M
+// contentionAtWidth(8_stripe_0_work_pthread_self)            419.69ns    2.38M
+// contentionAtWidth(16_stripe_0_work_pthread_self            217.32ns    4.60M
+// contentionAtWidth(32_stripe_0_work_pthread_self            157.69ns    6.34M
+// contentionAtWidth(64_stripe_0_work_pthread_self            140.94ns    7.10M
+// contentionAtWidth(2_stripe_0_work_shared)                  406.55ns    2.46M
+// contentionAtWidth(4_stripe_0_work_shared)                  198.28ns    5.04M
+// contentionAtWidth(8_stripe_0_work_shared)                   90.11ns   11.10M
+// contentionAtWidth(16_stripe_0_work_shared)                  34.53ns   28.96M
+// contentionAtWidth(32_stripe_0_work_shared)                  30.08ns   33.25M
+// contentionAtWidth(64_stripe_0_work_shared)                  34.60ns   28.90M
+// atomicIncrBaseline(local_incr_0_work)                       17.51ns   57.12M
 // ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_500_work_stub)                    2.04us  491.13K
-// contentionAtWidth(2_stripe_500_work_getcpu)                610.98ns    1.64M
-// contentionAtWidth(4_stripe_500_work_getcpu)                507.72ns    1.97M
-// contentionAtWidth(8_stripe_500_work_getcpu)                542.53ns    1.84M
-// contentionAtWidth(16_stripe_500_work_getcpu)               496.55ns    2.01M
-// contentionAtWidth(32_stripe_500_work_getcpu)               500.67ns    2.00M
-// atomicIncrBaseline(local_incr_500_work)                    484.69ns    2.06M
+// contentionAtWidth(1_stripe_500_work_stub)                    1.87us  534.36K
+// contentionAtWidth(2_stripe_500_work_getcpu)                542.31ns    1.84M
+// contentionAtWidth(4_stripe_500_work_getcpu)                409.18ns    2.44M
+// contentionAtWidth(8_stripe_500_work_getcpu)                511.05ns    1.96M
+// contentionAtWidth(16_stripe_500_work_getcpu)               399.14ns    2.51M
+// contentionAtWidth(32_stripe_500_work_getcpu)               399.05ns    2.51M
+// atomicIncrBaseline(local_incr_500_work)                    399.41ns    2.50M
 // ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_1000_work_stub)                   2.11us  473.78K
-// contentionAtWidth(2_stripe_1000_work_getcpu)               970.64ns    1.03M
-// contentionAtWidth(4_stripe_1000_work_getcpu)               987.31ns    1.01M
-// contentionAtWidth(8_stripe_1000_work_getcpu)                 1.01us  985.52K
-// contentionAtWidth(16_stripe_1000_work_getcpu)              986.09ns    1.01M
-// contentionAtWidth(32_stripe_1000_work_getcpu)              960.23ns    1.04M
-// atomicIncrBaseline(local_incr_1000_work)                   950.63ns    1.05M
+// contentionAtWidth(1_stripe_1000_work_stub)                   1.90us  525.73K
+// contentionAtWidth(2_stripe_1000_work_getcpu)               792.91ns    1.26M
+// contentionAtWidth(4_stripe_1000_work_getcpu)               788.14ns    1.27M
+// contentionAtWidth(8_stripe_1000_work_getcpu)               794.16ns    1.26M
+// contentionAtWidth(16_stripe_1000_work_getcpu)              785.33ns    1.27M
+// contentionAtWidth(32_stripe_1000_work_getcpu)              786.56ns    1.27M
+// atomicIncrBaseline(local_incr_1000_work)                   784.69ns    1.27M
 // ============================================================================
 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
                               SpreaderType spreaderType,
@@ -515,11 +542,18 @@ static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
                               size_t numThreads = 32) {
   folly::BenchmarkSuspender braces;
 
+  folly::detail::Getcpu::Func getcpuFunc = nullptr;
+
+  if (spreaderType == SpreaderType::TLS_RR) {
+    getcpuFunc =
+        folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu;
+  }
+  if (spreaderType == SpreaderType::PTHREAD_SELF) {
+    getcpuFunc = folly::detail::FallbackGetcpu<HashingThreadId>::getcpu;
+  }
+
   AccessSpreader<> spreader(
-      stripes,
-      CacheLocality::system<std::atomic>(),
-      spreaderType == SpreaderType::TLS_RR
-          ? SequentialThreadId<std::atomic>::getcpu : nullptr);
+      stripes, CacheLocality::system<std::atomic>(), getcpuFunc);
 
   std::atomic<size_t> ready(0);
   std::atomic<bool> go(false);
@@ -651,6 +685,36 @@ BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr,
                       32, 0, SpreaderType::TLS_RR)
 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
                       64, 0, SpreaderType::TLS_RR)
+BENCHMARK_NAMED_PARAM(contentionAtWidth,
+                      2_stripe_0_work_pthread_self,
+                      2,
+                      0,
+                      SpreaderType::PTHREAD_SELF)
+BENCHMARK_NAMED_PARAM(contentionAtWidth,
+                      4_stripe_0_work_pthread_self,
+                      4,
+                      0,
+                      SpreaderType::PTHREAD_SELF)
+BENCHMARK_NAMED_PARAM(contentionAtWidth,
+                      8_stripe_0_work_pthread_self,
+                      8,
+                      0,
+                      SpreaderType::PTHREAD_SELF)
+BENCHMARK_NAMED_PARAM(contentionAtWidth,
+                      16_stripe_0_work_pthread_self,
+                      16,
+                      0,
+                      SpreaderType::PTHREAD_SELF)
+BENCHMARK_NAMED_PARAM(contentionAtWidth,
+                      32_stripe_0_work_pthread_self,
+                      32,
+                      0,
+                      SpreaderType::PTHREAD_SELF)
+BENCHMARK_NAMED_PARAM(contentionAtWidth,
+                      64_stripe_0_work_pthread_self,
+                      64,
+                      0,
+                      SpreaderType::PTHREAD_SELF)
 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
                       2, 0, SpreaderType::SHARED)
 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,