Add hardware_destructive_interference_size

author Yedidya Feldblum <yfeldblum@fb.com>

Wed, 20 Dec 2017 03:02:51 +0000 (19:02 -0800)

committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>

Wed, 20 Dec 2017 03:05:10 +0000 (19:05 -0800)
author Yedidya Feldblum <yfeldblum@fb.com>
Wed, 20 Dec 2017 03:02:51 +0000 (19:02 -0800)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Wed, 20 Dec 2017 03:05:10 +0000 (19:05 -0800)
diff --git a/folly/CachelinePadded.h b/folly/CachelinePadded.h

index cf10ecb7a8baadf63f1a046983f45051f3314b99..27a06b4a917bbd96b1dcd9e16bf34616b9fb92ae 100644 (file)
--- a/folly/CachelinePadded.h
+++ b/folly/CachelinePadded.h
@@ -17,9 +17,8 @@
  #pragma once
  
  #include <cstddef>
  #pragma once
  
  #include <cstddef>
+#include <utility>
  
  
-#include <folly/Portability.h>
-#include <folly/concurrency/CacheLocality.h>
  #include <folly/lang/Align.h>
  
  namespace folly {
  #include <folly/lang/Align.h>
  
  namespace folly {
@@ -68,8 +67,8 @@ class CachelinePadded {
  
   private:
    static constexpr size_t paddingSize() noexcept {
  
   private:
    static constexpr size_t paddingSize() noexcept {
-    return CacheLocality::kFalseSharingRange -
-        (alignof(T) % CacheLocality::kFalseSharingRange);
+    return hardware_destructive_interference_size -
+        (alignof(T) % hardware_destructive_interference_size);
    }
    char paddingPre_[paddingSize()];
    T inner_;
    }
    char paddingPre_[paddingSize()];
    T inner_;
diff --git a/folly/MPMCQueue.h b/folly/MPMCQueue.h

index d4c0ccc7c8307dbfec970f75fa8b6bd676da6339..0db70fa0a01d3d26ba6ca5e21c04972d65ef64fa 100644 (file)
--- a/folly/MPMCQueue.h
+++ b/folly/MPMCQueue.h
@@ -651,11 +651,12 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
      }
  
      // ideally this would be a static assert, but g++ doesn't allow it
      }
  
      // ideally this would be a static assert, but g++ doesn't allow it
-    assert(alignof(MPMCQueue<T, Atom>) >= CacheLocality::kFalseSharingRange);
+    assert(
+        alignof(MPMCQueue<T, Atom>) >= hardware_destructive_interference_size);
      assert(
          static_cast<uint8_t*>(static_cast<void*>(&popTicket_)) -
              static_cast<uint8_t*>(static_cast<void*>(&pushTicket_)) >=
      assert(
          static_cast<uint8_t*>(static_cast<void*>(&popTicket_)) -
              static_cast<uint8_t*>(static_cast<void*>(&pushTicket_)) >=
-        CacheLocality::kFalseSharingRange);
+        static_cast<ptrdiff_t>(hardware_destructive_interference_size));
    }
  
    /// A default-constructed queue is useful because a usable (non-zero
    }
  
    /// A default-constructed queue is useful because a usable (non-zero
@@ -975,7 +976,8 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
      /// To avoid false sharing in slots_ with neighboring memory
      /// allocations, we pad it with this many SingleElementQueue-s at
      /// each end
      /// To avoid false sharing in slots_ with neighboring memory
      /// allocations, we pad it with this many SingleElementQueue-s at
      /// each end
-    kSlotPadding = (CacheLocality::kFalseSharingRange - 1) / sizeof(Slot) + 1
+    kSlotPadding =
+        (hardware_destructive_interference_size - 1) / sizeof(Slot) + 1
    };
  
    /// The maximum number of items in the queue at once
    };
  
    /// The maximum number of items in the queue at once
@@ -1027,7 +1029,7 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
  
    /// Alignment doesn't prevent false sharing at the end of the struct,
    /// so fill out the last cache line
  
    /// Alignment doesn't prevent false sharing at the end of the struct,
    /// so fill out the last cache line
-  char padding_[CacheLocality::kFalseSharingRange - sizeof(Atom<uint32_t>)];
+  char pad_[hardware_destructive_interference_size - sizeof(Atom<uint32_t>)];
  
    /// We assign tickets in increasing order, but we don't want to
    /// access neighboring elements of slots_ because that will lead to
  
    /// We assign tickets in increasing order, but we don't want to
    /// access neighboring elements of slots_ because that will lead to
diff --git a/folly/Portability.h b/folly/Portability.h

index 1498db30bfca9f4b6512f090491e4360eadad9e2..11eb8f82b16ed9f3f580aad406547b524dc490c3 100644 (file)
--- a/folly/Portability.h
+++ b/folly/Portability.h
@@ -97,6 +97,12 @@ constexpr bool kHasUnalignedAccess = false;
  # define FOLLY_X64 0
  #endif
  
  # define FOLLY_X64 0
  #endif
  
+#if defined(__arm__)
+#define FOLLY_ARM 1
+#else
+#define FOLLY_ARM 0
+#endif
+
  #if defined(__aarch64__)
  # define FOLLY_AARCH64 1
  #else
  #if defined(__aarch64__)
  # define FOLLY_AARCH64 1
  #else
@@ -110,6 +116,7 @@ constexpr bool kHasUnalignedAccess = false;
  #endif
  
  namespace folly {
  #endif
  
  namespace folly {
+constexpr bool kIsArchArm = FOLLY_ARM == 1;
  constexpr bool kIsArchAmd64 = FOLLY_X64 == 1;
  constexpr bool kIsArchAArch64 = FOLLY_AARCH64 == 1;
  constexpr bool kIsArchPPC64 = FOLLY_PPC64 == 1;
  constexpr bool kIsArchAmd64 = FOLLY_X64 == 1;
  constexpr bool kIsArchAArch64 = FOLLY_AARCH64 == 1;
  constexpr bool kIsArchPPC64 = FOLLY_PPC64 == 1;
diff --git a/folly/ProducerConsumerQueue.h b/folly/ProducerConsumerQueue.h

index 16de57d35486a8207fcb1def59c48d59a1360fd3..2a8f04c6597aa4b22bbd5c9c5f657d7eefa235cd 100644 (file)
--- a/folly/ProducerConsumerQueue.h
+++ b/folly/ProducerConsumerQueue.h
@@ -173,14 +173,14 @@ struct ProducerConsumerQueue {
    }
  
   private:
    }
  
   private:
-  char pad0_[CacheLocality::kFalseSharingRange];
+  char pad0_[hardware_destructive_interference_size];
    const uint32_t size_;
    T* const records_;
  
    FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> readIndex_;
    FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> writeIndex_;
  
    const uint32_t size_;
    T* const records_;
  
    FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> readIndex_;
    FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> writeIndex_;
  
-  char pad1_[CacheLocality::kFalseSharingRange - sizeof(writeIndex_)];
+  char pad1_[hardware_destructive_interference_size - sizeof(writeIndex_)];
  };
  
  } // namespace folly
  };
  
  } // namespace folly
diff --git a/folly/concurrency/CacheLocality.h b/folly/concurrency/CacheLocality.h

index 7751b7dc91b624a3d3137694990ebea271f88ee0..420f5334b5ac108e37695e5943b17b115b83ed03 100644 (file)
--- a/folly/concurrency/CacheLocality.h
+++ b/folly/concurrency/CacheLocality.h
@@ -116,22 +116,9 @@ struct CacheLocality {
    /// CacheLocality structure with the specified number of cpus and a
    /// single cache level that associates one cpu per cache.
    static CacheLocality uniform(size_t numCpus);
    /// CacheLocality structure with the specified number of cpus and a
    /// single cache level that associates one cpu per cache.
    static CacheLocality uniform(size_t numCpus);
-
-  enum {
-    /// Memory locations on the same cache line are subject to false
-    /// sharing, which is very bad for performance.  Microbenchmarks
-    /// indicate that pairs of cache lines also see interference under
-    /// heavy use of atomic operations (observed for atomic increment on
-    /// Sandy Bridge).  See FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
-    kFalseSharingRange = 128
-  };
-
-  static_assert(
-      kFalseSharingRange == 128,
-      "FOLLY_ALIGN_TO_AVOID_FALSE_SHARING should track kFalseSharingRange");
  };
  
  };
  
-// TODO replace __attribute__ with alignas and 128 with kFalseSharingRange
+// TODO replace with alignas(hardware_destructive_interference_size)
  
  /// An attribute that will cause a variable or field to be aligned so that
  /// it doesn't have false sharing with anything at a smaller memory address.
  
  /// An attribute that will cause a variable or field to be aligned so that
  /// it doesn't have false sharing with anything at a smaller memory address.
@@ -451,14 +438,11 @@ class CoreAllocator {
      void* allocate(size_t size) {
        auto cl = sizeClass(size);
        if (cl == 4) {
      void* allocate(size_t size) {
        auto cl = sizeClass(size);
        if (cl == 4) {
-        static_assert(
-            CacheLocality::kFalseSharingRange == 128,
-            "kFalseSharingRange changed");
          // Align to a cacheline
          // Align to a cacheline
-        size = size + (CacheLocality::kFalseSharingRange - 1);
-        size &= ~size_t(CacheLocality::kFalseSharingRange - 1);
-        void* mem =
-            detail::aligned_malloc(size, CacheLocality::kFalseSharingRange);
+        size = size + (hardware_destructive_interference_size - 1);
+        size &= ~size_t(hardware_destructive_interference_size - 1);
+        void* mem = detail::aligned_malloc(
+            size, hardware_destructive_interference_size);
          if (!mem) {
            std::__throw_bad_alloc();
          }
          if (!mem) {
            std::__throw_bad_alloc();
          }
diff --git a/folly/lang/Align.h b/folly/lang/Align.h

index 4f0564dccd62fbd7d821efcdb95079468f697b4d..2bdb8dd8d302549126828cf2176abe2026cdafab 100644 (file)
--- a/folly/lang/Align.h
+++ b/folly/lang/Align.h
@@ -18,6 +18,8 @@
  
  #include <cstddef>
  
  
  #include <cstddef>
  
+#include <folly/Portability.h>
+
  namespace folly {
  
  namespace detail {
  namespace folly {
  
  namespace detail {
@@ -88,4 +90,32 @@ using max_align_v_ = max_align_t_<
  constexpr std::size_t max_align_v = detail::max_align_v_::value;
  struct alignas(max_align_v) max_align_t {};
  
  constexpr std::size_t max_align_v = detail::max_align_v_::value;
  struct alignas(max_align_v) max_align_t {};
  
+//  Memory locations within the same cache line are subject to destructive
+//  interference, also known as false sharing, which is when concurrent
+//  accesses to these different memory locations from different cores, where at
+//  least one of the concurrent accesses is or involves a store operation,
+//  induce contention and harm performance.
+//
+//  Microbenchmarks indicate that pairs of cache lines also see destructive
+//  interference under heavy use of atomic operations, as observed for atomic
+//  increment on Sandy Bridge.
+//
+//  We assume a cache line size of 64, so we use a cache line pair size of 128
+//  to avoid destructive interference (except on ARM, where we use 64).
+//
+//  mimic: std::hardware_destructive_interference_size, C++17
+constexpr std::size_t hardware_destructive_interference_size =
+    kIsArchArm ? 64 : 128;
+static_assert(hardware_destructive_interference_size >= max_align_v, "math?");
+
+//  Memory locations within the same cache line are subject to constructive
+//  interference, also known as true sharing, which is when accesses to some
+//  memory locations induce all memory locations within the same cache line to
+//  be cached, benefiting subsequent accesses to different memory locations
+//  within the same cache line and helping performance.
+//
+//  mimic: std::hardware_constructive_interference_size, C++17
+constexpr std::size_t hardware_constructive_interference_size = 64;
+static_assert(hardware_constructive_interference_size >= max_align_v, "math?");
+
  } // namespace folly
  } // namespace folly
diff --git a/folly/test/CachelinePaddedTest.cpp b/folly/test/CachelinePaddedTest.cpp

index 148493e31c66163043a5977df0d78d3d841e52ae..5890ba36f1949dbedc11625ac424bb3a069014c1 100644 (file)
--- a/folly/test/CachelinePaddedTest.cpp
+++ b/folly/test/CachelinePaddedTest.cpp
@@ -27,7 +27,8 @@ static_assert(
      std::is_standard_layout<CachelinePadded<int>>::value,
      "CachelinePadded<T> must be standard-layout if T is.");
  
      std::is_standard_layout<CachelinePadded<int>>::value,
      "CachelinePadded<T> must be standard-layout if T is.");
  
-static constexpr int kCachelineSize = folly::CacheLocality::kFalseSharingRange;
+static constexpr int kCachelineSize =
+    folly::hardware_destructive_interference_size;
  
  template <size_t dataSize, size_t alignment = alignof(void*)>
  struct alignas(alignment) SizedData {
  
  template <size_t dataSize, size_t alignment = alignof(void*)>
  struct alignas(alignment) SizedData {
author	Yedidya Feldblum <yfeldblum@fb.com>
	Wed, 20 Dec 2017 03:02:51 +0000 (19:02 -0800)
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
	Wed, 20 Dec 2017 03:05:10 +0000 (19:05 -0800)
folly/CachelinePadded.h		patch \| blob \| history
folly/MPMCQueue.h		patch \| blob \| history
folly/Portability.h		patch \| blob \| history
folly/ProducerConsumerQueue.h		patch \| blob \| history
folly/concurrency/CacheLocality.h		patch \| blob \| history
folly/lang/Align.h		patch \| blob \| history
folly/test/CachelinePaddedTest.cpp		patch \| blob \| history