From 5ad17f0715288d08e0db580c90b7bf8716ddfd99 Mon Sep 17 00:00:00 2001
From: Yedidya Feldblum
Date: Tue, 19 Dec 2017 19:02:51 -0800
Subject: [PATCH] Add hardware_destructive_interference_size

Summary:
[Folly] Add `hardware_destructive_interference_size` and
`hardware_constructive_interference_size` to `folly/lang/Align.h`. These
are backports from C++17, and may require keeping, depending on how
standard libraries choose to implement them.

And replace `CacheLocality::kFalseSharingRange` with
`hardware_destructive_interference_size`.
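
A usage sketch, not part of the diff below; the struct and field names
are hypothetical, and only the new constant is assumed. With it, fields
written by different threads can be kept on separate cache-line pairs
with `alignas`, instead of manual char-array padding:

    #include <atomic>
    #include <cstdint>

    #include <folly/lang/Align.h>

    // Hypothetical example: each counter gets storage aligned to
    // hardware_destructive_interference_size, so stores to one counter
    // cannot invalidate the cache line(s) holding the other.
    struct Counters {
      alignas(folly::hardware_destructive_interference_size)
          std::atomic<std::uint64_t> produced{0};
      alignas(folly::hardware_destructive_interference_size)
          std::atomic<std::uint64_t> consumed{0};
    };
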
Reviewed By: ot

Differential Revision: D6554817

fbshipit-source-id: bff49f5ca8b01d38fa806076f99201355df76cd9
---
 folly/CachelinePadded.h            |  7 +++----
 folly/MPMCQueue.h                  | 10 ++++++----
 folly/Portability.h                |  7 +++++++
 folly/ProducerConsumerQueue.h      |  4 ++--
 folly/concurrency/CacheLocality.h  | 26 +++++---------------------
 folly/lang/Align.h                 | 30 ++++++++++++++++++++++++++++++
 folly/test/CachelinePaddedTest.cpp |  3 ++-
 7 files changed, 55 insertions(+), 32 deletions(-)

diff --git a/folly/CachelinePadded.h b/folly/CachelinePadded.h
index cf10ecb7..27a06b4a 100644
--- a/folly/CachelinePadded.h
+++ b/folly/CachelinePadded.h
@@ -17,9 +17,8 @@
 #pragma once
 
 #include <cstddef>
+#include <folly/lang/Align.h>
 
-#include <folly/concurrency/CacheLocality.h>
-#include
 #include <utility>
 
 namespace folly {
@@ -68,8 +67,8 @@ class CachelinePadded {
 
  private:
   static constexpr size_t paddingSize() noexcept {
-    return CacheLocality::kFalseSharingRange -
-        (alignof(T) % CacheLocality::kFalseSharingRange);
+    return hardware_destructive_interference_size -
+        (alignof(T) % hardware_destructive_interference_size);
   }
   char paddingPre_[paddingSize()];
   T inner_;
diff --git a/folly/MPMCQueue.h b/folly/MPMCQueue.h
index d4c0ccc7..0db70fa0 100644
--- a/folly/MPMCQueue.h
+++ b/folly/MPMCQueue.h
@@ -651,11 +651,12 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
     }
 
     // ideally this would be a static assert, but g++ doesn't allow it
-    assert(alignof(MPMCQueue<T, Atom>) >= CacheLocality::kFalseSharingRange);
+    assert(
+        alignof(MPMCQueue<T, Atom>) >= hardware_destructive_interference_size);
     assert(
         static_cast<uint8_t*>(static_cast<void*>(&popTicket_)) -
             static_cast<uint8_t*>(static_cast<void*>(&pushTicket_)) >=
-        CacheLocality::kFalseSharingRange);
+        static_cast<ptrdiff_t>(hardware_destructive_interference_size));
   }
 
   /// A default-constructed queue is useful because a usable (non-zero
@@ -975,7 +976,8 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
     /// To avoid false sharing in slots_ with neighboring memory
     /// allocations, we pad it with this many SingleElementQueue-s at
     /// each end
-    kSlotPadding = (CacheLocality::kFalseSharingRange - 1) / sizeof(Slot) + 1
+    kSlotPadding =
+        (hardware_destructive_interference_size - 1) / sizeof(Slot) + 1
   };
 
   /// The maximum number of items in the queue at once
@@ -1027,7 +1029,7 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
 
   /// Alignment doesn't prevent false sharing at the end of the struct,
   /// so fill out the last cache line
-  char padding_[CacheLocality::kFalseSharingRange - sizeof(Atom<uint32_t>)];
+  char pad_[hardware_destructive_interference_size - sizeof(Atom<uint32_t>)];
 
   /// We assign tickets in increasing order, but we don't want to
   /// access neighboring elements of slots_ because that will lead to
diff --git a/folly/Portability.h b/folly/Portability.h
index 1498db30..11eb8f82 100644
--- a/folly/Portability.h
+++ b/folly/Portability.h
@@ -97,6 +97,12 @@ constexpr bool kHasUnalignedAccess = false;
 # define FOLLY_X64 0
 #endif
 
+#if defined(__arm__)
+#define FOLLY_ARM 1
+#else
+#define FOLLY_ARM 0
+#endif
+
 #if defined(__aarch64__)
 # define FOLLY_AARCH64 1
 #else
@@ -110,6 +116,7 @@ constexpr bool kHasUnalignedAccess = false;
 #endif
 
 namespace folly {
+constexpr bool kIsArchArm = FOLLY_ARM == 1;
 constexpr bool kIsArchAmd64 = FOLLY_X64 == 1;
 constexpr bool kIsArchAArch64 = FOLLY_AARCH64 == 1;
 constexpr bool kIsArchPPC64 = FOLLY_PPC64 == 1;
diff --git a/folly/ProducerConsumerQueue.h b/folly/ProducerConsumerQueue.h
index 16de57d3..2a8f04c6 100644
--- a/folly/ProducerConsumerQueue.h
+++ b/folly/ProducerConsumerQueue.h
@@ -173,14 +173,14 @@ struct ProducerConsumerQueue {
   }
 
  private:
-  char pad0_[CacheLocality::kFalseSharingRange];
+  char pad0_[hardware_destructive_interference_size];
   const uint32_t size_;
   T* const records_;
 
   FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> readIndex_;
   FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> writeIndex_;
 
-  char pad1_[CacheLocality::kFalseSharingRange - sizeof(writeIndex_)];
+  char pad1_[hardware_destructive_interference_size - sizeof(writeIndex_)];
 };
 
 } // namespace folly
diff --git a/folly/concurrency/CacheLocality.h b/folly/concurrency/CacheLocality.h
index 7751b7dc..420f5334 100644
--- a/folly/concurrency/CacheLocality.h
+++ b/folly/concurrency/CacheLocality.h
@@ -116,22 +116,9 @@ struct CacheLocality {
   /// CacheLocality structure with the specified number of cpus and a
   /// single cache level that associates one cpu per cache.
   static CacheLocality uniform(size_t numCpus);
-
-  enum {
-    /// Memory locations on the same cache line are subject to false
-    /// sharing, which is very bad for performance. Microbenchmarks
-    /// indicate that pairs of cache lines also see interference under
-    /// heavy use of atomic operations (observed for atomic increment on
-    /// Sandy Bridge). See FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
-    kFalseSharingRange = 128
-  };
-
-  static_assert(
-      kFalseSharingRange == 128,
-      "FOLLY_ALIGN_TO_AVOID_FALSE_SHARING should track kFalseSharingRange");
 };
 
-// TODO replace __attribute__ with alignas and 128 with kFalseSharingRange
+// TODO replace with alignas(hardware_destructive_interference_size)
 
 /// An attribute that will cause a variable or field to be aligned so that
 /// it doesn't have false sharing with anything at a smaller memory address.
@@ -451,14 +438,11 @@ class CoreAllocator {
   void* allocate(size_t size) {
     auto cl = sizeClass(size);
     if (cl == 4) {
-      static_assert(
-          CacheLocality::kFalseSharingRange == 128,
-          "kFalseSharingRange changed");
       // Align to a cacheline
-      size = size + (CacheLocality::kFalseSharingRange - 1);
-      size &= ~size_t(CacheLocality::kFalseSharingRange - 1);
-      void* mem =
-          detail::aligned_malloc(size, CacheLocality::kFalseSharingRange);
+      size = size + (hardware_destructive_interference_size - 1);
+      size &= ~size_t(hardware_destructive_interference_size - 1);
+      void* mem = detail::aligned_malloc(
+          size, hardware_destructive_interference_size);
       if (!mem) {
         std::__throw_bad_alloc();
       }
diff --git a/folly/lang/Align.h b/folly/lang/Align.h
index 4f0564dc..2bdb8dd8 100644
--- a/folly/lang/Align.h
+++ b/folly/lang/Align.h
@@ -18,6 +18,8 @@
 
 #include <cstddef>
 
+#include <folly/Portability.h>
+
 namespace folly {
 
 namespace detail {
@@ -88,4 +90,32 @@ using max_align_v_ = max_align_t_<
 constexpr std::size_t max_align_v = detail::max_align_v_::value;
 struct alignas(max_align_v) max_align_t {};
 
+// Memory locations within the same cache line are subject to destructive
+// interference, also known as false sharing, which is when concurrent
+// accesses to these different memory locations from different cores, where at
+// least one of the concurrent accesses is or involves a store operation,
+// induce contention and harm performance.
+//
+// Microbenchmarks indicate that pairs of cache lines also see destructive
+// interference under heavy use of atomic operations, as observed for atomic
+// increment on Sandy Bridge.
+//
+// We assume a cache line size of 64, so we use a cache line pair size of 128
+// to avoid destructive interference.
+//
+// mimic: std::hardware_destructive_interference_size, C++17
+constexpr std::size_t hardware_destructive_interference_size =
+    kIsArchArm ? 64 : 128;
+static_assert(hardware_destructive_interference_size >= max_align_v, "math?");
+
+// Memory locations within the same cache line are subject to constructive
+// interference, also known as true sharing, which is when accesses to some
+// memory locations induce all memory locations within the same cache line to
+// be cached, benefiting subsequent accesses to different memory locations
+// within the same cache line and helping performance.
+//
+// mimic: std::hardware_constructive_interference_size, C++17
+constexpr std::size_t hardware_constructive_interference_size = 64;
+static_assert(hardware_constructive_interference_size >= max_align_v, "math?");
+
 } // namespace folly
diff --git a/folly/test/CachelinePaddedTest.cpp b/folly/test/CachelinePaddedTest.cpp
index 148493e3..5890ba36 100644
--- a/folly/test/CachelinePaddedTest.cpp
+++ b/folly/test/CachelinePaddedTest.cpp
@@ -27,7 +27,8 @@ static_assert(
     std::is_standard_layout<CachelinePadded<int>>::value,
     "CachelinePadded<T> must be standard-layout if T is.");
 
-static constexpr int kCachelineSize = folly::CacheLocality::kFalseSharingRange;
+static constexpr int kCachelineSize =
+    folly::hardware_destructive_interference_size;
 
 template <size_t alignment>
 struct alignas(alignment) SizedData {
-- 
2.34.1
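
A counterpart sketch for the constructive case, with a hypothetical
struct; only the constants added in folly/lang/Align.h above are
assumed. Fields that are always read together are kept within
hardware_constructive_interference_size, so a single cache-line fill
serves all of them:

    #include <cstddef>

    #include <folly/lang/Align.h>

    // Hypothetical example: keep fields that are accessed together
    // small enough to share one cache line (true sharing, the helpful
    // kind).
    struct HotFields {
      void* data;
      std::size_t size;
      std::size_t capacity;
    };

    static_assert(
        sizeof(HotFields) <= folly::hardware_constructive_interference_size,
        "fields that are read together should fit within one cache line");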