Summary:
The alignment requirements and the details needed to avoid false
sharing belong in CacheLocality, so this diff moves them there (a
short sketch of the idea follows below).
@override-unit-failures
Test Plan: fbmake runtests
Reviewed By: davejwatson@fb.com
FB internal diff: D1086090
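
The change boils down to keeping hot fields at least kFalseSharingRange
bytes apart. A minimal sketch of the idea, assuming the
FOLLY_ALIGN_TO_AVOID_FALSE_SHARING macro and
CacheLocality::kFalseSharingRange constant introduced further down in
this diff (the TicketCounters struct and its field names are made up
for illustration):

  #include <atomic>
  #include <cstdint>
  #include <folly/detail/CacheLocality.h>

  // Two hot counters that would otherwise share a cache line are pushed
  // onto separate false-sharing ranges, the same layout MPMCQueue uses
  // for pushTicket_ and popTicket_ below.
  struct TicketCounters {
    std::atomic<std::uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING push;
    std::atomic<std::uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pop;
  };

  // aligned(128) places each field on its own 128-byte boundary, so the
  // struct spans at least two full false-sharing ranges.
  static_assert(sizeof(TicketCounters) >=
                    2 * folly::detail::CacheLocality::kFalseSharingRange,
                "each counter should occupy its own false-sharing range");
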
#include <linux/futex.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <folly/Traits.h>
+#include <folly/detail/CacheLocality.h>
#include <folly/detail/Futex.h>
namespace folly {
/// FOLLY_ASSUME_FBVECTOR_COMPATIBLE then your type can be put in
/// MPMCQueue.
template<typename T,
- template<typename> class Atom = std::atomic,
- typename = typename std::enable_if<
- std::is_nothrow_constructible<T,T&&>::value ||
- folly::IsRelocatable<T>::value>::type>
+ template<typename> class Atom = std::atomic>
class MPMCQueue : boost::noncopyable {
+
+ static_assert(std::is_nothrow_constructible<T,T&&>::value ||
+ folly::IsRelocatable<T>::value,
+ "T must be relocatable or have a noexcept move constructor");
+
friend class detail::MPMCPipelineStageImpl<T>;
public:
typedef T value_type;
, popSpinCutoff_(0)
{
// ideally this would be a static assert, but g++ doesn't allow it
- assert(alignof(MPMCQueue<T,Atom>) >= kFalseSharingRange);
+ assert(alignof(MPMCQueue<T,Atom>)
+ >= detail::CacheLocality::kFalseSharingRange);
+ assert(static_cast<uint8_t*>(static_cast<void*>(&popTicket_))
+ - static_cast<uint8_t*>(static_cast<void*>(&pushTicket_))
+ >= detail::CacheLocality::kFalseSharingRange);
}
/// A default-constructed queue is useful because a usable (non-zero
/// the proper spin backoff
kAdaptationFreq = 128,
- /// Memory locations on the same cache line are subject to false
- /// sharing, which is very bad for performance
- kFalseSharingRange = 64,
-
/// To avoid false sharing in slots_ with neighboring memory
/// allocations, we pad it with this many SingleElementQueue-s at
/// each end
- kSlotPadding = 1 +
- (kFalseSharingRange - 1) / sizeof(detail::SingleElementQueue<T,Atom>)
+ kSlotPadding = (detail::CacheLocality::kFalseSharingRange - 1)
+ / sizeof(detail::SingleElementQueue<T,Atom>) + 1
- static_assert(kFalseSharingRange == 64,
- "FOLLY_ON_NEXT_CACHE_LINE must track kFalseSharingRange");
-
-// This literal "64' should be kFalseSharingRange,
-// but gcc-4.8.0 and 4.8.1 reject it.
-// FIXME: s/64/kFalseSharingRange/ if that ever changes.
-#define FOLLY_ON_NEXT_CACHE_LINE __attribute__((aligned(64)))
-
/// The maximum number of items in the queue at once
- size_t capacity_ FOLLY_ON_NEXT_CACHE_LINE;
+ size_t FOLLY_ALIGN_TO_AVOID_FALSE_SHARING capacity_;
/// An array of capacity_ SingleElementQueue-s, each of which holds
/// either 0 or 1 item. We over-allocate by 2 * kSlotPadding and don't
int stride_;
/// Enqueuers get tickets from here
- Atom<uint64_t> pushTicket_ FOLLY_ON_NEXT_CACHE_LINE;
+ Atom<uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pushTicket_;
/// Dequeuers get tickets from here
- Atom<uint64_t> popTicket_ FOLLY_ON_NEXT_CACHE_LINE;
+ Atom<uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING popTicket_;
/// This is how many times we will spin before using FUTEX_WAIT when
/// the queue is full on enqueue, adaptively computed by occasionally
/// spinning for longer and smoothing with an exponential moving average
- Atom<int> pushSpinCutoff_ FOLLY_ON_NEXT_CACHE_LINE;
+ Atom<int> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pushSpinCutoff_;
/// The adaptive spin cutoff when the queue is empty on dequeue
- Atom<int> popSpinCutoff_ FOLLY_ON_NEXT_CACHE_LINE;
+ Atom<int> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING popSpinCutoff_;
- /// Alignment doesn't avoid false sharing at the end of the struct,
+ /// Alignment doesn't prevent false sharing at the end of the struct,
/// so fill out the last cache line
- char padding_[kFalseSharingRange - sizeof(Atom<int>)];
+ char padding_[detail::CacheLocality::kFalseSharingRange - sizeof(Atom<int>)];
-#undef FOLLY_ON_NEXT_CACHE_LINE
/// We assign tickets in increasing order, but we don't want to
/// access neighboring elements of slots_ because that will lead to
const bool updateSpinCutoff,
Args&&... args) noexcept {
sequencer_.waitForTurn(turn * 2, spinCutoff, updateSpinCutoff);
- new (contents_) T(std::forward<Args>(args)...);
+ new (&contents_) T(std::forward<Args>(args)...);
sequencer_.completeTurn(turn * 2);
}
if (std::is_nothrow_constructible<T,T&&>::value) {
// this is preferred
sequencer_.waitForTurn(turn * 2, spinCutoff, updateSpinCutoff);
- new (contents_) T(std::move(goner));
+ new (&contents_) T(std::move(goner));
sequencer_.completeTurn(turn * 2);
} else {
// simulate nothrow move with relocation, followed by default
// construction to fill the gap we created
sequencer_.waitForTurn(turn * 2, spinCutoff, updateSpinCutoff);
- memcpy(contents_, &goner, sizeof(T));
+ memcpy(&contents_, &goner, sizeof(T));
sequencer_.completeTurn(turn * 2);
new (&goner) T();
}
// unlikely, but if we don't complete our turn the queue will die
}
sequencer_.waitForTurn(turn * 2 + 1, spinCutoff, updateSpinCutoff);
- memcpy(&elem, contents_, sizeof(T));
+ memcpy(&elem, &contents_, sizeof(T));
sequencer_.completeTurn(turn * 2 + 1);
} else {
// use nothrow move assignment
private:
/// Storage for a T constructed with placement new
- char contents_[sizeof(T)] __attribute__((aligned(alignof(T))));
+ typename std::aligned_storage<sizeof(T),alignof(T)>::type contents_;
/// Even turns are pushes, odd turns are pops
TurnSequencer<Atom> sequencer_;
T* ptr() noexcept {
- return static_cast<T*>(static_cast<void*>(contents_));
+ return static_cast<T*>(static_cast<void*>(&contents_));
}
void destroyContents() noexcept {
// g++ doesn't seem to have std::is_nothrow_destructible yet
}
#ifndef NDEBUG
- memset(contents_, 'Q', sizeof(T));
+ memset(&contents_, 'Q', sizeof(T));
/// CacheLocality structure with the specified number of cpus and a
/// single cache level that associates one cpu per cache.
static CacheLocality uniform(size_t numCpus);
+
+ enum {
+ /// Memory locations on the same cache line are subject to false
+ /// sharing, which is very bad for performance. Microbenchmarks
+ /// indicate that pairs of cache lines also see interference under
+ /// heavy use of atomic operations (observed for atomic increment on
+ /// Sandy Bridge). See FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
+ kFalseSharingRange = 128
+ };
+
+ static_assert(kFalseSharingRange == 128,
+ "FOLLY_ALIGN_TO_AVOID_FALSE_SHARING should track kFalseSharingRange");
+// TODO replace __attribute__ with alignas and 128 with kFalseSharingRange
+
+/// An attribute that will cause a variable or field to be aligned so that
+/// it doesn't have false sharing with anything at a smaller memory address.
+#define FOLLY_ALIGN_TO_AVOID_FALSE_SHARING __attribute__((aligned(128)))
+
/// Holds a function pointer to the VDSO implementation of getcpu(2),
/// if available
struct Getcpu {
- /// If we align the access spreaders at the beginning of a cache line
- /// then getcpuFunc_ and the first 56 bytes of stripeByCpu will be on
- /// the same cache line
- enum { kAlignment = 64 };
-
-
// AccessSpreader uses sharedInstance
friend AccessSpreader<Atom>;
/// aligned_storage is uninitialized, we use placement new since there
/// is no AccessSpreader default constructor
- typename std::aligned_storage<sizeof(AccessSpreader<Atom>),kAlignment>::type
+ typename std::aligned_storage<sizeof(AccessSpreader<Atom>),
+ CacheLocality::kFalseSharingRange>::type