From 05ce52289b0ec8f525a92d6d1955301d0b77c0a7 Mon Sep 17 00:00:00 2001 From: Giuseppe Ottaviano Date: Wed, 28 Jun 2017 11:09:42 -0700 Subject: [PATCH] Move CacheLocality out of detail/ and into concurrency/ Summary: There's no reason these utilities should only be used by folly. Reviewed By: mzlee Differential Revision: D5317894 fbshipit-source-id: 5a9bdf4c5efaa5bcbe78e6723a03a468f2fe5e32 --- CMakeLists.txt | 3 ++- folly/IndexedMemPool.h | 8 +++++--- folly/LifoSem.h | 6 ++---- folly/MPMCQueue.h | 18 ++++++++--------- folly/Makefile.am | 4 ++-- folly/ProducerConsumerQueue.h | 14 ++++++------- folly/SharedMutex.h | 7 ++++--- folly/TokenBucket.h | 2 +- .../{detail => concurrency}/CacheLocality.cpp | 8 +++----- folly/{detail => concurrency}/CacheLocality.h | 7 +++---- folly/concurrency/CoreCachedSharedPtr.h | 8 ++++---- .../test/CacheLocalityBenchmark.cpp | 15 +++++++------- .../test/CacheLocalityTest.cpp | 20 ++++++++----------- folly/detail/CachelinePaddedImpl.h | 6 +++--- folly/detail/MemoryIdler.cpp | 2 +- .../flat_combining/FlatCombining.h | 2 +- folly/test/CachelinePaddedTest.cpp | 2 +- folly/test/DeterministicSchedule.cpp | 4 ++-- folly/test/DeterministicSchedule.h | 5 +++-- 19 files changed, 67 insertions(+), 74 deletions(-) rename folly/{detail => concurrency}/CacheLocality.cpp (97%) rename folly/{detail => concurrency}/CacheLocality.h (99%) rename folly/{ => concurrency}/test/CacheLocalityBenchmark.cpp (96%) rename folly/{ => concurrency}/test/CacheLocalityTest.cpp (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4bfa9974..7d476c5c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -292,6 +292,8 @@ if (BUILD_TESTS) apply_folly_compile_options_to_target(folly_test_support) folly_define_tests( + DIRECTORY concurrency/test/ + TEST cache_locality_test SOURCES CacheLocalityTest.cpp DIRECTORY experimental/test/ TEST autotimer_test SOURCES AutoTimerTest.cpp TEST bits_test_2 SOURCES BitsTest.cpp @@ -467,7 +469,6 @@ if (BUILD_TESTS) TEST baton_test SOURCES 
BatonTest.cpp TEST bit_iterator_test SOURCES BitIteratorTest.cpp TEST bits_test SOURCES BitsTest.cpp - TEST cache_locality_test SOURCES CacheLocalityTest.cpp TEST cacheline_padded_test SOURCES CachelinePaddedTest.cpp TEST call_once_test SOURCES CallOnceTest.cpp TEST checksum_test SOURCES ChecksumTest.cpp diff --git a/folly/IndexedMemPool.h b/folly/IndexedMemPool.h index 275d441c..6b3fa53a 100644 --- a/folly/IndexedMemPool.h +++ b/folly/IndexedMemPool.h @@ -16,14 +16,16 @@ #pragma once -#include #include #include #include + +#include + #include #include #include -#include +#include #include #include @@ -497,7 +499,7 @@ struct IndexedMemPool : boost::noncopyable { } AtomicStruct& localHead() { - auto stripe = detail::AccessSpreader::current(NumLocalLists); + auto stripe = AccessSpreader::current(NumLocalLists); return local_[stripe].head; } diff --git a/folly/LifoSem.h b/folly/LifoSem.h index f6b7bf02..9c0404fe 100644 --- a/folly/LifoSem.h +++ b/folly/LifoSem.h @@ -27,7 +27,7 @@ #include #include #include -#include +#include namespace folly { @@ -515,9 +515,7 @@ struct LifoSemBase { FOLLY_ALIGN_TO_AVOID_FALSE_SHARING folly::AtomicStruct head_; - char padding_[folly::detail::CacheLocality::kFalseSharingRange - - sizeof(LifoSemHead)]; - + char padding_[folly::CacheLocality::kFalseSharingRange - sizeof(LifoSemHead)]; static LifoSemNode& idxToNode(uint32_t idx) { auto raw = &LifoSemRawNode::pool()[idx]; diff --git a/folly/MPMCQueue.h b/folly/MPMCQueue.h index b0cfc46f..0e921060 100644 --- a/folly/MPMCQueue.h +++ b/folly/MPMCQueue.h @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include @@ -647,11 +647,11 @@ class MPMCQueueBase> : boost::noncopyable { } // ideally this would be a static assert, but g++ doesn't allow it - assert(alignof(MPMCQueue) - >= detail::CacheLocality::kFalseSharingRange); - assert(static_cast(static_cast(&popTicket_)) - - static_cast(static_cast(&pushTicket_)) - >= detail::CacheLocality::kFalseSharingRange); + 
assert(alignof(MPMCQueue) >= CacheLocality::kFalseSharingRange); + assert( + static_cast(static_cast(&popTicket_)) - + static_cast(static_cast(&pushTicket_)) >= + CacheLocality::kFalseSharingRange); } /// A default-constructed queue is useful because a usable (non-zero @@ -971,8 +971,7 @@ class MPMCQueueBase> : boost::noncopyable { /// To avoid false sharing in slots_ with neighboring memory /// allocations, we pad it with this many SingleElementQueue-s at /// each end - kSlotPadding = (detail::CacheLocality::kFalseSharingRange - 1) - / sizeof(Slot) + 1 + kSlotPadding = (CacheLocality::kFalseSharingRange - 1) / sizeof(Slot) + 1 }; /// The maximum number of items in the queue at once @@ -1024,8 +1023,7 @@ class MPMCQueueBase> : boost::noncopyable { /// Alignment doesn't prevent false sharing at the end of the struct, /// so fill out the last cache line - char padding_[detail::CacheLocality::kFalseSharingRange - - sizeof(Atom)]; + char padding_[CacheLocality::kFalseSharingRange - sizeof(Atom)]; /// We assign tickets in increasing order, but we don't want to /// access neighboring elements of slots_ because that will lead to diff --git a/folly/Makefile.am b/folly/Makefile.am index 74977332..acc1c0bd 100644 --- a/folly/Makefile.am +++ b/folly/Makefile.am @@ -56,12 +56,12 @@ nobase_follyinclude_HEADERS = \ CppAttributes.h \ CpuId.h \ CPortability.h \ + concurrency/CacheLocality.h \ concurrency/CoreCachedSharedPtr.h \ detail/AtomicHashUtils.h \ detail/AtomicUnorderedMapUtils.h \ detail/AtomicUtils.h \ detail/BitIteratorDetail.h \ - detail/CacheLocality.h \ detail/CachelinePaddedImpl.h \ detail/ChecksumDetail.h \ detail/DiscriminatedPtrDetail.h \ @@ -459,7 +459,7 @@ libfolly_la_SOURCES = \ Assume.cpp \ Checksum.cpp \ ClockGettimeWrappers.cpp \ - detail/CacheLocality.cpp \ + concurrency/CacheLocality.cpp \ detail/IPAddress.cpp \ dynamic.cpp \ ExceptionWrapper.cpp \ diff --git a/folly/ProducerConsumerQueue.h b/folly/ProducerConsumerQueue.h index d0bf3ec8..12f2bf42 100644 
--- a/folly/ProducerConsumerQueue.h +++ b/folly/ProducerConsumerQueue.h @@ -27,7 +27,7 @@ #include #include -#include +#include namespace folly { @@ -168,14 +168,14 @@ struct ProducerConsumerQueue { } private: - char pad0_[detail::CacheLocality::kFalseSharingRange]; - const uint32_t size_; - T* const records_; + char pad0_[CacheLocality::kFalseSharingRange]; + const uint32_t size_; + T* const records_; - FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic readIndex_; - FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic writeIndex_; + FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic readIndex_; + FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic writeIndex_; - char pad1_[detail::CacheLocality::kFalseSharingRange - sizeof(writeIndex_)]; + char pad1_[CacheLocality::kFalseSharingRange - sizeof(writeIndex_)]; }; } diff --git a/folly/SharedMutex.h b/folly/SharedMutex.h index c13a6d6f..24d8051c 100644 --- a/folly/SharedMutex.h +++ b/folly/SharedMutex.h @@ -19,11 +19,13 @@ #pragma once #include + #include #include #include + #include -#include +#include #include #include #include @@ -1417,8 +1419,7 @@ bool SharedMutexImpl:: // starting point for our empty-slot search, can change after // calling waitForZeroBits uint32_t bestSlot = - (uint32_t)folly::detail::AccessSpreader::current( - kMaxDeferredReaders); + (uint32_t)folly::AccessSpreader::current(kMaxDeferredReaders); // deferred readers are already enabled, or it is time to // enable them if we can find a slot diff --git a/folly/TokenBucket.h b/folly/TokenBucket.h index d88bcd86..905b0f9f 100644 --- a/folly/TokenBucket.h +++ b/folly/TokenBucket.h @@ -21,7 +21,7 @@ #include #include -#include +#include namespace folly { diff --git a/folly/detail/CacheLocality.cpp b/folly/concurrency/CacheLocality.cpp similarity index 97% rename from folly/detail/CacheLocality.cpp rename to folly/concurrency/CacheLocality.cpp index d646ebe9..36b77b83 100644 --- a/folly/detail/CacheLocality.cpp +++ b/folly/concurrency/CacheLocality.cpp @@ -14,7 +14,7 @@ 
* limitations under the License. */ -#include +#include #ifndef _MSC_VER #define _GNU_SOURCE 1 // for RTLD_NOLOAD @@ -29,7 +29,6 @@ #include namespace folly { -namespace detail { ///////////// CacheLocality @@ -244,13 +243,13 @@ SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz) SimpleAllocator::~SimpleAllocator() { std::lock_guard g(m_); for (auto& block : blocks_) { - aligned_free(block); + detail::aligned_free(block); } } void* SimpleAllocator::allocateHard() { // Allocate a new slab. - mem_ = static_cast(aligned_malloc(allocSize_, allocSize_)); + mem_ = static_cast(detail::aligned_malloc(allocSize_, allocSize_)); if (!mem_) { std::__throw_bad_alloc(); } @@ -271,5 +270,4 @@ void* SimpleAllocator::allocateHard() { return mem; } -} // namespace detail } // namespace folly diff --git a/folly/detail/CacheLocality.h b/folly/concurrency/CacheLocality.h similarity index 99% rename from folly/detail/CacheLocality.h rename to folly/concurrency/CacheLocality.h index 741d30f8..be9d4410 100644 --- a/folly/detail/CacheLocality.h +++ b/folly/concurrency/CacheLocality.h @@ -38,7 +38,6 @@ #include namespace folly { -namespace detail { // This file contains several classes that might be useful if you are // trying to dynamically optimize cache locality: CacheLocality reads @@ -458,7 +457,8 @@ class CoreAllocator { // Align to a cacheline size = size + (CacheLocality::kFalseSharingRange - 1); size &= ~size_t(CacheLocality::kFalseSharingRange - 1); - void* mem = aligned_malloc(size, CacheLocality::kFalseSharingRange); + void* mem = + detail::aligned_malloc(size, CacheLocality::kFalseSharingRange); if (!mem) { std::__throw_bad_alloc(); } @@ -478,7 +478,7 @@ class CoreAllocator { auto allocator = *static_cast(addr); allocator->deallocate(mem); } else { - aligned_free(mem); + detail::aligned_free(mem); } } }; @@ -507,5 +507,4 @@ StlAllocator::Allocator, T> getCoreAllocatorStl( return StlAllocator::Allocator, T>(alloc); } -} // namespace detail } // namespace folly diff 
--git a/folly/concurrency/CoreCachedSharedPtr.h b/folly/concurrency/CoreCachedSharedPtr.h index 594050b2..ac89ac18 100644 --- a/folly/concurrency/CoreCachedSharedPtr.h +++ b/folly/concurrency/CoreCachedSharedPtr.h @@ -20,7 +20,7 @@ #include #include -#include +#include namespace folly { @@ -46,14 +46,14 @@ class CoreCachedSharedPtr { // prevent false sharing. Their control blocks will be adjacent // thanks to allocate_shared(). for (auto slot : folly::enumerate(slots_)) { - auto alloc = detail::getCoreAllocatorStl(slot.index); + auto alloc = getCoreAllocatorStl(slot.index); auto holder = std::allocate_shared(alloc, p); *slot = std::shared_ptr(holder, p.get()); } } std::shared_ptr get() const { - return slots_[detail::AccessSpreader<>::current(kNumSlots)]; + return slots_[AccessSpreader<>::current(kNumSlots)]; } private: @@ -75,7 +75,7 @@ class CoreCachedWeakPtr { } std::weak_ptr get() const { - return slots_[detail::AccessSpreader<>::current(kNumSlots)]; + return slots_[AccessSpreader<>::current(kNumSlots)]; } private: diff --git a/folly/test/CacheLocalityBenchmark.cpp b/folly/concurrency/test/CacheLocalityBenchmark.cpp similarity index 96% rename from folly/test/CacheLocalityBenchmark.cpp rename to folly/concurrency/test/CacheLocalityBenchmark.cpp index dbb2d6ac..4c6c9ea1 100644 --- a/folly/test/CacheLocalityBenchmark.cpp +++ b/folly/concurrency/test/CacheLocalityBenchmark.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include @@ -24,7 +24,7 @@ #include -using namespace folly::detail; +using namespace folly; #define DECLARE_SPREADER_TAG(tag, locality, func) \ namespace { \ @@ -32,7 +32,6 @@ using namespace folly::detail; struct tag {}; \ } \ namespace folly { \ - namespace detail { \ template <> \ const CacheLocality& CacheLocality::system() { \ static auto* inst = new CacheLocality(locality); \ @@ -42,16 +41,16 @@ using namespace folly::detail; Getcpu::Func AccessSpreader::pickGetcpuFunc() { \ return func; \ } \ - } \ } DECLARE_SPREADER_TAG( ThreadLocalTag, CacheLocality::system<>(), - folly::detail::FallbackGetcpu>::getcpu) -DECLARE_SPREADER_TAG(PthreadSelfTag, - CacheLocality::system<>(), - folly::detail::FallbackGetcpu::getcpu) + folly::FallbackGetcpu>::getcpu) +DECLARE_SPREADER_TAG( + PthreadSelfTag, + CacheLocality::system<>(), + folly::FallbackGetcpu::getcpu) BENCHMARK(AccessSpreaderUse, iters) { for (unsigned long i = 0; i < iters; ++i) { diff --git a/folly/test/CacheLocalityTest.cpp b/folly/concurrency/test/CacheLocalityTest.cpp similarity index 97% rename from folly/test/CacheLocalityTest.cpp rename to folly/concurrency/test/CacheLocalityTest.cpp index cb18f14c..92a5abc6 100644 --- a/folly/test/CacheLocalityTest.cpp +++ b/folly/concurrency/test/CacheLocalityTest.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include @@ -24,7 +24,7 @@ #include #include -using namespace folly::detail; +using namespace folly; /// This is the relevant nodes from a production box's sysfs tree. 
If you /// think this map is ugly you should see the version of this test that @@ -363,13 +363,12 @@ TEST(Getcpu, VdsoGetcpu) { #ifdef FOLLY_TLS TEST(ThreadId, SimpleTls) { unsigned cpu = 0; - auto rv = - folly::detail::FallbackGetcpu>::getcpu( - &cpu, nullptr, nullptr); + auto rv = folly::FallbackGetcpu>::getcpu( + &cpu, nullptr, nullptr); EXPECT_EQ(rv, 0); EXPECT_TRUE(cpu > 0); unsigned again; - folly::detail::FallbackGetcpu>::getcpu( + folly::FallbackGetcpu>::getcpu( &again, nullptr, nullptr); EXPECT_EQ(cpu, again); } @@ -377,13 +376,12 @@ TEST(ThreadId, SimpleTls) { TEST(ThreadId, SimplePthread) { unsigned cpu = 0; - auto rv = folly::detail::FallbackGetcpu::getcpu( - &cpu, nullptr, nullptr); + auto rv = + folly::FallbackGetcpu::getcpu(&cpu, nullptr, nullptr); EXPECT_EQ(rv, 0); EXPECT_TRUE(cpu > 0); unsigned again; - folly::detail::FallbackGetcpu::getcpu( - &again, nullptr, nullptr); + folly::FallbackGetcpu::getcpu(&again, nullptr, nullptr); EXPECT_EQ(cpu, again); } @@ -414,7 +412,6 @@ TEST(AccessSpreader, Simple) { struct tag {}; \ } \ namespace folly { \ - namespace detail { \ template <> \ const CacheLocality& CacheLocality::system() { \ static auto* inst = new CacheLocality(locality); \ @@ -424,7 +421,6 @@ TEST(AccessSpreader, Simple) { Getcpu::Func AccessSpreader::pickGetcpuFunc() { \ return func; \ } \ - } \ } DECLARE_SPREADER_TAG(ManualTag, CacheLocality::uniform(16), testingGetcpu) diff --git a/folly/detail/CachelinePaddedImpl.h b/folly/detail/CachelinePaddedImpl.h index e65a6ce6..1acce99d 100644 --- a/folly/detail/CachelinePaddedImpl.h +++ b/folly/detail/CachelinePaddedImpl.h @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace folly { @@ -33,7 +33,7 @@ struct CachelinePaddedImpl; // We need alignas(T) alignas(kFalseSharingRange) for the case where alignof(T) // > alignof(kFalseSharingRange). 
template -struct alignas(T) alignas(detail::CacheLocality::kFalseSharingRange) +struct alignas(T) alignas(CacheLocality::kFalseSharingRange) CachelinePaddedImpl { template explicit CachelinePaddedImpl(Args&&... args) @@ -42,7 +42,7 @@ struct alignas(T) alignas(detail::CacheLocality::kFalseSharingRange) }; template -struct alignas(T) alignas(detail::CacheLocality::kFalseSharingRange) +struct alignas(T) alignas(CacheLocality::kFalseSharingRange) CachelinePaddedImpl { template explicit CachelinePaddedImpl(Args&&... args) diff --git a/folly/detail/MemoryIdler.cpp b/folly/detail/MemoryIdler.cpp index 608e244b..70c4c055 100644 --- a/folly/detail/MemoryIdler.cpp +++ b/folly/detail/MemoryIdler.cpp @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/folly/experimental/flat_combining/FlatCombining.h b/folly/experimental/flat_combining/FlatCombining.h index 146f1748..1ddc0c08 100644 --- a/folly/experimental/flat_combining/FlatCombining.h +++ b/folly/experimental/flat_combining/FlatCombining.h @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/folly/test/CachelinePaddedTest.cpp b/folly/test/CachelinePaddedTest.cpp index 02fea430..2fe69e88 100644 --- a/folly/test/CachelinePaddedTest.cpp +++ b/folly/test/CachelinePaddedTest.cpp @@ -26,7 +26,7 @@ static_assert( std::is_standard_layout>::value, "CachelinePadded must be standard-layout if T is."); -const int kCachelineSize = folly::detail::CacheLocality::kFalseSharingRange; +const int kCachelineSize = folly::CacheLocality::kFalseSharingRange; template struct SizedData { diff --git a/folly/test/DeterministicSchedule.cpp b/folly/test/DeterministicSchedule.cpp index 87ecb762..1272c42a 100644 --- a/folly/test/DeterministicSchedule.cpp +++ b/folly/test/DeterministicSchedule.cpp @@ -382,6 +382,7 @@ int Futex::futexWake(int count, uint32_t wakeMask) { DeterministicSchedule::afterSharedAccess(); return rv; } +} template <> CacheLocality const& 
CacheLocality::system() { @@ -391,7 +392,6 @@ CacheLocality const& CacheLocality::system() { template <> Getcpu::Func AccessSpreader::pickGetcpuFunc() { - return &DeterministicSchedule::getcpu; -} + return &detail::DeterministicSchedule::getcpu; } } diff --git a/folly/test/DeterministicSchedule.h b/folly/test/DeterministicSchedule.h index 5ec444e6..8f1f4283 100644 --- a/folly/test/DeterministicSchedule.h +++ b/folly/test/DeterministicSchedule.h @@ -28,8 +28,8 @@ #include #include +#include #include -#include #include #include @@ -499,8 +499,9 @@ FutexResult Futex::futexWaitImpl( std::chrono::time_point* absSystemTime, std::chrono::time_point* absSteadyTime, uint32_t waitMask); +} template <> Getcpu::Func AccessSpreader::pickGetcpuFunc(); -} + } // namespace folly::detail -- 2.34.1