From a9d90ea14f527b0b719a90f014c8c20e13d54866 Mon Sep 17 00:00:00 2001 From: Artem Lantsev Date: Wed, 12 Apr 2017 02:16:51 -0700 Subject: [PATCH] Add memory padding and alignment to prevent false sharing Summary: This is a follow-up commit to the discussion in https://github.com/facebook/folly/pull/378 Reviewed By: nbronson Differential Revision: D4860356 fbshipit-source-id: f10a0d12a593c18b1abf94da5b477c524c04f4be --- folly/ProducerConsumerQueue.h | 9 ++- folly/test/ProducerConsumerQueueBenchmark.cpp | 63 +++++++++++++------ 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/folly/ProducerConsumerQueue.h b/folly/ProducerConsumerQueue.h index 053c0e26..9ebe65ac 100644 --- a/folly/ProducerConsumerQueue.h +++ b/folly/ProducerConsumerQueue.h @@ -27,6 +27,8 @@ #include #include +#include + namespace folly { /* @@ -166,11 +168,14 @@ struct ProducerConsumerQueue { } private: + char pad0_[detail::CacheLocality::kFalseSharingRange]; const uint32_t size_; T* const records_; - std::atomic readIndex_; - std::atomic writeIndex_; + FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic readIndex_; + FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic writeIndex_; + + char pad1_[detail::CacheLocality::kFalseSharingRange - sizeof(writeIndex_)]; }; } diff --git a/folly/test/ProducerConsumerQueueBenchmark.cpp b/folly/test/ProducerConsumerQueueBenchmark.cpp index e8079680..d37ac911 100644 --- a/folly/test/ProducerConsumerQueueBenchmark.cpp +++ b/folly/test/ProducerConsumerQueueBenchmark.cpp @@ -79,7 +79,7 @@ struct ThroughputTest { QueueType queue_; std::atomic done_; - int iters_; + const int iters_; int cpu0_; int cpu1_; }; @@ -176,7 +176,7 @@ struct LatencyTest { QueueType queue_; std::atomic done_; int time_cost_; - int iters_; + const int iters_; int cpu0_; int cpu1_; Histogram hist_; @@ -250,26 +250,53 @@ int main(int argc, char** argv) { #if 0 /* -Benchmark on Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz -Latency histogram: - log(nsec) - min max count - 6 7 5124 - 7 8 4799 - 8 9 49 - 9 
10 2 - 10 11 1 - 11 12 5 - 12 13 3 - 13 14 9 - 14 15 8 +Benchmark + +$ lscpu +Architecture: x86_64 +CPU op-mode(s): 32-bit, 64-bit +Byte Order: Little Endian +CPU(s): 24 +On-line CPU(s) list: 0-23 +Thread(s) per core: 1 +Core(s) per socket: 1 +Socket(s): 24 +NUMA node(s): 1 +Vendor ID: GenuineIntel +CPU family: 6 +Model: 60 +Model name: Intel Core Processor (Haswell, no TSX) +Stepping: 1 +CPU MHz: 2494.244 +BogoMIPS: 4988.48 +Hypervisor vendor: KVM +Virtualization type: full +L1d cache: 32K +L1i cache: 32K +L2 cache: 4096K +NUMA node0 CPU(s): 0-23 + +$ ../buck-out/gen/folly/test/producer_consumer_queue_benchmark +5 6 1 5 +6 7 1893 11358 +7 8 39671 277697 +8 9 34921 279368 +9 10 17799 160191 +10 11 3685 36850 +11 12 1075 11825 +12 13 456 5472 +13 14 422 5486 +14 15 64 896 +15 16 7 105 +16 17 3 48 +17 18 3 51 ============================================================================ folly/test/ProducerConsumerQueueBenchmark.cpp relative time/iter iters/s ============================================================================ ---------------------------------------------------------------------------- -BM_ProducerConsumer(1048574) 7.52ns 132.90M -BM_ProducerConsumerAffinity(1048574) 8.28ns 120.75M -BM_ProducerConsumerLatency(1048574) 10.00s 99.98m +BM_ProducerConsumer(1048574) 5.82ns 171.75M +BM_ProducerConsumerAffinity(1048574) 7.36ns 135.83M +BM_ProducerConsumerLatency(1048574) 1.67min 9.99m ============================================================================ */ #endif -- 2.34.1