Add memory padding and alignment to prevent false sharing
author Artem Lantsev <lantsev@fb.com>
Wed, 12 Apr 2017 09:16:51 +0000 (02:16 -0700)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Wed, 12 Apr 2017 09:20:08 +0000 (02:20 -0700)
Summary: This is a follow-up commit to the discussion in https://github.com/facebook/folly/pull/378

Reviewed By: nbronson

Differential Revision: D4860356

fbshipit-source-id: f10a0d12a593c18b1abf94da5b477c524c04f4be

folly/ProducerConsumerQueue.h
folly/test/ProducerConsumerQueueBenchmark.cpp

index 053c0e2..9ebe65a 100644 (file)
@@ -27,6 +27,8 @@
 #include <type_traits>
 #include <utility>
 
+#include <folly/detail/CacheLocality.h>
+
 namespace folly {
 
 /*
@@ -166,11 +168,14 @@ struct ProducerConsumerQueue {
   }
 
 private:
+  char pad0_[detail::CacheLocality::kFalseSharingRange];
   const uint32_t size_;
   T* const records_;
 
-  std::atomic<unsigned int> readIndex_;
-  std::atomic<unsigned int> writeIndex_;
+  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> readIndex_;
+  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> writeIndex_;
+
+  char pad1_[detail::CacheLocality::kFalseSharingRange - sizeof(writeIndex_)];
 };
 
 }
index e807968..d37ac91 100644 (file)
@@ -79,7 +79,7 @@ struct ThroughputTest {
 
   QueueType queue_;
   std::atomic<bool> done_;
-  int iters_;
+  const int iters_;
   int cpu0_;
   int cpu1_;
 };
@@ -176,7 +176,7 @@ struct LatencyTest {
   QueueType queue_;
   std::atomic<bool> done_;
   int time_cost_;
-  int iters_;
+  const int iters_;
   int cpu0_;
   int cpu1_;
   Histogram<int> hist_;
@@ -250,26 +250,53 @@ int main(int argc, char** argv) {
 
 #if 0
 /*
-Benchmark on Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz
-Latency histogram:
-  log(nsec)
-  min  max     count
-  6    7       5124
-  7    8       4799
-  8    9       49
-  9    10      2
-  10   11      1
-  11   12      5
-  12   13      3
-  13   14      9
-  14   15      8
+Benchmark
+
+$ lscpu
+Architecture:          x86_64
+CPU op-mode(s):        32-bit, 64-bit
+Byte Order:            Little Endian
+CPU(s):                24
+On-line CPU(s) list:   0-23
+Thread(s) per core:    1
+Core(s) per socket:    1
+Socket(s):             24
+NUMA node(s):          1
+Vendor ID:             GenuineIntel
+CPU family:            6
+Model:                 60
+Model name:            Intel Core Processor (Haswell, no TSX)
+Stepping:              1
+CPU MHz:               2494.244
+BogoMIPS:              4988.48
+Hypervisor vendor:     KVM
+Virtualization type:   full
+L1d cache:             32K
+L1i cache:             32K
+L2 cache:              4096K
+NUMA node0 CPU(s):     0-23
+
+$ ../buck-out/gen/folly/test/producer_consumer_queue_benchmark
+5       6       1       5
+6       7       1893    11358
+7       8       39671   277697
+8       9       34921   279368
+9       10      17799   160191
+10      11      3685    36850
+11      12      1075    11825
+12      13      456     5472
+13      14      422     5486
+14      15      64      896
+15      16      7       105
+16      17      3       48
+17      18      3       51
 ============================================================================
 folly/test/ProducerConsumerQueueBenchmark.cpp   relative  time/iter  iters/s
 ============================================================================
 ----------------------------------------------------------------------------
-BM_ProducerConsumer(1048574)                                 7.52ns  132.90M
-BM_ProducerConsumerAffinity(1048574)                         8.28ns  120.75M
-BM_ProducerConsumerLatency(1048574)                          10.00s   99.98m
+BM_ProducerConsumer(1048574)                                 5.82ns  171.75M
+BM_ProducerConsumerAffinity(1048574)                         7.36ns  135.83M
+BM_ProducerConsumerLatency(1048574)                         1.67min    9.99m
 ============================================================================
 */
 #endif