folly/test/ReadMostlySharedPtrBenchmark.cpp

   1 /*
   2  * Copyright 2015 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16 /* -*- Mode: C++; tab-width: 2; c-basic-offset: 2; indent-tabs-mode: nil -*- */
  17
  18 #include <thread>
  19 #include <iostream>
  20 #include <folly/Benchmark.h>
  21 #include <folly/Memory.h>
  22 #include <gflags/gflags.h>
  23
  24 #include <folly/ReadMostlySharedPtr.h>
  25
  26 /**
  27  * @file Benchmark comparing three implementations of ReadMostlySharedPtr.
  28  *
  29  *       Run with something like --bm_min_usec=100000.
  30  */
  31
  32 namespace slow {
  33
  34 // An implementation with thread local cache of shared_ptrs.
  35 template<typename T>
  36 class ReadMostlySharedPtr : boost::noncopyable {
  37  public:
  38   explicit ReadMostlySharedPtr(std::shared_ptr<T> ptr = nullptr) {
  39     master_.ptr = std::move(ptr);
  40     master_.version.store(1);
  41   }
  42
  43   std::shared_ptr<T> store(std::shared_ptr<T> ptr) {
  44     std::lock_guard<std::mutex> guard(mutex_);
  45     std::swap(master_.ptr, ptr);
  46     master_.version.fetch_add(1);
  47     return ptr;
  48   }
  49
  50   std::shared_ptr<T> load() const {
  51     // We are the only thread accessing threadLocalCache_->version so it is
  52     // fine to use memory_order_relaxed
  53     auto local_version =
  54       threadLocalCache_->version.load(std::memory_order_relaxed);
  55     if (local_version != master_.version.load()) {
  56       std::lock_guard<std::mutex> guard(mutex_);
  57       threadLocalCache_->ptr = master_.ptr;
  58       threadLocalCache_->version.store(master_.version.load(),
  59                                        std::memory_order_relaxed);
  60     }
  61     return threadLocalCache_->ptr;
  62   }
  63
  64  private:
  65   struct VersionedPointer : boost::noncopyable {
  66     VersionedPointer() : version(0) { }
  67     std::shared_ptr<T> ptr;
  68     std::atomic<uint64_t> version;
  69   };
  70
  71   folly::ThreadLocal<VersionedPointer> threadLocalCache_;
  72   VersionedPointer master_;
  73
  74   // Ensures safety between concurrent store() and load() calls
  75   mutable std::mutex mutex_;
  76 };
  77
  78 }
  79
  80
  81 /**
  82  * At the moment the fastest implementation in this benchmark.
  83  * A real RCU implementation would most likely be significantly better.
  84  */
  85 namespace fast {
  86
  87 /**
  88  * Contains a version number and a shared_ptr that points to the most recent
  89  * object. The load() method uses thread-local storage to efficiently return
  90  * the current pointer without locking when the pointer has not changed.
  91  * The version of the pointer in thread-local cache is compared to the
  92  * master version. If the master is found to be newer, it is copied into
  93  * the thread-local cache under a lock. The store() method grabs the lock,
  94  * updates the master pointer and bumps the version number.
  95  *
  96  * The downside is that it doesn't clear or update thread-local cache
  97  * when updating the pointer. This means that old instances of T can stay
  98  * alive in thread-local cache indefinitely if load() is not called from
  99  * some threads.
 100  */
 101 template<typename T>
 102 class ReadMostlySharedPtr : boost::noncopyable {
 103  public:
 104   explicit ReadMostlySharedPtr(std::shared_ptr<T> ptr = nullptr) {
 105     masterPtr_ = std::move(ptr);
 106     masterVersion_.store(1);
 107   }
 108
 109   /**
 110    * Replaces the managed object.
 111    */
 112   void store(std::shared_ptr<T> ptr) {
 113     {
 114       std::lock_guard<std::mutex> guard(mutex_);
 115       // Swap to avoid calling ~T() under the lock
 116       std::swap(masterPtr_, ptr);
 117       masterVersion_.fetch_add(1);
 118     }
 119   }
 120
 121   /**
 122    * Returns a shared_ptr to the managed object.
 123    */
 124   std::shared_ptr<T> load() const {
 125     auto& local = *threadLocalCache_;
 126     if (local.version != masterVersion_.load()) {
 127       std::lock_guard<std::mutex> guard(mutex_);
 128
 129       if (!masterPtr_) {
 130         local.ptr = nullptr;
 131       } else {
 132         // The following expression is tricky.
 133         //
 134         // It creates a shared_ptr<shared_ptr<T>> that points to a copy of
 135         // masterPtr_. The reference counter of this shared_ptr<shared_ptr<T>>
 136         // will normally only be modified from this thread, which avoids
 137         // cache line bouncing. (Though the caller is free to pass the pointer
 138         // to other threads and bump reference counter from there)
 139         //
 140         // Then this shared_ptr<shared_ptr<T>> is turned into shared_ptr<T>.
 141         // This means that the returned shared_ptr<T> will internally point to
 142         // control block of the shared_ptr<shared_ptr<T>>, but will dereference
 143         // to T, not shared_ptr<T>.
 144         local.ptr = std::shared_ptr<T>(
 145           std::make_shared<std::shared_ptr<T>>(masterPtr_),
 146           masterPtr_.get());
 147       }
 148
 149       local.version = masterVersion_.load();
 150     }
 151     return local.ptr;
 152   }
 153
 154  private:
 155   struct VersionedPointer : boost::noncopyable {
 156     VersionedPointer() { }
 157     std::shared_ptr<T> ptr;
 158     uint64_t version = 0;
 159   };
 160
 161   folly::ThreadLocal<VersionedPointer> threadLocalCache_;
 162
 163   std::shared_ptr<T> masterPtr_;
 164   std::atomic<uint64_t> masterVersion_;
 165
 166   // Ensures safety between concurrent store() and load() calls
 167   mutable std::mutex mutex_;
 168 };
 169
 170 }
 171
 172
 173 template<class PtrInt>
 174 void benchReads(int n) {
 175   PtrInt ptr(folly::make_unique<int>(42));
 176   for (int i = 0; i < n; ++i) {
 177     auto val = ptr.load();
 178     folly::doNotOptimizeAway(val.get());
 179   }
 180 }
 181
 182 template<class PtrInt>
 183 void benchWrites(int n) {
 184   PtrInt ptr;
 185   for (int i = 0; i < n; ++i) {
 186     ptr.store(folly::make_unique<int>(3));
 187   }
 188 }
 189
 190 template<class PtrInt>
 191 void benchReadsWhenWriting(int n) {
 192   PtrInt ptr;
 193   std::atomic<bool> shutdown {false};
 194   std::thread writing_thread;
 195
 196   BENCHMARK_SUSPEND {
 197     writing_thread = std::thread([&] {
 198       for (uint64_t i = 0; !shutdown.load(); ++i) {
 199         ptr.store(folly::make_unique<int>(3));
 200       }
 201     });
 202   }
 203
 204   for (uint64_t i = 0; i < n; ++i) {
 205     auto val = ptr.load();
 206     folly::doNotOptimizeAway(val.get());
 207   }
 208
 209   BENCHMARK_SUSPEND {
 210     shutdown.store(true);
 211     writing_thread.join();
 212   }
 213 }
 214
 215
 216 template<class PtrInt>
 217 void benchWritesWhenReading(int n) {
 218   PtrInt ptr;
 219   std::atomic<bool> shutdown {false};
 220   std::thread reading_thread;
 221
 222   BENCHMARK_SUSPEND {
 223     reading_thread = std::thread([&] {
 224       for (uint64_t i = 0; !shutdown.load(); ++i) {
 225         auto val = ptr.load();
 226         folly::doNotOptimizeAway(val.get());
 227       }
 228     });
 229   }
 230
 231
 232   for (uint64_t i = 0; i < n; ++i) {
 233     ptr.store(folly::make_unique<int>(3));
 234   }
 235
 236   BENCHMARK_SUSPEND {
 237     shutdown.store(true);
 238     reading_thread.join();
 239   }
 240 }
 241
 242
 243 template<class PtrInt>
 244 void benchReadsIn10Threads(int n) {
 245   PtrInt ptr(folly::make_unique<int>(27));
 246   std::vector<std::thread> threads(10);
 247   int n_per_thread = n;
 248
 249   for (std::thread& t: threads) {
 250     t = std::thread([&] {
 251       for (int i = 0; i < n; ++i) {
 252         auto val = ptr.load();
 253         folly::doNotOptimizeAway(val.get());
 254       }
 255     });
 256   }
 257
 258   for (std::thread& t: threads) {
 259     t.join();
 260   }
 261 }
 262
 263
 264 #define BENCH(name)                                               \
 265   BENCHMARK(name ## _Slow, n) {                                   \
 266     bench ## name <slow::ReadMostlySharedPtr<int>>(n);      \
 267   }                                                               \
 268   BENCHMARK(name ## _ReadMostlySharedPtr, n) {              \
 269     bench ## name <folly::ReadMostlySharedPtr<int, int>>(n);\
 270   }                                                               \
 271   BENCHMARK(name ## _FastReadMostlySharedPtr, n) {          \
 272     bench ## name <fast::ReadMostlySharedPtr<int>>(n);      \
 273   }                                                               \
 274   BENCHMARK_DRAW_LINE();
 275
 276
 277 BENCH(Reads)
 278 BENCH(Writes)
 279 BENCH(ReadsWhenWriting)
 280 BENCH(WritesWhenReading)
 281 BENCH(ReadsIn10Threads)
 282
 283 int main(int argc, char** argv) {
 284   gflags::ParseCommandLineFlags(&argc, &argv, true);
 285   gflags::SetCommandLineOptionWithMode(
 286     "bm_min_usec", "100000", gflags::SET_FLAG_IF_DEFAULT
 287   );
 288
 289   folly::runBenchmarks();
 290
 291   return 0;
 292 }
 293
 294 /*
 295 ============================================================================
 296 folly/test/ReadMostlySharedPtrBenchmark.cpp     relative  time/iter  iters/s
 297 ============================================================================
 298 Reads_Slow                                                  21.05ns   47.52M
 299 Reads_ReadMostlySharedPtr                                   30.57ns   32.71M
 300 Reads_FastReadMostlySharedPtr                               21.24ns   47.09M
 301 ----------------------------------------------------------------------------
 302 Writes_Slow                                                117.52ns    8.51M
 303 Writes_ReadMostlySharedPtr                                 145.26ns    6.88M
 304 Writes_FastReadMostlySharedPtr                             116.26ns    8.60M
 305 ----------------------------------------------------------------------------
 306 ReadsWhenWriting_Slow                                       56.18ns   17.80M
 307 ReadsWhenWriting_ReadMostlySharedPtr                       141.32ns    7.08M
 308 ReadsWhenWriting_FastReadMostlySharedPtr                    51.82ns   19.30M
 309 ----------------------------------------------------------------------------
 310 WritesWhenReading_Slow                                     828.32ns    1.21M
 311 WritesWhenReading_ReadMostlySharedPtr                        3.00us  333.63K
 312 WritesWhenReading_FastReadMostlySharedPtr                  677.28ns    1.48M
 313 ----------------------------------------------------------------------------
 314 ReadsIn10Threads_Slow                                      509.37ns    1.96M
 315 ReadsIn10Threads_ReadMostlySharedPtr                        34.33ns   29.13M
 316 ReadsIn10Threads_FastReadMostlySharedPtr                    26.31ns   38.00M
 317 ----------------------------------------------------------------------------
 318 ============================================================================
 319 */