2 * Copyright 2015 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 /* -*- Mode: C++; tab-width: 2; c-basic-offset: 2; indent-tabs-mode: nil -*- */
20 #include <folly/Benchmark.h>
21 #include <folly/Memory.h>
22 #include <gflags/gflags.h>
24 #include <folly/ReadMostlySharedPtr.h>
27 * @file Benchmark comparing three implementations of ReadMostlySharedPtr.
29 * Run with something like --bm_min_usec=100000.
34 // An implementation with thread local cache of shared_ptrs.
36 class ReadMostlySharedPtr : boost::noncopyable {
38 explicit ReadMostlySharedPtr(std::shared_ptr<T> ptr = nullptr) {
39 master_.ptr = std::move(ptr);
40 master_.version.store(1);
43 std::shared_ptr<T> store(std::shared_ptr<T> ptr) {
44 std::lock_guard<std::mutex> guard(mutex_);
45 std::swap(master_.ptr, ptr);
46 master_.version.fetch_add(1);
50 std::shared_ptr<T> load() const {
51 // We are the only thread accessing threadLocalCache_->version so it is
52 // fine to use memory_order_relaxed
54 threadLocalCache_->version.load(std::memory_order_relaxed);
55 if (local_version != master_.version.load()) {
56 std::lock_guard<std::mutex> guard(mutex_);
57 threadLocalCache_->ptr = master_.ptr;
58 threadLocalCache_->version.store(master_.version.load(),
59 std::memory_order_relaxed);
61 return threadLocalCache_->ptr;
65 struct VersionedPointer : boost::noncopyable {
66 VersionedPointer() : version(0) { }
67 std::shared_ptr<T> ptr;
68 std::atomic<uint64_t> version;
71 folly::ThreadLocal<VersionedPointer> threadLocalCache_;
72 VersionedPointer master_;
74 // Ensures safety between concurrent store() and load() calls
75 mutable std::mutex mutex_;
82 * At the moment the fastest implementation in this benchmark.
83 * A real RCU implementation would most likely be significantly better.
88 * Contains a version number and a shared_ptr that points to the most recent
89 * object. The load() method uses thread-local storage to efficiently return
90 * the current pointer without locking when the pointer has not changed.
91 * The version of the pointer in thread-local cache is compared to the
92 * master version. If the master is found to be newer, it is copied into
93 * the thread-local cache under a lock. The store() method grabs the lock,
94 * updates the master pointer and bumps the version number.
96 * The downside is that it doesn't clear or update thread-local cache
97 * when updating the pointer. This means that old instances of T can stay
98 * alive in thread-local cache indefinitely if load() is not called from
102 class ReadMostlySharedPtr : boost::noncopyable {
104 explicit ReadMostlySharedPtr(std::shared_ptr<T> ptr = nullptr) {
105 masterPtr_ = std::move(ptr);
106 masterVersion_.store(1);
110 * Replaces the managed object.
112 void store(std::shared_ptr<T> ptr) {
114 std::lock_guard<std::mutex> guard(mutex_);
115 // Swap to avoid calling ~T() under the lock
116 std::swap(masterPtr_, ptr);
117 masterVersion_.fetch_add(1);
122 * Returns a shared_ptr to the managed object.
124 std::shared_ptr<T> load() const {
125 auto& local = *threadLocalCache_;
126 if (local.version != masterVersion_.load()) {
127 std::lock_guard<std::mutex> guard(mutex_);
132 // The following expression is tricky.
134 // It creates a shared_ptr<shared_ptr<T>> that points to a copy of
135 // masterPtr_. The reference counter of this shared_ptr<shared_ptr<T>>
136 // will normally only be modified from this thread, which avoids
137 // cache line bouncing. (Though the caller is free to pass the pointer
138 // to other threads and bump reference counter from there)
140 // Then this shared_ptr<shared_ptr<T>> is turned into shared_ptr<T>.
141 // This means that the returned shared_ptr<T> will internally point to
142 // control block of the shared_ptr<shared_ptr<T>>, but will dereference
143 // to T, not shared_ptr<T>.
144 local.ptr = std::shared_ptr<T>(
145 std::make_shared<std::shared_ptr<T>>(masterPtr_),
149 local.version = masterVersion_.load();
155 struct VersionedPointer : boost::noncopyable {
156 VersionedPointer() { }
157 std::shared_ptr<T> ptr;
158 uint64_t version = 0;
161 folly::ThreadLocal<VersionedPointer> threadLocalCache_;
163 std::shared_ptr<T> masterPtr_;
164 std::atomic<uint64_t> masterVersion_;
166 // Ensures safety between concurrent store() and load() calls
167 mutable std::mutex mutex_;
173 template<class PtrInt>
174 void benchReads(int n) {
175 PtrInt ptr(folly::make_unique<int>(42));
176 for (int i = 0; i < n; ++i) {
177 auto val = ptr.load();
178 folly::doNotOptimizeAway(val.get());
182 template<class PtrInt>
183 void benchWrites(int n) {
185 for (int i = 0; i < n; ++i) {
186 ptr.store(folly::make_unique<int>(3));
190 template<class PtrInt>
191 void benchReadsWhenWriting(int n) {
193 std::atomic<bool> shutdown {false};
194 std::thread writing_thread;
197 writing_thread = std::thread([&] {
198 for (uint64_t i = 0; !shutdown.load(); ++i) {
199 ptr.store(folly::make_unique<int>(3));
204 for (uint64_t i = 0; i < n; ++i) {
205 auto val = ptr.load();
206 folly::doNotOptimizeAway(val.get());
210 shutdown.store(true);
211 writing_thread.join();
216 template<class PtrInt>
217 void benchWritesWhenReading(int n) {
219 std::atomic<bool> shutdown {false};
220 std::thread reading_thread;
223 reading_thread = std::thread([&] {
224 for (uint64_t i = 0; !shutdown.load(); ++i) {
225 auto val = ptr.load();
226 folly::doNotOptimizeAway(val.get());
232 for (uint64_t i = 0; i < n; ++i) {
233 ptr.store(folly::make_unique<int>(3));
237 shutdown.store(true);
238 reading_thread.join();
243 template<class PtrInt>
244 void benchReadsIn10Threads(int n) {
245 PtrInt ptr(folly::make_unique<int>(27));
246 std::vector<std::thread> threads(10);
247 int n_per_thread = n;
249 for (std::thread& t: threads) {
250 t = std::thread([&] {
251 for (int i = 0; i < n; ++i) {
252 auto val = ptr.load();
253 folly::doNotOptimizeAway(val.get());
258 for (std::thread& t: threads) {
264 #define BENCH(name) \
265 BENCHMARK(name ## _Slow, n) { \
266 bench ## name <slow::ReadMostlySharedPtr<int>>(n); \
268 BENCHMARK(name ## _ReadMostlySharedPtr, n) { \
269 bench ## name <folly::ReadMostlySharedPtr<int, int>>(n);\
271 BENCHMARK(name ## _FastReadMostlySharedPtr, n) { \
272 bench ## name <fast::ReadMostlySharedPtr<int>>(n); \
274 BENCHMARK_DRAW_LINE();
279 BENCH(ReadsWhenWriting)
280 BENCH(WritesWhenReading)
281 BENCH(ReadsIn10Threads)
283 int main(int argc, char** argv) {
284 gflags::ParseCommandLineFlags(&argc, &argv, true);
285 gflags::SetCommandLineOptionWithMode(
286 "bm_min_usec", "100000", gflags::SET_FLAG_IF_DEFAULT
289 folly::runBenchmarks();
295 ============================================================================
296 folly/test/ReadMostlySharedPtrBenchmark.cpp relative time/iter iters/s
297 ============================================================================
298 Reads_Slow 21.05ns 47.52M
299 Reads_ReadMostlySharedPtr 30.57ns 32.71M
300 Reads_FastReadMostlySharedPtr 21.24ns 47.09M
301 ----------------------------------------------------------------------------
302 Writes_Slow 117.52ns 8.51M
303 Writes_ReadMostlySharedPtr 145.26ns 6.88M
304 Writes_FastReadMostlySharedPtr 116.26ns 8.60M
305 ----------------------------------------------------------------------------
306 ReadsWhenWriting_Slow 56.18ns 17.80M
307 ReadsWhenWriting_ReadMostlySharedPtr 141.32ns 7.08M
308 ReadsWhenWriting_FastReadMostlySharedPtr 51.82ns 19.30M
309 ----------------------------------------------------------------------------
310 WritesWhenReading_Slow 828.32ns 1.21M
311 WritesWhenReading_ReadMostlySharedPtr 3.00us 333.63K
312 WritesWhenReading_FastReadMostlySharedPtr 677.28ns 1.48M
313 ----------------------------------------------------------------------------
314 ReadsIn10Threads_Slow 509.37ns 1.96M
315 ReadsIn10Threads_ReadMostlySharedPtr 34.33ns 29.13M
316 ReadsIn10Threads_FastReadMostlySharedPtr 26.31ns 38.00M
317 ----------------------------------------------------------------------------
318 ============================================================================