2 * Copyright 2017-present Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
22 #include <folly/Optional.h>
23 #include <folly/detail/TurnSequencer.h>
24 #include <folly/executors/QueuedImmediateExecutor.h>
25 #include <folly/synchronization/detail/ThreadCachedInts.h>
26 #include <folly/synchronization/detail/ThreadCachedLists.h>
28 // Implementation of proposed RCU C++ API
29 // http://open-std.org/JTC1/SC22/WG21/docs/papers/2017/p0566r3.pdf
33 // This file provides a low-overhead mechanism to guarantee ordering
34 // between operations on shared data. In the simplest usage pattern,
35 // readers enter a critical section, view some state, and leave the
36 // critical section, while writers modify shared state and then defer
37 // some cleanup operations. Proper use of these classes will guarantee
38 // that a cleanup operation that is deferred during a reader critical
39 // section will not be executed until after that critical section is
44 // As an example, suppose we have some configuration data that gets
45 // periodically updated. We need to protect ourselves on *every* read
46 // path, even if updates are only vanishingly rare, because we don't know
47 // if some writer will come along and yank the data out from under us.
49 // Here's how that might look without deferral:
51 // void doSomething(IPAddress host);
53 // folly::SharedMutex sm;
54 // ConfigData* globalConfigData;
59 // IPAddress curManagementServer = globalConfigData->managementServerIP;
60 // sm.unlock_shared();
61 // doSomethingWith(curManagementServer);
67 // std::this_thread::sleep_for(std::chrono::seconds(60));
68 // ConfigData* oldConfigData = globalConfigData;
69 // ConfigData* newConfigData = loadConfigDataFromRemoteServer();
71 // globalConfigData = newConfigData;
73 // delete oldConfigData;
77 // The readers have to lock_shared and unlock_shared every iteration, even
78 // during the overwhelming majority of the time in which there's no writer
79 // present. These functions are surprisingly expensive; there's around 30ns of
80 // overhead per critical section on my machine.
82 // Let's get rid of the locking. The readers and writer may proceed
83 // concurrently. Here's how this would look:
85 // void doSomething(IPAddress host);
87 // std::atomic<ConfigData*> globalConfigData;
91 // ConfigData* configData = globalConfigData.load();
92 // IPAddress curManagementServer = configData->managementServerIP;
93 // doSomethingWith(curManagementServer);
99 // std::this_thread::sleep_for(std::chrono::seconds(60));
100 // ConfigData* newConfigData = loadConfigDataFromRemoteServer();
101 // globalConfigData.store(newConfigData);
102 // // We can't delete the old ConfigData; we don't know when the readers
103 // // will be done with it! I guess we have to leak it...
107 // This works and is fast, but we don't ever reclaim the memory we
108 // allocated for the copy of the data. We need a way for the writer to
109 // know that no readers observed the old value of the pointer and are
110 // still using it. Tweaking this slightly, we want a way for the
111 // writer to say "I want some operation (deleting the old ConfigData)
112 // to happen, but only after I know that all readers that started
113 // before I requested the action have finished.". The classes in this
114 // namespace allow this. Here's how we'd express this idea:
116 // void doSomething(IPAddress host);
117 // std::atomic<ConfigData*> globalConfigData;
122 // IPAddress curManagementServer;
124 // // We're about to do some reads we want to protect; if we read a
125 // // pointer, we need to make sure that if the writer comes along and
126 // // updates it, the writer's cleanup operation won't happen until we're
127 // // done accessing the pointed-to data. We get a Guard on that
128 // // domain; as long as it exists, no function subsequently passed to
129 // // invokeEventually will execute.
131 // ConfigData* configData = globalConfigData.load();
132 // // We created a guard before we read globalConfigData; we know that the
133 // // pointer will remain valid until the guard is destroyed.
134 // curManagementServer = configData->managementServerIP;
135 // // Guard is released here; retired objects may be freed.
137 // doSomethingWith(curManagementServer);
144 // std::this_thread::sleep_for(std::chrono::seconds(60));
145 // ConfigData* oldConfigData = globalConfigData.load();
146 // ConfigData* newConfigData = loadConfigDataFromRemoteServer();
147 // globalConfigData.store(newConfigData);
148 // rcu_retire(oldConfigData);
149 // // alternatively, in a blocking manner:
150 // // synchronize_rcu();
151 // // delete oldConfigData;
155 // This gets us close to the speed of the second solution, without
156 // leaking memory. A rcu_reader costs about 4 ns, faster than the
157 // lock_shared() / unlock_shared() pair in the more traditional
158 // mutex-based approach from our first attempt, and the writer
159 // never blocks the readers.
163 // This implementation does implement an rcu_domain, and provides a default
164 // one for use per the standard implementation.
167 // A "universe" of deferred execution. Each rcu_domain has an
168 // executor on which deferred functions may execute. Readers obtain
169 // Tokens from an rcu_domain and release them back to it.
170 // rcu_domains should in general be completely separated; it's never
171 // correct to pass a token from one domain to another, and
172 // rcu_reader objects created on one domain do not prevent functions
173 // deferred on other domains from running. It's intended that most
174 // callers should only ever use the default, global domain.
176 // Creation of a domain takes a template tag argument, which
177 // defaults to void. To access different domains, you have to pass a
178 // different tag. The global domain is preferred for almost all
179 // purposes, unless a different executor is required.
181 // You should use a custom rcu_domain if you can't avoid sleeping
182 // during reader critical sections (because you don't want to block
183 // all other users of the domain while you sleep), or you want to change
184 // the default executor type.
186 // API correctness limitations:
188 // In short, nothing about this is exception safe. retire functions should
189 // not throw exceptions in their destructors, move constructors or call operators.
192 // Performance limitations:
194 // A blocked reader will block invocation of deferred functions until it
195 // becomes unblocked. Sleeping while holding a rcu_reader can have bad
196 // performance consequences.
198 // API questions you might have:
199 // - Nested critical sections:
200 // These are fine. The following is explicitly allowed by the standard, up to
201 // a nesting level of 100:
202 // rcu_reader reader1;
204 // rcu_reader reader2;
206 // - Restrictions on retired()ed functions:
207 // Any operation is safe from within a retired function's
208 // execution; you can retire additional functions or add a new domain call to
210 // - rcu_domain destruction:
211 // Destruction of a domain assumes previous synchronization: all remaining
212 // call and retire calls are immediately added to the executor.
216 // Deferral latency is as low as is practical: overhead involves running
217 // several global memory barriers on the machine to ensure all readers are gone.
219 // Currently use of MPMCQueue is the bottleneck for deferred calls, more
220 // specialized queues could be used if available, since only a single reader
221 // reads the queue, and can splice all of the items to the executor if possible.
223 // synchronize_rcu() call latency is on the order of 10ms. Multiple
224 // separate threads can share a synchronized period and should scale.
226 // rcu_retire() is a queue push, and on the order of 150 ns, however,
227 // the current implementation may synchronize if the retire queue is full,
228 // resulting in tail latencies of ~10ms.
230 // rcu_reader creation/destruction is ~4ns. By comparison,
231 // folly::SharedMutex::lock_shared + unlock_shared pair is ~26ns
233 // Hazard pointers vs. RCU:
235 // Hazard pointers protect pointers, generally malloc()d pieces of memory, and
236 // each hazptr only protects one such block.
238 // RCU protects critical sections, *all* memory is protected as long
239 // as the critical section is active.
241 // For example, this has implications for linked lists: If you wish to
242 // free an entire linked list, you simply rcu_retire() each node with
243 // RCU: readers will see either an entirely valid list, or no list at
246 // Conversely with hazptrs, generally lists are walked with
247 // hand-over-hand hazptrs. Only one node is protected at a time. So
248 // anywhere in the middle of the list, hazptrs may read NULL, and throw
249 // away all current work. Alternatively, reference counting can be used,
250 // (as if each node was a shared_ptr<node>), so that readers will always see
251 // *the remaining part* of the list as valid, however parts before its current
252 // hazptr may be freed.
254 // So roughly: RCU is simple, but an all-or-nothing affair. A single rcu_reader
255 // can block all reclamation. Hazptrs will reclaim exactly as much as possible,
256 // at the cost of extra work writing traversal code
259 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0461r1.pdf
261 // Reference Counting RCU Hazptr
263 // Unreclaimed objects Bounded Unbounded Bounded
265 // Contention among readers High None None
267 // Traversal forward progress lock-free wait-free lock-free
269 // Reclamation forward progress lock-free blocking wait-free
271 // Traversal speed atomic no-overhead folly's is
274 // Reference acquisition unconditional unconditional conditional
276 // Automatic reclamation yes no no
278 // Purpose of domains N/A isolate slow configuration
285 template <typename Tag = RcuTag>
288 // Opaque token used to match up lock_shared() and unlock_shared()
// NOTE(review): the enclosing class header and access specifiers are not
// visible in this chunk; the comments below describe only what these
// declarations themselves show.
// Wraps the reader epoch recorded when lock_shared() was called.
292 rcu_token(uint64_t epoch) : epoch_(epoch) {}
294 ~rcu_token() = default;
// Move-only: a token represents one outstanding reader critical section,
// so allowing copies would permit a double unlock_shared().
296 rcu_token(const rcu_token&) = delete;
297 rcu_token(rcu_token&& other) = default;
298 rcu_token& operator=(const rcu_token& other) = delete;
299 rcu_token& operator=(rcu_token&& other) = default;
// rcu_domain (for any Tag) may construct tokens and read epoch_.
302 template <typename Tag>
303 friend class rcu_domain;
307 // For most usages, rcu_domain is unnecessary, and you can use
308 // rcu_reader and rcu_retire/synchronize_rcu directly.
309 template <typename Tag>
// List types used for the queues of deferred callbacks below.
311 using list_head = typename detail::ThreadCachedLists<Tag>::ListHead;
312 using list_node = typename detail::ThreadCachedLists<Tag>::Node;
316  * If an executor is passed, it is used to run calls and delete
319 rcu_domain(Executor* executor = nullptr) noexcept;
// Domains are neither copyable nor movable: readers and queued callbacks
// refer to a fixed domain instance for its whole lifetime.
321 rcu_domain(const rcu_domain&) = delete;
322 rcu_domain(rcu_domain&&) = delete;
323 rcu_domain& operator=(const rcu_domain&) = delete;
324 rcu_domain& operator=(rcu_domain&&) = delete;
327 // Reader locks: Prevent any calls from occurring, retired memory
328 // from being freed, and synchronize() calls from completing until
329 // all preceding lock_shared() sections are finished.
331 // Note: can potentially allocate on thread first use.
332 FOLLY_ALWAYS_INLINE rcu_token lock_shared();
333 FOLLY_ALWAYS_INLINE void unlock_shared(rcu_token&&);
335 // Call a function after concurrent critical sections have finished.
336 // Does not block unless the queue is full, then may block to wait
337 // for scheduler thread, but generally does not wait for full
// synchronization.
339 template <typename T>
// Retire a pre-allocated list node (used by rcu_obj_base::retire(),
// which embeds the node in the object itself, avoiding an allocation).
341 void retire(list_node* node) noexcept;
343 // Ensure concurrent critical sections have finished.
344 // Always waits for full synchronization.
345 // read lock *must not* be held.
346 void synchronize() noexcept;
// Per-thread reader counters used to detect when readers have drained.
349 detail::ThreadCachedInts<Tag> counters_;
// Current epoch; advanced as synchronization rounds complete.
351 std::atomic<uint64_t> version_{0};
352 // Future epochs being driven by threads in synchronize
353 std::atomic<uint64_t> work_{0};
354 // Matches version_, for waking up threads in synchronize that are
// waiting for an earlier epoch to complete.
356 detail::TurnSequencer<std::atomic> turn_;
357 // Only one thread can drive half_sync.
358 std::mutex syncMutex_;
359 // Currently half_sync takes ~16ms due to heavy barriers.
360 // Ensure that if used in a single thread, max GC time
361 // is limited to 1% of total CPU time.
362 static constexpr uint64_t syncTimePeriod_{1600 * 2 /* full sync is 2x */};
363 std::atomic<uint64_t> syncTime_{0};
364 // call()s waiting to move through two epochs.
365 detail::ThreadCachedLists<Tag> q_;
366 // Executor callbacks will eventually be run on.
367 Executor* executor_{nullptr};
368 static bool singleton_; // Ensure uniqueness per-tag.
370 // Queues for callbacks waiting to go through two epochs.
371 list_head queues_[2]{};
373 // Move queues through one epoch (half of a full synchronize()).
374 // Will block waiting for readers to exit if blocking is true.
375 // blocking must *not* be true if the current thread is locked,
// i.e. inside a reader critical section, or deadlock will occur.
378 // returns a list of callbacks ready to run in cbs.
379 void half_sync(bool blocking, list_head& cbs);
// The process-wide default domain, shared by rcu_reader, rcu_retire(),
// synchronize_rcu() and rcu_barrier() below.
382 extern rcu_domain<RcuTag> rcu_default_domain_;
// Accessor for the default domain; returns a pointer to the singleton.
384 inline rcu_domain<RcuTag>* rcu_default_domain() {
385 return &rcu_default_domain_;
388 // Main reader guard class.
// RAII guard: while one is held, the default domain's reader critical
// section stays open and retired objects cannot be reclaimed.
391 FOLLY_ALWAYS_INLINE rcu_reader() noexcept
392 : epoch_(rcu_default_domain()->lock_shared()) {}
// Deferred construction: takes no lock; call lock() later.
393 rcu_reader(std::defer_lock_t) noexcept {}
394 rcu_reader(const rcu_reader&) = delete;
// Move transfers ownership of the (possibly empty) token.
395 rcu_reader(rcu_reader&& other) noexcept : epoch_(std::move(other.epoch_)) {}
396 rcu_reader& operator=(const rcu_reader&) = delete;
397 rcu_reader& operator=(rcu_reader&& other) noexcept {
// End our own critical section (if any) before taking over other's.
398 if (epoch_.has_value()) {
399 rcu_default_domain()->unlock_shared(std::move(epoch_.value()));
401 epoch_ = std::move(other.epoch_);
// Destructor ends the critical section if one is still held.
405 FOLLY_ALWAYS_INLINE ~rcu_reader() noexcept {
406 if (epoch_.has_value()) {
407 rcu_default_domain()->unlock_shared(std::move(epoch_.value()));
411 void swap(rcu_reader& other) noexcept {
412 std::swap(epoch_, other.epoch_);
// Manually begin a critical section; must not already hold one.
415 FOLLY_ALWAYS_INLINE void lock() noexcept {
416 DCHECK(!epoch_.has_value());
417 epoch_ = rcu_default_domain()->lock_shared();
// Manually end the critical section; must currently hold one.
420 FOLLY_ALWAYS_INLINE void unlock() noexcept {
421 DCHECK(epoch_.has_value());
422 rcu_default_domain()->unlock_shared(std::move(epoch_.value()));
// Empty when no reader critical section is currently held.
426 Optional<rcu_token> epoch_;
// ADL swap for rcu_reader guards.
429 inline void swap(rcu_reader& a, rcu_reader& b) noexcept {
// Block until all reader critical sections concurrent with this call
// have finished (on the default domain).
433 inline void synchronize_rcu() noexcept {
434 rcu_default_domain()->synchronize();
// NOTE(review): implemented here as a plain synchronize(); a barrier is
// presumably also meant to drain previously queued retire callbacks --
// confirm against the implementation in Rcu-inl.h.
437 inline void rcu_barrier() noexcept {
438 rcu_default_domain()->synchronize();
441 // Free-function retire. Always allocates.
// Schedules d(p) (by default, delete p) to run on the default domain
// once all reader sections concurrent with this call have finished.
442 template <typename T, typename D = std::default_delete<T>>
443 void rcu_retire(T* p, D d = {}) {
444 rcu_default_domain()->call([p, del = std::move(d)]() { del(p); });
447 // Base class for rcu objects. retire() will use preallocated storage
448 // from rcu_obj_base, vs. rcu_retire() which always allocates.
449 template <typename T, typename D = std::default_delete<T>>
450 class rcu_obj_base : detail::ThreadCachedListsBase::Node {
// Schedule deleter d (by default, delete this) to run after all
// concurrent reader critical sections have finished. Uses the list
// node embedded in this base class, so no separate node allocation.
452 void retire(D d = {}) {
453 // This implementation assumes folly::Function has enough
454 // inline storage for D, otherwise, it allocates.
455 this->cb_ = [this, d = std::move(d)]() { d(static_cast<T*>(this)); };
456 rcu_default_domain()->retire(this);
462 #include <folly/synchronization/Rcu-inl.h>