2 * Copyright 2017-present Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
22 #include <folly/Optional.h>
23 #include <folly/detail/TurnSequencer.h>
24 #include <folly/executors/QueuedImmediateExecutor.h>
25 #include <folly/synchronization/detail/ThreadCachedInts.h>
26 #include <folly/synchronization/detail/ThreadCachedLists.h>
28 // Implementation of proposed RCU C++ API
29 // http://open-std.org/JTC1/SC22/WG21/docs/papers/2017/p0566r3.pdf
33 // This file provides a low-overhead mechanism to guarantee ordering
34 // between operations on shared data. In the simplest usage pattern,
35 // readers enter a critical section, view some state, and leave the
36 // critical section, while writers modify shared state and then defer
37 // some cleanup operations. Proper use of these classes will guarantee
38 // that a cleanup operation that is deferred during a reader critical
39 // section will not be executed until after that critical section is
44 // As an example, suppose we have some configuration data that gets
45 // periodically updated. We need to protect ourselves on *every* read
46 // path, even if updates are only vanishingly rare, because we don't know
47 // if some writer will come along and yank the data out from under us.
49 // Here's how that might look without deferral:
51 // void doSomething(IPAddress host);
53 // folly::SharedMutex sm;
54 // ConfigData* globalConfigData;
59 // IPAddress curManagementServer = globalConfigData->managementServerIP;
60 // sm.unlock_shared();
61 // doSomethingWith(curManagementServer);
67 // std::this_thread::sleep_for(std::chrono::seconds(60));
68 // ConfigData* oldConfigData = globalConfigData;
69 // ConfigData* newConfigData = loadConfigDataFromRemoteServer();
71 // globalConfigData = newConfigData;
73 // delete oldConfigData;
77 // The readers have to lock_shared and unlock_shared every iteration, even
78 // during the overwhelming majority of the time in which there's no writer
79 // present. These functions are surprisingly expensive; there's around 30ns of
80 // overhead per critical section on my machine.
82 // Let's get rid of the locking. The readers and writer may proceed
83 // concurrently. Here's how this would look:
85 // void doSomething(IPAddress host);
87 // std::atomic<ConfigData*> globalConfigData;
91 // ConfigData* configData = globalConfigData.load();
92 // IPAddress curManagementServer = configData->managementServerIP;
93 // doSomethingWith(curManagementServer);
99 // std::this_thread::sleep_for(std::chrono::seconds(60));
100 // ConfigData* newConfigData = loadConfigDataFromRemoteServer();
101 // globalConfigData.store(newConfigData);
102 // // We can't delete the old ConfigData; we don't know when the readers
103 // // will be done with it! I guess we have to leak it...
107 // This works and is fast, but we don't ever reclaim the memory we
108 // allocated for the copy of the data. We need a way for the writer to
109 // know that no readers observed the old value of the pointer and are
110 // still using it. Tweaking this slightly, we want a way for the
111 // writer to say "I want some operation (deleting the old ConfigData)
112 // to happen, but only after I know that all readers that started
113 // before I requested the action have finished.". The classes in this
114 // namespace allow this. Here's how we'd express this idea:
116 // void doSomething(IPAddress host);
117 // std::atomic<ConfigData*> globalConfigData;
122 // IPAddress curManagementServer;
124 // // We're about to do some reads we want to protect; if we read a
125 // // pointer, we need to make sure that if the writer comes along and
126 // // updates it, the writer's cleanup operation won't happen until we're
127 // // done accessing the pointed-to data. We get a Guard on that
128 // // domain; as long as it exists, no function subsequently passed to
129 // // invokeEventually will execute.
131 // ConfigData* configData = globalConfigData.load();
132 // // We created a guard before we read globalConfigData; we know that the
133 // // pointer will remain valid until the guard is destroyed.
134 // curManagementServer = configData->managementServerIP;
135 // // Guard is released here; retired objects may be freed.
137 // doSomethingWith(curManagementServer);
144 // std::this_thread::sleep_for(std::chrono::seconds(60));
145 // ConfigData* oldConfigData = globalConfigData.load();
146 // ConfigData* newConfigData = loadConfigDataFromRemoteServer();
147 // globalConfigData.store(newConfigData);
148 // rcu_retire(oldConfigData);
149 // // alternatively, in a blocking manner:
150 // // synchronize_rcu();
151 // // delete oldConfigData;
155 // This gets us close to the speed of the second solution, without
156 // leaking memory. A rcu_reader costs about 4 ns, faster than the
157 // lock_shared() / unlock_shared() pair in the more traditional
158 // mutex-based approach from our first attempt, and the writer
159 // never blocks the readers.
163 // This implementation does implement an rcu_domain, and provides a default
164 // one for use per the standard implementation.
167 // A "universe" of deferred execution. Each rcu_domain has an
168 // executor on which deferred functions may execute. Readers obtain
169 // Tokens from an rcu_domain and release them back to it.
170 // rcu_domains should in general be completely separated; it's never
171 // correct to pass a token from one domain to another, and
172 // rcu_reader objects created on one domain do not prevent functions
173 // deferred on other domains from running. It's intended that most
174 // callers should only ever use the default, global domain.
176 // Creation of a domain takes a template tag argument, which
177 // defaults to void. To access different domains, you have to pass a
178 // different tag. The global domain is preferred for almost all
179 // purposes, unless a different executor is required.
181 // You should use a custom rcu_domain if you can't avoid sleeping
182 // during reader critical sections (because you don't want to block
183 // all other users of the domain while you sleep), or you want to change
184 // the default executor type.
186 // API correctness limitations:
188 // In short, nothing about this is exception safe. retire functions should
189 // not throw exceptions in their destructors, move constructors or call operators.
192 // Performance limitations:
194 // A blocked reader will block invocation of deferred functions until it
195 // becomes unblocked. Sleeping while holding a rcu_reader can have bad
196 // performance consequences.
198 // API questions you might have:
199 // - Nested critical sections:
200 // These are fine. The following is explicitly allowed by the standard, up to
201 // a nesting level of 100:
202 // rcu_reader reader1;
204 // rcu_reader reader2;
206 // - Restrictions on retired()ed functions:
207 // Any operation is safe from within a retired function's
208 // execution; you can retire additional functions or add a new domain call to
210 // - rcu_domain destruction:
211 // Destruction of a domain assumes previous synchronization: all remaining
212 // call and retire calls are immediately added to the executor.
216 // Deferral latency is as low as is practical: overhead involves running
217 // several global memory barriers on the machine to ensure all readers are gone.
219 // Currently use of MPMCQueue is the bottleneck for deferred calls, more
220 // specialized queues could be used if available, since only a single reader
221 // reads the queue, and can splice all of the items to the executor if possible.
223 // synchronize_rcu() call latency is on the order of 10ms. Multiple
224 // separate threads can share a synchronized period and should scale.
226 // rcu_retire() is a queue push, and on the order of 150 ns, however,
227 // the current implementation may synchronize if the retire queue is full,
228 // resulting in tail latencies of ~10ms.
230 // rcu_reader creation/destruction is ~4ns. By comparison,
231 // folly::SharedMutex::lock_shared + unlock_shared pair is ~26ns
233 // Hazard pointers vs. RCU:
235 // Hazard pointers protect pointers, generally malloc()d pieces of memory, and
236 // each hazptr only protects one such block.
238 // RCU protects critical sections, *all* memory is protected as long
239 // as the critical section is active.
241 // For example, this has implications for linked lists: If you wish to
242 // free an entire linked list, you simply rcu_retire() each node with
243 // RCU: readers will see either an entirely valid list, or no list at
246 // Conversely with hazptrs, generally lists are walked with
247 // hand-over-hand hazptrs. Only one node is protected at a time. So
248 // anywhere in the middle of the list, hazptrs may read NULL, and throw
249 // away all current work. Alternatively, reference counting can be used,
250 // (as if each node was a shared_ptr<node>), so that readers will always see
251 // *the remaining part* of the list as valid, however parts before its current
252 // hazptr may be freed.
254 // So roughly: RCU is simple, but an all-or-nothing affair. A single rcu_reader
255 // can block all reclamation. Hazptrs will reclaim exactly as much as possible,
256 // at the cost of extra work writing traversal code
259 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0461r1.pdf
261 // Reference Counting RCU Hazptr
263 // Unreclaimed objects Bounded Unbounded Bounded
265 // Contention among readers High None None
267 // Traversal forward progress lock-free wait-free lock-free
269 // Reclamation forward progress lock-free blocking wait-free
271 // Traversal speed atomic no-overhead folly's is
274 // Reference acquisition unconditional unconditional conditional
276 // Automatic reclamation yes no no
278 // Purpose of domains N/A isolate slow configuration
285 template <typename Tag = RcuTag>
288 // Opaque token used to match up lock_shared() and unlock_shared()
// NOTE(review): the enclosing class header and access specifiers are not
// visible in this chunk; the comments below describe only what these
// declarations themselves show.
// Wraps the reader epoch recorded when lock_shared() was called.
292 rcu_token(uint64_t epoch) : epoch_(epoch) {}
294 ~rcu_token() = default;
// Move-only: a token represents one outstanding reader critical section,
// so allowing copies would permit a double unlock_shared().
296 rcu_token(const rcu_token&) = delete;
297 rcu_token(rcu_token&& other) = default;
298 rcu_token& operator=(const rcu_token& other) = delete;
299 rcu_token& operator=(rcu_token&& other) = default;
// rcu_domain (for any Tag) may construct tokens and read epoch_.
302 template <typename Tag>
303 friend class rcu_domain;
307 // For most usages, rcu_domain is unnecessary, and you can use
308 // rcu_reader and rcu_retire/synchronize_rcu directly.
309 template <typename Tag>
// List types used for the queues of deferred callbacks below.
311 using list_head = typename detail::ThreadCachedLists<Tag>::ListHead;
312 using list_node = typename detail::ThreadCachedLists<Tag>::Node;
316  * If an executor is passed, it is used to run calls and delete
319 rcu_domain(Executor* executor = nullptr) noexcept;
// Domains are neither copyable nor movable: readers and queued callbacks
// refer to a fixed domain instance for its whole lifetime.
321 rcu_domain(const rcu_domain&) = delete;
322 rcu_domain(rcu_domain&&) = delete;
323 rcu_domain& operator=(const rcu_domain&) = delete;
324 rcu_domain& operator=(rcu_domain&&) = delete;
327 // Reader locks: Prevent any calls from occurring, retired memory
328 // from being freed, and synchronize() calls from completing until
329 // all preceding lock_shared() sections are finished.
331 // Note: can potentially allocate on thread first use.
332 FOLLY_ALWAYS_INLINE rcu_token lock_shared();
333 FOLLY_ALWAYS_INLINE void unlock_shared(rcu_token&&);
335 // Call a function after concurrent critical sections have finished.
336 // Does not block unless the queue is full, then may block to wait
337 // for scheduler thread, but generally does not wait for full
// synchronization.
339 template <typename T>
// Retire a pre-allocated list node (used by rcu_obj_base::retire(),
// which embeds the node in the object itself, avoiding an allocation).
341 void retire(list_node* node) noexcept;
343 // Ensure concurrent critical sections have finished.
344 // Always waits for full synchronization.
345 // read lock *must not* be held.
346 void synchronize() noexcept;
// Per-thread reader counters used to detect when readers have drained.
349 detail::ThreadCachedInts<Tag> counters_;
// Current epoch; advanced as synchronization rounds complete.
351 std::atomic<uint64_t> version_{0};
352 // Future epochs being driven by threads in synchronize
353 std::atomic<uint64_t> work_{0};
354 // Matches version_, for waking up threads in synchronize that are
// waiting for an earlier epoch to complete.
356 detail::TurnSequencer<std::atomic> turn_;
357 // Only one thread can drive half_sync.
358 std::mutex syncMutex_;
359 // Currently half_sync takes ~16ms due to heavy barriers.
360 // Ensure that if used in a single thread, max GC time
361 // is limited to 1% of total CPU time.
362 static constexpr uint64_t syncTimePeriod_{1600 * 2 /* full sync is 2x */};
363 std::atomic<uint64_t> syncTime_{0};
364 // call()s waiting to move through two epochs.
365 detail::ThreadCachedLists<Tag> q_;
366 // Executor callbacks will eventually be run on.
367 Executor* executor_{nullptr};
368 static bool singleton_; // Ensure uniqueness per-tag.
370 // Queues for callbacks waiting to go through two epochs.
371 list_head queues_[2]{};
373 // Move queues through one epoch (half of a full synchronize()).
374 // Will block waiting for readers to exit if blocking is true.
375 // blocking must *not* be true if the current thread is locked,
// i.e. inside a reader critical section, or deadlock will occur.
378 // returns a list of callbacks ready to run in cbs.
379 void half_sync(bool blocking, list_head& cbs);
// The process-wide default domain, shared by rcu_reader, rcu_retire(),
// synchronize_rcu() and rcu_barrier() below.
382 extern rcu_domain<RcuTag> rcu_default_domain_;
// Accessor for the default domain; returns a pointer to the singleton.
384 inline rcu_domain<RcuTag>* rcu_default_domain() {
385 return &rcu_default_domain_;
388 // Main reader guard class.
// RAII guard: while one is held, the default domain's reader critical
// section stays open and retired objects cannot be reclaimed.
391 FOLLY_ALWAYS_INLINE rcu_reader() noexcept
392 : epoch_(rcu_default_domain()->lock_shared()) {}
// Deferred construction: takes no lock; call lock() later.
393 rcu_reader(std::defer_lock_t) noexcept {}
394 rcu_reader(const rcu_reader&) = delete;
// Move transfers ownership of the (possibly empty) token.
395 rcu_reader(rcu_reader&& other) noexcept : epoch_(std::move(other.epoch_)) {}
396 rcu_reader& operator=(const rcu_reader&) = delete;
397 rcu_reader& operator=(rcu_reader&& other) noexcept {
// End our own critical section (if any) before taking over other's.
398 if (epoch_.has_value()) {
399 rcu_default_domain()->unlock_shared(std::move(epoch_.value()));
401 epoch_ = std::move(other.epoch_);
// Destructor ends the critical section if one is still held.
405 FOLLY_ALWAYS_INLINE ~rcu_reader() noexcept {
406 if (epoch_.has_value()) {
407 rcu_default_domain()->unlock_shared(std::move(epoch_.value()));
411 void swap(rcu_reader& other) noexcept {
412 std::swap(epoch_, other.epoch_);
// Manually begin a critical section; must not already hold one.
415 FOLLY_ALWAYS_INLINE void lock() noexcept {
416 DCHECK(!epoch_.has_value());
417 epoch_ = rcu_default_domain()->lock_shared();
// Manually end the critical section; must currently hold one.
420 FOLLY_ALWAYS_INLINE void unlock() noexcept {
421 DCHECK(epoch_.has_value());
422 rcu_default_domain()->unlock_shared(std::move(epoch_.value()));
// Empty when no reader critical section is currently held.
426 Optional<rcu_token> epoch_;
// ADL swap for rcu_reader guards.
429 inline void swap(rcu_reader& a, rcu_reader& b) noexcept {
// Block until all reader critical sections concurrent with this call
// have finished (on the default domain).
433 inline void synchronize_rcu() noexcept {
434 rcu_default_domain()->synchronize();
// NOTE(review): implemented here as a plain synchronize(); a barrier is
// presumably also meant to drain previously queued retire callbacks --
// confirm against the implementation in Rcu-inl.h.
437 inline void rcu_barrier() noexcept {
438 rcu_default_domain()->synchronize();
441 // Free-function retire. Always allocates.
// Schedules d(p) (by default, delete p) to run on the default domain
// once all reader sections concurrent with this call have finished.
442 template <typename T, typename D = std::default_delete<T>>
443 void rcu_retire(T* p, D d = {}) {
444 rcu_default_domain()->call([p, del = std::move(d)]() { del(p); });
447 // Base class for rcu objects. retire() will use preallocated storage
448 // from rcu_obj_base, vs. rcu_retire() which always allocates.
449 template <typename T, typename D = std::default_delete<T>>
450 class rcu_obj_base : detail::ThreadCachedListsBase::Node {
// Schedule deleter d (by default, delete this) to run after all
// concurrent reader critical sections have finished. Uses the list
// node embedded in this base class, so no separate node allocation.
452 void retire(D d = {}) {
453 // This implementation assumes folly::Function has enough
454 // inline storage for D, otherwise, it allocates.
455 this->cb_ = [this, d = std::move(d)]() { d(static_cast<T*>(this)); };
456 rcu_default_domain()->retire(this);
462 #include <folly/synchronization/Rcu-inl.h>