folly/SharedMutex.h

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 // @author Nathan Bronson (ngbronson@fb.com)
  18
  19 #pragma once
  20
  21 #include <stdint.h>
  22 #include <atomic>
  23 #include <thread>
  24 #include <type_traits>
  25 #include <folly/Likely.h>
  26 #include <folly/detail/CacheLocality.h>
  27 #include <folly/detail/Futex.h>
  28 #include <folly/portability/Asm.h>
  29 #include <folly/portability/SysResource.h>
  30
  31 // SharedMutex is a reader-writer lock.  It is small, very fast, scalable
  32 // on multi-core, and suitable for use when readers or writers may block.
  33 // Unlike most other reader-writer locks, its throughput with concurrent
  34 // readers scales linearly; it is able to acquire and release the lock
  35 // in shared mode without cache line ping-ponging.  It is suitable for
  36 // a wide range of lock hold times because it starts with spinning,
  37 // proceeds to using sched_yield with a preemption heuristic, and then
  38 // waits using futex and precise wakeups.
  39 //
  40 // SharedMutex provides all of the methods of folly::RWSpinLock,
  41 // boost::shared_mutex, boost::upgrade_mutex, and C++14's
  42 // std::shared_timed_mutex.  All operations that can block are available
  43 // in try, try-for, and try-until (system_clock or steady_clock) versions.
  44 //
  45 // SharedMutexReadPriority gives priority to readers,
  46 // SharedMutexWritePriority gives priority to writers.  SharedMutex is an
  47 // alias for SharedMutexWritePriority, because writer starvation is more
   48 // likely than reader starvation for the read-heavy workloads targeted
  49 // by SharedMutex.
  50 //
  51 // In my tests SharedMutex is as good or better than the other
  52 // reader-writer locks in use at Facebook for almost all use cases,
  53 // sometimes by a wide margin.  (If it is rare that there are actually
  54 // concurrent readers then RWSpinLock can be a few nanoseconds faster.)
  55 // I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
  56 // boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses
  57 // spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
  58 // (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
  59 // It is generally as good or better than the rest when evaluating size,
  60 // speed, scalability, or latency outliers.  In the corner cases where
  61 // it is not the fastest (such as single-threaded use or heavy write
  62 // contention) it is never very much worse than the best.  See the bottom
  63 // of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
  64 //
  65 // Comparison to folly::RWSpinLock:
  66 //
  67 //  * SharedMutex is faster than RWSpinLock when there are actually
  68 //    concurrent read accesses (sometimes much faster), and ~5 nanoseconds
  69 //    slower when there is not actually any contention.  SharedMutex is
  70 //    faster in every (benchmarked) scenario where the shared mode of
  71 //    the lock is actually useful.
  72 //
  73 //  * Concurrent shared access to SharedMutex scales linearly, while total
  74 //    RWSpinLock throughput drops as more threads try to access the lock
  75 //    in shared mode.  Under very heavy read contention SharedMutex can
  76 //    be two orders of magnitude faster than RWSpinLock (or any reader
  77 //    writer lock that doesn't use striping or deferral).
  78 //
  79 //  * SharedMutex can safely protect blocking calls, because after an
  80 //    initial period of spinning it waits using futex().
  81 //
  82 //  * RWSpinLock prioritizes readers, SharedMutex has both reader- and
  83 //    writer-priority variants, but defaults to write priority.
  84 //
  85 //  * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
  86 //    doesn't.  Both semantics are reasonable.  The boost documentation
  87 //    doesn't explicitly talk about this behavior (except by omitting
  88 //    any statement that those lock modes conflict), but the boost
  89 //    implementations do allow new readers while the upgradeable mode
  90 //    is held.  See https://github.com/boostorg/thread/blob/master/
  91 //      include/boost/thread/pthread/shared_mutex.hpp
  92 //
  93 //  * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder
  94 //    (UpgradeableHolder would be even more pedantically correct).
  95 //    SharedMutex's holders have fewer methods (no reset) and are less
  96 //    tolerant (promotion and downgrade crash if the donor doesn't own
  97 //    the lock, and you must use the default constructor rather than
  98 //    passing a nullptr to the pointer constructor).
  99 //
 100 // Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
 101 // and "shared" modes.  At all times num_threads_holding_exclusive +
 102 // num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
 103 // 0 || num_threads_holding_shared == 0.  RWSpinLock has the additional
 104 // constraint that num_threads_holding_shared cannot increase while
 105 // num_threads_holding_upgrade is non-zero.
 106 //
 107 // Comparison to the internal RWLock:
 108 //
 109 //  * SharedMutex doesn't allow a maximum reader count to be configured,
 110 //    so it can't be used as a semaphore in the same way as RWLock.
 111 //
 112 //  * SharedMutex is 4 bytes, RWLock is 256.
 113 //
 114 //  * SharedMutex is as fast or faster than RWLock in all of my
 115 //    microbenchmarks, and has positive rather than negative scalability.
 116 //
 117 //  * RWLock and SharedMutex are both writer priority locks.
 118 //
 119 //  * SharedMutex avoids latency outliers as well as RWLock.
 120 //
 121 //  * SharedMutex uses different names (t != 0 below):
 122 //
 123 //    RWLock::lock(0)    => SharedMutex::lock()
 124 //
 125 //    RWLock::lock(t)    => SharedMutex::try_lock_for(milliseconds(t))
 126 //
 127 //    RWLock::tryLock()  => SharedMutex::try_lock()
 128 //
 129 //    RWLock::unlock()   => SharedMutex::unlock()
 130 //
 131 //    RWLock::enter(0)   => SharedMutex::lock_shared()
 132 //
 133 //    RWLock::enter(t)   =>
 134 //        SharedMutex::try_lock_shared_for(milliseconds(t))
 135 //
 136 //    RWLock::tryEnter() => SharedMutex::try_lock_shared()
 137 //
 138 //    RWLock::leave()    => SharedMutex::unlock_shared()
 139 //
 140 //  * RWLock allows the reader count to be adjusted by a value other
 141 //    than 1 during enter() or leave(). SharedMutex doesn't currently
 142 //    implement this feature.
 143 //
 144 //  * RWLock's methods are marked const, SharedMutex's aren't.
 145 //
 146 // Reader-writer locks have the potential to allow concurrent access
 147 // to shared read-mostly data, but in practice they often provide no
 148 // improvement over a mutex.  The problem is the cache coherence protocol
 149 // of modern CPUs.  Coherence is provided by making sure that when a cache
 150 // line is written it is present in only one core's cache.  Since a memory
 151 // write is required to acquire a reader-writer lock in shared mode, the
 152 // cache line holding the lock is invalidated in all of the other caches.
 153 // This leads to cache misses when another thread wants to acquire or
 154 // release the lock concurrently.  When the RWLock is colocated with the
  155 // data it protects (common), cache misses can also continue to occur when
 156 // a thread that already holds the lock tries to read the protected data.
 157 //
 158 // Ideally, a reader-writer lock would allow multiple cores to acquire
 159 // and release the lock in shared mode without incurring any cache misses.
 160 // This requires that each core records its shared access in a cache line
 161 // that isn't read or written by other read-locking cores.  (Writers will
 162 // have to check all of the cache lines.)  Typical server hardware when
 163 // this comment was written has 16 L1 caches and cache lines of 64 bytes,
 164 // so a lock striped over all L1 caches would occupy a prohibitive 1024
 165 // bytes.  Nothing says that we need a separate set of per-core memory
 166 // locations for each lock, however.  Each SharedMutex instance is only
 167 // 4 bytes, but all locks together share a 2K area in which they make a
 168 // core-local record of lock acquisitions.
 169 //
 170 // SharedMutex's strategy of using a shared set of core-local stripes has
 171 // a potential downside, because it means that acquisition of any lock in
 172 // write mode can conflict with acquisition of any lock in shared mode.
 173 // If a lock instance doesn't actually experience concurrency then this
  174 // downside will outweigh the upside of improved scalability for readers.
 175 // To avoid this problem we dynamically detect concurrent accesses to
 176 // SharedMutex, and don't start using the deferred mode unless we actually
 177 // observe concurrency.  See kNumSharedToStartDeferring.
 178 //
  179 // It is explicitly allowed to call unlock_shared() from a different
  180 // thread than lock_shared(), so long as they are properly paired.
  181 // unlock_shared() needs to find the location at which lock_shared()
 182 // recorded the lock, which might be in the lock itself or in any of
 183 // the shared slots.  If you can conveniently pass state from lock
 184 // acquisition to release then the fastest mechanism is to std::move
  185 // the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
 186 // lock_shared(Token&) and unlock_shared(Token&)).  The guard or token
 187 // will tell unlock_shared where in deferredReaders[] to look for the
 188 // deferred lock.  The Token-less version of unlock_shared() works in all
 189 // cases, but is optimized for the common (no inter-thread handoff) case.
 190 //
 191 // In both read- and write-priority mode, a waiting lock() (exclusive mode)
 192 // only blocks readers after it has waited for an active upgrade lock to be
 193 // released; until the upgrade lock is released (or upgraded or downgraded)
 194 // readers will still be able to enter.  Preferences about lock acquisition
 195 // are not guaranteed to be enforced perfectly (even if they were, there
 196 // is theoretically the chance that a thread could be arbitrarily suspended
 197 // between calling lock() and SharedMutex code actually getting executed).
 198 //
 199 // try_*_for methods always try at least once, even if the duration
 200 // is zero or negative.  The duration type must be compatible with
 201 // std::chrono::steady_clock.  try_*_until methods also always try at
 202 // least once.  std::chrono::system_clock and std::chrono::steady_clock
 203 // are supported.
 204 //
 205 // If you have observed by profiling that your SharedMutex-s are getting
 206 // cache misses on deferredReaders[] due to another SharedMutex user, then
 207 // you can use the tag type plus the RWDEFERREDLOCK_DECLARE_STATIC_STORAGE
 208 // macro to create your own instantiation of the type.  The contention
 209 // threshold (see kNumSharedToStartDeferring) should make this unnecessary
 210 // in all but the most extreme cases.  Make sure to check that the
 211 // increased icache and dcache footprint of the tagged result is worth it.
 212
 213 // SharedMutex's use of thread local storage is as an optimization, so
 214 // for the case where thread local storage is not supported, define it
 215 // away.
 216 #ifndef FOLLY_SHAREDMUTEX_TLS
 217 #if !FOLLY_MOBILE
 218 #define FOLLY_SHAREDMUTEX_TLS FOLLY_TLS
 219 #else
 220 #define FOLLY_SHAREDMUTEX_TLS
 221 #endif
 222 #endif
 223
 224 namespace folly {
 225
 226 struct SharedMutexToken {
 227   enum class Type : uint16_t {
 228     INVALID = 0,
 229     INLINE_SHARED,
 230     DEFERRED_SHARED,
 231   };
 232
 233   Type type_;
 234   uint16_t slot_;
 235 };
 236
 237 template <bool ReaderPriority,
 238           typename Tag_ = void,
 239           template <typename> class Atom = std::atomic,
 240           bool BlockImmediately = false>
 241 class SharedMutexImpl {
 242  public:
 243   static constexpr bool kReaderPriority = ReaderPriority;
 244   typedef Tag_ Tag;
 245
 246   typedef SharedMutexToken Token;
 247
 248   class ReadHolder;
 249   class UpgradeHolder;
 250   class WriteHolder;
 251
 252   constexpr SharedMutexImpl() : state_(0) {}
 253
 254   SharedMutexImpl(const SharedMutexImpl&) = delete;
 255   SharedMutexImpl(SharedMutexImpl&&) = delete;
 256   SharedMutexImpl& operator = (const SharedMutexImpl&) = delete;
 257   SharedMutexImpl& operator = (SharedMutexImpl&&) = delete;
 258
 259   // It is an error to destroy an SharedMutex that still has
 260   // any outstanding locks.  This is checked if NDEBUG isn't defined.
 261   // SharedMutex's exclusive mode can be safely used to guard the lock's
 262   // own destruction.  If, for example, you acquire the lock in exclusive
 263   // mode and then observe that the object containing the lock is no longer
 264   // needed, you can unlock() and then immediately destroy the lock.
 265   // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
 266   // description about why this property needs to be explicitly mentioned.
 267   ~SharedMutexImpl() {
 268     auto state = state_.load(std::memory_order_relaxed);
 269     if (UNLIKELY((state & kHasS) != 0)) {
 270       cleanupTokenlessSharedDeferred(state);
 271     }
 272
 273 #ifndef NDEBUG
 274     // if a futexWait fails to go to sleep because the value has been
 275     // changed, we don't necessarily clean up the wait bits, so it is
 276     // possible they will be set here in a correct system
 277     assert((state & ~(kWaitingAny | kMayDefer)) == 0);
 278     if ((state & kMayDefer) != 0) {
 279       for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
 280         auto slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
 281         assert(!slotValueIsThis(slotValue));
 282       }
 283     }
 284 #endif
 285   }
 286
 287   void lock() {
 288     WaitForever ctx;
 289     (void)lockExclusiveImpl(kHasSolo, ctx);
 290   }
 291
 292   bool try_lock() {
 293     WaitNever ctx;
 294     return lockExclusiveImpl(kHasSolo, ctx);
 295   }
 296
 297   template <class Rep, class Period>
 298   bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) {
 299     WaitForDuration<Rep, Period> ctx(duration);
 300     return lockExclusiveImpl(kHasSolo, ctx);
 301   }
 302
 303   template <class Clock, class Duration>
 304   bool try_lock_until(
 305       const std::chrono::time_point<Clock, Duration>& absDeadline) {
 306     WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
 307     return lockExclusiveImpl(kHasSolo, ctx);
 308   }
 309
 310   void unlock() {
 311     // It is possible that we have a left-over kWaitingNotS if the last
 312     // unlock_shared() that let our matching lock() complete finished
 313     // releasing before lock()'s futexWait went to sleep.  Clean it up now
 314     auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE));
 315     assert((state & ~kWaitingAny) == 0);
 316     wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
 317   }
 318
 319   // Managing the token yourself makes unlock_shared a bit faster
 320
 321   void lock_shared() {
 322     WaitForever ctx;
 323     (void)lockSharedImpl(nullptr, ctx);
 324   }
 325
 326   void lock_shared(Token& token) {
 327     WaitForever ctx;
 328     (void)lockSharedImpl(&token, ctx);
 329   }
 330
 331   bool try_lock_shared() {
 332     WaitNever ctx;
 333     return lockSharedImpl(nullptr, ctx);
 334   }
 335
 336   bool try_lock_shared(Token& token) {
 337     WaitNever ctx;
 338     return lockSharedImpl(&token, ctx);
 339   }
 340
 341   template <class Rep, class Period>
 342   bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration) {
 343     WaitForDuration<Rep, Period> ctx(duration);
 344     return lockSharedImpl(nullptr, ctx);
 345   }
 346
 347   template <class Rep, class Period>
 348   bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration,
 349                            Token& token) {
 350     WaitForDuration<Rep, Period> ctx(duration);
 351     return lockSharedImpl(&token, ctx);
 352   }
 353
 354   template <class Clock, class Duration>
 355   bool try_lock_shared_until(
 356       const std::chrono::time_point<Clock, Duration>& absDeadline) {
 357     WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
 358     return lockSharedImpl(nullptr, ctx);
 359   }
 360
 361   template <class Clock, class Duration>
 362   bool try_lock_shared_until(
 363       const std::chrono::time_point<Clock, Duration>& absDeadline,
 364       Token& token) {
 365     WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
 366     return lockSharedImpl(&token, ctx);
 367   }
 368
 369   void unlock_shared() {
 370     auto state = state_.load(std::memory_order_acquire);
 371
 372     // kPrevDefer can only be set if HasE or BegunE is set
 373     assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer);
 374
 375     // lock() strips kMayDefer immediately, but then copies it to
 376     // kPrevDefer so we can tell if the pre-lock() lock_shared() might
 377     // have deferred
 378     if ((state & (kMayDefer | kPrevDefer)) == 0 ||
 379         !tryUnlockTokenlessSharedDeferred()) {
 380       // Matching lock_shared() couldn't have deferred, or the deferred
 381       // lock has already been inlined by applyDeferredReaders()
 382       unlockSharedInline();
 383     }
 384   }
 385
 386   void unlock_shared(Token& token) {
 387     assert(token.type_ == Token::Type::INLINE_SHARED ||
 388            token.type_ == Token::Type::DEFERRED_SHARED);
 389
 390     if (token.type_ != Token::Type::DEFERRED_SHARED ||
 391         !tryUnlockSharedDeferred(token.slot_)) {
 392       unlockSharedInline();
 393     }
 394 #ifndef NDEBUG
 395     token.type_ = Token::Type::INVALID;
 396 #endif
 397   }
 398
 399   void unlock_and_lock_shared() {
 400     // We can't use state_ -=, because we need to clear 2 bits (1 of which
 401     // has an uncertain initial state) and set 1 other.  We might as well
 402     // clear the relevant wake bits at the same time.  Note that since S
 403     // doesn't block the beginning of a transition to E (writer priority
 404     // can cut off new S, reader priority grabs BegunE and blocks deferred
 405     // S) we need to wake E as well.
 406     auto state = state_.load(std::memory_order_acquire);
 407     do {
 408       assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
 409     } while (!state_.compare_exchange_strong(
 410         state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS));
 411     if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) {
 412       futexWakeAll(kWaitingE | kWaitingU | kWaitingS);
 413     }
 414   }
 415
 416   void unlock_and_lock_shared(Token& token) {
 417     unlock_and_lock_shared();
 418     token.type_ = Token::Type::INLINE_SHARED;
 419   }
 420
 421   void lock_upgrade() {
 422     WaitForever ctx;
 423     (void)lockUpgradeImpl(ctx);
 424   }
 425
 426   bool try_lock_upgrade() {
 427     WaitNever ctx;
 428     return lockUpgradeImpl(ctx);
 429   }
 430
 431   template <class Rep, class Period>
 432   bool try_lock_upgrade_for(
 433       const std::chrono::duration<Rep, Period>& duration) {
 434     WaitForDuration<Rep, Period> ctx(duration);
 435     return lockUpgradeImpl(ctx);
 436   }
 437
 438   template <class Clock, class Duration>
 439   bool try_lock_upgrade_until(
 440       const std::chrono::time_point<Clock, Duration>& absDeadline) {
 441     WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
 442     return lockUpgradeImpl(ctx);
 443   }
 444
 445   void unlock_upgrade() {
 446     auto state = (state_ -= kHasU);
 447     assert((state & (kWaitingNotS | kHasSolo)) == 0);
 448     wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
 449   }
 450
 451   void unlock_upgrade_and_lock() {
 452     // no waiting necessary, so waitMask is empty
 453     WaitForever ctx;
 454     (void)lockExclusiveImpl(0, ctx);
 455   }
 456
 457   void unlock_upgrade_and_lock_shared() {
 458     auto state = (state_ -= kHasU - kIncrHasS);
 459     assert((state & (kWaitingNotS | kHasSolo)) == 0);
 460     wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
 461   }
 462
 463   void unlock_upgrade_and_lock_shared(Token& token) {
 464     unlock_upgrade_and_lock_shared();
 465     token.type_ = Token::Type::INLINE_SHARED;
 466   }
 467
 468   void unlock_and_lock_upgrade() {
 469     // We can't use state_ -=, because we need to clear 2 bits (1 of
 470     // which has an uncertain initial state) and set 1 other.  We might
 471     // as well clear the relevant wake bits at the same time.
 472     auto state = state_.load(std::memory_order_acquire);
 473     while (true) {
 474       assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
 475       auto after =
 476           (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU;
 477       if (state_.compare_exchange_strong(state, after)) {
 478         if ((state & kWaitingS) != 0) {
 479           futexWakeAll(kWaitingS);
 480         }
 481         return;
 482       }
 483     }
 484   }
 485
 486  private:
 487   typedef typename folly::detail::Futex<Atom> Futex;
 488
 489   // Internally we use four kinds of wait contexts.  These are structs
 490   // that provide a doWait method that returns true if a futex wake
 491   // was issued that intersects with the waitMask, false if there was a
 492   // timeout and no more waiting should be performed.  Spinning occurs
 493   // before the wait context is invoked.
 494
 495   struct WaitForever {
 496     bool canBlock() { return true; }
 497     bool canTimeOut() { return false; }
 498     bool shouldTimeOut() { return false; }
 499
 500     bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
 501       futex.futexWait(expected, waitMask);
 502       return true;
 503     }
 504   };
 505
 506   struct WaitNever {
 507     bool canBlock() { return false; }
 508     bool canTimeOut() { return true; }
 509     bool shouldTimeOut() { return true; }
 510
 511     bool doWait(Futex& /* futex */,
 512                 uint32_t /* expected */,
 513                 uint32_t /* waitMask */) {
 514       return false;
 515     }
 516   };
 517
 518   template <class Rep, class Period>
 519   struct WaitForDuration {
 520     std::chrono::duration<Rep, Period> duration_;
 521     bool deadlineComputed_;
 522     std::chrono::steady_clock::time_point deadline_;
 523
 524     explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration)
 525         : duration_(duration), deadlineComputed_(false) {}
 526
 527     std::chrono::steady_clock::time_point deadline() {
 528       if (!deadlineComputed_) {
 529         deadline_ = std::chrono::steady_clock::now() + duration_;
 530         deadlineComputed_ = true;
 531       }
 532       return deadline_;
 533     }
 534
 535     bool canBlock() { return duration_.count() > 0; }
 536     bool canTimeOut() { return true; }
 537
 538     bool shouldTimeOut() {
 539       return std::chrono::steady_clock::now() > deadline();
 540     }
 541
 542     bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
 543       auto result = futex.futexWaitUntil(expected, deadline(), waitMask);
 544       return result != folly::detail::FutexResult::TIMEDOUT;
 545     }
 546   };
 547
 548   template <class Clock, class Duration>
 549   struct WaitUntilDeadline {
 550     std::chrono::time_point<Clock, Duration> absDeadline_;
 551
 552     bool canBlock() { return true; }
 553     bool canTimeOut() { return true; }
 554     bool shouldTimeOut() { return Clock::now() > absDeadline_; }
 555
 556     bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
 557       auto result = futex.futexWaitUntil(expected, absDeadline_, waitMask);
 558       return result != folly::detail::FutexResult::TIMEDOUT;
 559     }
 560   };
 561
 562   // 32 bits of state
 563   Futex state_;
 564
 565   // S count needs to be on the end, because we explicitly allow it to
 566   // underflow.  This can occur while we are in the middle of applying
 567   // deferred locks (we remove them from deferredReaders[] before
 568   // inlining them), or during token-less unlock_shared() if a racing
 569   // lock_shared();unlock_shared() moves the deferredReaders slot while
 570   // the first unlock_shared() is scanning.  The former case is cleaned
 571   // up before we finish applying the locks.  The latter case can persist
 572   // until destruction, when it is cleaned up.
 573   static constexpr uint32_t kIncrHasS = 1 << 10;
 574   static constexpr uint32_t kHasS = ~(kIncrHasS - 1);
 575
 576   // If false, then there are definitely no deferred read locks for this
 577   // instance.  Cleared after initialization and when exclusively locked.
 578   static constexpr uint32_t kMayDefer = 1 << 9;
 579
 580   // lock() cleared kMayDefer as soon as it starts draining readers (so
 581   // that it doesn't have to do a second CAS once drain completes), but
 582   // unlock_shared() still needs to know whether to scan deferredReaders[]
 583   // or not.  We copy kMayDefer to kPrevDefer when setting kHasE or
 584   // kBegunE, and clear it when clearing those bits.
 585   static constexpr uint32_t kPrevDefer = 1 << 8;
 586
 587   // Exclusive-locked blocks all read locks and write locks.  This bit
 588   // may be set before all readers have finished, but in that case the
 589   // thread that sets it won't return to the caller until all read locks
 590   // have been released.
 591   static constexpr uint32_t kHasE = 1 << 7;
 592
 593   // Exclusive-draining means that lock() is waiting for existing readers
 594   // to leave, but that new readers may still acquire shared access.
 595   // This is only used in reader priority mode.  New readers during
 596   // drain must be inline.  The difference between this and kHasU is that
 597   // kBegunE prevents kMayDefer from being set.
 598   static constexpr uint32_t kBegunE = 1 << 6;
 599
 600   // At most one thread may have either exclusive or upgrade lock
 601   // ownership.  Unlike exclusive mode, ownership of the lock in upgrade
 602   // mode doesn't preclude other threads holding the lock in shared mode.
 603   // boost's concept for this doesn't explicitly say whether new shared
 604   // locks can be acquired one lock_upgrade has succeeded, but doesn't
 605   // list that as disallowed.  RWSpinLock disallows new read locks after
 606   // lock_upgrade has been acquired, but the boost implementation doesn't.
 607   // We choose the latter.
 608   static constexpr uint32_t kHasU = 1 << 5;
 609
 610   // There are three states that we consider to be "solo", in that they
 611   // cannot coexist with other solo states.  These are kHasE, kBegunE,
 612   // and kHasU.  Note that S doesn't conflict with any of these, because
 613   // setting the kHasE is only one of the two steps needed to actually
 614   // acquire the lock in exclusive mode (the other is draining the existing
 615   // S holders).
 616   static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU;
 617
 618   // Once a thread sets kHasE it needs to wait for the current readers
 619   // to exit the lock.  We give this a separate wait identity from the
 620   // waiting to set kHasE so that we can perform partial wakeups (wake
 621   // one instead of wake all).
 622   static constexpr uint32_t kWaitingNotS = 1 << 4;
 623
 624   // When waking writers we can either wake them all, in which case we
 625   // can clear kWaitingE, or we can call futexWake(1).  futexWake tells
 626   // us if anybody woke up, but even if we detect that nobody woke up we
 627   // can't clear the bit after the fact without issuing another wakeup.
 628   // To avoid thundering herds when there are lots of pending lock()
 629   // without needing to call futexWake twice when there is only one
 630   // waiter, kWaitingE actually encodes if we have observed multiple
 631   // concurrent waiters.  Tricky: ABA issues on futexWait mean that when
 632   // we see kWaitingESingle we can't assume that there is only one.
 633   static constexpr uint32_t kWaitingESingle = 1 << 2;
 634   static constexpr uint32_t kWaitingEMultiple = 1 << 3;
 635   static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple;
 636
 637   // kWaitingU is essentially a 1 bit saturating counter.  It always
 638   // requires a wakeAll.
 639   static constexpr uint32_t kWaitingU = 1 << 1;
 640
 641   // All blocked lock_shared() should be awoken, so it is correct (not
 642   // suboptimal) to wakeAll if there are any shared readers.
 643   static constexpr uint32_t kWaitingS = 1 << 0;
 644
 645   // kWaitingAny is a mask of all of the bits that record the state of
 646   // threads, rather than the state of the lock.  It is convenient to be
 647   // able to mask them off during asserts.
 648   static constexpr uint32_t kWaitingAny =
 649       kWaitingNotS | kWaitingE | kWaitingU | kWaitingS;
 650
 651   // The reader count at which a reader will attempt to use the lock
 652   // in deferred mode.  If this value is 2, then the second concurrent
 653   // reader will set kMayDefer and use deferredReaders[].  kMayDefer is
 654   // cleared during exclusive access, so this threshold must be reached
 655   // each time a lock is held in exclusive mode.
 656   static constexpr uint32_t kNumSharedToStartDeferring = 2;
 657
 658   // The typical number of spins that a thread will wait for a state
 659   // transition.  There is no bound on the number of threads that can wait
 660   // for a writer, so we are pretty conservative here to limit the chance
 661   // that we are starving the writer of CPU.  Each spin is 6 or 7 nanos,
 662   // almost all of which is in the pause instruction.
 663   static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2;
 664
 665   // The maximum number of soft yields before falling back to futex.
 666   // If the preemption heuristic is activated we will fall back before
 667   // this.  A soft yield takes ~900 nanos (two sched_yield plus a call
 668   // to getrusage, with checks of the goal at each step).  Soft yields
 669   // aren't compatible with deterministic execution under test (unlike
 670   // futexWaitUntil, which has a capricious but deterministic back end).
 671   static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0;
 672
 673   // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some
 674   // level of the memory hierarchy is symmetrically divided into k pieces
 675   // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes
 676   // that are the same after integer division by k share that resource.
 677   // Our strategy for deferred readers is to probe up to numSlots/4 slots,
 678   // using the full granularity of AccessSpreader for the start slot
 679   // and then search outward.  We can use AccessSpreader::current(n)
 680   // without managing our own spreader if kMaxDeferredReaders <=
 681   // AccessSpreader::kMaxCpus, which is currently 128.
 682   //
 683   // Our 2-socket E5-2660 machines have 8 L1 caches on each chip,
 684   // with 64 byte cache lines.  That means we need 64*16 bytes of
 685   // deferredReaders[] to give each L1 its own playground.  On x86_64
 686   // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders
 687   // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128.  If
 688   // kDeferredSearchDistance * kDeferredSeparationFactor <=
 689   // 64 / 8 then we will search only within a single cache line, which
 690   // guarantees we won't have inter-L1 contention.  We give ourselves
 691   // a factor of 2 on the core count, which should hold us for a couple
 692   // processor generations.  deferredReaders[] is 2048 bytes currently.
 693  public:
 694   static constexpr uint32_t kMaxDeferredReaders = 64;
 695   static constexpr uint32_t kDeferredSearchDistance = 2;
 696   static constexpr uint32_t kDeferredSeparationFactor = 4;
 697
 698  private:
 699
 700   static_assert(!(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
 701                 "kMaxDeferredReaders must be a power of 2");
 702   static_assert(!(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
 703                 "kDeferredSearchDistance must be a power of 2");
 704
 705   // The number of deferred locks that can be simultaneously acquired
 706   // by a thread via the token-less methods without performing any heap
 707   // allocations.  Each of these costs 3 pointers (24 bytes, probably)
 708   // per thread.  There's not much point in making this larger than
 709   // kDeferredSearchDistance.
 710   static constexpr uint32_t kTokenStackTLSCapacity = 2;
 711
 712   // We need to make sure that if there is a lock_shared()
 713   // and lock_shared(token) followed by unlock_shared() and
 714   // unlock_shared(token), the token-less unlock doesn't null
 715   // out deferredReaders[token.slot_].  If we allowed that, then
 716   // unlock_shared(token) wouldn't be able to assume that its lock
 717   // had been inlined by applyDeferredReaders when it finds that
 718   // deferredReaders[token.slot_] no longer points to this.  We accomplish
 719   // this by stealing bit 0 from the pointer to record that the slot's
 720   // element has no token, hence our use of uintptr_t in deferredReaders[].
 721   static constexpr uintptr_t kTokenless = 0x1;
 722
 723   // This is the starting location for Token-less unlock_shared().
 724   static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastTokenlessSlot;
 725
 726   // Only indexes divisible by kDeferredSeparationFactor are used.
 727   // If any of those elements points to a SharedMutexImpl, then it
 728   // should be considered that there is a shared lock on that instance.
 729   // See kTokenless.
 730  public:
 731   typedef Atom<uintptr_t> DeferredReaderSlot;
 732
 733  private:
 734   FOLLY_ALIGN_TO_AVOID_FALSE_SHARING static DeferredReaderSlot deferredReaders
 735       [kMaxDeferredReaders *
 736        kDeferredSeparationFactor];
 737
 738   // Performs an exclusive lock, waiting for state_ & waitMask to be
 739   // zero first
 740   template <class WaitContext>
 741   bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) {
 742     uint32_t state = state_.load(std::memory_order_acquire);
 743     if (LIKELY(
 744             (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 &&
 745             state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) {
 746       return true;
 747     } else {
 748       return lockExclusiveImpl(state, preconditionGoalMask, ctx);
 749     }
 750   }
 751
  // Slow path of the exclusive lock.
  //
  // state is an in/out snapshot of state_; on entry it holds the value the
  // caller last observed.  preconditionGoalMask names the bits that must
  // be clear before this thread may announce itself as the writer.
  //
  // Returns true once kHasE is set on behalf of this thread and no inline
  // readers remain (kHasS is zero).  Returns false only when one of the
  // waits fails and ctx.canTimeOut() is true; in that case any writer
  // bits set by this call have been rolled back.
  template <class WaitContext>
  bool lockExclusiveImpl(uint32_t& state,
                         uint32_t preconditionGoalMask,
                         WaitContext& ctx) {
    while (true) {
      // Phase 1: wait for the precondition bits to clear, registering as
      // a kWaitingE waiter if we have to block.
      if (UNLIKELY((state & preconditionGoalMask) != 0) &&
          !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) &&
          ctx.canTimeOut()) {
        return false;
      }

      // Phase 2: compute the post-CAS state.  kMayDefer is always cleared
      // by the transition; if it was set, kPrevDefer is set in its place.
      uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer;
      if (!ReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
        // Block readers immediately, either because we are in write
        // priority mode or because we can acquire the lock in one
        // step.  Note that if state has kHasU, then we are doing an
        // unlock_upgrade_and_lock() and we should clear it (reader
        // priority branch also does this).
        after |= (state | kHasE) & ~(kHasU | kMayDefer);
      } else {
        // Reader priority with readers present: only announce the writer
        // with kBegunE; it is promoted to kHasE below once kHasS is zero.
        after |= (state | kBegunE) & ~(kHasU | kMayDefer);
      }
      if (state_.compare_exchange_strong(state, after)) {
        // Keep the pre-CAS value so we can tell whether deferred readers
        // may exist, then make state reflect what is now stored.
        auto before = state;
        state = after;

        // If we set kHasE (writer priority) then no new readers can
        // arrive.  If we set kBegunE then they can still enter, but
        // they must be inline.  Either way we need to either spin on
        // deferredReaders[] slots, or inline them so that we can wait on
        // kHasS to zero itself.  deferredReaders[] is pointers, which on
        // x86_64 are bigger than futex() can handle, so we inline the
        // deferred locks instead of trying to futexWait on each slot.
        // Readers are responsible for rechecking state_ after recording
        // a deferred read to avoid atomicity problems between the state_
        // CAS and applyDeferredReader's reads of deferredReaders[].
        if (UNLIKELY((before & kMayDefer) != 0)) {
          applyDeferredReaders(state, ctx);
        }
        // Phase 3: wait for the inline reader count to drain.
        while (true) {
          assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0);
          if (UNLIKELY((state & kHasS) != 0) &&
              !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) &&
              ctx.canTimeOut()) {
            // Ugh.  We blocked new readers and other writers for a while,
            // but were unable to complete.  Move on.  On the plus side
            // we can clear kWaitingNotS because nobody else can piggyback
            // on it.
            state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS));
            // Anyone who queued up behind our aborted attempt gets a
            // chance to proceed.
            wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
            return false;
          }

          // Reader-priority mode only: promote kBegunE to kHasE.  A lost
          // CAS refreshes state and restarts the drain check.
          if (ReaderPriority && (state & kHasE) == 0) {
            assert((state & kBegunE) != 0);
            if (!state_.compare_exchange_strong(state,
                                                (state & ~kBegunE) | kHasE)) {
              continue;
            }
          }

          return true;
        }
      }
      // The announcing CAS failed; state now holds the fresh value, so
      // start over from the precondition check.
    }
  }
 818
 819   template <class WaitContext>
 820   bool waitForZeroBits(uint32_t& state,
 821                        uint32_t goal,
 822                        uint32_t waitMask,
 823                        WaitContext& ctx) {
 824     uint32_t spinCount = 0;
 825     while (true) {
 826       state = state_.load(std::memory_order_acquire);
 827       if ((state & goal) == 0) {
 828         return true;
 829       }
 830       asm_volatile_pause();
 831       ++spinCount;
 832       if (UNLIKELY(spinCount >= kMaxSpinCount)) {
 833         return ctx.canBlock() &&
 834                yieldWaitForZeroBits(state, goal, waitMask, ctx);
 835       }
 836     }
 837   }
 838
 839   template <class WaitContext>
 840   bool yieldWaitForZeroBits(uint32_t& state,
 841                             uint32_t goal,
 842                             uint32_t waitMask,
 843                             WaitContext& ctx) {
 844 #ifdef RUSAGE_THREAD
 845     struct rusage usage;
 846     long before = -1;
 847 #endif
 848     for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
 849          ++yieldCount) {
 850       for (int softState = 0; softState < 3; ++softState) {
 851         if (softState < 2) {
 852           std::this_thread::yield();
 853         } else {
 854 #ifdef RUSAGE_THREAD
 855           getrusage(RUSAGE_THREAD, &usage);
 856 #endif
 857         }
 858         if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) {
 859           return true;
 860         }
 861         if (ctx.shouldTimeOut()) {
 862           return false;
 863         }
 864       }
 865 #ifdef RUSAGE_THREAD
 866       if (before >= 0 && usage.ru_nivcsw >= before + 2) {
 867         // One involuntary csw might just be occasional background work,
 868         // but if we get two in a row then we guess that there is someone
 869         // else who can profitably use this CPU.  Fall back to futex
 870         break;
 871       }
 872       before = usage.ru_nivcsw;
 873 #endif
 874     }
 875     return futexWaitForZeroBits(state, goal, waitMask, ctx);
 876   }
 877
 878   template <class WaitContext>
 879   bool futexWaitForZeroBits(uint32_t& state,
 880                             uint32_t goal,
 881                             uint32_t waitMask,
 882                             WaitContext& ctx) {
 883     assert(waitMask == kWaitingNotS || waitMask == kWaitingE ||
 884            waitMask == kWaitingU || waitMask == kWaitingS);
 885
 886     while (true) {
 887       state = state_.load(std::memory_order_acquire);
 888       if ((state & goal) == 0) {
 889         return true;
 890       }
 891
 892       auto after = state;
 893       if (waitMask == kWaitingE) {
 894         if ((state & kWaitingESingle) != 0) {
 895           after |= kWaitingEMultiple;
 896         } else {
 897           after |= kWaitingESingle;
 898         }
 899       } else {
 900         after |= waitMask;
 901       }
 902
 903       // CAS is better than atomic |= here, because it lets us avoid
 904       // setting the wait flag when the goal is concurrently achieved
 905       if (after != state && !state_.compare_exchange_strong(state, after)) {
 906         continue;
 907       }
 908
 909       if (!ctx.doWait(state_, after, waitMask)) {
 910         // timed out
 911         return false;
 912       }
 913     }
 914   }
 915
 916   // Wakes up waiters registered in state_ as appropriate, clearing the
 917   // awaiting bits for anybody that was awoken.  Tries to perform direct
 918   // single wakeup of an exclusive waiter if appropriate
 919   void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) {
 920     if (UNLIKELY((state & wakeMask) != 0)) {
 921       wakeRegisteredWaitersImpl(state, wakeMask);
 922     }
 923   }
 924
 925   void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) {
 926     // If there are multiple lock() pending only one of them will actually
 927     // get to wake up, so issuing futexWakeAll will make a thundering herd.
 928     // There's nothing stopping us from issuing futexWake(1) instead,
 929     // so long as the wait bits are still an accurate reflection of
 930     // the waiters.  If we notice (via futexWake's return value) that
 931     // nobody woke up then we can try again with the normal wake-all path.
 932     // Note that we can't just clear the bits at that point; we need to
 933     // clear the bits and then issue another wakeup.
 934     //
 935     // It is possible that we wake an E waiter but an outside S grabs the
 936     // lock instead, at which point we should wake pending U and S waiters.
 937     // Rather than tracking state to make the failing E regenerate the
 938     // wakeup, we just disable the optimization in the case that there
 939     // are waiting U or S that we are eligible to wake.
 940     if ((wakeMask & kWaitingE) == kWaitingE &&
 941         (state & wakeMask) == kWaitingE &&
 942         state_.futexWake(1, kWaitingE) > 0) {
 943       // somebody woke up, so leave state_ as is and clear it later
 944       return;
 945     }
 946
 947     if ((state & wakeMask) != 0) {
 948       auto prev = state_.fetch_and(~wakeMask);
 949       if ((prev & wakeMask) != 0) {
 950         futexWakeAll(wakeMask);
 951       }
 952       state = prev & ~wakeMask;
 953     }
 954   }
 955
 956   void futexWakeAll(uint32_t wakeMask) {
 957     state_.futexWake(std::numeric_limits<int>::max(), wakeMask);
 958   }
 959
 960   DeferredReaderSlot* deferredReader(uint32_t slot) {
 961     return &deferredReaders[slot * kDeferredSeparationFactor];
 962   }
 963
 964   uintptr_t tokenfulSlotValue() { return reinterpret_cast<uintptr_t>(this); }
 965
 966   uintptr_t tokenlessSlotValue() { return tokenfulSlotValue() | kTokenless; }
 967
 968   bool slotValueIsThis(uintptr_t slotValue) {
 969     return (slotValue & ~kTokenless) == tokenfulSlotValue();
 970   }
 971
 972   // Clears any deferredReaders[] that point to this, adjusting the inline
 973   // shared lock count to compensate.  Does some spinning and yielding
 974   // to avoid the work.  Always finishes the application, even if ctx
 975   // times out.
 976   template <class WaitContext>
 977   void applyDeferredReaders(uint32_t& state, WaitContext& ctx) {
 978     uint32_t slot = 0;
 979
 980     uint32_t spinCount = 0;
 981     while (true) {
 982       while (!slotValueIsThis(
 983                  deferredReader(slot)->load(std::memory_order_acquire))) {
 984         if (++slot == kMaxDeferredReaders) {
 985           return;
 986         }
 987       }
 988       asm_pause();
 989       if (UNLIKELY(++spinCount >= kMaxSpinCount)) {
 990         applyDeferredReaders(state, ctx, slot);
 991         return;
 992       }
 993     }
 994   }
 995
 996   template <class WaitContext>
 997   void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) {
 998
 999 #ifdef RUSAGE_THREAD
1000     struct rusage usage;
1001     long before = -1;
1002 #endif
1003     for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
1004          ++yieldCount) {
1005       for (int softState = 0; softState < 3; ++softState) {
1006         if (softState < 2) {
1007           std::this_thread::yield();
1008         } else {
1009 #ifdef RUSAGE_THREAD
1010           getrusage(RUSAGE_THREAD, &usage);
1011 #endif
1012         }
1013         while (!slotValueIsThis(
1014                    deferredReader(slot)->load(std::memory_order_acquire))) {
1015           if (++slot == kMaxDeferredReaders) {
1016             return;
1017           }
1018         }
1019         if (ctx.shouldTimeOut()) {
1020           // finish applying immediately on timeout
1021           break;
1022         }
1023       }
1024 #ifdef RUSAGE_THREAD
1025       if (before >= 0 && usage.ru_nivcsw >= before + 2) {
1026         // heuristic says run queue is not empty
1027         break;
1028       }
1029       before = usage.ru_nivcsw;
1030 #endif
1031     }
1032
1033     uint32_t movedSlotCount = 0;
1034     for (; slot < kMaxDeferredReaders; ++slot) {
1035       auto slotPtr = deferredReader(slot);
1036       auto slotValue = slotPtr->load(std::memory_order_acquire);
1037       if (slotValueIsThis(slotValue) &&
1038           slotPtr->compare_exchange_strong(slotValue, 0)) {
1039         ++movedSlotCount;
1040       }
1041     }
1042
1043     if (movedSlotCount > 0) {
1044       state = (state_ += movedSlotCount * kIncrHasS);
1045     }
1046     assert((state & (kHasE | kBegunE)) != 0);
1047
1048     // if state + kIncrHasS overflows (off the end of state) then either
1049     // we have 2^(32-9) readers (almost certainly an application bug)
1050     // or we had an underflow (also a bug)
1051     assert(state < state + kIncrHasS);
1052   }
1053
  // It is straightforward to make a token-less lock_shared() and
  // unlock_shared() either by making the token-less version always use
  // INLINE_SHARED mode or by removing the token version.  Supporting
  // deferred operation for both types is trickier than it appears, because
  // the purpose of the token is so that unlock_shared doesn't have to
  // look in other slots for its deferred lock.  Token-less unlock_shared
  // might place a deferred lock in one place and then release a different
  // slot that was originally used by the token-ful version.  If this was
  // important we could solve the problem by differentiating the deferred
  // locks so that cross-variety release wouldn't occur.  The best way
  // is probably to steal a bit from the pointer, making deferredReaders[]
  // an array of Atom<uintptr_t>.
1066
1067   template <class WaitContext>
1068   bool lockSharedImpl(Token* token, WaitContext& ctx) {
1069     uint32_t state = state_.load(std::memory_order_relaxed);
1070     if ((state & (kHasS | kMayDefer | kHasE)) == 0 &&
1071         state_.compare_exchange_strong(state, state + kIncrHasS)) {
1072       if (token != nullptr) {
1073         token->type_ = Token::Type::INLINE_SHARED;
1074       }
1075       return true;
1076     }
1077     return lockSharedImpl(state, token, ctx);
1078   }
1079
1080   template <class WaitContext>
1081   bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx);
1082
1083   // Updates the state in/out argument as if the locks were made inline,
1084   // but does not update state_
1085   void cleanupTokenlessSharedDeferred(uint32_t& state) {
1086     for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
1087       auto slotPtr = deferredReader(i);
1088       auto slotValue = slotPtr->load(std::memory_order_relaxed);
1089       if (slotValue == tokenlessSlotValue()) {
1090         slotPtr->store(0, std::memory_order_relaxed);
1091         state += kIncrHasS;
1092         if ((state & kHasS) == 0) {
1093           break;
1094         }
1095       }
1096     }
1097   }
1098
1099   bool tryUnlockTokenlessSharedDeferred();
1100
1101   bool tryUnlockSharedDeferred(uint32_t slot) {
1102     assert(slot < kMaxDeferredReaders);
1103     auto slotValue = tokenfulSlotValue();
1104     return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
1105   }
1106
1107   uint32_t unlockSharedInline() {
1108     uint32_t state = (state_ -= kIncrHasS);
1109     assert((state & (kHasE | kBegunE | kMayDefer)) != 0 ||
1110            state < state + kIncrHasS);
1111     if ((state & kHasS) == 0) {
1112       // Only the second half of lock() can be blocked by a non-zero
1113       // reader count, so that's the only thing we need to wake
1114       wakeRegisteredWaiters(state, kWaitingNotS);
1115     }
1116     return state;
1117   }
1118
1119   template <class WaitContext>
1120   bool lockUpgradeImpl(WaitContext& ctx) {
1121     uint32_t state;
1122     do {
1123       if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) {
1124         return false;
1125       }
1126     } while (!state_.compare_exchange_strong(state, state | kHasU));
1127     return true;
1128   }
1129
1130  public:
1131   class ReadHolder {
1132    public:
1133     ReadHolder() : lock_(nullptr) {}
1134
1135     explicit ReadHolder(const SharedMutexImpl* lock) : ReadHolder(*lock) {}
1136
1137     explicit ReadHolder(const SharedMutexImpl& lock)
1138         : lock_(const_cast<SharedMutexImpl*>(&lock)) {
1139       lock_->lock_shared(token_);
1140     }
1141
1142     ReadHolder(ReadHolder&& rhs) noexcept : lock_(rhs.lock_),
1143                                             token_(rhs.token_) {
1144       rhs.lock_ = nullptr;
1145     }
1146
1147     // Downgrade from upgrade mode
1148     explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) {
1149       assert(upgraded.lock_ != nullptr);
1150       upgraded.lock_ = nullptr;
1151       lock_->unlock_upgrade_and_lock_shared(token_);
1152     }
1153
1154     // Downgrade from exclusive mode
1155     explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1156       assert(writer.lock_ != nullptr);
1157       writer.lock_ = nullptr;
1158       lock_->unlock_and_lock_shared(token_);
1159     }
1160
1161     ReadHolder& operator=(ReadHolder&& rhs) noexcept {
1162       std::swap(lock_, rhs.lock_);
1163       std::swap(token_, rhs.token_);
1164       return *this;
1165     }
1166
1167     ReadHolder(const ReadHolder& rhs) = delete;
1168     ReadHolder& operator=(const ReadHolder& rhs) = delete;
1169
1170     ~ReadHolder() {
1171       unlock();
1172     }
1173
1174     void unlock() {
1175       if (lock_) {
1176         lock_->unlock_shared(token_);
1177         lock_ = nullptr;
1178       }
1179     }
1180
1181    private:
1182     friend class UpgradeHolder;
1183     friend class WriteHolder;
1184     SharedMutexImpl* lock_;
1185     SharedMutexToken token_;
1186   };
1187
1188   class UpgradeHolder {
1189    public:
1190     UpgradeHolder() : lock_(nullptr) {}
1191
1192     explicit UpgradeHolder(SharedMutexImpl* lock) : UpgradeHolder(*lock) {}
1193
1194     explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
1195       lock_->lock_upgrade();
1196     }
1197
1198     // Downgrade from exclusive mode
1199     explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1200       assert(writer.lock_ != nullptr);
1201       writer.lock_ = nullptr;
1202       lock_->unlock_and_lock_upgrade();
1203     }
1204
1205     UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) {
1206       rhs.lock_ = nullptr;
1207     }
1208
1209     UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept {
1210       std::swap(lock_, rhs.lock_);
1211       return *this;
1212     }
1213
1214     UpgradeHolder(const UpgradeHolder& rhs) = delete;
1215     UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;
1216
1217     ~UpgradeHolder() {
1218       unlock();
1219     }
1220
1221     void unlock() {
1222       if (lock_) {
1223         lock_->unlock_upgrade();
1224         lock_ = nullptr;
1225       }
1226     }
1227
1228    private:
1229     friend class WriteHolder;
1230     friend class ReadHolder;
1231     SharedMutexImpl* lock_;
1232   };
1233
1234   class WriteHolder {
1235    public:
1236     WriteHolder() : lock_(nullptr) {}
1237
1238     explicit WriteHolder(SharedMutexImpl* lock) : WriteHolder(*lock) {}
1239
1240     explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
1241       lock_->lock();
1242     }
1243
1244     // Promotion from upgrade mode
1245     explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) {
1246       assert(upgrade.lock_ != nullptr);
1247       upgrade.lock_ = nullptr;
1248       lock_->unlock_upgrade_and_lock();
1249     }
1250
1251     // README:
1252     //
1253     // It is intended that WriteHolder(ReadHolder&& rhs) do not exist.
1254     //
1255     // Shared locks (read) can not safely upgrade to unique locks (write).
1256     // That upgrade path is a well-known recipe for deadlock, so we explicitly
1257     // disallow it.
1258     //
1259     // If you need to do a conditional mutation, you have a few options:
1260     // 1. Check the condition under a shared lock and release it.
1261     //    Then maybe check the condition again under a unique lock and maybe do
1262     //    the mutation.
1263     // 2. Check the condition once under an upgradeable lock.
1264     //    Then maybe upgrade the lock to a unique lock and do the mutation.
1265     // 3. Check the condition and maybe perform the mutation under a unique
1266     //    lock.
1267     //
1268     // Relevant upgradeable lock notes:
1269     // * At most one upgradeable lock can be held at a time for a given shared
1270     //   mutex, just like a unique lock.
1271     // * An upgradeable lock may be held concurrently with any number of shared
1272     //   locks.
1273     // * An upgradeable lock may be upgraded atomically to a unique lock.
1274
1275     WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
1276       rhs.lock_ = nullptr;
1277     }
1278
1279     WriteHolder& operator=(WriteHolder&& rhs) noexcept {
1280       std::swap(lock_, rhs.lock_);
1281       return *this;
1282     }
1283
1284     WriteHolder(const WriteHolder& rhs) = delete;
1285     WriteHolder& operator=(const WriteHolder& rhs) = delete;
1286
1287     ~WriteHolder() {
1288       unlock();
1289     }
1290
1291     void unlock() {
1292       if (lock_) {
1293         lock_->unlock();
1294         lock_ = nullptr;
1295       }
1296     }
1297
1298    private:
1299     friend class ReadHolder;
1300     friend class UpgradeHolder;
1301     SharedMutexImpl* lock_;
1302   };
1303
1304   // Adapters for Synchronized<>
1305   friend void acquireRead(SharedMutexImpl& lock) { lock.lock_shared(); }
1306   friend void acquireReadWrite(SharedMutexImpl& lock) { lock.lock(); }
1307   friend void releaseRead(SharedMutexImpl& lock) { lock.unlock_shared(); }
1308   friend void releaseReadWrite(SharedMutexImpl& lock) { lock.unlock(); }
1309   friend bool acquireRead(SharedMutexImpl& lock, unsigned int ms) {
1310     return lock.try_lock_shared_for(std::chrono::milliseconds(ms));
1311   }
1312   friend bool acquireReadWrite(SharedMutexImpl& lock, unsigned int ms) {
1313     return lock.try_lock_for(std::chrono::milliseconds(ms));
1314   }
1315 };
1316
1317 typedef SharedMutexImpl<true> SharedMutexReadPriority;
1318 typedef SharedMutexImpl<false> SharedMutexWritePriority;
1319 typedef SharedMutexWritePriority SharedMutex;
1320
1321 // Prevent the compiler from instantiating these in other translation units.
1322 // They are instantiated once in SharedMutex.cpp
1323 extern template class SharedMutexImpl<true>;
1324 extern template class SharedMutexImpl<false>;
1325
// Out-of-class definition of the static deferred-reader slot array.  The
// empty initializer leaves every slot zero, which lockSharedImpl treats
// as "empty slot".
template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately>
typename SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
    DeferredReaderSlot
        SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
            deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor] =
                {};
1336
// Out-of-class definition of the slot hint used as the starting point of
// the token-less unlock search; it starts out at slot 0.
template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately>
FOLLY_SHAREDMUTEX_TLS uint32_t
    SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
        tls_lastTokenlessSlot = 0;
1345
1346 template <
1347     bool ReaderPriority,
1348     typename Tag_,
1349     template <typename> class Atom,
1350     bool BlockImmediately>
1351 bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1352     tryUnlockTokenlessSharedDeferred() {
1353   auto bestSlot = tls_lastTokenlessSlot;
1354   for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
1355     auto slotPtr = deferredReader(bestSlot ^ i);
1356     auto slotValue = slotPtr->load(std::memory_order_relaxed);
1357     if (slotValue == tokenlessSlotValue() &&
1358         slotPtr->compare_exchange_strong(slotValue, 0)) {
1359       tls_lastTokenlessSlot = bestSlot ^ i;
1360       return true;
1361     }
1362   }
1363   return false;
1364 }
1365
// Slow path of the shared lock.
//
// state is an in/out snapshot of state_.  token may be null for the
// token-less API; when non-null it is filled in with INLINE_SHARED, or
// with DEFERRED_SHARED plus the slot index, depending on how the lock
// was recorded.
//
// Returns true once the shared lock is held.  Returns false only when the
// wait for kHasE to clear fails and ctx.canTimeOut() is true.
template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately>
template <class WaitContext>
bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
    lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
  while (true) {
    // Readers cannot enter while an exclusive holder is present.
    if (UNLIKELY((state & kHasE) != 0) &&
        !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
      return false;
    }

    // slot is only meaningful when slotValue ends up 0, i.e. when the
    // search below ran and found an empty slot.
    uint32_t slot;
    uintptr_t slotValue = 1; // any non-zero value will do

    // Decide whether to attempt a deferred lock: either deferral is
    // already on, or there are enough inline readers to turn it on and no
    // reader-priority writer has announced itself with kBegunE.
    bool canAlreadyDefer = (state & kMayDefer) != 0;
    bool aboveDeferThreshold =
        (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
    bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
    if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
      // starting point for our empty-slot search, can change after
      // calling waitForZeroBits
      uint32_t bestSlot =
          (uint32_t)folly::detail::AccessSpreader<Atom>::current(
              kMaxDeferredReaders);

      // deferred readers are already enabled, or it is time to
      // enable them if we can find a slot
      for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
        slot = bestSlot ^ i;
        assert(slot < kMaxDeferredReaders);
        slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
        if (slotValue == 0) {
          // found empty slot
          break;
        }
      }
    }

    if (slotValue != 0) {
      // not yet deferred, or no empty slots
      if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
        // successfully recorded the read lock inline
        if (token != nullptr) {
          token->type_ = Token::Type::INLINE_SHARED;
        }
        return true;
      }
      // state is updated, try again
      continue;
    }

    // record that deferred readers might be in use if necessary
    if ((state & kMayDefer) == 0) {
      if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
        // keep going if CAS failed because somebody else set the bit
        // for us
        if ((state & (kHasE | kMayDefer)) != kMayDefer) {
          continue;
        }
      }
      // state = state | kMayDefer;
    }

    // try to use the slot
    bool gotSlot = deferredReader(slot)->compare_exchange_strong(
        slotValue,
        token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());

    // If we got the slot, we need to verify that an exclusive lock
    // didn't happen since we last checked.  If we didn't get the slot we
    // need to recheck state_ anyway to make sure we don't waste too much
    // work.  It is also possible that since we checked state_ someone
    // has acquired and released the write lock, clearing kMayDefer.
    // Both cases are covered by looking for the readers-possible bit,
    // because it is off when the exclusive lock bit is set.
    state = state_.load(std::memory_order_acquire);

    if (!gotSlot) {
      continue;
    }

    // Remember where the token-less lock went so that the token-less
    // unlock search can start there.
    if (token == nullptr) {
      tls_lastTokenlessSlot = slot;
    }

    // kMayDefer still set after claiming the slot means the deferred
    // lock is valid.
    if ((state & kMayDefer) != 0) {
      assert((state & kHasE) == 0);
      // success
      if (token != nullptr) {
        token->type_ = Token::Type::DEFERRED_SHARED;
        token->slot_ = (uint16_t)slot;
      }
      return true;
    }

    // release the slot before retrying
    if (token == nullptr) {
      // We can't rely on slot.  Token-less slot values can be freed by
      // any unlock_shared(), so we need to do the full deferredReader
      // search during unlock.  Unlike unlock_shared(), we can't trust
      // kPrevDefer here.  This deferred lock isn't visible to lock()
      // (that's the whole reason we're undoing it) so there might have
      // subsequently been an unlock() and lock() with no intervening
      // transition to deferred mode.
      if (!tryUnlockTokenlessSharedDeferred()) {
        unlockSharedInline();
      }
    } else {
      // If the slot no longer holds our value, the lock has been moved
      // inline in the meantime, so release it there.
      if (!tryUnlockSharedDeferred(slot)) {
        unlockSharedInline();
      }
    }

    // We got here not because the lock was unavailable, but because
    // we lost a compare-and-swap.  Try-lock is typically allowed to
    // have spurious failures, but there is no lock efficiency gain
    // from exploiting that freedom here.
  }
}
1488
1489 } // namespace folly