/*
- * Copyright 2015 Facebook, Inc.
+ * Copyright 2017 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
#pragma once
#include <stdint.h>
+
#include <atomic>
+#include <cstring>
#include <thread>
#include <type_traits>
+
#include <folly/Likely.h>
-#include <folly/detail/CacheLocality.h>
+#include <folly/concurrency/CacheLocality.h>
#include <folly/detail/Futex.h>
-#include <sys/resource.h>
+#include <folly/portability/Asm.h>
+#include <folly/portability/SysResource.h>
// SharedMutex is a reader-writer lock. It is small, very fast, scalable
// on multi-core, and suitable for use when readers or writers may block.
// the shared slots. If you can conveniently pass state from lock
// acquisition to release then the fastest mechanism is to std::move
// the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
-// lock_shared(Token&) and unlock_sahred(Token&)). The guard or token
+// lock_shared(Token&) and unlock_shared(Token&)). The guard or token
// will tell unlock_shared where in deferredReaders[] to look for the
// deferred lock. The Token-less version of unlock_shared() works in all
// cases, but is optimized for the common (no inter-thread handoff) case.
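//
// A rough usage sketch of the token handoff (readProtectedData() is a
// hypothetical helper, shown only for illustration):
//
//   folly::SharedMutex mutex;
//   folly::SharedMutex::Token token;
//   mutex.lock_shared(token);     // token remembers the deferred slot, if any
//   readProtectedData();          // read-side critical section
//   mutex.unlock_shared(token);   // releases via the remembered slot
//
// or, with the RAII guard:
//
//   folly::SharedMutex::ReadHolder holder(mutex);
//   readProtectedData();          // holder releases in its destructor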
//
// If you have observed by profiling that your SharedMutex-s are getting
// cache misses on deferredReaders[] due to another SharedMutex user, then
-// you can use the tag type plus the RWDEFERREDLOCK_DECLARE_STATIC_STORAGE
-// macro to create your own instantiation of the type. The contention
-// threshold (see kNumSharedToStartDeferring) should make this unnecessary
-// in all but the most extreme cases. Make sure to check that the
-// increased icache and dcache footprint of the tagged result is worth it.
+// you can use the tag type to create your own instantiation of the type.
+// The contention threshold (see kNumSharedToStartDeferring) should make
+// this unnecessary in all but the most extreme cases. Make sure to check
+// that the increased icache and dcache footprint of the tagged result is
+// worth it.
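+//
+// A hedged sketch of such a tagged instantiation (MyTag and MySharedMutex
+// are hypothetical names):
+//
+//   struct MyTag {};
+//   using MySharedMutex = folly::SharedMutexImpl<true, MyTag>;
+//
+// Each distinct Tag_ produces a separate instantiation with its own static
+// deferredReaders[] array, so it does not contend with other users.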
+
+// SharedMutex's use of thread local storage is an optimization, so
+// for the case where thread local storage is not supported, define it
+// away.
+#ifndef FOLLY_SHAREDMUTEX_TLS
+#if !FOLLY_MOBILE
+#define FOLLY_SHAREDMUTEX_TLS FOLLY_TLS
+#else
+#define FOLLY_SHAREDMUTEX_TLS
+#endif
+#endif
namespace folly {
uint16_t slot_;
};
-template <bool ReaderPriority,
- typename Tag_ = void,
- template <typename> class Atom = std::atomic,
- bool BlockImmediately = false>
+template <
+ bool ReaderPriority,
+ typename Tag_ = void,
+ template <typename> class Atom = std::atomic,
+ bool BlockImmediately = false>
class SharedMutexImpl {
public:
static constexpr bool kReaderPriority = ReaderPriority;
class UpgradeHolder;
class WriteHolder;
- SharedMutexImpl() : state_(0) {}
+ constexpr SharedMutexImpl() noexcept : state_(0) {}
SharedMutexImpl(const SharedMutexImpl&) = delete;
SharedMutexImpl(SharedMutexImpl&&) = delete;
// See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
// description about why this property needs to be explicitly mentioned.
~SharedMutexImpl() {
-#ifndef NDEBUG
- auto state = state_.load(std::memory_order_acquire);
+ auto state = state_.load(std::memory_order_relaxed);
+ if (UNLIKELY((state & kHasS) != 0)) {
+ cleanupTokenlessSharedDeferred(state);
+ }
+#ifndef NDEBUG
// if a futexWait fails to go to sleep because the value has been
// changed, we don't necessarily clean up the wait bits, so it is
// possible they will be set here in a correct system
assert((state & ~(kWaitingAny | kMayDefer)) == 0);
if ((state & kMayDefer) != 0) {
for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
- auto slotValue = deferredReader(slot)->load(std::memory_order_acquire);
+ auto slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
assert(!slotValueIsThis(slotValue));
}
}
// kPrevDefer so we can tell if the pre-lock() lock_shared() might
// have deferred
if ((state & (kMayDefer | kPrevDefer)) == 0 ||
- !tryUnlockAnySharedDeferred()) {
+ !tryUnlockTokenlessSharedDeferred()) {
// Matching lock_shared() couldn't have deferred, or the deferred
// lock has already been inlined by applyDeferredReaders()
unlockSharedInline();
void unlock_upgrade_and_lock_shared() {
auto state = (state_ -= kHasU - kIncrHasS);
- assert((state & (kWaitingNotS | kHasSolo)) == 0 && (state & kHasS) != 0);
+ assert((state & (kWaitingNotS | kHasSolo)) == 0);
wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
}
bool canTimeOut() { return true; }
bool shouldTimeOut() { return true; }
- bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
+ bool doWait(Futex& /* futex */,
+ uint32_t /* expected */,
+ uint32_t /* waitMask */) {
return false;
}
};
// 32 bits of state
Futex state_;
+ // S count needs to be on the end, because we explicitly allow it to
+ // underflow. This can occur while we are in the middle of applying
+ // deferred locks (we remove them from deferredReaders[] before
+ // inlining them), or during token-less unlock_shared() if a racing
+ // lock_shared();unlock_shared() moves the deferredReaders slot while
+ // the first unlock_shared() is scanning. The former case is cleaned
+ // up before we finish applying the locks. The latter case can persist
+ // until destruction, when it is cleaned up.
static constexpr uint32_t kIncrHasS = 1 << 10;
static constexpr uint32_t kHasS = ~(kIncrHasS - 1);
// guarantees we won't have inter-L1 contention. We give ourselves
// a factor of 2 on the core count, which should hold us for a couple
// processor generations. deferredReaders[] is 2048 bytes currently.
+ public:
static constexpr uint32_t kMaxDeferredReaders = 64;
static constexpr uint32_t kDeferredSearchDistance = 2;
static constexpr uint32_t kDeferredSeparationFactor = 4;
+ private:
+
static_assert(!(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
"kMaxDeferredReaders must be a power of 2");
static_assert(!(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
static constexpr uintptr_t kTokenless = 0x1;
// This is the starting location for Token-less unlock_shared().
- static FOLLY_TLS uint32_t tls_lastTokenlessSlot;
+ static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastTokenlessSlot;
+
+ // Last deferred reader slot used.
+ static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastDeferredReaderSlot;
+
// Only indexes divisible by kDeferredSeparationFactor are used.
// If any of those elements points to a SharedMutexImpl, then it
// should be considered that there is a shared lock on that instance.
// See kTokenless.
+ public:
typedef Atom<uintptr_t> DeferredReaderSlot;
- static DeferredReaderSlot deferredReaders
+
+ private:
+ FOLLY_ALIGN_TO_AVOID_FALSE_SHARING static DeferredReaderSlot deferredReaders
[kMaxDeferredReaders *
- kDeferredSeparationFactor] FOLLY_ALIGN_TO_AVOID_FALSE_SHARING;
+ kDeferredSeparationFactor];
// Performs an exclusive lock, waiting for state_ & waitMask to be
// zero first
}
uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer;
- if (!ReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
+ if (!kReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
// Block readers immediately, either because we are in write
// priority mode or because we can acquire the lock in one
// step. Note that if state has kHasU, then we are doing an
return false;
}
- if (ReaderPriority && (state & kHasE) == 0) {
+ if (kReaderPriority && (state & kHasE) == 0) {
assert((state & kBegunE) != 0);
if (!state_.compare_exchange_strong(state,
(state & ~kBegunE) | kHasE)) {
if ((state & goal) == 0) {
return true;
}
-#if FOLLY_X64
- asm volatile("pause");
-#endif
+ asm_volatile_pause();
++spinCount;
if (UNLIKELY(spinCount >= kMaxSpinCount)) {
return ctx.canBlock() &&
WaitContext& ctx) {
#ifdef RUSAGE_THREAD
struct rusage usage;
+ std::memset(&usage, 0, sizeof(usage));
long before = -1;
#endif
for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
return;
}
}
-#if FOLLY_X64
- asm("pause");
-#endif
+ asm_volatile_pause();
if (UNLIKELY(++spinCount >= kMaxSpinCount)) {
applyDeferredReaders(state, ctx, slot);
return;
#ifdef RUSAGE_THREAD
struct rusage usage;
+ std::memset(&usage, 0, sizeof(usage));
long before = -1;
#endif
for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
}
template <class WaitContext>
- bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
- while (true) {
- if (UNLIKELY((state & kHasE) != 0) &&
- !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
- return false;
- }
-
- uint32_t slot;
- uintptr_t slotValue = 1; // any non-zero value will do
-
- bool canAlreadyDefer = (state & kMayDefer) != 0;
- bool aboveDeferThreshold =
- (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
- bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
- if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
- // starting point for our empty-slot search, can change after
- // calling waitForZeroBits
- uint32_t bestSlot =
- (uint32_t)folly::detail::AccessSpreader<Atom>::current(
- kMaxDeferredReaders);
-
- // deferred readers are already enabled, or it is time to
- // enable them if we can find a slot
- for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
- slot = bestSlot ^ i;
- assert(slot < kMaxDeferredReaders);
- slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
- if (slotValue == 0) {
- // found empty slot
- break;
- }
- }
- }
-
- if (slotValue != 0) {
- // not yet deferred, or no empty slots
- if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
- // successfully recorded the read lock inline
- if (token != nullptr) {
- token->type_ = Token::Type::INLINE_SHARED;
- }
- return true;
- }
- // state is updated, try again
- continue;
- }
-
- // record that deferred readers might be in use if necessary
- if ((state & kMayDefer) == 0) {
- if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
- // keep going if CAS failed because somebody else set the bit
- // for us
- if ((state & (kHasE | kMayDefer)) != kMayDefer) {
- continue;
- }
- }
- // state = state | kMayDefer;
- }
-
- // try to use the slot
- bool gotSlot = deferredReader(slot)->compare_exchange_strong(
- slotValue,
- token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());
-
- // If we got the slot, we need to verify that an exclusive lock
- // didn't happen since we last checked. If we didn't get the slot we
- // need to recheck state_ anyway to make sure we don't waste too much
- // work. It is also possible that since we checked state_ someone
- // has acquired and released the write lock, clearing kMayDefer.
- // Both cases are covered by looking for the readers-possible bit,
- // because it is off when the exclusive lock bit is set.
- state = state_.load(std::memory_order_acquire);
-
- if (!gotSlot) {
- continue;
- }
-
- if (token == nullptr) {
- tls_lastTokenlessSlot = slot;
- }
-
- if ((state & kMayDefer) != 0) {
- assert((state & kHasE) == 0);
- // success
- if (token != nullptr) {
- token->type_ = Token::Type::DEFERRED_SHARED;
- token->slot_ = (uint16_t)slot;
- }
- return true;
- }
-
- // release the slot before retrying
- if (token == nullptr) {
- // We can't rely on slot. Token-less slot values can be freed by
- // any unlock_shared(), so we need to do the full deferredReader
- // search during unlock. Unlike unlock_shared(), we can't trust
- // kPrevDefer here. This deferred lock isn't visible to lock()
- // (that's the whole reason we're undoing it) so there might have
- // subsequently been an unlock() and lock() with no intervening
- // transition to deferred mode.
- if (!tryUnlockAnySharedDeferred()) {
- unlockSharedInline();
- }
- } else {
- if (!tryUnlockSharedDeferred(slot)) {
- unlockSharedInline();
- }
- }
-
- // We got here not because the lock was unavailable, but because
- // we lost a compare-and-swap. Try-lock is typically allowed to
- // have spurious failures, but there is no lock efficiency gain
- // from exploiting that freedom here.
- }
- }
+ bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx);
- bool tryUnlockAnySharedDeferred() {
- auto bestSlot = tls_lastTokenlessSlot;
+ // Updates the state in/out argument as if the locks were made inline,
+ // but does not update state_
+ void cleanupTokenlessSharedDeferred(uint32_t& state) {
for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
- auto slotPtr = deferredReader(bestSlot ^ i);
+ auto slotPtr = deferredReader(i);
auto slotValue = slotPtr->load(std::memory_order_relaxed);
- if (slotValue == tokenlessSlotValue() &&
- slotPtr->compare_exchange_strong(slotValue, 0)) {
- tls_lastTokenlessSlot = bestSlot ^ i;
- return true;
+ if (slotValue == tokenlessSlotValue()) {
+ slotPtr->store(0, std::memory_order_relaxed);
+ state += kIncrHasS;
+ if ((state & kHasS) == 0) {
+ break;
+ }
}
}
- return false;
}
+ bool tryUnlockTokenlessSharedDeferred();
+
bool tryUnlockSharedDeferred(uint32_t slot) {
assert(slot < kMaxDeferredReaders);
auto slotValue = tokenfulSlotValue();
uint32_t unlockSharedInline() {
uint32_t state = (state_ -= kIncrHasS);
- assert((state & (kHasE | kBegunE)) != 0 || state < state + kIncrHasS);
+ assert((state & (kHasE | kBegunE | kMayDefer)) != 0 ||
+ state < state + kIncrHasS);
if ((state & kHasS) == 0) {
// Only the second half of lock() can be blocked by a non-zero
// reader count, so that's the only thing we need to wake
public:
class ReadHolder {
- public:
ReadHolder() : lock_(nullptr) {}
- explicit ReadHolder(const SharedMutexImpl* lock) : ReadHolder(*lock) {}
+ public:
+ explicit ReadHolder(const SharedMutexImpl* lock)
+ : lock_(const_cast<SharedMutexImpl*>(lock)) {
+ if (lock_) {
+ lock_->lock_shared(token_);
+ }
+ }
explicit ReadHolder(const SharedMutexImpl& lock)
: lock_(const_cast<SharedMutexImpl*>(&lock)) {
ReadHolder& operator=(const ReadHolder& rhs) = delete;
~ReadHolder() {
+ unlock();
+ }
+
+ void unlock() {
if (lock_) {
lock_->unlock_shared(token_);
+ lock_ = nullptr;
}
}
};
class UpgradeHolder {
- public:
UpgradeHolder() : lock_(nullptr) {}
- explicit UpgradeHolder(SharedMutexImpl* lock) : UpgradeHolder(*lock) {}
+ public:
+ explicit UpgradeHolder(SharedMutexImpl* lock) : lock_(lock) {
+ if (lock_) {
+ lock_->lock_upgrade();
+ }
+ }
explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
lock_->lock_upgrade();
UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;
~UpgradeHolder() {
+ unlock();
+ }
+
+ void unlock() {
if (lock_) {
lock_->unlock_upgrade();
+ lock_ = nullptr;
}
}
};
class WriteHolder {
- public:
WriteHolder() : lock_(nullptr) {}
- explicit WriteHolder(SharedMutexImpl* lock) : WriteHolder(*lock) {}
+ public:
+ explicit WriteHolder(SharedMutexImpl* lock) : lock_(lock) {
+ if (lock_) {
+ lock_->lock();
+ }
+ }
explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
lock_->lock();
lock_->unlock_upgrade_and_lock();
}
+ // README:
+ //
+ // It is intended that WriteHolder(ReadHolder&& rhs) not exist.
+ //
+ // Shared locks (read) cannot safely upgrade to unique locks (write).
+ // That upgrade path is a well-known recipe for deadlock, so we explicitly
+ // disallow it.
+ //
+ // If you need to do a conditional mutation, you have a few options:
+ // 1. Check the condition under a shared lock and release it.
+ // Then maybe check the condition again under a unique lock and maybe do
+ // the mutation.
+ // 2. Check the condition once under an upgradeable lock.
+ // Then maybe upgrade the lock to a unique lock and do the mutation.
+ // 3. Check the condition and maybe perform the mutation under a unique
+ // lock.
+ //
+ // Relevant upgradeable lock notes:
+ // * At most one upgradeable lock can be held at a time for a given shared
+ // mutex, just like a unique lock.
+ // * An upgradeable lock may be held concurrently with any number of shared
+ // locks.
+ // * An upgradeable lock may be upgraded atomically to a unique lock.
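+ //
+ // A hedged sketch of option 2 (needsUpdate() and applyUpdate() are
+ // hypothetical helpers):
+ //
+ //   SharedMutex::UpgradeHolder ulock(mutex);
+ //   if (needsUpdate()) {
+ //     SharedMutex::WriteHolder wlock(std::move(ulock)); // atomic upgrade
+ //     applyUpdate();
+ //   }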
+
WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
rhs.lock_ = nullptr;
}
WriteHolder& operator=(const WriteHolder& rhs) = delete;
~WriteHolder() {
+ unlock();
+ }
+
+ void unlock() {
if (lock_) {
lock_->unlock();
+ lock_ = nullptr;
}
}
friend void acquireReadWrite(SharedMutexImpl& lock) { lock.lock(); }
friend void releaseRead(SharedMutexImpl& lock) { lock.unlock_shared(); }
friend void releaseReadWrite(SharedMutexImpl& lock) { lock.unlock(); }
+ friend bool acquireRead(SharedMutexImpl& lock, unsigned int ms) {
+ return lock.try_lock_shared_for(std::chrono::milliseconds(ms));
+ }
+ friend bool acquireReadWrite(SharedMutexImpl& lock, unsigned int ms) {
+ return lock.try_lock_for(std::chrono::milliseconds(ms));
+ }
};
-#define COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(type) \
- template <> \
- type::DeferredReaderSlot \
- type::deferredReaders[type::kMaxDeferredReaders * \
- type::kDeferredSeparationFactor] = {}; \
- template <> \
- FOLLY_TLS uint32_t type::tls_lastTokenlessSlot = 0;
-
typedef SharedMutexImpl<true> SharedMutexReadPriority;
typedef SharedMutexImpl<false> SharedMutexWritePriority;
typedef SharedMutexWritePriority SharedMutex;
+// Prevent the compiler from instantiating these in other translation units.
+// They are instantiated once in SharedMutex.cpp
+extern template class SharedMutexImpl<true>;
+extern template class SharedMutexImpl<false>;
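+// (Illustrative sketch of the matching explicit instantiations the .cpp
+// file would provide; an assumption about its contents, not a quote:
+//   template class folly::SharedMutexImpl<true>;
+//   template class folly::SharedMutexImpl<false>;)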
+
+template <
+ bool ReaderPriority,
+ typename Tag_,
+ template <typename> class Atom,
+ bool BlockImmediately>
+typename SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
+ DeferredReaderSlot
+ SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
+ deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor] =
+ {};
+
+template <
+ bool ReaderPriority,
+ typename Tag_,
+ template <typename> class Atom,
+ bool BlockImmediately>
+FOLLY_SHAREDMUTEX_TLS uint32_t
+ SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
+ tls_lastTokenlessSlot = 0;
+
+template <
+ bool ReaderPriority,
+ typename Tag_,
+ template <typename> class Atom,
+ bool BlockImmediately>
+FOLLY_SHAREDMUTEX_TLS uint32_t
+ SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
+ tls_lastDeferredReaderSlot = 0;
+
+template <
+ bool ReaderPriority,
+ typename Tag_,
+ template <typename> class Atom,
+ bool BlockImmediately>
+bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
+ tryUnlockTokenlessSharedDeferred() {
+ auto bestSlot = tls_lastTokenlessSlot;
+ for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
+ auto slotPtr = deferredReader(bestSlot ^ i);
+ auto slotValue = slotPtr->load(std::memory_order_relaxed);
+ if (slotValue == tokenlessSlotValue() &&
+ slotPtr->compare_exchange_strong(slotValue, 0)) {
+ tls_lastTokenlessSlot = bestSlot ^ i;
+ return true;
+ }
+ }
+ return false;
+}
+
+template <
+ bool ReaderPriority,
+ typename Tag_,
+ template <typename> class Atom,
+ bool BlockImmediately>
+template <class WaitContext>
+bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
+ lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
+ while (true) {
+ if (UNLIKELY((state & kHasE) != 0) &&
+ !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
+ return false;
+ }
+
+ uint32_t slot = tls_lastDeferredReaderSlot;
+ uintptr_t slotValue = 1; // any non-zero value will do
+
+ bool canAlreadyDefer = (state & kMayDefer) != 0;
+ bool aboveDeferThreshold =
+ (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
+ bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
+ if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
+ /* Try using the most recent slot first. */
+ slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
+ if (slotValue != 0) {
+ // starting point for our empty-slot search, can change after
+ // calling waitForZeroBits
+ uint32_t bestSlot =
+ (uint32_t)folly::AccessSpreader<Atom>::current(kMaxDeferredReaders);
+
+ // deferred readers are already enabled, or it is time to
+ // enable them if we can find a slot
+ for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
+ slot = bestSlot ^ i;
+ assert(slot < kMaxDeferredReaders);
+ slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
+ if (slotValue == 0) {
+ // found empty slot
+ tls_lastDeferredReaderSlot = slot;
+ break;
+ }
+ }
+ }
+ }
+
+ if (slotValue != 0) {
+ // not yet deferred, or no empty slots
+ if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
+ // successfully recorded the read lock inline
+ if (token != nullptr) {
+ token->type_ = Token::Type::INLINE_SHARED;
+ }
+ return true;
+ }
+ // state is updated, try again
+ continue;
+ }
+
+ // record that deferred readers might be in use if necessary
+ if ((state & kMayDefer) == 0) {
+ if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
+ // keep going if CAS failed because somebody else set the bit
+ // for us
+ if ((state & (kHasE | kMayDefer)) != kMayDefer) {
+ continue;
+ }
+ }
+ // state = state | kMayDefer;
+ }
+
+ // try to use the slot
+ bool gotSlot = deferredReader(slot)->compare_exchange_strong(
+ slotValue,
+ token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());
+
+ // If we got the slot, we need to verify that an exclusive lock
+ // didn't happen since we last checked. If we didn't get the slot we
+ // need to recheck state_ anyway to make sure we don't waste too much
+ // work. It is also possible that since we checked state_ someone
+ // has acquired and released the write lock, clearing kMayDefer.
+ // Both cases are covered by looking for the readers-possible bit,
+ // because it is off when the exclusive lock bit is set.
+ state = state_.load(std::memory_order_acquire);
+
+ if (!gotSlot) {
+ continue;
+ }
+
+ if (token == nullptr) {
+ tls_lastTokenlessSlot = slot;
+ }
+
+ if ((state & kMayDefer) != 0) {
+ assert((state & kHasE) == 0);
+ // success
+ if (token != nullptr) {
+ token->type_ = Token::Type::DEFERRED_SHARED;
+ token->slot_ = (uint16_t)slot;
+ }
+ return true;
+ }
+
+ // release the slot before retrying
+ if (token == nullptr) {
+ // We can't rely on slot. Token-less slot values can be freed by
+ // any unlock_shared(), so we need to do the full deferredReader
+ // search during unlock. Unlike unlock_shared(), we can't trust
+ // kPrevDefer here. This deferred lock isn't visible to lock()
+ // (that's the whole reason we're undoing it) so there might have
+ // subsequently been an unlock() and lock() with no intervening
+ // transition to deferred mode.
+ if (!tryUnlockTokenlessSharedDeferred()) {
+ unlockSharedInline();
+ }
+ } else {
+ if (!tryUnlockSharedDeferred(slot)) {
+ unlockSharedInline();
+ }
+ }
+
+ // We got here not because the lock was unavailable, but because
+ // we lost a compare-and-swap. Try-lock is typically allowed to
+ // have spurious failures, but there is no lock efficiency gain
+ // from exploiting that freedom here.
+ }
+}
+
} // namespace folly