SharedMutex - a small fast scalable reader-writer lock

author Nathan Bronson <ngbronson@fb.com>

Wed, 11 Mar 2015 05:21:54 +0000 (22:21 -0700)

committer Andre Azevedo <aap@fb.com>

Wed, 18 Mar 2015 02:50:35 +0000 (19:50 -0700)
author Nathan Bronson <ngbronson@fb.com>
Wed, 11 Mar 2015 05:21:54 +0000 (22:21 -0700)
committer Andre Azevedo <aap@fb.com>
Wed, 18 Mar 2015 02:50:35 +0000 (19:50 -0700)
diff --git a/folly/Makefile.am b/folly/Makefile.am

index c57494ce9e0b2e440c9000e41eb95121418037d3..ef4240e3a6bbd8c74a23f0970772012f06cb9adf 100644 (file)
--- a/folly/Makefile.am
+++ b/folly/Makefile.am
@@ -74,11 +74,12 @@ nobase_follyinclude_HEADERS = \
         experimental/EventCount.h \
         experimental/FunctionScheduler.h \
         experimental/io/FsUtil.h \
         experimental/EventCount.h \
         experimental/FunctionScheduler.h \
         experimental/io/FsUtil.h \
+       experimental/JSONSchema.h \
+       experimental/Select64.h \
+       experimental/SharedMutex.h
         experimental/Singleton.h \
         experimental/Singleton-inl.h \
         experimental/TestUtil.h \
         experimental/Singleton.h \
         experimental/Singleton-inl.h \
         experimental/TestUtil.h \
-       experimental/Select64.h \
-       experimental/JSONSchema.h \
         FBString.h \
         FBVector.h \
         File.h \
         FBString.h \
         FBVector.h \
         File.h \
@@ -339,12 +340,13 @@ libfolly_la_SOURCES = \
         TimeoutQueue.cpp \
         Uri.cpp \
         Version.cpp \
         TimeoutQueue.cpp \
         Uri.cpp \
         Version.cpp \
-       experimental/io/FsUtil.cpp \
         experimental/FunctionScheduler.cpp \
         experimental/FunctionScheduler.cpp \
+       experimental/io/FsUtil.cpp \
+       experimental/JSONSchema.cpp \
+       experimental/Select64.cpp \
+       experimental/SharedMutex.cpp
         experimental/Singleton.cpp \
         experimental/TestUtil.cpp \
         experimental/Singleton.cpp \
         experimental/TestUtil.cpp \
-       experimental/Select64.cpp \
-       experimental/JSONSchema.cpp \
         wangle/acceptor/Acceptor.cpp \
         wangle/acceptor/ConnectionManager.cpp \
         wangle/acceptor/LoadShedConfiguration.cpp \
         wangle/acceptor/Acceptor.cpp \
         wangle/acceptor/ConnectionManager.cpp \
         wangle/acceptor/LoadShedConfiguration.cpp \
diff --git a/folly/experimental/SharedMutex.cpp b/folly/experimental/SharedMutex.cpp

new file mode 100644 (file)

index 0000000..3a3ac17
--- /dev/null
+++ b/folly/experimental/SharedMutex.cpp
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2015 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SharedMutex.h"
+
+COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(
+    folly::SharedMutexReadPriority);
+COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(
+    folly::SharedMutexWritePriority);
diff --git a/folly/experimental/SharedMutex.h b/folly/experimental/SharedMutex.h

new file mode 100644 (file)

index 0000000..f139f9d
--- /dev/null
+++ b/folly/experimental/SharedMutex.h
@@ -0,0 +1,1384 @@
+/*
+ * Copyright 2015 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// @author Nathan Bronson (ngbronson@fb.com)
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <thread>
+#include <type_traits>
+#include <folly/Likely.h>
+#include <folly/detail/CacheLocality.h>
+#include <folly/detail/Futex.h>
+#include <sys/resource.h>
+
+// SharedMutex is a reader-writer lock.  It is small, very fast, scalable
+// on multi-core, and suitable for use when readers or writers may block.
+// Unlike most other reader-writer locks, its throughput with concurrent
+// readers scales linearly; it is able to acquire and release the lock
+// in shared mode without cache line ping-ponging.  It is suitable for
+// a wide range of lock hold times because it starts with spinning,
+// proceeds to using sched_yield with a preemption heuristic, and then
+// waits using futex and precise wakeups.
+//
+// SharedMutex provides all of the methods of folly::RWSpinLock,
+// boost::shared_mutex, boost::upgrade_mutex, and C++14's
+// std::shared_timed_mutex.  All operations that can block are available
+// in try, try-for, and try-until (system_clock or steady_clock) versions.
+//
+// SharedMutexReadPriority gives priority to readers,
+// SharedMutexWritePriority gives priority to writers.  SharedMutex is an
+// alias for SharedMutexWritePriority, because writer starvation is more
+// likely than reader starvation for the read-heavy workloads targetted
+// by SharedMutex.
+//
+// In my tests SharedMutex is as good or better than the other
+// reader-writer locks in use at Facebook for almost all use cases,
+// sometimes by a wide margin.  (If it is rare that there are actually
+// concurrent readers then RWSpinLock can be a few nanoseconds faster.)
+// I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
+// boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses
+// spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
+// (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
+// It is generally as good or better than the rest when evaluating size,
+// speed, scalability, or latency outliers.  In the corner cases where
+// it is not the fastest (such as single-threaded use or heavy write
+// contention) it is never very much worse than the best.  See the bottom
+// of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
+//
+// Comparison to folly::RWSpinLock:
+//
+//  * SharedMutex is faster than RWSpinLock when there are actually
+//    concurrent read accesses (sometimes much faster), and ~5 nanoseconds
+//    slower when there is not actually any contention.  SharedMutex is
+//    faster in every (benchmarked) scenario where the shared mode of
+//    the lock is actually useful.
+//
+//  * Concurrent shared access to SharedMutex scales linearly, while total
+//    RWSpinLock throughput drops as more threads try to access the lock
+//    in shared mode.  Under very heavy read contention SharedMutex can
+//    be two orders of magnitude faster than RWSpinLock (or any reader
+//    writer lock that doesn't use striping or deferral).
+//
+//  * SharedMutex can safely protect blocking calls, because after an
+//    initial period of spinning it waits using futex().
+//
+//  * RWSpinLock prioritizes readers, SharedMutex has both reader- and
+//    writer-priority variants, but defaults to write priority.
+//
+//  * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
+//    doesn't.  Both semantics are reasonable.  The boost documentation
+//    doesn't explicitly talk about this behavior (except by omitting
+//    any statement that those lock modes conflict), but the boost
+//    implementations do allow new readers while the upgradeable mode
+//    is held.  See https://github.com/boostorg/thread/blob/master/
+//      include/boost/thread/pthread/shared_mutex.hpp
+//
+//  * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder
+//    (UpgradeableHolder would be even more pedantically correct).
+//    SharedMutex's holders have fewer methods (no reset) and are less
+//    tolerant (promotion and downgrade crash if the donor doesn't own
+//    the lock, and you must use the default constructor rather than
+//    passing a nullptr to the pointer constructor).
+//
+// Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
+// and "shared" modes.  At all times num_threads_holding_exclusive +
+// num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
+// 0 || num_threads_holding_shared == 0.  RWSpinLock has the additional
+// constraint that num_threads_holding_shared cannot increase while
+// num_threads_holding_upgrade is non-zero.
+//
+// Comparison to the internal RWLock:
+//
+//  * SharedMutex doesn't allow a maximum reader count to be configured,
+//    so it can't be used as a semaphore in the same way as RWLock.
+//
+//  * SharedMutex is 4 bytes, RWLock is 256.
+//
+//  * SharedMutex is as fast or faster than RWLock in all of my
+//    microbenchmarks, and has positive rather than negative scalability.
+//
+//  * RWLock and SharedMutex are both writer priority locks.
+//
+//  * SharedMutex avoids latency outliers as well as RWLock.
+//
+//  * SharedMutex uses different names (t != 0 below):
+//
+//    RWLock::lock(0)    => SharedMutex::lock()
+//
+//    RWLock::lock(t)    => SharedMutex::try_lock_for(milliseconds(t))
+//
+//    RWLock::tryLock()  => SharedMutex::try_lock()
+//
+//    RWLock::unlock()   => SharedMutex::unlock()
+//
+//    RWLock::enter(0)   => SharedMutex::lock_shared()
+//
+//    RWLock::enter(t)   =>
+//        SharedMutex::try_lock_shared_for(milliseconds(t))
+//
+//    RWLock::tryEnter() => SharedMutex::try_lock_shared()
+//
+//    RWLock::leave()    => SharedMutex::unlock_shared()
+//
+//  * RWLock allows the reader count to be adjusted by a value other
+//    than 1 during enter() or leave(). SharedMutex doesn't currently
+//    implement this feature.
+//
+//  * RWLock's methods are marked const, SharedMutex's aren't.
+//
+// Reader-writer locks have the potential to allow concurrent access
+// to shared read-mostly data, but in practice they often provide no
+// improvement over a mutex.  The problem is the cache coherence protocol
+// of modern CPUs.  Coherence is provided by making sure that when a cache
+// line is written it is present in only one core's cache.  Since a memory
+// write is required to acquire a reader-writer lock in shared mode, the
+// cache line holding the lock is invalidated in all of the other caches.
+// This leads to cache misses when another thread wants to acquire or
+// release the lock concurrently.  When the RWLock is colocated with the
+// data it protects (common), cache misses can also continue occur when
+// a thread that already holds the lock tries to read the protected data.
+//
+// Ideally, a reader-writer lock would allow multiple cores to acquire
+// and release the lock in shared mode without incurring any cache misses.
+// This requires that each core records its shared access in a cache line
+// that isn't read or written by other read-locking cores.  (Writers will
+// have to check all of the cache lines.)  Typical server hardware when
+// this comment was written has 16 L1 caches and cache lines of 64 bytes,
+// so a lock striped over all L1 caches would occupy a prohibitive 1024
+// bytes.  Nothing says that we need a separate set of per-core memory
+// locations for each lock, however.  Each SharedMutex instance is only
+// 4 bytes, but all locks together share a 2K area in which they make a
+// core-local record of lock acquisitions.
+//
+// SharedMutex's strategy of using a shared set of core-local stripes has
+// a potential downside, because it means that acquisition of any lock in
+// write mode can conflict with acquisition of any lock in shared mode.
+// If a lock instance doesn't actually experience concurrency then this
+// downside will outweight the upside of improved scalability for readers.
+// To avoid this problem we dynamically detect concurrent accesses to
+// SharedMutex, and don't start using the deferred mode unless we actually
+// observe concurrency.  See kNumSharedToStartDeferring.
+//
+// It is explicitly allowed to call lock_unshared() from a different
+// thread than lock_shared(), so long as they are properly paired.
+// lock_unshared() needs to find the location at which lock_shared()
+// recorded the lock, which might be in the lock itself or in any of
+// the shared slots.  If you can conveniently pass state from lock
+// acquisition to release then the fastest mechanism is to std::move
+// the SharedMutex::ReadHolder instance or an SharedMutex::Token (using
+// lock_shared(Token&) and unlock_sahred(Token&)).  The guard or token
+// will tell unlock_shared where in deferredReaders[] to look for the
+// deferred lock.  The Token-less version of unlock_shared() works in all
+// cases, but is optimized for the common (no inter-thread handoff) case.
+//
+// In both read- and write-priority mode, a waiting lock() (exclusive mode)
+// only blocks readers after it has waited for an active upgrade lock to be
+// released; until the upgrade lock is released (or upgraded or downgraded)
+// readers will still be able to enter.  Preferences about lock acquisition
+// are not guaranteed to be enforced perfectly (even if they were, there
+// is theoretically the chance that a thread could be arbitrarily suspended
+// between calling lock() and SharedMutex code actually getting executed).
+//
+// try_*_for methods always try at least once, even if the duration
+// is zero or negative.  The duration type must be compatible with
+// std::chrono::steady_clock.  try_*_until methods also always try at
+// least once.  std::chrono::system_clock and std::chrono::steady_clock
+// are supported.
+//
+// If you have observed by profiling that your SharedMutex-s are getting
+// cache misses on deferredReaders[] due to another SharedMutex user, then
+// you can use the tag type plus the RWDEFERREDLOCK_DECLARE_STATIC_STORAGE
+// macro to create your own instantiation of the type.  The contention
+// threshold (see kNumSharedToStartDeferring) should make this unnecessary
+// in all but the most extreme cases.  Make sure to check that the
+// increased icache and dcache footprint of the tagged result is worth it.
+
+namespace folly {
+
+struct SharedMutexToken {
+  enum class Type : uint16_t {
+    INVALID = 0,
+    INLINE_SHARED,
+    DEFERRED_SHARED,
+  };
+
+  Type type_;
+  uint16_t slot_;
+};
+
+template <bool ReaderPriority,
+          typename Tag_ = void,
+          template <typename> class Atom = std::atomic,
+          bool BlockImmediately = false>
+class SharedMutexImpl : boost::noncopyable {
+ public:
+  static constexpr bool kReaderPriority = ReaderPriority;
+  typedef Tag_ Tag;
+
+  typedef SharedMutexToken Token;
+
+  class ReadHolder;
+  class UpgradeHolder;
+  class WriteHolder;
+
+  SharedMutexImpl() : state_(0) {}
+
+  // It is an error to destroy an SharedMutex that still has
+  // any outstanding locks.  This is checked if NDEBUG isn't defined.
+  // SharedMutex's exclusive mode can be safely used to guard the lock's
+  // own destruction.  If, for example, you acquire the lock in exclusive
+  // mode and then observe that the object containing the lock is no longer
+  // needed, you can unlock() and then immediately destroy the lock.
+  // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
+  // description about why this property needs to be explicitly mentioned.
+  ~SharedMutexImpl() {
+#ifndef NDEBUG
+    auto state = state_.load(std::memory_order_acquire);
+
+    // if a futexWait fails to go to sleep because the value has been
+    // changed, we don't necessarily clean up the wait bits, so it is
+    // possible they will be set here in a correct system
+    assert((state & ~(kWaitingAny | kMayDefer)) == 0);
+    if ((state & kMayDefer) != 0) {
+      for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
+        auto slotValue = deferredReader(slot)->load(std::memory_order_acquire);
+        assert(!slotValueIsThis(slotValue));
+      }
+    }
+#endif
+  }
+
+  void lock() {
+    WaitForever ctx;
+    (void)lockExclusiveImpl(kHasSolo, ctx);
+  }
+
+  bool try_lock() {
+    WaitNever ctx;
+    return lockExclusiveImpl(kHasSolo, ctx);
+  }
+
+  template <class Rep, class Period>
+  bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) {
+    WaitForDuration<Rep, Period> ctx(duration);
+    return lockExclusiveImpl(kHasSolo, ctx);
+  }
+
+  template <class Clock, class Duration>
+  bool try_lock_until(
+      const std::chrono::time_point<Clock, Duration>& absDeadline) {
+    WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
+    return lockExclusiveImpl(kHasSolo, ctx);
+  }
+
+  void unlock() {
+    // It is possible that we have a left-over kWaitingNotS if the last
+    // unlock_shared() that let our matching lock() complete finished
+    // releasing before lock()'s futexWait went to sleep.  Clean it up now
+    auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE));
+    assert((state & ~kWaitingAny) == 0);
+    wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
+  }
+
+  // Managing the token yourself makes unlock_shared a bit faster
+
+  void lock_shared() {
+    WaitForever ctx;
+    (void)lockSharedImpl(nullptr, ctx);
+  }
+
+  void lock_shared(Token& token) {
+    WaitForever ctx;
+    (void)lockSharedImpl(&token, ctx);
+  }
+
+  bool try_lock_shared() {
+    WaitNever ctx;
+    return lockSharedImpl(nullptr, ctx);
+  }
+
+  bool try_lock_shared(Token& token) {
+    WaitNever ctx;
+    return lockSharedImpl(&token, ctx);
+  }
+
+  template <class Rep, class Period>
+  bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration) {
+    WaitForDuration<Rep, Period> ctx(duration);
+    return lockSharedImpl(nullptr, ctx);
+  }
+
+  template <class Rep, class Period>
+  bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration,
+                           Token& token) {
+    WaitForDuration<Rep, Period> ctx(duration);
+    return lockSharedImpl(&token, ctx);
+  }
+
+  template <class Clock, class Duration>
+  bool try_lock_shared_until(
+      const std::chrono::time_point<Clock, Duration>& absDeadline) {
+    WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
+    return lockSharedImpl(nullptr, ctx);
+  }
+
+  template <class Clock, class Duration>
+  bool try_lock_shared_until(
+      const std::chrono::time_point<Clock, Duration>& absDeadline,
+      Token& token) {
+    WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
+    return lockSharedImpl(&token, ctx);
+  }
+
+  void unlock_shared() {
+    auto state = state_.load(std::memory_order_acquire);
+
+    // kPrevDefer can only be set if HasE or BegunE is set
+    assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer);
+
+    // lock() strips kMayDefer immediately, but then copies it to
+    // kPrevDefer so we can tell if the pre-lock() lock_shared() might
+    // have deferred
+    if ((state & (kMayDefer | kPrevDefer)) == 0 ||
+        !tryUnlockAnySharedDeferred()) {
+      // Matching lock_shared() couldn't have deferred, or the deferred
+      // lock has already been inlined by applyDeferredReaders()
+      unlockSharedInline();
+    }
+  }
+
+  void unlock_shared(Token& token) {
+    assert(token.type_ == Token::Type::INLINE_SHARED ||
+           token.type_ == Token::Type::DEFERRED_SHARED);
+
+    if (token.type_ != Token::Type::DEFERRED_SHARED ||
+        !tryUnlockSharedDeferred(token.slot_)) {
+      unlockSharedInline();
+    }
+#ifndef NDEBUG
+    token.type_ = Token::Type::INVALID;
+#endif
+  }
+
+  void unlock_and_lock_shared() {
+    // We can't use state_ -=, because we need to clear 2 bits (1 of which
+    // has an uncertain initial state) and set 1 other.  We might as well
+    // clear the relevant wake bits at the same time.  Note that since S
+    // doesn't block the beginning of a transition to E (writer priority
+    // can cut off new S, reader priority grabs BegunE and blocks deferred
+    // S) we need to wake E as well.
+    auto state = state_.load(std::memory_order_acquire);
+    do {
+      assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
+    } while (!state_.compare_exchange_strong(
+        state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS));
+    if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) {
+      futexWakeAll(kWaitingE | kWaitingU | kWaitingS);
+    }
+  }
+
+  void unlock_and_lock_shared(Token& token) {
+    unlock_and_lock_shared();
+    token.type_ = Token::Type::INLINE_SHARED;
+  }
+
+  void lock_upgrade() {
+    WaitForever ctx;
+    (void)lockUpgradeImpl(ctx);
+  }
+
+  bool try_lock_upgrade() {
+    WaitNever ctx;
+    return lockUpgradeImpl(ctx);
+  }
+
+  template <class Rep, class Period>
+  bool try_lock_upgrade_for(
+      const std::chrono::duration<Rep, Period>& duration) {
+    WaitForDuration<Rep, Period> ctx(duration);
+    return lockUpgradeImpl(ctx);
+  }
+
+  template <class Clock, class Duration>
+  bool try_lock_upgrade_until(
+      const std::chrono::time_point<Clock, Duration>& absDeadline) {
+    WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
+    return lockUpgradeImpl(ctx);
+  }
+
+  void unlock_upgrade() {
+    auto state = (state_ -= kHasU);
+    assert((state & (kWaitingNotS | kHasSolo)) == 0);
+    wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
+  }
+
+  void unlock_upgrade_and_lock() {
+    // no waiting necessary, so waitMask is empty
+    WaitForever ctx;
+    (void)lockExclusiveImpl(0, ctx);
+  }
+
+  void unlock_upgrade_and_lock_shared() {
+    auto state = (state_ -= kHasU - kIncrHasS);
+    assert((state & (kWaitingNotS | kHasSolo)) == 0 && (state & kHasS) != 0);
+    wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
+  }
+
+  void unlock_upgrade_and_lock_shared(Token& token) {
+    unlock_upgrade_and_lock_shared();
+    token.type_ = Token::Type::INLINE_SHARED;
+  }
+
+  void unlock_and_lock_upgrade() {
+    // We can't use state_ -=, because we need to clear 2 bits (1 of
+    // which has an uncertain initial state) and set 1 other.  We might
+    // as well clear the relevant wake bits at the same time.
+    auto state = state_.load(std::memory_order_acquire);
+    while (true) {
+      assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
+      auto after =
+          (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU;
+      if (state_.compare_exchange_strong(state, after)) {
+        if ((state & kWaitingS) != 0) {
+          futexWakeAll(kWaitingS);
+        }
+        return;
+      }
+    }
+  }
+
+ private:
+  typedef typename folly::detail::Futex<Atom> Futex;
+
+  // Internally we use four kinds of wait contexts.  These are structs
+  // that provide a doWait method that returns true if a futex wake
+  // was issued that intersects with the waitMask, false if there was a
+  // timeout and no more waiting should be performed.  Spinning occurs
+  // before the wait context is invoked.
+
+  struct WaitForever {
+    bool canBlock() { return true; }
+    bool canTimeOut() { return false; }
+    bool shouldTimeOut() { return false; }
+
+    bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
+      futex.futexWait(expected, waitMask);
+      return true;
+    }
+  };
+
+  struct WaitNever {
+    bool canBlock() { return false; }
+    bool canTimeOut() { return true; }
+    bool shouldTimeOut() { return true; }
+
+    bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
+      return false;
+    }
+  };
+
+  template <class Rep, class Period>
+  struct WaitForDuration {
+    std::chrono::duration<Rep, Period> duration_;
+    bool deadlineComputed_;
+    std::chrono::steady_clock::time_point deadline_;
+
+    explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration)
+        : duration_(duration), deadlineComputed_(false) {}
+
+    std::chrono::steady_clock::time_point deadline() {
+      if (!deadlineComputed_) {
+        deadline_ = std::chrono::steady_clock::now() + duration_;
+        deadlineComputed_ = true;
+      }
+      return deadline_;
+    }
+
+    bool canBlock() { return duration_.count() > 0; }
+    bool canTimeOut() { return true; }
+
+    bool shouldTimeOut() {
+      return std::chrono::steady_clock::now() > deadline();
+    }
+
+    bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
+      auto result = futex.futexWaitUntil(expected, deadline(), waitMask);
+      return result != folly::detail::FutexResult::TIMEDOUT;
+    }
+  };
+
+  template <class Clock, class Duration>
+  struct WaitUntilDeadline {
+    std::chrono::time_point<Clock, Duration> absDeadline_;
+
+    bool canBlock() { return true; }
+    bool canTimeOut() { return true; }
+    bool shouldTimeOut() { return Clock::now() > absDeadline_; }
+
+    bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
+      auto result = futex.futexWaitUntil(expected, absDeadline_, waitMask);
+      return result != folly::detail::FutexResult::TIMEDOUT;
+    }
+  };
+
+  // 32 bits of state
+  Futex state_;
+
+  static constexpr uint32_t kIncrHasS = 1 << 10;
+  static constexpr uint32_t kHasS = ~(kIncrHasS - 1);
+
+  // If false, then there are definitely no deferred read locks for this
+  // instance.  Cleared after initialization and when exclusively locked.
+  static constexpr uint32_t kMayDefer = 1 << 9;
+
+  // lock() cleared kMayDefer as soon as it starts draining readers (so
+  // that it doesn't have to do a second CAS once drain completes), but
+  // unlock_shared() still needs to know whether to scan deferredReaders[]
+  // or not.  We copy kMayDefer to kPrevDefer when setting kHasE or
+  // kBegunE, and clear it when clearing those bits.
+  static constexpr uint32_t kPrevDefer = 1 << 8;
+
+  // Exclusive-locked blocks all read locks and write locks.  This bit
+  // may be set before all readers have finished, but in that case the
+  // thread that sets it won't return to the caller until all read locks
+  // have been released.
+  static constexpr uint32_t kHasE = 1 << 7;
+
+  // Exclusive-draining means that lock() is waiting for existing readers
+  // to leave, but that new readers may still acquire shared access.
+  // This is only used in reader priority mode.  New readers during
+  // drain must be inline.  The difference between this and kHasU is that
+  // kBegunE prevents kMayDefer from being set.
+  static constexpr uint32_t kBegunE = 1 << 6;
+
+  // At most one thread may have either exclusive or upgrade lock
+  // ownership.  Unlike exclusive mode, ownership of the lock in upgrade
+  // mode doesn't preclude other threads holding the lock in shared mode.
+  // boost's concept for this doesn't explicitly say whether new shared
+  // locks can be acquired one lock_upgrade has succeeded, but doesn't
+  // list that as disallowed.  RWSpinLock disallows new read locks after
+  // lock_upgrade has been acquired, but the boost implementation doesn't.
+  // We choose the latter.
+  static constexpr uint32_t kHasU = 1 << 5;
+
+  // There are three states that we consider to be "solo", in that they
+  // cannot coexist with other solo states.  These are kHasE, kBegunE,
+  // and kHasU.  Note that S doesn't conflict with any of these, because
+  // setting the kHasE is only one of the two steps needed to actually
+  // acquire the lock in exclusive mode (the other is draining the existing
+  // S holders).
+  static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU;
+
+  // Once a thread sets kHasE it needs to wait for the current readers
+  // to exit the lock.  We give this a separate wait identity from the
+  // waiting to set kHasE so that we can perform partial wakeups (wake
+  // one instead of wake all).
+  static constexpr uint32_t kWaitingNotS = 1 << 4;
+
+  // If there are multiple pending waiters, then waking them all can
+  // lead to a thundering herd on the lock.  To avoid this, we keep
+  // a 2 bit saturating counter of the number of exclusive waiters
+  // (0, 1, 2, 3+), and if the value is >= 2 we perform futexWake(1)
+  // instead of futexWakeAll.  See wakeRegisteredWaiters for more.
+  // It isn't actually useful to make the counter bigger, because
+  // whenever a futexWait fails with EAGAIN the counter becomes higher
+  // than the actual number of waiters, and hence effectively saturated.
+  // Bigger counters just lead to more changes in state_, which increase
+  // contention and failed futexWait-s.
+  static constexpr uint32_t kIncrWaitingE = 1 << 2;
+  static constexpr uint32_t kWaitingE = 0x3 * kIncrWaitingE;
+
+  // kWaitingU is essentially a 1 bit saturating counter.  It always
+  // requires a wakeAll.
+  static constexpr uint32_t kWaitingU = 1 << 1;
+
+  // All blocked lock_shared() should be awoken, so it is correct (not
+  // suboptimal) to wakeAll if there are any shared readers.
+  static constexpr uint32_t kWaitingS = 1 << 0;
+
+  // kWaitingAny is a mask of all of the bits that record the state of
+  // threads, rather than the state of the lock.  It is convenient to be
+  // able to mask them off during asserts.
+  static constexpr uint32_t kWaitingAny =
+      kWaitingNotS | kWaitingE | kWaitingU | kWaitingS;
+
+  // The reader count at which a reader will attempt to use the lock
+  // in deferred mode.  If this value is 2, then the second concurrent
+  // reader will set kMayDefer and use deferredReaders[].  kMayDefer is
+  // cleared during exclusive access, so this threshold must be reached
+  // each time a lock is held in exclusive mode.
+  static constexpr uint32_t kNumSharedToStartDeferring = 2;
+
+  // The typical number of spins that a thread will wait for a state
+  // transition.  There is no bound on the number of threads that can wait
+  // for a writer, so we are pretty conservative here to limit the chance
+  // that we are starving the writer of CPU.  Each spin is 6 or 7 nanos,
+  // almost all of which is in the pause instruction.
+  static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2;
+
+  // The maximum number of soft yields before falling back to futex.
+  // If the preemption heuristic is activated we will fall back before
+  // this.  A soft yield takes ~900 nanos (two sched_yield plus a call
+  // to getrusage, with checks of the goal at each step).  Soft yields
+  // aren't compatible with deterministic execution under test (unlike
+  // futexWaitUntil, which has a capricious but deterministic back end).
+  static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0;
+
+  // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some
+  // level of the memory hierarchy is symmetrically divided into k pieces
+  // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes
+  // that are the same after integer division by k share that resource.
+  // Our strategy for deferred readers is to probe up to numSlots/4 slots,
+  // using the full granularity of AccessSpreader for the start slot
+  // and then search outward.  We can use AccessSpreader::current(n)
+  // without managing our own spreader if kMaxDeferredReaders <=
+  // AccessSpreader::kMaxCpus, which is currently 128.
+  //
+  // Our 2-socket E5-2660 machines have 8 L1 caches on each chip,
+  // with 64 byte cache lines.  That means we need 64*16 bytes of
+  // deferredReaders[] to give each L1 its own playground.  On x86_64
+  // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders
+  // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128.  If
+  // kDeferredSearchDistance * kDeferredSeparationFactor <=
+  // 64 / 8 then we will search only within a single cache line, which
+  // guarantees we won't have inter-L1 contention.  We give ourselves
+  // a factor of 2 on the core count, which should hold us for a couple
+  // processor generations.  deferredReaders[] is 2048 bytes currently.
+  static constexpr uint32_t kMaxDeferredReaders = 64;
+  static constexpr uint32_t kDeferredSearchDistance = 2;
+  static constexpr uint32_t kDeferredSeparationFactor = 4;
+
+  static_assert(!(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
+                "kMaxDeferredReaders must be a power of 2");
+  static_assert(!(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
+                "kDeferredSearchDistance must be a power of 2");
+
+  // The number of deferred locks that can be simultaneously acquired
+  // by a thread via the token-less methods without performing any heap
+  // allocations.  Each of these costs 3 pointers (24 bytes, probably)
+  // per thread.  There's not much point in making this larger than
+  // kDeferredSearchDistance.
+  static constexpr uint32_t kTokenStackTLSCapacity = 2;
+
+  // We need to make sure that if there is a lock_shared()
+  // and lock_shared(token) followed by unlock_shared() and
+  // unlock_shared(token), the token-less unlock doesn't null
+  // out deferredReaders[token.slot_].  If we allowed that, then
+  // unlock_shared(token) wouldn't be able to assume that its lock
+  // had been inlined by applyDeferredReaders when it finds that
+  // deferredReaders[token.slot_] no longer points to this.  We accomplish
+  // this by stealing bit 0 from the pointer to record that the slot's
+  // element has no token, hence our use of uintptr_t in deferredReaders[].
+  static constexpr uintptr_t kTokenless = 0x1;
+
+  // This is the starting location for Token-less unlock_shared().
+  static FOLLY_TLS uint32_t tls_lastTokenlessSlot;
+
+  // Only indexes divisible by kDeferredSeparationFactor are used.
+  // If any of those elements points to a SharedMutexImpl, then it
+  // should be considered that there is a shared lock on that instance.
+  // See kTokenless.
+  typedef Atom<uintptr_t> DeferredReaderSlot;
+  static DeferredReaderSlot deferredReaders
+      [kMaxDeferredReaders *
+       kDeferredSeparationFactor] FOLLY_ALIGN_TO_AVOID_FALSE_SHARING;
+
+  // Performs an exclusive lock, waiting for state_ & waitMask to be
+  // zero first
+  template <class WaitContext>
+  bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) {
+    uint32_t state = state_.load(std::memory_order_acquire);
+    if (LIKELY(
+            (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 &&
+            state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) {
+      return true;
+    } else {
+      return lockExclusiveImpl(state, preconditionGoalMask, ctx);
+    }
+  }
+
+  template <class WaitContext>
+  bool lockExclusiveImpl(uint32_t& state,
+                         uint32_t preconditionGoalMask,
+                         WaitContext& ctx) {
+    while (true) {
+      if (UNLIKELY((state & preconditionGoalMask) != 0) &&
+          !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) &&
+          ctx.canTimeOut()) {
+        return false;
+      }
+
+      uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer;
+      if (!ReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
+        // Block readers immediately, either because we are in write
+        // priority mode or because we can acquire the lock in one
+        // step.  Note that if state has kHasU, then we are doing an
+        // unlock_upgrade_and_lock() and we should clear it (reader
+        // priority branch also does this).
+        after |= (state | kHasE) & ~(kHasU | kMayDefer);
+      } else {
+        after |= (state | kBegunE) & ~(kHasU | kMayDefer);
+      }
+      if (state_.compare_exchange_strong(state, after)) {
+        auto before = state;
+        state = after;
+
+        // If we set kHasE (writer priority) then no new readers can
+        // arrive.  If we set kBegunE then they can still enter, but
+        // they must be inline.  Either way we need to either spin on
+        // deferredReaders[] slots, or inline them so that we can wait on
+        // kHasS to zero itself.  deferredReaders[] is pointers, which on
+        // x86_64 are bigger than futex() can handle, so we inline the
+        // deferred locks instead of trying to futexWait on each slot.
+        // Readers are responsible for rechecking state_ after recording
+        // a deferred read to avoid atomicity problems between the state_
+        // CAS and applyDeferredReader's reads of deferredReaders[].
+        if (UNLIKELY((before & kMayDefer) != 0)) {
+          applyDeferredReaders(state, ctx);
+        }
+        while (true) {
+          assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0);
+          if (UNLIKELY((state & kHasS) != 0) &&
+              !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) &&
+              ctx.canTimeOut()) {
+            // Ugh.  We blocked new readers and other writers for a while,
+            // but were unable to complete.  Move on.  On the plus side
+            // we can clear kWaitingNotS because nobody else can piggyback
+            // on it.
+            state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS));
+            wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
+            return false;
+          }
+
+          if (ReaderPriority && (state & kHasE) == 0) {
+            assert((state & kBegunE) != 0);
+            if (!state_.compare_exchange_strong(state,
+                                                (state & ~kBegunE) | kHasE)) {
+              continue;
+            }
+          }
+
+          return true;
+        }
+      }
+    }
+  }
+
+  template <class WaitContext>
+  bool waitForZeroBits(uint32_t& state,
+                       uint32_t goal,
+                       uint32_t waitMask,
+                       WaitContext& ctx) {
+    uint32_t spinCount = 0;
+    while (true) {
+      state = state_.load(std::memory_order_acquire);
+      if ((state & goal) == 0) {
+        return true;
+      }
+#if FOLLY_X64
+      asm volatile("pause");
+#endif
+      ++spinCount;
+      if (UNLIKELY(spinCount >= kMaxSpinCount)) {
+        return ctx.canBlock() &&
+               yieldWaitForZeroBits(state, goal, waitMask, ctx);
+      }
+    }
+  }
+
+  template <class WaitContext>
+  bool yieldWaitForZeroBits(uint32_t& state,
+                            uint32_t goal,
+                            uint32_t waitMask,
+                            WaitContext& ctx) {
+#ifdef RUSAGE_THREAD
+    struct rusage usage;
+    long before = -1;
+#endif
+    for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
+         ++yieldCount) {
+      for (int softState = 0; softState < 3; ++softState) {
+        if (softState < 2) {
+          std::this_thread::yield();
+        } else {
+#ifdef RUSAGE_THREAD
+          getrusage(RUSAGE_THREAD, &usage);
+#endif
+        }
+        if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) {
+          return true;
+        }
+        if (ctx.shouldTimeOut()) {
+          return false;
+        }
+      }
+#ifdef RUSAGE_THREAD
+      if (before >= 0 && usage.ru_nivcsw >= before + 2) {
+        // One involuntary csw might just be occasional background work,
+        // but if we get two in a row then we guess that there is someone
+        // else who can profitably use this CPU.  Fall back to futex
+        break;
+      }
+      before = usage.ru_nivcsw;
+#endif
+    }
+    return futexWaitForZeroBits(state, goal, waitMask, ctx);
+  }
+
+  template <class WaitContext>
+  bool futexWaitForZeroBits(uint32_t& state,
+                            uint32_t goal,
+                            uint32_t waitMask,
+                            WaitContext& ctx) {
+    assert(waitMask == kWaitingNotS || waitMask == kWaitingE ||
+           waitMask == kWaitingU || waitMask == kWaitingS);
+
+    while (true) {
+      state = state_.load(std::memory_order_acquire);
+      if ((state & goal) == 0) {
+        return true;
+      }
+
+      auto after = state;
+      if (waitMask == kWaitingE) {
+        if ((state & kWaitingE) != kWaitingE) {
+          after += kIncrWaitingE;
+        } // else counter is saturated
+      } else {
+        after |= waitMask;
+      }
+
+      // CAS is better than atomic |= here, because it lets us avoid
+      // setting the wait flag when the goal is concurrently achieved
+      if (after != state && !state_.compare_exchange_strong(state, after)) {
+        continue;
+      }
+
+      if (!ctx.doWait(state_, after, waitMask)) {
+        // timed out
+        return false;
+      }
+    }
+  }
+
+  // Wakes up waiters registered in state_ as appropriate, clearing the
+  // awaiting bits for anybody that was awoken.  Tries to perform direct
+  // single wakeup of an exclusive waiter if appropriate
+  void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) {
+    if (UNLIKELY((state & wakeMask) != 0)) {
+      wakeRegisteredWaitersImpl(state, wakeMask);
+    }
+  }
+
+  void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) {
+    if ((wakeMask & kWaitingE) != 0) {
+      // If there are multiple lock() pending only one of them will
+      // actually get to wake up, so issuing futexWakeAll will make
+      // a thundering herd.  There's nothing stopping us from issuing
+      // futexWake(1) instead, so long as the wait bits are still an
+      // accurate reflection of the waiters.  If our pending lock() counter
+      // hasn't saturated we can decrement it.  If it has saturated,
+      // then we can clear it by noticing that futexWake(1) returns 0
+      // (indicating no actual waiters) and then retrying via the normal
+      // clear+futexWakeAll path.
+      //
+      // It is possible that we wake an E waiter but an outside S grabs
+      // the lock instead, at which point we should wake pending U and
+      // S waiters.  Rather than tracking state to make the failing E
+      // regenerate the wakeup, we just disable the optimization in the
+      // case that there are waiting U or S that we are eligible to wake.
+      //
+      // Note that in the contended scenario it is quite likely that the
+      // waiter's futexWait call will fail with EAGAIN (expected value
+      // mismatch), at which point the awaiting-exclusive count will be
+      // larger than the actual number of waiters.  At this point the
+      // counter is effectively saturated.  Since this is likely, it is
+      // actually less efficient to have a larger counter.  2 bits seems
+      // to be the best.
+      while ((state & kWaitingE) != 0 &&
+             (state & wakeMask & (kWaitingU | kWaitingS)) == 0) {
+        if ((state & kWaitingE) != kWaitingE) {
+          // not saturated
+          if (!state_.compare_exchange_strong(state, state - kIncrWaitingE)) {
+            continue;
+          }
+          state -= kIncrWaitingE;
+        }
+
+        if (state_.futexWake(1, kWaitingE) > 0) {
+          return;
+        }
+
+        // Despite the non-zero awaiting-exclusive count, there aren't
+        // actually any pending writers.  Fall through to the logic below
+        // to wake up other classes of locks and to clear the saturated
+        // counter (if necessary).
+        break;
+      }
+    }
+
+    if ((state & wakeMask) != 0) {
+      auto prev = state_.fetch_and(~wakeMask);
+      if ((prev & wakeMask) != 0) {
+        futexWakeAll(wakeMask);
+      }
+      state = prev & ~wakeMask;
+    }
+  }
+
+  void futexWakeAll(uint32_t wakeMask) {
+    state_.futexWake(std::numeric_limits<int>::max(), wakeMask);
+  }
+
+  DeferredReaderSlot* deferredReader(uint32_t slot) {
+    return &deferredReaders[slot * kDeferredSeparationFactor];
+  }
+
+  uintptr_t tokenfulSlotValue() { return reinterpret_cast<uintptr_t>(this); }
+
+  uintptr_t tokenlessSlotValue() { return tokenfulSlotValue() | kTokenless; }
+
+  bool slotValueIsThis(uintptr_t slotValue) {
+    return (slotValue & ~kTokenless) == tokenfulSlotValue();
+  }
+
+  // Clears any deferredReaders[] that point to this, adjusting the inline
+  // shared lock count to compensate.  Does some spinning and yielding
+  // to avoid the work.  Always finishes the application, even if ctx
+  // times out.
+  template <class WaitContext>
+  void applyDeferredReaders(uint32_t& state, WaitContext& ctx) {
+    uint32_t slot = 0;
+
+    uint32_t spinCount = 0;
+    while (true) {
+      while (!slotValueIsThis(
+                 deferredReader(slot)->load(std::memory_order_acquire))) {
+        if (++slot == kMaxDeferredReaders) {
+          return;
+        }
+      }
+#if FOLLY_X64
+      asm("pause");
+#endif
+      if (UNLIKELY(++spinCount >= kMaxSpinCount)) {
+        applyDeferredReaders(state, ctx, slot);
+        return;
+      }
+    }
+  }
+
+  template <class WaitContext>
+  void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) {
+
+#ifdef RUSAGE_THREAD
+    struct rusage usage;
+    long before = -1;
+#endif
+    for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
+         ++yieldCount) {
+      for (int softState = 0; softState < 3; ++softState) {
+        if (softState < 2) {
+          std::this_thread::yield();
+        } else {
+#ifdef RUSAGE_THREAD
+          getrusage(RUSAGE_THREAD, &usage);
+#endif
+        }
+        while (!slotValueIsThis(
+                   deferredReader(slot)->load(std::memory_order_acquire))) {
+          if (++slot == kMaxDeferredReaders) {
+            return;
+          }
+        }
+        if (ctx.shouldTimeOut()) {
+          // finish applying immediately on timeout
+          break;
+        }
+      }
+#ifdef RUSAGE_THREAD
+      if (before >= 0 && usage.ru_nivcsw >= before + 2) {
+        // heuristic says run queue is not empty
+        break;
+      }
+      before = usage.ru_nivcsw;
+#endif
+    }
+
+    uint32_t movedSlotCount = 0;
+    for (; slot < kMaxDeferredReaders; ++slot) {
+      auto slotPtr = deferredReader(slot);
+      auto slotValue = slotPtr->load(std::memory_order_acquire);
+      if (slotValueIsThis(slotValue) &&
+          slotPtr->compare_exchange_strong(slotValue, 0)) {
+        ++movedSlotCount;
+      }
+    }
+
+    if (movedSlotCount > 0) {
+      state = (state_ += movedSlotCount * kIncrHasS);
+    }
+    assert((state & (kHasE | kBegunE)) != 0);
+
+    // if state + kIncrHasS overflows (off the end of state) then either
+    // we have 2^(32-9) readers (almost certainly an application bug)
+    // or we had an underflow (also a bug)
+    assert(state < state + kIncrHasS);
+  }
+
+  // It is straightfoward to make a token-less lock_shared() and
+  // unlock_shared() either by making the token-less version always use
+  // INLINE_SHARED mode or by removing the token version.  Supporting
+  // deferred operation for both types is trickier than it appears, because
+  // the purpose of the token it so that unlock_shared doesn't have to
+  // look in other slots for its deferred lock.  Token-less unlock_shared
+  // might place a deferred lock in one place and then release a different
+  // slot that was originally used by the token-ful version.  If this was
+  // important we could solve the problem by differentiating the deferred
+  // locks so that cross-variety release wouldn't occur.  The best way
+  // is probably to steal a bit from the pointer, making deferredLocks[]
+  // an array of Atom<uintptr_t>.
+
+  template <class WaitContext>
+  bool lockSharedImpl(Token* token, WaitContext& ctx) {
+    uint32_t state = state_.load(std::memory_order_relaxed);
+    if ((state & (kHasS | kMayDefer | kHasE)) == 0 &&
+        state_.compare_exchange_strong(state, state + kIncrHasS)) {
+      if (token != nullptr) {
+        token->type_ = Token::Type::INLINE_SHARED;
+      }
+      return true;
+    }
+    return lockSharedImpl(state, token, ctx);
+  }
+
+  template <class WaitContext>
+  bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
+    while (true) {
+      if (UNLIKELY((state & kHasE) != 0) &&
+          !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
+        return false;
+      }
+
+      uint32_t slot;
+      uintptr_t slotValue = 1; // any non-zero value will do
+
+      bool canAlreadyDefer = (state & kMayDefer) != 0;
+      bool aboveDeferThreshold =
+          (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
+      bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
+      if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
+        // starting point for our empty-slot search, can change after
+        // calling waitForZeroBits
+        uint32_t bestSlot =
+            (uint32_t)folly::detail::AccessSpreader<Atom>::current(
+                kMaxDeferredReaders);
+
+        // deferred readers are already enabled, or it is time to
+        // enable them if we can find a slot
+        for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
+          slot = bestSlot ^ i;
+          assert(slot < kMaxDeferredReaders);
+          slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
+          if (slotValue == 0) {
+            // found empty slot
+            break;
+          }
+        }
+      }
+
+      if (slotValue != 0) {
+        // not yet deferred, or no empty slots
+        if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
+          // successfully recorded the read lock inline
+          if (token != nullptr) {
+            token->type_ = Token::Type::INLINE_SHARED;
+          }
+          return true;
+        }
+        // state is updated, try again
+        continue;
+      }
+
+      // record that deferred readers might be in use if necessary
+      if ((state & kMayDefer) == 0) {
+        if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
+          // keep going if CAS failed because somebody else set the bit
+          // for us
+          if ((state & (kHasE | kMayDefer)) != kMayDefer) {
+            continue;
+          }
+        }
+        // state = state | kMayDefer;
+      }
+
+      // try to use the slot
+      bool gotSlot = deferredReader(slot)->compare_exchange_strong(
+          slotValue,
+          token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());
+
+      // If we got the slot, we need to verify that an exclusive lock
+      // didn't happen since we last checked.  If we didn't get the slot we
+      // need to recheck state_ anyway to make sure we don't waste too much
+      // work.  It is also possible that since we checked state_ someone
+      // has acquired and released the write lock, clearing kMayDefer.
+      // Both cases are covered by looking for the readers-possible bit,
+      // because it is off when the exclusive lock bit is set.
+      state = state_.load(std::memory_order_acquire);
+
+      if (!gotSlot) {
+        continue;
+      }
+
+      if (token == nullptr) {
+        tls_lastTokenlessSlot = slot;
+      }
+
+      if ((state & kMayDefer) != 0) {
+        assert((state & kHasE) == 0);
+        // success
+        if (token != nullptr) {
+          token->type_ = Token::Type::DEFERRED_SHARED;
+          token->slot_ = (uint16_t)slot;
+        }
+        return true;
+      }
+
+      // release the slot before retrying
+      if (token == nullptr) {
+        // We can't rely on slot.  Token-less slot values can be freed by
+        // any unlock_shared(), so we need to do the full deferredReader
+        // search during unlock.  Unlike unlock_shared(), we can't trust
+        // kPrevDefer here.  This deferred lock isn't visible to lock()
+        // (that's the whole reason we're undoing it) so there might have
+        // subsequently been an unlock() and lock() with no intervening
+        // transition to deferred mode.
+        if (!tryUnlockAnySharedDeferred()) {
+          unlockSharedInline();
+        }
+      } else {
+        if (!tryUnlockSharedDeferred(slot)) {
+          unlockSharedInline();
+        }
+      }
+
+      // We got here not because the lock was unavailable, but because
+      // we lost a compare-and-swap.  Try-lock is typically allowed to
+      // have spurious failures, but there is no lock efficiency gain
+      // from exploiting that freedom here.
+    }
+  }
+
+  bool tryUnlockAnySharedDeferred() {
+    auto bestSlot = tls_lastTokenlessSlot;
+    for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
+      auto slotPtr = deferredReader(bestSlot ^ i);
+      auto slotValue = slotPtr->load(std::memory_order_relaxed);
+      if (slotValue == tokenlessSlotValue() &&
+          slotPtr->compare_exchange_strong(slotValue, 0)) {
+        tls_lastTokenlessSlot = bestSlot ^ i;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool tryUnlockSharedDeferred(uint32_t slot) {
+    assert(slot < kMaxDeferredReaders);
+    auto slotValue = tokenfulSlotValue();
+    return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
+  }
+
+  uint32_t unlockSharedInline() {
+    uint32_t state = (state_ -= kIncrHasS);
+    assert((state & (kHasE | kBegunE)) != 0 || state < state + kIncrHasS);
+    if ((state & kHasS) == 0) {
+      // Only the second half of lock() can be blocked by a non-zero
+      // reader count, so that's the only thing we need to wake
+      wakeRegisteredWaiters(state, kWaitingNotS);
+    }
+    return state;
+  }
+
+  template <class WaitContext>
+  bool lockUpgradeImpl(WaitContext& ctx) {
+    uint32_t state;
+    do {
+      if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) {
+        return false;
+      }
+    } while (!state_.compare_exchange_strong(state, state | kHasU));
+    return true;
+  }
+
+ public:
+  class ReadHolder {
+   public:
+    ReadHolder() : lock_(nullptr) {}
+
+    explicit ReadHolder(const SharedMutexImpl* lock) : ReadHolder(*lock) {}
+
+    explicit ReadHolder(const SharedMutexImpl& lock)
+        : lock_(const_cast<SharedMutexImpl*>(&lock)) {
+      lock_->lock_shared(token_);
+    }
+
+    ReadHolder(ReadHolder&& rhs) noexcept : lock_(rhs.lock_),
+                                            token_(rhs.token_) {
+      rhs.lock_ = nullptr;
+    }
+
+    // Downgrade from upgrade mode
+    explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) {
+      assert(upgraded.lock_ != nullptr);
+      upgraded.lock_ = nullptr;
+      lock_->unlock_upgrade_and_lock_shared(token_);
+    }
+
+    // Downgrade from exclusive mode
+    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
+      assert(writer.lock_ != nullptr);
+      writer.lock_ = nullptr;
+      lock_->unlock_and_lock_shared(token_);
+    }
+
+    ReadHolder& operator=(ReadHolder&& rhs) noexcept {
+      std::swap(lock_, rhs.lock_);
+      std::swap(token_, rhs.token_);
+      return *this;
+    }
+
+    ReadHolder(const ReadHolder& rhs) = delete;
+    ReadHolder& operator=(const ReadHolder& rhs) = delete;
+
+    ~ReadHolder() {
+      if (lock_) {
+        lock_->unlock_shared(token_);
+      }
+    }
+
+   private:
+    friend class UpgradeHolder;
+    friend class WriteHolder;
+    SharedMutexImpl* lock_;
+    SharedMutexToken token_;
+  };
+
+  class UpgradeHolder {
+   public:
+    UpgradeHolder() : lock_(nullptr) {}
+
+    explicit UpgradeHolder(SharedMutexImpl* lock) : UpgradeHolder(*lock) {}
+
+    explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
+      lock_->lock_upgrade();
+    }
+
+    // Downgrade from exclusive mode
+    explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) {
+      assert(writer.lock_ != nullptr);
+      writer.lock_ = nullptr;
+      lock_->unlock_and_lock_upgrade();
+    }
+
+    UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) {
+      rhs.lock_ = nullptr;
+    }
+
+    UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept {
+      std::swap(lock_, rhs.lock_);
+      return *this;
+    }
+
+    UpgradeHolder(const UpgradeHolder& rhs) = delete;
+    UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;
+
+    ~UpgradeHolder() {
+      if (lock_) {
+        lock_->unlock_upgrade();
+      }
+    }
+
+   private:
+    friend class WriteHolder;
+    friend class ReadHolder;
+    SharedMutexImpl* lock_;
+  };
+
+  class WriteHolder {
+   public:
+    WriteHolder() : lock_(nullptr) {}
+
+    explicit WriteHolder(SharedMutexImpl* lock) : WriteHolder(*lock) {}
+
+    explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
+      lock_->lock();
+    }
+
+    // Promotion from upgrade mode
+    explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) {
+      assert(upgrade.lock_ != nullptr);
+      upgrade.lock_ = nullptr;
+      lock_->unlock_upgrade_and_lock();
+    }
+
+    WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
+      rhs.lock_ = nullptr;
+    }
+
+    WriteHolder& operator=(WriteHolder&& rhs) noexcept {
+      std::swap(lock_, rhs.lock_);
+      return *this;
+    }
+
+    WriteHolder(const WriteHolder& rhs) = delete;
+    WriteHolder& operator=(const WriteHolder& rhs) = delete;
+
+    ~WriteHolder() {
+      if (lock_) {
+        lock_->unlock();
+      }
+    }
+
+   private:
+    friend class ReadHolder;
+    friend class UpgradeHolder;
+    SharedMutexImpl* lock_;
+  };
+
+  // Adapters for Synchronized<>
+  friend void acquireRead(SharedMutexImpl& lock) { lock.lock_shared(); }
+  friend void acquireReadWrite(SharedMutexImpl& lock) { lock.lock(); }
+  friend void releaseRead(SharedMutexImpl& lock) { lock.unlock_shared(); }
+  friend void releaseReadWrite(SharedMutexImpl& lock) { lock.unlock(); }
+};
+
+#define COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(type) \
+  template <>                                                        \
+  type::DeferredReaderSlot                                           \
+      type::deferredReaders[type::kMaxDeferredReaders *              \
+                            type::kDeferredSeparationFactor] = {};   \
+  template <>                                                        \
+  FOLLY_TLS uint32_t type::tls_lastTokenlessSlot = 0;
+
+typedef SharedMutexImpl<true> SharedMutexReadPriority;
+typedef SharedMutexImpl<false> SharedMutexWritePriority;
+typedef SharedMutexWritePriority SharedMutex;
+
+} // namespace folly
diff --git a/folly/experimental/test/SharedMutexTest.cpp b/folly/experimental/test/SharedMutexTest.cpp

new file mode 100644 (file)

index 0000000..aff72bf
--- /dev/null
+++ b/folly/experimental/test/SharedMutexTest.cpp
@@ -0,0 +1,2076 @@
+/*
+ * Copyright 2015 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/experimental/SharedMutex.h>
+
+#include <stdlib.h>
+#include <thread>
+#include <vector>
+#include <boost/optional.hpp>
+#include <folly/Benchmark.h>
+#include <folly/MPMCQueue.h>
+#include <folly/Random.h>
+#include <folly/test/DeterministicSchedule.h>
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+#include <boost/thread/shared_mutex.hpp>
+#include <folly/RWSpinLock.h>
+
+using namespace folly;
+using namespace folly::test;
+using namespace std;
+using namespace chrono;
+
+typedef DeterministicSchedule DSched;
+typedef SharedMutexImpl<true, void, DeterministicAtomic, true>
+    DSharedMutexReadPriority;
+typedef SharedMutexImpl<false, void, DeterministicAtomic, true>
+    DSharedMutexWritePriority;
+
+COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(
+    DSharedMutexReadPriority);
+COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(
+    DSharedMutexWritePriority);
+
+template <typename Lock>
+void runBasicTest() {
+  Lock lock;
+  SharedMutexToken token1;
+  SharedMutexToken token2;
+  SharedMutexToken token3;
+
+  EXPECT_TRUE(lock.try_lock());
+  EXPECT_FALSE(lock.try_lock());
+  EXPECT_FALSE(lock.try_lock_shared(token1));
+  lock.unlock();
+
+  EXPECT_TRUE(lock.try_lock_shared(token1));
+  EXPECT_FALSE(lock.try_lock());
+  EXPECT_TRUE(lock.try_lock_shared(token2));
+  lock.lock_shared(token3);
+  lock.unlock_shared(token3);
+  lock.unlock_shared(token2);
+  lock.unlock_shared(token1);
+
+  lock.lock();
+  lock.unlock();
+
+  lock.lock_shared(token1);
+  lock.lock_shared(token2);
+  lock.unlock_shared(token1);
+  lock.unlock_shared(token2);
+
+  lock.lock();
+  lock.unlock_and_lock_shared(token1);
+  lock.lock_shared(token2);
+  lock.unlock_shared(token2);
+  lock.unlock_shared(token1);
+}
+
+TEST(SharedMutex, basic) {
+  runBasicTest<SharedMutexReadPriority>();
+  runBasicTest<SharedMutexWritePriority>();
+}
+
+template <typename Lock>
+void runBasicHoldersTest() {
+  Lock lock;
+  SharedMutexToken token;
+
+  {
+    typename Lock::WriteHolder holder(lock);
+    EXPECT_FALSE(lock.try_lock());
+    EXPECT_FALSE(lock.try_lock_shared(token));
+
+    typename Lock::WriteHolder holder2(std::move(holder));
+    typename Lock::WriteHolder holder3;
+    holder3 = std::move(holder2);
+
+    typename Lock::UpgradeHolder holder4(std::move(holder3));
+    typename Lock::WriteHolder holder5(std::move(holder4));
+
+    typename Lock::ReadHolder holder6(std::move(holder5));
+
+    EXPECT_FALSE(lock.try_lock());
+    EXPECT_TRUE(lock.try_lock_shared(token));
+    lock.unlock_shared(token);
+  }
+
+  {
+    typename Lock::WriteHolder holder(lock);
+    EXPECT_FALSE(lock.try_lock());
+  }
+
+  {
+    typename Lock::ReadHolder holder(lock);
+    typename Lock::ReadHolder holder2(lock);
+    typename Lock::UpgradeHolder holder3(lock);
+  }
+
+  {
+    typename Lock::UpgradeHolder holder(lock);
+    typename Lock::ReadHolder holder2(lock);
+    typename Lock::ReadHolder holder3(std::move(holder));
+  }
+}
+
+TEST(SharedMutex, basic_holders) {
+  runBasicHoldersTest<SharedMutexReadPriority>();
+  runBasicHoldersTest<SharedMutexWritePriority>();
+}
+
+template <typename Lock>
+void runManyReadLocksTestWithTokens() {
+  Lock lock;
+
+  vector<SharedMutexToken> tokens;
+  for (int i = 0; i < 1000; ++i) {
+    tokens.emplace_back();
+    EXPECT_TRUE(lock.try_lock_shared(tokens.back()));
+  }
+  for (auto& token : tokens) {
+    lock.unlock_shared(token);
+  }
+  EXPECT_TRUE(lock.try_lock());
+  lock.unlock();
+}
+
+TEST(SharedMutex, many_read_locks_with_tokens) {
+  runManyReadLocksTestWithTokens<SharedMutexReadPriority>();
+  runManyReadLocksTestWithTokens<SharedMutexWritePriority>();
+}
+
+template <typename Lock>
+void runManyReadLocksTestWithoutTokens() {
+  Lock lock;
+
+  for (int i = 0; i < 1000; ++i) {
+    EXPECT_TRUE(lock.try_lock_shared());
+  }
+  for (int i = 0; i < 1000; ++i) {
+    lock.unlock_shared();
+  }
+  EXPECT_TRUE(lock.try_lock());
+  lock.unlock();
+}
+
+TEST(SharedMutex, many_read_locks_without_tokens) {
+  runManyReadLocksTestWithoutTokens<SharedMutexReadPriority>();
+  runManyReadLocksTestWithoutTokens<SharedMutexWritePriority>();
+}
+
+template <typename Lock>
+void runTimeoutInPastTest() {
+  Lock lock;
+
+  EXPECT_TRUE(lock.try_lock_for(milliseconds(0)));
+  lock.unlock();
+  EXPECT_TRUE(lock.try_lock_for(milliseconds(-1)));
+  lock.unlock();
+  EXPECT_TRUE(lock.try_lock_shared_for(milliseconds(0)));
+  lock.unlock_shared();
+  EXPECT_TRUE(lock.try_lock_shared_for(milliseconds(-1)));
+  lock.unlock_shared();
+  EXPECT_TRUE(lock.try_lock_until(system_clock::now() - milliseconds(1)));
+  lock.unlock();
+  EXPECT_TRUE(
+      lock.try_lock_shared_until(system_clock::now() - milliseconds(1)));
+  lock.unlock_shared();
+  EXPECT_TRUE(lock.try_lock_until(steady_clock::now() - milliseconds(1)));
+  lock.unlock();
+  EXPECT_TRUE(
+      lock.try_lock_shared_until(steady_clock::now() - milliseconds(1)));
+  lock.unlock_shared();
+}
+
+TEST(SharedMutex, timeout_in_past) {
+  runTimeoutInPastTest<SharedMutexReadPriority>();
+  runTimeoutInPastTest<SharedMutexWritePriority>();
+}
+
+template <class Func>
+bool funcHasDuration(milliseconds expectedDuration, Func func) {
+  // elapsed time should eventually fall within expectedDuration +- 25%
+  for (int tries = 0; tries < 100; ++tries) {
+    auto start = steady_clock::now();
+    func();
+    auto elapsed = steady_clock::now() - start;
+    if (elapsed > expectedDuration - expectedDuration / 4 &&
+        elapsed < expectedDuration + expectedDuration / 4) {
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename Lock>
+void runFailingTryTimeoutTest() {
+  Lock lock;
+  lock.lock();
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    EXPECT_FALSE(lock.try_lock_for(milliseconds(10)));
+  }));
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    typename Lock::Token token;
+    EXPECT_FALSE(lock.try_lock_shared_for(milliseconds(10), token));
+  }));
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    EXPECT_FALSE(lock.try_lock_upgrade_for(milliseconds(10)));
+  }));
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    EXPECT_FALSE(lock.try_lock_until(steady_clock::now() + milliseconds(10)));
+  }));
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    typename Lock::Token token;
+    EXPECT_FALSE(lock.try_lock_shared_until(
+        steady_clock::now() + milliseconds(10), token));
+  }));
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    EXPECT_FALSE(
+        lock.try_lock_upgrade_until(steady_clock::now() + milliseconds(10)));
+  }));
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    EXPECT_FALSE(lock.try_lock_until(system_clock::now() + milliseconds(10)));
+  }));
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    typename Lock::Token token;
+    EXPECT_FALSE(lock.try_lock_shared_until(
+        system_clock::now() + milliseconds(10), token));
+  }));
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    EXPECT_FALSE(
+        lock.try_lock_upgrade_until(system_clock::now() + milliseconds(10)));
+  }));
+  lock.unlock();
+
+  lock.lock_shared();
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    EXPECT_FALSE(lock.try_lock_for(milliseconds(10)));
+  }));
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    EXPECT_FALSE(lock.try_lock_until(steady_clock::now() + milliseconds(10)));
+  }));
+  EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+    EXPECT_FALSE(lock.try_lock_until(system_clock::now() + milliseconds(10)));
+  }));
+  lock.unlock_shared();
+
+  lock.lock();
+  for (int p = 0; p < 8; ++p) {
+    EXPECT_FALSE(lock.try_lock_for(nanoseconds(1 << p)));
+  }
+  lock.unlock();
+
+  for (int p = 0; p < 8; ++p) {
+    typename Lock::ReadHolder holder1(lock);
+    typename Lock::ReadHolder holder2(lock);
+    typename Lock::ReadHolder holder3(lock);
+    EXPECT_FALSE(lock.try_lock_for(nanoseconds(1 << p)));
+  }
+}
+
+TEST(SharedMutex, failing_try_timeout) {
+  runFailingTryTimeoutTest<SharedMutexReadPriority>();
+  runFailingTryTimeoutTest<SharedMutexWritePriority>();
+}
+
+template <typename Lock>
+void runBasicUpgradeTest() {
+  Lock lock;
+  typename Lock::Token token1;
+  typename Lock::Token token2;
+
+  lock.lock_upgrade();
+  EXPECT_FALSE(lock.try_lock());
+  EXPECT_TRUE(lock.try_lock_shared(token1));
+  lock.unlock_shared(token1);
+  lock.unlock_upgrade();
+
+  lock.lock_upgrade();
+  lock.unlock_upgrade_and_lock();
+  EXPECT_FALSE(lock.try_lock_shared(token1));
+  lock.unlock();
+
+  lock.lock_upgrade();
+  lock.unlock_upgrade_and_lock_shared(token1);
+  lock.lock_upgrade();
+  lock.unlock_upgrade_and_lock_shared(token2);
+  lock.unlock_shared(token1);
+  lock.unlock_shared(token2);
+
+  lock.lock();
+  lock.unlock_and_lock_upgrade();
+  EXPECT_TRUE(lock.try_lock_shared(token1));
+  lock.unlock_upgrade();
+  lock.unlock_shared(token1);
+}
+
+TEST(SharedMutex, basic_upgrade_tests) {
+  runBasicUpgradeTest<SharedMutexReadPriority>();
+  runBasicUpgradeTest<SharedMutexWritePriority>();
+}
+
+TEST(SharedMutex, read_has_prio) {
+  SharedMutexReadPriority lock;
+  SharedMutexToken token1;
+  SharedMutexToken token2;
+  lock.lock_shared(token1);
+  bool exclusiveAcquired = false;
+  auto writer = thread([&] {
+    lock.lock();
+    exclusiveAcquired = true;
+    lock.unlock();
+  });
+
+  // lock() can't complete until we unlock token1, but it should stake
+  // its claim with regards to other exclusive or upgrade locks.  We can
+  // use try_lock_upgrade to poll for that eventuality.
+  while (lock.try_lock_upgrade()) {
+    lock.unlock_upgrade();
+    this_thread::yield();
+  }
+  EXPECT_FALSE(exclusiveAcquired);
+
+  // Even though lock() is stuck we should be able to get token2
+  EXPECT_TRUE(lock.try_lock_shared(token2));
+  lock.unlock_shared(token1);
+  lock.unlock_shared(token2);
+  writer.join();
+  EXPECT_TRUE(exclusiveAcquired);
+}
+
+TEST(SharedMutex, write_has_prio) {
+  SharedMutexWritePriority lock;
+  SharedMutexToken token1;
+  SharedMutexToken token2;
+  lock.lock_shared(token1);
+  auto writer = thread([&] {
+    lock.lock();
+    lock.unlock();
+  });
+
+  // eventually lock() should block readers
+  while (lock.try_lock_shared(token2)) {
+    lock.unlock_shared(token2);
+    this_thread::yield();
+  }
+
+  lock.unlock_shared(token1);
+  writer.join();
+}
+
+struct TokenLocker {
+  SharedMutexToken token;
+
+  template <typename T>
+  void lock(T* lock) {
+    lock->lock();
+  }
+
+  template <typename T>
+  void unlock(T* lock) {
+    lock->unlock();
+  }
+
+  template <typename T>
+  void lock_shared(T* lock) {
+    lock->lock_shared(token);
+  }
+
+  template <typename T>
+  void unlock_shared(T* lock) {
+    lock->unlock_shared(token);
+  }
+};
+
+struct Locker {
+  template <typename T>
+  void lock(T* lock) {
+    lock->lock();
+  }
+
+  template <typename T>
+  void unlock(T* lock) {
+    lock->unlock();
+  }
+
+  template <typename T>
+  void lock_shared(T* lock) {
+    lock->lock_shared();
+  }
+
+  template <typename T>
+  void unlock_shared(T* lock) {
+    lock->unlock_shared();
+  }
+};
+
+struct EnterLocker {
+  template <typename T>
+  void lock(T* lock) {
+    lock->lock(0);
+  }
+
+  template <typename T>
+  void unlock(T* lock) {
+    lock->unlock();
+  }
+
+  template <typename T>
+  void lock_shared(T* lock) {
+    lock->enter(0);
+  }
+
+  template <typename T>
+  void unlock_shared(T* lock) {
+    lock->leave();
+  }
+};
+
+struct PosixRWLock {
+  pthread_rwlock_t lock_;
+
+  PosixRWLock() { pthread_rwlock_init(&lock_, nullptr); }
+
+  ~PosixRWLock() { pthread_rwlock_destroy(&lock_); }
+
+  void lock() { pthread_rwlock_wrlock(&lock_); }
+
+  void unlock() { pthread_rwlock_unlock(&lock_); }
+
+  void lock_shared() { pthread_rwlock_rdlock(&lock_); }
+
+  void unlock_shared() { pthread_rwlock_unlock(&lock_); }
+};
+
+struct PosixMutex {
+  pthread_mutex_t lock_;
+
+  PosixMutex() { pthread_mutex_init(&lock_, nullptr); }
+
+  ~PosixMutex() { pthread_mutex_destroy(&lock_); }
+
+  void lock() { pthread_mutex_lock(&lock_); }
+
+  void unlock() { pthread_mutex_unlock(&lock_); }
+
+  void lock_shared() { pthread_mutex_lock(&lock_); }
+
+  void unlock_shared() { pthread_mutex_unlock(&lock_); }
+};
+
+template <template <typename> class Atom, typename Lock, typename Locker>
+static void runContendedReaders(size_t numOps,
+                                size_t numThreads,
+                                bool useSeparateLocks) {
+  char padding1[64];
+  Lock globalLock;
+  int valueProtectedByLock = 10;
+  char padding2[64];
+  Atom<bool> go(false);
+  Atom<bool>* goPtr = &go; // workaround for clang bug
+  vector<thread> threads(numThreads);
+
+  BENCHMARK_SUSPEND {
+    for (int t = 0; t < numThreads; ++t) {
+      threads[t] = DSched::thread([&, t, numThreads] {
+        Lock privateLock;
+        Lock* lock = useSeparateLocks ? &privateLock : &globalLock;
+        Locker locker;
+        while (!goPtr->load()) {
+          this_thread::yield();
+        }
+        for (size_t op = t; op < numOps; op += numThreads) {
+          locker.lock_shared(lock);
+          // note: folly::doNotOptimizeAway reads and writes to its arg,
+          // so the following two lines are very different than a call
+          // to folly::doNotOptimizeAway(valueProtectedByLock);
+          auto copy = valueProtectedByLock;
+          folly::doNotOptimizeAway(copy);
+          locker.unlock_shared(lock);
+        }
+      });
+    }
+  }
+
+  go.store(true);
+  for (auto& thr : threads) {
+    DSched::join(thr);
+  }
+}
+
+static void folly_rwspin_reads(uint numOps,
+                               size_t numThreads,
+                               bool useSeparateLocks) {
+  runContendedReaders<atomic, RWSpinLock, Locker>(
+      numOps, numThreads, useSeparateLocks);
+}
+
+static void shmtx_wr_pri_reads(uint numOps,
+                               size_t numThreads,
+                               bool useSeparateLocks) {
+  runContendedReaders<atomic, SharedMutexWritePriority, TokenLocker>(
+      numOps, numThreads, useSeparateLocks);
+}
+
+static void shmtx_w_bare_reads(uint numOps,
+                               size_t numThreads,
+                               bool useSeparateLocks) {
+  runContendedReaders<atomic, SharedMutexWritePriority, Locker>(
+      numOps, numThreads, useSeparateLocks);
+}
+
+static void shmtx_rd_pri_reads(uint numOps,
+                               size_t numThreads,
+                               bool useSeparateLocks) {
+  runContendedReaders<atomic, SharedMutexReadPriority, TokenLocker>(
+      numOps, numThreads, useSeparateLocks);
+}
+
+static void shmtx_r_bare_reads(uint numOps,
+                               size_t numThreads,
+                               bool useSeparateLocks) {
+  runContendedReaders<atomic, SharedMutexReadPriority, Locker>(
+      numOps, numThreads, useSeparateLocks);
+}
+
+static void folly_ticket_reads(uint numOps,
+                               size_t numThreads,
+                               bool useSeparateLocks) {
+  runContendedReaders<atomic, RWTicketSpinLock64, Locker>(
+      numOps, numThreads, useSeparateLocks);
+}
+
+static void boost_shared_reads(uint numOps,
+                               size_t numThreads,
+                               bool useSeparateLocks) {
+  runContendedReaders<atomic, boost::shared_mutex, Locker>(
+      numOps, numThreads, useSeparateLocks);
+}
+
+static void pthrd_rwlock_reads(uint numOps,
+                               size_t numThreads,
+                               bool useSeparateLocks) {
+  runContendedReaders<atomic, PosixRWLock, Locker>(
+      numOps, numThreads, useSeparateLocks);
+}
+
+template <template <typename> class Atom, typename Lock, typename Locker>
+static void runMixed(size_t numOps,
+                     size_t numThreads,
+                     double writeFraction,
+                     bool useSeparateLocks) {
+  char padding1[64];
+  Lock globalLock;
+  int valueProtectedByLock = 0;
+  char padding2[64];
+  Atom<bool> go(false);
+  Atom<bool>* goPtr = &go; // workaround for clang bug
+  vector<thread> threads(numThreads);
+
+  BENCHMARK_SUSPEND {
+    for (int t = 0; t < numThreads; ++t) {
+      threads[t] = DSched::thread([&, t, numThreads] {
+        struct drand48_data buffer;
+        srand48_r(t, &buffer);
+        long writeThreshold = writeFraction * 0x7fffffff;
+        Lock privateLock;
+        Lock* lock = useSeparateLocks ? &privateLock : &globalLock;
+        Locker locker;
+        while (!goPtr->load()) {
+          this_thread::yield();
+        }
+        for (size_t op = t; op < numOps; op += numThreads) {
+          long randVal;
+          lrand48_r(&buffer, &randVal);
+          bool writeOp = randVal < writeThreshold;
+          SharedMutexToken token;
+          if (writeOp) {
+            locker.lock(lock);
+            if (!useSeparateLocks) {
+              ++valueProtectedByLock;
+            }
+            locker.unlock(lock);
+          } else {
+            locker.lock_shared(lock);
+            auto v = valueProtectedByLock;
+            folly::doNotOptimizeAway(v);
+            locker.unlock_shared(lock);
+          }
+        }
+      });
+    }
+  }
+
+  go.store(true);
+  for (auto& thr : threads) {
+    DSched::join(thr);
+  }
+}
+
+static void folly_rwspin(size_t numOps,
+                         size_t numThreads,
+                         double writeFraction,
+                         bool useSeparateLocks) {
+  runMixed<atomic, RWSpinLock, Locker>(
+      numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void shmtx_wr_pri(uint numOps,
+                         size_t numThreads,
+                         double writeFraction,
+                         bool useSeparateLocks) {
+  runMixed<atomic, SharedMutexWritePriority, TokenLocker>(
+      numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void shmtx_w_bare(uint numOps,
+                         size_t numThreads,
+                         double writeFraction,
+                         bool useSeparateLocks) {
+  runMixed<atomic, SharedMutexWritePriority, Locker>(
+      numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void shmtx_rd_pri(uint numOps,
+                         size_t numThreads,
+                         double writeFraction,
+                         bool useSeparateLocks) {
+  runMixed<atomic, SharedMutexReadPriority, TokenLocker>(
+      numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void shmtx_r_bare(uint numOps,
+                         size_t numThreads,
+                         double writeFraction,
+                         bool useSeparateLocks) {
+  runMixed<atomic, SharedMutexReadPriority, Locker>(
+      numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void folly_ticket(size_t numOps,
+                         size_t numThreads,
+                         double writeFraction,
+                         bool useSeparateLocks) {
+  runMixed<atomic, RWTicketSpinLock64, Locker>(
+      numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void boost_shared(size_t numOps,
+                         size_t numThreads,
+                         double writeFraction,
+                         bool useSeparateLocks) {
+  runMixed<atomic, boost::shared_mutex, Locker>(
+      numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void pthrd_rwlock(size_t numOps,
+                         size_t numThreads,
+                         double writeFraction,
+                         bool useSeparateLocks) {
+  runMixed<atomic, PosixRWLock, Locker>(
+      numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void pthrd_mutex_(size_t numOps,
+                         size_t numThreads,
+                         double writeFraction,
+                         bool useSeparateLocks) {
+  runMixed<atomic, PosixMutex, Locker>(
+      numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+template <typename Lock, template <typename> class Atom>
+static void runAllAndValidate(size_t numOps, size_t numThreads) {
+  Lock globalLock;
+  Atom<int> globalExclusiveCount(0);
+  Atom<int> globalUpgradeCount(0);
+  Atom<int> globalSharedCount(0);
+
+  Atom<bool> go(false);
+
+  // clang crashes on access to Atom<> captured by ref in closure
+  Atom<int>* globalExclusiveCountPtr = &globalExclusiveCount;
+  Atom<int>* globalUpgradeCountPtr = &globalUpgradeCount;
+  Atom<int>* globalSharedCountPtr = &globalSharedCount;
+  Atom<bool>* goPtr = &go;
+
+  vector<thread> threads(numThreads);
+
+  BENCHMARK_SUSPEND {
+    for (int t = 0; t < numThreads; ++t) {
+      threads[t] = DSched::thread([&, t, numThreads] {
+        struct drand48_data buffer;
+        srand48_r(t, &buffer);
+
+        bool exclusive = false;
+        bool upgrade = false;
+        bool shared = false;
+        bool ourGlobalTokenUsed = false;
+        SharedMutexToken ourGlobalToken;
+
+        Lock privateLock;
+        vector<SharedMutexToken> privateTokens;
+
+        while (!goPtr->load()) {
+          this_thread::yield();
+        }
+        for (size_t op = t; op < numOps; op += numThreads) {
+          // randVal in [0,1000)
+          long randVal;
+          lrand48_r(&buffer, &randVal);
+          randVal = (long)((randVal * (uint64_t)1000) / 0x7fffffff);
+
+          // make as many assertions as possible about the global state
+          if (exclusive) {
+            EXPECT_EQ(1, globalExclusiveCountPtr->load(memory_order_acquire));
+            EXPECT_EQ(0, globalUpgradeCountPtr->load(memory_order_acquire));
+            EXPECT_EQ(0, globalSharedCountPtr->load(memory_order_acquire));
+          }
+          if (upgrade) {
+            EXPECT_EQ(0, globalExclusiveCountPtr->load(memory_order_acquire));
+            EXPECT_EQ(1, globalUpgradeCountPtr->load(memory_order_acquire));
+          }
+          if (shared) {
+            EXPECT_EQ(0, globalExclusiveCountPtr->load(memory_order_acquire));
+            EXPECT_TRUE(globalSharedCountPtr->load(memory_order_acquire) > 0);
+          } else {
+            EXPECT_FALSE(ourGlobalTokenUsed);
+          }
+
+          // independent 20% chance we do something to the private lock
+          if (randVal < 200) {
+            // it's okay to take multiple private shared locks because
+            // we never take an exclusive lock, so reader versus writer
+            // priority doesn't cause deadlocks
+            if (randVal < 100 && privateTokens.size() > 0) {
+              auto i = randVal % privateTokens.size();
+              privateLock.unlock_shared(privateTokens[i]);
+              privateTokens.erase(privateTokens.begin() + i);
+            } else {
+              SharedMutexToken token;
+              privateLock.lock_shared(token);
+              privateTokens.push_back(token);
+            }
+            continue;
+          }
+
+          // if we've got a lock, the only thing we can do is release it
+          // or transform it into a different kind of lock
+          if (exclusive) {
+            exclusive = false;
+            --*globalExclusiveCountPtr;
+            if (randVal < 500) {
+              globalLock.unlock();
+            } else if (randVal < 700) {
+              globalLock.unlock_and_lock_shared();
+              ++*globalSharedCountPtr;
+              shared = true;
+            } else if (randVal < 900) {
+              globalLock.unlock_and_lock_shared(ourGlobalToken);
+              ++*globalSharedCountPtr;
+              shared = true;
+              ourGlobalTokenUsed = true;
+            } else {
+              globalLock.unlock_and_lock_upgrade();
+              ++*globalUpgradeCountPtr;
+              upgrade = true;
+            }
+          } else if (upgrade) {
+            upgrade = false;
+            --*globalUpgradeCountPtr;
+            if (randVal < 500) {
+              globalLock.unlock_upgrade();
+            } else if (randVal < 700) {
+              globalLock.unlock_upgrade_and_lock_shared();
+              ++*globalSharedCountPtr;
+              shared = true;
+            } else if (randVal < 900) {
+              globalLock.unlock_upgrade_and_lock_shared(ourGlobalToken);
+              ++*globalSharedCountPtr;
+              shared = true;
+              ourGlobalTokenUsed = true;
+            } else {
+              globalLock.unlock_upgrade_and_lock();
+              ++*globalExclusiveCountPtr;
+              exclusive = true;
+            }
+          } else if (shared) {
+            shared = false;
+            --*globalSharedCountPtr;
+            if (ourGlobalTokenUsed) {
+              globalLock.unlock_shared(ourGlobalToken);
+              ourGlobalTokenUsed = false;
+            } else {
+              globalLock.unlock_shared();
+            }
+          } else if (randVal < 400) {
+            // 40% chance of shared lock with token, 5 ways to get it
+
+            // delta t goes from -1 millis to 7 millis
+            auto dt = microseconds(10 * (randVal - 100));
+
+            if (randVal < 400) {
+              globalLock.lock_shared(ourGlobalToken);
+              shared = true;
+            } else if (randVal < 500) {
+              shared = globalLock.try_lock_shared(ourGlobalToken);
+            } else if (randVal < 600) {
+              shared = globalLock.try_lock_shared_for(dt, ourGlobalToken);
+            } else if (randVal < 800) {
+              shared = globalLock.try_lock_shared_until(
+                  system_clock::now() + dt, ourGlobalToken);
+            }
+            if (shared) {
+              ourGlobalTokenUsed = true;
+              ++*globalSharedCountPtr;
+            }
+          } else if (randVal < 800) {
+            // 40% chance of shared lock without token
+            auto dt = microseconds(10 * (randVal - 100));
+            if (randVal < 400) {
+              globalLock.lock_shared();
+              shared = true;
+            } else if (randVal < 500) {
+              shared = globalLock.try_lock_shared();
+            } else if (randVal < 600) {
+              shared = globalLock.try_lock_shared_for(dt);
+            } else if (randVal < 800) {
+              shared = globalLock.try_lock_shared_until(
+                  system_clock::now() + dt);
+            }
+            if (shared) {
+              ++*globalSharedCountPtr;
+            }
+          } else if (randVal < 900) {
+            // 10% change of upgrade lock
+            globalLock.lock_upgrade();
+            upgrade = true;
+            ++*globalUpgradeCountPtr;
+          } else {
+            // 10% chance of exclusive lock, 5 ways to get it
+
+            // delta t goes from -1 millis to 9 millis
+            auto dt = microseconds(100 * (randVal - 910));
+
+            if (randVal < 400) {
+              globalLock.lock();
+              exclusive = true;
+            } else if (randVal < 500) {
+              exclusive = globalLock.try_lock();
+            } else if (randVal < 600) {
+              exclusive = globalLock.try_lock_for(dt);
+            } else if (randVal < 700) {
+              exclusive = globalLock.try_lock_until(steady_clock::now() + dt);
+            } else {
+              exclusive = globalLock.try_lock_until(system_clock::now() + dt);
+            }
+            if (exclusive) {
+              ++*globalExclusiveCountPtr;
+            }
+          }
+        }
+
+        if (exclusive) {
+          --*globalExclusiveCountPtr;
+          globalLock.unlock();
+        }
+        if (upgrade) {
+          --*globalUpgradeCountPtr;
+          globalLock.unlock_upgrade();
+        }
+        if (shared) {
+          --*globalSharedCountPtr;
+          if (ourGlobalTokenUsed) {
+            globalLock.unlock_shared(ourGlobalToken);
+            ourGlobalTokenUsed = false;
+          } else {
+            globalLock.unlock_shared();
+          }
+        }
+        for (auto& token : privateTokens) {
+          privateLock.unlock_shared(token);
+        }
+      });
+    }
+  }
+
+  go.store(true);
+  for (auto& thr : threads) {
+    DSched::join(thr);
+  }
+}
+
+TEST(SharedMutex, deterministic_concurrent_readers_of_one_lock_read_prio) {
+  for (int pass = 0; pass < 3; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runContendedReaders<DeterministicAtomic,
+                        DSharedMutexReadPriority,
+                        Locker>(1000, 3, false);
+  }
+}
+
+TEST(SharedMutex, deterministic_concurrent_readers_of_one_lock_write_prio) {
+  for (int pass = 0; pass < 3; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runContendedReaders<DeterministicAtomic,
+                        DSharedMutexWritePriority,
+                        Locker>(1000, 3, false);
+  }
+}
+
+TEST(SharedMutex, concurrent_readers_of_one_lock_read_prio) {
+  for (int pass = 0; pass < 10; ++pass) {
+    runContendedReaders<atomic, SharedMutexReadPriority, Locker>(
+        100000, 32, false);
+  }
+}
+
+TEST(SharedMutex, concurrent_readers_of_one_lock_write_prio) {
+  for (int pass = 0; pass < 10; ++pass) {
+    runContendedReaders<atomic, SharedMutexWritePriority, Locker>(
+        100000, 32, false);
+  }
+}
+
+TEST(SharedMutex, deterministic_readers_of_concurrent_locks_read_prio) {
+  for (int pass = 0; pass < 3; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runContendedReaders<DeterministicAtomic,
+                        DSharedMutexReadPriority,
+                        Locker>(1000, 3, true);
+  }
+}
+
+TEST(SharedMutex, deterministic_readers_of_concurrent_locks_write_prio) {
+  for (int pass = 0; pass < 3; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runContendedReaders<DeterministicAtomic,
+                        DSharedMutexWritePriority,
+                        Locker>(1000, 3, true);
+  }
+}
+
+TEST(SharedMutex, readers_of_concurrent_locks_read_prio) {
+  for (int pass = 0; pass < 10; ++pass) {
+    runContendedReaders<atomic, SharedMutexReadPriority, TokenLocker>(
+        100000, 32, true);
+  }
+}
+
+TEST(SharedMutex, readers_of_concurrent_locks_write_prio) {
+  for (int pass = 0; pass < 10; ++pass) {
+    runContendedReaders<atomic, SharedMutexWritePriority, TokenLocker>(
+        100000, 32, true);
+  }
+}
+
+TEST(SharedMutex, deterministic_mixed_mostly_read_read_prio) {
+  for (int pass = 0; pass < 3; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runMixed<DeterministicAtomic, DSharedMutexReadPriority, Locker>(
+        1000, 3, 0.1, false);
+  }
+}
+
+TEST(SharedMutex, deterministic_mixed_mostly_read_write_prio) {
+  for (int pass = 0; pass < 3; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runMixed<DeterministicAtomic, DSharedMutexWritePriority, Locker>(
+        1000, 3, 0.1, false);
+  }
+}
+
+TEST(SharedMutex, mixed_mostly_read_read_prio) {
+  for (int pass = 0; pass < 5; ++pass) {
+    runMixed<atomic, SharedMutexReadPriority, TokenLocker>(
+        50000, 32, 0.1, false);
+  }
+}
+
+TEST(SharedMutex, mixed_mostly_read_write_prio) {
+  for (int pass = 0; pass < 5; ++pass) {
+    runMixed<atomic, SharedMutexWritePriority, TokenLocker>(
+        50000, 32, 0.1, false);
+  }
+}
+
+TEST(SharedMutex, deterministic_mixed_mostly_write_read_prio) {
+  for (int pass = 0; pass < 1; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runMixed<DeterministicAtomic, DSharedMutexReadPriority, TokenLocker>(
+        1000, 10, 0.9, false);
+  }
+}
+
+TEST(SharedMutex, deterministic_mixed_mostly_write_write_prio) {
+  for (int pass = 0; pass < 1; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runMixed<DeterministicAtomic, DSharedMutexWritePriority, TokenLocker>(
+        1000, 10, 0.9, false);
+  }
+}
+
+TEST(SharedMutex, mixed_mostly_write_read_prio) {
+  for (int pass = 0; pass < 5; ++pass) {
+    runMixed<atomic, SharedMutexReadPriority, TokenLocker>(
+        50000, 300, 0.9, false);
+  }
+}
+
+TEST(SharedMutex, mixed_mostly_write_write_prio) {
+  for (int pass = 0; pass < 5; ++pass) {
+    runMixed<atomic, SharedMutexWritePriority, TokenLocker>(
+        50000, 300, 0.9, false);
+  }
+}
+
+TEST(SharedMutex, deterministic_all_ops_read_prio) {
+  for (int pass = 0; pass < 5; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runAllAndValidate<DSharedMutexReadPriority, DeterministicAtomic>(1000, 8);
+  }
+}
+
+TEST(SharedMutex, deterministic_all_ops_write_prio) {
+  for (int pass = 0; pass < 5; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runAllAndValidate<DSharedMutexWritePriority, DeterministicAtomic>(1000, 8);
+  }
+}
+
+TEST(SharedMutex, all_ops_read_prio) {
+  for (int pass = 0; pass < 5; ++pass) {
+    runAllAndValidate<SharedMutexReadPriority, atomic>(100000, 32);
+  }
+}
+
+TEST(SharedMutex, all_ops_write_prio) {
+  for (int pass = 0; pass < 5; ++pass) {
+    runAllAndValidate<SharedMutexWritePriority, atomic>(100000, 32);
+  }
+}
+
+FOLLY_ASSUME_FBVECTOR_COMPATIBLE(
+    boost::optional<boost::optional<SharedMutexToken>>)
+
+// Setup is a set of threads that either grab a shared lock, or exclusive
+// and then downgrade it, or upgrade then upgrade and downgrade, then
+// enqueue the shared lock to a second set of threads that just performs
+// unlocks.  Half of the shared locks use tokens, the others don't.
+template <typename Lock, template <typename> class Atom>
+static void runRemoteUnlock(size_t numOps,
+                            double preWriteFraction,
+                            double preUpgradeFraction,
+                            size_t numSendingThreads,
+                            size_t numReceivingThreads) {
+  Lock globalLock;
+  MPMCQueue<boost::optional<boost::optional<SharedMutexToken>>, Atom>
+    queue(10);
+  auto queuePtr = &queue; // workaround for clang crash
+
+  Atom<bool> go(false);
+  auto goPtr = &go; // workaround for clang crash
+  Atom<int> pendingSenders(numSendingThreads);
+  auto pendingSendersPtr = &pendingSenders; // workaround for clang crash
+  vector<thread> threads(numSendingThreads + numReceivingThreads);
+
+  BENCHMARK_SUSPEND {
+    for (int t = 0; t < threads.size(); ++t) {
+      threads[t] = DSched::thread([&, t, numSendingThreads] {
+        if (t >= numSendingThreads) {
+          // we're a receiver
+          typename decltype(queue)::value_type elem;
+          while (true) {
+            queuePtr->blockingRead(elem);
+            if (!elem) {
+              // EOF, pass the EOF token
+              queuePtr->blockingWrite(std::move(elem));
+              break;
+            }
+            if (*elem) {
+              globalLock.unlock_shared(**elem);
+            } else {
+              globalLock.unlock_shared();
+            }
+          }
+          return;
+        }
+        // else we're a sender
+
+        struct drand48_data buffer;
+        srand48_r(t, &buffer);
+
+        while (!goPtr->load()) {
+          this_thread::yield();
+        }
+        for (size_t op = t; op < numOps; op += numSendingThreads) {
+          long unscaledRandVal;
+          lrand48_r(&buffer, &unscaledRandVal);
+
+          // randVal in [0,1]
+          double randVal = ((double)unscaledRandVal) / 0x7fffffff;
+
+          // extract a bit and rescale
+          bool useToken = randVal >= 0.5;
+          randVal = (randVal - (useToken ? 0.5 : 0.0)) * 2;
+
+          boost::optional<SharedMutexToken> maybeToken;
+
+          if (useToken) {
+            SharedMutexToken token;
+            if (randVal < preWriteFraction) {
+              globalLock.lock();
+              globalLock.unlock_and_lock_shared(token);
+            } else if (randVal < preWriteFraction + preUpgradeFraction / 2) {
+              globalLock.lock_upgrade();
+              globalLock.unlock_upgrade_and_lock_shared(token);
+            } else if (randVal < preWriteFraction + preUpgradeFraction) {
+              globalLock.lock_upgrade();
+              globalLock.unlock_upgrade_and_lock();
+              globalLock.unlock_and_lock_shared(token);
+            } else {
+              globalLock.lock_shared(token);
+            }
+            maybeToken = token;
+          } else {
+            if (randVal < preWriteFraction) {
+              globalLock.lock();
+              globalLock.unlock_and_lock_shared();
+            } else if (randVal < preWriteFraction + preUpgradeFraction / 2) {
+              globalLock.lock_upgrade();
+              globalLock.unlock_upgrade_and_lock_shared();
+            } else if (randVal < preWriteFraction + preUpgradeFraction) {
+              globalLock.lock_upgrade();
+              globalLock.unlock_upgrade_and_lock();
+              globalLock.unlock_and_lock_shared();
+            } else {
+              globalLock.lock_shared();
+            }
+          }
+
+          // blockingWrite is emplace-like, so this automatically adds
+          // another level of wrapping
+          queuePtr->blockingWrite(maybeToken);
+        }
+        if (--*pendingSendersPtr == 0) {
+          queuePtr->blockingWrite(boost::none);
+        }
+      });
+    }
+  }
+
+  go.store(true);
+  for (auto& thr : threads) {
+    DSched::join(thr);
+  }
+}
+
+TEST(SharedMutex, deterministic_remote_write_prio) {
+  for (int pass = 0; pass < 1; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runRemoteUnlock<DSharedMutexWritePriority, DeterministicAtomic>(
+        500, 0.1, 0.1, 5, 5);
+  }
+}
+
+TEST(SharedMutex, deterministic_remote_read_prio) {
+  for (int pass = 0; pass < 1; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    DSched sched(DSched::uniform(pass));
+    runRemoteUnlock<DSharedMutexReadPriority, DeterministicAtomic>(
+        500, 0.1, 0.1, 5, 5);
+  }
+}
+
+TEST(SharedMutex, remote_write_prio) {
+  for (int pass = 0; pass < 1; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    runRemoteUnlock<SharedMutexWritePriority, atomic>(100000, 0.1, 0.1, 5, 5);
+  }
+}
+
+TEST(SharedMutex, remote_read_prio) {
+  for (int pass = 0; pass < 1; ++pass) {
+    // LOG(INFO) << "pass " << pass;
+    runRemoteUnlock<SharedMutexReadPriority, atomic>(100000, 0.1, 0.1, 5, 5);
+  }
+}
+
+static void burn(size_t n) {
+  for (size_t i = 0; i < n; ++i) {
+    folly::doNotOptimizeAway(i);
+  }
+}
+
+// Two threads and three locks, arranged so that they have to proceed
+// in turn with reader/writer conflict
+template <typename Lock, template <typename> class Atom = atomic>
+static void runPingPong(size_t numRounds, size_t burnCount) {
+  char padding1[56];
+  pair<Lock, char[56]> locks[3];
+  char padding2[56];
+
+  Atom<int> avail(0);
+  auto availPtr = &avail; // workaround for clang crash
+  Atom<bool> go(false);
+  auto goPtr = &go; // workaround for clang crash
+  vector<thread> threads(2);
+
+  locks[0].first.lock();
+  locks[1].first.lock();
+  locks[2].first.lock_shared();
+
+  BENCHMARK_SUSPEND {
+    threads[0] = DSched::thread([&] {
+      ++*availPtr;
+      while (!goPtr->load()) {
+        this_thread::yield();
+      }
+      for (int i = 0; i < numRounds; ++i) {
+        locks[i % 3].first.unlock();
+        locks[(i + 2) % 3].first.lock();
+        burn(burnCount);
+      }
+    });
+    threads[1] = DSched::thread([&] {
+      ++*availPtr;
+      while (!goPtr->load()) {
+        this_thread::yield();
+      }
+      for (int i = 0; i < numRounds; ++i) {
+        locks[i % 3].first.lock_shared();
+        burn(burnCount);
+        locks[(i + 2) % 3].first.unlock_shared();
+      }
+    });
+
+    while (avail.load() < 2) {
+      this_thread::yield();
+    }
+  }
+
+  go.store(true);
+  for (auto& thr : threads) {
+    DSched::join(thr);
+  }
+  locks[numRounds % 3].first.unlock();
+  locks[(numRounds + 1) % 3].first.unlock();
+  locks[(numRounds + 2) % 3].first.unlock_shared();
+}
+
+static void folly_rwspin_ping_pong(size_t n, size_t scale, size_t burnCount) {
+  runPingPong<RWSpinLock>(n / scale, burnCount);
+}
+
+static void shmtx_w_bare_ping_pong(size_t n, size_t scale, size_t burnCount) {
+  runPingPong<SharedMutexWritePriority>(n / scale, burnCount);
+}
+
+static void shmtx_r_bare_ping_pong(size_t n, size_t scale, size_t burnCount) {
+  runPingPong<SharedMutexReadPriority>(n / scale, burnCount);
+}
+
+static void folly_ticket_ping_pong(size_t n, size_t scale, size_t burnCount) {
+  runPingPong<RWTicketSpinLock64>(n / scale, burnCount);
+}
+
+static void boost_shared_ping_pong(size_t n, size_t scale, size_t burnCount) {
+  runPingPong<boost::shared_mutex>(n / scale, burnCount);
+}
+
+static void pthrd_rwlock_ping_pong(size_t n, size_t scale, size_t burnCount) {
+  runPingPong<PosixRWLock>(n / scale, burnCount);
+}
+
+TEST(SharedMutex, deterministic_ping_pong_write_prio) {
+  for (int pass = 0; pass < 1; ++pass) {
+    DSched sched(DSched::uniform(pass));
+    runPingPong<DSharedMutexWritePriority, DeterministicAtomic>(500, 0);
+  }
+}
+
+TEST(SharedMutex, deterministic_ping_pong_read_prio) {
+  for (int pass = 0; pass < 1; ++pass) {
+    DSched sched(DSched::uniform(pass));
+    runPingPong<DSharedMutexReadPriority, DeterministicAtomic>(500, 0);
+  }
+}
+
+TEST(SharedMutex, ping_pong_write_prio) {
+  for (int pass = 0; pass < 1; ++pass) {
+    runPingPong<SharedMutexWritePriority, atomic>(50000, 0);
+  }
+}
+
+TEST(SharedMutex, ping_pong_read_prio) {
+  for (int pass = 0; pass < 1; ++pass) {
+    runPingPong<SharedMutexReadPriority, atomic>(50000, 0);
+  }
+}
+
+// This is here so you can tell how much of the runtime reported by the
+// more complex harnesses is due to the harness, although due to the
+// magic of compiler optimization it may also be slower
+BENCHMARK(single_thread_lock_shared_unlock_shared, iters) {
+  SharedMutex lock;
+  for (size_t n = 0; n < iters; ++n) {
+    SharedMutex::Token token;
+    lock.lock_shared(token);
+    folly::doNotOptimizeAway(0);
+    lock.unlock_shared(token);
+  }
+}
+
+BENCHMARK(single_thread_lock_unlock, iters) {
+  SharedMutex lock;
+  for (size_t n = 0; n < iters; ++n) {
+    lock.lock();
+    folly::doNotOptimizeAway(0);
+    lock.unlock();
+  }
+}
+
+#define BENCH_BASE(args...) BENCHMARK_NAMED_PARAM(args)
+#define BENCH_REL(args...) BENCHMARK_RELATIVE_NAMED_PARAM(args)
+
+// 100% reads.  Best-case scenario for deferred locks.  Lock is colocated
+// with read data, so inline lock takes cache miss every time but deferred
+// lock has only cache hits and local access.
+BENCHMARK_DRAW_LINE()
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 1thread, 1, false)
+BENCH_REL (shmtx_wr_pri_reads, 1thread, 1, false)
+BENCH_REL (shmtx_w_bare_reads, 1thread, 1, false)
+BENCH_REL (shmtx_rd_pri_reads, 1thread, 1, false)
+BENCH_REL (shmtx_r_bare_reads, 1thread, 1, false)
+BENCH_REL (folly_ticket_reads, 1thread, 1, false)
+BENCH_REL (boost_shared_reads, 1thread, 1, false)
+BENCH_REL (pthrd_rwlock_reads, 1thread, 1, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 2thread, 2, false)
+BENCH_REL (shmtx_wr_pri_reads, 2thread, 2, false)
+BENCH_REL (shmtx_w_bare_reads, 2thread, 2, false)
+BENCH_REL (shmtx_rd_pri_reads, 2thread, 2, false)
+BENCH_REL (shmtx_r_bare_reads, 2thread, 2, false)
+BENCH_REL (folly_ticket_reads, 2thread, 2, false)
+BENCH_REL (boost_shared_reads, 2thread, 2, false)
+BENCH_REL (pthrd_rwlock_reads, 2thread, 2, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 4thread, 4, false)
+BENCH_REL (shmtx_wr_pri_reads, 4thread, 4, false)
+BENCH_REL (shmtx_w_bare_reads, 4thread, 4, false)
+BENCH_REL (shmtx_rd_pri_reads, 4thread, 4, false)
+BENCH_REL (shmtx_r_bare_reads, 4thread, 4, false)
+BENCH_REL (folly_ticket_reads, 4thread, 4, false)
+BENCH_REL (boost_shared_reads, 4thread, 4, false)
+BENCH_REL (pthrd_rwlock_reads, 4thread, 4, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 8thread, 8, false)
+BENCH_REL (shmtx_wr_pri_reads, 8thread, 8, false)
+BENCH_REL (shmtx_w_bare_reads, 8thread, 8, false)
+BENCH_REL (shmtx_rd_pri_reads, 8thread, 8, false)
+BENCH_REL (shmtx_r_bare_reads, 8thread, 8, false)
+BENCH_REL (folly_ticket_reads, 8thread, 8, false)
+BENCH_REL (boost_shared_reads, 8thread, 8, false)
+BENCH_REL (pthrd_rwlock_reads, 8thread, 8, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 16thread, 16, false)
+BENCH_REL (shmtx_wr_pri_reads, 16thread, 16, false)
+BENCH_REL (shmtx_w_bare_reads, 16thread, 16, false)
+BENCH_REL (shmtx_rd_pri_reads, 16thread, 16, false)
+BENCH_REL (shmtx_r_bare_reads, 16thread, 16, false)
+BENCH_REL (folly_ticket_reads, 16thread, 16, false)
+BENCH_REL (boost_shared_reads, 16thread, 16, false)
+BENCH_REL (pthrd_rwlock_reads, 16thread, 16, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 32thread, 32, false)
+BENCH_REL (shmtx_wr_pri_reads, 32thread, 32, false)
+BENCH_REL (shmtx_w_bare_reads, 32thread, 32, false)
+BENCH_REL (shmtx_rd_pri_reads, 32thread, 32, false)
+BENCH_REL (shmtx_r_bare_reads, 32thread, 32, false)
+BENCH_REL (folly_ticket_reads, 32thread, 32, false)
+BENCH_REL (boost_shared_reads, 32thread, 32, false)
+BENCH_REL (pthrd_rwlock_reads, 32thread, 32, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 64thread, 64, false)
+BENCH_REL (shmtx_wr_pri_reads, 64thread, 64, false)
+BENCH_REL (shmtx_w_bare_reads, 64thread, 64, false)
+BENCH_REL (shmtx_rd_pri_reads, 64thread, 64, false)
+BENCH_REL (shmtx_r_bare_reads, 64thread, 64, false)
+BENCH_REL (folly_ticket_reads, 64thread, 64, false)
+BENCH_REL (boost_shared_reads, 64thread, 64, false)
+BENCH_REL (pthrd_rwlock_reads, 64thread, 64, false)
+
+// 1 lock used by everybody, 100% writes.  Threads only hurt, but it is
+// good to not fail catastrophically.  Compare to single_thread_lock_unlock
+// to see the overhead of the generic driver (and its pseudo-random number
+// generator).  pthrd_mutex_ is a pthread_mutex_t (default, not adaptive),
+// which is better than any of the reader-writer locks for this scenario.
+BENCHMARK_DRAW_LINE()
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (folly_ticket, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (boost_shared, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (pthrd_rwlock, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (pthrd_mutex_, 1thread_all_write, 1, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (folly_ticket, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (boost_shared, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (pthrd_rwlock, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (pthrd_mutex_, 2thread_all_write, 2, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (folly_ticket, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (boost_shared, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (pthrd_rwlock, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (pthrd_mutex_, 4thread_all_write, 4, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (folly_ticket, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (boost_shared, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (pthrd_rwlock, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (pthrd_mutex_, 8thread_all_write, 8, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (folly_ticket, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (boost_shared, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (pthrd_rwlock, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (pthrd_mutex_, 16thread_all_write, 16, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (folly_ticket, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (boost_shared, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (pthrd_rwlock, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (pthrd_mutex_, 32thread_all_write, 32, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (folly_ticket, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (boost_shared, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (pthrd_rwlock, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (pthrd_mutex_, 64thread_all_write, 64, 1.0, false)
+
+// 1 lock used by everybody, 10% writes.  Not much scaling to be had.  Perf
+// is best at 1 thread, once you've got multiple threads > 8 threads hurts.
+BENCHMARK_DRAW_LINE()
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 1thread_10pct_write, 1, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 1thread_10pct_write, 1, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 1thread_10pct_write, 1, 0.10, false)
+BENCH_REL (folly_ticket, 1thread_10pct_write, 1, 0.10, false)
+BENCH_REL (boost_shared, 1thread_10pct_write, 1, 0.10, false)
+BENCH_REL (pthrd_rwlock, 1thread_10pct_write, 1, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thread_10pct_write, 2, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 2thread_10pct_write, 2, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 2thread_10pct_write, 2, 0.10, false)
+BENCH_REL (folly_ticket, 2thread_10pct_write, 2, 0.10, false)
+BENCH_REL (boost_shared, 2thread_10pct_write, 2, 0.10, false)
+BENCH_REL (pthrd_rwlock, 2thread_10pct_write, 2, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 4thread_10pct_write, 4, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 4thread_10pct_write, 4, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 4thread_10pct_write, 4, 0.10, false)
+BENCH_REL (folly_ticket, 4thread_10pct_write, 4, 0.10, false)
+BENCH_REL (boost_shared, 4thread_10pct_write, 4, 0.10, false)
+BENCH_REL (pthrd_rwlock, 4thread_10pct_write, 4, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 8thread_10pct_write, 8, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 8thread_10pct_write, 8, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 8thread_10pct_write, 8, 0.10, false)
+BENCH_REL (folly_ticket, 8thread_10pct_write, 8, 0.10, false)
+BENCH_REL (boost_shared, 8thread_10pct_write, 8, 0.10, false)
+BENCH_REL (pthrd_rwlock, 8thread_10pct_write, 8, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 16thread_10pct_write, 16, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 16thread_10pct_write, 16, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 16thread_10pct_write, 16, 0.10, false)
+BENCH_REL (folly_ticket, 16thread_10pct_write, 16, 0.10, false)
+BENCH_REL (boost_shared, 16thread_10pct_write, 16, 0.10, false)
+BENCH_REL (pthrd_rwlock, 16thread_10pct_write, 16, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 32thread_10pct_write, 32, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 32thread_10pct_write, 32, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 32thread_10pct_write, 32, 0.10, false)
+BENCH_REL (folly_ticket, 32thread_10pct_write, 32, 0.10, false)
+BENCH_REL (boost_shared, 32thread_10pct_write, 32, 0.10, false)
+BENCH_REL (pthrd_rwlock, 32thread_10pct_write, 32, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 64thread_10pct_write, 64, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 64thread_10pct_write, 64, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 64thread_10pct_write, 64, 0.10, false)
+BENCH_REL (folly_ticket, 64thread_10pct_write, 64, 0.10, false)
+BENCH_REL (boost_shared, 64thread_10pct_write, 64, 0.10, false)
+BENCH_REL (pthrd_rwlock, 64thread_10pct_write, 64, 0.10, false)
+
+// 1 lock used by everybody, 1% writes.  This is a more realistic example
+// than the concurrent_*_reads benchmark, but still shows SharedMutex locks
+// winning over all of the others
+BENCHMARK_DRAW_LINE()
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (shmtx_w_bare, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (shmtx_r_bare, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (folly_ticket, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (boost_shared, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (pthrd_rwlock, 1thread_1pct_write, 1, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (shmtx_w_bare, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (shmtx_r_bare, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (folly_ticket, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (boost_shared, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (pthrd_rwlock, 2thread_1pct_write, 2, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (shmtx_w_bare, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (shmtx_r_bare, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (folly_ticket, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (boost_shared, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (pthrd_rwlock, 4thread_1pct_write, 4, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (shmtx_w_bare, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (shmtx_r_bare, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (folly_ticket, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (boost_shared, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (pthrd_rwlock, 8thread_1pct_write, 8, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (shmtx_w_bare, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (shmtx_r_bare, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (folly_ticket, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (boost_shared, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (pthrd_rwlock, 16thread_1pct_write, 16, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (shmtx_w_bare, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (shmtx_r_bare, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (folly_ticket, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (boost_shared, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (pthrd_rwlock, 32thread_1pct_write, 32, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (shmtx_w_bare, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (shmtx_r_bare, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (folly_ticket, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (boost_shared, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (pthrd_rwlock, 64thread_1pct_write, 64, 0.01, false)
+
+// Worst case scenario for deferred locks. No actual sharing, likely that
+// read operations will have to first set the kDeferredReadersPossibleBit,
+// and likely that writers will have to scan deferredReaders[].
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thr_2lock_50pct_write, 2, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 2thr_2lock_50pct_write, 2, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 2thr_2lock_50pct_write, 2, 0.50, true)
+BENCH_BASE(folly_rwspin, 4thr_4lock_50pct_write, 4, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 4thr_4lock_50pct_write, 4, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 4thr_4lock_50pct_write, 4, 0.50, true)
+BENCH_BASE(folly_rwspin, 8thr_8lock_50pct_write, 8, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 8thr_8lock_50pct_write, 8, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 8thr_8lock_50pct_write, 8, 0.50, true)
+BENCH_BASE(folly_rwspin, 16thr_16lock_50pct_write, 16, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 16thr_16lock_50pct_write, 16, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 16thr_16lock_50pct_write, 16, 0.50, true)
+BENCH_BASE(folly_rwspin, 32thr_32lock_50pct_write, 32, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 32thr_32lock_50pct_write, 32, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 32thr_32lock_50pct_write, 32, 0.50, true)
+BENCH_BASE(folly_rwspin, 64thr_64lock_50pct_write, 64, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 64thr_64lock_50pct_write, 64, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 64thr_64lock_50pct_write, 64, 0.50, true)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thr_2lock_10pct_write, 2, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 2thr_2lock_10pct_write, 2, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 2thr_2lock_10pct_write, 2, 0.10, true)
+BENCH_BASE(folly_rwspin, 4thr_4lock_10pct_write, 4, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 4thr_4lock_10pct_write, 4, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 4thr_4lock_10pct_write, 4, 0.10, true)
+BENCH_BASE(folly_rwspin, 8thr_8lock_10pct_write, 8, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 8thr_8lock_10pct_write, 8, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 8thr_8lock_10pct_write, 8, 0.10, true)
+BENCH_BASE(folly_rwspin, 16thr_16lock_10pct_write, 16, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 16thr_16lock_10pct_write, 16, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 16thr_16lock_10pct_write, 16, 0.10, true)
+BENCH_BASE(folly_rwspin, 32thr_32lock_10pct_write, 32, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 32thr_32lock_10pct_write, 32, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 32thr_32lock_10pct_write, 32, 0.10, true)
+BENCH_BASE(folly_rwspin, 64thr_64lock_10pct_write, 64, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 64thr_64lock_10pct_write, 64, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 64thr_64lock_10pct_write, 64, 0.10, true)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thr_2lock_1pct_write, 2, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 2thr_2lock_1pct_write, 2, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 2thr_2lock_1pct_write, 2, 0.01, true)
+BENCH_BASE(folly_rwspin, 4thr_4lock_1pct_write, 4, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 4thr_4lock_1pct_write, 4, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 4thr_4lock_1pct_write, 4, 0.01, true)
+BENCH_BASE(folly_rwspin, 8thr_8lock_1pct_write, 8, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 8thr_8lock_1pct_write, 8, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 8thr_8lock_1pct_write, 8, 0.01, true)
+BENCH_BASE(folly_rwspin, 16thr_16lock_1pct_write, 16, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 16thr_16lock_1pct_write, 16, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 16thr_16lock_1pct_write, 16, 0.01, true)
+BENCH_BASE(folly_rwspin, 32thr_32lock_1pct_write, 32, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 32thr_32lock_1pct_write, 32, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 32thr_32lock_1pct_write, 32, 0.01, true)
+BENCH_BASE(folly_rwspin, 64thr_64lock_1pct_write, 64, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 64thr_64lock_1pct_write, 64, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 64thr_64lock_1pct_write, 64, 0.01, true)
+
+// Ping-pong tests have a scaled number of iterations, because their burn
+// loop would make them too slow otherwise.  Ping-pong with burn count of
+// 100k or 300k shows the advantage of soft-spin, reducing the cost of
+// each wakeup by about 20 usec.  (Take benchmark reported difference,
+// ~400 nanos, multiply by the scale of 100, then divide by 2 because
+// each round has two wakeups.)
+BENCHMARK_DRAW_LINE()
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_ping_pong, burn0, 1, 0)
+BENCH_REL (shmtx_w_bare_ping_pong, burn0, 1, 0)
+BENCH_REL (shmtx_r_bare_ping_pong, burn0, 1, 0)
+BENCH_REL (folly_ticket_ping_pong, burn0, 1, 0)
+BENCH_REL (boost_shared_ping_pong, burn0, 1, 0)
+BENCH_REL (pthrd_rwlock_ping_pong, burn0, 1, 0)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_ping_pong, burn100k, 100, 100000)
+BENCH_REL (shmtx_w_bare_ping_pong, burn100k, 100, 100000)
+BENCH_REL (shmtx_r_bare_ping_pong, burn100k, 100, 100000)
+BENCH_REL (folly_ticket_ping_pong, burn100k, 100, 100000)
+BENCH_REL (boost_shared_ping_pong, burn100k, 100, 100000)
+BENCH_REL (pthrd_rwlock_ping_pong, burn100k, 100, 100000)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_ping_pong, burn300k, 100, 300000)
+BENCH_REL (shmtx_w_bare_ping_pong, burn300k, 100, 300000)
+BENCH_REL (shmtx_r_bare_ping_pong, burn300k, 100, 300000)
+BENCH_REL (folly_ticket_ping_pong, burn300k, 100, 300000)
+BENCH_REL (boost_shared_ping_pong, burn300k, 100, 300000)
+BENCH_REL (pthrd_rwlock_ping_pong, burn300k, 100, 300000)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_ping_pong, burn1M, 1000, 1000000)
+BENCH_REL (shmtx_w_bare_ping_pong, burn1M, 1000, 1000000)
+BENCH_REL (shmtx_r_bare_ping_pong, burn1M, 1000, 1000000)
+BENCH_REL (folly_ticket_ping_pong, burn1M, 1000, 1000000)
+BENCH_REL (boost_shared_ping_pong, burn1M, 1000, 1000000)
+BENCH_REL (pthrd_rwlock_ping_pong, burn1M, 1000, 1000000)
+
+// Reproduce with 10 minutes and
+//   sudo nice -n -20 \
+//     shared_mutex_test --benchmark --bm_min_iters=1000000
+//
+// Comparison use folly::RWSpinLock as the baseline, with the
+// following row being the default SharedMutex (using *Holder or
+// Token-ful methods).
+// ============================================================================
+// folly/experimental/test/SharedMutexTest.cpp     relative  time/iter  iters/s
+// ============================================================================
+// single_thread_lock_shared_unlock_shared                     23.01ns   43.47M
+// single_thread_lock_unlock                                   25.42ns   39.34M
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(1thread)                                 15.13ns   66.10M
+// shmtx_wr_pri_reads(1thread)                       73.76%    20.51ns   48.75M
+// shmtx_w_bare_reads(1thread)                       59.49%    25.43ns   39.32M
+// shmtx_rd_pri_reads(1thread)                       72.60%    20.84ns   47.99M
+// shmtx_r_bare_reads(1thread)                       59.62%    25.37ns   39.41M
+// folly_ticket_reads(1thread)                       55.40%    27.31ns   36.62M
+// boost_shared_reads(1thread)                       10.88%   139.01ns    7.19M
+// pthrd_rwlock_reads(1thread)                       40.70%    37.17ns   26.90M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(2thread)                                 47.51ns   21.05M
+// shmtx_wr_pri_reads(2thread)                      237.28%    20.02ns   49.94M
+// shmtx_w_bare_reads(2thread)                      222.10%    21.39ns   46.74M
+// shmtx_rd_pri_reads(2thread)                      251.68%    18.88ns   52.97M
+// shmtx_r_bare_reads(2thread)                      222.29%    21.37ns   46.78M
+// folly_ticket_reads(2thread)                       55.00%    86.39ns   11.58M
+// boost_shared_reads(2thread)                       22.86%   207.81ns    4.81M
+// pthrd_rwlock_reads(2thread)                       61.36%    77.43ns   12.92M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(4thread)                                 69.29ns   14.43M
+// shmtx_wr_pri_reads(4thread)                      694.46%     9.98ns  100.23M
+// shmtx_w_bare_reads(4thread)                      650.25%    10.66ns   93.85M
+// shmtx_rd_pri_reads(4thread)                      738.08%     9.39ns  106.53M
+// shmtx_r_bare_reads(4thread)                      650.71%    10.65ns   93.92M
+// folly_ticket_reads(4thread)                       63.86%   108.49ns    9.22M
+// boost_shared_reads(4thread)                       19.53%   354.79ns    2.82M
+// pthrd_rwlock_reads(4thread)                       33.86%   204.61ns    4.89M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(8thread)                                 75.34ns   13.27M
+// shmtx_wr_pri_reads(8thread)                     1500.46%     5.02ns  199.16M
+// shmtx_w_bare_reads(8thread)                     1397.84%     5.39ns  185.54M
+// shmtx_rd_pri_reads(8thread)                     1589.99%     4.74ns  211.05M
+// shmtx_r_bare_reads(8thread)                     1398.83%     5.39ns  185.67M
+// folly_ticket_reads(8thread)                       53.26%   141.45ns    7.07M
+// boost_shared_reads(8thread)                       26.24%   287.11ns    3.48M
+// pthrd_rwlock_reads(8thread)                       43.40%   173.57ns    5.76M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(16thread)                                80.81ns   12.38M
+// shmtx_wr_pri_reads(16thread)                    3119.49%     2.59ns  386.05M
+// shmtx_w_bare_reads(16thread)                    2916.06%     2.77ns  360.87M
+// shmtx_rd_pri_reads(16thread)                    3330.06%     2.43ns  412.11M
+// shmtx_r_bare_reads(16thread)                    2909.05%     2.78ns  360.01M
+// folly_ticket_reads(16thread)                      44.59%   181.21ns    5.52M
+// boost_shared_reads(16thread)                      29.56%   273.40ns    3.66M
+// pthrd_rwlock_reads(16thread)                      48.39%   166.99ns    5.99M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(32thread)                                73.29ns   13.64M
+// shmtx_wr_pri_reads(32thread)                    4417.58%     1.66ns  602.77M
+// shmtx_w_bare_reads(32thread)                    4463.71%     1.64ns  609.06M
+// shmtx_rd_pri_reads(32thread)                    4777.84%     1.53ns  651.92M
+// shmtx_r_bare_reads(32thread)                    4312.45%     1.70ns  588.42M
+// folly_ticket_reads(32thread)                      25.56%   286.75ns    3.49M
+// boost_shared_reads(32thread)                      22.08%   331.86ns    3.01M
+// pthrd_rwlock_reads(32thread)                      46.72%   156.87ns    6.37M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(64thread)                                74.92ns   13.35M
+// shmtx_wr_pri_reads(64thread)                    4171.71%     1.80ns  556.83M
+// shmtx_w_bare_reads(64thread)                    3973.49%     1.89ns  530.37M
+// shmtx_rd_pri_reads(64thread)                    4404.73%     1.70ns  587.94M
+// shmtx_r_bare_reads(64thread)                    3985.48%     1.88ns  531.98M
+// folly_ticket_reads(64thread)                      26.07%   287.39ns    3.48M
+// boost_shared_reads(64thread)                      23.59%   317.64ns    3.15M
+// pthrd_rwlock_reads(64thread)                      49.54%   151.24ns    6.61M
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// folly_rwspin(1thread_all_write)                             25.29ns   39.53M
+// shmtx_wr_pri(1thread_all_write)                   96.76%    26.14ns   38.25M
+// shmtx_rd_pri(1thread_all_write)                   96.60%    26.18ns   38.19M
+// folly_ticket(1thread_all_write)                   89.58%    28.24ns   35.42M
+// boost_shared(1thread_all_write)                   17.06%   148.29ns    6.74M
+// pthrd_rwlock(1thread_all_write)                   63.32%    39.95ns   25.03M
+// pthrd_mutex_(1thread_all_write)                   81.38%    31.08ns   32.17M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thread_all_write)                            104.60ns    9.56M
+// shmtx_wr_pri(2thread_all_write)                   48.87%   214.06ns    4.67M
+// shmtx_rd_pri(2thread_all_write)                   42.47%   246.31ns    4.06M
+// folly_ticket(2thread_all_write)                   73.12%   143.05ns    6.99M
+// boost_shared(2thread_all_write)                   24.59%   425.41ns    2.35M
+// pthrd_rwlock(2thread_all_write)                   38.69%   270.37ns    3.70M
+// pthrd_mutex_(2thread_all_write)                  155.45%    67.29ns   14.86M
+// ----------------------------------------------------------------------------
+// folly_rwspin(4thread_all_write)                            166.17ns    6.02M
+// shmtx_wr_pri(4thread_all_write)                   45.40%   366.00ns    2.73M
+// shmtx_rd_pri(4thread_all_write)                   62.81%   264.56ns    3.78M
+// folly_ticket(4thread_all_write)                  118.11%   140.69ns    7.11M
+// boost_shared(4thread_all_write)                    8.78%     1.89us  528.22K
+// pthrd_rwlock(4thread_all_write)                   27.30%   608.59ns    1.64M
+// pthrd_mutex_(4thread_all_write)                   92.18%   180.27ns    5.55M
+// ----------------------------------------------------------------------------
+// folly_rwspin(8thread_all_write)                            363.10ns    2.75M
+// shmtx_wr_pri(8thread_all_write)                  163.18%   222.51ns    4.49M
+// shmtx_rd_pri(8thread_all_write)                   91.20%   398.11ns    2.51M
+// folly_ticket(8thread_all_write)                  150.11%   241.89ns    4.13M
+// boost_shared(8thread_all_write)                    7.53%     4.82us  207.48K
+// pthrd_rwlock(8thread_all_write)                   57.06%   636.32ns    1.57M
+// pthrd_mutex_(8thread_all_write)                  218.78%   165.96ns    6.03M
+// ----------------------------------------------------------------------------
+// folly_rwspin(16thread_all_write)                           762.75ns    1.31M
+// shmtx_wr_pri(16thread_all_write)                 131.04%   582.08ns    1.72M
+// shmtx_rd_pri(16thread_all_write)                 130.26%   585.57ns    1.71M
+// folly_ticket(16thread_all_write)                 253.39%   301.01ns    3.32M
+// boost_shared(16thread_all_write)                  10.33%     7.38us  135.43K
+// pthrd_rwlock(16thread_all_write)                 141.66%   538.43ns    1.86M
+// pthrd_mutex_(16thread_all_write)                 471.34%   161.83ns    6.18M
+// ----------------------------------------------------------------------------
+// folly_rwspin(32thread_all_write)                             1.42us  705.40K
+// shmtx_wr_pri(32thread_all_write)                 229.36%   618.09ns    1.62M
+// shmtx_rd_pri(32thread_all_write)                 228.78%   619.65ns    1.61M
+// folly_ticket(32thread_all_write)                 326.61%   434.04ns    2.30M
+// boost_shared(32thread_all_write)                  18.65%     7.60us  131.59K
+// pthrd_rwlock(32thread_all_write)                 261.56%   542.00ns    1.85M
+// pthrd_mutex_(32thread_all_write)                 946.65%   149.75ns    6.68M
+// ----------------------------------------------------------------------------
+// folly_rwspin(64thread_all_write)                             1.83us  545.94K
+// shmtx_wr_pri(64thread_all_write)                 248.08%   738.34ns    1.35M
+// shmtx_rd_pri(64thread_all_write)                 249.47%   734.23ns    1.36M
+// folly_ticket(64thread_all_write)                 342.38%   535.00ns    1.87M
+// boost_shared(64thread_all_write)                  23.95%     7.65us  130.75K
+// pthrd_rwlock(64thread_all_write)                 318.32%   575.42ns    1.74M
+// pthrd_mutex_(64thread_all_write)                1288.43%   142.16ns    7.03M
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// folly_rwspin(1thread_10pct_write)                           19.13ns   52.28M
+// shmtx_wr_pri(1thread_10pct_write)                 80.47%    23.77ns   42.07M
+// shmtx_rd_pri(1thread_10pct_write)                 80.63%    23.72ns   42.15M
+// folly_ticket(1thread_10pct_write)                 69.33%    27.59ns   36.25M
+// boost_shared(1thread_10pct_write)                 12.46%   153.53ns    6.51M
+// pthrd_rwlock(1thread_10pct_write)                 46.35%    41.27ns   24.23M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thread_10pct_write)                          142.93ns    7.00M
+// shmtx_wr_pri(2thread_10pct_write)                165.37%    86.43ns   11.57M
+// shmtx_rd_pri(2thread_10pct_write)                159.35%    89.70ns   11.15M
+// folly_ticket(2thread_10pct_write)                129.31%   110.53ns    9.05M
+// boost_shared(2thread_10pct_write)                 39.42%   362.54ns    2.76M
+// pthrd_rwlock(2thread_10pct_write)                 87.87%   162.65ns    6.15M
+// ----------------------------------------------------------------------------
+// folly_rwspin(4thread_10pct_write)                          197.39ns    5.07M
+// shmtx_wr_pri(4thread_10pct_write)                171.06%   115.39ns    8.67M
+// shmtx_rd_pri(4thread_10pct_write)                139.86%   141.13ns    7.09M
+// folly_ticket(4thread_10pct_write)                129.34%   152.62ns    6.55M
+// boost_shared(4thread_10pct_write)                 16.99%     1.16us  860.70K
+// pthrd_rwlock(4thread_10pct_write)                 47.65%   414.28ns    2.41M
+// ----------------------------------------------------------------------------
+// folly_rwspin(8thread_10pct_write)                          392.62ns    2.55M
+// shmtx_wr_pri(8thread_10pct_write)                273.40%   143.61ns    6.96M
+// shmtx_rd_pri(8thread_10pct_write)                194.52%   201.84ns    4.95M
+// folly_ticket(8thread_10pct_write)                189.91%   206.75ns    4.84M
+// boost_shared(8thread_10pct_write)                 16.84%     2.33us  429.03K
+// pthrd_rwlock(8thread_10pct_write)                 87.03%   451.14ns    2.22M
+// ----------------------------------------------------------------------------
+// folly_rwspin(16thread_10pct_write)                         794.93ns    1.26M
+// shmtx_wr_pri(16thread_10pct_write)               352.64%   225.43ns    4.44M
+// shmtx_rd_pri(16thread_10pct_write)               295.42%   269.09ns    3.72M
+// folly_ticket(16thread_10pct_write)               296.11%   268.46ns    3.72M
+// boost_shared(16thread_10pct_write)                17.04%     4.66us  214.39K
+// pthrd_rwlock(16thread_10pct_write)               176.40%   450.64ns    2.22M
+// ----------------------------------------------------------------------------
+// folly_rwspin(32thread_10pct_write)                         821.14ns    1.22M
+// shmtx_wr_pri(32thread_10pct_write)               355.74%   230.82ns    4.33M
+// shmtx_rd_pri(32thread_10pct_write)               320.09%   256.53ns    3.90M
+// folly_ticket(32thread_10pct_write)               262.01%   313.41ns    3.19M
+// boost_shared(32thread_10pct_write)                 8.15%    10.08us   99.20K
+// pthrd_rwlock(32thread_10pct_write)               175.15%   468.83ns    2.13M
+// ----------------------------------------------------------------------------
+// folly_rwspin(64thread_10pct_write)                           1.20us  836.33K
+// shmtx_wr_pri(64thread_10pct_write)               437.20%   273.49ns    3.66M
+// shmtx_rd_pri(64thread_10pct_write)               438.80%   272.49ns    3.67M
+// folly_ticket(64thread_10pct_write)               254.51%   469.82ns    2.13M
+// boost_shared(64thread_10pct_write)                 6.05%    19.78us   50.56K
+// pthrd_rwlock(64thread_10pct_write)               254.24%   470.30ns    2.13M
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// folly_rwspin(1thread_1pct_write)                            18.60ns   53.76M
+// shmtx_wr_pri(1thread_1pct_write)                  79.07%    23.52ns   42.51M
+// shmtx_w_bare(1thread_1pct_write)                  66.09%    28.15ns   35.53M
+// shmtx_rd_pri(1thread_1pct_write)                  79.21%    23.48ns   42.58M
+// shmtx_r_bare(1thread_1pct_write)                  65.98%    28.19ns   35.47M
+// folly_ticket(1thread_1pct_write)                  67.69%    27.48ns   36.39M
+// boost_shared(1thread_1pct_write)                  12.17%   152.88ns    6.54M
+// pthrd_rwlock(1thread_1pct_write)                  45.04%    41.30ns   24.22M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thread_1pct_write)                           128.42ns    7.79M
+// shmtx_wr_pri(2thread_1pct_write)                 347.63%    36.94ns   27.07M
+// shmtx_w_bare(2thread_1pct_write)                 475.37%    27.02ns   37.02M
+// shmtx_rd_pri(2thread_1pct_write)                 312.94%    41.04ns   24.37M
+// shmtx_r_bare(2thread_1pct_write)                 149.38%    85.97ns   11.63M
+// folly_ticket(2thread_1pct_write)                 147.88%    86.84ns   11.52M
+// boost_shared(2thread_1pct_write)                  45.50%   282.24ns    3.54M
+// pthrd_rwlock(2thread_1pct_write)                 129.88%    98.88ns   10.11M
+// ----------------------------------------------------------------------------
+// folly_rwspin(4thread_1pct_write)                           148.88ns    6.72M
+// shmtx_wr_pri(4thread_1pct_write)                 504.03%    29.54ns   33.86M
+// shmtx_w_bare(4thread_1pct_write)                 471.63%    31.57ns   31.68M
+// shmtx_rd_pri(4thread_1pct_write)                 291.84%    51.01ns   19.60M
+// shmtx_r_bare(4thread_1pct_write)                  81.41%   182.86ns    5.47M
+// folly_ticket(4thread_1pct_write)                 114.59%   129.92ns    7.70M
+// boost_shared(4thread_1pct_write)                  26.70%   557.56ns    1.79M
+// pthrd_rwlock(4thread_1pct_write)                  64.46%   230.97ns    4.33M
+// ----------------------------------------------------------------------------
+// folly_rwspin(8thread_1pct_write)                           213.06ns    4.69M
+// shmtx_wr_pri(8thread_1pct_write)                 734.88%    28.99ns   34.49M
+// shmtx_w_bare(8thread_1pct_write)                 676.88%    31.48ns   31.77M
+// shmtx_rd_pri(8thread_1pct_write)                 196.93%   108.19ns    9.24M
+// shmtx_r_bare(8thread_1pct_write)                  99.35%   214.46ns    4.66M
+// folly_ticket(8thread_1pct_write)                 120.84%   176.31ns    5.67M
+// boost_shared(8thread_1pct_write)                  28.51%   747.36ns    1.34M
+// pthrd_rwlock(8thread_1pct_write)                  88.85%   239.81ns    4.17M
+// ----------------------------------------------------------------------------
+// folly_rwspin(16thread_1pct_write)                          481.61ns    2.08M
+// shmtx_wr_pri(16thread_1pct_write)               1204.17%    40.00ns   25.00M
+// shmtx_w_bare(16thread_1pct_write)               1241.61%    38.79ns   25.78M
+// shmtx_rd_pri(16thread_1pct_write)                315.61%   152.60ns    6.55M
+// shmtx_r_bare(16thread_1pct_write)                211.23%   228.00ns    4.39M
+// folly_ticket(16thread_1pct_write)                227.88%   211.35ns    4.73M
+// boost_shared(16thread_1pct_write)                 34.17%     1.41us  709.47K
+// pthrd_rwlock(16thread_1pct_write)                210.97%   228.28ns    4.38M
+// ----------------------------------------------------------------------------
+// folly_rwspin(32thread_1pct_write)                          382.40ns    2.62M
+// shmtx_wr_pri(32thread_1pct_write)                984.99%    38.82ns   25.76M
+// shmtx_w_bare(32thread_1pct_write)                957.41%    39.94ns   25.04M
+// shmtx_rd_pri(32thread_1pct_write)                248.87%   153.65ns    6.51M
+// shmtx_r_bare(32thread_1pct_write)                175.33%   218.11ns    4.58M
+// folly_ticket(32thread_1pct_write)                140.50%   272.18ns    3.67M
+// boost_shared(32thread_1pct_write)                 12.67%     3.02us  331.22K
+// pthrd_rwlock(32thread_1pct_write)                172.70%   221.42ns    4.52M
+// ----------------------------------------------------------------------------
+// folly_rwspin(64thread_1pct_write)                          448.64ns    2.23M
+// shmtx_wr_pri(64thread_1pct_write)               1136.53%    39.47ns   25.33M
+// shmtx_w_bare(64thread_1pct_write)               1037.84%    43.23ns   23.13M
+// shmtx_rd_pri(64thread_1pct_write)                284.52%   157.68ns    6.34M
+// shmtx_r_bare(64thread_1pct_write)                216.51%   207.21ns    4.83M
+// folly_ticket(64thread_1pct_write)                114.00%   393.54ns    2.54M
+// boost_shared(64thread_1pct_write)                  8.29%     5.41us  184.85K
+// pthrd_rwlock(64thread_1pct_write)                207.19%   216.53ns    4.62M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thr_2lock_50pct_write)                        10.84ns   92.23M
+// shmtx_wr_pri(2thr_2lock_50pct_write)              85.21%    12.72ns   78.59M
+// shmtx_rd_pri(2thr_2lock_50pct_write)              84.80%    12.79ns   78.21M
+// folly_rwspin(4thr_4lock_50pct_write)                         5.33ns  187.76M
+// shmtx_wr_pri(4thr_4lock_50pct_write)              84.84%     6.28ns  159.30M
+// shmtx_rd_pri(4thr_4lock_50pct_write)              84.38%     6.31ns  158.42M
+// folly_rwspin(8thr_8lock_50pct_write)                         2.63ns  379.54M
+// shmtx_wr_pri(8thr_8lock_50pct_write)              84.30%     3.13ns  319.97M
+// shmtx_rd_pri(8thr_8lock_50pct_write)              84.35%     3.12ns  320.16M
+// folly_rwspin(16thr_16lock_50pct_write)                       1.31ns  760.73M
+// shmtx_wr_pri(16thr_16lock_50pct_write)            83.58%     1.57ns  635.80M
+// shmtx_rd_pri(16thr_16lock_50pct_write)            83.72%     1.57ns  636.89M
+// folly_rwspin(32thr_32lock_50pct_write)                       1.19ns  838.77M
+// shmtx_wr_pri(32thr_32lock_50pct_write)            89.84%     1.33ns  753.55M
+// shmtx_rd_pri(32thr_32lock_50pct_write)            89.39%     1.33ns  749.82M
+// folly_rwspin(64thr_64lock_50pct_write)                       1.39ns  718.11M
+// shmtx_wr_pri(64thr_64lock_50pct_write)            91.89%     1.52ns  659.90M
+// shmtx_rd_pri(64thr_64lock_50pct_write)            91.08%     1.53ns  654.04M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thr_2lock_10pct_write)                        10.25ns   97.53M
+// shmtx_wr_pri(2thr_2lock_10pct_write)              84.23%    12.17ns   82.14M
+// shmtx_rd_pri(2thr_2lock_10pct_write)              84.03%    12.20ns   81.96M
+// folly_rwspin(4thr_4lock_10pct_write)                         5.05ns  197.98M
+// shmtx_wr_pri(4thr_4lock_10pct_write)              84.01%     6.01ns  166.31M
+// shmtx_rd_pri(4thr_4lock_10pct_write)              83.98%     6.01ns  166.27M
+// folly_rwspin(8thr_8lock_10pct_write)                         2.46ns  405.97M
+// shmtx_wr_pri(8thr_8lock_10pct_write)              82.52%     2.98ns  335.03M
+// shmtx_rd_pri(8thr_8lock_10pct_write)              82.47%     2.99ns  334.82M
+// folly_rwspin(16thr_16lock_10pct_write)                       1.23ns  813.48M
+// shmtx_wr_pri(16thr_16lock_10pct_write)            82.08%     1.50ns  667.72M
+// shmtx_rd_pri(16thr_16lock_10pct_write)            81.53%     1.51ns  663.23M
+// folly_rwspin(32thr_32lock_10pct_write)                       1.20ns  836.43M
+// shmtx_wr_pri(32thr_32lock_10pct_write)            91.52%     1.31ns  765.47M
+// shmtx_rd_pri(32thr_32lock_10pct_write)            91.87%     1.30ns  768.45M
+// folly_rwspin(64thr_64lock_10pct_write)                       1.39ns  721.74M
+// shmtx_wr_pri(64thr_64lock_10pct_write)            92.04%     1.51ns  664.28M
+// shmtx_rd_pri(64thr_64lock_10pct_write)            92.57%     1.50ns  668.15M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thr_2lock_1pct_write)                         10.13ns   98.71M
+// shmtx_wr_pri(2thr_2lock_1pct_write)               83.59%    12.12ns   82.51M
+// shmtx_rd_pri(2thr_2lock_1pct_write)               83.59%    12.12ns   82.51M
+// folly_rwspin(4thr_4lock_1pct_write)                          4.96ns  201.67M
+// shmtx_wr_pri(4thr_4lock_1pct_write)               82.87%     5.98ns  167.13M
+// shmtx_rd_pri(4thr_4lock_1pct_write)               83.05%     5.97ns  167.48M
+// folly_rwspin(8thr_8lock_1pct_write)                          2.44ns  409.64M
+// shmtx_wr_pri(8thr_8lock_1pct_write)               82.46%     2.96ns  337.79M
+// shmtx_rd_pri(8thr_8lock_1pct_write)               82.40%     2.96ns  337.55M
+// folly_rwspin(16thr_16lock_1pct_write)                        1.22ns  821.15M
+// shmtx_wr_pri(16thr_16lock_1pct_write)             81.63%     1.49ns  670.29M
+// shmtx_rd_pri(16thr_16lock_1pct_write)             81.65%     1.49ns  670.50M
+// folly_rwspin(32thr_32lock_1pct_write)                        1.20ns  832.88M
+// shmtx_wr_pri(32thr_32lock_1pct_write)             92.22%     1.30ns  768.06M
+// shmtx_rd_pri(32thr_32lock_1pct_write)             92.21%     1.30ns  768.01M
+// folly_rwspin(64thr_64lock_1pct_write)                        1.38ns  726.10M
+// shmtx_wr_pri(64thr_64lock_1pct_write)             92.24%     1.49ns  669.75M
+// shmtx_rd_pri(64thr_64lock_1pct_write)             92.13%     1.49ns  668.95M
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// folly_rwspin_ping_pong(burn0)                              503.21ns    1.99M
+// shmtx_w_bare_ping_pong(burn0)                     79.13%   635.96ns    1.57M
+// shmtx_r_bare_ping_pong(burn0)                     59.08%   851.81ns    1.17M
+// folly_ticket_ping_pong(burn0)                     60.50%   831.77ns    1.20M
+// boost_shared_ping_pong(burn0)                      4.46%    11.28us   88.65K
+// pthrd_rwlock_ping_pong(burn0)                      6.86%     7.34us  136.27K
+// ----------------------------------------------------------------------------
+// folly_rwspin_ping_pong(burn100k)                           685.00ns    1.46M
+// shmtx_w_bare_ping_pong(burn100k)                 100.05%   684.65ns    1.46M
+// shmtx_r_bare_ping_pong(burn100k)                  99.93%   685.51ns    1.46M
+// folly_ticket_ping_pong(burn100k)                  99.32%   689.72ns    1.45M
+// boost_shared_ping_pong(burn100k)                  56.59%     1.21us  826.06K
+// pthrd_rwlock_ping_pong(burn100k)                  58.32%     1.17us  851.41K
+// ----------------------------------------------------------------------------
+// folly_rwspin_ping_pong(burn300k)                             2.15us  464.20K
+// shmtx_w_bare_ping_pong(burn300k)                 101.02%     2.13us  468.93K
+// shmtx_r_bare_ping_pong(burn300k)                 103.95%     2.07us  482.55K
+// folly_ticket_ping_pong(burn300k)                 104.06%     2.07us  483.05K
+// boost_shared_ping_pong(burn300k)                  86.36%     2.49us  400.86K
+// pthrd_rwlock_ping_pong(burn300k)                  87.30%     2.47us  405.25K
+// ----------------------------------------------------------------------------
+// folly_rwspin_ping_pong(burn1M)                             675.20ns    1.48M
+// shmtx_w_bare_ping_pong(burn1M)                    99.73%   677.02ns    1.48M
+// shmtx_r_bare_ping_pong(burn1M)                    99.23%   680.45ns    1.47M
+// folly_ticket_ping_pong(burn1M)                    97.85%   690.01ns    1.45M
+// boost_shared_ping_pong(burn1M)                    93.17%   724.67ns    1.38M
+// pthrd_rwlock_ping_pong(burn1M)                    91.84%   735.22ns    1.36M
+// ============================================================================
+
+int main(int argc, char** argv) {
+  (void)folly_rwspin_reads;
+  (void)shmtx_wr_pri_reads;
+  (void)shmtx_w_bare_reads;
+  (void)shmtx_rd_pri_reads;
+  (void)shmtx_r_bare_reads;
+  (void)folly_ticket_reads;
+  (void)boost_shared_reads;
+  (void)pthrd_rwlock_reads;
+  (void)folly_rwspin;
+  (void)shmtx_wr_pri;
+  (void)shmtx_w_bare;
+  (void)shmtx_rd_pri;
+  (void)shmtx_r_bare;
+  (void)folly_ticket;
+  (void)boost_shared;
+  (void)pthrd_rwlock;
+  (void)pthrd_mutex_;
+  (void)folly_rwspin_ping_pong;
+  (void)shmtx_w_bare_ping_pong;
+  (void)shmtx_r_bare_ping_pong;
+  (void)folly_ticket_ping_pong;
+  (void)boost_shared_ping_pong;
+  (void)pthrd_rwlock_ping_pong;
+
+  testing::InitGoogleTest(&argc, argv);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  int rv = RUN_ALL_TESTS();
+  folly::runBenchmarksOnFlag();
+  return rv;
+}
author	Nathan Bronson <ngbronson@fb.com>
	Wed, 11 Mar 2015 05:21:54 +0000 (22:21 -0700)
committer	Andre Azevedo <aap@fb.com>
	Wed, 18 Mar 2015 02:50:35 +0000 (19:50 -0700)
folly/Makefile.am		patch \| blob \| history
folly/experimental/SharedMutex.cpp	[new file with mode: 0644]	patch \| blob
folly/experimental/SharedMutex.h	[new file with mode: 0644]	patch \| blob
folly/experimental/test/SharedMutexTest.cpp	[new file with mode: 0644]	patch \| blob