From: Nathan Bronson Date: Wed, 17 Jun 2015 21:27:06 +0000 (-0700) Subject: move SharedMutex from folly/experimental to folly X-Git-Tag: v0.47.0~5 X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=186bd8ade9668acd931201b4913d3c2af1f9535f;p=folly.git move SharedMutex from folly/experimental to folly Summary: SharedMutex has been in heavy production use for a while with no bugs reported and no API changes requested, it is no longer experimental. Reviewed By: @yfeldblum Differential Revision: D2165275 --- diff --git a/folly/Makefile.am b/folly/Makefile.am index a682e012..bdfbe6d5 100644 --- a/folly/Makefile.am +++ b/folly/Makefile.am @@ -109,7 +109,6 @@ nobase_follyinclude_HEADERS = \ experimental/JSONSchema.h \ experimental/LockFreeRingBuffer.h \ experimental/Select64.h \ - experimental/SharedMutex.h \ experimental/StringKeyedCommon.h \ experimental/StringKeyedUnorderedMap.h \ experimental/StringKeyedUnorderedSet.h \ @@ -233,6 +232,7 @@ nobase_follyinclude_HEADERS = \ Range.h \ RWSpinLock.h \ ScopeGuard.h \ + SharedMutex.h \ Singleton.h \ Singleton-inl.h \ SmallLocks.h \ @@ -390,8 +390,9 @@ libfolly_la_SOURCES = \ MemoryMapping.cpp \ Random.cpp \ SafeAssert.cpp \ - SocketAddress.cpp \ + SharedMutex.cpp \ Singleton.cpp \ + SocketAddress.cpp \ SpookyHashV1.cpp \ SpookyHashV2.cpp \ stats/Instantiations.cpp \ @@ -410,7 +411,6 @@ libfolly_la_SOURCES = \ experimental/io/FsUtil.cpp \ experimental/JSONSchema.cpp \ experimental/Select64.cpp \ - experimental/SharedMutex.cpp \ experimental/TestUtil.cpp \ wangle/acceptor/Acceptor.cpp \ wangle/acceptor/ConnectionManager.cpp \ diff --git a/folly/SharedMutex.cpp b/folly/SharedMutex.cpp new file mode 100644 index 00000000..3a3ac17d --- /dev/null +++ b/folly/SharedMutex.cpp @@ -0,0 +1,22 @@ +/* + * Copyright 2015 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "SharedMutex.h" + +COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE( + folly::SharedMutexReadPriority); +COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE( + folly::SharedMutexWritePriority); diff --git a/folly/SharedMutex.h b/folly/SharedMutex.h new file mode 100644 index 00000000..8bfd3262 --- /dev/null +++ b/folly/SharedMutex.h @@ -0,0 +1,1366 @@ +/* + * Copyright 2015 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// @author Nathan Bronson (ngbronson@fb.com) + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +// SharedMutex is a reader-writer lock. 
+// It is small, very fast, scalable
+// on multi-core, and suitable for use when readers or writers may block.
+// Unlike most other reader-writer locks, its throughput with concurrent
+// readers scales linearly; it is able to acquire and release the lock
+// in shared mode without cache line ping-ponging. It is suitable for
+// a wide range of lock hold times because it starts with spinning,
+// proceeds to using sched_yield with a preemption heuristic, and then
+// waits using futex and precise wakeups.
+//
+// SharedMutex provides all of the methods of folly::RWSpinLock,
+// boost::shared_mutex, boost::upgrade_mutex, and C++14's
+// std::shared_timed_mutex. All operations that can block are available
+// in try, try-for, and try-until (system_clock or steady_clock) versions.
+//
+// SharedMutexReadPriority gives priority to readers,
+// SharedMutexWritePriority gives priority to writers. SharedMutex is an
+// alias for SharedMutexWritePriority, because writer starvation is more
+// likely than reader starvation for the read-heavy workloads targeted
+// by SharedMutex.
+//
+// In my tests SharedMutex is as good as or better than the other
+// reader-writer locks in use at Facebook for almost all use cases,
+// sometimes by a wide margin. (If it is rare that there are actually
+// concurrent readers then RWSpinLock can be a few nanoseconds faster.)
+// I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
+// boost::shared_mutex, pthread_rwlock_t, and an RWLock that internally uses
+// spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
+// (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
+// It is generally as good as or better than the rest when evaluating size,
+// speed, scalability, or latency outliers. In the corner cases where
+// it is not the fastest (such as single-threaded use or heavy write
+// contention) it is never very much worse than the best. See the bottom
+// of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
+//
+// Comparison to folly::RWSpinLock:
+//
+// * SharedMutex is faster than RWSpinLock when there are actually
+//   concurrent read accesses (sometimes much faster), and ~5 nanoseconds
+//   slower when there is not actually any contention. SharedMutex is
+//   faster in every (benchmarked) scenario where the shared mode of
+//   the lock is actually useful.
+//
+// * Concurrent shared access to SharedMutex scales linearly, while total
+//   RWSpinLock throughput drops as more threads try to access the lock
+//   in shared mode. Under very heavy read contention SharedMutex can
+//   be two orders of magnitude faster than RWSpinLock (or any reader-writer
+//   lock that doesn't use striping or deferral).
+//
+// * SharedMutex can safely protect blocking calls, because after an
+//   initial period of spinning it waits using futex().
+//
+// * RWSpinLock prioritizes readers; SharedMutex has both reader- and
+//   writer-priority variants, but defaults to write priority.
+//
+// * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
+//   doesn't. Both semantics are reasonable. The boost documentation
+//   doesn't explicitly talk about this behavior (except by omitting
+//   any statement that those lock modes conflict), but the boost
+//   implementations do allow new readers while the upgradeable mode
+//   is held.
+//   See https://github.com/boostorg/thread/blob/master/
+//   include/boost/thread/pthread/shared_mutex.hpp
+//
+// * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder
+//   (UpgradeableHolder would be even more pedantically correct).
+//   SharedMutex's holders have fewer methods (no reset) and are less
+//   tolerant (promotion and downgrade crash if the donor doesn't own
+//   the lock, and you must use the default constructor rather than
+//   passing a nullptr to the pointer constructor).
+//
+// Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
+// and "shared" modes. At all times num_threads_holding_exclusive +
+// num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
+// 0 || num_threads_holding_shared == 0. RWSpinLock has the additional
+// constraint that num_threads_holding_shared cannot increase while
+// num_threads_holding_upgrade is non-zero.
+//
+// Comparison to the internal RWLock:
+//
+// * SharedMutex doesn't allow a maximum reader count to be configured,
+//   so it can't be used as a semaphore in the same way as RWLock.
+//
+// * SharedMutex is 4 bytes, RWLock is 256.
+//
+// * SharedMutex is as fast as or faster than RWLock in all of my
+//   microbenchmarks, and has positive rather than negative scalability.
+//
+// * RWLock and SharedMutex are both writer-priority locks.
+//
+// * SharedMutex avoids latency outliers as well as RWLock does.
+//
+// * SharedMutex uses different names (t != 0 below):
+//
+//     RWLock::lock(0)    => SharedMutex::lock()
+//
+//     RWLock::lock(t)    => SharedMutex::try_lock_for(milliseconds(t))
+//
+//     RWLock::tryLock()  => SharedMutex::try_lock()
+//
+//     RWLock::unlock()   => SharedMutex::unlock()
+//
+//     RWLock::enter(0)   => SharedMutex::lock_shared()
+//
+//     RWLock::enter(t)   =>
+//         SharedMutex::try_lock_shared_for(milliseconds(t))
+//
+//     RWLock::tryEnter() => SharedMutex::try_lock_shared()
+//
+//     RWLock::leave()    => SharedMutex::unlock_shared()
+//
+// * RWLock allows the reader count to be adjusted by a value other
+//   than 1 during enter() or leave(). SharedMutex doesn't currently
+//   implement this feature.
+//
+// * RWLock's methods are marked const, SharedMutex's aren't.
+//
+// Reader-writer locks have the potential to allow concurrent access
+// to shared read-mostly data, but in practice they often provide no
+// improvement over a mutex. The problem is the cache coherence protocol
+// of modern CPUs. Coherence is provided by making sure that when a cache
+// line is written it is present in only one core's cache. Since a memory
+// write is required to acquire a reader-writer lock in shared mode, the
+// cache line holding the lock is invalidated in all of the other caches.
+// This leads to cache misses when another thread wants to acquire or
+// release the lock concurrently. When the RWLock is colocated with the
+// data it protects (common), cache misses can also continue to occur when
+// a thread that already holds the lock tries to read the protected data.
+//
+// Ideally, a reader-writer lock would allow multiple cores to acquire
+// and release the lock in shared mode without incurring any cache misses.
+// This requires that each core record its shared access in a cache line
+// that isn't read or written by other read-locking cores. (Writers will
+// have to check all of the cache lines.) Typical server hardware when
+// this comment was written has 16 L1 caches and cache lines of 64 bytes,
+// so a lock striped over all L1 caches would occupy a prohibitive 1024
+// bytes.
+// Nothing says that we need a separate set of per-core memory
+// locations for each lock, however. Each SharedMutex instance is only
+// 4 bytes, but all locks together share a 2K area in which they make a
+// core-local record of lock acquisitions.
+//
+// SharedMutex's strategy of using a shared set of core-local stripes has
+// a potential downside, because it means that acquisition of any lock in
+// write mode can conflict with acquisition of any lock in shared mode.
+// If a lock instance doesn't actually experience concurrency then this
+// downside will outweigh the upside of improved scalability for readers.
+// To avoid this problem we dynamically detect concurrent accesses to
+// SharedMutex, and don't start using the deferred mode unless we actually
+// observe concurrency. See kNumSharedToStartDeferring.
+//
+// It is explicitly allowed to call unlock_shared() from a different
+// thread than lock_shared(), so long as they are properly paired.
+// unlock_shared() needs to find the location at which lock_shared()
+// recorded the lock, which might be in the lock itself or in any of
+// the shared slots. If you can conveniently pass state from lock
+// acquisition to release then the fastest mechanism is to std::move
+// the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
+// lock_shared(Token&) and unlock_shared(Token&)). The guard or token
+// will tell unlock_shared where in deferredReaders[] to look for the
+// deferred lock. The Token-less version of unlock_shared() works in all
+// cases, but is optimized for the common (no inter-thread handoff) case.
+//
+// In both read- and write-priority mode, a waiting lock() (exclusive mode)
+// only blocks readers after it has waited for an active upgrade lock to be
+// released; until the upgrade lock is released (or upgraded or downgraded)
+// readers will still be able to enter. Preferences about lock acquisition
+// are not guaranteed to be enforced perfectly (even if they were, there
+// is theoretically the chance that a thread could be arbitrarily suspended
+// between calling lock() and SharedMutex code actually getting executed).
+//
+// try_*_for methods always try at least once, even if the duration
+// is zero or negative. The duration type must be compatible with
+// std::chrono::steady_clock. try_*_until methods also always try at
+// least once. std::chrono::system_clock and std::chrono::steady_clock
+// are supported.
+//
+// If you have observed by profiling that your SharedMutexes are getting
+// cache misses on deferredReaders[] due to another SharedMutex user, then
+// you can use the tag type plus the
+// COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE macro to create
+// your own instantiation of the type. The contention threshold (see
+// kNumSharedToStartDeferring) should make this unnecessary in all but the
+// most extreme cases. Make sure to check that the increased icache and
+// dcache footprint of the tagged result is worth it.
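+//
+// Basic usage sketch (illustrative only; the surrounding function and
+// variable names are placeholders and error handling is omitted). It
+// exercises just the holder types and the token-based shared lock
+// declared below:
+//
+//   folly::SharedMutex mutex;   // alias for SharedMutexWritePriority
+//   int sharedValue = 0;        // protected by mutex
+//
+//   void writerThread() {
+//     folly::SharedMutex::WriteHolder guard(mutex);  // exclusive
+//     ++sharedValue;
+//   }
+//
+//   void readerThread() {
+//     folly::SharedMutex::ReadHolder guard(mutex);   // shared
+//     int observed = sharedValue;
+//     (void)observed;
+//   }
+//
+//   void tokenReaderThread() {
+//     // Managing a Token yourself makes unlock_shared() a bit faster and
+//     // allows the acquire/release pair to be split across threads.
+//     folly::SharedMutex::Token token;
+//     mutex.lock_shared(token);
+//     int observed = sharedValue;
+//     (void)observed;
+//     mutex.unlock_shared(token);
+//   }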
+
+namespace folly {
+
+struct SharedMutexToken {
+  enum class Type : uint16_t {
+    INVALID = 0,
+    INLINE_SHARED,
+    DEFERRED_SHARED,
+  };
+
+  Type type_;
+  uint16_t slot_;
+};
+
+template <bool ReaderPriority,
+          typename Tag_ = void,
+          template <typename> class Atom = std::atomic,
+          bool BlockImmediately = false>
+class SharedMutexImpl {
+ public:
+  static constexpr bool kReaderPriority = ReaderPriority;
+  typedef Tag_ Tag;
+
+  typedef SharedMutexToken Token;
+
+  class ReadHolder;
+  class UpgradeHolder;
+  class WriteHolder;
+
+  SharedMutexImpl() : state_(0) {}
+
+  SharedMutexImpl(const SharedMutexImpl&) = delete;
+  SharedMutexImpl(SharedMutexImpl&&) = delete;
+  SharedMutexImpl& operator = (const SharedMutexImpl&) = delete;
+  SharedMutexImpl& operator = (SharedMutexImpl&&) = delete;
+
+  // It is an error to destroy a SharedMutex that still has
+  // any outstanding locks. This is checked if NDEBUG isn't defined.
+  // SharedMutex's exclusive mode can be safely used to guard the lock's
+  // own destruction. If, for example, you acquire the lock in exclusive
+  // mode and then observe that the object containing the lock is no longer
+  // needed, you can unlock() and then immediately destroy the lock.
+  // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
+  // description of why this property needs to be explicitly mentioned.
+  ~SharedMutexImpl() {
+#ifndef NDEBUG
+    auto state = state_.load(std::memory_order_acquire);
+
+    // if a futexWait fails to go to sleep because the value has been
+    // changed, we don't necessarily clean up the wait bits, so it is
+    // possible they will be set here in a correct system
+    assert((state & ~(kWaitingAny | kMayDefer)) == 0);
+    if ((state & kMayDefer) != 0) {
+      for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
+        auto slotValue = deferredReader(slot)->load(std::memory_order_acquire);
+        assert(!slotValueIsThis(slotValue));
+      }
+    }
+#endif
+  }
+
+  void lock() {
+    WaitForever ctx;
+    (void)lockExclusiveImpl(kHasSolo, ctx);
+  }
+
+  bool try_lock() {
+    WaitNever ctx;
+    return lockExclusiveImpl(kHasSolo, ctx);
+  }
+
+  template <class Rep, class Period>
+  bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) {
+    WaitForDuration<Rep, Period> ctx(duration);
+    return lockExclusiveImpl(kHasSolo, ctx);
+  }
+
+  template <class Clock, class Duration>
+  bool try_lock_until(
+      const std::chrono::time_point<Clock, Duration>& absDeadline) {
+    WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
+    return lockExclusiveImpl(kHasSolo, ctx);
+  }
+
+  void unlock() {
+    // It is possible that we have a left-over kWaitingNotS if the last
+    // unlock_shared() that let our matching lock() complete finished
+    // releasing before lock()'s futexWait went to sleep.
Clean it up now + auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE)); + assert((state & ~kWaitingAny) == 0); + wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS); + } + + // Managing the token yourself makes unlock_shared a bit faster + + void lock_shared() { + WaitForever ctx; + (void)lockSharedImpl(nullptr, ctx); + } + + void lock_shared(Token& token) { + WaitForever ctx; + (void)lockSharedImpl(&token, ctx); + } + + bool try_lock_shared() { + WaitNever ctx; + return lockSharedImpl(nullptr, ctx); + } + + bool try_lock_shared(Token& token) { + WaitNever ctx; + return lockSharedImpl(&token, ctx); + } + + template + bool try_lock_shared_for(const std::chrono::duration& duration) { + WaitForDuration ctx(duration); + return lockSharedImpl(nullptr, ctx); + } + + template + bool try_lock_shared_for(const std::chrono::duration& duration, + Token& token) { + WaitForDuration ctx(duration); + return lockSharedImpl(&token, ctx); + } + + template + bool try_lock_shared_until( + const std::chrono::time_point& absDeadline) { + WaitUntilDeadline ctx{absDeadline}; + return lockSharedImpl(nullptr, ctx); + } + + template + bool try_lock_shared_until( + const std::chrono::time_point& absDeadline, + Token& token) { + WaitUntilDeadline ctx{absDeadline}; + return lockSharedImpl(&token, ctx); + } + + void unlock_shared() { + auto state = state_.load(std::memory_order_acquire); + + // kPrevDefer can only be set if HasE or BegunE is set + assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer); + + // lock() strips kMayDefer immediately, but then copies it to + // kPrevDefer so we can tell if the pre-lock() lock_shared() might + // have deferred + if ((state & (kMayDefer | kPrevDefer)) == 0 || + !tryUnlockAnySharedDeferred()) { + // Matching lock_shared() couldn't have deferred, or the deferred + // lock has already been inlined by applyDeferredReaders() + unlockSharedInline(); + } + } + + void unlock_shared(Token& token) { + assert(token.type_ == Token::Type::INLINE_SHARED || + token.type_ == Token::Type::DEFERRED_SHARED); + + if (token.type_ != Token::Type::DEFERRED_SHARED || + !tryUnlockSharedDeferred(token.slot_)) { + unlockSharedInline(); + } +#ifndef NDEBUG + token.type_ = Token::Type::INVALID; +#endif + } + + void unlock_and_lock_shared() { + // We can't use state_ -=, because we need to clear 2 bits (1 of which + // has an uncertain initial state) and set 1 other. We might as well + // clear the relevant wake bits at the same time. Note that since S + // doesn't block the beginning of a transition to E (writer priority + // can cut off new S, reader priority grabs BegunE and blocks deferred + // S) we need to wake E as well. 
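+    // For example: if state == (kHasE | kWaitingS), the CAS below installs
+    // kIncrHasS (one inline reader, every flag bit cleared) and the wake
+    // that follows then releases the blocked readers.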
+ auto state = state_.load(std::memory_order_acquire); + do { + assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE); + } while (!state_.compare_exchange_strong( + state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS)); + if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) { + futexWakeAll(kWaitingE | kWaitingU | kWaitingS); + } + } + + void unlock_and_lock_shared(Token& token) { + unlock_and_lock_shared(); + token.type_ = Token::Type::INLINE_SHARED; + } + + void lock_upgrade() { + WaitForever ctx; + (void)lockUpgradeImpl(ctx); + } + + bool try_lock_upgrade() { + WaitNever ctx; + return lockUpgradeImpl(ctx); + } + + template + bool try_lock_upgrade_for( + const std::chrono::duration& duration) { + WaitForDuration ctx(duration); + return lockUpgradeImpl(ctx); + } + + template + bool try_lock_upgrade_until( + const std::chrono::time_point& absDeadline) { + WaitUntilDeadline ctx{absDeadline}; + return lockUpgradeImpl(ctx); + } + + void unlock_upgrade() { + auto state = (state_ -= kHasU); + assert((state & (kWaitingNotS | kHasSolo)) == 0); + wakeRegisteredWaiters(state, kWaitingE | kWaitingU); + } + + void unlock_upgrade_and_lock() { + // no waiting necessary, so waitMask is empty + WaitForever ctx; + (void)lockExclusiveImpl(0, ctx); + } + + void unlock_upgrade_and_lock_shared() { + auto state = (state_ -= kHasU - kIncrHasS); + assert((state & (kWaitingNotS | kHasSolo)) == 0 && (state & kHasS) != 0); + wakeRegisteredWaiters(state, kWaitingE | kWaitingU); + } + + void unlock_upgrade_and_lock_shared(Token& token) { + unlock_upgrade_and_lock_shared(); + token.type_ = Token::Type::INLINE_SHARED; + } + + void unlock_and_lock_upgrade() { + // We can't use state_ -=, because we need to clear 2 bits (1 of + // which has an uncertain initial state) and set 1 other. We might + // as well clear the relevant wake bits at the same time. + auto state = state_.load(std::memory_order_acquire); + while (true) { + assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE); + auto after = + (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU; + if (state_.compare_exchange_strong(state, after)) { + if ((state & kWaitingS) != 0) { + futexWakeAll(kWaitingS); + } + return; + } + } + } + + private: + typedef typename folly::detail::Futex Futex; + + // Internally we use four kinds of wait contexts. These are structs + // that provide a doWait method that returns true if a futex wake + // was issued that intersects with the waitMask, false if there was a + // timeout and no more waiting should be performed. Spinning occurs + // before the wait context is invoked. 
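+  //
+  // Informally, each of them supplies the same four member functions (a
+  // duck-typed WaitContext template parameter rather than a base class):
+  //
+  //   bool canBlock();      // false => fail fast instead of futexWait-ing
+  //   bool canTimeOut();    // true  => callers must also check shouldTimeOut()
+  //   bool shouldTimeOut(); // the deadline, if any, has already passed
+  //   bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask);
+  //                         // false => timed out while blocked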
+ + struct WaitForever { + bool canBlock() { return true; } + bool canTimeOut() { return false; } + bool shouldTimeOut() { return false; } + + bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { + futex.futexWait(expected, waitMask); + return true; + } + }; + + struct WaitNever { + bool canBlock() { return false; } + bool canTimeOut() { return true; } + bool shouldTimeOut() { return true; } + + bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { + return false; + } + }; + + template + struct WaitForDuration { + std::chrono::duration duration_; + bool deadlineComputed_; + std::chrono::steady_clock::time_point deadline_; + + explicit WaitForDuration(const std::chrono::duration& duration) + : duration_(duration), deadlineComputed_(false) {} + + std::chrono::steady_clock::time_point deadline() { + if (!deadlineComputed_) { + deadline_ = std::chrono::steady_clock::now() + duration_; + deadlineComputed_ = true; + } + return deadline_; + } + + bool canBlock() { return duration_.count() > 0; } + bool canTimeOut() { return true; } + + bool shouldTimeOut() { + return std::chrono::steady_clock::now() > deadline(); + } + + bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { + auto result = futex.futexWaitUntil(expected, deadline(), waitMask); + return result != folly::detail::FutexResult::TIMEDOUT; + } + }; + + template + struct WaitUntilDeadline { + std::chrono::time_point absDeadline_; + + bool canBlock() { return true; } + bool canTimeOut() { return true; } + bool shouldTimeOut() { return Clock::now() > absDeadline_; } + + bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { + auto result = futex.futexWaitUntil(expected, absDeadline_, waitMask); + return result != folly::detail::FutexResult::TIMEDOUT; + } + }; + + // 32 bits of state + Futex state_; + + static constexpr uint32_t kIncrHasS = 1 << 10; + static constexpr uint32_t kHasS = ~(kIncrHasS - 1); + + // If false, then there are definitely no deferred read locks for this + // instance. Cleared after initialization and when exclusively locked. + static constexpr uint32_t kMayDefer = 1 << 9; + + // lock() cleared kMayDefer as soon as it starts draining readers (so + // that it doesn't have to do a second CAS once drain completes), but + // unlock_shared() still needs to know whether to scan deferredReaders[] + // or not. We copy kMayDefer to kPrevDefer when setting kHasE or + // kBegunE, and clear it when clearing those bits. + static constexpr uint32_t kPrevDefer = 1 << 8; + + // Exclusive-locked blocks all read locks and write locks. This bit + // may be set before all readers have finished, but in that case the + // thread that sets it won't return to the caller until all read locks + // have been released. + static constexpr uint32_t kHasE = 1 << 7; + + // Exclusive-draining means that lock() is waiting for existing readers + // to leave, but that new readers may still acquire shared access. + // This is only used in reader priority mode. New readers during + // drain must be inline. The difference between this and kHasU is that + // kBegunE prevents kMayDefer from being set. + static constexpr uint32_t kBegunE = 1 << 6; + + // At most one thread may have either exclusive or upgrade lock + // ownership. Unlike exclusive mode, ownership of the lock in upgrade + // mode doesn't preclude other threads holding the lock in shared mode. 
+ // boost's concept for this doesn't explicitly say whether new shared + // locks can be acquired one lock_upgrade has succeeded, but doesn't + // list that as disallowed. RWSpinLock disallows new read locks after + // lock_upgrade has been acquired, but the boost implementation doesn't. + // We choose the latter. + static constexpr uint32_t kHasU = 1 << 5; + + // There are three states that we consider to be "solo", in that they + // cannot coexist with other solo states. These are kHasE, kBegunE, + // and kHasU. Note that S doesn't conflict with any of these, because + // setting the kHasE is only one of the two steps needed to actually + // acquire the lock in exclusive mode (the other is draining the existing + // S holders). + static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU; + + // Once a thread sets kHasE it needs to wait for the current readers + // to exit the lock. We give this a separate wait identity from the + // waiting to set kHasE so that we can perform partial wakeups (wake + // one instead of wake all). + static constexpr uint32_t kWaitingNotS = 1 << 4; + + // When waking writers we can either wake them all, in which case we + // can clear kWaitingE, or we can call futexWake(1). futexWake tells + // us if anybody woke up, but even if we detect that nobody woke up we + // can't clear the bit after the fact without issuing another wakeup. + // To avoid thundering herds when there are lots of pending lock() + // without needing to call futexWake twice when there is only one + // waiter, kWaitingE actually encodes if we have observed multiple + // concurrent waiters. Tricky: ABA issues on futexWait mean that when + // we see kWaitingESingle we can't assume that there is only one. + static constexpr uint32_t kWaitingESingle = 1 << 2; + static constexpr uint32_t kWaitingEMultiple = 1 << 3; + static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple; + + // kWaitingU is essentially a 1 bit saturating counter. It always + // requires a wakeAll. + static constexpr uint32_t kWaitingU = 1 << 1; + + // All blocked lock_shared() should be awoken, so it is correct (not + // suboptimal) to wakeAll if there are any shared readers. + static constexpr uint32_t kWaitingS = 1 << 0; + + // kWaitingAny is a mask of all of the bits that record the state of + // threads, rather than the state of the lock. It is convenient to be + // able to mask them off during asserts. + static constexpr uint32_t kWaitingAny = + kWaitingNotS | kWaitingE | kWaitingU | kWaitingS; + + // The reader count at which a reader will attempt to use the lock + // in deferred mode. If this value is 2, then the second concurrent + // reader will set kMayDefer and use deferredReaders[]. kMayDefer is + // cleared during exclusive access, so this threshold must be reached + // each time a lock is held in exclusive mode. + static constexpr uint32_t kNumSharedToStartDeferring = 2; + + // The typical number of spins that a thread will wait for a state + // transition. There is no bound on the number of threads that can wait + // for a writer, so we are pretty conservative here to limit the chance + // that we are starving the writer of CPU. Each spin is 6 or 7 nanos, + // almost all of which is in the pause instruction. + static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2; + + // The maximum number of soft yields before falling back to futex. + // If the preemption heuristic is activated we will fall back before + // this. 
A soft yield takes ~900 nanos (two sched_yield plus a call + // to getrusage, with checks of the goal at each step). Soft yields + // aren't compatible with deterministic execution under test (unlike + // futexWaitUntil, which has a capricious but deterministic back end). + static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0; + + // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some + // level of the memory hierarchy is symmetrically divided into k pieces + // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes + // that are the same after integer division by k share that resource. + // Our strategy for deferred readers is to probe up to numSlots/4 slots, + // using the full granularity of AccessSpreader for the start slot + // and then search outward. We can use AccessSpreader::current(n) + // without managing our own spreader if kMaxDeferredReaders <= + // AccessSpreader::kMaxCpus, which is currently 128. + // + // Our 2-socket E5-2660 machines have 8 L1 caches on each chip, + // with 64 byte cache lines. That means we need 64*16 bytes of + // deferredReaders[] to give each L1 its own playground. On x86_64 + // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders + // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If + // kDeferredSearchDistance * kDeferredSeparationFactor <= + // 64 / 8 then we will search only within a single cache line, which + // guarantees we won't have inter-L1 contention. We give ourselves + // a factor of 2 on the core count, which should hold us for a couple + // processor generations. deferredReaders[] is 2048 bytes currently. + static constexpr uint32_t kMaxDeferredReaders = 64; + static constexpr uint32_t kDeferredSearchDistance = 2; + static constexpr uint32_t kDeferredSeparationFactor = 4; + + static_assert(!(kMaxDeferredReaders & (kMaxDeferredReaders - 1)), + "kMaxDeferredReaders must be a power of 2"); + static_assert(!(kDeferredSearchDistance & (kDeferredSearchDistance - 1)), + "kDeferredSearchDistance must be a power of 2"); + + // The number of deferred locks that can be simultaneously acquired + // by a thread via the token-less methods without performing any heap + // allocations. Each of these costs 3 pointers (24 bytes, probably) + // per thread. There's not much point in making this larger than + // kDeferredSearchDistance. + static constexpr uint32_t kTokenStackTLSCapacity = 2; + + // We need to make sure that if there is a lock_shared() + // and lock_shared(token) followed by unlock_shared() and + // unlock_shared(token), the token-less unlock doesn't null + // out deferredReaders[token.slot_]. If we allowed that, then + // unlock_shared(token) wouldn't be able to assume that its lock + // had been inlined by applyDeferredReaders when it finds that + // deferredReaders[token.slot_] no longer points to this. We accomplish + // this by stealing bit 0 from the pointer to record that the slot's + // element has no token, hence our use of uintptr_t in deferredReaders[]. + static constexpr uintptr_t kTokenless = 0x1; + + // This is the starting location for Token-less unlock_shared(). + static FOLLY_TLS uint32_t tls_lastTokenlessSlot; + + // Only indexes divisible by kDeferredSeparationFactor are used. + // If any of those elements points to a SharedMutexImpl, then it + // should be considered that there is a shared lock on that instance. + // See kTokenless. 
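+  //
+  // With the defaults above the arithmetic works out as follows: 64
+  // (kMaxDeferredReaders) * 4 (kDeferredSeparationFactor) slots of 8 bytes
+  // each on x86_64 gives the 2048-byte deferredReaders[] mentioned earlier,
+  // and deferredReader(slot) simply returns
+  // &deferredReaders[slot * kDeferredSeparationFactor].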
+ typedef Atom DeferredReaderSlot; + static DeferredReaderSlot deferredReaders + [kMaxDeferredReaders * + kDeferredSeparationFactor] FOLLY_ALIGN_TO_AVOID_FALSE_SHARING; + + // Performs an exclusive lock, waiting for state_ & waitMask to be + // zero first + template + bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) { + uint32_t state = state_.load(std::memory_order_acquire); + if (LIKELY( + (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 && + state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) { + return true; + } else { + return lockExclusiveImpl(state, preconditionGoalMask, ctx); + } + } + + template + bool lockExclusiveImpl(uint32_t& state, + uint32_t preconditionGoalMask, + WaitContext& ctx) { + while (true) { + if (UNLIKELY((state & preconditionGoalMask) != 0) && + !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) && + ctx.canTimeOut()) { + return false; + } + + uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer; + if (!ReaderPriority || (state & (kMayDefer | kHasS)) == 0) { + // Block readers immediately, either because we are in write + // priority mode or because we can acquire the lock in one + // step. Note that if state has kHasU, then we are doing an + // unlock_upgrade_and_lock() and we should clear it (reader + // priority branch also does this). + after |= (state | kHasE) & ~(kHasU | kMayDefer); + } else { + after |= (state | kBegunE) & ~(kHasU | kMayDefer); + } + if (state_.compare_exchange_strong(state, after)) { + auto before = state; + state = after; + + // If we set kHasE (writer priority) then no new readers can + // arrive. If we set kBegunE then they can still enter, but + // they must be inline. Either way we need to either spin on + // deferredReaders[] slots, or inline them so that we can wait on + // kHasS to zero itself. deferredReaders[] is pointers, which on + // x86_64 are bigger than futex() can handle, so we inline the + // deferred locks instead of trying to futexWait on each slot. + // Readers are responsible for rechecking state_ after recording + // a deferred read to avoid atomicity problems between the state_ + // CAS and applyDeferredReader's reads of deferredReaders[]. + if (UNLIKELY((before & kMayDefer) != 0)) { + applyDeferredReaders(state, ctx); + } + while (true) { + assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0); + if (UNLIKELY((state & kHasS) != 0) && + !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) && + ctx.canTimeOut()) { + // Ugh. We blocked new readers and other writers for a while, + // but were unable to complete. Move on. On the plus side + // we can clear kWaitingNotS because nobody else can piggyback + // on it. 
+ state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS)); + wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS); + return false; + } + + if (ReaderPriority && (state & kHasE) == 0) { + assert((state & kBegunE) != 0); + if (!state_.compare_exchange_strong(state, + (state & ~kBegunE) | kHasE)) { + continue; + } + } + + return true; + } + } + } + } + + template + bool waitForZeroBits(uint32_t& state, + uint32_t goal, + uint32_t waitMask, + WaitContext& ctx) { + uint32_t spinCount = 0; + while (true) { + state = state_.load(std::memory_order_acquire); + if ((state & goal) == 0) { + return true; + } +#if FOLLY_X64 + asm volatile("pause"); +#endif + ++spinCount; + if (UNLIKELY(spinCount >= kMaxSpinCount)) { + return ctx.canBlock() && + yieldWaitForZeroBits(state, goal, waitMask, ctx); + } + } + } + + template + bool yieldWaitForZeroBits(uint32_t& state, + uint32_t goal, + uint32_t waitMask, + WaitContext& ctx) { +#ifdef RUSAGE_THREAD + struct rusage usage; + long before = -1; +#endif + for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount; + ++yieldCount) { + for (int softState = 0; softState < 3; ++softState) { + if (softState < 2) { + std::this_thread::yield(); + } else { +#ifdef RUSAGE_THREAD + getrusage(RUSAGE_THREAD, &usage); +#endif + } + if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) { + return true; + } + if (ctx.shouldTimeOut()) { + return false; + } + } +#ifdef RUSAGE_THREAD + if (before >= 0 && usage.ru_nivcsw >= before + 2) { + // One involuntary csw might just be occasional background work, + // but if we get two in a row then we guess that there is someone + // else who can profitably use this CPU. Fall back to futex + break; + } + before = usage.ru_nivcsw; +#endif + } + return futexWaitForZeroBits(state, goal, waitMask, ctx); + } + + template + bool futexWaitForZeroBits(uint32_t& state, + uint32_t goal, + uint32_t waitMask, + WaitContext& ctx) { + assert(waitMask == kWaitingNotS || waitMask == kWaitingE || + waitMask == kWaitingU || waitMask == kWaitingS); + + while (true) { + state = state_.load(std::memory_order_acquire); + if ((state & goal) == 0) { + return true; + } + + auto after = state; + if (waitMask == kWaitingE) { + if ((state & kWaitingESingle) != 0) { + after |= kWaitingEMultiple; + } else { + after |= kWaitingESingle; + } + } else { + after |= waitMask; + } + + // CAS is better than atomic |= here, because it lets us avoid + // setting the wait flag when the goal is concurrently achieved + if (after != state && !state_.compare_exchange_strong(state, after)) { + continue; + } + + if (!ctx.doWait(state_, after, waitMask)) { + // timed out + return false; + } + } + } + + // Wakes up waiters registered in state_ as appropriate, clearing the + // awaiting bits for anybody that was awoken. Tries to perform direct + // single wakeup of an exclusive waiter if appropriate + void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) { + if (UNLIKELY((state & wakeMask) != 0)) { + wakeRegisteredWaitersImpl(state, wakeMask); + } + } + + void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) { + // If there are multiple lock() pending only one of them will actually + // get to wake up, so issuing futexWakeAll will make a thundering herd. + // There's nothing stopping us from issuing futexWake(1) instead, + // so long as the wait bits are still an accurate reflection of + // the waiters. If we notice (via futexWake's return value) that + // nobody woke up then we can try again with the normal wake-all path. 
+ // Note that we can't just clear the bits at that point; we need to + // clear the bits and then issue another wakeup. + // + // It is possible that we wake an E waiter but an outside S grabs the + // lock instead, at which point we should wake pending U and S waiters. + // Rather than tracking state to make the failing E regenerate the + // wakeup, we just disable the optimization in the case that there + // are waiting U or S that we are eligible to wake. + if ((wakeMask & kWaitingE) == kWaitingE && + (state & wakeMask) == kWaitingE && + state_.futexWake(1, kWaitingE) > 0) { + // somebody woke up, so leave state_ as is and clear it later + return; + } + + if ((state & wakeMask) != 0) { + auto prev = state_.fetch_and(~wakeMask); + if ((prev & wakeMask) != 0) { + futexWakeAll(wakeMask); + } + state = prev & ~wakeMask; + } + } + + void futexWakeAll(uint32_t wakeMask) { + state_.futexWake(std::numeric_limits::max(), wakeMask); + } + + DeferredReaderSlot* deferredReader(uint32_t slot) { + return &deferredReaders[slot * kDeferredSeparationFactor]; + } + + uintptr_t tokenfulSlotValue() { return reinterpret_cast(this); } + + uintptr_t tokenlessSlotValue() { return tokenfulSlotValue() | kTokenless; } + + bool slotValueIsThis(uintptr_t slotValue) { + return (slotValue & ~kTokenless) == tokenfulSlotValue(); + } + + // Clears any deferredReaders[] that point to this, adjusting the inline + // shared lock count to compensate. Does some spinning and yielding + // to avoid the work. Always finishes the application, even if ctx + // times out. + template + void applyDeferredReaders(uint32_t& state, WaitContext& ctx) { + uint32_t slot = 0; + + uint32_t spinCount = 0; + while (true) { + while (!slotValueIsThis( + deferredReader(slot)->load(std::memory_order_acquire))) { + if (++slot == kMaxDeferredReaders) { + return; + } + } +#if FOLLY_X64 + asm("pause"); +#endif + if (UNLIKELY(++spinCount >= kMaxSpinCount)) { + applyDeferredReaders(state, ctx, slot); + return; + } + } + } + + template + void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) { + +#ifdef RUSAGE_THREAD + struct rusage usage; + long before = -1; +#endif + for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount; + ++yieldCount) { + for (int softState = 0; softState < 3; ++softState) { + if (softState < 2) { + std::this_thread::yield(); + } else { +#ifdef RUSAGE_THREAD + getrusage(RUSAGE_THREAD, &usage); +#endif + } + while (!slotValueIsThis( + deferredReader(slot)->load(std::memory_order_acquire))) { + if (++slot == kMaxDeferredReaders) { + return; + } + } + if (ctx.shouldTimeOut()) { + // finish applying immediately on timeout + break; + } + } +#ifdef RUSAGE_THREAD + if (before >= 0 && usage.ru_nivcsw >= before + 2) { + // heuristic says run queue is not empty + break; + } + before = usage.ru_nivcsw; +#endif + } + + uint32_t movedSlotCount = 0; + for (; slot < kMaxDeferredReaders; ++slot) { + auto slotPtr = deferredReader(slot); + auto slotValue = slotPtr->load(std::memory_order_acquire); + if (slotValueIsThis(slotValue) && + slotPtr->compare_exchange_strong(slotValue, 0)) { + ++movedSlotCount; + } + } + + if (movedSlotCount > 0) { + state = (state_ += movedSlotCount * kIncrHasS); + } + assert((state & (kHasE | kBegunE)) != 0); + + // if state + kIncrHasS overflows (off the end of state) then either + // we have 2^(32-9) readers (almost certainly an application bug) + // or we had an underflow (also a bug) + assert(state < state + kIncrHasS); + } + + // It is straightfoward to make a token-less lock_shared() and 
+ // unlock_shared() either by making the token-less version always use + // INLINE_SHARED mode or by removing the token version. Supporting + // deferred operation for both types is trickier than it appears, because + // the purpose of the token it so that unlock_shared doesn't have to + // look in other slots for its deferred lock. Token-less unlock_shared + // might place a deferred lock in one place and then release a different + // slot that was originally used by the token-ful version. If this was + // important we could solve the problem by differentiating the deferred + // locks so that cross-variety release wouldn't occur. The best way + // is probably to steal a bit from the pointer, making deferredLocks[] + // an array of Atom. + + template + bool lockSharedImpl(Token* token, WaitContext& ctx) { + uint32_t state = state_.load(std::memory_order_relaxed); + if ((state & (kHasS | kMayDefer | kHasE)) == 0 && + state_.compare_exchange_strong(state, state + kIncrHasS)) { + if (token != nullptr) { + token->type_ = Token::Type::INLINE_SHARED; + } + return true; + } + return lockSharedImpl(state, token, ctx); + } + + template + bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) { + while (true) { + if (UNLIKELY((state & kHasE) != 0) && + !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) { + return false; + } + + uint32_t slot; + uintptr_t slotValue = 1; // any non-zero value will do + + bool canAlreadyDefer = (state & kMayDefer) != 0; + bool aboveDeferThreshold = + (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS; + bool drainInProgress = ReaderPriority && (state & kBegunE) != 0; + if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) { + // starting point for our empty-slot search, can change after + // calling waitForZeroBits + uint32_t bestSlot = + (uint32_t)folly::detail::AccessSpreader::current( + kMaxDeferredReaders); + + // deferred readers are already enabled, or it is time to + // enable them if we can find a slot + for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) { + slot = bestSlot ^ i; + assert(slot < kMaxDeferredReaders); + slotValue = deferredReader(slot)->load(std::memory_order_relaxed); + if (slotValue == 0) { + // found empty slot + break; + } + } + } + + if (slotValue != 0) { + // not yet deferred, or no empty slots + if (state_.compare_exchange_strong(state, state + kIncrHasS)) { + // successfully recorded the read lock inline + if (token != nullptr) { + token->type_ = Token::Type::INLINE_SHARED; + } + return true; + } + // state is updated, try again + continue; + } + + // record that deferred readers might be in use if necessary + if ((state & kMayDefer) == 0) { + if (!state_.compare_exchange_strong(state, state | kMayDefer)) { + // keep going if CAS failed because somebody else set the bit + // for us + if ((state & (kHasE | kMayDefer)) != kMayDefer) { + continue; + } + } + // state = state | kMayDefer; + } + + // try to use the slot + bool gotSlot = deferredReader(slot)->compare_exchange_strong( + slotValue, + token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue()); + + // If we got the slot, we need to verify that an exclusive lock + // didn't happen since we last checked. If we didn't get the slot we + // need to recheck state_ anyway to make sure we don't waste too much + // work. It is also possible that since we checked state_ someone + // has acquired and released the write lock, clearing kMayDefer. 
+ // Both cases are covered by looking for the readers-possible bit, + // because it is off when the exclusive lock bit is set. + state = state_.load(std::memory_order_acquire); + + if (!gotSlot) { + continue; + } + + if (token == nullptr) { + tls_lastTokenlessSlot = slot; + } + + if ((state & kMayDefer) != 0) { + assert((state & kHasE) == 0); + // success + if (token != nullptr) { + token->type_ = Token::Type::DEFERRED_SHARED; + token->slot_ = (uint16_t)slot; + } + return true; + } + + // release the slot before retrying + if (token == nullptr) { + // We can't rely on slot. Token-less slot values can be freed by + // any unlock_shared(), so we need to do the full deferredReader + // search during unlock. Unlike unlock_shared(), we can't trust + // kPrevDefer here. This deferred lock isn't visible to lock() + // (that's the whole reason we're undoing it) so there might have + // subsequently been an unlock() and lock() with no intervening + // transition to deferred mode. + if (!tryUnlockAnySharedDeferred()) { + unlockSharedInline(); + } + } else { + if (!tryUnlockSharedDeferred(slot)) { + unlockSharedInline(); + } + } + + // We got here not because the lock was unavailable, but because + // we lost a compare-and-swap. Try-lock is typically allowed to + // have spurious failures, but there is no lock efficiency gain + // from exploiting that freedom here. + } + } + + bool tryUnlockAnySharedDeferred() { + auto bestSlot = tls_lastTokenlessSlot; + for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) { + auto slotPtr = deferredReader(bestSlot ^ i); + auto slotValue = slotPtr->load(std::memory_order_relaxed); + if (slotValue == tokenlessSlotValue() && + slotPtr->compare_exchange_strong(slotValue, 0)) { + tls_lastTokenlessSlot = bestSlot ^ i; + return true; + } + } + return false; + } + + bool tryUnlockSharedDeferred(uint32_t slot) { + assert(slot < kMaxDeferredReaders); + auto slotValue = tokenfulSlotValue(); + return deferredReader(slot)->compare_exchange_strong(slotValue, 0); + } + + uint32_t unlockSharedInline() { + uint32_t state = (state_ -= kIncrHasS); + assert((state & (kHasE | kBegunE)) != 0 || state < state + kIncrHasS); + if ((state & kHasS) == 0) { + // Only the second half of lock() can be blocked by a non-zero + // reader count, so that's the only thing we need to wake + wakeRegisteredWaiters(state, kWaitingNotS); + } + return state; + } + + template + bool lockUpgradeImpl(WaitContext& ctx) { + uint32_t state; + do { + if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) { + return false; + } + } while (!state_.compare_exchange_strong(state, state | kHasU)); + return true; + } + + public: + class ReadHolder { + public: + ReadHolder() : lock_(nullptr) {} + + explicit ReadHolder(const SharedMutexImpl* lock) : ReadHolder(*lock) {} + + explicit ReadHolder(const SharedMutexImpl& lock) + : lock_(const_cast(&lock)) { + lock_->lock_shared(token_); + } + + ReadHolder(ReadHolder&& rhs) noexcept : lock_(rhs.lock_), + token_(rhs.token_) { + rhs.lock_ = nullptr; + } + + // Downgrade from upgrade mode + explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) { + assert(upgraded.lock_ != nullptr); + upgraded.lock_ = nullptr; + lock_->unlock_upgrade_and_lock_shared(token_); + } + + // Downgrade from exclusive mode + explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) { + assert(writer.lock_ != nullptr); + writer.lock_ = nullptr; + lock_->unlock_and_lock_shared(token_); + } + + ReadHolder& operator=(ReadHolder&& rhs) noexcept { + std::swap(lock_, rhs.lock_); + 
std::swap(token_, rhs.token_); + return *this; + } + + ReadHolder(const ReadHolder& rhs) = delete; + ReadHolder& operator=(const ReadHolder& rhs) = delete; + + ~ReadHolder() { + if (lock_) { + lock_->unlock_shared(token_); + } + } + + private: + friend class UpgradeHolder; + friend class WriteHolder; + SharedMutexImpl* lock_; + SharedMutexToken token_; + }; + + class UpgradeHolder { + public: + UpgradeHolder() : lock_(nullptr) {} + + explicit UpgradeHolder(SharedMutexImpl* lock) : UpgradeHolder(*lock) {} + + explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) { + lock_->lock_upgrade(); + } + + // Downgrade from exclusive mode + explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) { + assert(writer.lock_ != nullptr); + writer.lock_ = nullptr; + lock_->unlock_and_lock_upgrade(); + } + + UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) { + rhs.lock_ = nullptr; + } + + UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept { + std::swap(lock_, rhs.lock_); + return *this; + } + + UpgradeHolder(const UpgradeHolder& rhs) = delete; + UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete; + + ~UpgradeHolder() { + if (lock_) { + lock_->unlock_upgrade(); + } + } + + private: + friend class WriteHolder; + friend class ReadHolder; + SharedMutexImpl* lock_; + }; + + class WriteHolder { + public: + WriteHolder() : lock_(nullptr) {} + + explicit WriteHolder(SharedMutexImpl* lock) : WriteHolder(*lock) {} + + explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) { + lock_->lock(); + } + + // Promotion from upgrade mode + explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) { + assert(upgrade.lock_ != nullptr); + upgrade.lock_ = nullptr; + lock_->unlock_upgrade_and_lock(); + } + + WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) { + rhs.lock_ = nullptr; + } + + WriteHolder& operator=(WriteHolder&& rhs) noexcept { + std::swap(lock_, rhs.lock_); + return *this; + } + + WriteHolder(const WriteHolder& rhs) = delete; + WriteHolder& operator=(const WriteHolder& rhs) = delete; + + ~WriteHolder() { + if (lock_) { + lock_->unlock(); + } + } + + private: + friend class ReadHolder; + friend class UpgradeHolder; + SharedMutexImpl* lock_; + }; + + // Adapters for Synchronized<> + friend void acquireRead(SharedMutexImpl& lock) { lock.lock_shared(); } + friend void acquireReadWrite(SharedMutexImpl& lock) { lock.lock(); } + friend void releaseRead(SharedMutexImpl& lock) { lock.unlock_shared(); } + friend void releaseReadWrite(SharedMutexImpl& lock) { lock.unlock(); } +}; + +#define COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(type) \ + template <> \ + type::DeferredReaderSlot \ + type::deferredReaders[type::kMaxDeferredReaders * \ + type::kDeferredSeparationFactor] = {}; \ + template <> \ + FOLLY_TLS uint32_t type::tls_lastTokenlessSlot = 0; + +typedef SharedMutexImpl SharedMutexReadPriority; +typedef SharedMutexImpl SharedMutexWritePriority; +typedef SharedMutexWritePriority SharedMutex; + +} // namespace folly diff --git a/folly/experimental/SharedMutex.cpp b/folly/experimental/SharedMutex.cpp deleted file mode 100644 index 3a3ac17d..00000000 --- a/folly/experimental/SharedMutex.cpp +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2015 Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "SharedMutex.h" - -COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE( - folly::SharedMutexReadPriority); -COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE( - folly::SharedMutexWritePriority); diff --git a/folly/experimental/SharedMutex.h b/folly/experimental/SharedMutex.h deleted file mode 100644 index 8bfd3262..00000000 --- a/folly/experimental/SharedMutex.h +++ /dev/null @@ -1,1366 +0,0 @@ -/* - * Copyright 2015 Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// @author Nathan Bronson (ngbronson@fb.com) - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -// SharedMutex is a reader-writer lock. It is small, very fast, scalable -// on multi-core, and suitable for use when readers or writers may block. -// Unlike most other reader-writer locks, its throughput with concurrent -// readers scales linearly; it is able to acquire and release the lock -// in shared mode without cache line ping-ponging. It is suitable for -// a wide range of lock hold times because it starts with spinning, -// proceeds to using sched_yield with a preemption heuristic, and then -// waits using futex and precise wakeups. -// -// SharedMutex provides all of the methods of folly::RWSpinLock, -// boost::shared_mutex, boost::upgrade_mutex, and C++14's -// std::shared_timed_mutex. All operations that can block are available -// in try, try-for, and try-until (system_clock or steady_clock) versions. -// -// SharedMutexReadPriority gives priority to readers, -// SharedMutexWritePriority gives priority to writers. SharedMutex is an -// alias for SharedMutexWritePriority, because writer starvation is more -// likely than reader starvation for the read-heavy workloads targetted -// by SharedMutex. -// -// In my tests SharedMutex is as good or better than the other -// reader-writer locks in use at Facebook for almost all use cases, -// sometimes by a wide margin. (If it is rare that there are actually -// concurrent readers then RWSpinLock can be a few nanoseconds faster.) -// I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64, -// boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses -// spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block. -// (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.) -// It is generally as good or better than the rest when evaluating size, -// speed, scalability, or latency outliers. 
In the corner cases where -// it is not the fastest (such as single-threaded use or heavy write -// contention) it is never very much worse than the best. See the bottom -// of folly/test/SharedMutexTest.cpp for lots of microbenchmark results. -// -// Comparison to folly::RWSpinLock: -// -// * SharedMutex is faster than RWSpinLock when there are actually -// concurrent read accesses (sometimes much faster), and ~5 nanoseconds -// slower when there is not actually any contention. SharedMutex is -// faster in every (benchmarked) scenario where the shared mode of -// the lock is actually useful. -// -// * Concurrent shared access to SharedMutex scales linearly, while total -// RWSpinLock throughput drops as more threads try to access the lock -// in shared mode. Under very heavy read contention SharedMutex can -// be two orders of magnitude faster than RWSpinLock (or any reader -// writer lock that doesn't use striping or deferral). -// -// * SharedMutex can safely protect blocking calls, because after an -// initial period of spinning it waits using futex(). -// -// * RWSpinLock prioritizes readers, SharedMutex has both reader- and -// writer-priority variants, but defaults to write priority. -// -// * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's -// doesn't. Both semantics are reasonable. The boost documentation -// doesn't explicitly talk about this behavior (except by omitting -// any statement that those lock modes conflict), but the boost -// implementations do allow new readers while the upgradeable mode -// is held. See https://github.com/boostorg/thread/blob/master/ -// include/boost/thread/pthread/shared_mutex.hpp -// -// * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder -// (UpgradeableHolder would be even more pedantically correct). -// SharedMutex's holders have fewer methods (no reset) and are less -// tolerant (promotion and downgrade crash if the donor doesn't own -// the lock, and you must use the default constructor rather than -// passing a nullptr to the pointer constructor). -// -// Both SharedMutex and RWSpinLock provide "exclusive", "upgrade", -// and "shared" modes. At all times num_threads_holding_exclusive + -// num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive == -// 0 || num_threads_holding_shared == 0. RWSpinLock has the additional -// constraint that num_threads_holding_shared cannot increase while -// num_threads_holding_upgrade is non-zero. -// -// Comparison to the internal RWLock: -// -// * SharedMutex doesn't allow a maximum reader count to be configured, -// so it can't be used as a semaphore in the same way as RWLock. -// -// * SharedMutex is 4 bytes, RWLock is 256. -// -// * SharedMutex is as fast or faster than RWLock in all of my -// microbenchmarks, and has positive rather than negative scalability. -// -// * RWLock and SharedMutex are both writer priority locks. -// -// * SharedMutex avoids latency outliers as well as RWLock. 
-// -// * SharedMutex uses different names (t != 0 below): -// -// RWLock::lock(0) => SharedMutex::lock() -// -// RWLock::lock(t) => SharedMutex::try_lock_for(milliseconds(t)) -// -// RWLock::tryLock() => SharedMutex::try_lock() -// -// RWLock::unlock() => SharedMutex::unlock() -// -// RWLock::enter(0) => SharedMutex::lock_shared() -// -// RWLock::enter(t) => -// SharedMutex::try_lock_shared_for(milliseconds(t)) -// -// RWLock::tryEnter() => SharedMutex::try_lock_shared() -// -// RWLock::leave() => SharedMutex::unlock_shared() -// -// * RWLock allows the reader count to be adjusted by a value other -// than 1 during enter() or leave(). SharedMutex doesn't currently -// implement this feature. -// -// * RWLock's methods are marked const, SharedMutex's aren't. -// -// Reader-writer locks have the potential to allow concurrent access -// to shared read-mostly data, but in practice they often provide no -// improvement over a mutex. The problem is the cache coherence protocol -// of modern CPUs. Coherence is provided by making sure that when a cache -// line is written it is present in only one core's cache. Since a memory -// write is required to acquire a reader-writer lock in shared mode, the -// cache line holding the lock is invalidated in all of the other caches. -// This leads to cache misses when another thread wants to acquire or -// release the lock concurrently. When the RWLock is colocated with the -// data it protects (common), cache misses can also continue occur when -// a thread that already holds the lock tries to read the protected data. -// -// Ideally, a reader-writer lock would allow multiple cores to acquire -// and release the lock in shared mode without incurring any cache misses. -// This requires that each core records its shared access in a cache line -// that isn't read or written by other read-locking cores. (Writers will -// have to check all of the cache lines.) Typical server hardware when -// this comment was written has 16 L1 caches and cache lines of 64 bytes, -// so a lock striped over all L1 caches would occupy a prohibitive 1024 -// bytes. Nothing says that we need a separate set of per-core memory -// locations for each lock, however. Each SharedMutex instance is only -// 4 bytes, but all locks together share a 2K area in which they make a -// core-local record of lock acquisitions. -// -// SharedMutex's strategy of using a shared set of core-local stripes has -// a potential downside, because it means that acquisition of any lock in -// write mode can conflict with acquisition of any lock in shared mode. -// If a lock instance doesn't actually experience concurrency then this -// downside will outweight the upside of improved scalability for readers. -// To avoid this problem we dynamically detect concurrent accesses to -// SharedMutex, and don't start using the deferred mode unless we actually -// observe concurrency. See kNumSharedToStartDeferring. -// -// It is explicitly allowed to call lock_unshared() from a different -// thread than lock_shared(), so long as they are properly paired. -// lock_unshared() needs to find the location at which lock_shared() -// recorded the lock, which might be in the lock itself or in any of -// the shared slots. If you can conveniently pass state from lock -// acquisition to release then the fastest mechanism is to std::move -// the SharedMutex::ReadHolder instance or an SharedMutex::Token (using -// lock_shared(Token&) and unlock_sahred(Token&)). 
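To make the token mechanism just described concrete, here is a small sketch (not from the diff) of the two release styles. SharedMutexToken and ReadHolder come from this header; the functions and the data they read are illustrative.

#include <folly/SharedMutex.h>

// Token style: the token records where the shared acquisition was noted
// (inline in the state word or in a deferred slot), so the matching
// unlock_shared(token) does not have to search for it.
int readWithToken(folly::SharedMutex& mutex, const int& data) {
  folly::SharedMutexToken token;
  mutex.lock_shared(token);
  int v = data;
  mutex.unlock_shared(token);
  return v;
}

// RAII style: a ReadHolder carries its token internally and can be
// std::move()d to whichever thread will eventually release the lock.
int readWithHolder(folly::SharedMutex& mutex, const int& data) {
  folly::SharedMutex::ReadHolder holder(mutex);
  return data;  // unlock_shared happens when holder is destroyed
}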
The guard or token -// will tell unlock_shared where in deferredReaders[] to look for the -// deferred lock. The Token-less version of unlock_shared() works in all -// cases, but is optimized for the common (no inter-thread handoff) case. -// -// In both read- and write-priority mode, a waiting lock() (exclusive mode) -// only blocks readers after it has waited for an active upgrade lock to be -// released; until the upgrade lock is released (or upgraded or downgraded) -// readers will still be able to enter. Preferences about lock acquisition -// are not guaranteed to be enforced perfectly (even if they were, there -// is theoretically the chance that a thread could be arbitrarily suspended -// between calling lock() and SharedMutex code actually getting executed). -// -// try_*_for methods always try at least once, even if the duration -// is zero or negative. The duration type must be compatible with -// std::chrono::steady_clock. try_*_until methods also always try at -// least once. std::chrono::system_clock and std::chrono::steady_clock -// are supported. -// -// If you have observed by profiling that your SharedMutex-s are getting -// cache misses on deferredReaders[] due to another SharedMutex user, then -// you can use the tag type plus the RWDEFERREDLOCK_DECLARE_STATIC_STORAGE -// macro to create your own instantiation of the type. The contention -// threshold (see kNumSharedToStartDeferring) should make this unnecessary -// in all but the most extreme cases. Make sure to check that the -// increased icache and dcache footprint of the tagged result is worth it. - -namespace folly { - -struct SharedMutexToken { - enum class Type : uint16_t { - INVALID = 0, - INLINE_SHARED, - DEFERRED_SHARED, - }; - - Type type_; - uint16_t slot_; -}; - -template class Atom = std::atomic, - bool BlockImmediately = false> -class SharedMutexImpl { - public: - static constexpr bool kReaderPriority = ReaderPriority; - typedef Tag_ Tag; - - typedef SharedMutexToken Token; - - class ReadHolder; - class UpgradeHolder; - class WriteHolder; - - SharedMutexImpl() : state_(0) {} - - SharedMutexImpl(const SharedMutexImpl&) = delete; - SharedMutexImpl(SharedMutexImpl&&) = delete; - SharedMutexImpl& operator = (const SharedMutexImpl&) = delete; - SharedMutexImpl& operator = (SharedMutexImpl&&) = delete; - - // It is an error to destroy an SharedMutex that still has - // any outstanding locks. This is checked if NDEBUG isn't defined. - // SharedMutex's exclusive mode can be safely used to guard the lock's - // own destruction. If, for example, you acquire the lock in exclusive - // mode and then observe that the object containing the lock is no longer - // needed, you can unlock() and then immediately destroy the lock. - // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a - // description about why this property needs to be explicitly mentioned. 
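A sketch of the destruction pattern described in the preceding paragraph, assuming a hypothetical reference-counted owner. The names here are invented; only the unlock-then-destroy ordering is the point.

#include <folly/SharedMutex.h>

struct Owned {
  folly::SharedMutex mutex;
  int refCount = 1;  // guarded by mutex
};

void release(Owned* obj) {
  obj->mutex.lock();
  bool last = (--obj->refCount == 0);
  obj->mutex.unlock();
  if (last) {
    // The comment above documents that exclusive mode may guard the
    // lock's own destruction, so deleting right after unlock() is the
    // intended usage.
    delete obj;
  }
}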
- ~SharedMutexImpl() { -#ifndef NDEBUG - auto state = state_.load(std::memory_order_acquire); - - // if a futexWait fails to go to sleep because the value has been - // changed, we don't necessarily clean up the wait bits, so it is - // possible they will be set here in a correct system - assert((state & ~(kWaitingAny | kMayDefer)) == 0); - if ((state & kMayDefer) != 0) { - for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) { - auto slotValue = deferredReader(slot)->load(std::memory_order_acquire); - assert(!slotValueIsThis(slotValue)); - } - } -#endif - } - - void lock() { - WaitForever ctx; - (void)lockExclusiveImpl(kHasSolo, ctx); - } - - bool try_lock() { - WaitNever ctx; - return lockExclusiveImpl(kHasSolo, ctx); - } - - template - bool try_lock_for(const std::chrono::duration& duration) { - WaitForDuration ctx(duration); - return lockExclusiveImpl(kHasSolo, ctx); - } - - template - bool try_lock_until( - const std::chrono::time_point& absDeadline) { - WaitUntilDeadline ctx{absDeadline}; - return lockExclusiveImpl(kHasSolo, ctx); - } - - void unlock() { - // It is possible that we have a left-over kWaitingNotS if the last - // unlock_shared() that let our matching lock() complete finished - // releasing before lock()'s futexWait went to sleep. Clean it up now - auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE)); - assert((state & ~kWaitingAny) == 0); - wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS); - } - - // Managing the token yourself makes unlock_shared a bit faster - - void lock_shared() { - WaitForever ctx; - (void)lockSharedImpl(nullptr, ctx); - } - - void lock_shared(Token& token) { - WaitForever ctx; - (void)lockSharedImpl(&token, ctx); - } - - bool try_lock_shared() { - WaitNever ctx; - return lockSharedImpl(nullptr, ctx); - } - - bool try_lock_shared(Token& token) { - WaitNever ctx; - return lockSharedImpl(&token, ctx); - } - - template - bool try_lock_shared_for(const std::chrono::duration& duration) { - WaitForDuration ctx(duration); - return lockSharedImpl(nullptr, ctx); - } - - template - bool try_lock_shared_for(const std::chrono::duration& duration, - Token& token) { - WaitForDuration ctx(duration); - return lockSharedImpl(&token, ctx); - } - - template - bool try_lock_shared_until( - const std::chrono::time_point& absDeadline) { - WaitUntilDeadline ctx{absDeadline}; - return lockSharedImpl(nullptr, ctx); - } - - template - bool try_lock_shared_until( - const std::chrono::time_point& absDeadline, - Token& token) { - WaitUntilDeadline ctx{absDeadline}; - return lockSharedImpl(&token, ctx); - } - - void unlock_shared() { - auto state = state_.load(std::memory_order_acquire); - - // kPrevDefer can only be set if HasE or BegunE is set - assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer); - - // lock() strips kMayDefer immediately, but then copies it to - // kPrevDefer so we can tell if the pre-lock() lock_shared() might - // have deferred - if ((state & (kMayDefer | kPrevDefer)) == 0 || - !tryUnlockAnySharedDeferred()) { - // Matching lock_shared() couldn't have deferred, or the deferred - // lock has already been inlined by applyDeferredReaders() - unlockSharedInline(); - } - } - - void unlock_shared(Token& token) { - assert(token.type_ == Token::Type::INLINE_SHARED || - token.type_ == Token::Type::DEFERRED_SHARED); - - if (token.type_ != Token::Type::DEFERRED_SHARED || - !tryUnlockSharedDeferred(token.slot_)) { - unlockSharedInline(); - } -#ifndef NDEBUG - token.type_ = Token::Type::INVALID; -#endif - } - - void 
unlock_and_lock_shared() { - // We can't use state_ -=, because we need to clear 2 bits (1 of which - // has an uncertain initial state) and set 1 other. We might as well - // clear the relevant wake bits at the same time. Note that since S - // doesn't block the beginning of a transition to E (writer priority - // can cut off new S, reader priority grabs BegunE and blocks deferred - // S) we need to wake E as well. - auto state = state_.load(std::memory_order_acquire); - do { - assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE); - } while (!state_.compare_exchange_strong( - state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS)); - if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) { - futexWakeAll(kWaitingE | kWaitingU | kWaitingS); - } - } - - void unlock_and_lock_shared(Token& token) { - unlock_and_lock_shared(); - token.type_ = Token::Type::INLINE_SHARED; - } - - void lock_upgrade() { - WaitForever ctx; - (void)lockUpgradeImpl(ctx); - } - - bool try_lock_upgrade() { - WaitNever ctx; - return lockUpgradeImpl(ctx); - } - - template - bool try_lock_upgrade_for( - const std::chrono::duration& duration) { - WaitForDuration ctx(duration); - return lockUpgradeImpl(ctx); - } - - template - bool try_lock_upgrade_until( - const std::chrono::time_point& absDeadline) { - WaitUntilDeadline ctx{absDeadline}; - return lockUpgradeImpl(ctx); - } - - void unlock_upgrade() { - auto state = (state_ -= kHasU); - assert((state & (kWaitingNotS | kHasSolo)) == 0); - wakeRegisteredWaiters(state, kWaitingE | kWaitingU); - } - - void unlock_upgrade_and_lock() { - // no waiting necessary, so waitMask is empty - WaitForever ctx; - (void)lockExclusiveImpl(0, ctx); - } - - void unlock_upgrade_and_lock_shared() { - auto state = (state_ -= kHasU - kIncrHasS); - assert((state & (kWaitingNotS | kHasSolo)) == 0 && (state & kHasS) != 0); - wakeRegisteredWaiters(state, kWaitingE | kWaitingU); - } - - void unlock_upgrade_and_lock_shared(Token& token) { - unlock_upgrade_and_lock_shared(); - token.type_ = Token::Type::INLINE_SHARED; - } - - void unlock_and_lock_upgrade() { - // We can't use state_ -=, because we need to clear 2 bits (1 of - // which has an uncertain initial state) and set 1 other. We might - // as well clear the relevant wake bits at the same time. - auto state = state_.load(std::memory_order_acquire); - while (true) { - assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE); - auto after = - (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU; - if (state_.compare_exchange_strong(state, after)) { - if ((state & kWaitingS) != 0) { - futexWakeAll(kWaitingS); - } - return; - } - } - } - - private: - typedef typename folly::detail::Futex Futex; - - // Internally we use four kinds of wait contexts. These are structs - // that provide a doWait method that returns true if a futex wake - // was issued that intersects with the waitMask, false if there was a - // timeout and no more waiting should be performed. Spinning occurs - // before the wait context is invoked. 
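The upgrade operations defined earlier in this class (lock_upgrade, unlock_upgrade_and_lock, unlock_upgrade) support the usual check-then-maybe-write pattern. A brief illustrative sketch, using only methods from this header plus an invented Entry struct:

#include <folly/SharedMutex.h>

// Illustrative lazily-initialized entry; only the locking calls are real.
struct Entry {
  folly::SharedMutex mutex;
  bool initialized = false;
  int value = 0;
};

void ensureInitialized(Entry& e, int v) {
  e.mutex.lock_upgrade();               // excludes writers, admits readers
  if (!e.initialized) {
    e.mutex.unlock_upgrade_and_lock();  // promote to exclusive
    e.value = v;
    e.initialized = true;
    e.mutex.unlock();
  } else {
    e.mutex.unlock_upgrade();
  }
}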
- - struct WaitForever { - bool canBlock() { return true; } - bool canTimeOut() { return false; } - bool shouldTimeOut() { return false; } - - bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { - futex.futexWait(expected, waitMask); - return true; - } - }; - - struct WaitNever { - bool canBlock() { return false; } - bool canTimeOut() { return true; } - bool shouldTimeOut() { return true; } - - bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { - return false; - } - }; - - template - struct WaitForDuration { - std::chrono::duration duration_; - bool deadlineComputed_; - std::chrono::steady_clock::time_point deadline_; - - explicit WaitForDuration(const std::chrono::duration& duration) - : duration_(duration), deadlineComputed_(false) {} - - std::chrono::steady_clock::time_point deadline() { - if (!deadlineComputed_) { - deadline_ = std::chrono::steady_clock::now() + duration_; - deadlineComputed_ = true; - } - return deadline_; - } - - bool canBlock() { return duration_.count() > 0; } - bool canTimeOut() { return true; } - - bool shouldTimeOut() { - return std::chrono::steady_clock::now() > deadline(); - } - - bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { - auto result = futex.futexWaitUntil(expected, deadline(), waitMask); - return result != folly::detail::FutexResult::TIMEDOUT; - } - }; - - template - struct WaitUntilDeadline { - std::chrono::time_point absDeadline_; - - bool canBlock() { return true; } - bool canTimeOut() { return true; } - bool shouldTimeOut() { return Clock::now() > absDeadline_; } - - bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { - auto result = futex.futexWaitUntil(expected, absDeadline_, waitMask); - return result != folly::detail::FutexResult::TIMEDOUT; - } - }; - - // 32 bits of state - Futex state_; - - static constexpr uint32_t kIncrHasS = 1 << 10; - static constexpr uint32_t kHasS = ~(kIncrHasS - 1); - - // If false, then there are definitely no deferred read locks for this - // instance. Cleared after initialization and when exclusively locked. - static constexpr uint32_t kMayDefer = 1 << 9; - - // lock() cleared kMayDefer as soon as it starts draining readers (so - // that it doesn't have to do a second CAS once drain completes), but - // unlock_shared() still needs to know whether to scan deferredReaders[] - // or not. We copy kMayDefer to kPrevDefer when setting kHasE or - // kBegunE, and clear it when clearing those bits. - static constexpr uint32_t kPrevDefer = 1 << 8; - - // Exclusive-locked blocks all read locks and write locks. This bit - // may be set before all readers have finished, but in that case the - // thread that sets it won't return to the caller until all read locks - // have been released. - static constexpr uint32_t kHasE = 1 << 7; - - // Exclusive-draining means that lock() is waiting for existing readers - // to leave, but that new readers may still acquire shared access. - // This is only used in reader priority mode. New readers during - // drain must be inline. The difference between this and kHasU is that - // kBegunE prevents kMayDefer from being set. - static constexpr uint32_t kBegunE = 1 << 6; - - // At most one thread may have either exclusive or upgrade lock - // ownership. Unlike exclusive mode, ownership of the lock in upgrade - // mode doesn't preclude other threads holding the lock in shared mode. 
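The wait contexts above (WaitForever, WaitNever, WaitForDuration, WaitUntilDeadline) are what let one lock implementation back the blocking, try_, try_..._for, and try_..._until entry points. A heavily simplified analogue of the same policy idea, with no futex and plain spinning, and with every name invented for illustration:

#include <atomic>
#include <chrono>
#include <thread>

// Analogue of WaitForever: never gives up.
struct KeepTrying {
  bool shouldTimeOut() const { return false; }
};

// Analogue of WaitUntilDeadline: gives up after an absolute deadline.
struct TryUntil {
  std::chrono::steady_clock::time_point deadline;
  bool shouldTimeOut() const {
    return std::chrono::steady_clock::now() > deadline;
  }
};

// One acquisition loop serves both policies, the way lockExclusiveImpl
// serves lock(), try_lock_for(), and try_lock_until() via its WaitContext.
template <typename WaitPolicy>
bool acquireFlag(std::atomic<bool>& flag, WaitPolicy policy) {
  bool expected = false;
  while (!flag.compare_exchange_weak(expected, true)) {
    if (policy.shouldTimeOut()) {
      return false;
    }
    expected = false;
    std::this_thread::yield();
  }
  return true;
}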
- // boost's concept for this doesn't explicitly say whether new shared - // locks can be acquired one lock_upgrade has succeeded, but doesn't - // list that as disallowed. RWSpinLock disallows new read locks after - // lock_upgrade has been acquired, but the boost implementation doesn't. - // We choose the latter. - static constexpr uint32_t kHasU = 1 << 5; - - // There are three states that we consider to be "solo", in that they - // cannot coexist with other solo states. These are kHasE, kBegunE, - // and kHasU. Note that S doesn't conflict with any of these, because - // setting the kHasE is only one of the two steps needed to actually - // acquire the lock in exclusive mode (the other is draining the existing - // S holders). - static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU; - - // Once a thread sets kHasE it needs to wait for the current readers - // to exit the lock. We give this a separate wait identity from the - // waiting to set kHasE so that we can perform partial wakeups (wake - // one instead of wake all). - static constexpr uint32_t kWaitingNotS = 1 << 4; - - // When waking writers we can either wake them all, in which case we - // can clear kWaitingE, or we can call futexWake(1). futexWake tells - // us if anybody woke up, but even if we detect that nobody woke up we - // can't clear the bit after the fact without issuing another wakeup. - // To avoid thundering herds when there are lots of pending lock() - // without needing to call futexWake twice when there is only one - // waiter, kWaitingE actually encodes if we have observed multiple - // concurrent waiters. Tricky: ABA issues on futexWait mean that when - // we see kWaitingESingle we can't assume that there is only one. - static constexpr uint32_t kWaitingESingle = 1 << 2; - static constexpr uint32_t kWaitingEMultiple = 1 << 3; - static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple; - - // kWaitingU is essentially a 1 bit saturating counter. It always - // requires a wakeAll. - static constexpr uint32_t kWaitingU = 1 << 1; - - // All blocked lock_shared() should be awoken, so it is correct (not - // suboptimal) to wakeAll if there are any shared readers. - static constexpr uint32_t kWaitingS = 1 << 0; - - // kWaitingAny is a mask of all of the bits that record the state of - // threads, rather than the state of the lock. It is convenient to be - // able to mask them off during asserts. - static constexpr uint32_t kWaitingAny = - kWaitingNotS | kWaitingE | kWaitingU | kWaitingS; - - // The reader count at which a reader will attempt to use the lock - // in deferred mode. If this value is 2, then the second concurrent - // reader will set kMayDefer and use deferredReaders[]. kMayDefer is - // cleared during exclusive access, so this threshold must be reached - // each time a lock is held in exclusive mode. - static constexpr uint32_t kNumSharedToStartDeferring = 2; - - // The typical number of spins that a thread will wait for a state - // transition. There is no bound on the number of threads that can wait - // for a writer, so we are pretty conservative here to limit the chance - // that we are starving the writer of CPU. Each spin is 6 or 7 nanos, - // almost all of which is in the pause instruction. - static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2; - - // The maximum number of soft yields before falling back to futex. - // If the preemption heuristic is activated we will fall back before - // this. 
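A rough sketch of the spin, then soft-yield, then block progression that kMaxSpinCount and the soft-yield bound described below control. This sketch uses a condition variable where SharedMutex uses a futex, and all of the names are invented for illustration:

#include <atomic>
#include <condition_variable>
#include <mutex>
#include <thread>

// Hypothetical gate mimicking the three-stage back-off.
struct Gate {
  std::atomic<bool> open{false};
  std::mutex m;
  std::condition_variable cv;

  void waitUntilOpen() {
    // Stage 1: bounded spinning (compare kMaxSpinCount).
    for (int i = 0; i < 1000; ++i) {
      if (open.load(std::memory_order_acquire)) { return; }
    }
    // Stage 2: soft yields (compare the soft-yield bound).
    for (int i = 0; i < 1000; ++i) {
      if (open.load(std::memory_order_acquire)) { return; }
      std::this_thread::yield();
    }
    // Stage 3: really block; SharedMutex waits on its futex here.
    std::unique_lock<std::mutex> lk(m);
    cv.wait(lk, [this] { return open.load(std::memory_order_acquire); });
  }

  void openGate() {
    open.store(true, std::memory_order_release);
    std::lock_guard<std::mutex> lk(m);
    cv.notify_all();
  }
};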
A soft yield takes ~900 nanos (two sched_yield plus a call - // to getrusage, with checks of the goal at each step). Soft yields - // aren't compatible with deterministic execution under test (unlike - // futexWaitUntil, which has a capricious but deterministic back end). - static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0; - - // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some - // level of the memory hierarchy is symmetrically divided into k pieces - // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes - // that are the same after integer division by k share that resource. - // Our strategy for deferred readers is to probe up to numSlots/4 slots, - // using the full granularity of AccessSpreader for the start slot - // and then search outward. We can use AccessSpreader::current(n) - // without managing our own spreader if kMaxDeferredReaders <= - // AccessSpreader::kMaxCpus, which is currently 128. - // - // Our 2-socket E5-2660 machines have 8 L1 caches on each chip, - // with 64 byte cache lines. That means we need 64*16 bytes of - // deferredReaders[] to give each L1 its own playground. On x86_64 - // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders - // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If - // kDeferredSearchDistance * kDeferredSeparationFactor <= - // 64 / 8 then we will search only within a single cache line, which - // guarantees we won't have inter-L1 contention. We give ourselves - // a factor of 2 on the core count, which should hold us for a couple - // processor generations. deferredReaders[] is 2048 bytes currently. - static constexpr uint32_t kMaxDeferredReaders = 64; - static constexpr uint32_t kDeferredSearchDistance = 2; - static constexpr uint32_t kDeferredSeparationFactor = 4; - - static_assert(!(kMaxDeferredReaders & (kMaxDeferredReaders - 1)), - "kMaxDeferredReaders must be a power of 2"); - static_assert(!(kDeferredSearchDistance & (kDeferredSearchDistance - 1)), - "kDeferredSearchDistance must be a power of 2"); - - // The number of deferred locks that can be simultaneously acquired - // by a thread via the token-less methods without performing any heap - // allocations. Each of these costs 3 pointers (24 bytes, probably) - // per thread. There's not much point in making this larger than - // kDeferredSearchDistance. - static constexpr uint32_t kTokenStackTLSCapacity = 2; - - // We need to make sure that if there is a lock_shared() - // and lock_shared(token) followed by unlock_shared() and - // unlock_shared(token), the token-less unlock doesn't null - // out deferredReaders[token.slot_]. If we allowed that, then - // unlock_shared(token) wouldn't be able to assume that its lock - // had been inlined by applyDeferredReaders when it finds that - // deferredReaders[token.slot_] no longer points to this. We accomplish - // this by stealing bit 0 from the pointer to record that the slot's - // element has no token, hence our use of uintptr_t in deferredReaders[]. - static constexpr uintptr_t kTokenless = 0x1; - - // This is the starting location for Token-less unlock_shared(). - static FOLLY_TLS uint32_t tls_lastTokenlessSlot; - - // Only indexes divisible by kDeferredSeparationFactor are used. - // If any of those elements points to a SharedMutexImpl, then it - // should be considered that there is a shared lock on that instance. - // See kTokenless. 
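The deferred slots described above act, in effect, as a striped reader count. The following is a greatly simplified illustration of that idea, not folly's implementation: slot choice is by thread-id hash rather than AccessSpreader, there is no fallback to an inline count, and all names are invented.

#include <array>
#include <atomic>
#include <cstddef>
#include <functional>
#include <thread>

// Hypothetical striped reader count: each reader usually touches only its
// own cache line, so concurrent shared acquisitions don't ping-pong.
struct StripedReaderCount {
  static constexpr std::size_t kStripes = 64;
  struct alignas(64) Slot {
    std::atomic<int> readers{0};
  };
  std::array<Slot, kStripes> slots;

  std::size_t noteReader() {  // analogous to taking a deferred slot
    std::size_t slot =
        std::hash<std::thread::id>{}(std::this_thread::get_id()) % kStripes;
    slots[slot].readers.fetch_add(1, std::memory_order_acq_rel);
    return slot;  // plays the role of the Token
  }

  void dropReader(std::size_t slot) {
    slots[slot].readers.fetch_sub(1, std::memory_order_acq_rel);
  }

  // A writer has to scan every stripe, which is why SharedMutex enables
  // deferral only after it has actually observed concurrent readers.
  bool anyReaders() const {
    for (const Slot& s : slots) {
      if (s.readers.load(std::memory_order_acquire) != 0) {
        return true;
      }
    }
    return false;
  }
};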
- typedef Atom DeferredReaderSlot; - static DeferredReaderSlot deferredReaders - [kMaxDeferredReaders * - kDeferredSeparationFactor] FOLLY_ALIGN_TO_AVOID_FALSE_SHARING; - - // Performs an exclusive lock, waiting for state_ & waitMask to be - // zero first - template - bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) { - uint32_t state = state_.load(std::memory_order_acquire); - if (LIKELY( - (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 && - state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) { - return true; - } else { - return lockExclusiveImpl(state, preconditionGoalMask, ctx); - } - } - - template - bool lockExclusiveImpl(uint32_t& state, - uint32_t preconditionGoalMask, - WaitContext& ctx) { - while (true) { - if (UNLIKELY((state & preconditionGoalMask) != 0) && - !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) && - ctx.canTimeOut()) { - return false; - } - - uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer; - if (!ReaderPriority || (state & (kMayDefer | kHasS)) == 0) { - // Block readers immediately, either because we are in write - // priority mode or because we can acquire the lock in one - // step. Note that if state has kHasU, then we are doing an - // unlock_upgrade_and_lock() and we should clear it (reader - // priority branch also does this). - after |= (state | kHasE) & ~(kHasU | kMayDefer); - } else { - after |= (state | kBegunE) & ~(kHasU | kMayDefer); - } - if (state_.compare_exchange_strong(state, after)) { - auto before = state; - state = after; - - // If we set kHasE (writer priority) then no new readers can - // arrive. If we set kBegunE then they can still enter, but - // they must be inline. Either way we need to either spin on - // deferredReaders[] slots, or inline them so that we can wait on - // kHasS to zero itself. deferredReaders[] is pointers, which on - // x86_64 are bigger than futex() can handle, so we inline the - // deferred locks instead of trying to futexWait on each slot. - // Readers are responsible for rechecking state_ after recording - // a deferred read to avoid atomicity problems between the state_ - // CAS and applyDeferredReader's reads of deferredReaders[]. - if (UNLIKELY((before & kMayDefer) != 0)) { - applyDeferredReaders(state, ctx); - } - while (true) { - assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0); - if (UNLIKELY((state & kHasS) != 0) && - !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) && - ctx.canTimeOut()) { - // Ugh. We blocked new readers and other writers for a while, - // but were unable to complete. Move on. On the plus side - // we can clear kWaitingNotS because nobody else can piggyback - // on it. 
- state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS)); - wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS); - return false; - } - - if (ReaderPriority && (state & kHasE) == 0) { - assert((state & kBegunE) != 0); - if (!state_.compare_exchange_strong(state, - (state & ~kBegunE) | kHasE)) { - continue; - } - } - - return true; - } - } - } - } - - template - bool waitForZeroBits(uint32_t& state, - uint32_t goal, - uint32_t waitMask, - WaitContext& ctx) { - uint32_t spinCount = 0; - while (true) { - state = state_.load(std::memory_order_acquire); - if ((state & goal) == 0) { - return true; - } -#if FOLLY_X64 - asm volatile("pause"); -#endif - ++spinCount; - if (UNLIKELY(spinCount >= kMaxSpinCount)) { - return ctx.canBlock() && - yieldWaitForZeroBits(state, goal, waitMask, ctx); - } - } - } - - template - bool yieldWaitForZeroBits(uint32_t& state, - uint32_t goal, - uint32_t waitMask, - WaitContext& ctx) { -#ifdef RUSAGE_THREAD - struct rusage usage; - long before = -1; -#endif - for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount; - ++yieldCount) { - for (int softState = 0; softState < 3; ++softState) { - if (softState < 2) { - std::this_thread::yield(); - } else { -#ifdef RUSAGE_THREAD - getrusage(RUSAGE_THREAD, &usage); -#endif - } - if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) { - return true; - } - if (ctx.shouldTimeOut()) { - return false; - } - } -#ifdef RUSAGE_THREAD - if (before >= 0 && usage.ru_nivcsw >= before + 2) { - // One involuntary csw might just be occasional background work, - // but if we get two in a row then we guess that there is someone - // else who can profitably use this CPU. Fall back to futex - break; - } - before = usage.ru_nivcsw; -#endif - } - return futexWaitForZeroBits(state, goal, waitMask, ctx); - } - - template - bool futexWaitForZeroBits(uint32_t& state, - uint32_t goal, - uint32_t waitMask, - WaitContext& ctx) { - assert(waitMask == kWaitingNotS || waitMask == kWaitingE || - waitMask == kWaitingU || waitMask == kWaitingS); - - while (true) { - state = state_.load(std::memory_order_acquire); - if ((state & goal) == 0) { - return true; - } - - auto after = state; - if (waitMask == kWaitingE) { - if ((state & kWaitingESingle) != 0) { - after |= kWaitingEMultiple; - } else { - after |= kWaitingESingle; - } - } else { - after |= waitMask; - } - - // CAS is better than atomic |= here, because it lets us avoid - // setting the wait flag when the goal is concurrently achieved - if (after != state && !state_.compare_exchange_strong(state, after)) { - continue; - } - - if (!ctx.doWait(state_, after, waitMask)) { - // timed out - return false; - } - } - } - - // Wakes up waiters registered in state_ as appropriate, clearing the - // awaiting bits for anybody that was awoken. Tries to perform direct - // single wakeup of an exclusive waiter if appropriate - void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) { - if (UNLIKELY((state & wakeMask) != 0)) { - wakeRegisteredWaitersImpl(state, wakeMask); - } - } - - void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) { - // If there are multiple lock() pending only one of them will actually - // get to wake up, so issuing futexWakeAll will make a thundering herd. - // There's nothing stopping us from issuing futexWake(1) instead, - // so long as the wait bits are still an accurate reflection of - // the waiters. If we notice (via futexWake's return value) that - // nobody woke up then we can try again with the normal wake-all path. 
- // Note that we can't just clear the bits at that point; we need to - // clear the bits and then issue another wakeup. - // - // It is possible that we wake an E waiter but an outside S grabs the - // lock instead, at which point we should wake pending U and S waiters. - // Rather than tracking state to make the failing E regenerate the - // wakeup, we just disable the optimization in the case that there - // are waiting U or S that we are eligible to wake. - if ((wakeMask & kWaitingE) == kWaitingE && - (state & wakeMask) == kWaitingE && - state_.futexWake(1, kWaitingE) > 0) { - // somebody woke up, so leave state_ as is and clear it later - return; - } - - if ((state & wakeMask) != 0) { - auto prev = state_.fetch_and(~wakeMask); - if ((prev & wakeMask) != 0) { - futexWakeAll(wakeMask); - } - state = prev & ~wakeMask; - } - } - - void futexWakeAll(uint32_t wakeMask) { - state_.futexWake(std::numeric_limits::max(), wakeMask); - } - - DeferredReaderSlot* deferredReader(uint32_t slot) { - return &deferredReaders[slot * kDeferredSeparationFactor]; - } - - uintptr_t tokenfulSlotValue() { return reinterpret_cast(this); } - - uintptr_t tokenlessSlotValue() { return tokenfulSlotValue() | kTokenless; } - - bool slotValueIsThis(uintptr_t slotValue) { - return (slotValue & ~kTokenless) == tokenfulSlotValue(); - } - - // Clears any deferredReaders[] that point to this, adjusting the inline - // shared lock count to compensate. Does some spinning and yielding - // to avoid the work. Always finishes the application, even if ctx - // times out. - template - void applyDeferredReaders(uint32_t& state, WaitContext& ctx) { - uint32_t slot = 0; - - uint32_t spinCount = 0; - while (true) { - while (!slotValueIsThis( - deferredReader(slot)->load(std::memory_order_acquire))) { - if (++slot == kMaxDeferredReaders) { - return; - } - } -#if FOLLY_X64 - asm("pause"); -#endif - if (UNLIKELY(++spinCount >= kMaxSpinCount)) { - applyDeferredReaders(state, ctx, slot); - return; - } - } - } - - template - void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) { - -#ifdef RUSAGE_THREAD - struct rusage usage; - long before = -1; -#endif - for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount; - ++yieldCount) { - for (int softState = 0; softState < 3; ++softState) { - if (softState < 2) { - std::this_thread::yield(); - } else { -#ifdef RUSAGE_THREAD - getrusage(RUSAGE_THREAD, &usage); -#endif - } - while (!slotValueIsThis( - deferredReader(slot)->load(std::memory_order_acquire))) { - if (++slot == kMaxDeferredReaders) { - return; - } - } - if (ctx.shouldTimeOut()) { - // finish applying immediately on timeout - break; - } - } -#ifdef RUSAGE_THREAD - if (before >= 0 && usage.ru_nivcsw >= before + 2) { - // heuristic says run queue is not empty - break; - } - before = usage.ru_nivcsw; -#endif - } - - uint32_t movedSlotCount = 0; - for (; slot < kMaxDeferredReaders; ++slot) { - auto slotPtr = deferredReader(slot); - auto slotValue = slotPtr->load(std::memory_order_acquire); - if (slotValueIsThis(slotValue) && - slotPtr->compare_exchange_strong(slotValue, 0)) { - ++movedSlotCount; - } - } - - if (movedSlotCount > 0) { - state = (state_ += movedSlotCount * kIncrHasS); - } - assert((state & (kHasE | kBegunE)) != 0); - - // if state + kIncrHasS overflows (off the end of state) then either - // we have 2^(32-9) readers (almost certainly an application bug) - // or we had an underflow (also a bug) - assert(state < state + kIncrHasS); - } - - // It is straightfoward to make a token-less lock_shared() and 
- // unlock_shared() either by making the token-less version always use - // INLINE_SHARED mode or by removing the token version. Supporting - // deferred operation for both types is trickier than it appears, because - // the purpose of the token it so that unlock_shared doesn't have to - // look in other slots for its deferred lock. Token-less unlock_shared - // might place a deferred lock in one place and then release a different - // slot that was originally used by the token-ful version. If this was - // important we could solve the problem by differentiating the deferred - // locks so that cross-variety release wouldn't occur. The best way - // is probably to steal a bit from the pointer, making deferredLocks[] - // an array of Atom. - - template - bool lockSharedImpl(Token* token, WaitContext& ctx) { - uint32_t state = state_.load(std::memory_order_relaxed); - if ((state & (kHasS | kMayDefer | kHasE)) == 0 && - state_.compare_exchange_strong(state, state + kIncrHasS)) { - if (token != nullptr) { - token->type_ = Token::Type::INLINE_SHARED; - } - return true; - } - return lockSharedImpl(state, token, ctx); - } - - template - bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) { - while (true) { - if (UNLIKELY((state & kHasE) != 0) && - !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) { - return false; - } - - uint32_t slot; - uintptr_t slotValue = 1; // any non-zero value will do - - bool canAlreadyDefer = (state & kMayDefer) != 0; - bool aboveDeferThreshold = - (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS; - bool drainInProgress = ReaderPriority && (state & kBegunE) != 0; - if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) { - // starting point for our empty-slot search, can change after - // calling waitForZeroBits - uint32_t bestSlot = - (uint32_t)folly::detail::AccessSpreader::current( - kMaxDeferredReaders); - - // deferred readers are already enabled, or it is time to - // enable them if we can find a slot - for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) { - slot = bestSlot ^ i; - assert(slot < kMaxDeferredReaders); - slotValue = deferredReader(slot)->load(std::memory_order_relaxed); - if (slotValue == 0) { - // found empty slot - break; - } - } - } - - if (slotValue != 0) { - // not yet deferred, or no empty slots - if (state_.compare_exchange_strong(state, state + kIncrHasS)) { - // successfully recorded the read lock inline - if (token != nullptr) { - token->type_ = Token::Type::INLINE_SHARED; - } - return true; - } - // state is updated, try again - continue; - } - - // record that deferred readers might be in use if necessary - if ((state & kMayDefer) == 0) { - if (!state_.compare_exchange_strong(state, state | kMayDefer)) { - // keep going if CAS failed because somebody else set the bit - // for us - if ((state & (kHasE | kMayDefer)) != kMayDefer) { - continue; - } - } - // state = state | kMayDefer; - } - - // try to use the slot - bool gotSlot = deferredReader(slot)->compare_exchange_strong( - slotValue, - token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue()); - - // If we got the slot, we need to verify that an exclusive lock - // didn't happen since we last checked. If we didn't get the slot we - // need to recheck state_ anyway to make sure we don't waste too much - // work. It is also possible that since we checked state_ someone - // has acquired and released the write lock, clearing kMayDefer. 
- // Both cases are covered by looking for the readers-possible bit, - // because it is off when the exclusive lock bit is set. - state = state_.load(std::memory_order_acquire); - - if (!gotSlot) { - continue; - } - - if (token == nullptr) { - tls_lastTokenlessSlot = slot; - } - - if ((state & kMayDefer) != 0) { - assert((state & kHasE) == 0); - // success - if (token != nullptr) { - token->type_ = Token::Type::DEFERRED_SHARED; - token->slot_ = (uint16_t)slot; - } - return true; - } - - // release the slot before retrying - if (token == nullptr) { - // We can't rely on slot. Token-less slot values can be freed by - // any unlock_shared(), so we need to do the full deferredReader - // search during unlock. Unlike unlock_shared(), we can't trust - // kPrevDefer here. This deferred lock isn't visible to lock() - // (that's the whole reason we're undoing it) so there might have - // subsequently been an unlock() and lock() with no intervening - // transition to deferred mode. - if (!tryUnlockAnySharedDeferred()) { - unlockSharedInline(); - } - } else { - if (!tryUnlockSharedDeferred(slot)) { - unlockSharedInline(); - } - } - - // We got here not because the lock was unavailable, but because - // we lost a compare-and-swap. Try-lock is typically allowed to - // have spurious failures, but there is no lock efficiency gain - // from exploiting that freedom here. - } - } - - bool tryUnlockAnySharedDeferred() { - auto bestSlot = tls_lastTokenlessSlot; - for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) { - auto slotPtr = deferredReader(bestSlot ^ i); - auto slotValue = slotPtr->load(std::memory_order_relaxed); - if (slotValue == tokenlessSlotValue() && - slotPtr->compare_exchange_strong(slotValue, 0)) { - tls_lastTokenlessSlot = bestSlot ^ i; - return true; - } - } - return false; - } - - bool tryUnlockSharedDeferred(uint32_t slot) { - assert(slot < kMaxDeferredReaders); - auto slotValue = tokenfulSlotValue(); - return deferredReader(slot)->compare_exchange_strong(slotValue, 0); - } - - uint32_t unlockSharedInline() { - uint32_t state = (state_ -= kIncrHasS); - assert((state & (kHasE | kBegunE)) != 0 || state < state + kIncrHasS); - if ((state & kHasS) == 0) { - // Only the second half of lock() can be blocked by a non-zero - // reader count, so that's the only thing we need to wake - wakeRegisteredWaiters(state, kWaitingNotS); - } - return state; - } - - template - bool lockUpgradeImpl(WaitContext& ctx) { - uint32_t state; - do { - if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) { - return false; - } - } while (!state_.compare_exchange_strong(state, state | kHasU)); - return true; - } - - public: - class ReadHolder { - public: - ReadHolder() : lock_(nullptr) {} - - explicit ReadHolder(const SharedMutexImpl* lock) : ReadHolder(*lock) {} - - explicit ReadHolder(const SharedMutexImpl& lock) - : lock_(const_cast(&lock)) { - lock_->lock_shared(token_); - } - - ReadHolder(ReadHolder&& rhs) noexcept : lock_(rhs.lock_), - token_(rhs.token_) { - rhs.lock_ = nullptr; - } - - // Downgrade from upgrade mode - explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) { - assert(upgraded.lock_ != nullptr); - upgraded.lock_ = nullptr; - lock_->unlock_upgrade_and_lock_shared(token_); - } - - // Downgrade from exclusive mode - explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) { - assert(writer.lock_ != nullptr); - writer.lock_ = nullptr; - lock_->unlock_and_lock_shared(token_); - } - - ReadHolder& operator=(ReadHolder&& rhs) noexcept { - std::swap(lock_, rhs.lock_); - 
std::swap(token_, rhs.token_); - return *this; - } - - ReadHolder(const ReadHolder& rhs) = delete; - ReadHolder& operator=(const ReadHolder& rhs) = delete; - - ~ReadHolder() { - if (lock_) { - lock_->unlock_shared(token_); - } - } - - private: - friend class UpgradeHolder; - friend class WriteHolder; - SharedMutexImpl* lock_; - SharedMutexToken token_; - }; - - class UpgradeHolder { - public: - UpgradeHolder() : lock_(nullptr) {} - - explicit UpgradeHolder(SharedMutexImpl* lock) : UpgradeHolder(*lock) {} - - explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) { - lock_->lock_upgrade(); - } - - // Downgrade from exclusive mode - explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) { - assert(writer.lock_ != nullptr); - writer.lock_ = nullptr; - lock_->unlock_and_lock_upgrade(); - } - - UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) { - rhs.lock_ = nullptr; - } - - UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept { - std::swap(lock_, rhs.lock_); - return *this; - } - - UpgradeHolder(const UpgradeHolder& rhs) = delete; - UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete; - - ~UpgradeHolder() { - if (lock_) { - lock_->unlock_upgrade(); - } - } - - private: - friend class WriteHolder; - friend class ReadHolder; - SharedMutexImpl* lock_; - }; - - class WriteHolder { - public: - WriteHolder() : lock_(nullptr) {} - - explicit WriteHolder(SharedMutexImpl* lock) : WriteHolder(*lock) {} - - explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) { - lock_->lock(); - } - - // Promotion from upgrade mode - explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) { - assert(upgrade.lock_ != nullptr); - upgrade.lock_ = nullptr; - lock_->unlock_upgrade_and_lock(); - } - - WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) { - rhs.lock_ = nullptr; - } - - WriteHolder& operator=(WriteHolder&& rhs) noexcept { - std::swap(lock_, rhs.lock_); - return *this; - } - - WriteHolder(const WriteHolder& rhs) = delete; - WriteHolder& operator=(const WriteHolder& rhs) = delete; - - ~WriteHolder() { - if (lock_) { - lock_->unlock(); - } - } - - private: - friend class ReadHolder; - friend class UpgradeHolder; - SharedMutexImpl* lock_; - }; - - // Adapters for Synchronized<> - friend void acquireRead(SharedMutexImpl& lock) { lock.lock_shared(); } - friend void acquireReadWrite(SharedMutexImpl& lock) { lock.lock(); } - friend void releaseRead(SharedMutexImpl& lock) { lock.unlock_shared(); } - friend void releaseReadWrite(SharedMutexImpl& lock) { lock.unlock(); } -}; - -#define COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(type) \ - template <> \ - type::DeferredReaderSlot \ - type::deferredReaders[type::kMaxDeferredReaders * \ - type::kDeferredSeparationFactor] = {}; \ - template <> \ - FOLLY_TLS uint32_t type::tls_lastTokenlessSlot = 0; - -typedef SharedMutexImpl SharedMutexReadPriority; -typedef SharedMutexImpl SharedMutexWritePriority; -typedef SharedMutexWritePriority SharedMutex; - -} // namespace folly diff --git a/folly/experimental/test/SharedMutexTest.cpp b/folly/experimental/test/SharedMutexTest.cpp deleted file mode 100644 index 026284c3..00000000 --- a/folly/experimental/test/SharedMutexTest.cpp +++ /dev/null @@ -1,2070 +0,0 @@ -/* - * Copyright 2015 Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -using namespace folly; -using namespace folly::test; -using namespace std; -using namespace chrono; - -typedef DeterministicSchedule DSched; -typedef SharedMutexImpl - DSharedMutexReadPriority; -typedef SharedMutexImpl - DSharedMutexWritePriority; - -COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE( - DSharedMutexReadPriority); -COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE( - DSharedMutexWritePriority); - -template -void runBasicTest() { - Lock lock; - SharedMutexToken token1; - SharedMutexToken token2; - SharedMutexToken token3; - - EXPECT_TRUE(lock.try_lock()); - EXPECT_FALSE(lock.try_lock()); - EXPECT_FALSE(lock.try_lock_shared(token1)); - lock.unlock(); - - EXPECT_TRUE(lock.try_lock_shared(token1)); - EXPECT_FALSE(lock.try_lock()); - EXPECT_TRUE(lock.try_lock_shared(token2)); - lock.lock_shared(token3); - lock.unlock_shared(token3); - lock.unlock_shared(token2); - lock.unlock_shared(token1); - - lock.lock(); - lock.unlock(); - - lock.lock_shared(token1); - lock.lock_shared(token2); - lock.unlock_shared(token1); - lock.unlock_shared(token2); - - lock.lock(); - lock.unlock_and_lock_shared(token1); - lock.lock_shared(token2); - lock.unlock_shared(token2); - lock.unlock_shared(token1); -} - -TEST(SharedMutex, basic) { - runBasicTest(); - runBasicTest(); -} - -template -void runBasicHoldersTest() { - Lock lock; - SharedMutexToken token; - - { - typename Lock::WriteHolder holder(lock); - EXPECT_FALSE(lock.try_lock()); - EXPECT_FALSE(lock.try_lock_shared(token)); - - typename Lock::WriteHolder holder2(std::move(holder)); - typename Lock::WriteHolder holder3; - holder3 = std::move(holder2); - - typename Lock::UpgradeHolder holder4(std::move(holder3)); - typename Lock::WriteHolder holder5(std::move(holder4)); - - typename Lock::ReadHolder holder6(std::move(holder5)); - - EXPECT_FALSE(lock.try_lock()); - EXPECT_TRUE(lock.try_lock_shared(token)); - lock.unlock_shared(token); - } - - { - typename Lock::WriteHolder holder(lock); - EXPECT_FALSE(lock.try_lock()); - } - - { - typename Lock::ReadHolder holder(lock); - typename Lock::ReadHolder holder2(lock); - typename Lock::UpgradeHolder holder3(lock); - } - - { - typename Lock::UpgradeHolder holder(lock); - typename Lock::ReadHolder holder2(lock); - typename Lock::ReadHolder holder3(std::move(holder)); - } -} - -TEST(SharedMutex, basic_holders) { - runBasicHoldersTest(); - runBasicHoldersTest(); -} - -template -void runManyReadLocksTestWithTokens() { - Lock lock; - - vector tokens; - for (int i = 0; i < 1000; ++i) { - tokens.emplace_back(); - EXPECT_TRUE(lock.try_lock_shared(tokens.back())); - } - for (auto& token : tokens) { - lock.unlock_shared(token); - } - EXPECT_TRUE(lock.try_lock()); - lock.unlock(); -} - -TEST(SharedMutex, many_read_locks_with_tokens) { - runManyReadLocksTestWithTokens(); - runManyReadLocksTestWithTokens(); -} - -template -void runManyReadLocksTestWithoutTokens() { - Lock lock; - - for (int i = 0; i < 1000; ++i) { - EXPECT_TRUE(lock.try_lock_shared()); 
- } - for (int i = 0; i < 1000; ++i) { - lock.unlock_shared(); - } - EXPECT_TRUE(lock.try_lock()); - lock.unlock(); -} - -TEST(SharedMutex, many_read_locks_without_tokens) { - runManyReadLocksTestWithoutTokens(); - runManyReadLocksTestWithoutTokens(); -} - -template -void runTimeoutInPastTest() { - Lock lock; - - EXPECT_TRUE(lock.try_lock_for(milliseconds(0))); - lock.unlock(); - EXPECT_TRUE(lock.try_lock_for(milliseconds(-1))); - lock.unlock(); - EXPECT_TRUE(lock.try_lock_shared_for(milliseconds(0))); - lock.unlock_shared(); - EXPECT_TRUE(lock.try_lock_shared_for(milliseconds(-1))); - lock.unlock_shared(); - EXPECT_TRUE(lock.try_lock_until(system_clock::now() - milliseconds(1))); - lock.unlock(); - EXPECT_TRUE( - lock.try_lock_shared_until(system_clock::now() - milliseconds(1))); - lock.unlock_shared(); - EXPECT_TRUE(lock.try_lock_until(steady_clock::now() - milliseconds(1))); - lock.unlock(); - EXPECT_TRUE( - lock.try_lock_shared_until(steady_clock::now() - milliseconds(1))); - lock.unlock_shared(); -} - -TEST(SharedMutex, timeout_in_past) { - runTimeoutInPastTest(); - runTimeoutInPastTest(); -} - -template -bool funcHasDuration(milliseconds expectedDuration, Func func) { - // elapsed time should eventually fall within expectedDuration +- 25% - for (int tries = 0; tries < 100; ++tries) { - auto start = steady_clock::now(); - func(); - auto elapsed = steady_clock::now() - start; - if (elapsed > expectedDuration - expectedDuration / 4 && - elapsed < expectedDuration + expectedDuration / 4) { - return true; - } - } - return false; -} - -template -void runFailingTryTimeoutTest() { - Lock lock; - lock.lock(); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - EXPECT_FALSE(lock.try_lock_for(milliseconds(10))); - })); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - typename Lock::Token token; - EXPECT_FALSE(lock.try_lock_shared_for(milliseconds(10), token)); - })); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - EXPECT_FALSE(lock.try_lock_upgrade_for(milliseconds(10))); - })); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - EXPECT_FALSE(lock.try_lock_until(steady_clock::now() + milliseconds(10))); - })); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - typename Lock::Token token; - EXPECT_FALSE(lock.try_lock_shared_until( - steady_clock::now() + milliseconds(10), token)); - })); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - EXPECT_FALSE( - lock.try_lock_upgrade_until(steady_clock::now() + milliseconds(10))); - })); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - EXPECT_FALSE(lock.try_lock_until(system_clock::now() + milliseconds(10))); - })); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - typename Lock::Token token; - EXPECT_FALSE(lock.try_lock_shared_until( - system_clock::now() + milliseconds(10), token)); - })); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - EXPECT_FALSE( - lock.try_lock_upgrade_until(system_clock::now() + milliseconds(10))); - })); - lock.unlock(); - - lock.lock_shared(); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - EXPECT_FALSE(lock.try_lock_for(milliseconds(10))); - })); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - EXPECT_FALSE(lock.try_lock_until(steady_clock::now() + milliseconds(10))); - })); - EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] { - EXPECT_FALSE(lock.try_lock_until(system_clock::now() + milliseconds(10))); - })); - lock.unlock_shared(); - - lock.lock(); - for (int p = 0; p < 8; ++p) { - EXPECT_FALSE(lock.try_lock_for(nanoseconds(1 << p))); - } - lock.unlock(); - 
- for (int p = 0; p < 8; ++p) { - typename Lock::ReadHolder holder1(lock); - typename Lock::ReadHolder holder2(lock); - typename Lock::ReadHolder holder3(lock); - EXPECT_FALSE(lock.try_lock_for(nanoseconds(1 << p))); - } -} - -TEST(SharedMutex, failing_try_timeout) { - runFailingTryTimeoutTest(); - runFailingTryTimeoutTest(); -} - -template -void runBasicUpgradeTest() { - Lock lock; - typename Lock::Token token1; - typename Lock::Token token2; - - lock.lock_upgrade(); - EXPECT_FALSE(lock.try_lock()); - EXPECT_TRUE(lock.try_lock_shared(token1)); - lock.unlock_shared(token1); - lock.unlock_upgrade(); - - lock.lock_upgrade(); - lock.unlock_upgrade_and_lock(); - EXPECT_FALSE(lock.try_lock_shared(token1)); - lock.unlock(); - - lock.lock_upgrade(); - lock.unlock_upgrade_and_lock_shared(token1); - lock.lock_upgrade(); - lock.unlock_upgrade_and_lock_shared(token2); - lock.unlock_shared(token1); - lock.unlock_shared(token2); - - lock.lock(); - lock.unlock_and_lock_upgrade(); - EXPECT_TRUE(lock.try_lock_shared(token1)); - lock.unlock_upgrade(); - lock.unlock_shared(token1); -} - -TEST(SharedMutex, basic_upgrade_tests) { - runBasicUpgradeTest(); - runBasicUpgradeTest(); -} - -TEST(SharedMutex, read_has_prio) { - SharedMutexReadPriority lock; - SharedMutexToken token1; - SharedMutexToken token2; - lock.lock_shared(token1); - bool exclusiveAcquired = false; - auto writer = thread([&] { - lock.lock(); - exclusiveAcquired = true; - lock.unlock(); - }); - - // lock() can't complete until we unlock token1, but it should stake - // its claim with regards to other exclusive or upgrade locks. We can - // use try_lock_upgrade to poll for that eventuality. - while (lock.try_lock_upgrade()) { - lock.unlock_upgrade(); - this_thread::yield(); - } - EXPECT_FALSE(exclusiveAcquired); - - // Even though lock() is stuck we should be able to get token2 - EXPECT_TRUE(lock.try_lock_shared(token2)); - lock.unlock_shared(token1); - lock.unlock_shared(token2); - writer.join(); - EXPECT_TRUE(exclusiveAcquired); -} - -TEST(SharedMutex, write_has_prio) { - SharedMutexWritePriority lock; - SharedMutexToken token1; - SharedMutexToken token2; - lock.lock_shared(token1); - auto writer = thread([&] { - lock.lock(); - lock.unlock(); - }); - - // eventually lock() should block readers - while (lock.try_lock_shared(token2)) { - lock.unlock_shared(token2); - this_thread::yield(); - } - - lock.unlock_shared(token1); - writer.join(); -} - -struct TokenLocker { - SharedMutexToken token; - - template - void lock(T* lock) { - lock->lock(); - } - - template - void unlock(T* lock) { - lock->unlock(); - } - - template - void lock_shared(T* lock) { - lock->lock_shared(token); - } - - template - void unlock_shared(T* lock) { - lock->unlock_shared(token); - } -}; - -struct Locker { - template - void lock(T* lock) { - lock->lock(); - } - - template - void unlock(T* lock) { - lock->unlock(); - } - - template - void lock_shared(T* lock) { - lock->lock_shared(); - } - - template - void unlock_shared(T* lock) { - lock->unlock_shared(); - } -}; - -struct EnterLocker { - template - void lock(T* lock) { - lock->lock(0); - } - - template - void unlock(T* lock) { - lock->unlock(); - } - - template - void lock_shared(T* lock) { - lock->enter(0); - } - - template - void unlock_shared(T* lock) { - lock->leave(); - } -}; - -struct PosixRWLock { - pthread_rwlock_t lock_; - - PosixRWLock() { pthread_rwlock_init(&lock_, nullptr); } - - ~PosixRWLock() { pthread_rwlock_destroy(&lock_); } - - void lock() { pthread_rwlock_wrlock(&lock_); } - - void unlock() { 
pthread_rwlock_unlock(&lock_); }
-
-  void lock_shared() { pthread_rwlock_rdlock(&lock_); }
-
-  void unlock_shared() { pthread_rwlock_unlock(&lock_); }
-};
-
-struct PosixMutex {
-  pthread_mutex_t lock_;
-
-  PosixMutex() { pthread_mutex_init(&lock_, nullptr); }
-
-  ~PosixMutex() { pthread_mutex_destroy(&lock_); }
-
-  void lock() { pthread_mutex_lock(&lock_); }
-
-  void unlock() { pthread_mutex_unlock(&lock_); }
-
-  void lock_shared() { pthread_mutex_lock(&lock_); }
-
-  void unlock_shared() { pthread_mutex_unlock(&lock_); }
-};
-
-template