folly/RWSpinLock.h

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * N.B. You most likely do _not_ want to use RWSpinLock or any other
  19  * kind of spinlock.  Use SharedMutex instead.
  20  *
  21  * In short, spinlocks in preemptive multi-tasking operating systems
  22  * have serious problems and fast mutexes like SharedMutex are almost
  23  * certainly the better choice, because letting the OS scheduler put a
  24  * thread to sleep is better for system responsiveness and throughput
  25  * than wasting a timeslice repeatedly querying a lock held by a
  26  * thread that's blocked, and you can't prevent userspace
  27  * programs blocking.
  28  *
  29  * Spinlocks in an operating system kernel make much more sense than
  30  * they do in userspace.
  31  *
  32  * -------------------------------------------------------------------
  33  *
  34  * Two Read-Write spin lock implementations.
  35  *
  36  *  Ref: http://locklessinc.com/articles/locks
  37  *
  38  *  Both locks here are faster than pthread_rwlock and have very low
  39  *  overhead (usually 20-30ns).  They don't use any system mutexes and
  40  *  are very compact (4/8 bytes), so are suitable for per-instance
  41  *  based locking, particularly when contention is not expected.
  42  *
  43  *  For a spinlock, RWSpinLock is a reasonable choice.  (See the note
  44  *  about for why a spin lock is frequently a bad idea generally.)
  45  *  RWSpinLock has minimal overhead, and comparable contention
  46  *  performance when the number of competing threads is less than or
  47  *  equal to the number of logical CPUs.  Even as the number of
  48  *  threads gets larger, RWSpinLock can still be very competitive in
  49  *  READ, although it is slower on WRITE, and also inherently unfair
  50  *  to writers.
  51  *
  52  *  RWTicketSpinLock shows more balanced READ/WRITE performance.  If
  53  *  your application really needs a lot more threads, and a
  54  *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
  55  *
  56  *  Caveats:
  57  *
  58  *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
  59  *    based systems.
  60  *
  61  *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
  62  *    readers and writers.
  63  *
  64  *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
  65  *    readers and writers.
  66  *
  67  *    RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
  68  *    writer priority) is NOT reentrant, even for lock_shared().
  69  *
  70  *    The lock will not grant any new shared (read) accesses while a thread
  71  *    attempting to acquire the lock in write mode is blocked. (That is,
  72  *    if the lock is held in shared mode by N threads, and a thread attempts
  73  *    to acquire it in write mode, no one else can acquire it in shared mode
  74  *    until these N threads release the lock and then the blocked thread
  75  *    acquires and releases the exclusive lock.) This also applies for
  76  *    attempts to reacquire the lock in shared mode by threads that already
  77  *    hold it in shared mode, making the lock non-reentrant.
  78  *
  79  *    RWSpinLock handles 2^30 - 1 concurrent readers.
  80  *
  81  * @author Xin Liu <xliux@fb.com>
  82  */
  83
  84 #pragma once
  85
  86 /*
  87 ========================================================================
  88 Benchmark on (Intel(R) Xeon(R) CPU  L5630  @ 2.13GHz)  8 cores(16 HTs)
  89 ========================================================================
  90
  91 ------------------------------------------------------------------------------
  92 1. Single thread benchmark (read/write lock + unlock overhead)
  93 Benchmark                                    Iters   Total t    t/iter iter/sec
  94 -------------------------------------------------------------------------------
  95 *      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
  96 +30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
  97 +85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
  98 +96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
  99 +85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
 100 +96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
 101 +85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
 102 +29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
 103 +85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
 104 +30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
 105 + 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
 106 + 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M
 107
 108 ------------------------------------------------------------------------------
 109 2. Contention Benchmark      90% read  10% write
 110 Benchmark                    hits       average    min       max        sigma
 111 ------------------------------------------------------------------------------
 112 ---------- 8  threads ------------
 113 RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
 114 RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
 115 RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
 116 RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
 117 pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
 118 pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us
 119
 120 ---------- 16 threads ------------
 121 RWSpinLock       Write       124236     237ns      78ns      261us      801ns
 122 RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
 123 RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
 124 RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
 125 pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
 126 pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us
 127
 128 ---------- 50 threads ------------
 129 RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
 130 RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
 131 RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
 132 RWTicketSpinLock Read        744133     386ns      78ns        11ms     31.4us
 133 pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
 134 pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us
 135
 136 */
 137
 138 #include <folly/Portability.h>
 139 #include <folly/portability/Asm.h>
 140
 141 #if defined(__GNUC__) && (defined(__i386) || FOLLY_X64 || defined(ARCH_K8))
 142 #define RW_SPINLOCK_USE_X86_INTRINSIC_
 143 #include <x86intrin.h>
 144 #elif defined(_MSC_VER) && defined(FOLLY_X64)
 145 #define RW_SPINLOCK_USE_X86_INTRINSIC_
 146 #else
 147 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 148 #endif
 149
 150 // iOS doesn't define _mm_cvtsi64_si128 and friends
 151 #if (FOLLY_SSE >= 2) && !FOLLY_MOBILE
 152 #define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 153 #else
 154 #undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 155 #endif
 156
 157 #include <algorithm>
 158 #include <atomic>
 159 #include <string>
 160 #include <thread>
 161
 162 #include <glog/logging.h>
 163
 164 #include <folly/Likely.h>
 165
 166 namespace folly {
 167
 168 /*
 169  * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 170  * a nice writer and don't expect a lot of write/read contention, or
 171  * when you need small rwlocks since you are creating a large number
 172  * of them.
 173  *
 174  * Note that the unfairness here is extreme: if the lock is
 175  * continually accessed for read, writers will never get a chance.  If
 176  * the lock can be that highly contended this class is probably not an
 177  * ideal choice anyway.
 178  *
 179  * It currently implements most of the Lockable, SharedLockable and
 180  * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 181  * interfaces.
 182  */
 183 class RWSpinLock {
 184   enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
 185  public:
 186   constexpr RWSpinLock() : bits_(0) {}
 187
 188   RWSpinLock(RWSpinLock const&) = delete;
 189   RWSpinLock& operator=(RWSpinLock const&) = delete;
 190
 191   // Lockable Concept
 192   void lock() {
 193     int count = 0;
 194     while (!LIKELY(try_lock())) {
 195       if (++count > 1000) std::this_thread::yield();
 196     }
 197   }
 198
 199   // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
 200   void unlock() {
 201     static_assert(READER > WRITER + UPGRADED, "wrong bits!");
 202     bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
 203   }
 204
 205   // SharedLockable Concept
 206   void lock_shared() {
 207     int count = 0;
 208     while (!LIKELY(try_lock_shared())) {
 209       if (++count > 1000) std::this_thread::yield();
 210     }
 211   }
 212
 213   void unlock_shared() {
 214     bits_.fetch_add(-READER, std::memory_order_release);
 215   }
 216
 217   // Downgrade the lock from writer status to reader status.
 218   void unlock_and_lock_shared() {
 219     bits_.fetch_add(READER, std::memory_order_acquire);
 220     unlock();
 221   }
 222
 223   // UpgradeLockable Concept
 224   void lock_upgrade() {
 225     int count = 0;
 226     while (!try_lock_upgrade()) {
 227       if (++count > 1000) std::this_thread::yield();
 228     }
 229   }
 230
 231   void unlock_upgrade() {
 232     bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
 233   }
 234
 235   // unlock upgrade and try to acquire write lock
 236   void unlock_upgrade_and_lock() {
 237     int64_t count = 0;
 238     while (!try_unlock_upgrade_and_lock()) {
 239       if (++count > 1000) std::this_thread::yield();
 240     }
 241   }
 242
 243   // unlock upgrade and read lock atomically
 244   void unlock_upgrade_and_lock_shared() {
 245     bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
 246   }
 247
 248   // write unlock and upgrade lock atomically
 249   void unlock_and_lock_upgrade() {
 250     // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
 251     // the same time when other threads are trying do try_lock_upgrade().
 252     bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 253     bits_.fetch_add(-WRITER, std::memory_order_release);
 254   }
 255
 256
 257   // Attempt to acquire writer permission. Return false if we didn't get it.
 258   bool try_lock() {
 259     int32_t expect = 0;
 260     return bits_.compare_exchange_strong(expect, WRITER,
 261       std::memory_order_acq_rel);
 262   }
 263
 264   // Try to get reader permission on the lock. This can fail if we
 265   // find out someone is a writer or upgrader.
 266   // Setting the UPGRADED bit would allow a writer-to-be to indicate
 267   // its intention to write and block any new readers while waiting
 268   // for existing readers to finish and release their read locks. This
 269   // helps avoid starving writers (promoted from upgraders).
 270   bool try_lock_shared() {
 271     // fetch_add is considerably (100%) faster than compare_exchange,
 272     // so here we are optimizing for the common (lock success) case.
 273     int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
 274     if (UNLIKELY(value & (WRITER|UPGRADED))) {
 275       bits_.fetch_add(-READER, std::memory_order_release);
 276       return false;
 277     }
 278     return true;
 279   }
 280
 281   // try to unlock upgrade and write lock atomically
 282   bool try_unlock_upgrade_and_lock() {
 283     int32_t expect = UPGRADED;
 284     return bits_.compare_exchange_strong(expect, WRITER,
 285         std::memory_order_acq_rel);
 286   }
 287
 288   // try to acquire an upgradable lock.
 289   bool try_lock_upgrade() {
 290     int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 291
 292     // Note: when failed, we cannot flip the UPGRADED bit back,
 293     // as in this case there is either another upgrade lock or a write lock.
 294     // If it's a write lock, the bit will get cleared up when that lock's done
 295     // with unlock().
 296     return ((value & (UPGRADED | WRITER)) == 0);
 297   }
 298
 299   // mainly for debugging purposes.
 300   int32_t bits() const { return bits_.load(std::memory_order_acquire); }
 301
 302   class ReadHolder;
 303   class UpgradedHolder;
 304   class WriteHolder;
 305
 306   class ReadHolder {
 307    public:
 308     explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
 309       if (lock_) lock_->lock_shared();
 310     }
 311
 312     explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
 313       lock_->lock_shared();
 314     }
 315
 316     ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
 317       other.lock_ = nullptr;
 318     }
 319
 320     // down-grade
 321     explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
 322       upgraded.lock_ = nullptr;
 323       if (lock_) lock_->unlock_upgrade_and_lock_shared();
 324     }
 325
 326     explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
 327       writer.lock_ = nullptr;
 328       if (lock_) lock_->unlock_and_lock_shared();
 329     }
 330
 331     ReadHolder& operator=(ReadHolder&& other) {
 332       using std::swap;
 333       swap(lock_, other.lock_);
 334       return *this;
 335     }
 336
 337     ReadHolder(const ReadHolder& other) = delete;
 338     ReadHolder& operator=(const ReadHolder& other) = delete;
 339
 340     ~ReadHolder() { if (lock_) lock_->unlock_shared(); }
 341
 342     void reset(RWSpinLock* lock = nullptr) {
 343       if (lock == lock_) return;
 344       if (lock_) lock_->unlock_shared();
 345       lock_ = lock;
 346       if (lock_) lock_->lock_shared();
 347     }
 348
 349     void swap(ReadHolder* other) {
 350       std::swap(lock_, other->lock_);
 351     }
 352
 353    private:
 354     friend class UpgradedHolder;
 355     friend class WriteHolder;
 356     RWSpinLock* lock_;
 357   };
 358
 359   class UpgradedHolder {
 360    public:
 361     explicit UpgradedHolder(RWSpinLock* lock) : lock_(lock) {
 362       if (lock_) lock_->lock_upgrade();
 363     }
 364
 365     explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
 366       lock_->lock_upgrade();
 367     }
 368
 369     explicit UpgradedHolder(WriteHolder&& writer) {
 370       lock_ = writer.lock_;
 371       writer.lock_ = nullptr;
 372       if (lock_) lock_->unlock_and_lock_upgrade();
 373     }
 374
 375     UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
 376       other.lock_ = nullptr;
 377     }
 378
 379     UpgradedHolder& operator =(UpgradedHolder&& other) {
 380       using std::swap;
 381       swap(lock_, other.lock_);
 382       return *this;
 383     }
 384
 385     UpgradedHolder(const UpgradedHolder& other) = delete;
 386     UpgradedHolder& operator =(const UpgradedHolder& other) = delete;
 387
 388     ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }
 389
 390     void reset(RWSpinLock* lock = nullptr) {
 391       if (lock == lock_) return;
 392       if (lock_) lock_->unlock_upgrade();
 393       lock_ = lock;
 394       if (lock_) lock_->lock_upgrade();
 395     }
 396
 397     void swap(UpgradedHolder* other) {
 398       using std::swap;
 399       swap(lock_, other->lock_);
 400     }
 401
 402    private:
 403     friend class WriteHolder;
 404     friend class ReadHolder;
 405     RWSpinLock* lock_;
 406   };
 407
 408   class WriteHolder {
 409    public:
 410     explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
 411       if (lock_) lock_->lock();
 412     }
 413
 414     explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
 415       lock_->lock();
 416     }
 417
 418     // promoted from an upgrade lock holder
 419     explicit WriteHolder(UpgradedHolder&& upgraded) {
 420       lock_ = upgraded.lock_;
 421       upgraded.lock_ = nullptr;
 422       if (lock_) lock_->unlock_upgrade_and_lock();
 423     }
 424
 425     WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
 426       other.lock_ = nullptr;
 427     }
 428
 429     WriteHolder& operator =(WriteHolder&& other) {
 430       using std::swap;
 431       swap(lock_, other.lock_);
 432       return *this;
 433     }
 434
 435     WriteHolder(const WriteHolder& other) = delete;
 436     WriteHolder& operator =(const WriteHolder& other) = delete;
 437
 438     ~WriteHolder () { if (lock_) lock_->unlock(); }
 439
 440     void reset(RWSpinLock* lock = nullptr) {
 441       if (lock == lock_) return;
 442       if (lock_) lock_->unlock();
 443       lock_ = lock;
 444       if (lock_) lock_->lock();
 445     }
 446
 447     void swap(WriteHolder* other) {
 448       using std::swap;
 449       swap(lock_, other->lock_);
 450     }
 451
 452    private:
 453     friend class ReadHolder;
 454     friend class UpgradedHolder;
 455     RWSpinLock* lock_;
 456   };
 457
 458  private:
 459   std::atomic<int32_t> bits_;
 460 };
 461
 462
 463 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 464 // A more balanced Read-Write spin lock implemented based on GCC intrinsics.
 465
 466 namespace detail {
 467 template <size_t kBitWidth> struct RWTicketIntTrait {
 468   static_assert(kBitWidth == 32 || kBitWidth == 64,
 469       "bit width has to be either 32 or 64 ");
 470 };
 471
 472 template <>
 473 struct RWTicketIntTrait<64> {
 474   typedef uint64_t FullInt;
 475   typedef uint32_t HalfInt;
 476   typedef uint16_t QuarterInt;
 477
 478 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 479   static __m128i make128(const uint16_t v[4]) {
 480     return _mm_set_epi16(0, 0, 0, 0,
 481         short(v[3]), short(v[2]), short(v[1]), short(v[0]));
 482   }
 483   static inline __m128i fromInteger(uint64_t from) {
 484     return _mm_cvtsi64_si128(int64_t(from));
 485   }
 486   static inline uint64_t toInteger(__m128i in) {
 487     return uint64_t(_mm_cvtsi128_si64(in));
 488   }
 489   static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
 490     return toInteger(_mm_add_epi16(in, kDelta));
 491   }
 492 #endif
 493 };
 494
 495 template <>
 496 struct RWTicketIntTrait<32> {
 497   typedef uint32_t FullInt;
 498   typedef uint16_t HalfInt;
 499   typedef uint8_t QuarterInt;
 500
 501 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 502   static __m128i make128(const uint8_t v[4]) {
 503     return _mm_set_epi8(
 504         0, 0, 0, 0,
 505         0, 0, 0, 0,
 506         0, 0, 0, 0,
 507         char(v[3]), char(v[2]), char(v[1]), char(v[0]));
 508   }
 509   static inline __m128i fromInteger(uint32_t from) {
 510     return _mm_cvtsi32_si128(int32_t(from));
 511   }
 512   static inline uint32_t toInteger(__m128i in) {
 513     return uint32_t(_mm_cvtsi128_si32(in));
 514   }
 515   static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
 516     return toInteger(_mm_add_epi8(in, kDelta));
 517   }
 518 #endif
 519 };
 520 }  // detail
 521
 522
 523 template<size_t kBitWidth, bool kFavorWriter=false>
 524 class RWTicketSpinLockT {
 525   typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
 526   typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
 527   typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
 528   typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
 529     QuarterInt;
 530
 531   union RWTicket {
 532     constexpr RWTicket() : whole(0) {}
 533     FullInt whole;
 534     HalfInt readWrite;
 535     __extension__ struct {
 536       QuarterInt write;
 537       QuarterInt read;
 538       QuarterInt users;
 539     };
 540   } ticket;
 541
 542  private: // Some x64-specific utilities for atomic access to ticket.
 543   template<class T> static T load_acquire(T* addr) {
 544     T t = *addr; // acquire barrier
 545     asm_volatile_memory();
 546     return t;
 547   }
 548
 549   template<class T>
 550   static void store_release(T* addr, T v) {
 551     asm_volatile_memory();
 552     *addr = v; // release barrier
 553   }
 554
 555  public:
 556
 557   constexpr RWTicketSpinLockT() {}
 558
 559   RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
 560   RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;
 561
 562   void lock() {
 563     if (kFavorWriter) {
 564       writeLockAggressive();
 565     } else {
 566       writeLockNice();
 567     }
 568   }
 569
 570   /*
 571    * Both try_lock and try_lock_shared diverge in our implementation from the
 572    * lock algorithm described in the link above.
 573    *
 574    * In the read case, it is undesirable that the readers could wait
 575    * for another reader (before increasing ticket.read in the other
 576    * implementation).  Our approach gives up on
 577    * first-come-first-serve, but our benchmarks showed improve
 578    * performance for both readers and writers under heavily contended
 579    * cases, particularly when the number of threads exceeds the number
 580    * of logical CPUs.
 581    *
 582    * We have writeLockAggressive() using the original implementation
 583    * for a writer, which gives some advantage to the writer over the
 584    * readers---for that path it is guaranteed that the writer will
 585    * acquire the lock after all the existing readers exit.
 586    */
 587   bool try_lock() {
 588     RWTicket t;
 589     FullInt old = t.whole = load_acquire(&ticket.whole);
 590     if (t.users != t.write) return false;
 591     ++t.users;
 592     return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
 593   }
 594
 595   /*
 596    * Call this if you want to prioritize writer to avoid starvation.
 597    * Unlike writeLockNice, immediately acquires the write lock when
 598    * the existing readers (arriving before the writer) finish their
 599    * turns.
 600    */
 601   void writeLockAggressive() {
 602     // std::this_thread::yield() is needed here to avoid a pathology if the number
 603     // of threads attempting concurrent writes is >= the number of real
 604     // cores allocated to this process. This is less likely than the
 605     // corresponding situation in lock_shared(), but we still want to
 606     // avoid it
 607     int count = 0;
 608     QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
 609     while (val != load_acquire(&ticket.write)) {
 610       asm_volatile_pause();
 611       if (UNLIKELY(++count > 1000)) std::this_thread::yield();
 612     }
 613   }
 614
 615   // Call this when the writer should be nicer to the readers.
 616   void writeLockNice() {
 617     // Here it doesn't cpu-relax the writer.
 618     //
 619     // This is because usually we have many more readers than the
 620     // writers, so the writer has less chance to get the lock when
 621     // there are a lot of competing readers.  The aggressive spinning
 622     // can help to avoid starving writers.
 623     //
 624     // We don't worry about std::this_thread::yield() here because the caller
 625     // has already explicitly abandoned fairness.
 626     while (!try_lock()) {}
 627   }
 628
 629   // Atomically unlock the write-lock from writer and acquire the read-lock.
 630   void unlock_and_lock_shared() {
 631     QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
 632   }
 633
 634   // Release writer permission on the lock.
 635   void unlock() {
 636     RWTicket t;
 637     t.whole = load_acquire(&ticket.whole);
 638     FullInt old = t.whole;
 639
 640 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 641     // SSE2 can reduce the lock and unlock overhead by 10%
 642     static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
 643     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 644     __m128i m = IntTraitType::fromInteger(old);
 645     t.whole = IntTraitType::addParallel(m, kDelta);
 646 #else
 647     ++t.read;
 648     ++t.write;
 649 #endif
 650     store_release(&ticket.readWrite, t.readWrite);
 651   }
 652
 653   void lock_shared() {
 654     // std::this_thread::yield() is important here because we can't grab the
 655     // shared lock if there is a pending writeLockAggressive, so we
 656     // need to let threads that already have a shared lock complete
 657     int count = 0;
 658     while (!LIKELY(try_lock_shared())) {
 659       asm_volatile_pause();
 660       if (UNLIKELY((++count & 1023) == 0)) std::this_thread::yield();
 661     }
 662   }
 663
 664   bool try_lock_shared() {
 665     RWTicket t, old;
 666     old.whole = t.whole = load_acquire(&ticket.whole);
 667     old.users = old.read;
 668 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 669     // SSE2 may reduce the total lock and unlock overhead by 10%
 670     static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
 671     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 672     __m128i m = IntTraitType::fromInteger(old.whole);
 673     t.whole = IntTraitType::addParallel(m, kDelta);
 674 #else
 675     ++t.read;
 676     ++t.users;
 677 #endif
 678     return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
 679   }
 680
 681   void unlock_shared() {
 682     QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
 683   }
 684
 685   class WriteHolder;
 686
 687   typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
 688   class ReadHolder {
 689    public:
 690     ReadHolder(ReadHolder const&) = delete;
 691     ReadHolder& operator=(ReadHolder const&) = delete;
 692
 693     explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
 694       if (lock_) lock_->lock_shared();
 695     }
 696
 697     explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
 698       if (lock_) lock_->lock_shared();
 699     }
 700
 701     // atomically unlock the write-lock from writer and acquire the read-lock
 702     explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
 703       std::swap(this->lock_, writer->lock_);
 704       if (lock_) {
 705         lock_->unlock_and_lock_shared();
 706       }
 707     }
 708
 709     ~ReadHolder() {
 710       if (lock_) lock_->unlock_shared();
 711     }
 712
 713     void reset(RWSpinLock *lock = nullptr) {
 714       if (lock_) lock_->unlock_shared();
 715       lock_ = lock;
 716       if (lock_) lock_->lock_shared();
 717     }
 718
 719     void swap(ReadHolder *other) {
 720       std::swap(this->lock_, other->lock_);
 721     }
 722
 723    private:
 724     RWSpinLock *lock_;
 725   };
 726
 727   class WriteHolder {
 728    public:
 729     WriteHolder(WriteHolder const&) = delete;
 730     WriteHolder& operator=(WriteHolder const&) = delete;
 731
 732     explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
 733       if (lock_) lock_->lock();
 734     }
 735     explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
 736       if (lock_) lock_->lock();
 737     }
 738
 739     ~WriteHolder() {
 740       if (lock_) lock_->unlock();
 741     }
 742
 743     void reset(RWSpinLock *lock = nullptr) {
 744       if (lock == lock_) return;
 745       if (lock_) lock_->unlock();
 746       lock_ = lock;
 747       if (lock_) lock_->lock();
 748     }
 749
 750     void swap(WriteHolder *other) {
 751       std::swap(this->lock_, other->lock_);
 752     }
 753
 754    private:
 755     friend class ReadHolder;
 756     RWSpinLock *lock_;
 757   };
 758 };
 759
 760 typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
 761 typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
 762
 763 #endif  // RW_SPINLOCK_USE_X86_INTRINSIC_
 764
 765 }  // namespace folly
 766
 767 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 768 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 769 #endif