folly/RWSpinLock.h

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * N.B. You most likely do _not_ want to use RWSpinLock or any other
  19  * kind of spinlock.  Use SharedMutex instead.
  20  *
  21  * In short, spinlocks in preemptive multi-tasking operating systems
  22  * have serious problems and fast mutexes like SharedMutex are almost
  23  * certainly the better choice, because letting the OS scheduler put a
  24  * thread to sleep is better for system responsiveness and throughput
  25  * than wasting a timeslice repeatedly querying a lock held by a
  26  * thread that's blocked, and you can't prevent userspace
  27  * programs blocking.
  28  *
  29  * Spinlocks in an operating system kernel make much more sense than
  30  * they do in userspace.
  31  *
  32  * -------------------------------------------------------------------
  33  *
  34  * Two Read-Write spin lock implementations.
  35  *
  36  *  Ref: http://locklessinc.com/articles/locks
  37  *
  38  *  Both locks here are faster than pthread_rwlock and have very low
  39  *  overhead (usually 20-30ns).  They don't use any system mutexes and
  40  *  are very compact (4/8 bytes), so are suitable for per-instance
  41  *  based locking, particularly when contention is not expected.
  42  *
  43  *  For a spinlock, RWSpinLock is a reasonable choice.  (See the note
  44  *  about for why a spin lock is frequently a bad idea generally.)
  45  *  RWSpinLock has minimal overhead, and comparable contention
  46  *  performance when the number of competing threads is less than or
  47  *  equal to the number of logical CPUs.  Even as the number of
  48  *  threads gets larger, RWSpinLock can still be very competitive in
  49  *  READ, although it is slower on WRITE, and also inherently unfair
  50  *  to writers.
  51  *
  52  *  RWTicketSpinLock shows more balanced READ/WRITE performance.  If
  53  *  your application really needs a lot more threads, and a
  54  *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
  55  *
  56  *  Caveats:
  57  *
  58  *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
  59  *    based systems.
  60  *
  61  *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
  62  *    readers and writers.
  63  *
  64  *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
  65  *    readers and writers.
  66  *
  67  *    RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
  68  *    writer priority) is NOT reentrant, even for lock_shared().
  69  *
  70  *    The lock will not grant any new shared (read) accesses while a thread
  71  *    attempting to acquire the lock in write mode is blocked. (That is,
  72  *    if the lock is held in shared mode by N threads, and a thread attempts
  73  *    to acquire it in write mode, no one else can acquire it in shared mode
  74  *    until these N threads release the lock and then the blocked thread
  75  *    acquires and releases the exclusive lock.) This also applies for
  76  *    attempts to reacquire the lock in shared mode by threads that already
  77  *    hold it in shared mode, making the lock non-reentrant.
  78  *
  79  *    RWSpinLock handles 2^30 - 1 concurrent readers.
  80  *
  81  * @author Xin Liu <xliux@fb.com>
  82  */
  83
  84 #ifndef FOLLY_RWSPINLOCK_H_
  85 #define FOLLY_RWSPINLOCK_H_
  86
  87 /*
  88 ========================================================================
  89 Benchmark on (Intel(R) Xeon(R) CPU  L5630  @ 2.13GHz)  8 cores(16 HTs)
  90 ========================================================================
  91
  92 ------------------------------------------------------------------------------
  93 1. Single thread benchmark (read/write lock + unlock overhead)
  94 Benchmark                                    Iters   Total t    t/iter iter/sec
  95 -------------------------------------------------------------------------------
  96 *      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
  97 +30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
  98 +85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
  99 +96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
 100 +85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
 101 +96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
 102 +85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
 103 +29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
 104 +85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
 105 +30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
 106 + 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
 107 + 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M
 108
 109 ------------------------------------------------------------------------------
 110 2. Contention Benchmark      90% read  10% write
 111 Benchmark                    hits       average    min       max        sigma
 112 ------------------------------------------------------------------------------
 113 ---------- 8  threads ------------
 114 RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
 115 RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
 116 RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
 117 RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
 118 pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
 119 pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us
 120
 121 ---------- 16 threads ------------
 122 RWSpinLock       Write       124236     237ns      78ns      261us      801ns
 123 RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
 124 RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
 125 RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
 126 pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
 127 pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us
 128
 129 ---------- 50 threads ------------
 130 RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
 131 RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
 132 RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
 133 RWTicketSpinLock Read        744133     386ns      78ns        11ms     31.4us
 134 pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
 135 pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us
 136
 137 */
 138
 139 #include <folly/Portability.h>
 140
 141 #if defined(__GNUC__) && \
 142   (defined(__i386) || FOLLY_X64 || \
 143    defined(ARCH_K8))
 144 # define RW_SPINLOCK_USE_X86_INTRINSIC_
 145 # include <x86intrin.h>
 146 #elif defined(_MSC_VER) && defined(FOLLY_X64)
 147 # define RW_SPINLOCK_USE_X86_INTRINSIC_
 148 #else
 149 # undef RW_SPINLOCK_USE_X86_INTRINSIC_
 150 #endif
 151
 152 // iOS doesn't define _mm_cvtsi64_si128 and friends
 153 #if (FOLLY_SSE >= 2) && !TARGET_OS_IPHONE
 154 #define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 155 #else
 156 #undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 157 #endif
 158
 159 #include <atomic>
 160 #include <string>
 161 #include <algorithm>
 162
 163 #include <sched.h>
 164 #include <glog/logging.h>
 165
 166 #include <folly/Likely.h>
 167
 168 namespace folly {
 169
 170 /*
 171  * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 172  * a nice writer and don't expect a lot of write/read contention, or
 173  * when you need small rwlocks since you are creating a large number
 174  * of them.
 175  *
 176  * Note that the unfairness here is extreme: if the lock is
 177  * continually accessed for read, writers will never get a chance.  If
 178  * the lock can be that highly contended this class is probably not an
 179  * ideal choice anyway.
 180  *
 181  * It currently implements most of the Lockable, SharedLockable and
 182  * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 183  * interfaces.
 184  */
 185 class RWSpinLock {
 186   enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
 187  public:
 188   constexpr RWSpinLock() : bits_(0) {}
 189
 190   RWSpinLock(RWSpinLock const&) = delete;
 191   RWSpinLock& operator=(RWSpinLock const&) = delete;
 192
 193   // Lockable Concept
 194   void lock() {
 195     int count = 0;
 196     while (!LIKELY(try_lock())) {
 197       if (++count > 1000) sched_yield();
 198     }
 199   }
 200
 201   // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
 202   void unlock() {
 203     static_assert(READER > WRITER + UPGRADED, "wrong bits!");
 204     bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
 205   }
 206
 207   // SharedLockable Concept
 208   void lock_shared() {
 209     int count = 0;
 210     while (!LIKELY(try_lock_shared())) {
 211       if (++count > 1000) sched_yield();
 212     }
 213   }
 214
 215   void unlock_shared() {
 216     bits_.fetch_add(-READER, std::memory_order_release);
 217   }
 218
 219   // Downgrade the lock from writer status to reader status.
 220   void unlock_and_lock_shared() {
 221     bits_.fetch_add(READER, std::memory_order_acquire);
 222     unlock();
 223   }
 224
 225   // UpgradeLockable Concept
 226   void lock_upgrade() {
 227     int count = 0;
 228     while (!try_lock_upgrade()) {
 229       if (++count > 1000) sched_yield();
 230     }
 231   }
 232
 233   void unlock_upgrade() {
 234     bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
 235   }
 236
 237   // unlock upgrade and try to acquire write lock
 238   void unlock_upgrade_and_lock() {
 239     int64_t count = 0;
 240     while (!try_unlock_upgrade_and_lock()) {
 241       if (++count > 1000) sched_yield();
 242     }
 243   }
 244
 245   // unlock upgrade and read lock atomically
 246   void unlock_upgrade_and_lock_shared() {
 247     bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
 248   }
 249
 250   // write unlock and upgrade lock atomically
 251   void unlock_and_lock_upgrade() {
 252     // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
 253     // the same time when other threads are trying do try_lock_upgrade().
 254     bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 255     bits_.fetch_add(-WRITER, std::memory_order_release);
 256   }
 257
 258
 259   // Attempt to acquire writer permission. Return false if we didn't get it.
 260   bool try_lock() {
 261     int32_t expect = 0;
 262     return bits_.compare_exchange_strong(expect, WRITER,
 263       std::memory_order_acq_rel);
 264   }
 265
 266   // Try to get reader permission on the lock. This can fail if we
 267   // find out someone is a writer or upgrader.
 268   // Setting the UPGRADED bit would allow a writer-to-be to indicate
 269   // its intention to write and block any new readers while waiting
 270   // for existing readers to finish and release their read locks. This
 271   // helps avoid starving writers (promoted from upgraders).
 272   bool try_lock_shared() {
 273     // fetch_add is considerably (100%) faster than compare_exchange,
 274     // so here we are optimizing for the common (lock success) case.
 275     int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
 276     if (UNLIKELY(value & (WRITER|UPGRADED))) {
 277       bits_.fetch_add(-READER, std::memory_order_release);
 278       return false;
 279     }
 280     return true;
 281   }
 282
 283   // try to unlock upgrade and write lock atomically
 284   bool try_unlock_upgrade_and_lock() {
 285     int32_t expect = UPGRADED;
 286     return bits_.compare_exchange_strong(expect, WRITER,
 287         std::memory_order_acq_rel);
 288   }
 289
 290   // try to acquire an upgradable lock.
 291   bool try_lock_upgrade() {
 292     int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 293
 294     // Note: when failed, we cannot flip the UPGRADED bit back,
 295     // as in this case there is either another upgrade lock or a write lock.
 296     // If it's a write lock, the bit will get cleared up when that lock's done
 297     // with unlock().
 298     return ((value & (UPGRADED | WRITER)) == 0);
 299   }
 300
 301   // mainly for debugging purposes.
 302   int32_t bits() const { return bits_.load(std::memory_order_acquire); }
 303
 304   class ReadHolder;
 305   class UpgradedHolder;
 306   class WriteHolder;
 307
 308   class ReadHolder {
 309    public:
 310     explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 311       if (lock_) lock_->lock_shared();
 312     }
 313
 314     explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
 315       lock_->lock_shared();
 316     }
 317
 318     ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
 319       other.lock_ = nullptr;
 320     }
 321
 322     // down-grade
 323     explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
 324       upgraded.lock_ = nullptr;
 325       if (lock_) lock_->unlock_upgrade_and_lock_shared();
 326     }
 327
 328     explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
 329       writer.lock_ = nullptr;
 330       if (lock_) lock_->unlock_and_lock_shared();
 331     }
 332
 333     ReadHolder& operator=(ReadHolder&& other) {
 334       using std::swap;
 335       swap(lock_, other.lock_);
 336       return *this;
 337     }
 338
 339     ReadHolder(const ReadHolder& other) = delete;
 340     ReadHolder& operator=(const ReadHolder& other) = delete;
 341
 342     ~ReadHolder() { if (lock_) lock_->unlock_shared(); }
 343
 344     void reset(RWSpinLock* lock = nullptr) {
 345       if (lock == lock_) return;
 346       if (lock_) lock_->unlock_shared();
 347       lock_ = lock;
 348       if (lock_) lock_->lock_shared();
 349     }
 350
 351     void swap(ReadHolder* other) {
 352       std::swap(lock_, other->lock_);
 353     }
 354
 355    private:
 356     friend class UpgradedHolder;
 357     friend class WriteHolder;
 358     RWSpinLock* lock_;
 359   };
 360
 361   class UpgradedHolder {
 362    public:
 363     explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 364       if (lock_) lock_->lock_upgrade();
 365     }
 366
 367     explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
 368       lock_->lock_upgrade();
 369     }
 370
 371     explicit UpgradedHolder(WriteHolder&& writer) {
 372       lock_ = writer.lock_;
 373       writer.lock_ = nullptr;
 374       if (lock_) lock_->unlock_and_lock_upgrade();
 375     }
 376
 377     UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
 378       other.lock_ = nullptr;
 379     }
 380
 381     UpgradedHolder& operator =(UpgradedHolder&& other) {
 382       using std::swap;
 383       swap(lock_, other.lock_);
 384       return *this;
 385     }
 386
 387     UpgradedHolder(const UpgradedHolder& other) = delete;
 388     UpgradedHolder& operator =(const UpgradedHolder& other) = delete;
 389
 390     ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }
 391
 392     void reset(RWSpinLock* lock = nullptr) {
 393       if (lock == lock_) return;
 394       if (lock_) lock_->unlock_upgrade();
 395       lock_ = lock;
 396       if (lock_) lock_->lock_upgrade();
 397     }
 398
 399     void swap(UpgradedHolder* other) {
 400       using std::swap;
 401       swap(lock_, other->lock_);
 402     }
 403
 404    private:
 405     friend class WriteHolder;
 406     friend class ReadHolder;
 407     RWSpinLock* lock_;
 408   };
 409
 410   class WriteHolder {
 411    public:
 412     explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 413       if (lock_) lock_->lock();
 414     }
 415
 416     explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
 417       lock_->lock();
 418     }
 419
 420     // promoted from an upgrade lock holder
 421     explicit WriteHolder(UpgradedHolder&& upgraded) {
 422       lock_ = upgraded.lock_;
 423       upgraded.lock_ = nullptr;
 424       if (lock_) lock_->unlock_upgrade_and_lock();
 425     }
 426
 427     WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
 428       other.lock_ = nullptr;
 429     }
 430
 431     WriteHolder& operator =(WriteHolder&& other) {
 432       using std::swap;
 433       swap(lock_, other.lock_);
 434       return *this;
 435     }
 436
 437     WriteHolder(const WriteHolder& other) = delete;
 438     WriteHolder& operator =(const WriteHolder& other) = delete;
 439
 440     ~WriteHolder () { if (lock_) lock_->unlock(); }
 441
 442     void reset(RWSpinLock* lock = nullptr) {
 443       if (lock == lock_) return;
 444       if (lock_) lock_->unlock();
 445       lock_ = lock;
 446       if (lock_) lock_->lock();
 447     }
 448
 449     void swap(WriteHolder* other) {
 450       using std::swap;
 451       swap(lock_, other->lock_);
 452     }
 453
 454    private:
 455     friend class ReadHolder;
 456     friend class UpgradedHolder;
 457     RWSpinLock* lock_;
 458   };
 459
 460   // Synchronized<> adaptors
 461   friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
 462   friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
 463   friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
 464   friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }
 465
 466  private:
 467   std::atomic<int32_t> bits_;
 468 };
 469
 470
 471 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 472 // A more balanced Read-Write spin lock implemented based on GCC intrinsics.
 473
 474 namespace detail {
 475 template <size_t kBitWidth> struct RWTicketIntTrait {
 476   static_assert(kBitWidth == 32 || kBitWidth == 64,
 477       "bit width has to be either 32 or 64 ");
 478 };
 479
 480 template <>
 481 struct RWTicketIntTrait<64> {
 482   typedef uint64_t FullInt;
 483   typedef uint32_t HalfInt;
 484   typedef uint16_t QuarterInt;
 485
 486 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 487   static __m128i make128(const uint16_t v[4]) {
 488     return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 489   }
 490   static inline __m128i fromInteger(uint64_t from) {
 491     return _mm_cvtsi64_si128(from);
 492   }
 493   static inline uint64_t toInteger(__m128i in) {
 494     return _mm_cvtsi128_si64(in);
 495   }
 496   static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
 497     return toInteger(_mm_add_epi16(in, kDelta));
 498   }
 499 #endif
 500 };
 501
 502 template <>
 503 struct RWTicketIntTrait<32> {
 504   typedef uint32_t FullInt;
 505   typedef uint16_t HalfInt;
 506   typedef uint8_t QuarterInt;
 507
 508 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 509   static __m128i make128(const uint8_t v[4]) {
 510     return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 511         0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 512   }
 513   static inline __m128i fromInteger(uint32_t from) {
 514     return _mm_cvtsi32_si128(from);
 515   }
 516   static inline uint32_t toInteger(__m128i in) {
 517     return _mm_cvtsi128_si32(in);
 518   }
 519   static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
 520     return toInteger(_mm_add_epi8(in, kDelta));
 521   }
 522 #endif
 523 };
 524 }  // detail
 525
 526
 527 template<size_t kBitWidth, bool kFavorWriter=false>
 528 class RWTicketSpinLockT {
 529   typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
 530   typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
 531   typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
 532   typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
 533     QuarterInt;
 534
 535   union RWTicket {
 536     constexpr RWTicket() : whole(0) {}
 537     FullInt whole;
 538     HalfInt readWrite;
 539     __extension__ struct {
 540       QuarterInt write;
 541       QuarterInt read;
 542       QuarterInt users;
 543     };
 544   } ticket;
 545
 546  private: // Some x64-specific utilities for atomic access to ticket.
 547   template<class T> static T load_acquire(T* addr) {
 548     T t = *addr; // acquire barrier
 549     asm_volatile_memory();
 550     return t;
 551   }
 552
 553   template<class T>
 554   static void store_release(T* addr, T v) {
 555     asm_volatile_memory();
 556     *addr = v; // release barrier
 557   }
 558
 559  public:
 560
 561   constexpr RWTicketSpinLockT() {}
 562
 563   RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
 564   RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;
 565
 566   void lock() {
 567     if (kFavorWriter) {
 568       writeLockAggressive();
 569     } else {
 570       writeLockNice();
 571     }
 572   }
 573
 574   /*
 575    * Both try_lock and try_lock_shared diverge in our implementation from the
 576    * lock algorithm described in the link above.
 577    *
 578    * In the read case, it is undesirable that the readers could wait
 579    * for another reader (before increasing ticket.read in the other
 580    * implementation).  Our approach gives up on
 581    * first-come-first-serve, but our benchmarks showed improve
 582    * performance for both readers and writers under heavily contended
 583    * cases, particularly when the number of threads exceeds the number
 584    * of logical CPUs.
 585    *
 586    * We have writeLockAggressive() using the original implementation
 587    * for a writer, which gives some advantage to the writer over the
 588    * readers---for that path it is guaranteed that the writer will
 589    * acquire the lock after all the existing readers exit.
 590    */
 591   bool try_lock() {
 592     RWTicket t;
 593     FullInt old = t.whole = load_acquire(&ticket.whole);
 594     if (t.users != t.write) return false;
 595     ++t.users;
 596     return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
 597   }
 598
 599   /*
 600    * Call this if you want to prioritize writer to avoid starvation.
 601    * Unlike writeLockNice, immediately acquires the write lock when
 602    * the existing readers (arriving before the writer) finish their
 603    * turns.
 604    */
 605   void writeLockAggressive() {
 606     // sched_yield() is needed here to avoid a pathology if the number
 607     // of threads attempting concurrent writes is >= the number of real
 608     // cores allocated to this process. This is less likely than the
 609     // corresponding situation in lock_shared(), but we still want to
 610     // avoid it
 611     int count = 0;
 612     QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
 613     while (val != load_acquire(&ticket.write)) {
 614       asm_volatile_pause();
 615       if (UNLIKELY(++count > 1000)) sched_yield();
 616     }
 617   }
 618
 619   // Call this when the writer should be nicer to the readers.
 620   void writeLockNice() {
 621     // Here it doesn't cpu-relax the writer.
 622     //
 623     // This is because usually we have many more readers than the
 624     // writers, so the writer has less chance to get the lock when
 625     // there are a lot of competing readers.  The aggressive spinning
 626     // can help to avoid starving writers.
 627     //
 628     // We don't worry about sched_yield() here because the caller
 629     // has already explicitly abandoned fairness.
 630     while (!try_lock()) {}
 631   }
 632
 633   // Atomically unlock the write-lock from writer and acquire the read-lock.
 634   void unlock_and_lock_shared() {
 635     QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
 636   }
 637
 638   // Release writer permission on the lock.
 639   void unlock() {
 640     RWTicket t;
 641     t.whole = load_acquire(&ticket.whole);
 642     FullInt old = t.whole;
 643
 644 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 645     // SSE2 can reduce the lock and unlock overhead by 10%
 646     static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
 647     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 648     __m128i m = IntTraitType::fromInteger(old);
 649     t.whole = IntTraitType::addParallel(m, kDelta);
 650 #else
 651     ++t.read;
 652     ++t.write;
 653 #endif
 654     store_release(&ticket.readWrite, t.readWrite);
 655   }
 656
 657   void lock_shared() {
 658     // sched_yield() is important here because we can't grab the
 659     // shared lock if there is a pending writeLockAggressive, so we
 660     // need to let threads that already have a shared lock complete
 661     int count = 0;
 662     while (!LIKELY(try_lock_shared())) {
 663       asm_volatile_pause();
 664       if (UNLIKELY((++count & 1023) == 0)) sched_yield();
 665     }
 666   }
 667
 668   bool try_lock_shared() {
 669     RWTicket t, old;
 670     old.whole = t.whole = load_acquire(&ticket.whole);
 671     old.users = old.read;
 672 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 673     // SSE2 may reduce the total lock and unlock overhead by 10%
 674     static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
 675     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 676     __m128i m = IntTraitType::fromInteger(old.whole);
 677     t.whole = IntTraitType::addParallel(m, kDelta);
 678 #else
 679     ++t.read;
 680     ++t.users;
 681 #endif
 682     return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
 683   }
 684
 685   void unlock_shared() {
 686     QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
 687   }
 688
 689   class WriteHolder;
 690
 691   typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
 692   class ReadHolder {
 693    public:
 694     ReadHolder(ReadHolder const&) = delete;
 695     ReadHolder& operator=(ReadHolder const&) = delete;
 696
 697     explicit ReadHolder(RWSpinLock *lock = nullptr) :
 698       lock_(lock) {
 699       if (lock_) lock_->lock_shared();
 700     }
 701
 702     explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
 703       if (lock_) lock_->lock_shared();
 704     }
 705
 706     // atomically unlock the write-lock from writer and acquire the read-lock
 707     explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
 708       std::swap(this->lock_, writer->lock_);
 709       if (lock_) {
 710         lock_->unlock_and_lock_shared();
 711       }
 712     }
 713
 714     ~ReadHolder() {
 715       if (lock_) lock_->unlock_shared();
 716     }
 717
 718     void reset(RWSpinLock *lock = nullptr) {
 719       if (lock_) lock_->unlock_shared();
 720       lock_ = lock;
 721       if (lock_) lock_->lock_shared();
 722     }
 723
 724     void swap(ReadHolder *other) {
 725       std::swap(this->lock_, other->lock_);
 726     }
 727
 728    private:
 729     RWSpinLock *lock_;
 730   };
 731
 732   class WriteHolder {
 733    public:
 734     WriteHolder(WriteHolder const&) = delete;
 735     WriteHolder& operator=(WriteHolder const&) = delete;
 736
 737     explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
 738       if (lock_) lock_->lock();
 739     }
 740     explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
 741       if (lock_) lock_->lock();
 742     }
 743
 744     ~WriteHolder() {
 745       if (lock_) lock_->unlock();
 746     }
 747
 748     void reset(RWSpinLock *lock = nullptr) {
 749       if (lock == lock_) return;
 750       if (lock_) lock_->unlock();
 751       lock_ = lock;
 752       if (lock_) lock_->lock();
 753     }
 754
 755     void swap(WriteHolder *other) {
 756       std::swap(this->lock_, other->lock_);
 757     }
 758
 759    private:
 760     friend class ReadHolder;
 761     RWSpinLock *lock_;
 762   };
 763
 764   // Synchronized<> adaptors.
 765   friend void acquireRead(RWTicketSpinLockT& mutex) {
 766     mutex.lock_shared();
 767   }
 768   friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
 769     mutex.lock();
 770   }
 771   friend void releaseRead(RWTicketSpinLockT& mutex) {
 772     mutex.unlock_shared();
 773   }
 774   friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
 775     mutex.unlock();
 776   }
 777 };
 778
 779 typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
 780 typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
 781
 782 #endif  // RW_SPINLOCK_USE_X86_INTRINSIC_
 783
 784 }  // namespace folly
 785
 786 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 787 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 788 #endif
 789
 790 #endif  // FOLLY_RWSPINLOCK_H_