folly/RWSpinLock.h

   1 /*
   2  * Copyright 2015 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * Two Read-Write spin lock implementations.
  19  *
  20  *  Ref: http://locklessinc.com/articles/locks
  21  *
  22  *  Both locks here are faster than pthread_rwlock and have very low
  23  *  overhead (usually 20-30ns).  They don't use any system mutexes and
  24  *  are very compact (4/8 bytes), so are suitable for per-instance
  25  *  based locking, particularly when contention is not expected.
  26  *
  27  *  In most cases, RWSpinLock is a reasonable choice.  It has minimal
  28  *  overhead, and comparable contention performance when the number of
  29  *  competing threads is less than or equal to the number of logical
  30  *  CPUs.  Even as the number of threads gets larger, RWSpinLock can
  31  *  still be very competitive in READ, although it is slower on WRITE,
  32  *  and also inherently unfair to writers.
  33  *
  34  *  RWTicketSpinLock shows more balanced READ/WRITE performance.  If
  35  *  your application really needs a lot more threads, and a
  36  *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
  37  *
  38  *  Caveats:
  39  *
  40  *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
  41  *    based systems.
  42  *
  43  *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
  44  *    readers and writers.
  45  *
  46  *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
  47  *    readers and writers.
  48  *
  49  *    RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
  50  *    writer priority) is NOT reentrant, even for lock_shared().
  51  *
  52  *    The lock will not grant any new shared (read) accesses while a thread
  53  *    attempting to acquire the lock in write mode is blocked. (That is,
  54  *    if the lock is held in shared mode by N threads, and a thread attempts
  55  *    to acquire it in write mode, no one else can acquire it in shared mode
  56  *    until these N threads release the lock and then the blocked thread
  57  *    acquires and releases the exclusive lock.) This also applies for
  58  *    attempts to reacquire the lock in shared mode by threads that already
  59  *    hold it in shared mode, making the lock non-reentrant.
  60  *
  61  *    RWSpinLock handles 2^30 - 1 concurrent readers.
  62  *
  63  * @author Xin Liu <xliux@fb.com>
  64  */
  65
  66 #ifndef FOLLY_RWSPINLOCK_H_
  67 #define FOLLY_RWSPINLOCK_H_
  68
  69 /*
  70 ========================================================================
  71 Benchmark on (Intel(R) Xeon(R) CPU  L5630  @ 2.13GHz)  8 cores(16 HTs)
  72 ========================================================================
  73
  74 ------------------------------------------------------------------------------
  75 1. Single thread benchmark (read/write lock + unlock overhead)
  76 Benchmark                                    Iters   Total t    t/iter iter/sec
  77 -------------------------------------------------------------------------------
  78 *      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
  79 +30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
  80 +85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
  81 +96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
  82 +85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
  83 +96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
  84 +85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
  85 +29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
  86 +85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
  87 +30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
  88 + 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
  89 + 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M
  90
  91 ------------------------------------------------------------------------------
  92 2. Contention Benchmark      90% read  10% write
  93 Benchmark                    hits       average    min       max        sigma
  94 ------------------------------------------------------------------------------
  95 ---------- 8  threads ------------
  96 RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
  97 RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
  98 RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
  99 RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
 100 pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
 101 pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us
 102
 103 ---------- 16 threads ------------
 104 RWSpinLock       Write       124236     237ns      78ns      261us      801ns
 105 RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
 106 RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
 107 RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
 108 pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
 109 pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us
 110
 111 ---------- 50 threads ------------
 112 RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
 113 RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
 114 RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
 115 RWTicketSpinLock Read        744133     386ns      78ns        11ms     31.4us
 116 pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
 117 pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us
 118
 119 */
 120
 121 #include <folly/Portability.h>
 122
 123 #if defined(__GNUC__) && \
 124   (defined(__i386) || FOLLY_X64 || \
 125    defined(ARCH_K8))
 126 # define RW_SPINLOCK_USE_X86_INTRINSIC_
 127 # include <x86intrin.h>
 128 #elif defined(_MSC_VER) && defined(FOLLY_X64)
 129 # define RW_SPINLOCK_USE_X86_INTRINSIC_
 130 #else
 131 # undef RW_SPINLOCK_USE_X86_INTRINSIC_
 132 #endif
 133
 134 // iOS doesn't define _mm_cvtsi64_si128 and friends
 135 #if (FOLLY_SSE >= 2) && !TARGET_OS_IPHONE
 136 #define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 137 #else
 138 #undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 139 #endif
 140
 141 #include <atomic>
 142 #include <string>
 143 #include <algorithm>
 144
 145 #include <sched.h>
 146 #include <glog/logging.h>
 147
 148 #include <folly/Likely.h>
 149
 150 namespace folly {
 151
 152 /*
 153  * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 154  * a nice writer and don't expect a lot of write/read contention, or
 155  * when you need small rwlocks since you are creating a large number
 156  * of them.
 157  *
 158  * Note that the unfairness here is extreme: if the lock is
 159  * continually accessed for read, writers will never get a chance.  If
 160  * the lock can be that highly contended this class is probably not an
 161  * ideal choice anyway.
 162  *
 163  * It currently implements most of the Lockable, SharedLockable and
 164  * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 165  * interfaces.
 166  */
 167 class RWSpinLock {
 168   enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
 169  public:
 170   constexpr RWSpinLock() : bits_(0) {}
 171
 172   RWSpinLock(RWSpinLock const&) = delete;
 173   RWSpinLock& operator=(RWSpinLock const&) = delete;
 174
 175   // Lockable Concept
 176   void lock() {
 177     int count = 0;
 178     while (!LIKELY(try_lock())) {
 179       if (++count > 1000) sched_yield();
 180     }
 181   }
 182
 183   // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
 184   void unlock() {
 185     static_assert(READER > WRITER + UPGRADED, "wrong bits!");
 186     bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
 187   }
 188
 189   // SharedLockable Concept
 190   void lock_shared() {
 191     int count = 0;
 192     while (!LIKELY(try_lock_shared())) {
 193       if (++count > 1000) sched_yield();
 194     }
 195   }
 196
 197   void unlock_shared() {
 198     bits_.fetch_add(-READER, std::memory_order_release);
 199   }
 200
 201   // Downgrade the lock from writer status to reader status.
 202   void unlock_and_lock_shared() {
 203     bits_.fetch_add(READER, std::memory_order_acquire);
 204     unlock();
 205   }
 206
 207   // UpgradeLockable Concept
 208   void lock_upgrade() {
 209     int count = 0;
 210     while (!try_lock_upgrade()) {
 211       if (++count > 1000) sched_yield();
 212     }
 213   }
 214
 215   void unlock_upgrade() {
 216     bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
 217   }
 218
 219   // unlock upgrade and try to acquire write lock
 220   void unlock_upgrade_and_lock() {
 221     int64_t count = 0;
 222     while (!try_unlock_upgrade_and_lock()) {
 223       if (++count > 1000) sched_yield();
 224     }
 225   }
 226
 227   // unlock upgrade and read lock atomically
 228   void unlock_upgrade_and_lock_shared() {
 229     bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
 230   }
 231
 232   // write unlock and upgrade lock atomically
 233   void unlock_and_lock_upgrade() {
 234     // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
 235     // the same time when other threads are trying do try_lock_upgrade().
 236     bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 237     bits_.fetch_add(-WRITER, std::memory_order_release);
 238   }
 239
 240
 241   // Attempt to acquire writer permission. Return false if we didn't get it.
 242   bool try_lock() {
 243     int32_t expect = 0;
 244     return bits_.compare_exchange_strong(expect, WRITER,
 245       std::memory_order_acq_rel);
 246   }
 247
 248   // Try to get reader permission on the lock. This can fail if we
 249   // find out someone is a writer or upgrader.
 250   // Setting the UPGRADED bit would allow a writer-to-be to indicate
 251   // its intention to write and block any new readers while waiting
 252   // for existing readers to finish and release their read locks. This
 253   // helps avoid starving writers (promoted from upgraders).
 254   bool try_lock_shared() {
 255     // fetch_add is considerably (100%) faster than compare_exchange,
 256     // so here we are optimizing for the common (lock success) case.
 257     int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
 258     if (UNLIKELY(value & (WRITER|UPGRADED))) {
 259       bits_.fetch_add(-READER, std::memory_order_release);
 260       return false;
 261     }
 262     return true;
 263   }
 264
 265   // try to unlock upgrade and write lock atomically
 266   bool try_unlock_upgrade_and_lock() {
 267     int32_t expect = UPGRADED;
 268     return bits_.compare_exchange_strong(expect, WRITER,
 269         std::memory_order_acq_rel);
 270   }
 271
 272   // try to acquire an upgradable lock.
 273   bool try_lock_upgrade() {
 274     int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 275
 276     // Note: when failed, we cannot flip the UPGRADED bit back,
 277     // as in this case there is either another upgrade lock or a write lock.
 278     // If it's a write lock, the bit will get cleared up when that lock's done
 279     // with unlock().
 280     return ((value & (UPGRADED | WRITER)) == 0);
 281   }
 282
 283   // mainly for debugging purposes.
 284   int32_t bits() const { return bits_.load(std::memory_order_acquire); }
 285
 286   class ReadHolder;
 287   class UpgradedHolder;
 288   class WriteHolder;
 289
 290   class ReadHolder {
 291    public:
 292     explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 293       if (lock_) lock_->lock_shared();
 294     }
 295
 296     explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
 297       lock_->lock_shared();
 298     }
 299
 300     ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
 301       other.lock_ = nullptr;
 302     }
 303
 304     // down-grade
 305     explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
 306       upgraded.lock_ = nullptr;
 307       if (lock_) lock_->unlock_upgrade_and_lock_shared();
 308     }
 309
 310     explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
 311       writer.lock_ = nullptr;
 312       if (lock_) lock_->unlock_and_lock_shared();
 313     }
 314
 315     ReadHolder& operator=(ReadHolder&& other) {
 316       using std::swap;
 317       swap(lock_, other.lock_);
 318       return *this;
 319     }
 320
 321     ReadHolder(const ReadHolder& other) = delete;
 322     ReadHolder& operator=(const ReadHolder& other) = delete;
 323
 324     ~ReadHolder() { if (lock_) lock_->unlock_shared(); }
 325
 326     void reset(RWSpinLock* lock = nullptr) {
 327       if (lock == lock_) return;
 328       if (lock_) lock_->unlock_shared();
 329       lock_ = lock;
 330       if (lock_) lock_->lock_shared();
 331     }
 332
 333     void swap(ReadHolder* other) {
 334       std::swap(lock_, other->lock_);
 335     }
 336
 337    private:
 338     friend class UpgradedHolder;
 339     friend class WriteHolder;
 340     RWSpinLock* lock_;
 341   };
 342
 343   class UpgradedHolder {
 344    public:
 345     explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 346       if (lock_) lock_->lock_upgrade();
 347     }
 348
 349     explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
 350       lock_->lock_upgrade();
 351     }
 352
 353     explicit UpgradedHolder(WriteHolder&& writer) {
 354       lock_ = writer.lock_;
 355       writer.lock_ = nullptr;
 356       if (lock_) lock_->unlock_and_lock_upgrade();
 357     }
 358
 359     UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
 360       other.lock_ = nullptr;
 361     }
 362
 363     UpgradedHolder& operator =(UpgradedHolder&& other) {
 364       using std::swap;
 365       swap(lock_, other.lock_);
 366       return *this;
 367     }
 368
 369     UpgradedHolder(const UpgradedHolder& other) = delete;
 370     UpgradedHolder& operator =(const UpgradedHolder& other) = delete;
 371
 372     ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }
 373
 374     void reset(RWSpinLock* lock = nullptr) {
 375       if (lock == lock_) return;
 376       if (lock_) lock_->unlock_upgrade();
 377       lock_ = lock;
 378       if (lock_) lock_->lock_upgrade();
 379     }
 380
 381     void swap(UpgradedHolder* other) {
 382       using std::swap;
 383       swap(lock_, other->lock_);
 384     }
 385
 386    private:
 387     friend class WriteHolder;
 388     friend class ReadHolder;
 389     RWSpinLock* lock_;
 390   };
 391
 392   class WriteHolder {
 393    public:
 394     explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 395       if (lock_) lock_->lock();
 396     }
 397
 398     explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
 399       lock_->lock();
 400     }
 401
 402     // promoted from an upgrade lock holder
 403     explicit WriteHolder(UpgradedHolder&& upgraded) {
 404       lock_ = upgraded.lock_;
 405       upgraded.lock_ = nullptr;
 406       if (lock_) lock_->unlock_upgrade_and_lock();
 407     }
 408
 409     WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
 410       other.lock_ = nullptr;
 411     }
 412
 413     WriteHolder& operator =(WriteHolder&& other) {
 414       using std::swap;
 415       swap(lock_, other.lock_);
 416       return *this;
 417     }
 418
 419     WriteHolder(const WriteHolder& other) = delete;
 420     WriteHolder& operator =(const WriteHolder& other) = delete;
 421
 422     ~WriteHolder () { if (lock_) lock_->unlock(); }
 423
 424     void reset(RWSpinLock* lock = nullptr) {
 425       if (lock == lock_) return;
 426       if (lock_) lock_->unlock();
 427       lock_ = lock;
 428       if (lock_) lock_->lock();
 429     }
 430
 431     void swap(WriteHolder* other) {
 432       using std::swap;
 433       swap(lock_, other->lock_);
 434     }
 435
 436    private:
 437     friend class ReadHolder;
 438     friend class UpgradedHolder;
 439     RWSpinLock* lock_;
 440   };
 441
 442   // Synchronized<> adaptors
 443   friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
 444   friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
 445   friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
 446   friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }
 447
 448  private:
 449   std::atomic<int32_t> bits_;
 450 };
 451
 452
 453 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 454 // A more balanced Read-Write spin lock implemented based on GCC intrinsics.
 455
 456 namespace detail {
 457 template <size_t kBitWidth> struct RWTicketIntTrait {
 458   static_assert(kBitWidth == 32 || kBitWidth == 64,
 459       "bit width has to be either 32 or 64 ");
 460 };
 461
 462 template <>
 463 struct RWTicketIntTrait<64> {
 464   typedef uint64_t FullInt;
 465   typedef uint32_t HalfInt;
 466   typedef uint16_t QuarterInt;
 467
 468 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 469   static __m128i make128(const uint16_t v[4]) {
 470     return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 471   }
 472   static inline __m128i fromInteger(uint64_t from) {
 473     return _mm_cvtsi64_si128(from);
 474   }
 475   static inline uint64_t toInteger(__m128i in) {
 476     return _mm_cvtsi128_si64(in);
 477   }
 478   static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
 479     return toInteger(_mm_add_epi16(in, kDelta));
 480   }
 481 #endif
 482 };
 483
 484 template <>
 485 struct RWTicketIntTrait<32> {
 486   typedef uint32_t FullInt;
 487   typedef uint16_t HalfInt;
 488   typedef uint8_t QuarterInt;
 489
 490 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 491   static __m128i make128(const uint8_t v[4]) {
 492     return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 493         0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 494   }
 495   static inline __m128i fromInteger(uint32_t from) {
 496     return _mm_cvtsi32_si128(from);
 497   }
 498   static inline uint32_t toInteger(__m128i in) {
 499     return _mm_cvtsi128_si32(in);
 500   }
 501   static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
 502     return toInteger(_mm_add_epi8(in, kDelta));
 503   }
 504 #endif
 505 };
 506 }  // detail
 507
 508
 509 template<size_t kBitWidth, bool kFavorWriter=false>
 510 class RWTicketSpinLockT {
 511   typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
 512   typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
 513   typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
 514   typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
 515     QuarterInt;
 516
 517   union RWTicket {
 518     constexpr RWTicket() : whole(0) {}
 519     FullInt whole;
 520     HalfInt readWrite;
 521     __extension__ struct {
 522       QuarterInt write;
 523       QuarterInt read;
 524       QuarterInt users;
 525     };
 526   } ticket;
 527
 528  private: // Some x64-specific utilities for atomic access to ticket.
 529   template<class T> static T load_acquire(T* addr) {
 530     T t = *addr; // acquire barrier
 531     asm_volatile_memory();
 532     return t;
 533   }
 534
 535   template<class T>
 536   static void store_release(T* addr, T v) {
 537     asm_volatile_memory();
 538     *addr = v; // release barrier
 539   }
 540
 541  public:
 542
 543   constexpr RWTicketSpinLockT() {}
 544
 545   RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
 546   RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;
 547
 548   void lock() {
 549     if (kFavorWriter) {
 550       writeLockAggressive();
 551     } else {
 552       writeLockNice();
 553     }
 554   }
 555
 556   /*
 557    * Both try_lock and try_lock_shared diverge in our implementation from the
 558    * lock algorithm described in the link above.
 559    *
 560    * In the read case, it is undesirable that the readers could wait
 561    * for another reader (before increasing ticket.read in the other
 562    * implementation).  Our approach gives up on
 563    * first-come-first-serve, but our benchmarks showed improve
 564    * performance for both readers and writers under heavily contended
 565    * cases, particularly when the number of threads exceeds the number
 566    * of logical CPUs.
 567    *
 568    * We have writeLockAggressive() using the original implementation
 569    * for a writer, which gives some advantage to the writer over the
 570    * readers---for that path it is guaranteed that the writer will
 571    * acquire the lock after all the existing readers exit.
 572    */
 573   bool try_lock() {
 574     RWTicket t;
 575     FullInt old = t.whole = load_acquire(&ticket.whole);
 576     if (t.users != t.write) return false;
 577     ++t.users;
 578     return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
 579   }
 580
 581   /*
 582    * Call this if you want to prioritize writer to avoid starvation.
 583    * Unlike writeLockNice, immediately acquires the write lock when
 584    * the existing readers (arriving before the writer) finish their
 585    * turns.
 586    */
 587   void writeLockAggressive() {
 588     // sched_yield() is needed here to avoid a pathology if the number
 589     // of threads attempting concurrent writes is >= the number of real
 590     // cores allocated to this process. This is less likely than the
 591     // corresponding situation in lock_shared(), but we still want to
 592     // avoid it
 593     int count = 0;
 594     QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
 595     while (val != load_acquire(&ticket.write)) {
 596       asm_volatile_pause();
 597       if (UNLIKELY(++count > 1000)) sched_yield();
 598     }
 599   }
 600
 601   // Call this when the writer should be nicer to the readers.
 602   void writeLockNice() {
 603     // Here it doesn't cpu-relax the writer.
 604     //
 605     // This is because usually we have many more readers than the
 606     // writers, so the writer has less chance to get the lock when
 607     // there are a lot of competing readers.  The aggressive spinning
 608     // can help to avoid starving writers.
 609     //
 610     // We don't worry about sched_yield() here because the caller
 611     // has already explicitly abandoned fairness.
 612     while (!try_lock()) {}
 613   }
 614
 615   // Atomically unlock the write-lock from writer and acquire the read-lock.
 616   void unlock_and_lock_shared() {
 617     QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
 618   }
 619
 620   // Release writer permission on the lock.
 621   void unlock() {
 622     RWTicket t;
 623     t.whole = load_acquire(&ticket.whole);
 624     FullInt old = t.whole;
 625
 626 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 627     // SSE2 can reduce the lock and unlock overhead by 10%
 628     static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
 629     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 630     __m128i m = IntTraitType::fromInteger(old);
 631     t.whole = IntTraitType::addParallel(m, kDelta);
 632 #else
 633     ++t.read;
 634     ++t.write;
 635 #endif
 636     store_release(&ticket.readWrite, t.readWrite);
 637   }
 638
 639   void lock_shared() {
 640     // sched_yield() is important here because we can't grab the
 641     // shared lock if there is a pending writeLockAggressive, so we
 642     // need to let threads that already have a shared lock complete
 643     int count = 0;
 644     while (!LIKELY(try_lock_shared())) {
 645       asm_volatile_pause();
 646       if (UNLIKELY((++count & 1023) == 0)) sched_yield();
 647     }
 648   }
 649
 650   bool try_lock_shared() {
 651     RWTicket t, old;
 652     old.whole = t.whole = load_acquire(&ticket.whole);
 653     old.users = old.read;
 654 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 655     // SSE2 may reduce the total lock and unlock overhead by 10%
 656     static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
 657     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 658     __m128i m = IntTraitType::fromInteger(old.whole);
 659     t.whole = IntTraitType::addParallel(m, kDelta);
 660 #else
 661     ++t.read;
 662     ++t.users;
 663 #endif
 664     return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
 665   }
 666
 667   void unlock_shared() {
 668     QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
 669   }
 670
 671   class WriteHolder;
 672
 673   typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
 674   class ReadHolder {
 675    public:
 676     ReadHolder(ReadHolder const&) = delete;
 677     ReadHolder& operator=(ReadHolder const&) = delete;
 678
 679     explicit ReadHolder(RWSpinLock *lock = nullptr) :
 680       lock_(lock) {
 681       if (lock_) lock_->lock_shared();
 682     }
 683
 684     explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
 685       if (lock_) lock_->lock_shared();
 686     }
 687
 688     // atomically unlock the write-lock from writer and acquire the read-lock
 689     explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
 690       std::swap(this->lock_, writer->lock_);
 691       if (lock_) {
 692         lock_->unlock_and_lock_shared();
 693       }
 694     }
 695
 696     ~ReadHolder() {
 697       if (lock_) lock_->unlock_shared();
 698     }
 699
 700     void reset(RWSpinLock *lock = nullptr) {
 701       if (lock_) lock_->unlock_shared();
 702       lock_ = lock;
 703       if (lock_) lock_->lock_shared();
 704     }
 705
 706     void swap(ReadHolder *other) {
 707       std::swap(this->lock_, other->lock_);
 708     }
 709
 710    private:
 711     RWSpinLock *lock_;
 712   };
 713
 714   class WriteHolder {
 715    public:
 716     WriteHolder(WriteHolder const&) = delete;
 717     WriteHolder& operator=(WriteHolder const&) = delete;
 718
 719     explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
 720       if (lock_) lock_->lock();
 721     }
 722     explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
 723       if (lock_) lock_->lock();
 724     }
 725
 726     ~WriteHolder() {
 727       if (lock_) lock_->unlock();
 728     }
 729
 730     void reset(RWSpinLock *lock = nullptr) {
 731       if (lock == lock_) return;
 732       if (lock_) lock_->unlock();
 733       lock_ = lock;
 734       if (lock_) lock_->lock();
 735     }
 736
 737     void swap(WriteHolder *other) {
 738       std::swap(this->lock_, other->lock_);
 739     }
 740
 741    private:
 742     friend class ReadHolder;
 743     RWSpinLock *lock_;
 744   };
 745
 746   // Synchronized<> adaptors.
 747   friend void acquireRead(RWTicketSpinLockT& mutex) {
 748     mutex.lock_shared();
 749   }
 750   friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
 751     mutex.lock();
 752   }
 753   friend void releaseRead(RWTicketSpinLockT& mutex) {
 754     mutex.unlock_shared();
 755   }
 756   friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
 757     mutex.unlock();
 758   }
 759 };
 760
 761 typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
 762 typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
 763
 764 #endif  // RW_SPINLOCK_USE_X86_INTRINSIC_
 765
 766 }  // namespace folly
 767
 768 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 769 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 770 #endif
 771
 772 #endif  // FOLLY_RWSPINLOCK_H_