folly/RWSpinLock.h

   1 /*
   2  * Copyright 2015 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * Two Read-Write spin lock implementations.
  19  *
  20  *  Ref: http://locklessinc.com/articles/locks
  21  *
  22  *  Both locks here are faster than pthread_rwlock and have very low
  23  *  overhead (usually 20-30ns).  They don't use any system mutexes and
  24  *  are very compact (4/8 bytes), so are suitable for per-instance
  25  *  based locking, particularly when contention is not expected.
  26  *
  27  *  In most cases, RWSpinLock is a reasonable choice.  It has minimal
  28  *  overhead, and comparable contention performance when the number of
  29  *  competing threads is less than or equal to the number of logical
  30  *  CPUs.  Even as the number of threads gets larger, RWSpinLock can
  31  *  still be very competitive in READ, although it is slower on WRITE,
  32  *  and also inherently unfair to writers.
  33  *
  34  *  RWTicketSpinLock shows more balanced READ/WRITE performance.  If
  35  *  your application really needs a lot more threads, and a
  36  *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
  37  *
  38  *  Caveats:
  39  *
  40  *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
  41  *    based systems.
  42  *
  43  *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
  44  *    readers and writers.
  45  *
  46  *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
  47  *    readers and writers.
  48  *
  49  *    RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
  50  *    writer priority) is NOT reentrant, even for lock_shared().
  51  *
  52  *    The lock will not grant any new shared (read) accesses while a thread
  53  *    attempting to acquire the lock in write mode is blocked. (That is,
  54  *    if the lock is held in shared mode by N threads, and a thread attempts
  55  *    to acquire it in write mode, no one else can acquire it in shared mode
  56  *    until these N threads release the lock and then the blocked thread
  57  *    acquires and releases the exclusive lock.) This also applies for
  58  *    attempts to reacquire the lock in shared mode by threads that already
  59  *    hold it in shared mode, making the lock non-reentrant.
  60  *
  61  *    RWSpinLock handles 2^30 - 1 concurrent readers.
  62  *
  63  * @author Xin Liu <xliux@fb.com>
  64  */
  65
  66 #ifndef FOLLY_RWSPINLOCK_H_
  67 #define FOLLY_RWSPINLOCK_H_
  68
  69 /*
  70 ========================================================================
  71 Benchmark on (Intel(R) Xeon(R) CPU  L5630  @ 2.13GHz)  8 cores(16 HTs)
  72 ========================================================================
  73
  74 ------------------------------------------------------------------------------
  75 1. Single thread benchmark (read/write lock + unlock overhead)
  76 Benchmark                                    Iters   Total t    t/iter iter/sec
  77 -------------------------------------------------------------------------------
  78 *      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
  79 +30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
  80 +85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
  81 +96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
  82 +85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
  83 +96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
  84 +85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
  85 +29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
  86 +85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
  87 +30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
  88 + 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
  89 + 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M
  90
  91 ------------------------------------------------------------------------------
  92 2. Contention Benchmark      90% read  10% write
  93 Benchmark                    hits       average    min       max        sigma
  94 ------------------------------------------------------------------------------
  95 ---------- 8  threads ------------
  96 RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
  97 RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
  98 RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
  99 RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
 100 pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
 101 pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us
 102
 103 ---------- 16 threads ------------
 104 RWSpinLock       Write       124236     237ns      78ns      261us      801ns
 105 RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
 106 RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
 107 RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
 108 pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
 109 pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us
 110
 111 ---------- 50 threads ------------
 112 RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
 113 RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
 114 RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
 115 RWTicketSpinLock Read        744133     386ns      78ns        11ms     31.4us
 116 pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
 117 pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us
 118
 119 */
 120
 121 #include <folly/Portability.h>
 122
 123 #if defined(__GNUC__) && \
 124   (defined(__i386) || FOLLY_X64 || \
 125    defined(ARCH_K8))
 126 #define RW_SPINLOCK_USE_X86_INTRINSIC_
 127 #include <x86intrin.h>
 128 #else
 129 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 130 #endif
 131
 132 // iOS doesn't define _mm_cvtsi64_si128 and friends
 133 #if (FOLLY_SSE >= 2) && !TARGET_OS_IPHONE
 134 #define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 135 #else
 136 #undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 137 #endif
 138
 139 #include <atomic>
 140 #include <string>
 141 #include <algorithm>
 142 #include <boost/noncopyable.hpp>
 143
 144 #include <sched.h>
 145 #include <glog/logging.h>
 146
 147 #include <folly/Likely.h>
 148
 149 namespace folly {
 150
 151 /*
 152  * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 153  * a nice writer and don't expect a lot of write/read contention, or
 154  * when you need small rwlocks since you are creating a large number
 155  * of them.
 156  *
 157  * Note that the unfairness here is extreme: if the lock is
 158  * continually accessed for read, writers will never get a chance.  If
 159  * the lock can be that highly contended this class is probably not an
 160  * ideal choice anyway.
 161  *
 162  * It currently implements most of the Lockable, SharedLockable and
 163  * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 164  * interfaces.
 165  */
 166 class RWSpinLock : boost::noncopyable {
 167   enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
 168  public:
 169   RWSpinLock() : bits_(0) {}
 170
 171   // Lockable Concept
 172   void lock() {
 173     int count = 0;
 174     while (!LIKELY(try_lock())) {
 175       if (++count > 1000) sched_yield();
 176     }
 177   }
 178
 179   // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
 180   void unlock() {
 181     static_assert(READER > WRITER + UPGRADED, "wrong bits!");
 182     bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
 183   }
 184
 185   // SharedLockable Concept
 186   void lock_shared() {
 187     int count = 0;
 188     while (!LIKELY(try_lock_shared())) {
 189       if (++count > 1000) sched_yield();
 190     }
 191   }
 192
 193   void unlock_shared() {
 194     bits_.fetch_add(-READER, std::memory_order_release);
 195   }
 196
 197   // Downgrade the lock from writer status to reader status.
 198   void unlock_and_lock_shared() {
 199     bits_.fetch_add(READER, std::memory_order_acquire);
 200     unlock();
 201   }
 202
 203   // UpgradeLockable Concept
 204   void lock_upgrade() {
 205     int count = 0;
 206     while (!try_lock_upgrade()) {
 207       if (++count > 1000) sched_yield();
 208     }
 209   }
 210
 211   void unlock_upgrade() {
 212     bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
 213   }
 214
 215   // unlock upgrade and try to acquire write lock
 216   void unlock_upgrade_and_lock() {
 217     int64_t count = 0;
 218     while (!try_unlock_upgrade_and_lock()) {
 219       if (++count > 1000) sched_yield();
 220     }
 221   }
 222
 223   // unlock upgrade and read lock atomically
 224   void unlock_upgrade_and_lock_shared() {
 225     bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
 226   }
 227
 228   // write unlock and upgrade lock atomically
 229   void unlock_and_lock_upgrade() {
 230     // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
 231     // the same time when other threads are trying do try_lock_upgrade().
 232     bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 233     bits_.fetch_add(-WRITER, std::memory_order_release);
 234   }
 235
 236
 237   // Attempt to acquire writer permission. Return false if we didn't get it.
 238   bool try_lock() {
 239     int32_t expect = 0;
 240     return bits_.compare_exchange_strong(expect, WRITER,
 241       std::memory_order_acq_rel);
 242   }
 243
 244   // Try to get reader permission on the lock. This can fail if we
 245   // find out someone is a writer or upgrader.
 246   // Setting the UPGRADED bit would allow a writer-to-be to indicate
 247   // its intention to write and block any new readers while waiting
 248   // for existing readers to finish and release their read locks. This
 249   // helps avoid starving writers (promoted from upgraders).
 250   bool try_lock_shared() {
 251     // fetch_add is considerably (100%) faster than compare_exchange,
 252     // so here we are optimizing for the common (lock success) case.
 253     int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
 254     if (UNLIKELY(value & (WRITER|UPGRADED))) {
 255       bits_.fetch_add(-READER, std::memory_order_release);
 256       return false;
 257     }
 258     return true;
 259   }
 260
 261   // try to unlock upgrade and write lock atomically
 262   bool try_unlock_upgrade_and_lock() {
 263     int32_t expect = UPGRADED;
 264     return bits_.compare_exchange_strong(expect, WRITER,
 265         std::memory_order_acq_rel);
 266   }
 267
 268   // try to acquire an upgradable lock.
 269   bool try_lock_upgrade() {
 270     int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 271
 272     // Note: when failed, we cannot flip the UPGRADED bit back,
 273     // as in this case there is either another upgrade lock or a write lock.
 274     // If it's a write lock, the bit will get cleared up when that lock's done
 275     // with unlock().
 276     return ((value & (UPGRADED | WRITER)) == 0);
 277   }
 278
 279   // mainly for debugging purposes.
 280   int32_t bits() const { return bits_.load(std::memory_order_acquire); }
 281
 282   class ReadHolder;
 283   class UpgradedHolder;
 284   class WriteHolder;
 285
 286   class ReadHolder {
 287    public:
 288     explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 289       if (lock_) lock_->lock_shared();
 290     }
 291
 292     explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
 293       lock_->lock_shared();
 294     }
 295
 296     ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
 297       other.lock_ = nullptr;
 298     }
 299
 300     // down-grade
 301     explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
 302       upgraded.lock_ = nullptr;
 303       if (lock_) lock_->unlock_upgrade_and_lock_shared();
 304     }
 305
 306     explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
 307       writer.lock_ = nullptr;
 308       if (lock_) lock_->unlock_and_lock_shared();
 309     }
 310
 311     ReadHolder& operator=(ReadHolder&& other) {
 312       using std::swap;
 313       swap(lock_, other.lock_);
 314       return *this;
 315     }
 316
 317     ReadHolder(const ReadHolder& other) = delete;
 318     ReadHolder& operator=(const ReadHolder& other) = delete;
 319
 320     ~ReadHolder() { if (lock_) lock_->unlock_shared(); }
 321
 322     void reset(RWSpinLock* lock = nullptr) {
 323       if (lock == lock_) return;
 324       if (lock_) lock_->unlock_shared();
 325       lock_ = lock;
 326       if (lock_) lock_->lock_shared();
 327     }
 328
 329     void swap(ReadHolder* other) {
 330       std::swap(lock_, other->lock_);
 331     }
 332
 333    private:
 334     friend class UpgradedHolder;
 335     friend class WriteHolder;
 336     RWSpinLock* lock_;
 337   };
 338
 339   class UpgradedHolder {
 340    public:
 341     explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 342       if (lock_) lock_->lock_upgrade();
 343     }
 344
 345     explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
 346       lock_->lock_upgrade();
 347     }
 348
 349     explicit UpgradedHolder(WriteHolder&& writer) {
 350       lock_ = writer.lock_;
 351       writer.lock_ = nullptr;
 352       if (lock_) lock_->unlock_and_lock_upgrade();
 353     }
 354
 355     UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
 356       other.lock_ = nullptr;
 357     }
 358
 359     UpgradedHolder& operator =(UpgradedHolder&& other) {
 360       using std::swap;
 361       swap(lock_, other.lock_);
 362       return *this;
 363     }
 364
 365     UpgradedHolder(const UpgradedHolder& other) = delete;
 366     UpgradedHolder& operator =(const UpgradedHolder& other) = delete;
 367
 368     ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }
 369
 370     void reset(RWSpinLock* lock = nullptr) {
 371       if (lock == lock_) return;
 372       if (lock_) lock_->unlock_upgrade();
 373       lock_ = lock;
 374       if (lock_) lock_->lock_upgrade();
 375     }
 376
 377     void swap(UpgradedHolder* other) {
 378       using std::swap;
 379       swap(lock_, other->lock_);
 380     }
 381
 382    private:
 383     friend class WriteHolder;
 384     friend class ReadHolder;
 385     RWSpinLock* lock_;
 386   };
 387
 388   class WriteHolder {
 389    public:
 390     explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 391       if (lock_) lock_->lock();
 392     }
 393
 394     explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
 395       lock_->lock();
 396     }
 397
 398     // promoted from an upgrade lock holder
 399     explicit WriteHolder(UpgradedHolder&& upgraded) {
 400       lock_ = upgraded.lock_;
 401       upgraded.lock_ = nullptr;
 402       if (lock_) lock_->unlock_upgrade_and_lock();
 403     }
 404
 405     WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
 406       other.lock_ = nullptr;
 407     }
 408
 409     WriteHolder& operator =(WriteHolder&& other) {
 410       using std::swap;
 411       swap(lock_, other.lock_);
 412       return *this;
 413     }
 414
 415     WriteHolder(const WriteHolder& other) = delete;
 416     WriteHolder& operator =(const WriteHolder& other) = delete;
 417
 418     ~WriteHolder () { if (lock_) lock_->unlock(); }
 419
 420     void reset(RWSpinLock* lock = nullptr) {
 421       if (lock == lock_) return;
 422       if (lock_) lock_->unlock();
 423       lock_ = lock;
 424       if (lock_) lock_->lock();
 425     }
 426
 427     void swap(WriteHolder* other) {
 428       using std::swap;
 429       swap(lock_, other->lock_);
 430     }
 431
 432    private:
 433     friend class ReadHolder;
 434     friend class UpgradedHolder;
 435     RWSpinLock* lock_;
 436   };
 437
 438   // Synchronized<> adaptors
 439   friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
 440   friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
 441   friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
 442   friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }
 443
 444  private:
 445   std::atomic<int32_t> bits_;
 446 };
 447
 448
 449 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 450 // A more balanced Read-Write spin lock implemented based on GCC intrinsics.
 451
 452 namespace detail {
 453 template <size_t kBitWidth> struct RWTicketIntTrait {
 454   static_assert(kBitWidth == 32 || kBitWidth == 64,
 455       "bit width has to be either 32 or 64 ");
 456 };
 457
 458 template <>
 459 struct RWTicketIntTrait<64> {
 460   typedef uint64_t FullInt;
 461   typedef uint32_t HalfInt;
 462   typedef uint16_t QuarterInt;
 463
 464 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 465   static __m128i make128(const uint16_t v[4]) {
 466     return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 467   }
 468   static inline __m128i fromInteger(uint64_t from) {
 469     return _mm_cvtsi64_si128(from);
 470   }
 471   static inline uint64_t toInteger(__m128i in) {
 472     return _mm_cvtsi128_si64(in);
 473   }
 474   static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
 475     return toInteger(_mm_add_epi16(in, kDelta));
 476   }
 477 #endif
 478 };
 479
 480 template <>
 481 struct RWTicketIntTrait<32> {
 482   typedef uint32_t FullInt;
 483   typedef uint16_t HalfInt;
 484   typedef uint8_t QuarterInt;
 485
 486 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 487   static __m128i make128(const uint8_t v[4]) {
 488     return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 489         0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 490   }
 491   static inline __m128i fromInteger(uint32_t from) {
 492     return _mm_cvtsi32_si128(from);
 493   }
 494   static inline uint32_t toInteger(__m128i in) {
 495     return _mm_cvtsi128_si32(in);
 496   }
 497   static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
 498     return toInteger(_mm_add_epi8(in, kDelta));
 499   }
 500 #endif
 501 };
 502 }  // detail
 503
 504
 505 template<size_t kBitWidth, bool kFavorWriter=false>
 506 class RWTicketSpinLockT : boost::noncopyable {
 507   typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
 508   typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
 509   typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
 510   typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
 511     QuarterInt;
 512
 513   union RWTicket {
 514     FullInt whole;
 515     HalfInt readWrite;
 516     __extension__ struct {
 517       QuarterInt write;
 518       QuarterInt read;
 519       QuarterInt users;
 520     };
 521   } ticket;
 522
 523  private: // Some x64-specific utilities for atomic access to ticket.
 524   template<class T> static T load_acquire(T* addr) {
 525     T t = *addr; // acquire barrier
 526     asm volatile("" : : : "memory");
 527     return t;
 528   }
 529
 530   template<class T>
 531   static void store_release(T* addr, T v) {
 532     asm volatile("" : : : "memory");
 533     *addr = v; // release barrier
 534   }
 535
 536  public:
 537
 538   RWTicketSpinLockT() {
 539     store_release(&ticket.whole, FullInt(0));
 540   }
 541
 542   void lock() {
 543     if (kFavorWriter) {
 544       writeLockAggressive();
 545     } else {
 546       writeLockNice();
 547     }
 548   }
 549
 550   /*
 551    * Both try_lock and try_lock_shared diverge in our implementation from the
 552    * lock algorithm described in the link above.
 553    *
 554    * In the read case, it is undesirable that the readers could wait
 555    * for another reader (before increasing ticket.read in the other
 556    * implementation).  Our approach gives up on
 557    * first-come-first-serve, but our benchmarks showed improve
 558    * performance for both readers and writers under heavily contended
 559    * cases, particularly when the number of threads exceeds the number
 560    * of logical CPUs.
 561    *
 562    * We have writeLockAggressive() using the original implementation
 563    * for a writer, which gives some advantage to the writer over the
 564    * readers---for that path it is guaranteed that the writer will
 565    * acquire the lock after all the existing readers exit.
 566    */
 567   bool try_lock() {
 568     RWTicket t;
 569     FullInt old = t.whole = load_acquire(&ticket.whole);
 570     if (t.users != t.write) return false;
 571     ++t.users;
 572     return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
 573   }
 574
 575   /*
 576    * Call this if you want to prioritize writer to avoid starvation.
 577    * Unlike writeLockNice, immediately acquires the write lock when
 578    * the existing readers (arriving before the writer) finish their
 579    * turns.
 580    */
 581   void writeLockAggressive() {
 582     // sched_yield() is needed here to avoid a pathology if the number
 583     // of threads attempting concurrent writes is >= the number of real
 584     // cores allocated to this process. This is less likely than the
 585     // corresponding situation in lock_shared(), but we still want to
 586     // avoid it
 587     int count = 0;
 588     QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
 589     while (val != load_acquire(&ticket.write)) {
 590       asm_volatile_pause();
 591       if (UNLIKELY(++count > 1000)) sched_yield();
 592     }
 593   }
 594
 595   // Call this when the writer should be nicer to the readers.
 596   void writeLockNice() {
 597     // Here it doesn't cpu-relax the writer.
 598     //
 599     // This is because usually we have many more readers than the
 600     // writers, so the writer has less chance to get the lock when
 601     // there are a lot of competing readers.  The aggressive spinning
 602     // can help to avoid starving writers.
 603     //
 604     // We don't worry about sched_yield() here because the caller
 605     // has already explicitly abandoned fairness.
 606     while (!try_lock()) {}
 607   }
 608
 609   // Atomically unlock the write-lock from writer and acquire the read-lock.
 610   void unlock_and_lock_shared() {
 611     QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
 612   }
 613
 614   // Release writer permission on the lock.
 615   void unlock() {
 616     RWTicket t;
 617     t.whole = load_acquire(&ticket.whole);
 618     FullInt old = t.whole;
 619
 620 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 621     // SSE2 can reduce the lock and unlock overhead by 10%
 622     static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
 623     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 624     __m128i m = IntTraitType::fromInteger(old);
 625     t.whole = IntTraitType::addParallel(m, kDelta);
 626 #else
 627     ++t.read;
 628     ++t.write;
 629 #endif
 630     store_release(&ticket.readWrite, t.readWrite);
 631   }
 632
 633   void lock_shared() {
 634     // sched_yield() is important here because we can't grab the
 635     // shared lock if there is a pending writeLockAggressive, so we
 636     // need to let threads that already have a shared lock complete
 637     int count = 0;
 638     while (!LIKELY(try_lock_shared())) {
 639       asm_volatile_pause();
 640       if (UNLIKELY((++count & 1023) == 0)) sched_yield();
 641     }
 642   }
 643
 644   bool try_lock_shared() {
 645     RWTicket t, old;
 646     old.whole = t.whole = load_acquire(&ticket.whole);
 647     old.users = old.read;
 648 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 649     // SSE2 may reduce the total lock and unlock overhead by 10%
 650     static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
 651     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 652     __m128i m = IntTraitType::fromInteger(old.whole);
 653     t.whole = IntTraitType::addParallel(m, kDelta);
 654 #else
 655     ++t.read;
 656     ++t.users;
 657 #endif
 658     return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
 659   }
 660
 661   void unlock_shared() {
 662     QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
 663   }
 664
 665   class WriteHolder;
 666
 667   typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
 668   class ReadHolder : boost::noncopyable {
 669    public:
 670     explicit ReadHolder(RWSpinLock *lock = nullptr) :
 671       lock_(lock) {
 672       if (lock_) lock_->lock_shared();
 673     }
 674
 675     explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
 676       if (lock_) lock_->lock_shared();
 677     }
 678
 679     // atomically unlock the write-lock from writer and acquire the read-lock
 680     explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
 681       std::swap(this->lock_, writer->lock_);
 682       if (lock_) {
 683         lock_->unlock_and_lock_shared();
 684       }
 685     }
 686
 687     ~ReadHolder() {
 688       if (lock_) lock_->unlock_shared();
 689     }
 690
 691     void reset(RWSpinLock *lock = nullptr) {
 692       if (lock_) lock_->unlock_shared();
 693       lock_ = lock;
 694       if (lock_) lock_->lock_shared();
 695     }
 696
 697     void swap(ReadHolder *other) {
 698       std::swap(this->lock_, other->lock_);
 699     }
 700
 701    private:
 702     RWSpinLock *lock_;
 703   };
 704
 705   class WriteHolder : boost::noncopyable {
 706    public:
 707     explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
 708       if (lock_) lock_->lock();
 709     }
 710     explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
 711       if (lock_) lock_->lock();
 712     }
 713
 714     ~WriteHolder() {
 715       if (lock_) lock_->unlock();
 716     }
 717
 718     void reset(RWSpinLock *lock = nullptr) {
 719       if (lock == lock_) return;
 720       if (lock_) lock_->unlock();
 721       lock_ = lock;
 722       if (lock_) lock_->lock();
 723     }
 724
 725     void swap(WriteHolder *other) {
 726       std::swap(this->lock_, other->lock_);
 727     }
 728
 729    private:
 730     friend class ReadHolder;
 731     RWSpinLock *lock_;
 732   };
 733
 734   // Synchronized<> adaptors.
 735   friend void acquireRead(RWTicketSpinLockT& mutex) {
 736     mutex.lock_shared();
 737   }
 738   friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
 739     mutex.lock();
 740   }
 741   friend bool acquireReadWrite(RWTicketSpinLockT& mutex,
 742                                unsigned int milliseconds) {
 743     mutex.lock();
 744     return true;
 745   }
 746   friend void releaseRead(RWTicketSpinLockT& mutex) {
 747     mutex.unlock_shared();
 748   }
 749   friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
 750     mutex.unlock();
 751   }
 752 };
 753
 754 typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
 755 typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
 756
 757 #endif  // RW_SPINLOCK_USE_X86_INTRINSIC_
 758
 759 }  // namespace folly
 760
 761 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 762 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 763 #endif
 764
 765 #endif  // FOLLY_RWSPINLOCK_H_