folly/RWSpinLock.h

   1 /*
   2  * Copyright 2015 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * Two Read-Write spin lock implementations.
  19  *
  20  *  Ref: http://locklessinc.com/articles/locks
  21  *
  22  *  Both locks here are faster than pthread_rwlock and have very low
  23  *  overhead (usually 20-30ns).  They don't use any system mutexes and
  24  *  are very compact (4/8 bytes), so are suitable for per-instance
  25  *  based locking, particularly when contention is not expected.
  26  *
  27  *  In most cases, RWSpinLock is a reasonable choice.  It has minimal
  28  *  overhead, and comparable contention performance when the number of
  29  *  competing threads is less than or equal to the number of logical
  30  *  CPUs.  Even as the number of threads gets larger, RWSpinLock can
  31  *  still be very competitive in READ, although it is slower on WRITE,
  32  *  and also inherently unfair to writers.
  33  *
  34  *  RWTicketSpinLock shows more balanced READ/WRITE performance.  If
  35  *  your application really needs a lot more threads, and a
  36  *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
  37  *
  38  *  Caveats:
  39  *
  40  *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
  41  *    based systems.
  42  *
  43  *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
  44  *    readers and writers.
  45  *
  46  *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
  47  *    readers and writers.
  48  *
  49  *    RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
  50  *    writer priority) is NOT reentrant, even for lock_shared().
  51  *
  52  *    The lock will not grant any new shared (read) accesses while a thread
  53  *    attempting to acquire the lock in write mode is blocked. (That is,
  54  *    if the lock is held in shared mode by N threads, and a thread attempts
  55  *    to acquire it in write mode, no one else can acquire it in shared mode
  56  *    until these N threads release the lock and then the blocked thread
  57  *    acquires and releases the exclusive lock.) This also applies for
  58  *    attempts to reacquire the lock in shared mode by threads that already
  59  *    hold it in shared mode, making the lock non-reentrant.
  60  *
  61  *    RWSpinLock handles 2^30 - 1 concurrent readers.
  62  *
  63  * @author Xin Liu <xliux@fb.com>
  64  */
  65
  66 #ifndef FOLLY_RWSPINLOCK_H_
  67 #define FOLLY_RWSPINLOCK_H_
  68
  69 /*
  70 ========================================================================
  71 Benchmark on (Intel(R) Xeon(R) CPU  L5630  @ 2.13GHz)  8 cores(16 HTs)
  72 ========================================================================
  73
  74 ------------------------------------------------------------------------------
  75 1. Single thread benchmark (read/write lock + unlock overhead)
  76 Benchmark                                    Iters   Total t    t/iter iter/sec
  77 -------------------------------------------------------------------------------
  78 *      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
  79 +30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
  80 +85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
  81 +96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
  82 +85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
  83 +96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
  84 +85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
  85 +29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
  86 +85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
  87 +30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
  88 + 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
  89 + 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M
  90
  91 ------------------------------------------------------------------------------
  92 2. Contention Benchmark      90% read  10% write
  93 Benchmark                    hits       average    min       max        sigma
  94 ------------------------------------------------------------------------------
  95 ---------- 8  threads ------------
  96 RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
  97 RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
  98 RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
  99 RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
 100 pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
 101 pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us
 102
 103 ---------- 16 threads ------------
 104 RWSpinLock       Write       124236     237ns      78ns      261us      801ns
 105 RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
 106 RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
 107 RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
 108 pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
 109 pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us
 110
 111 ---------- 50 threads ------------
 112 RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
 113 RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
 114 RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
 115 RWTicketSpinLock Read        744133     386ns      78ns        11ms     31.4us
 116 pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
 117 pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us
 118
 119 */
 120
 121 #include <folly/Portability.h>
 122
 123 #if defined(__GNUC__) && \
 124   (defined(__i386) || FOLLY_X64 || \
 125    defined(ARCH_K8))
 126 # define RW_SPINLOCK_USE_X86_INTRINSIC_
 127 # include <x86intrin.h>
 128 #elif defined(_MSC_VER) && defined(FOLLY_X64)
 129 # define RW_SPINLOCK_USE_X86_INTRINSIC_
 130 #else
 131 # undef RW_SPINLOCK_USE_X86_INTRINSIC_
 132 #endif
 133
 134 // iOS doesn't define _mm_cvtsi64_si128 and friends
 135 #if (FOLLY_SSE >= 2) && !TARGET_OS_IPHONE
 136 #define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 137 #else
 138 #undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 139 #endif
 140
 141 #include <atomic>
 142 #include <string>
 143 #include <algorithm>
 144 #include <boost/noncopyable.hpp>
 145
 146 #include <sched.h>
 147 #include <glog/logging.h>
 148
 149 #include <folly/Likely.h>
 150
 151 namespace folly {
 152
 153 /*
 154  * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 155  * a nice writer and don't expect a lot of write/read contention, or
 156  * when you need small rwlocks since you are creating a large number
 157  * of them.
 158  *
 159  * Note that the unfairness here is extreme: if the lock is
 160  * continually accessed for read, writers will never get a chance.  If
 161  * the lock can be that highly contended this class is probably not an
 162  * ideal choice anyway.
 163  *
 164  * It currently implements most of the Lockable, SharedLockable and
 165  * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 166  * interfaces.
 167  */
 168 class RWSpinLock {
 169   enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
 170  public:
 171   constexpr RWSpinLock() : bits_(0) {}
 172
 173   RWSpinLock(RWSpinLock const&) = delete;
 174   RWSpinLock& operator=(RWSpinLock const&) = delete;
 175
 176   // Lockable Concept
 177   void lock() {
 178     int count = 0;
 179     while (!LIKELY(try_lock())) {
 180       if (++count > 1000) sched_yield();
 181     }
 182   }
 183
 184   // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
 185   void unlock() {
 186     static_assert(READER > WRITER + UPGRADED, "wrong bits!");
 187     bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
 188   }
 189
 190   // SharedLockable Concept
 191   void lock_shared() {
 192     int count = 0;
 193     while (!LIKELY(try_lock_shared())) {
 194       if (++count > 1000) sched_yield();
 195     }
 196   }
 197
 198   void unlock_shared() {
 199     bits_.fetch_add(-READER, std::memory_order_release);
 200   }
 201
 202   // Downgrade the lock from writer status to reader status.
 203   void unlock_and_lock_shared() {
 204     bits_.fetch_add(READER, std::memory_order_acquire);
 205     unlock();
 206   }
 207
 208   // UpgradeLockable Concept
 209   void lock_upgrade() {
 210     int count = 0;
 211     while (!try_lock_upgrade()) {
 212       if (++count > 1000) sched_yield();
 213     }
 214   }
 215
 216   void unlock_upgrade() {
 217     bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
 218   }
 219
 220   // unlock upgrade and try to acquire write lock
 221   void unlock_upgrade_and_lock() {
 222     int64_t count = 0;
 223     while (!try_unlock_upgrade_and_lock()) {
 224       if (++count > 1000) sched_yield();
 225     }
 226   }
 227
 228   // unlock upgrade and read lock atomically
 229   void unlock_upgrade_and_lock_shared() {
 230     bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
 231   }
 232
 233   // write unlock and upgrade lock atomically
 234   void unlock_and_lock_upgrade() {
 235     // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
 236     // the same time when other threads are trying do try_lock_upgrade().
 237     bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 238     bits_.fetch_add(-WRITER, std::memory_order_release);
 239   }
 240
 241
 242   // Attempt to acquire writer permission. Return false if we didn't get it.
 243   bool try_lock() {
 244     int32_t expect = 0;
 245     return bits_.compare_exchange_strong(expect, WRITER,
 246       std::memory_order_acq_rel);
 247   }
 248
 249   // Try to get reader permission on the lock. This can fail if we
 250   // find out someone is a writer or upgrader.
 251   // Setting the UPGRADED bit would allow a writer-to-be to indicate
 252   // its intention to write and block any new readers while waiting
 253   // for existing readers to finish and release their read locks. This
 254   // helps avoid starving writers (promoted from upgraders).
 255   bool try_lock_shared() {
 256     // fetch_add is considerably (100%) faster than compare_exchange,
 257     // so here we are optimizing for the common (lock success) case.
 258     int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
 259     if (UNLIKELY(value & (WRITER|UPGRADED))) {
 260       bits_.fetch_add(-READER, std::memory_order_release);
 261       return false;
 262     }
 263     return true;
 264   }
 265
 266   // try to unlock upgrade and write lock atomically
 267   bool try_unlock_upgrade_and_lock() {
 268     int32_t expect = UPGRADED;
 269     return bits_.compare_exchange_strong(expect, WRITER,
 270         std::memory_order_acq_rel);
 271   }
 272
 273   // try to acquire an upgradable lock.
 274   bool try_lock_upgrade() {
 275     int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 276
 277     // Note: when failed, we cannot flip the UPGRADED bit back,
 278     // as in this case there is either another upgrade lock or a write lock.
 279     // If it's a write lock, the bit will get cleared up when that lock's done
 280     // with unlock().
 281     return ((value & (UPGRADED | WRITER)) == 0);
 282   }
 283
 284   // mainly for debugging purposes.
 285   int32_t bits() const { return bits_.load(std::memory_order_acquire); }
 286
 287   class ReadHolder;
 288   class UpgradedHolder;
 289   class WriteHolder;
 290
 291   class ReadHolder {
 292    public:
 293     explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 294       if (lock_) lock_->lock_shared();
 295     }
 296
 297     explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
 298       lock_->lock_shared();
 299     }
 300
 301     ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
 302       other.lock_ = nullptr;
 303     }
 304
 305     // down-grade
 306     explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
 307       upgraded.lock_ = nullptr;
 308       if (lock_) lock_->unlock_upgrade_and_lock_shared();
 309     }
 310
 311     explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
 312       writer.lock_ = nullptr;
 313       if (lock_) lock_->unlock_and_lock_shared();
 314     }
 315
 316     ReadHolder& operator=(ReadHolder&& other) {
 317       using std::swap;
 318       swap(lock_, other.lock_);
 319       return *this;
 320     }
 321
 322     ReadHolder(const ReadHolder& other) = delete;
 323     ReadHolder& operator=(const ReadHolder& other) = delete;
 324
 325     ~ReadHolder() { if (lock_) lock_->unlock_shared(); }
 326
 327     void reset(RWSpinLock* lock = nullptr) {
 328       if (lock == lock_) return;
 329       if (lock_) lock_->unlock_shared();
 330       lock_ = lock;
 331       if (lock_) lock_->lock_shared();
 332     }
 333
 334     void swap(ReadHolder* other) {
 335       std::swap(lock_, other->lock_);
 336     }
 337
 338    private:
 339     friend class UpgradedHolder;
 340     friend class WriteHolder;
 341     RWSpinLock* lock_;
 342   };
 343
 344   class UpgradedHolder {
 345    public:
 346     explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 347       if (lock_) lock_->lock_upgrade();
 348     }
 349
 350     explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
 351       lock_->lock_upgrade();
 352     }
 353
 354     explicit UpgradedHolder(WriteHolder&& writer) {
 355       lock_ = writer.lock_;
 356       writer.lock_ = nullptr;
 357       if (lock_) lock_->unlock_and_lock_upgrade();
 358     }
 359
 360     UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
 361       other.lock_ = nullptr;
 362     }
 363
 364     UpgradedHolder& operator =(UpgradedHolder&& other) {
 365       using std::swap;
 366       swap(lock_, other.lock_);
 367       return *this;
 368     }
 369
 370     UpgradedHolder(const UpgradedHolder& other) = delete;
 371     UpgradedHolder& operator =(const UpgradedHolder& other) = delete;
 372
 373     ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }
 374
 375     void reset(RWSpinLock* lock = nullptr) {
 376       if (lock == lock_) return;
 377       if (lock_) lock_->unlock_upgrade();
 378       lock_ = lock;
 379       if (lock_) lock_->lock_upgrade();
 380     }
 381
 382     void swap(UpgradedHolder* other) {
 383       using std::swap;
 384       swap(lock_, other->lock_);
 385     }
 386
 387    private:
 388     friend class WriteHolder;
 389     friend class ReadHolder;
 390     RWSpinLock* lock_;
 391   };
 392
 393   class WriteHolder {
 394    public:
 395     explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 396       if (lock_) lock_->lock();
 397     }
 398
 399     explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
 400       lock_->lock();
 401     }
 402
 403     // promoted from an upgrade lock holder
 404     explicit WriteHolder(UpgradedHolder&& upgraded) {
 405       lock_ = upgraded.lock_;
 406       upgraded.lock_ = nullptr;
 407       if (lock_) lock_->unlock_upgrade_and_lock();
 408     }
 409
 410     WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
 411       other.lock_ = nullptr;
 412     }
 413
 414     WriteHolder& operator =(WriteHolder&& other) {
 415       using std::swap;
 416       swap(lock_, other.lock_);
 417       return *this;
 418     }
 419
 420     WriteHolder(const WriteHolder& other) = delete;
 421     WriteHolder& operator =(const WriteHolder& other) = delete;
 422
 423     ~WriteHolder () { if (lock_) lock_->unlock(); }
 424
 425     void reset(RWSpinLock* lock = nullptr) {
 426       if (lock == lock_) return;
 427       if (lock_) lock_->unlock();
 428       lock_ = lock;
 429       if (lock_) lock_->lock();
 430     }
 431
 432     void swap(WriteHolder* other) {
 433       using std::swap;
 434       swap(lock_, other->lock_);
 435     }
 436
 437    private:
 438     friend class ReadHolder;
 439     friend class UpgradedHolder;
 440     RWSpinLock* lock_;
 441   };
 442
 443   // Synchronized<> adaptors
 444   friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
 445   friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
 446   friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
 447   friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }
 448
 449  private:
 450   std::atomic<int32_t> bits_;
 451 };
 452
 453
 454 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 455 // A more balanced Read-Write spin lock implemented based on GCC intrinsics.
 456
 457 namespace detail {
 458 template <size_t kBitWidth> struct RWTicketIntTrait {
 459   static_assert(kBitWidth == 32 || kBitWidth == 64,
 460       "bit width has to be either 32 or 64 ");
 461 };
 462
 463 template <>
 464 struct RWTicketIntTrait<64> {
 465   typedef uint64_t FullInt;
 466   typedef uint32_t HalfInt;
 467   typedef uint16_t QuarterInt;
 468
 469 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 470   static __m128i make128(const uint16_t v[4]) {
 471     return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 472   }
 473   static inline __m128i fromInteger(uint64_t from) {
 474     return _mm_cvtsi64_si128(from);
 475   }
 476   static inline uint64_t toInteger(__m128i in) {
 477     return _mm_cvtsi128_si64(in);
 478   }
 479   static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
 480     return toInteger(_mm_add_epi16(in, kDelta));
 481   }
 482 #endif
 483 };
 484
 485 template <>
 486 struct RWTicketIntTrait<32> {
 487   typedef uint32_t FullInt;
 488   typedef uint16_t HalfInt;
 489   typedef uint8_t QuarterInt;
 490
 491 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 492   static __m128i make128(const uint8_t v[4]) {
 493     return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 494         0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 495   }
 496   static inline __m128i fromInteger(uint32_t from) {
 497     return _mm_cvtsi32_si128(from);
 498   }
 499   static inline uint32_t toInteger(__m128i in) {
 500     return _mm_cvtsi128_si32(in);
 501   }
 502   static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
 503     return toInteger(_mm_add_epi8(in, kDelta));
 504   }
 505 #endif
 506 };
 507 }  // detail
 508
 509
 510 template<size_t kBitWidth, bool kFavorWriter=false>
 511 class RWTicketSpinLockT {
 512   typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
 513   typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
 514   typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
 515   typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
 516     QuarterInt;
 517
 518   union RWTicket {
 519     constexpr RWTicket() : whole(0) {}
 520     FullInt whole;
 521     HalfInt readWrite;
 522     __extension__ struct {
 523       QuarterInt write;
 524       QuarterInt read;
 525       QuarterInt users;
 526     };
 527   } ticket;
 528
 529  private: // Some x64-specific utilities for atomic access to ticket.
 530   template<class T> static T load_acquire(T* addr) {
 531     T t = *addr; // acquire barrier
 532     asm_volatile_memory();
 533     return t;
 534   }
 535
 536   template<class T>
 537   static void store_release(T* addr, T v) {
 538     asm_volatile_memory();
 539     *addr = v; // release barrier
 540   }
 541
 542  public:
 543
 544   constexpr RWTicketSpinLockT() {}
 545
 546   RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
 547   RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;
 548
 549   void lock() {
 550     if (kFavorWriter) {
 551       writeLockAggressive();
 552     } else {
 553       writeLockNice();
 554     }
 555   }
 556
 557   /*
 558    * Both try_lock and try_lock_shared diverge in our implementation from the
 559    * lock algorithm described in the link above.
 560    *
 561    * In the read case, it is undesirable that the readers could wait
 562    * for another reader (before increasing ticket.read in the other
 563    * implementation).  Our approach gives up on
 564    * first-come-first-serve, but our benchmarks showed improve
 565    * performance for both readers and writers under heavily contended
 566    * cases, particularly when the number of threads exceeds the number
 567    * of logical CPUs.
 568    *
 569    * We have writeLockAggressive() using the original implementation
 570    * for a writer, which gives some advantage to the writer over the
 571    * readers---for that path it is guaranteed that the writer will
 572    * acquire the lock after all the existing readers exit.
 573    */
 574   bool try_lock() {
 575     RWTicket t;
 576     FullInt old = t.whole = load_acquire(&ticket.whole);
 577     if (t.users != t.write) return false;
 578     ++t.users;
 579     return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
 580   }
 581
 582   /*
 583    * Call this if you want to prioritize writer to avoid starvation.
 584    * Unlike writeLockNice, immediately acquires the write lock when
 585    * the existing readers (arriving before the writer) finish their
 586    * turns.
 587    */
 588   void writeLockAggressive() {
 589     // sched_yield() is needed here to avoid a pathology if the number
 590     // of threads attempting concurrent writes is >= the number of real
 591     // cores allocated to this process. This is less likely than the
 592     // corresponding situation in lock_shared(), but we still want to
 593     // avoid it
 594     int count = 0;
 595     QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
 596     while (val != load_acquire(&ticket.write)) {
 597       asm_volatile_pause();
 598       if (UNLIKELY(++count > 1000)) sched_yield();
 599     }
 600   }
 601
 602   // Call this when the writer should be nicer to the readers.
 603   void writeLockNice() {
 604     // Here it doesn't cpu-relax the writer.
 605     //
 606     // This is because usually we have many more readers than the
 607     // writers, so the writer has less chance to get the lock when
 608     // there are a lot of competing readers.  The aggressive spinning
 609     // can help to avoid starving writers.
 610     //
 611     // We don't worry about sched_yield() here because the caller
 612     // has already explicitly abandoned fairness.
 613     while (!try_lock()) {}
 614   }
 615
 616   // Atomically unlock the write-lock from writer and acquire the read-lock.
 617   void unlock_and_lock_shared() {
 618     QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
 619   }
 620
 621   // Release writer permission on the lock.
 622   void unlock() {
 623     RWTicket t;
 624     t.whole = load_acquire(&ticket.whole);
 625     FullInt old = t.whole;
 626
 627 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 628     // SSE2 can reduce the lock and unlock overhead by 10%
 629     static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
 630     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 631     __m128i m = IntTraitType::fromInteger(old);
 632     t.whole = IntTraitType::addParallel(m, kDelta);
 633 #else
 634     ++t.read;
 635     ++t.write;
 636 #endif
 637     store_release(&ticket.readWrite, t.readWrite);
 638   }
 639
 640   void lock_shared() {
 641     // sched_yield() is important here because we can't grab the
 642     // shared lock if there is a pending writeLockAggressive, so we
 643     // need to let threads that already have a shared lock complete
 644     int count = 0;
 645     while (!LIKELY(try_lock_shared())) {
 646       asm_volatile_pause();
 647       if (UNLIKELY((++count & 1023) == 0)) sched_yield();
 648     }
 649   }
 650
 651   bool try_lock_shared() {
 652     RWTicket t, old;
 653     old.whole = t.whole = load_acquire(&ticket.whole);
 654     old.users = old.read;
 655 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 656     // SSE2 may reduce the total lock and unlock overhead by 10%
 657     static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
 658     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 659     __m128i m = IntTraitType::fromInteger(old.whole);
 660     t.whole = IntTraitType::addParallel(m, kDelta);
 661 #else
 662     ++t.read;
 663     ++t.users;
 664 #endif
 665     return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
 666   }
 667
 668   void unlock_shared() {
 669     QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
 670   }
 671
 672   class WriteHolder;
 673
 674   typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
 675   class ReadHolder : boost::noncopyable {
 676    public:
 677     explicit ReadHolder(RWSpinLock *lock = nullptr) :
 678       lock_(lock) {
 679       if (lock_) lock_->lock_shared();
 680     }
 681
 682     explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
 683       if (lock_) lock_->lock_shared();
 684     }
 685
 686     // atomically unlock the write-lock from writer and acquire the read-lock
 687     explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
 688       std::swap(this->lock_, writer->lock_);
 689       if (lock_) {
 690         lock_->unlock_and_lock_shared();
 691       }
 692     }
 693
 694     ~ReadHolder() {
 695       if (lock_) lock_->unlock_shared();
 696     }
 697
 698     void reset(RWSpinLock *lock = nullptr) {
 699       if (lock_) lock_->unlock_shared();
 700       lock_ = lock;
 701       if (lock_) lock_->lock_shared();
 702     }
 703
 704     void swap(ReadHolder *other) {
 705       std::swap(this->lock_, other->lock_);
 706     }
 707
 708    private:
 709     RWSpinLock *lock_;
 710   };
 711
 712   class WriteHolder : boost::noncopyable {
 713    public:
 714     explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
 715       if (lock_) lock_->lock();
 716     }
 717     explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
 718       if (lock_) lock_->lock();
 719     }
 720
 721     ~WriteHolder() {
 722       if (lock_) lock_->unlock();
 723     }
 724
 725     void reset(RWSpinLock *lock = nullptr) {
 726       if (lock == lock_) return;
 727       if (lock_) lock_->unlock();
 728       lock_ = lock;
 729       if (lock_) lock_->lock();
 730     }
 731
 732     void swap(WriteHolder *other) {
 733       std::swap(this->lock_, other->lock_);
 734     }
 735
 736    private:
 737     friend class ReadHolder;
 738     RWSpinLock *lock_;
 739   };
 740
 741   // Synchronized<> adaptors.
 742   friend void acquireRead(RWTicketSpinLockT& mutex) {
 743     mutex.lock_shared();
 744   }
 745   friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
 746     mutex.lock();
 747   }
 748   friend void releaseRead(RWTicketSpinLockT& mutex) {
 749     mutex.unlock_shared();
 750   }
 751   friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
 752     mutex.unlock();
 753   }
 754 };
 755
 756 typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
 757 typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
 758
 759 #endif  // RW_SPINLOCK_USE_X86_INTRINSIC_
 760
 761 }  // namespace folly
 762
 763 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 764 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 765 #endif
 766
 767 #endif  // FOLLY_RWSPINLOCK_H_