folly/RWSpinLock.h

   1 /*
   2  * Copyright 2013 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * Two Read-Write spin lock implementations.
  19  *
  20  *  Ref: http://locklessinc.com/articles/locks
  21  *
  22  *  Both locks here are faster than pthread_rwlock and have very low
  23  *  overhead (usually 20-30ns).  They don't use any system mutexes and
  24  *  are very compact (4/8 bytes), so are suitable for per-instance
  25  *  based locking, particularly when contention is not expected.
  26  *
  27  *  In most cases, RWSpinLock is a reasonable choice.  It has minimal
  28  *  overhead, and comparable contention performance when the number of
  29  *  competing threads is less than or equal to the number of logical
  30  *  CPUs.  Even as the number of threads gets larger, RWSpinLock can
  31  *  still be very competitive in READ, although it is slower on WRITE,
  32  *  and also inherently unfair to writers.
  33  *
  34  *  RWTicketSpinLock shows more balanced READ/WRITE performance.  If
  35  *  your application really needs a lot more threads, and a
  36  *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
  37  *
  38  *  Caveats:
  39  *
  40  *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
  41  *    based systems.
  42  *
  43  *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
  44  *    readers and writers.
  45  *
  46  *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
  47  *    readers and writers.
  48  *
  49  *    RWSpinLock handles 2^30 - 1 concurrent readers.
  50  *
  51  * @author Xin Liu <xliux@fb.com>
  52  */
  53
  54 #ifndef FOLLY_RWSPINLOCK_H_
  55 #define FOLLY_RWSPINLOCK_H_
  56
  57 /*
  58 ========================================================================
  59 Benchmark on (Intel(R) Xeon(R) CPU  L5630  @ 2.13GHz)  8 cores(16 HTs)
  60 ========================================================================
  61
  62 ------------------------------------------------------------------------------
  63 1. Single thread benchmark (read/write lock + unlock overhead)
  64 Benchmark                                    Iters   Total t    t/iter iter/sec
  65 -------------------------------------------------------------------------------
  66 *      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
  67 +30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
  68 +85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
  69 +96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
  70 +85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
  71 +96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
  72 +85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
  73 +29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
  74 +85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
  75 +30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
  76 + 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
  77 + 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M
  78
  79 ------------------------------------------------------------------------------
  80 2. Contention Benchmark      90% read  10% write
  81 Benchmark                    hits       average    min       max        sigma
  82 ------------------------------------------------------------------------------
  83 ---------- 8  threads ------------
  84 RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
  85 RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
  86 RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
  87 RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
  88 pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
  89 pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us
  90
  91 ---------- 16 threads ------------
  92 RWSpinLock       Write       124236     237ns      78ns      261us      801ns
  93 RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
  94 RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
  95 RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
  96 pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
  97 pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us
  98
  99 ---------- 50 threads ------------
 100 RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
 101 RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
 102 RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
 103 RWTicketSpinLock Read        744133     386ns      78ns        11ms     31.4us
 104 pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
 105 pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us
 106
 107 */
 108
 109 #if defined(__GNUC__) && (defined(__i386) || defined(__x86_64__) || \
 110     defined(ARCH_K8))
 111 #define RW_SPINLOCK_USE_X86_INTRINSIC_
 112 #include <x86intrin.h>
 113 #else
 114 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 115 #endif
 116
 117 #include <atomic>
 118 #include <string>
 119 #include <algorithm>
 120 #include <boost/noncopyable.hpp>
 121
 122 #include <sched.h>
 123 #include <glog/logging.h>
 124
 125 #include "folly/Likely.h"
 126
 127 namespace folly {
 128
 129 /*
 130  * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 131  * a nice writer and don't expect a lot of write/read contention, or
 132  * when you need small rwlocks since you are creating a large number
 133  * of them.
 134  *
 135  * Note that the unfairness here is extreme: if the lock is
 136  * continually accessed for read, writers will never get a chance.  If
 137  * the lock can be that highly contended this class is probably not an
 138  * ideal choice anyway.
 139  *
 140  * It currently implements most of the Lockable, SharedLockable and
 141  * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 142  * interfaces.
 143  */
 144 class RWSpinLock : boost::noncopyable {
 145   enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
 146  public:
 147   RWSpinLock() : bits_(0) {}
 148
 149   // Lockable Concept
 150   void lock() {
 151     int count = 0;
 152     while (!LIKELY(try_lock())) {
 153       if (++count > 1000) sched_yield();
 154     }
 155   }
 156
 157   // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
 158   void unlock() {
 159     static_assert(READER > WRITER + UPGRADED, "wrong bits!");
 160     bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
 161   }
 162
 163   // SharedLockable Concept
 164   void lock_shared() {
 165     int count = 0;
 166     while (!LIKELY(try_lock_shared())) {
 167       if (++count > 1000) sched_yield();
 168     }
 169   }
 170
 171   void unlock_shared() {
 172     bits_.fetch_add(-READER, std::memory_order_release);
 173   }
 174
 175   // Downgrade the lock from writer status to reader status.
 176   void unlock_and_lock_shared() {
 177     bits_.fetch_add(READER, std::memory_order_acquire);
 178     unlock();
 179   }
 180
 181   // UpgradeLockable Concept
 182   void lock_upgrade() {
 183     int count = 0;
 184     while (!try_lock_upgrade()) {
 185       if (++count > 1000) sched_yield();
 186     }
 187   }
 188
 189   void unlock_upgrade() {
 190     bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
 191   }
 192
 193   // unlock upgrade and try to acquire write lock
 194   void unlock_upgrade_and_lock() {
 195     int64_t count = 0;
 196     while (!try_unlock_upgrade_and_lock()) {
 197       if (++count > 1000) sched_yield();
 198     }
 199   }
 200
 201   // unlock upgrade and read lock atomically
 202   void unlock_upgrade_and_lock_shared() {
 203     bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
 204   }
 205
 206   // write unlock and upgrade lock atomically
 207   void unlock_and_lock_upgrade() {
 208     // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
 209     // the same time when other threads are trying do try_lock_upgrade().
 210     bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 211     bits_.fetch_add(-WRITER, std::memory_order_release);
 212   }
 213
 214
 215   // Attempt to acquire writer permission. Return false if we didn't get it.
 216   bool try_lock() {
 217     int32_t expect = 0;
 218     return bits_.compare_exchange_strong(expect, WRITER,
 219       std::memory_order_acq_rel);
 220   }
 221
 222   // Try to get reader permission on the lock. This can fail if we
 223   // find out someone is a writer or upgrader.
 224   // Setting the UPGRADED bit would allow a writer-to-be to indicate
 225   // its intention to write and block any new readers while waiting
 226   // for existing readers to finish and release their read locks. This
 227   // helps avoid starving writers (promoted from upgraders).
 228   bool try_lock_shared() {
 229     // fetch_add is considerably (100%) faster than compare_exchange,
 230     // so here we are optimizing for the common (lock success) case.
 231     int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
 232     if (UNLIKELY(value & (WRITER|UPGRADED))) {
 233       bits_.fetch_add(-READER, std::memory_order_release);
 234       return false;
 235     }
 236     return true;
 237   }
 238
 239   // try to unlock upgrade and write lock atomically
 240   bool try_unlock_upgrade_and_lock() {
 241     int32_t expect = UPGRADED;
 242     return bits_.compare_exchange_strong(expect, WRITER,
 243         std::memory_order_acq_rel);
 244   }
 245
 246   // try to acquire an upgradable lock.
 247   bool try_lock_upgrade() {
 248     int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 249
 250     // Note: when failed, we cannot flip the UPGRADED bit back,
 251     // as in this case there is either another upgrade lock or a write lock.
 252     // If it's a write lock, the bit will get cleared up when that lock's done
 253     // with unlock().
 254     return ((value & (UPGRADED | WRITER)) == 0);
 255   }
 256
 257   // mainly for debugging purposes.
 258   int32_t bits() const { return bits_.load(std::memory_order_acquire); }
 259
 260   class ReadHolder;
 261   class UpgradedHolder;
 262   class WriteHolder;
 263
 264   class ReadHolder {
 265    public:
 266     explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 267       if (lock_) lock_->lock_shared();
 268     }
 269
 270     explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
 271       lock_->lock_shared();
 272     }
 273
 274     ReadHolder(ReadHolder&& other) : lock_(other.lock_) {
 275       other.lock_ = nullptr;
 276     }
 277
 278     // down-grade
 279     explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
 280       upgraded.lock_ = nullptr;
 281       if (lock_) lock_->unlock_upgrade_and_lock_shared();
 282     }
 283
 284     explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
 285       writer.lock_ = nullptr;
 286       if (lock_) lock_->unlock_and_lock_shared();
 287     }
 288
 289     ReadHolder& operator=(ReadHolder&& other) {
 290       using std::swap;
 291       swap(lock_, other.lock_);
 292       return *this;
 293     }
 294
 295     ReadHolder(const ReadHolder& other) = delete;
 296     ReadHolder& operator=(const ReadHolder& other) = delete;
 297
 298     ~ReadHolder() { if (lock_) lock_->unlock_shared(); }
 299
 300     void reset(RWSpinLock* lock = nullptr) {
 301       if (lock == lock_) return;
 302       if (lock_) lock_->unlock_shared();
 303       lock_ = lock;
 304       if (lock_) lock_->lock_shared();
 305     }
 306
 307     void swap(ReadHolder* other) {
 308       std::swap(lock_, other->lock_);
 309     }
 310
 311    private:
 312     friend class UpgradedHolder;
 313     friend class WriteHolder;
 314     RWSpinLock* lock_;
 315   };
 316
 317   class UpgradedHolder {
 318    public:
 319     explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 320       if (lock_) lock_->lock_upgrade();
 321     }
 322
 323     explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
 324       lock_->lock_upgrade();
 325     }
 326
 327     explicit UpgradedHolder(WriteHolder&& writer) {
 328       lock_ = writer.lock_;
 329       writer.lock_ = nullptr;
 330       if (lock_) lock_->unlock_and_lock_upgrade();
 331     }
 332
 333     UpgradedHolder(UpgradedHolder&& other) : lock_(other.lock_) {
 334       other.lock_ = nullptr;
 335     }
 336
 337     UpgradedHolder& operator =(UpgradedHolder&& other) {
 338       using std::swap;
 339       swap(lock_, other.lock_);
 340       return *this;
 341     }
 342
 343     UpgradedHolder(const UpgradedHolder& other) = delete;
 344     UpgradedHolder& operator =(const UpgradedHolder& other) = delete;
 345
 346     ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }
 347
 348     void reset(RWSpinLock* lock = nullptr) {
 349       if (lock == lock_) return;
 350       if (lock_) lock_->unlock_upgrade();
 351       lock_ = lock;
 352       if (lock_) lock_->lock_upgrade();
 353     }
 354
 355     void swap(UpgradedHolder* other) {
 356       using std::swap;
 357       swap(lock_, other->lock_);
 358     }
 359
 360    private:
 361     friend class WriteHolder;
 362     friend class ReadHolder;
 363     RWSpinLock* lock_;
 364   };
 365
 366   class WriteHolder {
 367    public:
 368     explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 369       if (lock_) lock_->lock();
 370     }
 371
 372     explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
 373       lock_->lock();
 374     }
 375
 376     // promoted from an upgrade lock holder
 377     explicit WriteHolder(UpgradedHolder&& upgraded) {
 378       lock_ = upgraded.lock_;
 379       upgraded.lock_ = nullptr;
 380       if (lock_) lock_->unlock_upgrade_and_lock();
 381     }
 382
 383     WriteHolder(WriteHolder&& other) : lock_(other.lock_) {
 384       other.lock_ = nullptr;
 385     }
 386
 387     WriteHolder& operator =(WriteHolder&& other) {
 388       using std::swap;
 389       swap(lock_, other.lock_);
 390       return *this;
 391     }
 392
 393     WriteHolder(const WriteHolder& other) = delete;
 394     WriteHolder& operator =(const WriteHolder& other) = delete;
 395
 396     ~WriteHolder () { if (lock_) lock_->unlock(); }
 397
 398     void reset(RWSpinLock* lock = nullptr) {
 399       if (lock == lock_) return;
 400       if (lock_) lock_->unlock();
 401       lock_ = lock;
 402       if (lock_) lock_->lock();
 403     }
 404
 405     void swap(WriteHolder* other) {
 406       using std::swap;
 407       swap(lock_, other->lock_);
 408     }
 409
 410    private:
 411     friend class ReadHolder;
 412     friend class UpgradedHolder;
 413     RWSpinLock* lock_;
 414   };
 415
 416   // Synchronized<> adaptors
 417   friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
 418   friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
 419   friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
 420   friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }
 421
 422  private:
 423   std::atomic<int32_t> bits_;
 424 };
 425
 426
 427 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 428 // A more balanced Read-Write spin lock implemented based on GCC intrinsics.
 429
 430 namespace detail {
 431 template <size_t kBitWidth> struct RWTicketIntTrait {
 432   static_assert(kBitWidth == 32 || kBitWidth == 64,
 433       "bit width has to be either 32 or 64 ");
 434 };
 435
 436 template <>
 437 struct RWTicketIntTrait<64> {
 438   typedef uint64_t FullInt;
 439   typedef uint32_t HalfInt;
 440   typedef uint16_t QuarterInt;
 441
 442 #ifdef __SSE2__
 443   static __m128i make128(const uint16_t v[4]) {
 444     return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 445   }
 446   static inline __m128i fromInteger(uint64_t from) {
 447     return _mm_cvtsi64_si128(from);
 448   }
 449   static inline uint64_t toInteger(__m128i in) {
 450     return _mm_cvtsi128_si64(in);
 451   }
 452   static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
 453     return toInteger(_mm_add_epi16(in, kDelta));
 454   }
 455 #endif
 456 };
 457
 458 template <>
 459 struct RWTicketIntTrait<32> {
 460   typedef uint32_t FullInt;
 461   typedef uint16_t HalfInt;
 462   typedef uint8_t QuarterInt;
 463
 464 #ifdef __SSE2__
 465   static __m128i make128(const uint8_t v[4]) {
 466     return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 467         0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 468   }
 469   static inline __m128i fromInteger(uint32_t from) {
 470     return _mm_cvtsi32_si128(from);
 471   }
 472   static inline uint32_t toInteger(__m128i in) {
 473     return _mm_cvtsi128_si32(in);
 474   }
 475   static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
 476     return toInteger(_mm_add_epi8(in, kDelta));
 477   }
 478 #endif
 479 };
 480 }  // detail
 481
 482
 483 template<size_t kBitWidth, bool kFavorWriter=false>
 484 class RWTicketSpinLockT : boost::noncopyable {
 485   typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
 486   typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
 487   typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
 488   typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
 489     QuarterInt;
 490
 491   union RWTicket {
 492     FullInt whole;
 493     HalfInt readWrite;
 494     __extension__ struct {
 495       QuarterInt write;
 496       QuarterInt read;
 497       QuarterInt users;
 498     };
 499   } ticket;
 500
 501  private: // Some x64-specific utilities for atomic access to ticket.
 502   template<class T> static T load_acquire(T* addr) {
 503     T t = *addr; // acquire barrier
 504     asm volatile("" : : : "memory");
 505     return t;
 506   }
 507
 508   template<class T>
 509   static void store_release(T* addr, T v) {
 510     asm volatile("" : : : "memory");
 511     *addr = v; // release barrier
 512   }
 513
 514  public:
 515
 516   RWTicketSpinLockT() {
 517     store_release(&ticket.whole, FullInt(0));
 518   }
 519
 520   void lock() {
 521     if (kFavorWriter) {
 522       writeLockAggressive();
 523     } else {
 524       writeLockNice();
 525     }
 526   }
 527
 528   /*
 529    * Both try_lock and try_lock_shared diverge in our implementation from the
 530    * lock algorithm described in the link above.
 531    *
 532    * In the read case, it is undesirable that the readers could wait
 533    * for another reader (before increasing ticket.read in the other
 534    * implementation).  Our approach gives up on
 535    * first-come-first-serve, but our benchmarks showed improve
 536    * performance for both readers and writers under heavily contended
 537    * cases, particularly when the number of threads exceeds the number
 538    * of logical CPUs.
 539    *
 540    * We have writeLockAggressive() using the original implementation
 541    * for a writer, which gives some advantage to the writer over the
 542    * readers---for that path it is guaranteed that the writer will
 543    * acquire the lock after all the existing readers exit.
 544    */
 545   bool try_lock() {
 546     RWTicket t;
 547     FullInt old = t.whole = load_acquire(&ticket.whole);
 548     if (t.users != t.write) return false;
 549     ++t.users;
 550     return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
 551   }
 552
 553   /*
 554    * Call this if you want to prioritize writer to avoid starvation.
 555    * Unlike writeLockNice, immediately acquires the write lock when
 556    * the existing readers (arriving before the writer) finish their
 557    * turns.
 558    */
 559   void writeLockAggressive() {
 560     // sched_yield() is needed here to avoid a pathology if the number
 561     // of threads attempting concurrent writes is >= the number of real
 562     // cores allocated to this process. This is less likely than the
 563     // corresponding situation in lock_shared(), but we still want to
 564     // avoid it
 565     int count = 0;
 566     QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
 567     while (val != load_acquire(&ticket.write)) {
 568       asm volatile("pause");
 569       if (UNLIKELY(++count > 1000)) sched_yield();
 570     }
 571   }
 572
 573   // Call this when the writer should be nicer to the readers.
 574   void writeLockNice() {
 575     // Here it doesn't cpu-relax the writer.
 576     //
 577     // This is because usually we have many more readers than the
 578     // writers, so the writer has less chance to get the lock when
 579     // there are a lot of competing readers.  The aggressive spinning
 580     // can help to avoid starving writers.
 581     //
 582     // We don't worry about sched_yield() here because the caller
 583     // has already explicitly abandoned fairness.
 584     while (!try_lock()) {}
 585   }
 586
 587   // Atomically unlock the write-lock from writer and acquire the read-lock.
 588   void unlock_and_lock_shared() {
 589     QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
 590   }
 591
 592   // Release writer permission on the lock.
 593   void unlock() {
 594     RWTicket t;
 595     t.whole = load_acquire(&ticket.whole);
 596     FullInt old = t.whole;
 597
 598 #ifdef __SSE2__
 599     // SSE2 can reduce the lock and unlock overhead by 10%
 600     static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
 601     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 602     __m128i m = IntTraitType::fromInteger(old);
 603     t.whole = IntTraitType::addParallel(m, kDelta);
 604 #else
 605     ++t.read;
 606     ++t.write;
 607 #endif
 608     store_release(&ticket.readWrite, t.readWrite);
 609   }
 610
 611   void lock_shared() {
 612     // sched_yield() is important here because we can't grab the
 613     // shared lock if there is a pending writeLockAggressive, so we
 614     // need to let threads that already have a shared lock complete
 615     int count = 0;
 616     while (!LIKELY(try_lock_shared())) {
 617       asm volatile("pause");
 618       if (UNLIKELY((++count & 1023) == 0)) sched_yield();
 619     }
 620   }
 621
 622   bool try_lock_shared() {
 623     RWTicket t, old;
 624     old.whole = t.whole = load_acquire(&ticket.whole);
 625     old.users = old.read;
 626 #ifdef  __SSE2__
 627     // SSE2 may reduce the total lock and unlock overhead by 10%
 628     static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
 629     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 630     __m128i m = IntTraitType::fromInteger(old.whole);
 631     t.whole = IntTraitType::addParallel(m, kDelta);
 632 #else
 633     ++t.read;
 634     ++t.users;
 635 #endif
 636     return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
 637   }
 638
 639   void unlock_shared() {
 640     QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
 641   }
 642
 643   class WriteHolder;
 644
 645   typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
 646   class ReadHolder : boost::noncopyable {
 647    public:
 648     explicit ReadHolder(RWSpinLock *lock = nullptr) :
 649       lock_(lock) {
 650       if (lock_) lock_->lock_shared();
 651     }
 652
 653     explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
 654       if (lock_) lock_->lock_shared();
 655     }
 656
 657     // atomically unlock the write-lock from writer and acquire the read-lock
 658     explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
 659       std::swap(this->lock_, writer->lock_);
 660       if (lock_) {
 661         lock_->unlock_and_lock_shared();
 662       }
 663     }
 664
 665     ~ReadHolder() {
 666       if (lock_) lock_->unlock_shared();
 667     }
 668
 669     void reset(RWSpinLock *lock = nullptr) {
 670       if (lock_) lock_->unlock_shared();
 671       lock_ = lock;
 672       if (lock_) lock_->lock_shared();
 673     }
 674
 675     void swap(ReadHolder *other) {
 676       std::swap(this->lock_, other->lock_);
 677     }
 678
 679    private:
 680     RWSpinLock *lock_;
 681   };
 682
 683   class WriteHolder : boost::noncopyable {
 684    public:
 685     explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
 686       if (lock_) lock_->lock();
 687     }
 688     explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
 689       if (lock_) lock_->lock();
 690     }
 691
 692     ~WriteHolder() {
 693       if (lock_) lock_->unlock();
 694     }
 695
 696     void reset(RWSpinLock *lock = nullptr) {
 697       if (lock == lock_) return;
 698       if (lock_) lock_->unlock();
 699       lock_ = lock;
 700       if (lock_) lock_->lock();
 701     }
 702
 703     void swap(WriteHolder *other) {
 704       std::swap(this->lock_, other->lock_);
 705     }
 706
 707    private:
 708     friend class ReadHolder;
 709     RWSpinLock *lock_;
 710   };
 711
 712   // Synchronized<> adaptors.
 713   friend void acquireRead(RWTicketSpinLockT& mutex) {
 714     mutex.lock_shared();
 715   }
 716   friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
 717     mutex.lock();
 718   }
 719   friend bool acquireReadWrite(RWTicketSpinLockT& mutex,
 720                                unsigned int milliseconds) {
 721     mutex.lock();
 722     return true;
 723   }
 724   friend void releaseRead(RWTicketSpinLockT& mutex) {
 725     mutex.unlock_shared();
 726   }
 727   friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
 728     mutex.unlock();
 729   }
 730 };
 731
 732 typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
 733 typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
 734
 735 #endif  // RW_SPINLOCK_USE_X86_INTRINSIC_
 736
 737 }  // namespace folly
 738
 739 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 740 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 741 #endif
 742
 743 #endif  // FOLLY_RWSPINLOCK_H_