folly/RWSpinLock.h

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * N.B. You most likely do _not_ want to use RWSpinLock or any other
  19  * kind of spinlock.  Use SharedMutex instead.
  20  *
  21  * In short, spinlocks in preemptive multi-tasking operating systems
  22  * have serious problems and fast mutexes like SharedMutex are almost
  23  * certainly the better choice, because letting the OS scheduler put a
  24  * thread to sleep is better for system responsiveness and throughput
  25  * than wasting a timeslice repeatedly querying a lock held by a
  26  * thread that's blocked, and you can't prevent userspace
  27  * programs blocking.
  28  *
  29  * Spinlocks in an operating system kernel make much more sense than
  30  * they do in userspace.
  31  *
  32  * -------------------------------------------------------------------
  33  *
  34  * Two Read-Write spin lock implementations.
  35  *
  36  *  Ref: http://locklessinc.com/articles/locks
  37  *
  38  *  Both locks here are faster than pthread_rwlock and have very low
  39  *  overhead (usually 20-30ns).  They don't use any system mutexes and
  40  *  are very compact (4/8 bytes), so are suitable for per-instance
  41  *  based locking, particularly when contention is not expected.
  42  *
  43  *  For a spinlock, RWSpinLock is a reasonable choice.  (See the note
  44  *  about for why a spin lock is frequently a bad idea generally.)
  45  *  RWSpinLock has minimal overhead, and comparable contention
  46  *  performance when the number of competing threads is less than or
  47  *  equal to the number of logical CPUs.  Even as the number of
  48  *  threads gets larger, RWSpinLock can still be very competitive in
  49  *  READ, although it is slower on WRITE, and also inherently unfair
  50  *  to writers.
  51  *
  52  *  RWTicketSpinLock shows more balanced READ/WRITE performance.  If
  53  *  your application really needs a lot more threads, and a
  54  *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
  55  *
  56  *  Caveats:
  57  *
  58  *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
  59  *    based systems.
  60  *
  61  *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
  62  *    readers and writers.
  63  *
  64  *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
  65  *    readers and writers.
  66  *
  67  *    RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
  68  *    writer priority) is NOT reentrant, even for lock_shared().
  69  *
  70  *    The lock will not grant any new shared (read) accesses while a thread
  71  *    attempting to acquire the lock in write mode is blocked. (That is,
  72  *    if the lock is held in shared mode by N threads, and a thread attempts
  73  *    to acquire it in write mode, no one else can acquire it in shared mode
  74  *    until these N threads release the lock and then the blocked thread
  75  *    acquires and releases the exclusive lock.) This also applies for
  76  *    attempts to reacquire the lock in shared mode by threads that already
  77  *    hold it in shared mode, making the lock non-reentrant.
  78  *
  79  *    RWSpinLock handles 2^30 - 1 concurrent readers.
  80  *
  81  * @author Xin Liu <xliux@fb.com>
  82  */
  83
  84 #ifndef FOLLY_RWSPINLOCK_H_
  85 #define FOLLY_RWSPINLOCK_H_
  86
  87 /*
  88 ========================================================================
  89 Benchmark on (Intel(R) Xeon(R) CPU  L5630  @ 2.13GHz)  8 cores(16 HTs)
  90 ========================================================================
  91
  92 ------------------------------------------------------------------------------
  93 1. Single thread benchmark (read/write lock + unlock overhead)
  94 Benchmark                                    Iters   Total t    t/iter iter/sec
  95 -------------------------------------------------------------------------------
  96 *      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
  97 +30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
  98 +85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
  99 +96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
 100 +85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
 101 +96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
 102 +85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
 103 +29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
 104 +85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
 105 +30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
 106 + 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
 107 + 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M
 108
 109 ------------------------------------------------------------------------------
 110 2. Contention Benchmark      90% read  10% write
 111 Benchmark                    hits       average    min       max        sigma
 112 ------------------------------------------------------------------------------
 113 ---------- 8  threads ------------
 114 RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
 115 RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
 116 RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
 117 RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
 118 pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
 119 pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us
 120
 121 ---------- 16 threads ------------
 122 RWSpinLock       Write       124236     237ns      78ns      261us      801ns
 123 RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
 124 RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
 125 RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
 126 pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
 127 pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us
 128
 129 ---------- 50 threads ------------
 130 RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
 131 RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
 132 RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
 133 RWTicketSpinLock Read        744133     386ns      78ns        11ms     31.4us
 134 pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
 135 pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us
 136
 137 */
 138
 139 #include <folly/Portability.h>
 140 #include <folly/portability/Asm.h>
 141
 142 #if defined(__GNUC__) && \
 143   (defined(__i386) || FOLLY_X64 || \
 144    defined(ARCH_K8))
 145 # define RW_SPINLOCK_USE_X86_INTRINSIC_
 146 # include <x86intrin.h>
 147 #elif defined(_MSC_VER) && defined(FOLLY_X64)
 148 # define RW_SPINLOCK_USE_X86_INTRINSIC_
 149 #else
 150 # undef RW_SPINLOCK_USE_X86_INTRINSIC_
 151 #endif
 152
 153 // iOS doesn't define _mm_cvtsi64_si128 and friends
 154 #if (FOLLY_SSE >= 2) && !FOLLY_MOBILE
 155 #define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 156 #else
 157 #undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 158 #endif
 159
 160 #include <atomic>
 161 #include <string>
 162 #include <algorithm>
 163
 164 #include <sched.h>
 165 #include <glog/logging.h>
 166
 167 #include <folly/Likely.h>
 168
 169 namespace folly {
 170
 171 /*
 172  * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 173  * a nice writer and don't expect a lot of write/read contention, or
 174  * when you need small rwlocks since you are creating a large number
 175  * of them.
 176  *
 177  * Note that the unfairness here is extreme: if the lock is
 178  * continually accessed for read, writers will never get a chance.  If
 179  * the lock can be that highly contended this class is probably not an
 180  * ideal choice anyway.
 181  *
 182  * It currently implements most of the Lockable, SharedLockable and
 183  * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 184  * interfaces.
 185  */
 186 class RWSpinLock {
 187   enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
 188  public:
 189   constexpr RWSpinLock() : bits_(0) {}
 190
 191   RWSpinLock(RWSpinLock const&) = delete;
 192   RWSpinLock& operator=(RWSpinLock const&) = delete;
 193
 194   // Lockable Concept
 195   void lock() {
 196     int count = 0;
 197     while (!LIKELY(try_lock())) {
 198       if (++count > 1000) sched_yield();
 199     }
 200   }
 201
 202   // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
 203   void unlock() {
 204     static_assert(READER > WRITER + UPGRADED, "wrong bits!");
 205     bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
 206   }
 207
 208   // SharedLockable Concept
 209   void lock_shared() {
 210     int count = 0;
 211     while (!LIKELY(try_lock_shared())) {
 212       if (++count > 1000) sched_yield();
 213     }
 214   }
 215
 216   void unlock_shared() {
 217     bits_.fetch_add(-READER, std::memory_order_release);
 218   }
 219
 220   // Downgrade the lock from writer status to reader status.
 221   void unlock_and_lock_shared() {
 222     bits_.fetch_add(READER, std::memory_order_acquire);
 223     unlock();
 224   }
 225
 226   // UpgradeLockable Concept
 227   void lock_upgrade() {
 228     int count = 0;
 229     while (!try_lock_upgrade()) {
 230       if (++count > 1000) sched_yield();
 231     }
 232   }
 233
 234   void unlock_upgrade() {
 235     bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
 236   }
 237
 238   // unlock upgrade and try to acquire write lock
 239   void unlock_upgrade_and_lock() {
 240     int64_t count = 0;
 241     while (!try_unlock_upgrade_and_lock()) {
 242       if (++count > 1000) sched_yield();
 243     }
 244   }
 245
 246   // unlock upgrade and read lock atomically
 247   void unlock_upgrade_and_lock_shared() {
 248     bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
 249   }
 250
 251   // write unlock and upgrade lock atomically
 252   void unlock_and_lock_upgrade() {
 253     // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
 254     // the same time when other threads are trying do try_lock_upgrade().
 255     bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 256     bits_.fetch_add(-WRITER, std::memory_order_release);
 257   }
 258
 259
 260   // Attempt to acquire writer permission. Return false if we didn't get it.
 261   bool try_lock() {
 262     int32_t expect = 0;
 263     return bits_.compare_exchange_strong(expect, WRITER,
 264       std::memory_order_acq_rel);
 265   }
 266
 267   // Try to get reader permission on the lock. This can fail if we
 268   // find out someone is a writer or upgrader.
 269   // Setting the UPGRADED bit would allow a writer-to-be to indicate
 270   // its intention to write and block any new readers while waiting
 271   // for existing readers to finish and release their read locks. This
 272   // helps avoid starving writers (promoted from upgraders).
 273   bool try_lock_shared() {
 274     // fetch_add is considerably (100%) faster than compare_exchange,
 275     // so here we are optimizing for the common (lock success) case.
 276     int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
 277     if (UNLIKELY(value & (WRITER|UPGRADED))) {
 278       bits_.fetch_add(-READER, std::memory_order_release);
 279       return false;
 280     }
 281     return true;
 282   }
 283
 284   // try to unlock upgrade and write lock atomically
 285   bool try_unlock_upgrade_and_lock() {
 286     int32_t expect = UPGRADED;
 287     return bits_.compare_exchange_strong(expect, WRITER,
 288         std::memory_order_acq_rel);
 289   }
 290
 291   // try to acquire an upgradable lock.
 292   bool try_lock_upgrade() {
 293     int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 294
 295     // Note: when failed, we cannot flip the UPGRADED bit back,
 296     // as in this case there is either another upgrade lock or a write lock.
 297     // If it's a write lock, the bit will get cleared up when that lock's done
 298     // with unlock().
 299     return ((value & (UPGRADED | WRITER)) == 0);
 300   }
 301
 302   // mainly for debugging purposes.
 303   int32_t bits() const { return bits_.load(std::memory_order_acquire); }
 304
 305   class ReadHolder;
 306   class UpgradedHolder;
 307   class WriteHolder;
 308
 309   class ReadHolder {
 310    public:
 311     explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 312       if (lock_) lock_->lock_shared();
 313     }
 314
 315     explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
 316       lock_->lock_shared();
 317     }
 318
 319     ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
 320       other.lock_ = nullptr;
 321     }
 322
 323     // down-grade
 324     explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
 325       upgraded.lock_ = nullptr;
 326       if (lock_) lock_->unlock_upgrade_and_lock_shared();
 327     }
 328
 329     explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
 330       writer.lock_ = nullptr;
 331       if (lock_) lock_->unlock_and_lock_shared();
 332     }
 333
 334     ReadHolder& operator=(ReadHolder&& other) {
 335       using std::swap;
 336       swap(lock_, other.lock_);
 337       return *this;
 338     }
 339
 340     ReadHolder(const ReadHolder& other) = delete;
 341     ReadHolder& operator=(const ReadHolder& other) = delete;
 342
 343     ~ReadHolder() { if (lock_) lock_->unlock_shared(); }
 344
 345     void reset(RWSpinLock* lock = nullptr) {
 346       if (lock == lock_) return;
 347       if (lock_) lock_->unlock_shared();
 348       lock_ = lock;
 349       if (lock_) lock_->lock_shared();
 350     }
 351
 352     void swap(ReadHolder* other) {
 353       std::swap(lock_, other->lock_);
 354     }
 355
 356    private:
 357     friend class UpgradedHolder;
 358     friend class WriteHolder;
 359     RWSpinLock* lock_;
 360   };
 361
 362   class UpgradedHolder {
 363    public:
 364     explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 365       if (lock_) lock_->lock_upgrade();
 366     }
 367
 368     explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
 369       lock_->lock_upgrade();
 370     }
 371
 372     explicit UpgradedHolder(WriteHolder&& writer) {
 373       lock_ = writer.lock_;
 374       writer.lock_ = nullptr;
 375       if (lock_) lock_->unlock_and_lock_upgrade();
 376     }
 377
 378     UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
 379       other.lock_ = nullptr;
 380     }
 381
 382     UpgradedHolder& operator =(UpgradedHolder&& other) {
 383       using std::swap;
 384       swap(lock_, other.lock_);
 385       return *this;
 386     }
 387
 388     UpgradedHolder(const UpgradedHolder& other) = delete;
 389     UpgradedHolder& operator =(const UpgradedHolder& other) = delete;
 390
 391     ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }
 392
 393     void reset(RWSpinLock* lock = nullptr) {
 394       if (lock == lock_) return;
 395       if (lock_) lock_->unlock_upgrade();
 396       lock_ = lock;
 397       if (lock_) lock_->lock_upgrade();
 398     }
 399
 400     void swap(UpgradedHolder* other) {
 401       using std::swap;
 402       swap(lock_, other->lock_);
 403     }
 404
 405    private:
 406     friend class WriteHolder;
 407     friend class ReadHolder;
 408     RWSpinLock* lock_;
 409   };
 410
 411   class WriteHolder {
 412    public:
 413     explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 414       if (lock_) lock_->lock();
 415     }
 416
 417     explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
 418       lock_->lock();
 419     }
 420
 421     // promoted from an upgrade lock holder
 422     explicit WriteHolder(UpgradedHolder&& upgraded) {
 423       lock_ = upgraded.lock_;
 424       upgraded.lock_ = nullptr;
 425       if (lock_) lock_->unlock_upgrade_and_lock();
 426     }
 427
 428     WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
 429       other.lock_ = nullptr;
 430     }
 431
 432     WriteHolder& operator =(WriteHolder&& other) {
 433       using std::swap;
 434       swap(lock_, other.lock_);
 435       return *this;
 436     }
 437
 438     WriteHolder(const WriteHolder& other) = delete;
 439     WriteHolder& operator =(const WriteHolder& other) = delete;
 440
 441     ~WriteHolder () { if (lock_) lock_->unlock(); }
 442
 443     void reset(RWSpinLock* lock = nullptr) {
 444       if (lock == lock_) return;
 445       if (lock_) lock_->unlock();
 446       lock_ = lock;
 447       if (lock_) lock_->lock();
 448     }
 449
 450     void swap(WriteHolder* other) {
 451       using std::swap;
 452       swap(lock_, other->lock_);
 453     }
 454
 455    private:
 456     friend class ReadHolder;
 457     friend class UpgradedHolder;
 458     RWSpinLock* lock_;
 459   };
 460
 461   // Synchronized<> adaptors
 462   friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
 463   friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
 464   friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
 465   friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }
 466
 467  private:
 468   std::atomic<int32_t> bits_;
 469 };
 470
 471
 472 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 473 // A more balanced Read-Write spin lock implemented based on GCC intrinsics.
 474
 475 namespace detail {
 476 template <size_t kBitWidth> struct RWTicketIntTrait {
 477   static_assert(kBitWidth == 32 || kBitWidth == 64,
 478       "bit width has to be either 32 or 64 ");
 479 };
 480
 481 template <>
 482 struct RWTicketIntTrait<64> {
 483   typedef uint64_t FullInt;
 484   typedef uint32_t HalfInt;
 485   typedef uint16_t QuarterInt;
 486
 487 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 488   static __m128i make128(const uint16_t v[4]) {
 489     return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 490   }
 491   static inline __m128i fromInteger(uint64_t from) {
 492     return _mm_cvtsi64_si128(from);
 493   }
 494   static inline uint64_t toInteger(__m128i in) {
 495     return _mm_cvtsi128_si64(in);
 496   }
 497   static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
 498     return toInteger(_mm_add_epi16(in, kDelta));
 499   }
 500 #endif
 501 };
 502
 503 template <>
 504 struct RWTicketIntTrait<32> {
 505   typedef uint32_t FullInt;
 506   typedef uint16_t HalfInt;
 507   typedef uint8_t QuarterInt;
 508
 509 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 510   static __m128i make128(const uint8_t v[4]) {
 511     return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 512         0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 513   }
 514   static inline __m128i fromInteger(uint32_t from) {
 515     return _mm_cvtsi32_si128(from);
 516   }
 517   static inline uint32_t toInteger(__m128i in) {
 518     return _mm_cvtsi128_si32(in);
 519   }
 520   static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
 521     return toInteger(_mm_add_epi8(in, kDelta));
 522   }
 523 #endif
 524 };
 525 }  // detail
 526
 527
 528 template<size_t kBitWidth, bool kFavorWriter=false>
 529 class RWTicketSpinLockT {
 530   typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
 531   typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
 532   typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
 533   typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
 534     QuarterInt;
 535
 536   union RWTicket {
 537     constexpr RWTicket() : whole(0) {}
 538     FullInt whole;
 539     HalfInt readWrite;
 540     __extension__ struct {
 541       QuarterInt write;
 542       QuarterInt read;
 543       QuarterInt users;
 544     };
 545   } ticket;
 546
 547  private: // Some x64-specific utilities for atomic access to ticket.
 548   template<class T> static T load_acquire(T* addr) {
 549     T t = *addr; // acquire barrier
 550     asm_volatile_memory();
 551     return t;
 552   }
 553
 554   template<class T>
 555   static void store_release(T* addr, T v) {
 556     asm_volatile_memory();
 557     *addr = v; // release barrier
 558   }
 559
 560  public:
 561
 562   constexpr RWTicketSpinLockT() {}
 563
 564   RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
 565   RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;
 566
 567   void lock() {
 568     if (kFavorWriter) {
 569       writeLockAggressive();
 570     } else {
 571       writeLockNice();
 572     }
 573   }
 574
 575   /*
 576    * Both try_lock and try_lock_shared diverge in our implementation from the
 577    * lock algorithm described in the link above.
 578    *
 579    * In the read case, it is undesirable that the readers could wait
 580    * for another reader (before increasing ticket.read in the other
 581    * implementation).  Our approach gives up on
 582    * first-come-first-serve, but our benchmarks showed improve
 583    * performance for both readers and writers under heavily contended
 584    * cases, particularly when the number of threads exceeds the number
 585    * of logical CPUs.
 586    *
 587    * We have writeLockAggressive() using the original implementation
 588    * for a writer, which gives some advantage to the writer over the
 589    * readers---for that path it is guaranteed that the writer will
 590    * acquire the lock after all the existing readers exit.
 591    */
 592   bool try_lock() {
 593     RWTicket t;
 594     FullInt old = t.whole = load_acquire(&ticket.whole);
 595     if (t.users != t.write) return false;
 596     ++t.users;
 597     return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
 598   }
 599
 600   /*
 601    * Call this if you want to prioritize writer to avoid starvation.
 602    * Unlike writeLockNice, immediately acquires the write lock when
 603    * the existing readers (arriving before the writer) finish their
 604    * turns.
 605    */
 606   void writeLockAggressive() {
 607     // sched_yield() is needed here to avoid a pathology if the number
 608     // of threads attempting concurrent writes is >= the number of real
 609     // cores allocated to this process. This is less likely than the
 610     // corresponding situation in lock_shared(), but we still want to
 611     // avoid it
 612     int count = 0;
 613     QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
 614     while (val != load_acquire(&ticket.write)) {
 615       asm_volatile_pause();
 616       if (UNLIKELY(++count > 1000)) sched_yield();
 617     }
 618   }
 619
 620   // Call this when the writer should be nicer to the readers.
 621   void writeLockNice() {
 622     // Here it doesn't cpu-relax the writer.
 623     //
 624     // This is because usually we have many more readers than the
 625     // writers, so the writer has less chance to get the lock when
 626     // there are a lot of competing readers.  The aggressive spinning
 627     // can help to avoid starving writers.
 628     //
 629     // We don't worry about sched_yield() here because the caller
 630     // has already explicitly abandoned fairness.
 631     while (!try_lock()) {}
 632   }
 633
 634   // Atomically unlock the write-lock from writer and acquire the read-lock.
 635   void unlock_and_lock_shared() {
 636     QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
 637   }
 638
 639   // Release writer permission on the lock.
 640   void unlock() {
 641     RWTicket t;
 642     t.whole = load_acquire(&ticket.whole);
 643     FullInt old = t.whole;
 644
 645 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 646     // SSE2 can reduce the lock and unlock overhead by 10%
 647     static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
 648     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 649     __m128i m = IntTraitType::fromInteger(old);
 650     t.whole = IntTraitType::addParallel(m, kDelta);
 651 #else
 652     ++t.read;
 653     ++t.write;
 654 #endif
 655     store_release(&ticket.readWrite, t.readWrite);
 656   }
 657
 658   void lock_shared() {
 659     // sched_yield() is important here because we can't grab the
 660     // shared lock if there is a pending writeLockAggressive, so we
 661     // need to let threads that already have a shared lock complete
 662     int count = 0;
 663     while (!LIKELY(try_lock_shared())) {
 664       asm_volatile_pause();
 665       if (UNLIKELY((++count & 1023) == 0)) sched_yield();
 666     }
 667   }
 668
 669   bool try_lock_shared() {
 670     RWTicket t, old;
 671     old.whole = t.whole = load_acquire(&ticket.whole);
 672     old.users = old.read;
 673 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 674     // SSE2 may reduce the total lock and unlock overhead by 10%
 675     static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
 676     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 677     __m128i m = IntTraitType::fromInteger(old.whole);
 678     t.whole = IntTraitType::addParallel(m, kDelta);
 679 #else
 680     ++t.read;
 681     ++t.users;
 682 #endif
 683     return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
 684   }
 685
 686   void unlock_shared() {
 687     QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
 688   }
 689
 690   class WriteHolder;
 691
 692   typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
 693   class ReadHolder {
 694    public:
 695     ReadHolder(ReadHolder const&) = delete;
 696     ReadHolder& operator=(ReadHolder const&) = delete;
 697
 698     explicit ReadHolder(RWSpinLock *lock = nullptr) :
 699       lock_(lock) {
 700       if (lock_) lock_->lock_shared();
 701     }
 702
 703     explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
 704       if (lock_) lock_->lock_shared();
 705     }
 706
 707     // atomically unlock the write-lock from writer and acquire the read-lock
 708     explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
 709       std::swap(this->lock_, writer->lock_);
 710       if (lock_) {
 711         lock_->unlock_and_lock_shared();
 712       }
 713     }
 714
 715     ~ReadHolder() {
 716       if (lock_) lock_->unlock_shared();
 717     }
 718
 719     void reset(RWSpinLock *lock = nullptr) {
 720       if (lock_) lock_->unlock_shared();
 721       lock_ = lock;
 722       if (lock_) lock_->lock_shared();
 723     }
 724
 725     void swap(ReadHolder *other) {
 726       std::swap(this->lock_, other->lock_);
 727     }
 728
 729    private:
 730     RWSpinLock *lock_;
 731   };
 732
 733   class WriteHolder {
 734    public:
 735     WriteHolder(WriteHolder const&) = delete;
 736     WriteHolder& operator=(WriteHolder const&) = delete;
 737
 738     explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
 739       if (lock_) lock_->lock();
 740     }
 741     explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
 742       if (lock_) lock_->lock();
 743     }
 744
 745     ~WriteHolder() {
 746       if (lock_) lock_->unlock();
 747     }
 748
 749     void reset(RWSpinLock *lock = nullptr) {
 750       if (lock == lock_) return;
 751       if (lock_) lock_->unlock();
 752       lock_ = lock;
 753       if (lock_) lock_->lock();
 754     }
 755
 756     void swap(WriteHolder *other) {
 757       std::swap(this->lock_, other->lock_);
 758     }
 759
 760    private:
 761     friend class ReadHolder;
 762     RWSpinLock *lock_;
 763   };
 764
 765   // Synchronized<> adaptors.
 766   friend void acquireRead(RWTicketSpinLockT& mutex) {
 767     mutex.lock_shared();
 768   }
 769   friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
 770     mutex.lock();
 771   }
 772   friend void releaseRead(RWTicketSpinLockT& mutex) {
 773     mutex.unlock_shared();
 774   }
 775   friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
 776     mutex.unlock();
 777   }
 778 };
 779
 780 typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
 781 typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
 782
 783 #endif  // RW_SPINLOCK_USE_X86_INTRINSIC_
 784
 785 }  // namespace folly
 786
 787 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 788 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 789 #endif
 790
 791 #endif  // FOLLY_RWSPINLOCK_H_