/*
 * Copyright 2017-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * N.B. You most likely do _not_ want to use RWSpinLock or any other
 * kind of spinlock. Use SharedMutex instead.
 *
 * In short, spinlocks in preemptive multi-tasking operating systems
 * have serious problems and fast mutexes like SharedMutex are almost
 * certainly the better choice, because letting the OS scheduler put a
 * thread to sleep is better for system responsiveness and throughput
 * than wasting a timeslice repeatedly querying a lock held by a
 * thread that's blocked, and you can't prevent userspace programs from
 * being preempted. Spinlocks in an operating system kernel make much
 * more sense than they do in userspace.
 *
 * -------------------------------------------------------------------
 *
 * Two Read-Write spin lock implementations.
 *
 *   Ref: http://locklessinc.com/articles/locks
 *
 *   Both locks here are faster than pthread_rwlock and have very low
 *   overhead (usually 20-30ns). They don't use any system mutexes and
 *   are very compact (4/8 bytes), so they are suitable for per-instance
 *   locking, particularly when contention is not expected.
 *
 *   For a spinlock, RWSpinLock is a reasonable choice. (See the note
 *   above for why a spin lock is frequently a bad idea generally.)
 *   RWSpinLock has minimal overhead, and comparable contention
 *   performance when the number of competing threads is less than or
 *   equal to the number of logical CPUs. Even as the number of
 *   threads gets larger, RWSpinLock can still be very competitive in
 *   READ, although it is slower on WRITE, and also inherently unfair
 *   to writers.
 *
 *   RWTicketSpinLock shows more balanced READ/WRITE performance. If
 *   your application really needs a lot more threads, and a
 *   higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 *   Caveats:
 *
 *     RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *     based systems.
 *
 *     RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *     readers and writers.
 *
 *     RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *     readers and writers.
 *
 *     RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
 *     writer priority) is NOT reentrant, even for lock_shared().
 *
 *     The lock will not grant any new shared (read) accesses while a thread
 *     attempting to acquire the lock in write mode is blocked. (That is,
 *     if the lock is held in shared mode by N threads, and a thread attempts
 *     to acquire it in write mode, no one else can acquire it in shared
 *     mode until these N threads release the lock and then the blocked
 *     thread acquires and releases the exclusive lock.) This also applies
 *     to attempts to reacquire the lock in shared mode by threads that
 *     already hold it in shared mode, making the lock non-reentrant, as
 *     the sketch below illustrates.
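 *
 *     For example (an illustrative sketch, not code from this header),
 *     a thread that already holds a strict-writer-priority ticket lock
 *     in shared mode can deadlock itself:
 *
 *       folly::RWTicketSpinLockT<32, true> lock;
 *       lock.lock_shared();  // first shared acquisition succeeds
 *       // ... another thread now calls lock.lock() and blocks ...
 *       lock.lock_shared();  // spins forever: new readers are barred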
 *
 *     RWSpinLock handles 2^30 - 1 concurrent readers.
 *
 * @author Xin Liu <xliux@fb.com>
 */

#pragma once
/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
========================================================================

------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                      Iters  Total t   t/iter iter/sec
------------------------------------------------------------------------------
*      BM_RWSpinLockRead                      100000  1.786 ms  17.86 ns  53.4M
+30.5% BM_RWSpinLockWrite                     100000  2.331 ms  23.31 ns 40.91M
+85.7% BM_RWTicketSpinLock32Read              100000  3.317 ms  33.17 ns 28.75M
+96.0% BM_RWTicketSpinLock32Write             100000    3.5 ms     35 ns 27.25M
+85.6% BM_RWTicketSpinLock64Read              100000  3.315 ms  33.15 ns 28.77M
+96.0% BM_RWTicketSpinLock64Write             100000    3.5 ms     35 ns 27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead   100000  3.317 ms  33.17 ns 28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite  100000  2.316 ms  23.16 ns 41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead   100000  3.309 ms  33.09 ns 28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite  100000  2.325 ms  23.25 ns 41.02M
+ 175% BM_PThreadRWMutexRead                  100000  4.917 ms  49.17 ns  19.4M
+ 166% BM_PThreadRWMutexWrite                 100000  4.757 ms  47.57 ns 20.05M

------------------------------------------------------------------------------
2. Contention Benchmark (90% read, 10% write)
Benchmark                hits      average  min    max     sigma
------------------------------------------------------------------------------
---------- 8 threads ------------
RWSpinLock       Write   142666    220ns    78ns   40.8us  269ns
RWSpinLock       Read    1282297   222ns    80ns   37.7us  248ns
RWTicketSpinLock Write   85692     209ns    71ns   17.9us  252ns
RWTicketSpinLock Read    769571    215ns    78ns   33.4us  251ns
pthread_rwlock_t Write   84248     2.48us   99ns   269us   8.19us
pthread_rwlock_t Read    761646    933ns    101ns  374us   3.25us

---------- 16 threads ------------
RWSpinLock       Write   124236    237ns    78ns   261us   801ns
RWSpinLock       Read    1115807   236ns    78ns   2.27ms  2.17us
RWTicketSpinLock Write   81781     231ns    71ns   31.4us  351ns
RWTicketSpinLock Read    734518    238ns    78ns   73.6us  379ns
pthread_rwlock_t Write   83363     7.12us   99ns   785us   28.1us
pthread_rwlock_t Read    754978    2.18us   101ns  1.02ms  14.3us

---------- 50 threads ------------
RWSpinLock       Write   131142    1.37us   82ns   7.53ms  68.2us
RWSpinLock       Read    1181240   262ns    78ns   6.62ms  12.7us
RWTicketSpinLock Write   83045     397ns    73ns   7.01ms  31.5us
RWTicketSpinLock Read    744133    386ns    78ns   11ms    31.4us
pthread_rwlock_t Write   80849     112us    103ns  4.52ms  263us
pthread_rwlock_t Read    728698    24us     101ns  7.28ms  194us
*/
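/*
 * Usage sketch (illustrative only, not part of the original header).
 * RWSpinLock models the standard Lockable/SharedLockable interfaces, so
 * it can be used directly or through the RAII holders defined below:
 *
 *   folly::RWSpinLock lock;
 *   {
 *     folly::RWSpinLock::ReadHolder guard(&lock);  // shared access
 *     // ... read the protected data ...
 *   } // unlock_shared() on scope exit
 *   {
 *     folly::RWSpinLock::WriteHolder guard(&lock); // exclusive access
 *     // ... mutate the protected data ...
 *   } // unlock() on scope exit
 */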
#include <folly/Portability.h>
#include <folly/portability/Asm.h>

#if defined(__GNUC__) && (defined(__i386) || FOLLY_X64 || defined(ARCH_K8))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#elif defined(_MSC_VER) && defined(FOLLY_X64)
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#elif FOLLY_AARCH64
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// iOS doesn't define _mm_cvtsi64_si128 and friends
#if (FOLLY_SSE >= 2) && !FOLLY_MOBILE && FOLLY_X64
#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#else
#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#endif
#endif
#include <algorithm>
#include <atomic>
#include <thread>

#include <glog/logging.h>

#include <folly/Likely.h>

namespace folly {
/*
 * A simple, small (4-bytes), but unfair rwlock. Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance. If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related
 * locking/unlocking interfaces.
 */
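/*
 * A minimal sketch of the upgrade protocol (illustrative only): a thread
 * that may or may not need to write takes the upgrade lock, which
 * coexists with readers that already hold the lock but blocks new
 * readers and other upgraders/writers, then promotes to exclusive mode
 * only if needed.
 *
 *   folly::RWSpinLock lock;
 *   lock.lock_upgrade();              // existing readers keep running
 *   if (mustWrite) {                  // 'mustWrite' is a placeholder
 *     lock.unlock_upgrade_and_lock(); // waits for readers to drain
 *     // ... write ...
 *     lock.unlock();                  // clears WRITER and UPGRADED bits
 *   } else {
 *     lock.unlock_upgrade();
 *   }
 */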
class RWSpinLock {
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };

 public:
  constexpr RWSpinLock() : bits_(0) {}

  RWSpinLock(RWSpinLock const&) = delete;
  RWSpinLock& operator=(RWSpinLock const&) = delete;
  // Lockable Concept
  void lock() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }
  // SharedLockable Concept
  void lock_shared() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_shared() {
    bits_.fetch_add(-READER, std::memory_order_release);
  }

  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }
  // UpgradeLockable Concept
  void lock_upgrade() {
    uint_fast32_t count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    int64_t count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }
  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed
    // at the same time when other threads are trying to do try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }
  // Attempt to acquire writer permission. Return false if we didn't get it.
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (UNLIKELY(value & (WRITER | UPGRADED))) {
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }
  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's
    // done with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const { return bits_.load(std::memory_order_acquire); }
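  // Worked example of the state encoding (illustrative, not from the
  // original source): WRITER occupies bit 0, UPGRADED bit 1, and the
  // remaining 30 bits count readers in units of READER (4). For example,
  // bits() == 10 decodes as two readers (2 * READER) plus a pending
  // upgrader (UPGRADED); this layout is why the lock supports at most
  // 2^30 - 1 concurrent readers.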
  class ReadHolder;
  class UpgradedHolder;
  class WriteHolder;
  class ReadHolder {
   public:
    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) { lock_->lock_shared(); }
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }

    ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    // down-grade from an upgrade lock holder
    explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
      upgraded.lock_ = nullptr;
      if (lock_) { lock_->unlock_upgrade_and_lock_shared(); }
    }

    // down-grade from a write lock holder
    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      writer.lock_ = nullptr;
      if (lock_) { lock_->unlock_and_lock_shared(); }
    }

    ReadHolder& operator=(ReadHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    ReadHolder(const ReadHolder& other) = delete;
    ReadHolder& operator=(const ReadHolder& other) = delete;

    ~ReadHolder() {
      if (lock_) { lock_->unlock_shared(); }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) { return; }
      if (lock_) { lock_->unlock_shared(); }
      lock_ = lock;
      if (lock_) { lock_->lock_shared(); }
    }

    void swap(ReadHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class UpgradedHolder;
    friend class WriteHolder;
    RWSpinLock* lock_;
  };
  class UpgradedHolder {
   public:
    explicit UpgradedHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) { lock_->lock_upgrade(); }
    }

    explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    // down-grade from a write lock holder
    explicit UpgradedHolder(WriteHolder&& writer) {
      lock_ = writer.lock_;
      writer.lock_ = nullptr;
      if (lock_) { lock_->unlock_and_lock_upgrade(); }
    }

    UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    UpgradedHolder& operator=(UpgradedHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    UpgradedHolder(const UpgradedHolder& other) = delete;
    UpgradedHolder& operator=(const UpgradedHolder& other) = delete;

    ~UpgradedHolder() {
      if (lock_) { lock_->unlock_upgrade(); }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) { return; }
      if (lock_) { lock_->unlock_upgrade(); }
      lock_ = lock;
      if (lock_) { lock_->lock_upgrade(); }
    }

    void swap(UpgradedHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
  class WriteHolder {
   public:
    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) { lock_->lock(); }
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock();
    }

    // promoted from an upgrade lock holder
    explicit WriteHolder(UpgradedHolder&& upgraded) {
      lock_ = upgraded.lock_;
      upgraded.lock_ = nullptr;
      if (lock_) { lock_->unlock_upgrade_and_lock(); }
    }

    WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& other) = delete;
    WriteHolder& operator=(const WriteHolder& other) = delete;

    ~WriteHolder() {
      if (lock_) { lock_->unlock(); }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) { return; }
      if (lock_) { lock_->unlock(); }
      lock_ = lock;
      if (lock_) { lock_->lock(); }
    }

    void swap(WriteHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    friend class UpgradedHolder;
    RWSpinLock* lock_;
  };

 private:
  std::atomic<int32_t> bits_;
};
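/*
 * RAII usage sketch (illustrative only): the holder classes pair each
 * acquire with the matching release on scope exit, and the move
 * constructors perform the atomic transitions (e.g. WriteHolder ->
 * ReadHolder downgrades via unlock_and_lock_shared(), with no window
 * where the lock is unheld):
 *
 *   folly::RWSpinLock lock;
 *   folly::RWSpinLock::WriteHolder wh(&lock);        // exclusive
 *   // ... initialize the protected data ...
 *   folly::RWSpinLock::ReadHolder rh(std::move(wh)); // downgrade
 *   // ... read under shared mode; unlock_shared() at end of scope ...
 */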
#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented based on GCC intrinsics.

namespace detail {
template <size_t kBitWidth>
struct RWTicketIntTrait {
  static_assert(
      kBitWidth == 32 || kBitWidth == 64,
      "bit width has to be either 32 or 64");
};
template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(
        0, 0, 0, 0, short(v[3]), short(v[2]), short(v[1]), short(v[0]));
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(int64_t(from));
  }
  static inline uint64_t toInteger(__m128i in) {
    return uint64_t(_mm_cvtsi128_si64(in));
  }
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};
template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint8_t v[4]) {
    return _mm_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        char(v[3]), char(v[2]), char(v[1]), char(v[0]));
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(int32_t(from));
  }
  static inline uint32_t toInteger(__m128i in) {
    return uint32_t(_mm_cvtsi128_si32(in));
  }
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};
} // namespace detail
template <size_t kBitWidth, bool kFavorWriter = false>
class RWTicketSpinLockT {
  typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
      QuarterInt;

  union RWTicket {
    constexpr RWTicket() : whole(0) {}
    FullInt whole;
    HalfInt readWrite;
    __extension__ struct {
      QuarterInt write;
      QuarterInt read;
      QuarterInt users;
    };
  } ticket;
 private: // Some x64-specific utilities for atomic access to ticket.
  template <class T>
  static T load_acquire(T* addr) {
    T t = *addr; // acquire barrier
    asm_volatile_memory();
    return t;
  }

  template <class T>
  static void store_release(T* addr, T v) {
    asm_volatile_memory();
    *addr = v; // release barrier
  }
 public:
  constexpr RWTicketSpinLockT() {}

  RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
  RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;

  void lock() {
    if (kFavorWriter) {
      writeLockAggressive();
    } else {
      writeLockNice();
    }
  }
  /*
   * Both try_lock and try_lock_shared diverge in our implementation from the
   * lock algorithm described in the link above.
   *
   * In the read case, it is undesirable that the readers could wait
   * for another reader (before increasing ticket.read in the other
   * implementation). Our approach gives up on
   * first-come-first-serve, but our benchmarks showed improved
   * performance for both readers and writers under heavily contended
   * cases, particularly when the number of threads exceeds the number
   * of logical CPUs.
   *
   * We have writeLockAggressive() using the original implementation
   * for a writer, which gives some advantage to the writer over the
   * readers---for that path it is guaranteed that the writer will
   * acquire the lock after all the existing readers exit. (A worked
   * example of the counter transitions appears after unlock_shared()
   * below.)
   */
  bool try_lock() {
    RWTicket t;
    FullInt old = t.whole = load_acquire(&ticket.whole);
    if (t.users != t.write) {
      return false;
    }
    ++t.users;
    return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
  }
  /*
   * Call this if you want to prioritize the writer to avoid starvation.
   * Unlike writeLockNice, it immediately acquires the write lock when
   * the existing readers (arriving before the writer) finish their
   * turns.
   */
  void writeLockAggressive() {
    // std::this_thread::yield() is needed here to avoid a pathology if the
    // number of threads attempting concurrent writes is >= the number of real
    // cores allocated to this process. This is less likely than the
    // corresponding situation in lock_shared(), but we still want to
    // avoid it.
    uint_fast32_t count = 0;
    QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
    while (val != load_acquire(&ticket.write)) {
      asm_volatile_pause();
      if (UNLIKELY(++count > 1000)) {
        std::this_thread::yield();
      }
    }
  }
  // Call this when the writer should be nicer to the readers.
  void writeLockNice() {
    // Here it doesn't cpu-relax the writer.
    //
    // This is because usually we have many more readers than
    // writers, so the writer has less chance to get the lock when
    // there are a lot of competing readers. The aggressive spinning
    // can help to avoid starving writers.
    //
    // We don't worry about std::this_thread::yield() here because the caller
    // has already explicitly abandoned fairness.
    while (!try_lock()) {
    }
  }
  // Atomically unlock the write-lock from writer and acquire the read-lock.
  void unlock_and_lock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
    (void)val; // silence unused-variable warnings
  }

  // Release writer permission on the lock.
  void unlock() {
    RWTicket t;
    t.whole = load_acquire(&ticket.whole);
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    FullInt old = t.whole;
    // SSE2 can reduce the lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {1, 1, 0, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.write;
#endif
    store_release(&ticket.readWrite, t.readWrite);
  }
  void lock_shared() {
    // std::this_thread::yield() is important here because we can't grab the
    // shared lock if there is a pending writeLockAggressive, so we
    // need to let threads that already have a shared lock complete.
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      asm_volatile_pause();
      if (UNLIKELY((++count & 1023) == 0)) {
        std::this_thread::yield();
      }
    }
  }
  bool try_lock_shared() {
    RWTicket t, old;
    old.whole = t.whole = load_acquire(&ticket.whole);
    old.users = old.read;
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 may reduce the total lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {0, 1, 1, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old.whole);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.users;
#endif
    return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
  }

  void unlock_shared() {
    __sync_fetch_and_add(&ticket.write, 1);
  }
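  // Worked example of the counter transitions (illustrative, not from the
  // original source), written as (write, read, users) starting at (0, 0, 0):
  // a writer takes ticket users++ == 0 and owns the lock since write == 0.
  // A concurrent try_lock_shared() fails while users != read. The writer's
  // unlock() bumps read and write to (1, 1, 1); two readers can now enter,
  // each doing read++/users++ -> (1, 3, 3). Each unlock_shared() bumps
  // write, and once write reaches 3 the next queued writer (ticket 3)
  // proceeds.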
  class WriteHolder;

  typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;

  class ReadHolder {
   public:
    ReadHolder(ReadHolder const&) = delete;
    ReadHolder& operator=(ReadHolder const&) = delete;

    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) { lock_->lock_shared(); }
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) { lock_->lock_shared(); }
    }

    // atomically unlock the write-lock from writer and acquire the read-lock
    explicit ReadHolder(WriteHolder* writer) : lock_(nullptr) {
      std::swap(this->lock_, writer->lock_);
      if (lock_) { lock_->unlock_and_lock_shared(); }
    }

    ~ReadHolder() {
      if (lock_) { lock_->unlock_shared(); }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock_) { lock_->unlock_shared(); }
      lock_ = lock;
      if (lock_) { lock_->lock_shared(); }
    }

    void swap(ReadHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    RWSpinLock* lock_;
  };
  class WriteHolder {
   public:
    WriteHolder(WriteHolder const&) = delete;
    WriteHolder& operator=(WriteHolder const&) = delete;

    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) { lock_->lock(); }
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) { lock_->lock(); }
    }

    ~WriteHolder() {
      if (lock_) { lock_->unlock(); }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) { return; }
      if (lock_) { lock_->unlock(); }
      lock_ = lock;
      if (lock_) { lock_->lock(); }
    }

    void swap(WriteHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
};
typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
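/*
 * Usage sketch for the ticket locks (illustrative only): the aliases above
 * pick the counter width; pass kFavorWriter = true explicitly for strict
 * writer priority.
 *
 *   folly::RWTicketSpinLock64 lock;
 *   {
 *     folly::RWTicketSpinLock64::ReadHolder guard(&lock);
 *     // ... shared section ...
 *   }
 *   {
 *     folly::RWTicketSpinLock64::WriteHolder guard(&lock);
 *     // ... exclusive section ...
 *   }
 */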
#endif // RW_SPINLOCK_USE_X86_INTRINSIC_

} // namespace folly

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif