/*
 * Copyright 2017 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * N.B. You most likely do _not_ want to use RWSpinLock or any other
 * kind of spinlock. Use SharedMutex instead.
 *
 * In short, spinlocks in preemptive multi-tasking operating systems
 * have serious problems and fast mutexes like SharedMutex are almost
 * certainly the better choice, because letting the OS scheduler put a
 * thread to sleep is better for system responsiveness and throughput
 * than wasting a timeslice repeatedly querying a lock held by a
 * thread that's blocked, and you can't prevent userspace programs
 * from blocking.
 *
 * Spinlocks in an operating system kernel make much more sense than
 * they do in userspace.
 *
 * -------------------------------------------------------------------
 *
 * Two Read-Write spin lock implementations.
 *
 * Ref: http://locklessinc.com/articles/locks
 *
 * Both locks here are faster than pthread_rwlock and have very low
 * overhead (usually 20-30ns). They don't use any system mutexes and
 * are very compact (4/8 bytes), so are suitable for per-instance
 * based locking, particularly when contention is not expected.
 *
 * For a spinlock, RWSpinLock is a reasonable choice. (See the note
 * above for why a spinlock is frequently a bad idea in general.)
 * RWSpinLock has minimal overhead, and comparable contention
 * performance when the number of competing threads is less than or
 * equal to the number of logical CPUs. Even as the number of
 * threads gets larger, RWSpinLock can still be very competitive in
 * READ, although it is slower on WRITE, and also inherently unfair
 * to writers.
 *
 * RWTicketSpinLock shows more balanced READ/WRITE performance. If
 * your application really needs a lot more threads, and a
 * higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 * Caveats:
 *
 *   RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *   based systems.
 *
 *   RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *   readers and writers.
 *
 *   RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *   readers and writers.
 *
 *   RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
 *   writer priority) is NOT reentrant, even for lock_shared().
 *
 *   The lock will not grant any new shared (read) accesses while a thread
 *   attempting to acquire the lock in write mode is blocked. (That is,
 *   if the lock is held in shared mode by N threads, and a thread attempts
 *   to acquire it in write mode, no one else can acquire it in shared mode
 *   until these N threads release the lock and then the blocked thread
 *   acquires and releases the exclusive lock.) This also applies to
 *   attempts to reacquire the lock in shared mode by threads that already
 *   hold it in shared mode, making the lock non-reentrant.
 *
 *   RWSpinLock handles 2^30 - 1 concurrent readers.
 *
 * @author Xin Liu <xliux@fb.com>
 */

#pragma once
/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
========================================================================

------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                    Iters   Total t    t/iter iter/sec
-------------------------------------------------------------------------------
*      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
+30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
+85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
+96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
+85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
+96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
+ 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
+ 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M

------------------------------------------------------------------------------
2. Contention Benchmark: 90% read, 10% write
Benchmark                    hits  average    min     max    sigma
------------------------------------------------------------------------------
---------- 8 threads ------------
RWSpinLock       Write     142666    220ns   78ns  40.8us    269ns
RWSpinLock       Read     1282297    222ns   80ns  37.7us    248ns
RWTicketSpinLock Write      85692    209ns   71ns  17.9us    252ns
RWTicketSpinLock Read      769571    215ns   78ns  33.4us    251ns
pthread_rwlock_t Write      84248   2.48us   99ns   269us   8.19us
pthread_rwlock_t Read      761646    933ns  101ns   374us   3.25us

---------- 16 threads ------------
RWSpinLock       Write     124236    237ns   78ns   261us    801ns
RWSpinLock       Read     1115807    236ns   78ns  2.27ms   2.17us
RWTicketSpinLock Write      81781    231ns   71ns  31.4us    351ns
RWTicketSpinLock Read      734518    238ns   78ns  73.6us    379ns
pthread_rwlock_t Write      83363   7.12us   99ns   785us   28.1us
pthread_rwlock_t Read      754978   2.18us  101ns  1.02ms   14.3us

---------- 50 threads ------------
RWSpinLock       Write     131142   1.37us   82ns  7.53ms   68.2us
RWSpinLock       Read     1181240    262ns   78ns  6.62ms   12.7us
RWTicketSpinLock Write      83045    397ns   73ns  7.01ms   31.5us
RWTicketSpinLock Read      744133    386ns   78ns    11ms   31.4us
pthread_rwlock_t Write      80849    112us  103ns  4.52ms    263us
pthread_rwlock_t Read      728698     24us  101ns  7.28ms    194us
*/
#include <folly/Portability.h>
#include <folly/portability/Asm.h>

#if defined(__GNUC__) && (defined(__i386) || FOLLY_X64 || defined(ARCH_K8))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#elif defined(_MSC_VER) && defined(FOLLY_X64)
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#elif FOLLY_AARCH64
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

// iOS doesn't define _mm_cvtsi64_si128 and friends
#if (FOLLY_SSE >= 2) && !FOLLY_MOBILE && FOLLY_X64
#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#else
#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#endif
#include <algorithm>
#include <atomic>
#include <cstdint>
#include <thread>
#include <utility>

#include <glog/logging.h>

#include <folly/Likely.h>

namespace folly {
/*
 * A simple, small (4-bytes), but unfair rwlock. Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance. If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related
 * locking/unlocking interfaces.
 */
class RWSpinLock {
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
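  // How these bits pack into the single atomic word below: bit 0 is the
  // WRITER flag, bit 1 is the UPGRADED flag, and bits 2..31 hold the reader
  // count (each shared holder adds READER == 4). This is why a reader can be
  // admitted with one fetch_add(READER): adding a multiple of 4 can never
  // disturb the two low flag bits.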
 public:
  constexpr RWSpinLock() : bits_(0) {}

  RWSpinLock(RWSpinLock const&) = delete;
  RWSpinLock& operator=(RWSpinLock const&) = delete;
  // Lockable Concept
  void lock() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }
  // SharedLockable Concept
  void lock_shared() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_shared() {
    bits_.fetch_add(-READER, std::memory_order_release);
  }

  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }
  // UpgradeLockable Concept
  void lock_upgrade() {
    uint_fast32_t count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    int64_t count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }

  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed
    // at the same time when other threads are trying to do try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }
  // Attempt to acquire writer permission. Return false if we didn't get it.
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (UNLIKELY(value & (WRITER | UPGRADED))) {
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }

  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's
    // done with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const { return bits_.load(std::memory_order_acquire); }
  class ReadHolder;
  class UpgradedHolder;
  class WriteHolder;

  class ReadHolder {
   public:
    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock_shared();
      }
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }

    ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    // down-grade from an upgrade lock holder
    explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
      upgraded.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_upgrade_and_lock_shared();
      }
    }

    // down-grade from a write lock holder
    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      writer.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_and_lock_shared();
      }
    }

    ReadHolder& operator=(ReadHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    ReadHolder(const ReadHolder& other) = delete;
    ReadHolder& operator=(const ReadHolder& other) = delete;

    ~ReadHolder() {
      if (lock_) {
        lock_->unlock_shared();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock_shared();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock_shared();
      }
    }

    void swap(ReadHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class UpgradedHolder;
    friend class WriteHolder;
    RWSpinLock* lock_;
  };
  class UpgradedHolder {
   public:
    explicit UpgradedHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock_upgrade();
      }
    }

    explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    // down-grade from a write lock holder
    explicit UpgradedHolder(WriteHolder&& writer) {
      lock_ = writer.lock_;
      writer.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_and_lock_upgrade();
      }
    }

    UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    UpgradedHolder& operator=(UpgradedHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    UpgradedHolder(const UpgradedHolder& other) = delete;
    UpgradedHolder& operator=(const UpgradedHolder& other) = delete;

    ~UpgradedHolder() {
      if (lock_) {
        lock_->unlock_upgrade();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock_upgrade();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock_upgrade();
      }
    }

    void swap(UpgradedHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
  class WriteHolder {
   public:
    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock();
      }
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock();
    }

    // promoted from an upgrade lock holder
    explicit WriteHolder(UpgradedHolder&& upgraded) {
      lock_ = upgraded.lock_;
      upgraded.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_upgrade_and_lock();
      }
    }

    WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& other) = delete;
    WriteHolder& operator=(const WriteHolder& other) = delete;

    ~WriteHolder() {
      if (lock_) {
        lock_->unlock();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock();
      }
    }

    void swap(WriteHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    friend class UpgradedHolder;
    RWSpinLock* lock_;
  };

 private:
  std::atomic<int32_t> bits_;
};
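// Illustrative sketch (not part of the original header; the type and member
// names are hypothetical): how the holder types above compose, including an
// upgrade that is promoted to a write lock only when a write turns out to be
// needed.
struct ExampleCounter {
  RWSpinLock lock_;
  int value_ = 0;

  int read() {
    RWSpinLock::ReadHolder guard(&lock_); // shared access
    return value_;
  }

  void bumpIfBelow(int threshold) {
    // Upgrade mode: readers already inside may finish, but the UPGRADED bit
    // blocks any new readers while we decide.
    RWSpinLock::UpgradedHolder upgraded(&lock_);
    if (value_ < threshold) {
      // Promotion spins in unlock_upgrade_and_lock() until the existing
      // readers drain, then holds the lock exclusively.
      RWSpinLock::WriteHolder writer(std::move(upgraded));
      ++value_;
    }
  }
};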
#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented with GCC intrinsics.

namespace detail {
template <size_t kBitWidth> struct RWTicketIntTrait {
  static_assert(kBitWidth == 32 || kBitWidth == 64,
                "bit width has to be either 32 or 64");
};

template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(
        0, 0, 0, 0, short(v[3]), short(v[2]), short(v[1]), short(v[0]));
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(int64_t(from));
  }
  static inline uint64_t toInteger(__m128i in) {
    return uint64_t(_mm_cvtsi128_si64(in));
  }
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};

template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint8_t v[4]) {
    return _mm_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        char(v[3]), char(v[2]), char(v[1]), char(v[0]));
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(int32_t(from));
  }
  static inline uint32_t toInteger(__m128i in) {
    return uint32_t(_mm_cvtsi128_si32(in));
  }
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};
} // namespace detail
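// Illustrative sketch (hypothetical helper, not part of the original header):
// the ticket lock below packs its write/read/users counters into one integer,
// and addParallel() bumps several of them with a single SIMD add. With the
// 32-bit traits, incrementing "read" and "users" together looks like this:
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
inline uint32_t exampleBumpReadAndUsers(uint32_t packed) {
  typedef detail::RWTicketIntTrait<32> Traits;
  // Quarter-int deltas in write/read/users order (lowest byte first).
  static const uint8_t kDeltaBuf[4] = {0, 1, 1, 0};
  static const __m128i kDelta = Traits::make128(kDeltaBuf);
  // One paddb replaces two separate read-modify-writes of byte fields.
  return Traits::addParallel(Traits::fromInteger(packed), kDelta);
}
#endif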
template <size_t kBitWidth, bool kFavorWriter = false>
class RWTicketSpinLockT {
  typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt QuarterInt;

  // The ticket state: "whole" overlays the three quarter-width counters
  // write, read, and users; "readWrite" overlays write and read together.
  union RWTicket {
    constexpr RWTicket() : whole(0) {}
    FullInt whole;
    HalfInt readWrite;
    __extension__ struct {
      QuarterInt write;
      QuarterInt read;
      QuarterInt users;
    };
  } ticket;

 private: // Some x64-specific utilities for atomic access to ticket.
  template <class T>
  static T load_acquire(T* addr) {
    T t = *addr; // acquire barrier
    asm_volatile_memory();
    return t;
  }

  template <class T>
  static void store_release(T* addr, T v) {
    asm_volatile_memory();
    *addr = v; // release barrier
  }
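  // These rely on x86's strong memory model (TSO): ordinary aligned loads
  // already behave as acquire and ordinary aligned stores as release, so
  // asm_volatile_memory() only needs to be a compiler barrier that keeps
  // the compiler from reordering around the access.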
 public:
  constexpr RWTicketSpinLockT() {}

  RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
  RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;

  void lock() {
    if (kFavorWriter) {
      writeLockAggressive();
    } else {
      writeLockNice();
    }
  }
  /*
   * Both try_lock and try_lock_shared diverge in our implementation from the
   * lock algorithm described in the link above.
   *
   * In the read case, it is undesirable that the readers could wait
   * for another reader (before increasing ticket.read in the other
   * implementation). Our approach gives up on
   * first-come-first-serve, but our benchmarks showed improved
   * performance for both readers and writers under heavily contended
   * cases, particularly when the number of threads exceeds the number
   * of logical CPUs.
   *
   * We have writeLockAggressive() using the original implementation
   * for a writer, which gives some advantage to the writer over the
   * readers---for that path it is guaranteed that the writer will
   * acquire the lock after all the existing readers exit.
   */
  bool try_lock() {
    RWTicket t;
    FullInt old = t.whole = load_acquire(&ticket.whole);
    if (t.users != t.write) { // outstanding tickets: the lock is held
      return false;
    }
    ++t.users;
    return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
  }
  /*
   * Call this if you want to prioritize the writer to avoid starvation.
   * Unlike writeLockNice, this immediately acquires the write lock when
   * the existing readers (arriving before the writer) finish their
   * reads.
   */
  void writeLockAggressive() {
    // std::this_thread::yield() is needed here to avoid a pathology if the
    // number of threads attempting concurrent writes is >= the number of real
    // cores allocated to this process. This is less likely than the
    // corresponding situation in lock_shared(), but we still want to
    // avoid it.
    uint_fast32_t count = 0;
    QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
    while (val != load_acquire(&ticket.write)) {
      asm_volatile_pause();
      if (UNLIKELY(++count > 1000)) {
        std::this_thread::yield();
      }
    }
  }
  // Call this when the writer should be nicer to the readers.
  void writeLockNice() {
    // Here it doesn't cpu-relax the writer.
    //
    // This is because usually we have many more readers than
    // writers, so the writer has less chance to get the lock when
    // there are a lot of competing readers. The aggressive spinning
    // can help to avoid starving writers.
    //
    // We don't worry about std::this_thread::yield() here because the caller
    // has already explicitly abandoned fairness.
    while (!try_lock()) {}
  }
  // Atomically unlock the write-lock from writer and acquire the read-lock.
  void unlock_and_lock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
    (void)val; // suppress unused-variable warnings
  }

  // Release writer permission on the lock.
  void unlock() {
    RWTicket t;
    t.whole = load_acquire(&ticket.whole);

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    FullInt old = t.whole;
    // SSE2 can reduce the lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {1, 1, 0, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.write;
#endif
    store_release(&ticket.readWrite, t.readWrite);
  }
  void lock_shared() {
    // std::this_thread::yield() is important here because we can't grab the
    // shared lock if there is a pending writeLockAggressive, so we
    // need to let threads that already have a shared lock complete.
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      asm_volatile_pause();
      if (UNLIKELY((++count & 1023) == 0)) {
        std::this_thread::yield();
      }
    }
  }
  bool try_lock_shared() {
    RWTicket t, old;
    old.whole = t.whole = load_acquire(&ticket.whole);
    old.users = old.read; // the CAS below succeeds only if no writer is in
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 may reduce the total lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {0, 1, 1, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old.whole);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.users;
#endif
    return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
  }
  void unlock_shared() {
    __sync_fetch_and_add(&ticket.write, 1);
  }
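  // Note on the ticket scheme: a reader "releases" by advancing ticket.write,
  // and a waiting writer (spinning until its ticket from ticket.users equals
  // ticket.write) therefore proceeds only after every earlier holder is done.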
  class WriteHolder;

  typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;

  class ReadHolder {
   public:
    ReadHolder(ReadHolder const&) = delete;
    ReadHolder& operator=(ReadHolder const&) = delete;

    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock_shared();
      }
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) {
        lock_->lock_shared();
      }
    }

    // atomically unlock the write-lock from writer and acquire the read-lock
    explicit ReadHolder(WriteHolder* writer) : lock_(nullptr) {
      std::swap(this->lock_, writer->lock_);
      if (lock_) {
        lock_->unlock_and_lock_shared();
      }
    }

    ~ReadHolder() {
      if (lock_) {
        lock_->unlock_shared();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock_) {
        lock_->unlock_shared();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock_shared();
      }
    }

    void swap(ReadHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    RWSpinLock* lock_;
  };
  class WriteHolder {
   public:
    WriteHolder(WriteHolder const&) = delete;
    WriteHolder& operator=(WriteHolder const&) = delete;

    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock();
      }
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) {
        lock_->lock();
      }
    }

    ~WriteHolder() {
      if (lock_) {
        lock_->unlock();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock();
      }
    }

    void swap(WriteHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
};
typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
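// Illustrative sketch (hypothetical function name, not part of the original
// header): basic use of the ticket lock, which deliberately has no upgrade
// mode. kFavorWriter only selects which path lock() takes; the two write-lock
// flavors can also be called directly, as here.
inline void exampleTicketLockUsage(RWTicketSpinLock32& lock, int& value) {
  lock.lock_shared(); // shared (read) critical section
  int snapshot = value;
  lock.unlock_shared();

  lock.writeLockAggressive(); // take a ticket; earlier holders drain first
  value = snapshot + 1;
  lock.unlock();
}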
#endif // RW_SPINLOCK_USE_X86_INTRINSIC_

} // namespace folly
#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif