folly/RWSpinLock.h

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*************************************************
  18
  19 IF YOU HAVE TO ASK, NO, YOU DO NOT WANT A SPINLOCK.
  20
  21 Yes, even if a microbench shows that a spinlock is faster, you still probably
  22 don't want one.
  23
  24 Spinlocks in general are a big problem on systems for which you cannot disable
  25 preemption, like normal user-space code running on POXIX and Windows
  26 platforms. If the thread holding a spinlock is preempted, another thread
  27 trying to acquire the lock and pounding the lock variable
  28 has no idea that it's spinning in vain. Some spinlock implementations use
  29 sched_yield or similar to try to make this problem less severe --- we don't
  30 use the rest of our timeslice to pointlessly read a variable ---
  31 but the overall result is still poor, especially if the thread locking the lock
  32 sleeps (which any thread can do on a demand-paging system), then we'll
  33 sched_yield over and over again, still pointlessly, pounding on the lock.
  34
  35 You really want a plain old mutex. Regular mutexes will spin for a little while,
  36 then go to sleep.  While sleeping, threads use no CPU resources and do
  37 not cause scheduler contention.
  38
  39 There are exceptions to the above warning.  If you have to ask, no, your
  40 code doesn't qualify.
  41
  42
  43 STOP USING SPINLOCKS IN ORDINARY CODE.
  44
  45
  46 **************************************************/
  47
  48 /*
  49  * Two Read-Write spin lock implementations.
  50  *
  51  *  Ref: http://locklessinc.com/articles/locks
  52  *
  53  *  Both locks here are faster than pthread_rwlock and have very low
  54  *  overhead (usually 20-30ns).  They don't use any system mutexes and
  55  *  are very compact (4/8 bytes), so are suitable for per-instance
  56  *  based locking, particularly when contention is not expected.
  57  *
  58  *  In most cases, RWSpinLock is a reasonable choice.  It has minimal
  59  *  overhead, and comparable contention performance when the number of
  60  *  competing threads is less than or equal to the number of logical
  61  *  CPUs.  Even as the number of threads gets larger, RWSpinLock can
  62  *  still be very competitive in READ, although it is slower on WRITE,
  63  *  and also inherently unfair to writers.
  64  *
  65  *  RWTicketSpinLock shows more balanced READ/WRITE performance.  If
  66  *  your application really needs a lot more threads, and a
  67  *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
  68  *
  69  *  Caveats:
  70  *
  71  *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
  72  *    based systems.
  73  *
  74  *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
  75  *    readers and writers.
  76  *
  77  *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
  78  *    readers and writers.
  79  *
  80  *    RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
  81  *    writer priority) is NOT reentrant, even for lock_shared().
  82  *
  83  *    The lock will not grant any new shared (read) accesses while a thread
  84  *    attempting to acquire the lock in write mode is blocked. (That is,
  85  *    if the lock is held in shared mode by N threads, and a thread attempts
  86  *    to acquire it in write mode, no one else can acquire it in shared mode
  87  *    until these N threads release the lock and then the blocked thread
  88  *    acquires and releases the exclusive lock.) This also applies for
  89  *    attempts to reacquire the lock in shared mode by threads that already
  90  *    hold it in shared mode, making the lock non-reentrant.
  91  *
  92  *    RWSpinLock handles 2^30 - 1 concurrent readers.
  93  *
  94  * @author Xin Liu <xliux@fb.com>
  95  */
  96
  97 #ifndef FOLLY_RWSPINLOCK_H_
  98 #define FOLLY_RWSPINLOCK_H_
  99
 100 /*
 101 ========================================================================
 102 Benchmark on (Intel(R) Xeon(R) CPU  L5630  @ 2.13GHz)  8 cores(16 HTs)
 103 ========================================================================
 104
 105 ------------------------------------------------------------------------------
 106 1. Single thread benchmark (read/write lock + unlock overhead)
 107 Benchmark                                    Iters   Total t    t/iter iter/sec
 108 -------------------------------------------------------------------------------
 109 *      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
 110 +30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
 111 +85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
 112 +96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
 113 +85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
 114 +96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
 115 +85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
 116 +29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
 117 +85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
 118 +30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
 119 + 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
 120 + 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M
 121
 122 ------------------------------------------------------------------------------
 123 2. Contention Benchmark      90% read  10% write
 124 Benchmark                    hits       average    min       max        sigma
 125 ------------------------------------------------------------------------------
 126 ---------- 8  threads ------------
 127 RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
 128 RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
 129 RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
 130 RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
 131 pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
 132 pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us
 133
 134 ---------- 16 threads ------------
 135 RWSpinLock       Write       124236     237ns      78ns      261us      801ns
 136 RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
 137 RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
 138 RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
 139 pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
 140 pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us
 141
 142 ---------- 50 threads ------------
 143 RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
 144 RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
 145 RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
 146 RWTicketSpinLock Read        744133     386ns      78ns        11ms     31.4us
 147 pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
 148 pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us
 149
 150 */
 151
 152 #include <folly/Portability.h>
 153
 154 #if defined(__GNUC__) && \
 155   (defined(__i386) || FOLLY_X64 || \
 156    defined(ARCH_K8))
 157 # define RW_SPINLOCK_USE_X86_INTRINSIC_
 158 # include <x86intrin.h>
 159 #elif defined(_MSC_VER) && defined(FOLLY_X64)
 160 # define RW_SPINLOCK_USE_X86_INTRINSIC_
 161 #else
 162 # undef RW_SPINLOCK_USE_X86_INTRINSIC_
 163 #endif
 164
 165 // iOS doesn't define _mm_cvtsi64_si128 and friends
 166 #if (FOLLY_SSE >= 2) && !TARGET_OS_IPHONE
 167 #define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 168 #else
 169 #undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 170 #endif
 171
 172 #include <atomic>
 173 #include <string>
 174 #include <algorithm>
 175
 176 #include <sched.h>
 177 #include <glog/logging.h>
 178
 179 #include <folly/Likely.h>
 180
 181 namespace folly {
 182
 183 /*
 184  * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 185  * a nice writer and don't expect a lot of write/read contention, or
 186  * when you need small rwlocks since you are creating a large number
 187  * of them.
 188  *
 189  * Note that the unfairness here is extreme: if the lock is
 190  * continually accessed for read, writers will never get a chance.  If
 191  * the lock can be that highly contended this class is probably not an
 192  * ideal choice anyway.
 193  *
 194  * It currently implements most of the Lockable, SharedLockable and
 195  * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 196  * interfaces.
 197  */
 198 class RWSpinLock {
 199   enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
 200  public:
 201   constexpr RWSpinLock() : bits_(0) {}
 202
 203   RWSpinLock(RWSpinLock const&) = delete;
 204   RWSpinLock& operator=(RWSpinLock const&) = delete;
 205
 206   // Lockable Concept
 207   void lock() {
 208     int count = 0;
 209     while (!LIKELY(try_lock())) {
 210       if (++count > 1000) sched_yield();
 211     }
 212   }
 213
 214   // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
 215   void unlock() {
 216     static_assert(READER > WRITER + UPGRADED, "wrong bits!");
 217     bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
 218   }
 219
 220   // SharedLockable Concept
 221   void lock_shared() {
 222     int count = 0;
 223     while (!LIKELY(try_lock_shared())) {
 224       if (++count > 1000) sched_yield();
 225     }
 226   }
 227
 228   void unlock_shared() {
 229     bits_.fetch_add(-READER, std::memory_order_release);
 230   }
 231
 232   // Downgrade the lock from writer status to reader status.
 233   void unlock_and_lock_shared() {
 234     bits_.fetch_add(READER, std::memory_order_acquire);
 235     unlock();
 236   }
 237
 238   // UpgradeLockable Concept
 239   void lock_upgrade() {
 240     int count = 0;
 241     while (!try_lock_upgrade()) {
 242       if (++count > 1000) sched_yield();
 243     }
 244   }
 245
 246   void unlock_upgrade() {
 247     bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
 248   }
 249
 250   // unlock upgrade and try to acquire write lock
 251   void unlock_upgrade_and_lock() {
 252     int64_t count = 0;
 253     while (!try_unlock_upgrade_and_lock()) {
 254       if (++count > 1000) sched_yield();
 255     }
 256   }
 257
 258   // unlock upgrade and read lock atomically
 259   void unlock_upgrade_and_lock_shared() {
 260     bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
 261   }
 262
 263   // write unlock and upgrade lock atomically
 264   void unlock_and_lock_upgrade() {
 265     // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
 266     // the same time when other threads are trying do try_lock_upgrade().
 267     bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 268     bits_.fetch_add(-WRITER, std::memory_order_release);
 269   }
 270
 271
 272   // Attempt to acquire writer permission. Return false if we didn't get it.
 273   bool try_lock() {
 274     int32_t expect = 0;
 275     return bits_.compare_exchange_strong(expect, WRITER,
 276       std::memory_order_acq_rel);
 277   }
 278
 279   // Try to get reader permission on the lock. This can fail if we
 280   // find out someone is a writer or upgrader.
 281   // Setting the UPGRADED bit would allow a writer-to-be to indicate
 282   // its intention to write and block any new readers while waiting
 283   // for existing readers to finish and release their read locks. This
 284   // helps avoid starving writers (promoted from upgraders).
 285   bool try_lock_shared() {
 286     // fetch_add is considerably (100%) faster than compare_exchange,
 287     // so here we are optimizing for the common (lock success) case.
 288     int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
 289     if (UNLIKELY(value & (WRITER|UPGRADED))) {
 290       bits_.fetch_add(-READER, std::memory_order_release);
 291       return false;
 292     }
 293     return true;
 294   }
 295
 296   // try to unlock upgrade and write lock atomically
 297   bool try_unlock_upgrade_and_lock() {
 298     int32_t expect = UPGRADED;
 299     return bits_.compare_exchange_strong(expect, WRITER,
 300         std::memory_order_acq_rel);
 301   }
 302
 303   // try to acquire an upgradable lock.
 304   bool try_lock_upgrade() {
 305     int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 306
 307     // Note: when failed, we cannot flip the UPGRADED bit back,
 308     // as in this case there is either another upgrade lock or a write lock.
 309     // If it's a write lock, the bit will get cleared up when that lock's done
 310     // with unlock().
 311     return ((value & (UPGRADED | WRITER)) == 0);
 312   }
 313
 314   // mainly for debugging purposes.
 315   int32_t bits() const { return bits_.load(std::memory_order_acquire); }
 316
 317   class ReadHolder;
 318   class UpgradedHolder;
 319   class WriteHolder;
 320
 321   class ReadHolder {
 322    public:
 323     explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 324       if (lock_) lock_->lock_shared();
 325     }
 326
 327     explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
 328       lock_->lock_shared();
 329     }
 330
 331     ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
 332       other.lock_ = nullptr;
 333     }
 334
 335     // down-grade
 336     explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
 337       upgraded.lock_ = nullptr;
 338       if (lock_) lock_->unlock_upgrade_and_lock_shared();
 339     }
 340
 341     explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
 342       writer.lock_ = nullptr;
 343       if (lock_) lock_->unlock_and_lock_shared();
 344     }
 345
 346     ReadHolder& operator=(ReadHolder&& other) {
 347       using std::swap;
 348       swap(lock_, other.lock_);
 349       return *this;
 350     }
 351
 352     ReadHolder(const ReadHolder& other) = delete;
 353     ReadHolder& operator=(const ReadHolder& other) = delete;
 354
 355     ~ReadHolder() { if (lock_) lock_->unlock_shared(); }
 356
 357     void reset(RWSpinLock* lock = nullptr) {
 358       if (lock == lock_) return;
 359       if (lock_) lock_->unlock_shared();
 360       lock_ = lock;
 361       if (lock_) lock_->lock_shared();
 362     }
 363
 364     void swap(ReadHolder* other) {
 365       std::swap(lock_, other->lock_);
 366     }
 367
 368    private:
 369     friend class UpgradedHolder;
 370     friend class WriteHolder;
 371     RWSpinLock* lock_;
 372   };
 373
 374   class UpgradedHolder {
 375    public:
 376     explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 377       if (lock_) lock_->lock_upgrade();
 378     }
 379
 380     explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
 381       lock_->lock_upgrade();
 382     }
 383
 384     explicit UpgradedHolder(WriteHolder&& writer) {
 385       lock_ = writer.lock_;
 386       writer.lock_ = nullptr;
 387       if (lock_) lock_->unlock_and_lock_upgrade();
 388     }
 389
 390     UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
 391       other.lock_ = nullptr;
 392     }
 393
 394     UpgradedHolder& operator =(UpgradedHolder&& other) {
 395       using std::swap;
 396       swap(lock_, other.lock_);
 397       return *this;
 398     }
 399
 400     UpgradedHolder(const UpgradedHolder& other) = delete;
 401     UpgradedHolder& operator =(const UpgradedHolder& other) = delete;
 402
 403     ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }
 404
 405     void reset(RWSpinLock* lock = nullptr) {
 406       if (lock == lock_) return;
 407       if (lock_) lock_->unlock_upgrade();
 408       lock_ = lock;
 409       if (lock_) lock_->lock_upgrade();
 410     }
 411
 412     void swap(UpgradedHolder* other) {
 413       using std::swap;
 414       swap(lock_, other->lock_);
 415     }
 416
 417    private:
 418     friend class WriteHolder;
 419     friend class ReadHolder;
 420     RWSpinLock* lock_;
 421   };
 422
 423   class WriteHolder {
 424    public:
 425     explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 426       if (lock_) lock_->lock();
 427     }
 428
 429     explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
 430       lock_->lock();
 431     }
 432
 433     // promoted from an upgrade lock holder
 434     explicit WriteHolder(UpgradedHolder&& upgraded) {
 435       lock_ = upgraded.lock_;
 436       upgraded.lock_ = nullptr;
 437       if (lock_) lock_->unlock_upgrade_and_lock();
 438     }
 439
 440     WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
 441       other.lock_ = nullptr;
 442     }
 443
 444     WriteHolder& operator =(WriteHolder&& other) {
 445       using std::swap;
 446       swap(lock_, other.lock_);
 447       return *this;
 448     }
 449
 450     WriteHolder(const WriteHolder& other) = delete;
 451     WriteHolder& operator =(const WriteHolder& other) = delete;
 452
 453     ~WriteHolder () { if (lock_) lock_->unlock(); }
 454
 455     void reset(RWSpinLock* lock = nullptr) {
 456       if (lock == lock_) return;
 457       if (lock_) lock_->unlock();
 458       lock_ = lock;
 459       if (lock_) lock_->lock();
 460     }
 461
 462     void swap(WriteHolder* other) {
 463       using std::swap;
 464       swap(lock_, other->lock_);
 465     }
 466
 467    private:
 468     friend class ReadHolder;
 469     friend class UpgradedHolder;
 470     RWSpinLock* lock_;
 471   };
 472
 473   // Synchronized<> adaptors
 474   friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
 475   friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
 476   friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
 477   friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }
 478
 479  private:
 480   std::atomic<int32_t> bits_;
 481 };
 482
 483
 484 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 485 // A more balanced Read-Write spin lock implemented based on GCC intrinsics.
 486
 487 namespace detail {
 488 template <size_t kBitWidth> struct RWTicketIntTrait {
 489   static_assert(kBitWidth == 32 || kBitWidth == 64,
 490       "bit width has to be either 32 or 64 ");
 491 };
 492
 493 template <>
 494 struct RWTicketIntTrait<64> {
 495   typedef uint64_t FullInt;
 496   typedef uint32_t HalfInt;
 497   typedef uint16_t QuarterInt;
 498
 499 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 500   static __m128i make128(const uint16_t v[4]) {
 501     return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 502   }
 503   static inline __m128i fromInteger(uint64_t from) {
 504     return _mm_cvtsi64_si128(from);
 505   }
 506   static inline uint64_t toInteger(__m128i in) {
 507     return _mm_cvtsi128_si64(in);
 508   }
 509   static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
 510     return toInteger(_mm_add_epi16(in, kDelta));
 511   }
 512 #endif
 513 };
 514
 515 template <>
 516 struct RWTicketIntTrait<32> {
 517   typedef uint32_t FullInt;
 518   typedef uint16_t HalfInt;
 519   typedef uint8_t QuarterInt;
 520
 521 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 522   static __m128i make128(const uint8_t v[4]) {
 523     return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 524         0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 525   }
 526   static inline __m128i fromInteger(uint32_t from) {
 527     return _mm_cvtsi32_si128(from);
 528   }
 529   static inline uint32_t toInteger(__m128i in) {
 530     return _mm_cvtsi128_si32(in);
 531   }
 532   static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
 533     return toInteger(_mm_add_epi8(in, kDelta));
 534   }
 535 #endif
 536 };
 537 }  // detail
 538
 539
 540 template<size_t kBitWidth, bool kFavorWriter=false>
 541 class RWTicketSpinLockT {
 542   typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
 543   typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
 544   typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
 545   typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
 546     QuarterInt;
 547
 548   union RWTicket {
 549     constexpr RWTicket() : whole(0) {}
 550     FullInt whole;
 551     HalfInt readWrite;
 552     __extension__ struct {
 553       QuarterInt write;
 554       QuarterInt read;
 555       QuarterInt users;
 556     };
 557   } ticket;
 558
 559  private: // Some x64-specific utilities for atomic access to ticket.
 560   template<class T> static T load_acquire(T* addr) {
 561     T t = *addr; // acquire barrier
 562     asm_volatile_memory();
 563     return t;
 564   }
 565
 566   template<class T>
 567   static void store_release(T* addr, T v) {
 568     asm_volatile_memory();
 569     *addr = v; // release barrier
 570   }
 571
 572  public:
 573
 574   constexpr RWTicketSpinLockT() {}
 575
 576   RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
 577   RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;
 578
 579   void lock() {
 580     if (kFavorWriter) {
 581       writeLockAggressive();
 582     } else {
 583       writeLockNice();
 584     }
 585   }
 586
 587   /*
 588    * Both try_lock and try_lock_shared diverge in our implementation from the
 589    * lock algorithm described in the link above.
 590    *
 591    * In the read case, it is undesirable that the readers could wait
 592    * for another reader (before increasing ticket.read in the other
 593    * implementation).  Our approach gives up on
 594    * first-come-first-serve, but our benchmarks showed improve
 595    * performance for both readers and writers under heavily contended
 596    * cases, particularly when the number of threads exceeds the number
 597    * of logical CPUs.
 598    *
 599    * We have writeLockAggressive() using the original implementation
 600    * for a writer, which gives some advantage to the writer over the
 601    * readers---for that path it is guaranteed that the writer will
 602    * acquire the lock after all the existing readers exit.
 603    */
 604   bool try_lock() {
 605     RWTicket t;
 606     FullInt old = t.whole = load_acquire(&ticket.whole);
 607     if (t.users != t.write) return false;
 608     ++t.users;
 609     return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
 610   }
 611
 612   /*
 613    * Call this if you want to prioritize writer to avoid starvation.
 614    * Unlike writeLockNice, immediately acquires the write lock when
 615    * the existing readers (arriving before the writer) finish their
 616    * turns.
 617    */
 618   void writeLockAggressive() {
 619     // sched_yield() is needed here to avoid a pathology if the number
 620     // of threads attempting concurrent writes is >= the number of real
 621     // cores allocated to this process. This is less likely than the
 622     // corresponding situation in lock_shared(), but we still want to
 623     // avoid it
 624     int count = 0;
 625     QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
 626     while (val != load_acquire(&ticket.write)) {
 627       asm_volatile_pause();
 628       if (UNLIKELY(++count > 1000)) sched_yield();
 629     }
 630   }
 631
 632   // Call this when the writer should be nicer to the readers.
 633   void writeLockNice() {
 634     // Here it doesn't cpu-relax the writer.
 635     //
 636     // This is because usually we have many more readers than the
 637     // writers, so the writer has less chance to get the lock when
 638     // there are a lot of competing readers.  The aggressive spinning
 639     // can help to avoid starving writers.
 640     //
 641     // We don't worry about sched_yield() here because the caller
 642     // has already explicitly abandoned fairness.
 643     while (!try_lock()) {}
 644   }
 645
 646   // Atomically unlock the write-lock from writer and acquire the read-lock.
 647   void unlock_and_lock_shared() {
 648     QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
 649   }
 650
 651   // Release writer permission on the lock.
 652   void unlock() {
 653     RWTicket t;
 654     t.whole = load_acquire(&ticket.whole);
 655     FullInt old = t.whole;
 656
 657 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 658     // SSE2 can reduce the lock and unlock overhead by 10%
 659     static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
 660     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 661     __m128i m = IntTraitType::fromInteger(old);
 662     t.whole = IntTraitType::addParallel(m, kDelta);
 663 #else
 664     ++t.read;
 665     ++t.write;
 666 #endif
 667     store_release(&ticket.readWrite, t.readWrite);
 668   }
 669
 670   void lock_shared() {
 671     // sched_yield() is important here because we can't grab the
 672     // shared lock if there is a pending writeLockAggressive, so we
 673     // need to let threads that already have a shared lock complete
 674     int count = 0;
 675     while (!LIKELY(try_lock_shared())) {
 676       asm_volatile_pause();
 677       if (UNLIKELY((++count & 1023) == 0)) sched_yield();
 678     }
 679   }
 680
 681   bool try_lock_shared() {
 682     RWTicket t, old;
 683     old.whole = t.whole = load_acquire(&ticket.whole);
 684     old.users = old.read;
 685 #ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
 686     // SSE2 may reduce the total lock and unlock overhead by 10%
 687     static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
 688     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 689     __m128i m = IntTraitType::fromInteger(old.whole);
 690     t.whole = IntTraitType::addParallel(m, kDelta);
 691 #else
 692     ++t.read;
 693     ++t.users;
 694 #endif
 695     return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
 696   }
 697
 698   void unlock_shared() {
 699     QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
 700   }
 701
 702   class WriteHolder;
 703
 704   typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
 705   class ReadHolder {
 706    public:
 707     ReadHolder(ReadHolder const&) = delete;
 708     ReadHolder& operator=(ReadHolder const&) = delete;
 709
 710     explicit ReadHolder(RWSpinLock *lock = nullptr) :
 711       lock_(lock) {
 712       if (lock_) lock_->lock_shared();
 713     }
 714
 715     explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
 716       if (lock_) lock_->lock_shared();
 717     }
 718
 719     // atomically unlock the write-lock from writer and acquire the read-lock
 720     explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
 721       std::swap(this->lock_, writer->lock_);
 722       if (lock_) {
 723         lock_->unlock_and_lock_shared();
 724       }
 725     }
 726
 727     ~ReadHolder() {
 728       if (lock_) lock_->unlock_shared();
 729     }
 730
 731     void reset(RWSpinLock *lock = nullptr) {
 732       if (lock_) lock_->unlock_shared();
 733       lock_ = lock;
 734       if (lock_) lock_->lock_shared();
 735     }
 736
 737     void swap(ReadHolder *other) {
 738       std::swap(this->lock_, other->lock_);
 739     }
 740
 741    private:
 742     RWSpinLock *lock_;
 743   };
 744
 745   class WriteHolder {
 746    public:
 747     WriteHolder(WriteHolder const&) = delete;
 748     WriteHolder& operator=(WriteHolder const&) = delete;
 749
 750     explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
 751       if (lock_) lock_->lock();
 752     }
 753     explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
 754       if (lock_) lock_->lock();
 755     }
 756
 757     ~WriteHolder() {
 758       if (lock_) lock_->unlock();
 759     }
 760
 761     void reset(RWSpinLock *lock = nullptr) {
 762       if (lock == lock_) return;
 763       if (lock_) lock_->unlock();
 764       lock_ = lock;
 765       if (lock_) lock_->lock();
 766     }
 767
 768     void swap(WriteHolder *other) {
 769       std::swap(this->lock_, other->lock_);
 770     }
 771
 772    private:
 773     friend class ReadHolder;
 774     RWSpinLock *lock_;
 775   };
 776
 777   // Synchronized<> adaptors.
 778   friend void acquireRead(RWTicketSpinLockT& mutex) {
 779     mutex.lock_shared();
 780   }
 781   friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
 782     mutex.lock();
 783   }
 784   friend void releaseRead(RWTicketSpinLockT& mutex) {
 785     mutex.unlock_shared();
 786   }
 787   friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
 788     mutex.unlock();
 789   }
 790 };
 791
 792 typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
 793 typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
 794
 795 #endif  // RW_SPINLOCK_USE_X86_INTRINSIC_
 796
 797 }  // namespace folly
 798
 799 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 800 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 801 #endif
 802
 803 #endif  // FOLLY_RWSPINLOCK_H_