/*
 * Copyright 2017-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * N.B. You most likely do _not_ want to use RWSpinLock or any other
 * kind of spinlock. Use SharedMutex instead.
 *
 * In short, spinlocks in preemptive multi-tasking operating systems
 * have serious problems and fast mutexes like SharedMutex are almost
 * certainly the better choice, because letting the OS scheduler put a
 * thread to sleep is better for system responsiveness and throughput
 * than wasting a timeslice repeatedly querying a lock held by a
 * thread that's blocked, and you can't prevent userspace programs from
 * being preempted. Spinlocks in an operating system kernel make much
 * more sense than they do in userspace.
 *
 * -------------------------------------------------------------------
 *
 * Two Read-Write spin lock implementations.
 *
 *   Ref: http://locklessinc.com/articles/locks
 *
 *   Both locks here are faster than pthread_rwlock and have very low
 *   overhead (usually 20-30ns). They don't use any system mutexes and
 *   are very compact (4/8 bytes), so they are suitable for per-instance
 *   locking, particularly when contention is not expected.
 *
 *   For a spinlock, RWSpinLock is a reasonable choice. (See the note
 *   above for why a spin lock is frequently a bad idea generally.)
 *   RWSpinLock has minimal overhead, and comparable contention
 *   performance when the number of competing threads is less than or
 *   equal to the number of logical CPUs. Even as the number of
 *   threads gets larger, RWSpinLock can still be very competitive in
 *   READ, although it is slower on WRITE, and also inherently unfair
 *   to writers.
 *
 *   RWTicketSpinLock shows more balanced READ/WRITE performance. If
 *   your application really needs a lot more threads, and a
 *   higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 *   Caveats:
 *
 *     RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *     based systems.
 *
 *     RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *     readers and writers.
 *
 *     RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *     readers and writers.
 *
 *     RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
 *     writer priority) is NOT reentrant, even for lock_shared().
 *
 *     The lock will not grant any new shared (read) accesses while a thread
 *     attempting to acquire the lock in write mode is blocked. (That is,
 *     if the lock is held in shared mode by N threads, and a thread attempts
 *     to acquire it in write mode, no one else can acquire it in shared
 *     mode until these N threads release the lock and then the blocked
 *     thread acquires and releases the exclusive lock.) This also applies
 *     to attempts to reacquire the lock in shared mode by threads that
 *     already hold it in shared mode, making the lock non-reentrant, as
 *     the sketch below illustrates.
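 *
 *     For example (an illustrative sketch, not code from this header),
 *     a thread that already holds a strict-writer-priority ticket lock
 *     in shared mode can deadlock itself:
 *
 *       folly::RWTicketSpinLockT<32, true> lock;
 *       lock.lock_shared();  // first shared acquisition succeeds
 *       // ... another thread now calls lock.lock() and blocks ...
 *       lock.lock_shared();  // spins forever: new readers are barred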
 *
 *     RWSpinLock handles 2^30 - 1 concurrent readers.
 *
 * @author Xin Liu <xliux@fb.com>
 */

#pragma once
/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
========================================================================

------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                      Iters  Total t   t/iter iter/sec
------------------------------------------------------------------------------
*      BM_RWSpinLockRead                      100000  1.786 ms  17.86 ns  53.4M
+30.5% BM_RWSpinLockWrite                     100000  2.331 ms  23.31 ns 40.91M
+85.7% BM_RWTicketSpinLock32Read              100000  3.317 ms  33.17 ns 28.75M
+96.0% BM_RWTicketSpinLock32Write             100000    3.5 ms     35 ns 27.25M
+85.6% BM_RWTicketSpinLock64Read              100000  3.315 ms  33.15 ns 28.77M
+96.0% BM_RWTicketSpinLock64Write             100000    3.5 ms     35 ns 27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead   100000  3.317 ms  33.17 ns 28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite  100000  2.316 ms  23.16 ns 41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead   100000  3.309 ms  33.09 ns 28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite  100000  2.325 ms  23.25 ns 41.02M
+ 175% BM_PThreadRWMutexRead                  100000  4.917 ms  49.17 ns  19.4M
+ 166% BM_PThreadRWMutexWrite                 100000  4.757 ms  47.57 ns 20.05M

------------------------------------------------------------------------------
2. Contention Benchmark (90% read, 10% write)
Benchmark                hits      average  min    max     sigma
------------------------------------------------------------------------------
---------- 8 threads ------------
RWSpinLock       Write   142666    220ns    78ns   40.8us  269ns
RWSpinLock       Read    1282297   222ns    80ns   37.7us  248ns
RWTicketSpinLock Write   85692     209ns    71ns   17.9us  252ns
RWTicketSpinLock Read    769571    215ns    78ns   33.4us  251ns
pthread_rwlock_t Write   84248     2.48us   99ns   269us   8.19us
pthread_rwlock_t Read    761646    933ns    101ns  374us   3.25us

---------- 16 threads ------------
RWSpinLock       Write   124236    237ns    78ns   261us   801ns
RWSpinLock       Read    1115807   236ns    78ns   2.27ms  2.17us
RWTicketSpinLock Write   81781     231ns    71ns   31.4us  351ns
RWTicketSpinLock Read    734518    238ns    78ns   73.6us  379ns
pthread_rwlock_t Write   83363     7.12us   99ns   785us   28.1us
pthread_rwlock_t Read    754978    2.18us   101ns  1.02ms  14.3us

---------- 50 threads ------------
RWSpinLock       Write   131142    1.37us   82ns   7.53ms  68.2us
RWSpinLock       Read    1181240   262ns    78ns   6.62ms  12.7us
RWTicketSpinLock Write   83045     397ns    73ns   7.01ms  31.5us
RWTicketSpinLock Read    744133    386ns    78ns   11ms    31.4us
pthread_rwlock_t Write   80849     112us    103ns  4.52ms  263us
pthread_rwlock_t Read    728698    24us     101ns  7.28ms  194us
*/
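/*
 * Usage sketch (illustrative only, not part of the original header).
 * RWSpinLock models the standard Lockable/SharedLockable interfaces, so
 * it can be used directly or through the RAII holders defined below:
 *
 *   folly::RWSpinLock lock;
 *   {
 *     folly::RWSpinLock::ReadHolder guard(&lock);  // shared access
 *     // ... read the protected data ...
 *   } // unlock_shared() on scope exit
 *   {
 *     folly::RWSpinLock::WriteHolder guard(&lock); // exclusive access
 *     // ... mutate the protected data ...
 *   } // unlock() on scope exit
 */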
#include <folly/Portability.h>
#include <folly/portability/Asm.h>

#if defined(__GNUC__) && (defined(__i386) || FOLLY_X64 || defined(ARCH_K8))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#elif defined(_MSC_VER) && defined(FOLLY_X64)
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#elif FOLLY_AARCH64
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// iOS doesn't define _mm_cvtsi64_si128 and friends
#if (FOLLY_SSE >= 2) && !FOLLY_MOBILE && FOLLY_X64
#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#else
#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#endif
#endif
#include <algorithm>
#include <atomic>
#include <thread>

#include <glog/logging.h>

#include <folly/Likely.h>

namespace folly {
/*
 * A simple, small (4-bytes), but unfair rwlock. Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance. If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related
 * locking/unlocking interfaces.
 */
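/*
 * A minimal sketch of the upgrade protocol (illustrative only): a thread
 * that may or may not need to write takes the upgrade lock, which
 * coexists with readers that already hold the lock but blocks new
 * readers and other upgraders/writers, then promotes to exclusive mode
 * only if needed.
 *
 *   folly::RWSpinLock lock;
 *   lock.lock_upgrade();              // existing readers keep running
 *   if (mustWrite) {                  // 'mustWrite' is a placeholder
 *     lock.unlock_upgrade_and_lock(); // waits for readers to drain
 *     // ... write ...
 *     lock.unlock();                  // clears WRITER and UPGRADED bits
 *   } else {
 *     lock.unlock_upgrade();
 *   }
 */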
class RWSpinLock {
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };

 public:
  constexpr RWSpinLock() : bits_(0) {}

  RWSpinLock(RWSpinLock const&) = delete;
  RWSpinLock& operator=(RWSpinLock const&) = delete;
  // Lockable Concept
  void lock() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }
  // SharedLockable Concept
  void lock_shared() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_shared() {
    bits_.fetch_add(-READER, std::memory_order_release);
  }

  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }
  // UpgradeLockable Concept
  void lock_upgrade() {
    uint_fast32_t count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    int64_t count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }
  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed
    // at the same time when other threads are trying to do try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }
  // Attempt to acquire writer permission. Return false if we didn't get it.
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (UNLIKELY(value & (WRITER | UPGRADED))) {
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }
  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's
    // done with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const { return bits_.load(std::memory_order_acquire); }
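  // Worked example of the state encoding (illustrative, not from the
  // original source): WRITER occupies bit 0, UPGRADED bit 1, and the
  // remaining 30 bits count readers in units of READER (4). For example,
  // bits() == 10 decodes as two readers (2 * READER) plus a pending
  // upgrader (UPGRADED); this layout is why the lock supports at most
  // 2^30 - 1 concurrent readers.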
  class ReadHolder;
  class UpgradedHolder;
  class WriteHolder;
  class ReadHolder {
   public:
    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) { lock_->lock_shared(); }
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }

    ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    // down-grade from an upgrade lock holder
    explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
      upgraded.lock_ = nullptr;
      if (lock_) { lock_->unlock_upgrade_and_lock_shared(); }
    }

    // down-grade from a write lock holder
    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      writer.lock_ = nullptr;
      if (lock_) { lock_->unlock_and_lock_shared(); }
    }

    ReadHolder& operator=(ReadHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    ReadHolder(const ReadHolder& other) = delete;
    ReadHolder& operator=(const ReadHolder& other) = delete;

    ~ReadHolder() {
      if (lock_) { lock_->unlock_shared(); }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) { return; }
      if (lock_) { lock_->unlock_shared(); }
      lock_ = lock;
      if (lock_) { lock_->lock_shared(); }
    }

    void swap(ReadHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class UpgradedHolder;
    friend class WriteHolder;
    RWSpinLock* lock_;
  };
  class UpgradedHolder {
   public:
    explicit UpgradedHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) { lock_->lock_upgrade(); }
    }

    explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    // down-grade from a write lock holder
    explicit UpgradedHolder(WriteHolder&& writer) {
      lock_ = writer.lock_;
      writer.lock_ = nullptr;
      if (lock_) { lock_->unlock_and_lock_upgrade(); }
    }

    UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    UpgradedHolder& operator=(UpgradedHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    UpgradedHolder(const UpgradedHolder& other) = delete;
    UpgradedHolder& operator=(const UpgradedHolder& other) = delete;

    ~UpgradedHolder() {
      if (lock_) { lock_->unlock_upgrade(); }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) { return; }
      if (lock_) { lock_->unlock_upgrade(); }
      lock_ = lock;
      if (lock_) { lock_->lock_upgrade(); }
    }

    void swap(UpgradedHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
  class WriteHolder {
   public:
    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) { lock_->lock(); }
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock();
    }

    // promoted from an upgrade lock holder
    explicit WriteHolder(UpgradedHolder&& upgraded) {
      lock_ = upgraded.lock_;
      upgraded.lock_ = nullptr;
      if (lock_) { lock_->unlock_upgrade_and_lock(); }
    }

    WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& other) = delete;
    WriteHolder& operator=(const WriteHolder& other) = delete;

    ~WriteHolder() {
      if (lock_) { lock_->unlock(); }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) { return; }
      if (lock_) { lock_->unlock(); }
      lock_ = lock;
      if (lock_) { lock_->lock(); }
    }

    void swap(WriteHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    friend class UpgradedHolder;
    RWSpinLock* lock_;
  };

 private:
  std::atomic<int32_t> bits_;
};
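/*
 * RAII usage sketch (illustrative only): the holder classes pair each
 * acquire with the matching release on scope exit, and the move
 * constructors perform the atomic transitions (e.g. WriteHolder ->
 * ReadHolder downgrades via unlock_and_lock_shared(), with no window
 * where the lock is unheld):
 *
 *   folly::RWSpinLock lock;
 *   folly::RWSpinLock::WriteHolder wh(&lock);        // exclusive
 *   // ... initialize the protected data ...
 *   folly::RWSpinLock::ReadHolder rh(std::move(wh)); // downgrade
 *   // ... read under shared mode; unlock_shared() at end of scope ...
 */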
#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented based on GCC intrinsics.

namespace detail {
template <size_t kBitWidth>
struct RWTicketIntTrait {
  static_assert(
      kBitWidth == 32 || kBitWidth == 64,
      "bit width has to be either 32 or 64");
};
template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(
        0, 0, 0, 0, short(v[3]), short(v[2]), short(v[1]), short(v[0]));
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(int64_t(from));
  }
  static inline uint64_t toInteger(__m128i in) {
    return uint64_t(_mm_cvtsi128_si64(in));
  }
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};
template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint8_t v[4]) {
    return _mm_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        char(v[3]), char(v[2]), char(v[1]), char(v[0]));
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(int32_t(from));
  }
  static inline uint32_t toInteger(__m128i in) {
    return uint32_t(_mm_cvtsi128_si32(in));
  }
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};
} // namespace detail
template <size_t kBitWidth, bool kFavorWriter = false>
class RWTicketSpinLockT {
  typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
      QuarterInt;

  union RWTicket {
    constexpr RWTicket() : whole(0) {}
    FullInt whole;
    HalfInt readWrite;
    __extension__ struct {
      QuarterInt write;
      QuarterInt read;
      QuarterInt users;
    };
  } ticket;
 private: // Some x64-specific utilities for atomic access to ticket.
  template <class T>
  static T load_acquire(T* addr) {
    T t = *addr; // acquire barrier
    asm_volatile_memory();
    return t;
  }

  template <class T>
  static void store_release(T* addr, T v) {
    asm_volatile_memory();
    *addr = v; // release barrier
  }
 public:
  constexpr RWTicketSpinLockT() {}

  RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
  RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;

  void lock() {
    if (kFavorWriter) {
      writeLockAggressive();
    } else {
      writeLockNice();
    }
  }
  /*
   * Both try_lock and try_lock_shared diverge in our implementation from the
   * lock algorithm described in the link above.
   *
   * In the read case, it is undesirable that the readers could wait
   * for another reader (before increasing ticket.read in the other
   * implementation). Our approach gives up on
   * first-come-first-serve, but our benchmarks showed improved
   * performance for both readers and writers under heavily contended
   * cases, particularly when the number of threads exceeds the number
   * of logical CPUs.
   *
   * We have writeLockAggressive() using the original implementation
   * for a writer, which gives some advantage to the writer over the
   * readers---for that path it is guaranteed that the writer will
   * acquire the lock after all the existing readers exit. (A worked
   * example of the counter transitions appears after unlock_shared()
   * below.)
   */
  bool try_lock() {
    RWTicket t;
    FullInt old = t.whole = load_acquire(&ticket.whole);
    if (t.users != t.write) {
      return false;
    }
    ++t.users;
    return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
  }
  /*
   * Call this if you want to prioritize the writer to avoid starvation.
   * Unlike writeLockNice, it immediately acquires the write lock when
   * the existing readers (arriving before the writer) finish their
   * turns.
   */
  void writeLockAggressive() {
    // std::this_thread::yield() is needed here to avoid a pathology if the
    // number of threads attempting concurrent writes is >= the number of real
    // cores allocated to this process. This is less likely than the
    // corresponding situation in lock_shared(), but we still want to
    // avoid it.
    uint_fast32_t count = 0;
    QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
    while (val != load_acquire(&ticket.write)) {
      asm_volatile_pause();
      if (UNLIKELY(++count > 1000)) {
        std::this_thread::yield();
      }
    }
  }
  // Call this when the writer should be nicer to the readers.
  void writeLockNice() {
    // Here it doesn't cpu-relax the writer.
    //
    // This is because usually we have many more readers than
    // writers, so the writer has less chance to get the lock when
    // there are a lot of competing readers. The aggressive spinning
    // can help to avoid starving writers.
    //
    // We don't worry about std::this_thread::yield() here because the caller
    // has already explicitly abandoned fairness.
    while (!try_lock()) {
    }
  }
  // Atomically unlock the write-lock from writer and acquire the read-lock.
  void unlock_and_lock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
    (void)val; // silence unused-variable warnings
  }

  // Release writer permission on the lock.
  void unlock() {
    RWTicket t;
    t.whole = load_acquire(&ticket.whole);
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    FullInt old = t.whole;
    // SSE2 can reduce the lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {1, 1, 0, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.write;
#endif
    store_release(&ticket.readWrite, t.readWrite);
  }
  void lock_shared() {
    // std::this_thread::yield() is important here because we can't grab the
    // shared lock if there is a pending writeLockAggressive, so we
    // need to let threads that already have a shared lock complete.
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      asm_volatile_pause();
      if (UNLIKELY((++count & 1023) == 0)) {
        std::this_thread::yield();
      }
    }
  }
  bool try_lock_shared() {
    RWTicket t, old;
    old.whole = t.whole = load_acquire(&ticket.whole);
    old.users = old.read;
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 may reduce the total lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {0, 1, 1, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old.whole);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.users;
#endif
    return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
  }

  void unlock_shared() {
    __sync_fetch_and_add(&ticket.write, 1);
  }
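  // Worked example of the counter transitions (illustrative, not from the
  // original source), written as (write, read, users) starting at (0, 0, 0):
  // a writer takes ticket users++ == 0 and owns the lock since write == 0.
  // A concurrent try_lock_shared() fails while users != read. The writer's
  // unlock() bumps read and write to (1, 1, 1); two readers can now enter,
  // each doing read++/users++ -> (1, 3, 3). Each unlock_shared() bumps
  // write, and once write reaches 3 the next queued writer (ticket 3)
  // proceeds.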
  class WriteHolder;

  typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;

  class ReadHolder {
   public:
    ReadHolder(ReadHolder const&) = delete;
    ReadHolder& operator=(ReadHolder const&) = delete;

    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) { lock_->lock_shared(); }
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) { lock_->lock_shared(); }
    }

    // atomically unlock the write-lock from writer and acquire the read-lock
    explicit ReadHolder(WriteHolder* writer) : lock_(nullptr) {
      std::swap(this->lock_, writer->lock_);
      if (lock_) { lock_->unlock_and_lock_shared(); }
    }

    ~ReadHolder() {
      if (lock_) { lock_->unlock_shared(); }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock_) { lock_->unlock_shared(); }
      lock_ = lock;
      if (lock_) { lock_->lock_shared(); }
    }

    void swap(ReadHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    RWSpinLock* lock_;
  };
  class WriteHolder {
   public:
    WriteHolder(WriteHolder const&) = delete;
    WriteHolder& operator=(WriteHolder const&) = delete;

    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) { lock_->lock(); }
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) { lock_->lock(); }
    }

    ~WriteHolder() {
      if (lock_) { lock_->unlock(); }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) { return; }
      if (lock_) { lock_->unlock(); }
      lock_ = lock;
      if (lock_) { lock_->lock(); }
    }

    void swap(WriteHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
};
typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
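/*
 * Usage sketch for the ticket locks (illustrative only): the aliases above
 * pick the counter width; pass kFavorWriter = true explicitly for strict
 * writer priority.
 *
 *   folly::RWTicketSpinLock64 lock;
 *   {
 *     folly::RWTicketSpinLock64::ReadHolder guard(&lock);
 *     // ... shared section ...
 *   }
 *   {
 *     folly::RWTicketSpinLock64::WriteHolder guard(&lock);
 *     // ... exclusive section ...
 *   }
 */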
#endif // RW_SPINLOCK_USE_X86_INTRINSIC_

} // namespace folly

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif