diff --git a/folly/RWSpinLock.h b/folly/RWSpinLock.h
index 99f018e4..cc4f480a 100644
--- a/folly/RWSpinLock.h
+++ b/folly/RWSpinLock.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2012 Facebook, Inc.
+ * Copyright 2017 Facebook, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,22 @@
  */

 /*
+ * N.B. You most likely do _not_ want to use RWSpinLock or any other
+ * kind of spinlock. Use SharedMutex instead.
+ *
+ * In short, spinlocks in preemptive multi-tasking operating systems
+ * have serious problems and fast mutexes like SharedMutex are almost
+ * certainly the better choice, because letting the OS scheduler put a
+ * thread to sleep is better for system responsiveness and throughput
+ * than wasting a timeslice repeatedly querying a lock held by a
+ * thread that's blocked, and you can't prevent userspace
+ * programs from blocking.
+ *
+ * Spinlocks in an operating system kernel make much more sense than
+ * they do in userspace.
+ *
+ * -------------------------------------------------------------------
+ *
  * Two Read-Write spin lock implementations.
  *
  * Ref: http://locklessinc.com/articles/locks
@@ -24,12 +40,14 @@
  * are very compact (4/8 bytes), so are suitable for per-instance
  * based locking, particularly when contention is not expected.
  *
- * In most cases, RWSpinLock is a reasonable choice. It has minimal
- * overhead, and comparable contention performance when the number of
- * competing threads is less than or equal to the number of logical
- * CPUs. Even as the number of threads gets larger, RWSpinLock can
- * still be very competitive in READ, although it is slower on WRITE,
- * and also inherently unfair to writers.
+ * For a spinlock, RWSpinLock is a reasonable choice. (See the note
+ * above for why a spin lock is frequently a bad idea generally.)
+ * RWSpinLock has minimal overhead, and comparable contention
+ * performance when the number of competing threads is less than or
+ * equal to the number of logical CPUs. Even as the number of
+ * threads gets larger, RWSpinLock can still be very competitive in
+ * READ, although it is slower on WRITE, and also inherently unfair
+ * to writers.
  *
  * RWTicketSpinLock shows more balanced READ/WRITE performance. If
  * your application really needs a lot more threads, and a
@@ -46,13 +64,24 @@
  *   RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
  *   readers and writers.
  *
+ * RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
+ * writer priority) is NOT reentrant, even for lock_shared().
+ *
+ * The lock will not grant any new shared (read) accesses while a thread
+ * attempting to acquire the lock in write mode is blocked. (That is,
+ * if the lock is held in shared mode by N threads, and a thread attempts
+ * to acquire it in write mode, no one else can acquire it in shared mode
+ * until these N threads release the lock and then the blocked thread
+ * acquires and releases the exclusive lock.) This also applies for
+ * attempts to reacquire the lock in shared mode by threads that already
+ * hold it in shared mode, making the lock non-reentrant.
+ *
  * RWSpinLock handles 2^30 - 1 concurrent readers.
  *
  * @author Xin Liu
  */

-#ifndef FOLLY_RWSPINLOCK_H_
-#define FOLLY_RWSPINLOCK_H_
+#pragma once

 /*
 ========================================================================
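A minimal usage sketch of the API this header documents (illustrative only, not part of the patch; `counter`, `reader`, and `writer` are invented names — the RAII holders referenced here appear further down in this diff):

    #include <folly/RWSpinLock.h>

    folly::RWSpinLock lock;  // 4 bytes, suitable for per-instance locking
    int counter = 0;         // illustrative shared state

    void reader() {
      // ReadHolder calls lock_shared()/unlock_shared() around the scope.
      folly::RWSpinLock::ReadHolder guard(&lock);
      int snapshot = counter;  // many readers may hold the lock at once
      (void)snapshot;
    }

    void writer() {
      // WriteHolder calls lock()/unlock() around the scope.
      folly::RWSpinLock::WriteHolder guard(&lock);
      ++counter;  // exclusive access
    }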
@@ -106,23 +135,35 @@
 pthread_rwlock_t     Read       728698     24us      101ns     7.28ms     194us
 */

-#if defined(__GNUC__) && (defined(__i386) || defined(__x86_64__) || \
-    defined(ARCH_K8))
-#define RW_SPINLOCK_USE_X86_INTRINSIC_
-#include <x86intrin.h>
+#include <folly/Portability.h>
+#include <folly/portability/Asm.h>
+
+#if defined(__GNUC__) && \
+    (defined(__i386) || FOLLY_X64 || \
+     defined(ARCH_K8))
+# define RW_SPINLOCK_USE_X86_INTRINSIC_
+# include <x86intrin.h>
+#elif defined(_MSC_VER) && defined(FOLLY_X64)
+# define RW_SPINLOCK_USE_X86_INTRINSIC_
 #else
-#undef RW_SPINLOCK_USE_X86_INTRINSIC_
+# undef RW_SPINLOCK_USE_X86_INTRINSIC_
 #endif

+// iOS doesn't define _mm_cvtsi64_si128 and friends
+#if (FOLLY_SSE >= 2) && !FOLLY_MOBILE
+#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
+#else
+#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
+#endif
+
+#include <algorithm>
 #include <atomic>
 #include <string>
-#include <algorithm>
-#include <boost/noncopyable.hpp>
+#include <thread>

-#include <sched.h>
 #include <glog/logging.h>

-#include "folly/Likely.h"
+#include <folly/Likely.h>

 namespace folly {

@@ -141,16 +182,19 @@ namespace folly {
  * UpgradeLockable concepts except the TimedLockable related locking/unlocking
  * interfaces.
  */
-class RWSpinLock : boost::noncopyable {
+class RWSpinLock {
   enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };

  public:
-  RWSpinLock() : bits_(0) {}
+  constexpr RWSpinLock() : bits_(0) {}
+
+  RWSpinLock(RWSpinLock const&) = delete;
+  RWSpinLock& operator=(RWSpinLock const&) = delete;

   // Lockable Concept
   void lock() {
     int count = 0;
     while (!LIKELY(try_lock())) {
-      if (++count > 1000) sched_yield();
+      if (++count > 1000) std::this_thread::yield();
     }
   }

@@ -164,7 +208,7 @@ class RWSpinLock : boost::noncopyable {
   void lock_shared() {
     int count = 0;
     while (!LIKELY(try_lock_shared())) {
-      if (++count > 1000) sched_yield();
+      if (++count > 1000) std::this_thread::yield();
     }
   }

@@ -182,7 +226,7 @@ class RWSpinLock : boost::noncopyable {
   void lock_upgrade() {
     int count = 0;
     while (!try_lock_upgrade()) {
-      if (++count > 1000) sched_yield();
+      if (++count > 1000) std::this_thread::yield();
     }
   }

@@ -194,7 +238,7 @@ class RWSpinLock : boost::noncopyable {
   void unlock_upgrade_and_lock() {
     int64_t count = 0;
     while (!try_unlock_upgrade_and_lock()) {
-      if (++count > 1000) sched_yield();
+      if (++count > 1000) std::this_thread::yield();
     }
   }

@@ -203,11 +247,6 @@ class RWSpinLock : boost::noncopyable {
     bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
   }

-  void unlock_shared_and_lock_upgrade() {
-    lock_upgrade();
-    unlock_shared();
-  }
-
   // write unlock and upgrade lock atomically
   void unlock_and_lock_upgrade() {
     // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
@@ -225,12 +264,16 @@ class RWSpinLock : boost::noncopyable {
   }

   // Try to get reader permission on the lock. This can fail if we
-  // find out someone is a writer.
+  // find out someone is a writer or upgrader.
+  // Setting the UPGRADED bit would allow a writer-to-be to indicate
+  // its intention to write and block any new readers while waiting
+  // for existing readers to finish and release their read locks. This
+  // helps avoid starving writers (promoted from upgraders).
   bool try_lock_shared() {
     // fetch_add is considerably (100%) faster than compare_exchange,
     // so here we are optimizing for the common (lock success) case.
     int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
-    if (UNLIKELY(value & WRITER)) {
+    if (UNLIKELY(value & (WRITER|UPGRADED))) {
       bits_.fetch_add(-READER, std::memory_order_release);
       return false;
     }
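The UPGRADED bit introduced above enables a two-phase write acquisition: an intending writer first blocks new readers, then waits for existing readers to drain. A sketch using only members shown in this diff (`upgradeThenWrite` is an invented name):

    folly::RWSpinLock lock;

    void upgradeThenWrite() {
      lock.lock_upgrade();             // sets UPGRADED; from now on new
                                       // try_lock_shared() calls fail
      // ... examine shared state while existing readers drain ...
      lock.unlock_upgrade_and_lock();  // spins until the reader count hits
                                       // zero, then trades UPGRADED for WRITER
      // ... exclusive section ...
      lock.unlock();
    }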
@@ -264,7 +307,7 @@ class RWSpinLock : boost::noncopyable {

   class ReadHolder {
    public:
-    explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
+    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
       if (lock_) lock_->lock_shared();
     }

@@ -272,7 +315,7 @@ class RWSpinLock : boost::noncopyable {
       lock_->lock_shared();
     }

-    ReadHolder(ReadHolder&& other) : lock_(other.lock_) {
+    ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
       other.lock_ = nullptr;
     }

@@ -317,7 +360,7 @@ class RWSpinLock : boost::noncopyable {

   class UpgradedHolder {
    public:
-    explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
+    explicit UpgradedHolder(RWSpinLock* lock) : lock_(lock) {
       if (lock_) lock_->lock_upgrade();
     }

@@ -325,19 +368,13 @@ class RWSpinLock : boost::noncopyable {
       lock_->lock_upgrade();
     }

-    explicit UpgradedHolder(ReadHolder&& reader) {
-      lock_ = reader.lock_;
-      reader.lock_ = nullptr;
-      if (lock_) lock_->unlock_shared_and_lock_upgrade();
-    }
-
     explicit UpgradedHolder(WriteHolder&& writer) {
       lock_ = writer.lock_;
       writer.lock_ = nullptr;
       if (lock_) lock_->unlock_and_lock_upgrade();
     }

-    UpgradedHolder(UpgradedHolder&& other) : lock_(other.lock_) {
+    UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
       other.lock_ = nullptr;
     }

@@ -372,7 +409,7 @@ class RWSpinLock : boost::noncopyable {

   class WriteHolder {
    public:
-    explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
+    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
       if (lock_) lock_->lock();
     }

@@ -387,7 +424,7 @@ class RWSpinLock : boost::noncopyable {
       if (lock_) lock_->unlock_upgrade_and_lock();
     }

-    WriteHolder(WriteHolder&& other) : lock_(other.lock_) {
+    WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
       other.lock_ = nullptr;
     }

@@ -420,12 +457,6 @@ class RWSpinLock : boost::noncopyable {
     RWSpinLock* lock_;
   };

-  // Synchronized<> adaptors
-  friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
-  friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
-  friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
-  friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }
-
  private:
   std::atomic<int32_t> bits_;
 };

@@ -446,15 +477,16 @@ struct RWTicketIntTrait<64> {
   typedef uint32_t HalfInt;
   typedef uint16_t QuarterInt;

-#ifdef __SSE2__
+#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
   static __m128i make128(const uint16_t v[4]) {
-    return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
+    return _mm_set_epi16(0, 0, 0, 0,
+        short(v[3]), short(v[2]), short(v[1]), short(v[0]));
   }
   static inline __m128i fromInteger(uint64_t from) {
-    return _mm_cvtsi64_si128(from);
+    return _mm_cvtsi64_si128(int64_t(from));
   }
   static inline uint64_t toInteger(__m128i in) {
-    return _mm_cvtsi128_si64(in);
+    return uint64_t(_mm_cvtsi128_si64(in));
   }
   static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
     return toInteger(_mm_add_epi16(in, kDelta));
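To see what addParallel() buys: the ticket word packs the write/read/users counters into adjacent 16-bit lanes (for the 64-bit lock), and one SSE2 add advances several lanes at once with no carry between them. A standalone sketch with made-up lane values; the lane order is inferred from make128() above and the write/read/user comments later in this diff, so treat it as illustration, not part of the patch:

    #include <cstdint>
    #include <emmintrin.h>  // SSE2; _mm_cvtsi128_si64 needs an x86-64 build

    int main() {
      uint16_t t[4] = {2, 5, 7, 0};            // write=2, read=5, users=7 (made up)
      const uint16_t delta[4] = {1, 1, 0, 0};  // write/read/user, as in unlock()
      __m128i ticket = _mm_set_epi16(0, 0, 0, 0, t[3], t[2], t[1], t[0]);
      __m128i kDelta = _mm_set_epi16(0, 0, 0, 0,
                                     delta[3], delta[2], delta[1], delta[0]);
      // One parallel add: the write and read tickets advance together.
      uint64_t next = uint64_t(_mm_cvtsi128_si64(_mm_add_epi16(ticket, kDelta)));
      // next now encodes write=3, read=6, users=7.
      (void)next;
      return 0;
    }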
@@ -468,16 +500,19 @@ struct RWTicketIntTrait<32> {
   typedef uint16_t HalfInt;
   typedef uint8_t QuarterInt;

-#ifdef __SSE2__
+#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
   static __m128i make128(const uint8_t v[4]) {
-    return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, v[3], v[2], v[1], v[0]);
+    return _mm_set_epi8(
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        char(v[3]), char(v[2]), char(v[1]), char(v[0]));
   }
   static inline __m128i fromInteger(uint32_t from) {
-    return _mm_cvtsi32_si128(from);
+    return _mm_cvtsi32_si128(int32_t(from));
   }
   static inline uint32_t toInteger(__m128i in) {
-    return _mm_cvtsi128_si32(in);
+    return uint32_t(_mm_cvtsi128_si32(in));
   }
   static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
     return toInteger(_mm_add_epi8(in, kDelta));
@@ -488,7 +523,7 @@ struct RWTicketIntTrait<32> {

 template <size_t kBitWidth, bool kFavorWriter = false>
-class RWTicketSpinLockT : boost::noncopyable {
+class RWTicketSpinLockT {
   typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
   typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
   typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
@@ -496,6 +531,7 @@ class RWTicketSpinLockT : boost::noncopyable {
       QuarterInt;

   union RWTicket {
+    constexpr RWTicket() : whole(0) {}
     FullInt whole;
     HalfInt readWrite;
     __extension__ struct {
@@ -508,21 +544,22 @@ class RWTicketSpinLockT : boost::noncopyable {

  private:
   // Some x64-specific utilities for atomic access to ticket.
   template <class T> static T load_acquire(T* addr) {
     T t = *addr; // acquire barrier
-    asm volatile("" : : : "memory");
+    asm_volatile_memory();
     return t;
   }

   template <class T> static void store_release(T* addr, T v) {
-    asm volatile("" : : : "memory");
+    asm_volatile_memory();
     *addr = v; // release barrier
   }

  public:
-  RWTicketSpinLockT() {
-    store_release(&ticket.whole, FullInt(0));
-  }
+  constexpr RWTicketSpinLockT() {}
+
+  RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
+  RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;

   void lock() {
     if (kFavorWriter) {
@@ -564,7 +601,7 @@ class RWTicketSpinLockT : boost::noncopyable {
    * turns.
    */
   void writeLockAggressive() {
-    // sched_yield() is needed here to avoid a pathology if the number
+    // std::this_thread::yield() is needed here to avoid a pathology if the number
     // of threads attempting concurrent writes is >= the number of real
     // cores allocated to this process. This is less likely than the
     // corresponding situation in lock_shared(), but we still want to
@@ -572,8 +609,8 @@ class RWTicketSpinLockT : boost::noncopyable {
     // avoid it
     int count = 0;
     QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
     while (val != load_acquire(&ticket.write)) {
-      asm volatile("pause");
-      if (UNLIKELY(++count > 1000)) sched_yield();
+      asm_volatile_pause();
+      if (UNLIKELY(++count > 1000)) std::this_thread::yield();
     }
   }

@@ -586,7 +623,7 @@ class RWTicketSpinLockT : boost::noncopyable {
     // there are a lot of competing readers. The aggressive spinning
     // can help to avoid starving writers.
     //
-    // We don't worry about sched_yield() here because the caller
+    // We don't worry about std::this_thread::yield() here because the caller
     // has already explicitly abandoned fairness.
     while (!try_lock()) {}
   }
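writeLockAggressive() above is a textbook ticket lock: fetch-and-add hands out a ticket, and the thread spins until the "now serving" counter (ticket.write) reaches it. A simplified scalar sketch of just the exclusive path, using std::atomic instead of the header's raw-memory helpers (illustrative; it ignores the read lane and the pause/yield throttling):

    #include <atomic>
    #include <cstdint>

    std::atomic<uint16_t> users{0};  // next ticket to hand out
    std::atomic<uint16_t> write{0};  // ticket currently being served

    void lockExclusive() {
      uint16_t my = users.fetch_add(1);  // take a ticket, like
                                         // __sync_fetch_and_add(&ticket.users, 1)
      while (write.load(std::memory_order_acquire) != my) {
        // spin; the real code adds asm_volatile_pause() and an occasional
        // std::this_thread::yield() here
      }
    }

    void unlockExclusive() {
      write.fetch_add(1, std::memory_order_release);  // serve the next ticket
    }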
@@ -602,7 +639,7 @@ class RWTicketSpinLockT : boost::noncopyable {
     t.whole = load_acquire(&ticket.whole);
     FullInt old = t.whole;

-#ifdef __SSE2__
+#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
     // SSE2 can reduce the lock and unlock overhead by 10%
     static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
@@ -616,13 +653,13 @@ class RWTicketSpinLockT : boost::noncopyable {
   }

   void lock_shared() {
-    // sched_yield() is important here because we can't grab the
+    // std::this_thread::yield() is important here because we can't grab the
     // shared lock if there is a pending writeLockAggressive, so we
     // need to let threads that already have a shared lock complete
     int count = 0;
     while (!LIKELY(try_lock_shared())) {
-      asm volatile("pause");
-      if (UNLIKELY((++count & 1023) == 0)) sched_yield();
+      asm_volatile_pause();
+      if (UNLIKELY((++count & 1023) == 0)) std::this_thread::yield();
     }
   }

@@ -630,7 +667,7 @@ class RWTicketSpinLockT : boost::noncopyable {
     RWTicket t, old;
     old.whole = t.whole = load_acquire(&ticket.whole);
     old.users = old.read;
-#ifdef __SSE2__
+#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
     // SSE2 may reduce the total lock and unlock overhead by 10%
     static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
@@ -650,10 +687,12 @@ class RWTicketSpinLockT : boost::noncopyable {
   class WriteHolder;

   typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;

-  class ReadHolder : boost::noncopyable {
+  class ReadHolder {
    public:
-    explicit ReadHolder(RWSpinLock *lock = nullptr) :
-      lock_(lock) {
+    ReadHolder(ReadHolder const&) = delete;
+    ReadHolder& operator=(ReadHolder const&) = delete;
+
+    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
       if (lock_) lock_->lock_shared();
     }

@@ -687,9 +726,12 @@ class RWTicketSpinLockT : boost::noncopyable {
     RWSpinLock *lock_;
   };

-  class WriteHolder : boost::noncopyable {
+  class WriteHolder {
    public:
-    explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
+    WriteHolder(WriteHolder const&) = delete;
+    WriteHolder& operator=(WriteHolder const&) = delete;
+
+    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
       if (lock_) lock_->lock();
     }
     explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
@@ -715,25 +757,6 @@ class RWTicketSpinLockT : boost::noncopyable {
     friend class ReadHolder;
     RWSpinLock *lock_;
   };
-
-  // Synchronized<> adaptors.
-  friend void acquireRead(RWTicketSpinLockT& mutex) {
-    mutex.lock_shared();
-  }
-  friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
-    mutex.lock();
-  }
-  friend bool acquireReadWrite(RWTicketSpinLockT& mutex,
-                               unsigned int milliseconds) {
-    mutex.lock();
-    return true;
-  }
-  friend void releaseRead(RWTicketSpinLockT& mutex) {
-    mutex.unlock_shared();
-  }
-  friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
-    mutex.unlock();
-  }
 };

 typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
@@ -746,5 +769,3 @@ typedef RWTicketSpinLockT<64> RWTicketSpinLock64;

 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 #endif
-
-#endif  // FOLLY_RWSPINLOCK_H_
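For completeness, a sketch of how the resulting typedefs are used (illustrative names, not part of the patch):

    #include <folly/RWSpinLock.h>

    folly::RWTicketSpinLock64 ticketLock;  // 8-byte ticket; up to 2^16 - 1
                                           // concurrent readers and writers

    void readSection() {
      folly::RWTicketSpinLock64::ReadHolder guard(&ticketLock);
      // shared section
    }

    void writeSection() {
      folly::RWTicketSpinLock64::WriteHolder guard(&ticketLock);
      // exclusive section
    }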