--- /dev/null
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/MicroLock.h>
+
+namespace folly {
+
+void MicroLockCore::lockSlowPath(uint32_t oldWord,
+ detail::Futex<>* wordPtr,
+ uint32_t slotHeldBit,
+ unsigned maxSpins,
+ unsigned maxYields) {
+ unsigned newWord;
+ unsigned spins = 0;
+ uint32_t slotWaitBit = slotHeldBit << 1;
+
+retry:
+ if ((oldWord & slotHeldBit) != 0) {
+ ++spins;
+ if (spins > maxSpins + maxYields) {
+ // Somebody appears to have the lock. Block waiting for the
+ // holder to unlock the lock. We set heldbit(slot) so that the
+ // lock holder knows to FUTEX_WAKE us.
+ newWord = oldWord | slotWaitBit;
+ if (newWord != oldWord) {
+ if (!wordPtr->compare_exchange_weak(oldWord,
+ newWord,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed)) {
+ goto retry;
+ }
+ }
+ (void)wordPtr->futexWait(newWord, slotHeldBit);
+ } else if (spins > maxSpins) {
+ sched_yield();
+ }
+ oldWord = wordPtr->load(std::memory_order_relaxed);
+ goto retry;
+ }
+
+ newWord = oldWord | slotHeldBit;
+ if (!wordPtr->compare_exchange_weak(oldWord,
+ newWord,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed)) {
+ goto retry;
+ }
+
+ // Locks are traditionally memory barriers, so we emit a full fence
+ // even though we were happy using relaxed atomics for the
+ // lock itself.
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+}
+}
--- /dev/null
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
#include <assert.h>
#include <climits>
#include <limits>
#include <stdint.h>
#include <folly/detail/Futex.h>
#include <folly/Portability.h>
+
+namespace folly {
+
+/**
+ * Tiny exclusive lock that packs four lock slots into a single
+ * byte. Each slot is an independent real, sleeping lock. The default
+ * lock and unlock functions operate on slot zero, which modifies only
+ * the low two bits of the host byte.
+ *
+ * You should zero-initialize the bits of a MicroLock that you intend
+ * to use.
+ *
+ * If you're not space-constrained, prefer std::mutex, which will
+ * likely be faster, since it has more than two bits of information to
+ * work with.
+ *
+ * You are free to put a MicroLock in a union with some other object.
+ * If, for example, you want to use the bottom two bits of a pointer
+ * as a lock, you can put a MicroLock in a union with the pointer and
+ * limit yourself to MicroLock slot zero, which will use the two
+ * least-significant bits in the bottom byte.
+ *
+ * MicroLock uses a dirty trick: it actually operates on the full
+ * word-size, word-aligned bit of memory into which it is embedded.
+ * It never modifies bits outside the ones it's defined to modify, but
+ * it _accesses_ all the bits in the word for purposes of
+ * futex management.
+ *
+ * The MaxSpins template parameter controls the number of times we
+ * spin trying to acquire the lock. MaxYields controls the number of
+ * times we call sched_yield; once we've tried to acquire the lock
+ * MaxSpins + MaxYields times, we sleep on the lock futex.
+ * By adjusting these parameters, you can make MicroLock behave as
+ * much or as little like a conventional spinlock as you'd like.
+ *
+ * Performance
+ * -----------
+ *
+ * With the default template options, the timings for uncontended
+ * acquire-then-release come out as follows on Intel(R) Xeon(R) CPU
+ * E5-2660 0 @ 2.20GHz, in @mode/opt, as of the master tree at Tue, 01
+ * Mar 2016 19:48:15.
+ *
+ * ========================================================================
+ * folly/test/SmallLocksBenchmark.cpp relative time/iter iters/s
+ * ========================================================================
+ * MicroSpinLockUncontendedBenchmark 13.46ns 74.28M
+ * PicoSpinLockUncontendedBenchmark 14.99ns 66.71M
+ * MicroLockUncontendedBenchmark 27.06ns 36.96M
+ * StdMutexUncontendedBenchmark 25.18ns 39.72M
+ * VirtualFunctionCall 1.72ns 579.78M
+ * ========================================================================
+ *
+ * (The virtual dispatch benchmark is provided for scale.)
+ *
 * The contended case for MicroLock is likely to be worse, relative to
 * std::mutex, than the uncontended case is. Make sure to benchmark
 * your particular workload.
+ *
+ */
+
class MicroLockCore {
 protected:
  // The byte holding the four 2-bit lock slots.
  uint8_t lock_;
  // Word-aligned futex word into which lock_ is embedded.
  inline detail::Futex<>* word() const;
  // Bit position of the given slot's low ("held") bit within the word.
  inline uint32_t baseShift(unsigned slot) const;
  // Mask for the slot's "held" bit (low bit of the 2-bit pair).
  inline uint32_t heldBit(unsigned slot) const;
  // Mask for the slot's "wait" bit (high bit of the 2-bit pair).
  inline uint32_t waitBit(unsigned slot) const;
  static void lockSlowPath(uint32_t oldWord,
                           detail::Futex<>* wordPtr,
                           uint32_t slotHeldBit,
                           unsigned maxSpins,
                           unsigned maxYields);

 public:
  inline void unlock(unsigned slot);
  inline void unlock() { unlock(0); }
  // Clears both bits of the given slot.  Not atomic: callers must
  // ensure no concurrent users of the slot during initialization.
  inline void init(unsigned slot) { lock_ &= ~(3U << (2 * slot)); }
  inline void init() { init(0); }
};
+
+inline detail::Futex<>* MicroLockCore::word() const {
+ uintptr_t lockptr = (uintptr_t)&lock_;
+ lockptr &= ~(sizeof(uint32_t) - 1);
+ return (detail::Futex<>*)lockptr;
+}
+
+inline unsigned MicroLockCore::baseShift(unsigned slot) const {
+ assert(slot < CHAR_BIT / 2);
+ uintptr_t offset_bytes = (uintptr_t)&lock_ - (uintptr_t)word();
+ assert(offset_bytes < sizeof(uint32_t));
+
+ return kIsLittleEndian
+ ? offset_bytes * CHAR_BIT + slot * 2
+ : CHAR_BIT * (sizeof(uint32_t) - offset_bytes - 1) + slot * 2;
+}
+
+inline uint32_t MicroLockCore::heldBit(unsigned slot) const {
+ return 1U << (baseShift(slot) + 0);
+}
+
+inline uint32_t MicroLockCore::waitBit(unsigned slot) const {
+ return 1U << (baseShift(slot) + 1);
+}
+
+void MicroLockCore::unlock(unsigned slot) {
+ detail::Futex<>* wordPtr = word();
+ uint32_t oldWord;
+ uint32_t newWord;
+
+ oldWord = wordPtr->load(std::memory_order_relaxed);
+ do {
+ assert(oldWord & heldBit(slot));
+ newWord = oldWord & ~(heldBit(slot) | waitBit(slot));
+ } while (!wordPtr->compare_exchange_weak(
+ oldWord, newWord, std::memory_order_release, std::memory_order_relaxed));
+
+ if (oldWord & waitBit(slot)) {
+ // We don't track the number of waiters, so wake everyone
+ (void)wordPtr->futexWake(std::numeric_limits<int>::max(), heldBit(slot));
+ }
+}
+
template <unsigned MaxSpins = 1000, unsigned MaxYields = 0>
class MicroLockBase : public MicroLockCore {
 public:
  // Acquires the given slot, blocking (spin, then yield, then futex
  // wait) until it is available.
  inline void lock(unsigned slot);
  inline void lock() { lock(0); }
  // Attempts to acquire the given slot without blocking; returns true
  // on success.
  inline bool try_lock(unsigned slot);
  inline bool try_lock() { return try_lock(0); }
};
+
+template <unsigned MaxSpins, unsigned MaxYields>
+bool MicroLockBase<MaxSpins, MaxYields>::try_lock(unsigned slot) {
+
+ // N.B. You might think that try_lock is just the fast path of lock,
+ // but you'd be wrong. Keep in mind that other parts of our host
+ // word might be changing while we take the lock! We're not allowed
+ // to fail spuriously if the lock is in fact not held, even if other
+ // people are concurrently modifying other parts of the word.
+ //
+ // We need to loop until we either see firm evidence that somebody
+ // else has the lock (by looking at heldBit) or see our CAS succeed.
+ // A failed CAS by itself does not indicate lock-acquire failure.
+
+ detail::Futex<>* wordPtr = word();
+ uint32_t oldWord = wordPtr->load(std::memory_order_relaxed);
+ do {
+ if (oldWord & heldBit(slot)) {
+ return false;
+ }
+ } while (!wordPtr->compare_exchange_weak(oldWord,
+ oldWord | heldBit(slot),
+ std::memory_order_acquire,
+ std::memory_order_relaxed));
+
+ return true;
+}
+
+template <unsigned MaxSpins, unsigned MaxYields>
+void MicroLockBase<MaxSpins, MaxYields>::lock(unsigned slot) {
+
+ static_assert(MaxSpins + MaxYields < (unsigned)-1, "overflow");
+
+ detail::Futex<>* wordPtr = word();
+ uint32_t oldWord;
+ oldWord = wordPtr->load(std::memory_order_relaxed);
+ if ((oldWord & heldBit(slot)) == 0 &&
+ wordPtr->compare_exchange_weak(oldWord,
+ oldWord | heldBit(slot),
+ std::memory_order_acquire,
+ std::memory_order_relaxed)) {
+ // Fast uncontended case: seq_cst above is our memory barrier
+ } else {
+ // lockSlowPath doesn't have any slot-dependent computation; it
+ // just shifts the input bit. Make sure its shifting produces the
+ // same result a call to waitBit for our slot would.
+ assert(heldBit(slot) << 1 == waitBit(slot));
+ // lockSlowPath emits its own memory barrier
+ lockSlowPath(oldWord, wordPtr, heldBit(slot), MaxSpins, MaxYields);
+ }
+}
+
+typedef MicroLockBase<> MicroLock;
+}
--- /dev/null
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/Benchmark.h>
+#include <folly/SmallLocks.h>
+
// Measures an uncontended lock/unlock round trip on MicroSpinLock.
BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
  folly::MicroSpinLock lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}
+
// Measures an uncontended lock/unlock round trip on PicoSpinLock.
BENCHMARK(PicoSpinLockUncontendedBenchmark, iters) {
  // uint8_t would be more fair, but PicoSpinLock needs at least two bytes
  folly::PicoSpinLock<uint16_t> lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}
+
// Measures an uncontended lock/unlock round trip on MicroLock (slot 0).
BENCHMARK(MicroLockUncontendedBenchmark, iters) {
  folly::MicroLock lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}
+
// Baseline comparison: the same uncontended loop on std::mutex.
BENCHMARK(StdMutexUncontendedBenchmark, iters) {
  std::mutex lock;
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}
+
// Minimal polymorphic interface used to put virtual-dispatch cost on
// the same scale as the lock benchmarks.
struct VirtualBase {
  virtual void foo() = 0;
  // `= default` instead of an empty body; behavior is identical.
  virtual ~VirtualBase() = default;
};
+
+struct VirtualImpl : VirtualBase {
+ virtual void foo() { /* noop */
+ }
+ virtual ~VirtualImpl() {}
+};
+
+#ifndef __clang__
// noinline+noclone keep the compiler from seeing the concrete type and
// devirtualizing the benchmark's call.  (noclone appears to be
// gcc-only, presumably why this is guarded by #ifndef __clang__.)
__attribute__((noinline, noclone)) VirtualBase* makeVirtual() {
  return new VirtualImpl();
}
+
// Scale reference: cost of a virtual call the compiler cannot
// devirtualize (vb comes from the noinline factory above).
BENCHMARK(VirtualFunctionCall, iters) {
  VirtualBase* vb = makeVirtual();
  while (iters--) {
    vb->foo();
  }
  delete vb;
}
+#endif
+
// Benchmark driver: parses gflags then runs every BENCHMARK above.
int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  folly::runBenchmarks();
  return 0;
}
#include <cassert>
#include <cstdio>
#include <mutex>
+#include <condition_variable>
#include <string>
#include <vector>
#include <pthread.h>
#include <gtest/gtest.h>
-using std::string;
+using folly::MSLGuard;
+using folly::MicroLock;
using folly::MicroSpinLock;
using folly::PicoSpinLock;
-using folly::MSLGuard;
+using std::string;
namespace {
// Delegates to TestClobber (defined elsewhere in this file) —
// presumably checks the lock's inline asm clobber list; see its
// definition for what is verified.
TEST(SmallLocks, RegClobber) {
  TestClobber().go();
}
+
// MicroLock must fit in exactly one byte; FOLLY_PACK_PUSH/POP
// presumably make the check independent of ambient packing settings.
FOLLY_PACK_PUSH
static_assert(sizeof(MicroLock) == 1, "Size check failed");
FOLLY_PACK_POP
+
namespace {

// One-shot start barrier: threads block in wait() until run() is
// called once; wait() after run() returns immediately.
struct SimpleBarrier {

  SimpleBarrier() : lock_(), cv_(), ready_(false) {}

  // Blocks the caller until run() has been invoked.
  void wait() {
    std::unique_lock<std::mutex> lockHeld(lock_);
    // Predicate overload handles spurious wakeups and the
    // already-released case in one call (the old manual while-loop
    // did the same thing by hand).
    cv_.wait(lockHeld, [this] { return ready_; });
  }

  // Releases all current and future waiters.
  void run() {
    {
      std::unique_lock<std::mutex> lockHeld(lock_);
      ready_ = true;
    }

    cv_.notify_all();
  }

 private:
  std::mutex lock_;
  std::condition_variable cv_;
  bool ready_;
};
}
+
// Multi-threaded stress test: 20 threads hammer all four MicroLock
// slots while another thread mutates adjacent bytes of the same host
// word.  Verifies (a) mutual exclusion per slot via the counters and
// (b) that MicroLock never corrupts bits outside its own byte.
static void runMicroLockTest() {
  // One counter per lock slot; each is protected by its slot's lock.
  volatile uint64_t counters[4] = {0, 0, 0, 0};
  std::vector<std::thread> threads;
  static const unsigned nrThreads = 20;
  static const unsigned iterPerThread = 10000;
  SimpleBarrier startBarrier;

  // Each thread spreads its iterations evenly over the 4 slots.
  assert(iterPerThread % 4 == 0);

  // Embed the lock in a larger structure to ensure that we do not
  // affect bits outside the ones MicroLock is defined to affect.
  struct {
    uint8_t a;
    volatile uint8_t b;
    union {
      MicroLock alock;
      uint8_t c;
    };
    volatile uint8_t d;
  } x;

  uint8_t origB = 'b';
  uint8_t origD = 'd';

  x.a = 'a';
  x.b = origB;
  x.c = 0; // zeroes the MicroLock bits (alock aliases c in the union)
  x.d = origD;

  // This thread touches other parts of the host word to show that
  // MicroLock does not interfere with memory outside of the byte
  // it owns.
  std::thread adjacentMemoryToucher = std::thread([&] {
    startBarrier.wait();
    for (unsigned iter = 0; iter < iterPerThread; ++iter) {
      if (iter % 2) {
        x.b++;
      } else {
        x.d++;
      }
    }
  });

  for (unsigned i = 0; i < nrThreads; ++i) {
    threads.emplace_back([&] {
      startBarrier.wait();
      for (unsigned iter = 0; iter < iterPerThread; ++iter) {
        unsigned slotNo = iter % 4;
        x.alock.lock(slotNo);
        counters[slotNo] += 1;
        // The occasional sleep makes it more likely that we'll
        // exercise the futex-wait path inside MicroLock.
        if (iter % 1000 == 0) {
          struct timespec ts = {0, 10000};
          (void)nanosleep(&ts, nullptr);
        }
        x.alock.unlock(slotNo);
      }
    });
  }

  // Release every thread at once to maximize contention.
  startBarrier.run();

  for (auto it = threads.begin(); it != threads.end(); ++it) {
    it->join();
  }

  adjacentMemoryToucher.join();

  // Surrounding bytes must be exactly as the toucher left them.
  EXPECT_EQ(x.a, 'a');
  EXPECT_EQ(x.b, (uint8_t)(origB + iterPerThread / 2));
  EXPECT_EQ(x.c, 0);
  EXPECT_EQ(x.d, (uint8_t)(origD + iterPerThread / 2));
  // Every increment must be accounted for, i.e. the locks provided
  // mutual exclusion per slot.
  for (unsigned i = 0; i < 4; ++i) {
    EXPECT_EQ(counters[i], ((uint64_t)nrThreads * iterPerThread) / 4);
  }
}
+
// Runs the multi-threaded stress test defined above.
TEST(SmallLocks, MicroLock) { runMicroLockTest(); }
+TEST(SmallLocks, MicroLockTryLock) {
+ MicroLock lock;
+ lock.init();
+ EXPECT_TRUE(lock.try_lock());
+ EXPECT_FALSE(lock.try_lock());
+}