/*
- * Copyright 2014 Facebook, Inc.
+ * Copyright 2017 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
*/
#include <folly/detail/Futex.h>
+#include <boost/intrusive/list.hpp>
+#include <folly/Hash.h>
+#include <folly/ScopeGuard.h>
+#include <folly/portability/SysSyscall.h>
#include <stdint.h>
#include <string.h>
+#include <cerrno>
#include <condition_variable>
#include <mutex>
-#include <boost/intrusive/list.hpp>
-#include <folly/Hash.h>
-#include <folly/ScopeGuard.h>
#ifdef __linux__
-# include <errno.h>
-# include <linux/futex.h>
-# include <sys/syscall.h>
+#include <linux/futex.h>
#endif
using namespace std::chrono;
#ifdef __linux__
+/// Certain toolchains (like Android's) don't include the full futex API in
+/// their headers even though they support it. Make sure we have our constants
+/// even if the headers don't have them.
+#ifndef FUTEX_WAIT_BITSET
+# define FUTEX_WAIT_BITSET 9
+#endif
+#ifndef FUTEX_WAKE_BITSET
+# define FUTEX_WAKE_BITSET 10
+#endif
+#ifndef FUTEX_PRIVATE_FLAG
+# define FUTEX_PRIVATE_FLAG 128
+#endif
+#ifndef FUTEX_CLOCK_REALTIME
+# define FUTEX_CLOCK_REALTIME 256
+#endif
+
int nativeFutexWake(void* addr, int count, uint32_t wakeMask) {
- int rv = syscall(SYS_futex,
+ int rv = syscall(__NR_futex,
addr, /* addr1 */
FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, /* op */
count, /* val */
nullptr, /* addr2 */
wakeMask); /* val3 */
- assert(rv >= 0);
-
+ /* NOTE: we ignore errors on wake for the case of a futex
+ guarding its own destruction, similar to this
+ glibc bug with sem_post/sem_wait:
+ https://sourceware.org/bugzilla/show_bug.cgi?id=12674 */
+ if (rv < 0) {
+ return 0;
+ }
return rv;
}
struct timespec
timeSpecFromTimePoint(time_point<Clock> absTime)
{
- auto duration = absTime.time_since_epoch();
- auto secs = duration_cast<seconds>(duration);
- auto nanos = duration_cast<nanoseconds>(duration - secs);
+ auto epoch = absTime.time_since_epoch();
+ if (epoch.count() < 0) {
+ // kernel timespec_valid requires non-negative seconds and nanos in [0,1G)
+ epoch = Clock::duration::zero();
+ }
+
+ // timespec-safe seconds and nanoseconds;
+ // chrono::{nano,}seconds are `long long int`
+ // whereas timespec uses smaller types
+ using time_t_seconds = duration<std::time_t, seconds::period>;
+ using long_nanos = duration<long int, nanoseconds::period>;
+
+ auto secs = duration_cast<time_t_seconds>(epoch);
+ auto nanos = duration_cast<long_nanos>(epoch - secs);
struct timespec result = { secs.count(), nanos.count() };
return result;
}
// Unlike FUTEX_WAIT, FUTEX_WAIT_BITSET requires an absolute timeout
// value - http://locklessinc.com/articles/futex_cheat_sheet/
- int rv = syscall(SYS_futex,
+ int rv = syscall(__NR_futex,
addr, /* addr1 */
op, /* op */
expected, /* val */
return FutexResult::VALUE_CHANGED;
default:
assert(false);
- // EACCESS, EFAULT, or EINVAL. All of these mean *addr point to
- // invalid memory (or I misunderstand the API). We can either
+ // EINVAL, EACCES, or EFAULT. EINVAL means there was an invalid
+ // op (should be impossible) or an invalid timeout (should have
+ // been sanitized by timeSpecFromTimePoint). EACCES or EFAULT
+ // means *addr points to invalid memory, which is unlikely because
+ // the caller should have segfaulted already. We can either
// crash, or return a value that lets the process continue for
// a bit. We choose the latter. VALUE_CHANGED probably turns the
// caller into a spin lock.
///////////////////////////////////////////////////////
// compatibility implementation using standard C++ API
+// Our emulated futex uses 4096 lists of wait nodes. There are two levels
+// of locking: a per-list mutex that controls access to the list and a
+// per-node mutex, condvar, and bool that are used for the actual wakeups.
+// The per-node mutex allows us to do precise wakeups without thundering
+// herds.
+
struct EmulatedFutexWaitNode : public boost::intrusive::list_base_hook<> {
void* const addr_;
const uint32_t waitMask_;
- bool hasTimeout_;
+
+ // tricky: hold both bucket and node mutex to write, either to read
bool signaled_;
+ std::mutex mutex_;
std::condition_variable cond_;
- EmulatedFutexWaitNode(void* addr, uint32_t waitMask, bool hasTimeout)
+ EmulatedFutexWaitNode(void* addr, uint32_t waitMask)
: addr_(addr)
, waitMask_(waitMask)
- , hasTimeout_(hasTimeout)
, signaled_(false)
{
}
boost::intrusive::list<EmulatedFutexWaitNode> waiters_;
static const size_t kNumBuckets = 4096;
- static EmulatedFutexBucket* gBuckets;
- static std::once_flag gBucketInit;
static EmulatedFutexBucket& bucketFor(void* addr) {
- std::call_once(gBucketInit, [](){
- gBuckets = new EmulatedFutexBucket[kNumBuckets];
- });
+ static auto gBuckets = new EmulatedFutexBucket[kNumBuckets];
uint64_t mixedBits = folly::hash::twang_mix64(
reinterpret_cast<uintptr_t>(addr));
return gBuckets[mixedBits % kNumBuckets];
}
};
-EmulatedFutexBucket* EmulatedFutexBucket::gBuckets;
-std::once_flag EmulatedFutexBucket::gBucketInit;
-
int emulatedFutexWake(void* addr, int count, uint32_t waitMask) {
auto& bucket = EmulatedFutexBucket::bucketFor(addr);
+ std::unique_lock<std::mutex> bucketLock(bucket.mutex_);
int numAwoken = 0;
- boost::intrusive::list<EmulatedFutexWaitNode> deferredWakeups;
-
- {
- std::unique_lock<std::mutex> lock(bucket.mutex_);
-
- for (auto iter = bucket.waiters_.begin();
- numAwoken < count && iter != bucket.waiters_.end(); ) {
- auto current = iter;
- auto& node = *iter++;
- if (node.addr_ == addr && (node.waitMask_ & waitMask) != 0) {
- // We unlink, but waiter destroys the node. We must signal timed
- // waiters under the lock, to avoid a race where we release the lock,
- // the waiter times out and deletes the node, and then we try to
- // signal it. This problem doesn't exist for unbounded waiters,
- // so for them we optimize their wakeup by releasing the lock first.
- bucket.waiters_.erase(current);
- if (node.hasTimeout_) {
- node.signaled_ = true;
- node.cond_.notify_one();
- } else {
- deferredWakeups.push_back(node);
- }
- ++numAwoken;
- }
+ for (auto iter = bucket.waiters_.begin();
+ numAwoken < count && iter != bucket.waiters_.end(); ) {
+ auto current = iter;
+ auto& node = *iter++;
+ if (node.addr_ == addr && (node.waitMask_ & waitMask) != 0) {
+ ++numAwoken;
+
+ // we unlink, but waiter destroys the node
+ bucket.waiters_.erase(current);
+
+ std::unique_lock<std::mutex> nodeLock(node.mutex_);
+ node.signaled_ = true;
+ node.cond_.notify_one();
}
}
-
- while (!deferredWakeups.empty()) {
- auto& node = deferredWakeups.front();
- deferredWakeups.pop_front();
- node.signaled_ = true;
- node.cond_.notify_one();
- }
-
return numAwoken;
}
+template <typename F>
FutexResult emulatedFutexWaitImpl(
- void* addr,
- uint32_t expected,
- time_point<system_clock>* absSystemTime,
- time_point<steady_clock>* absSteadyTime,
- uint32_t waitMask) {
- bool hasTimeout = absSystemTime != nullptr || absSteadyTime != nullptr;
- EmulatedFutexWaitNode node(addr, waitMask, hasTimeout);
-
+ F* futex,
+ uint32_t expected,
+ time_point<system_clock>* absSystemTime,
+ time_point<steady_clock>* absSteadyTime,
+ uint32_t waitMask) {
+ static_assert(
+ std::is_same<F, Futex<std::atomic>>::value ||
+ std::is_same<F, Futex<EmulatedFutexAtomic>>::value,
+ "Type F must be either Futex<std::atomic> or Futex<EmulatedFutexAtomic>");
+ void* addr = static_cast<void*>(futex);
auto& bucket = EmulatedFutexBucket::bucketFor(addr);
- std::unique_lock<std::mutex> lock(bucket.mutex_);
+ EmulatedFutexWaitNode node(addr, waitMask);
- uint32_t actual;
- memcpy(&actual, addr, sizeof(uint32_t));
- if (actual != expected) {
- return FutexResult::VALUE_CHANGED;
- }
+ {
+ std::unique_lock<std::mutex> bucketLock(bucket.mutex_);
- bucket.waiters_.push_back(node);
- while (!node.signaled_) {
- std::cv_status status = std::cv_status::no_timeout;
- if (absSystemTime != nullptr) {
- status = node.cond_.wait_until(lock, *absSystemTime);
- } else if (absSteadyTime != nullptr) {
- status = node.cond_.wait_until(lock, *absSteadyTime);
- } else {
- node.cond_.wait(lock);
+ if (futex->load(std::memory_order_relaxed) != expected) {
+ return FutexResult::VALUE_CHANGED;
+ }
+
+ bucket.waiters_.push_back(node);
+ } // bucketLock scope
+
+ std::cv_status status = std::cv_status::no_timeout;
+ {
+ std::unique_lock<std::mutex> nodeLock(node.mutex_);
+ while (!node.signaled_ && status != std::cv_status::timeout) {
+ if (absSystemTime != nullptr) {
+ status = node.cond_.wait_until(nodeLock, *absSystemTime);
+ } else if (absSteadyTime != nullptr) {
+ status = node.cond_.wait_until(nodeLock, *absSteadyTime);
+ } else {
+ node.cond_.wait(nodeLock);
+ }
}
+ } // nodeLock scope
- if (status == std::cv_status::timeout) {
+ if (status == std::cv_status::timeout) {
+ // it's not really a timeout until we unlink the unsignaled node
+ std::unique_lock<std::mutex> bucketLock(bucket.mutex_);
+ if (!node.signaled_) {
bucket.waiters_.erase(bucket.waiters_.iterator_to(node));
return FutexResult::TIMEDOUT;
}