From: Yedidya Feldblum Date: Wed, 6 Dec 2017 06:03:40 +0000 (-0800) Subject: Move folly/Baton.h to folly/synchronization/ X-Git-Tag: v2017.12.11.00~22 X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=b3dc093ef094d6d97c10755e60392124828c5103;p=folly.git Move folly/Baton.h to folly/synchronization/ Summary: [Folly] Move `folly/Baton.h` to `folly/synchronization/`. Reviewed By: phoad, Orvid Differential Revision: D6490282 fbshipit-source-id: 66e2d25ffe3275d576b97b81c1987709000f6649 --- diff --git a/CMakeLists.txt b/CMakeLists.txt index ac34e7d4..fc0084dc 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -537,6 +537,7 @@ if (BUILD_TESTS) TEST timeseries_test SOURCES TimeseriesTest.cpp DIRECTORY synchronization/test/ + TEST baton_test SOURCES BatonTest.cpp TEST call_once_test SOURCES CallOnceTest.cpp TEST lifo_sem_test SOURCES LifoSemTests.cpp @@ -558,7 +559,6 @@ if (BUILD_TESTS) TEST atomic_linked_list_test SOURCES AtomicLinkedListTest.cpp TEST atomic_struct_test SOURCES AtomicStructTest.cpp TEST atomic_unordered_map_test SOURCES AtomicUnorderedMapTest.cpp - TEST baton_test SOURCES BatonTest.cpp TEST bit_iterator_test SOURCES BitIteratorTest.cpp TEST bits_test SOURCES BitsTest.cpp TEST cacheline_padded_test SOURCES CachelinePaddedTest.cpp diff --git a/folly/Baton.h b/folly/Baton.h deleted file mode 100644 index 83323915..00000000 --- a/folly/Baton.h +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Copyright 2017 Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace folly { - -/// A Baton allows a thread to block once and be awoken. The single -/// poster version (with SinglePoster == true) captures a single -/// handoff, and during its lifecycle (from construction/reset to -/// destruction/reset) a baton must either be post()ed and wait()ed -/// exactly once each, or not at all. -/// -/// The multi-poster version (SinglePoster == false) allows multiple -/// concurrent handoff attempts, the first of which completes the -/// handoff and the rest if any are idempotent. -/// -/// Baton includes no internal padding, and is only 4 bytes in size. -/// Any alignment or padding to avoid false sharing is up to the user. -/// -/// This is basically a stripped-down semaphore that supports (only a -/// single call to sem_post, when SinglePoster == true) and a single -/// call to sem_wait. -/// -/// The non-blocking version (Blocking == false) provides more speed -/// by using only load acquire and store release operations in the -/// critical path, at the cost of disallowing blocking and timing out. -/// -/// The current posix semaphore sem_t isn't too bad, but this provides -/// more a bit more speed, inlining, smaller size, a guarantee that -/// the implementation won't change, and compatibility with -/// DeterministicSchedule. By having a much more restrictive -/// lifecycle we can also add a bunch of assertions that can help to -/// catch race conditions ahead of time. -template < - template class Atom = std::atomic, - bool SinglePoster = true, // single vs multiple posters - bool Blocking = true> // blocking vs spinning -struct Baton { - constexpr Baton() : state_(INIT) {} - - Baton(Baton const&) = delete; - Baton& operator=(Baton const&) = delete; - - /// It is an error to destroy a Baton on which a thread is currently - /// wait()ing. In practice this means that the waiter usually takes - /// responsibility for destroying the Baton. - ~Baton() { - // The docblock for this function says that it can't be called when - // there is a concurrent waiter. We assume a strong version of this - // requirement in which the caller must _know_ that this is true, they - // are not allowed to be merely lucky. If two threads are involved, - // the destroying thread must actually have synchronized with the - // waiting thread after wait() returned. To convey causality the the - // waiting thread must have used release semantics and the destroying - // thread must have used acquire semantics for that communication, - // so we are guaranteed to see the post-wait() value of state_, - // which cannot be WAITING. - // - // Note that since we only care about a single memory location, - // the only two plausible memory orders here are relaxed and seq_cst. - assert(state_.load(std::memory_order_relaxed) != WAITING); - } - - /// Equivalent to destroying the Baton and creating a new one. It is - /// a bug to call this while there is a waiting thread, so in practice - /// the waiter will be the one that resets the baton. - void reset() { - // See ~Baton for a discussion about why relaxed is okay here - assert(state_.load(std::memory_order_relaxed) != WAITING); - - // We use a similar argument to justify the use of a relaxed store - // here. Since both wait() and post() are required to be called - // only once per lifetime, no thread can actually call those methods - // correctly after a reset() unless it synchronizes with the thread - // that performed the reset(). If a post() or wait() on another thread - // didn't synchronize, then regardless of what operation we performed - // here there would be a race on proper use of the Baton's spec - // (although not on any particular load and store). Put another way, - // we don't need to synchronize here because anybody that might rely - // on such synchronization is required by the baton rules to perform - // an additional synchronization that has the desired effect anyway. - // - // There is actually a similar argument to be made about the - // constructor, in which the fenceless constructor initialization - // of state_ is piggybacked on whatever synchronization mechanism - // distributes knowledge of the Baton's existence - state_.store(INIT, std::memory_order_relaxed); - } - - /// Causes wait() to wake up. For each lifetime of a Baton (where a - /// lifetime starts at construction or reset() and ends at - /// destruction or reset()) there can be at most one call to post(), - /// in the single poster version. Any thread may call post(). - void post() { - if (!Blocking) { - /// Non-blocking version - /// - assert([&] { - auto state = state_.load(std::memory_order_relaxed); - return (state == INIT || state == EARLY_DELIVERY); - }()); - state_.store(EARLY_DELIVERY, std::memory_order_release); - return; - } - - /// Blocking versions - /// - if (SinglePoster) { - /// Single poster version - /// - uint32_t before = state_.load(std::memory_order_acquire); - - assert(before == INIT || before == WAITING || before == TIMED_OUT); - - if (before == INIT && - state_.compare_exchange_strong(before, EARLY_DELIVERY)) { - return; - } - - assert(before == WAITING || before == TIMED_OUT); - - if (before == TIMED_OUT) { - return; - } - - assert(before == WAITING); - state_.store(LATE_DELIVERY, std::memory_order_release); - state_.futexWake(1); - } else { - /// Multi-poster version - /// - while (true) { - uint32_t before = state_.load(std::memory_order_acquire); - - if (before == INIT && - state_.compare_exchange_strong(before, EARLY_DELIVERY)) { - return; - } - - if (before == TIMED_OUT) { - return; - } - - if (before == EARLY_DELIVERY || before == LATE_DELIVERY) { - // The reason for not simply returning (without the following - // atomic operation) is to avoid the following case: - // - // T1: T2: T3: - // local1.post(); local2.post(); global.wait(); - // global.post(); global.post(); local1.try_wait() == true; - // local2.try_wait() == false; - // - if (state_.fetch_add(0) != before) { - continue; - } - return; - } - - assert(before == WAITING); - if (!state_.compare_exchange_weak(before, LATE_DELIVERY)) { - continue; - } - state_.futexWake(1); - return; - } - } - } - - /// Waits until post() has been called in the current Baton lifetime. - /// May be called at most once during a Baton lifetime (construction - /// |reset until destruction|reset). If post is called before wait in - /// the current lifetime then this method returns immediately. - /// - /// The restriction that there can be at most one wait() per lifetime - /// could be relaxed somewhat without any perf or size regressions, - /// but by making this condition very restrictive we can provide better - /// checking in debug builds. - void wait() { - if (spinWaitForEarlyDelivery()) { - assert(state_.load(std::memory_order_acquire) == EARLY_DELIVERY); - return; - } - - if (!Blocking) { - while (!try_wait()) { - std::this_thread::yield(); - } - return; - } - - // guess we have to block :( - uint32_t expected = INIT; - if (!state_.compare_exchange_strong(expected, WAITING)) { - // CAS failed, last minute reprieve - assert(expected == EARLY_DELIVERY); - return; - } - - while (true) { - detail::MemoryIdler::futexWait(state_, WAITING); - - // state_ is the truth even if FUTEX_WAIT reported a matching - // FUTEX_WAKE, since we aren't using type-stable storage and we - // don't guarantee reuse. The scenario goes like this: thread - // A's last touch of a Baton is a call to wake(), which stores - // LATE_DELIVERY and gets an unlucky context switch before delivering - // the corresponding futexWake. Thread B sees LATE_DELIVERY - // without consuming a futex event, because it calls futexWait - // with an expected value of WAITING and hence doesn't go to sleep. - // B returns, so the Baton's memory is reused and becomes another - // Baton (or a reuse of this one). B calls futexWait on the new - // Baton lifetime, then A wakes up and delivers a spurious futexWake - // to the same memory location. B's futexWait will then report a - // consumed wake event even though state_ is still WAITING. - // - // It would be possible to add an extra state_ dance to communicate - // that the futexWake has been sent so that we can be sure to consume - // it before returning, but that would be a perf and complexity hit. - uint32_t s = state_.load(std::memory_order_acquire); - assert(s == WAITING || s == LATE_DELIVERY); - - if (s == LATE_DELIVERY) { - return; - } - // retry - } - } - - /// Similar to wait, but with a timeout. The thread is unblocked if the - /// timeout expires. - /// Note: Only a single call to timed_wait/wait is allowed during a baton's - /// life-cycle (from construction/reset to destruction/reset). In other - /// words, after timed_wait the caller can't invoke wait/timed_wait/try_wait - /// again on the same baton without resetting it. - /// - /// @param deadline Time until which the thread can block - /// @return true if the baton was posted to before timeout, - /// false otherwise - template - bool timed_wait(const std::chrono::time_point& deadline) { - static_assert(Blocking, "Non-blocking Baton does not support timed wait."); - - if (spinWaitForEarlyDelivery()) { - assert(state_.load(std::memory_order_acquire) == EARLY_DELIVERY); - return true; - } - - // guess we have to block :( - uint32_t expected = INIT; - if (!state_.compare_exchange_strong(expected, WAITING)) { - // CAS failed, last minute reprieve - assert(expected == EARLY_DELIVERY); - return true; - } - - while (true) { - auto rv = state_.futexWaitUntil(WAITING, deadline); - if (rv == folly::detail::FutexResult::TIMEDOUT) { - state_.store(TIMED_OUT, std::memory_order_release); - return false; - } - - uint32_t s = state_.load(std::memory_order_acquire); - assert(s == WAITING || s == LATE_DELIVERY); - if (s == LATE_DELIVERY) { - return true; - } - } - } - - /// Similar to timed_wait, but with a duration. - template - bool timed_wait(const Duration& duration) { - auto deadline = Clock::now() + duration; - return timed_wait(deadline); - } - - /// Similar to wait, but doesn't block the thread if it hasn't been posted. - /// - /// try_wait has the following semantics: - /// - It is ok to call try_wait any number times on the same baton until - /// try_wait reports that the baton has been posted. - /// - It is ok to call timed_wait or wait on the same baton if try_wait - /// reports that baton hasn't been posted. - /// - If try_wait indicates that the baton has been posted, it is invalid to - /// call wait, try_wait or timed_wait on the same baton without resetting - /// - /// @return true if baton has been posted, false othewise - bool try_wait() const { - auto s = state_.load(std::memory_order_acquire); - assert(s == INIT || s == EARLY_DELIVERY); - return s == EARLY_DELIVERY; - } - - private: - enum State : uint32_t { - INIT = 0, - EARLY_DELIVERY = 1, - WAITING = 2, - LATE_DELIVERY = 3, - TIMED_OUT = 4 - }; - - enum { - // Must be positive. If multiple threads are actively using a - // higher-level data structure that uses batons internally, it is - // likely that the post() and wait() calls happen almost at the same - // time. In this state, we lose big 50% of the time if the wait goes - // to sleep immediately. On circa-2013 devbox hardware it costs about - // 7 usec to FUTEX_WAIT and then be awoken (half the t/iter as the - // posix_sem_pingpong test in BatonTests). We can improve our chances - // of EARLY_DELIVERY by spinning for a bit, although we have to balance - // this against the loss if we end up sleeping any way. Spins on this - // hw take about 7 nanos (all but 0.5 nanos is the pause instruction). - // We give ourself 300 spins, which is about 2 usec of waiting. As a - // partial consolation, since we are using the pause instruction we - // are giving a speed boost to the colocated hyperthread. - PreBlockAttempts = 300, - }; - - // Spin for "some time" (see discussion on PreBlockAttempts) waiting - // for a post. - // - // @return true if we received an early delivery during the wait, - // false otherwise. If the function returns true then - // state_ is guaranteed to be EARLY_DELIVERY - bool spinWaitForEarlyDelivery() { - - static_assert(PreBlockAttempts > 0, - "isn't this assert clearer than an uninitialized variable warning?"); - for (int i = 0; i < PreBlockAttempts; ++i) { - if (try_wait()) { - // hooray! - return true; - } - // The pause instruction is the polite way to spin, but it doesn't - // actually affect correctness to omit it if we don't have it. - // Pausing donates the full capabilities of the current core to - // its other hyperthreads for a dozen cycles or so - asm_volatile_pause(); - } - - return false; - } - - detail::Futex state_; -}; - -} // namespace folly diff --git a/folly/Makefile.am b/folly/Makefile.am index 4ca4388d..98547554 100644 --- a/folly/Makefile.am +++ b/folly/Makefile.am @@ -37,7 +37,6 @@ nobase_follyinclude_HEADERS = \ AtomicLinkedList.h \ AtomicStruct.h \ AtomicUnorderedMap.h \ - Baton.h \ Benchmark.h \ Bits.h \ CachelinePadded.h \ @@ -433,6 +432,7 @@ nobase_follyinclude_HEADERS = \ stats/TimeseriesHistogram-defs.h \ stats/TimeseriesHistogram.h \ synchronization/AsymmetricMemoryBarrier.h \ + synchronization/Baton.h \ synchronization/CallOnce.h \ synchronization/LifoSem.h \ synchronization/detail/AtomicUtils.h \ diff --git a/folly/Singleton.h b/folly/Singleton.h index 01a9d5d1..48251b66 100644 --- a/folly/Singleton.h +++ b/folly/Singleton.h @@ -121,7 +121,6 @@ // should call reenableInstances. #pragma once -#include #include #include #include @@ -131,6 +130,7 @@ #include #include #include +#include #include #include diff --git a/folly/executors/ThreadPoolExecutor.h b/folly/executors/ThreadPoolExecutor.h index a92726ef..0e5bcc17 100644 --- a/folly/executors/ThreadPoolExecutor.h +++ b/folly/executors/ThreadPoolExecutor.h @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include #include @@ -23,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/folly/executors/task_queue/test/UnboundedBlockingQueueTest.cpp b/folly/executors/task_queue/test/UnboundedBlockingQueueTest.cpp index a7b1efd5..c3e7d0d4 100644 --- a/folly/executors/task_queue/test/UnboundedBlockingQueueTest.cpp +++ b/folly/executors/task_queue/test/UnboundedBlockingQueueTest.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ #include -#include #include +#include #include using namespace folly; diff --git a/folly/executors/test/ExecutorTest.cpp b/folly/executors/test/ExecutorTest.cpp index 2bbc56a1..e017ca78 100644 --- a/folly/executors/test/ExecutorTest.cpp +++ b/folly/executors/test/ExecutorTest.cpp @@ -14,12 +14,12 @@ * limitations under the License. */ -#include #include #include #include #include #include +#include // TODO(jsedgwick) move this test to executors/test/ once the tested executors // have all moved diff --git a/folly/executors/test/SerialExecutorTest.cpp b/folly/executors/test/SerialExecutorTest.cpp index ae487591..5e022a1f 100644 --- a/folly/executors/test/SerialExecutorTest.cpp +++ b/folly/executors/test/SerialExecutorTest.cpp @@ -16,11 +16,11 @@ #include -#include #include #include #include #include +#include using namespace std::chrono; using folly::SerialExecutor; diff --git a/folly/experimental/flat_combining/FlatCombining.h b/folly/experimental/flat_combining/FlatCombining.h index 63fe1ce7..235e24b2 100644 --- a/folly/experimental/flat_combining/FlatCombining.h +++ b/folly/experimental/flat_combining/FlatCombining.h @@ -16,11 +16,11 @@ #pragma once -#include #include #include #include #include +#include #include #include diff --git a/folly/experimental/flat_combining/test/FlatCombiningExamples.h b/folly/experimental/flat_combining/test/FlatCombiningExamples.h index 4736dfa8..fe6ab184 100644 --- a/folly/experimental/flat_combining/test/FlatCombiningExamples.h +++ b/folly/experimental/flat_combining/test/FlatCombiningExamples.h @@ -20,8 +20,8 @@ #include #include -#include #include +#include namespace folly { diff --git a/folly/experimental/observer/test/ObserverTest.cpp b/folly/experimental/observer/test/ObserverTest.cpp index ed372f5a..1c3bb28f 100644 --- a/folly/experimental/observer/test/ObserverTest.cpp +++ b/folly/experimental/observer/test/ObserverTest.cpp @@ -16,9 +16,9 @@ #include -#include #include #include +#include using namespace folly::observer; diff --git a/folly/experimental/test/FunctionSchedulerTest.cpp b/folly/experimental/test/FunctionSchedulerTest.cpp index ec943958..b869f2a8 100644 --- a/folly/experimental/test/FunctionSchedulerTest.cpp +++ b/folly/experimental/test/FunctionSchedulerTest.cpp @@ -21,10 +21,10 @@ #include -#include #include #include #include +#include #if defined(__linux__) #include diff --git a/folly/experimental/test/ReadMostlySharedPtrTest.cpp b/folly/experimental/test/ReadMostlySharedPtrTest.cpp index 08996115..789eeecd 100644 --- a/folly/experimental/test/ReadMostlySharedPtrTest.cpp +++ b/folly/experimental/test/ReadMostlySharedPtrTest.cpp @@ -20,11 +20,11 @@ #include #include -#include #include #include #include #include +#include using folly::ReadMostlyMainPtr; using folly::ReadMostlyWeakPtr; diff --git a/folly/experimental/test/RefCountTest.cpp b/folly/experimental/test/RefCountTest.cpp index a63de14d..3c75ae3c 100644 --- a/folly/experimental/test/RefCountTest.cpp +++ b/folly/experimental/test/RefCountTest.cpp @@ -15,10 +15,10 @@ */ #include -#include #include #include #include +#include namespace folly { diff --git a/folly/fibers/GenericBaton.h b/folly/fibers/GenericBaton.h index 93dbebae..6cab1bf0 100644 --- a/folly/fibers/GenericBaton.h +++ b/folly/fibers/GenericBaton.h @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include diff --git a/folly/futures/Future-inl.h b/folly/futures/Future-inl.h index 5818244c..be53ff7c 100644 --- a/folly/futures/Future-inl.h +++ b/folly/futures/Future-inl.h @@ -21,11 +21,11 @@ #include #include -#include #include #include #include #include +#include #ifndef FOLLY_FUTURE_USING_FIBER #if FOLLY_MOBILE || defined(__APPLE__) diff --git a/folly/futures/test/Benchmark.cpp b/folly/futures/test/Benchmark.cpp index a9c743de..336ce7f7 100644 --- a/folly/futures/test/Benchmark.cpp +++ b/folly/futures/test/Benchmark.cpp @@ -15,12 +15,12 @@ */ #include -#include #include #include #include #include #include +#include #include diff --git a/folly/futures/test/FutureTest.cpp b/folly/futures/test/FutureTest.cpp index e269aa15..ae321fde 100644 --- a/folly/futures/test/FutureTest.cpp +++ b/folly/futures/test/FutureTest.cpp @@ -14,13 +14,13 @@ * limitations under the License. */ -#include +#include #include #include #include #include -#include #include +#include #include #include diff --git a/folly/futures/test/InterruptTest.cpp b/folly/futures/test/InterruptTest.cpp index 7c51bb1e..fc87606b 100644 --- a/folly/futures/test/InterruptTest.cpp +++ b/folly/futures/test/InterruptTest.cpp @@ -14,10 +14,10 @@ * limitations under the License. */ -#include #include #include #include +#include using namespace folly; diff --git a/folly/futures/test/SemiFutureTest.cpp b/folly/futures/test/SemiFutureTest.cpp index 39ea108b..7b927021 100644 --- a/folly/futures/test/SemiFutureTest.cpp +++ b/folly/futures/test/SemiFutureTest.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -22,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/folly/futures/test/ViaTest.cpp b/folly/futures/test/ViaTest.cpp index fb2008a8..f7a81c99 100644 --- a/folly/futures/test/ViaTest.cpp +++ b/folly/futures/test/ViaTest.cpp @@ -16,13 +16,13 @@ #include -#include #include #include #include #include #include #include +#include using namespace folly; diff --git a/folly/futures/test/WaitTest.cpp b/folly/futures/test/WaitTest.cpp index 57e4e172..1d1b287d 100644 --- a/folly/futures/test/WaitTest.cpp +++ b/folly/futures/test/WaitTest.cpp @@ -16,10 +16,10 @@ #include -#include #include #include #include +#include using namespace folly; using std::vector; diff --git a/folly/io/async/EventBase.cpp b/folly/io/async/EventBase.cpp index c1ed7336..1466d721 100644 --- a/folly/io/async/EventBase.cpp +++ b/folly/io/async/EventBase.cpp @@ -26,11 +26,11 @@ #include #include -#include #include #include #include #include +#include #include namespace folly { diff --git a/folly/io/async/ScopedEventBaseThread.h b/folly/io/async/ScopedEventBaseThread.h index c81ab393..c234dc00 100644 --- a/folly/io/async/ScopedEventBaseThread.h +++ b/folly/io/async/ScopedEventBaseThread.h @@ -19,8 +19,8 @@ #include #include -#include #include +#include namespace folly { diff --git a/folly/io/async/VirtualEventBase.h b/folly/io/async/VirtualEventBase.h index 5120f21e..30e8404b 100644 --- a/folly/io/async/VirtualEventBase.h +++ b/folly/io/async/VirtualEventBase.h @@ -18,9 +18,9 @@ #include -#include #include #include +#include namespace folly { diff --git a/folly/io/async/test/EventBaseThreadTest.cpp b/folly/io/async/test/EventBaseThreadTest.cpp index e804501e..88ed75b2 100644 --- a/folly/io/async/test/EventBaseThreadTest.cpp +++ b/folly/io/async/test/EventBaseThreadTest.cpp @@ -18,9 +18,9 @@ #include -#include #include #include +#include #include using namespace std; diff --git a/folly/io/async/test/NotificationQueueTest.cpp b/folly/io/async/test/NotificationQueueTest.cpp index 727eaad2..2ff48d54 100644 --- a/folly/io/async/test/NotificationQueueTest.cpp +++ b/folly/io/async/test/NotificationQueueTest.cpp @@ -22,9 +22,9 @@ #include #include -#include #include #include +#include #ifndef _WIN32 #include diff --git a/folly/io/async/test/ScopedEventBaseThreadTest.cpp b/folly/io/async/test/ScopedEventBaseThreadTest.cpp index 6f1d2266..321a0888 100644 --- a/folly/io/async/test/ScopedEventBaseThreadTest.cpp +++ b/folly/io/async/test/ScopedEventBaseThreadTest.cpp @@ -19,10 +19,10 @@ #include #include -#include #include #include #include +#include #include using namespace std; diff --git a/folly/synchronization/Baton.h b/folly/synchronization/Baton.h new file mode 100644 index 00000000..83323915 --- /dev/null +++ b/folly/synchronization/Baton.h @@ -0,0 +1,373 @@ +/* + * Copyright 2017 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace folly { + +/// A Baton allows a thread to block once and be awoken. The single +/// poster version (with SinglePoster == true) captures a single +/// handoff, and during its lifecycle (from construction/reset to +/// destruction/reset) a baton must either be post()ed and wait()ed +/// exactly once each, or not at all. +/// +/// The multi-poster version (SinglePoster == false) allows multiple +/// concurrent handoff attempts, the first of which completes the +/// handoff and the rest if any are idempotent. +/// +/// Baton includes no internal padding, and is only 4 bytes in size. +/// Any alignment or padding to avoid false sharing is up to the user. +/// +/// This is basically a stripped-down semaphore that supports (only a +/// single call to sem_post, when SinglePoster == true) and a single +/// call to sem_wait. +/// +/// The non-blocking version (Blocking == false) provides more speed +/// by using only load acquire and store release operations in the +/// critical path, at the cost of disallowing blocking and timing out. +/// +/// The current posix semaphore sem_t isn't too bad, but this provides +/// more a bit more speed, inlining, smaller size, a guarantee that +/// the implementation won't change, and compatibility with +/// DeterministicSchedule. By having a much more restrictive +/// lifecycle we can also add a bunch of assertions that can help to +/// catch race conditions ahead of time. +template < + template class Atom = std::atomic, + bool SinglePoster = true, // single vs multiple posters + bool Blocking = true> // blocking vs spinning +struct Baton { + constexpr Baton() : state_(INIT) {} + + Baton(Baton const&) = delete; + Baton& operator=(Baton const&) = delete; + + /// It is an error to destroy a Baton on which a thread is currently + /// wait()ing. In practice this means that the waiter usually takes + /// responsibility for destroying the Baton. + ~Baton() { + // The docblock for this function says that it can't be called when + // there is a concurrent waiter. We assume a strong version of this + // requirement in which the caller must _know_ that this is true, they + // are not allowed to be merely lucky. If two threads are involved, + // the destroying thread must actually have synchronized with the + // waiting thread after wait() returned. To convey causality the the + // waiting thread must have used release semantics and the destroying + // thread must have used acquire semantics for that communication, + // so we are guaranteed to see the post-wait() value of state_, + // which cannot be WAITING. + // + // Note that since we only care about a single memory location, + // the only two plausible memory orders here are relaxed and seq_cst. + assert(state_.load(std::memory_order_relaxed) != WAITING); + } + + /// Equivalent to destroying the Baton and creating a new one. It is + /// a bug to call this while there is a waiting thread, so in practice + /// the waiter will be the one that resets the baton. + void reset() { + // See ~Baton for a discussion about why relaxed is okay here + assert(state_.load(std::memory_order_relaxed) != WAITING); + + // We use a similar argument to justify the use of a relaxed store + // here. Since both wait() and post() are required to be called + // only once per lifetime, no thread can actually call those methods + // correctly after a reset() unless it synchronizes with the thread + // that performed the reset(). If a post() or wait() on another thread + // didn't synchronize, then regardless of what operation we performed + // here there would be a race on proper use of the Baton's spec + // (although not on any particular load and store). Put another way, + // we don't need to synchronize here because anybody that might rely + // on such synchronization is required by the baton rules to perform + // an additional synchronization that has the desired effect anyway. + // + // There is actually a similar argument to be made about the + // constructor, in which the fenceless constructor initialization + // of state_ is piggybacked on whatever synchronization mechanism + // distributes knowledge of the Baton's existence + state_.store(INIT, std::memory_order_relaxed); + } + + /// Causes wait() to wake up. For each lifetime of a Baton (where a + /// lifetime starts at construction or reset() and ends at + /// destruction or reset()) there can be at most one call to post(), + /// in the single poster version. Any thread may call post(). + void post() { + if (!Blocking) { + /// Non-blocking version + /// + assert([&] { + auto state = state_.load(std::memory_order_relaxed); + return (state == INIT || state == EARLY_DELIVERY); + }()); + state_.store(EARLY_DELIVERY, std::memory_order_release); + return; + } + + /// Blocking versions + /// + if (SinglePoster) { + /// Single poster version + /// + uint32_t before = state_.load(std::memory_order_acquire); + + assert(before == INIT || before == WAITING || before == TIMED_OUT); + + if (before == INIT && + state_.compare_exchange_strong(before, EARLY_DELIVERY)) { + return; + } + + assert(before == WAITING || before == TIMED_OUT); + + if (before == TIMED_OUT) { + return; + } + + assert(before == WAITING); + state_.store(LATE_DELIVERY, std::memory_order_release); + state_.futexWake(1); + } else { + /// Multi-poster version + /// + while (true) { + uint32_t before = state_.load(std::memory_order_acquire); + + if (before == INIT && + state_.compare_exchange_strong(before, EARLY_DELIVERY)) { + return; + } + + if (before == TIMED_OUT) { + return; + } + + if (before == EARLY_DELIVERY || before == LATE_DELIVERY) { + // The reason for not simply returning (without the following + // atomic operation) is to avoid the following case: + // + // T1: T2: T3: + // local1.post(); local2.post(); global.wait(); + // global.post(); global.post(); local1.try_wait() == true; + // local2.try_wait() == false; + // + if (state_.fetch_add(0) != before) { + continue; + } + return; + } + + assert(before == WAITING); + if (!state_.compare_exchange_weak(before, LATE_DELIVERY)) { + continue; + } + state_.futexWake(1); + return; + } + } + } + + /// Waits until post() has been called in the current Baton lifetime. + /// May be called at most once during a Baton lifetime (construction + /// |reset until destruction|reset). If post is called before wait in + /// the current lifetime then this method returns immediately. + /// + /// The restriction that there can be at most one wait() per lifetime + /// could be relaxed somewhat without any perf or size regressions, + /// but by making this condition very restrictive we can provide better + /// checking in debug builds. + void wait() { + if (spinWaitForEarlyDelivery()) { + assert(state_.load(std::memory_order_acquire) == EARLY_DELIVERY); + return; + } + + if (!Blocking) { + while (!try_wait()) { + std::this_thread::yield(); + } + return; + } + + // guess we have to block :( + uint32_t expected = INIT; + if (!state_.compare_exchange_strong(expected, WAITING)) { + // CAS failed, last minute reprieve + assert(expected == EARLY_DELIVERY); + return; + } + + while (true) { + detail::MemoryIdler::futexWait(state_, WAITING); + + // state_ is the truth even if FUTEX_WAIT reported a matching + // FUTEX_WAKE, since we aren't using type-stable storage and we + // don't guarantee reuse. The scenario goes like this: thread + // A's last touch of a Baton is a call to wake(), which stores + // LATE_DELIVERY and gets an unlucky context switch before delivering + // the corresponding futexWake. Thread B sees LATE_DELIVERY + // without consuming a futex event, because it calls futexWait + // with an expected value of WAITING and hence doesn't go to sleep. + // B returns, so the Baton's memory is reused and becomes another + // Baton (or a reuse of this one). B calls futexWait on the new + // Baton lifetime, then A wakes up and delivers a spurious futexWake + // to the same memory location. B's futexWait will then report a + // consumed wake event even though state_ is still WAITING. + // + // It would be possible to add an extra state_ dance to communicate + // that the futexWake has been sent so that we can be sure to consume + // it before returning, but that would be a perf and complexity hit. + uint32_t s = state_.load(std::memory_order_acquire); + assert(s == WAITING || s == LATE_DELIVERY); + + if (s == LATE_DELIVERY) { + return; + } + // retry + } + } + + /// Similar to wait, but with a timeout. The thread is unblocked if the + /// timeout expires. + /// Note: Only a single call to timed_wait/wait is allowed during a baton's + /// life-cycle (from construction/reset to destruction/reset). In other + /// words, after timed_wait the caller can't invoke wait/timed_wait/try_wait + /// again on the same baton without resetting it. + /// + /// @param deadline Time until which the thread can block + /// @return true if the baton was posted to before timeout, + /// false otherwise + template + bool timed_wait(const std::chrono::time_point& deadline) { + static_assert(Blocking, "Non-blocking Baton does not support timed wait."); + + if (spinWaitForEarlyDelivery()) { + assert(state_.load(std::memory_order_acquire) == EARLY_DELIVERY); + return true; + } + + // guess we have to block :( + uint32_t expected = INIT; + if (!state_.compare_exchange_strong(expected, WAITING)) { + // CAS failed, last minute reprieve + assert(expected == EARLY_DELIVERY); + return true; + } + + while (true) { + auto rv = state_.futexWaitUntil(WAITING, deadline); + if (rv == folly::detail::FutexResult::TIMEDOUT) { + state_.store(TIMED_OUT, std::memory_order_release); + return false; + } + + uint32_t s = state_.load(std::memory_order_acquire); + assert(s == WAITING || s == LATE_DELIVERY); + if (s == LATE_DELIVERY) { + return true; + } + } + } + + /// Similar to timed_wait, but with a duration. + template + bool timed_wait(const Duration& duration) { + auto deadline = Clock::now() + duration; + return timed_wait(deadline); + } + + /// Similar to wait, but doesn't block the thread if it hasn't been posted. + /// + /// try_wait has the following semantics: + /// - It is ok to call try_wait any number times on the same baton until + /// try_wait reports that the baton has been posted. + /// - It is ok to call timed_wait or wait on the same baton if try_wait + /// reports that baton hasn't been posted. + /// - If try_wait indicates that the baton has been posted, it is invalid to + /// call wait, try_wait or timed_wait on the same baton without resetting + /// + /// @return true if baton has been posted, false othewise + bool try_wait() const { + auto s = state_.load(std::memory_order_acquire); + assert(s == INIT || s == EARLY_DELIVERY); + return s == EARLY_DELIVERY; + } + + private: + enum State : uint32_t { + INIT = 0, + EARLY_DELIVERY = 1, + WAITING = 2, + LATE_DELIVERY = 3, + TIMED_OUT = 4 + }; + + enum { + // Must be positive. If multiple threads are actively using a + // higher-level data structure that uses batons internally, it is + // likely that the post() and wait() calls happen almost at the same + // time. In this state, we lose big 50% of the time if the wait goes + // to sleep immediately. On circa-2013 devbox hardware it costs about + // 7 usec to FUTEX_WAIT and then be awoken (half the t/iter as the + // posix_sem_pingpong test in BatonTests). We can improve our chances + // of EARLY_DELIVERY by spinning for a bit, although we have to balance + // this against the loss if we end up sleeping any way. Spins on this + // hw take about 7 nanos (all but 0.5 nanos is the pause instruction). + // We give ourself 300 spins, which is about 2 usec of waiting. As a + // partial consolation, since we are using the pause instruction we + // are giving a speed boost to the colocated hyperthread. + PreBlockAttempts = 300, + }; + + // Spin for "some time" (see discussion on PreBlockAttempts) waiting + // for a post. + // + // @return true if we received an early delivery during the wait, + // false otherwise. If the function returns true then + // state_ is guaranteed to be EARLY_DELIVERY + bool spinWaitForEarlyDelivery() { + + static_assert(PreBlockAttempts > 0, + "isn't this assert clearer than an uninitialized variable warning?"); + for (int i = 0; i < PreBlockAttempts; ++i) { + if (try_wait()) { + // hooray! + return true; + } + // The pause instruction is the polite way to spin, but it doesn't + // actually affect correctness to omit it if we don't have it. + // Pausing donates the full capabilities of the current core to + // its other hyperthreads for a dozen cycles or so + asm_volatile_pause(); + } + + return false; + } + + detail::Futex state_; +}; + +} // namespace folly diff --git a/folly/synchronization/LifoSem.h b/folly/synchronization/LifoSem.h index 90176094..ef27df4e 100644 --- a/folly/synchronization/LifoSem.h +++ b/folly/synchronization/LifoSem.h @@ -24,10 +24,10 @@ #include #include -#include #include #include #include +#include namespace folly { diff --git a/folly/synchronization/test/BatonBenchmark.cpp b/folly/synchronization/test/BatonBenchmark.cpp new file mode 100644 index 00000000..25126282 --- /dev/null +++ b/folly/synchronization/test/BatonBenchmark.cpp @@ -0,0 +1,104 @@ +/* + * Copyright 2017 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +using namespace folly; +using namespace folly::test; +using folly::detail::EmulatedFutexAtomic; + +typedef DeterministicSchedule DSched; + +BENCHMARK(baton_pingpong_single_poster_blocking, iters) { + run_pingpong_test(iters); +} + +BENCHMARK(baton_pingpong_multi_poster_blocking, iters) { + run_pingpong_test(iters); +} + +BENCHMARK(baton_pingpong_single_poster_nonblocking, iters) { + run_pingpong_test(iters); +} + +BENCHMARK(baton_pingpong_multi_poster_nonblocking, iters) { + run_pingpong_test(iters); +} + +BENCHMARK_DRAW_LINE() + +BENCHMARK(baton_pingpong_emulated_futex_single_poster_blocking, iters) { + run_pingpong_test(iters); +} + +BENCHMARK(baton_pingpong_emulated_futex_multi_poster_blocking, iters) { + run_pingpong_test(iters); +} + +BENCHMARK(baton_pingpong_emulated_futex_single_poster_nonblocking, iters) { + run_pingpong_test(iters); +} + +BENCHMARK(baton_pingpong_emulated_futex_multi_poster_nonblocking, iters) { + run_pingpong_test(iters); +} + +BENCHMARK_DRAW_LINE() + +BENCHMARK(posix_sem_pingpong, iters) { + sem_t sems[3]; + sem_t* a = sems + 0; + sem_t* b = sems + 2; // to get it on a different cache line + + sem_init(a, 0, 0); + sem_init(b, 0, 0); + auto thr = std::thread([=] { + for (size_t i = 0; i < iters; ++i) { + sem_wait(a); + sem_post(b); + } + }); + for (size_t i = 0; i < iters; ++i) { + sem_post(a); + sem_wait(b); + } + thr.join(); +} + +// I am omitting a benchmark result snapshot because these microbenchmarks +// mainly illustrate that PreBlockAttempts is very effective for rapid +// handoffs. The performance of Baton and sem_t is essentially identical +// to the required futex calls for the blocking case + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + auto rv = RUN_ALL_TESTS(); + if (!rv && FLAGS_benchmark) { + folly::runBenchmarks(); + } + return rv; +} diff --git a/folly/synchronization/test/BatonTest.cpp b/folly/synchronization/test/BatonTest.cpp new file mode 100644 index 00000000..2e6f5164 --- /dev/null +++ b/folly/synchronization/test/BatonTest.cpp @@ -0,0 +1,285 @@ +/* + * Copyright 2017 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +using namespace folly; +using namespace folly::test; +using folly::detail::EmulatedFutexAtomic; + +/// Basic test + +TEST(Baton, basic_single_poster_blocking) { + run_basic_test(); + run_basic_test(); + run_basic_test(); +} + +TEST(Baton, basic_single_poster_nonblocking) { + run_basic_test(); + run_basic_test(); + run_basic_test(); +} + +TEST(Baton, basic_multi_poster_blocking) { + run_basic_test(); +} + +TEST(Baton, basic_multi_poster_nonblocking) { + run_basic_test(); +} + +/// Ping pong tests + +TEST(Baton, pingpong_single_poster_blocking) { + DSched sched(DSched::uniform(0)); + + run_pingpong_test(1000); +} + +TEST(Baton, pingpong_single_poster_nonblocking) { + DSched sched(DSched::uniform(0)); + + run_pingpong_test(1000); +} + +TEST(Baton, pingpong_multi_poster_blocking) { + DSched sched(DSched::uniform(0)); + + run_pingpong_test(1000); +} + +TEST(Baton, pingpong_multi_poster_nonblocking) { + DSched sched(DSched::uniform(0)); + + run_pingpong_test(1000); +} + +/// Timed wait tests - Nonblocking Baton does not support timed_wait() + +// Timed wait basic system clock tests + +TEST(Baton, timed_wait_basic_system_clock_single_poster) { + run_basic_timed_wait_tests(); + run_basic_timed_wait_tests< + EmulatedFutexAtomic, + std::chrono::system_clock, + true>(); + run_basic_timed_wait_tests< + DeterministicAtomic, + std::chrono::system_clock, + true>(); +} + +TEST(Baton, timed_wait_basic_system_clock_multi_poster) { + run_basic_timed_wait_tests(); + run_basic_timed_wait_tests< + EmulatedFutexAtomic, + std::chrono::system_clock, + false>(); + run_basic_timed_wait_tests< + DeterministicAtomic, + std::chrono::system_clock, + false>(); +} + +// Timed wait timeout system clock tests + +TEST(Baton, timed_wait_timeout_system_clock_single_poster) { + run_timed_wait_tmo_tests(); + run_timed_wait_tmo_tests< + EmulatedFutexAtomic, + std::chrono::system_clock, + true>(); + run_timed_wait_tmo_tests< + DeterministicAtomic, + std::chrono::system_clock, + true>(); +} + +TEST(Baton, timed_wait_timeout_system_clock_multi_poster) { + run_timed_wait_tmo_tests(); + run_timed_wait_tmo_tests< + EmulatedFutexAtomic, + std::chrono::system_clock, + false>(); + run_timed_wait_tmo_tests< + DeterministicAtomic, + std::chrono::system_clock, + false>(); +} + +// Timed wait regular system clock tests + +TEST(Baton, timed_wait_system_clock_single_poster) { + run_timed_wait_regular_test(); + run_timed_wait_regular_test< + EmulatedFutexAtomic, + std::chrono::system_clock, + true>(); + run_timed_wait_regular_test< + DeterministicAtomic, + std::chrono::system_clock, + true>(); +} + +TEST(Baton, timed_wait_system_clock_multi_poster) { + run_timed_wait_regular_test(); + run_timed_wait_regular_test< + EmulatedFutexAtomic, + std::chrono::system_clock, + false>(); + run_timed_wait_regular_test< + DeterministicAtomic, + std::chrono::system_clock, + false>(); +} + +// Timed wait basic steady clock tests + +TEST(Baton, timed_wait_basic_steady_clock_single_poster) { + run_basic_timed_wait_tests(); + run_basic_timed_wait_tests< + EmulatedFutexAtomic, + std::chrono::steady_clock, + true>(); + run_basic_timed_wait_tests< + DeterministicAtomic, + std::chrono::steady_clock, + true>(); +} + +TEST(Baton, timed_wait_basic_steady_clock_multi_poster) { + run_basic_timed_wait_tests(); + run_basic_timed_wait_tests< + EmulatedFutexAtomic, + std::chrono::steady_clock, + false>(); + run_basic_timed_wait_tests< + DeterministicAtomic, + std::chrono::steady_clock, + false>(); +} + +// Timed wait timeout steady clock tests + +TEST(Baton, timed_wait_timeout_steady_clock_single_poster) { + run_timed_wait_tmo_tests(); + run_timed_wait_tmo_tests< + EmulatedFutexAtomic, + std::chrono::steady_clock, + true>(); + run_timed_wait_tmo_tests< + DeterministicAtomic, + std::chrono::steady_clock, + true>(); +} + +TEST(Baton, timed_wait_timeout_steady_clock_multi_poster) { + run_timed_wait_tmo_tests(); + run_timed_wait_tmo_tests< + EmulatedFutexAtomic, + std::chrono::steady_clock, + false>(); + run_timed_wait_tmo_tests< + DeterministicAtomic, + std::chrono::steady_clock, + false>(); +} + +// Timed wait regular steady clock tests + +TEST(Baton, timed_wait_steady_clock_single_poster) { + run_timed_wait_regular_test(); + run_timed_wait_regular_test< + EmulatedFutexAtomic, + std::chrono::steady_clock, + true>(); + run_timed_wait_regular_test< + DeterministicAtomic, + std::chrono::steady_clock, + true>(); +} + +TEST(Baton, timed_wait_steady_clock_multi_poster) { + run_timed_wait_regular_test(); + run_timed_wait_regular_test< + EmulatedFutexAtomic, + std::chrono::steady_clock, + false>(); + run_timed_wait_regular_test< + DeterministicAtomic, + std::chrono::steady_clock, + false>(); +} + +/// Try wait tests + +TEST(Baton, try_wait_single_poster_blocking) { + run_try_wait_tests(); + run_try_wait_tests(); + run_try_wait_tests(); +} + +TEST(Baton, try_wait_single_poster_nonblocking) { + run_try_wait_tests(); + run_try_wait_tests(); + run_try_wait_tests(); +} + +TEST(Baton, try_wait_multi_poster_blocking) { + run_try_wait_tests(); + run_try_wait_tests(); + run_try_wait_tests(); +} + +TEST(Baton, try_wait_multi_poster_nonblocking) { + run_try_wait_tests(); + run_try_wait_tests(); + run_try_wait_tests(); +} + +/// Multi-producer tests + +TEST(Baton, multi_producer_single_poster_blocking) { + run_try_wait_tests(); + run_try_wait_tests(); + run_try_wait_tests(); +} + +TEST(Baton, multi_producer_single_poster_nonblocking) { + run_try_wait_tests(); + run_try_wait_tests(); + run_try_wait_tests(); +} + +TEST(Baton, multi_producer_multi_poster_blocking) { + run_try_wait_tests(); + run_try_wait_tests(); + run_try_wait_tests(); +} + +TEST(Baton, multi_producer_multi_poster_nonblocking) { + run_try_wait_tests(); + run_try_wait_tests(); + run_try_wait_tests(); +} diff --git a/folly/synchronization/test/BatonTestHelpers.h b/folly/synchronization/test/BatonTestHelpers.h new file mode 100644 index 00000000..6e242e2a --- /dev/null +++ b/folly/synchronization/test/BatonTestHelpers.h @@ -0,0 +1,171 @@ +/* + * Copyright 2017 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace folly { +namespace test { + +typedef DeterministicSchedule DSched; + +template