folly/synchronization/Baton.h

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <assert.h>
  20 #include <errno.h>
  21 #include <stdint.h>
  22 #include <atomic>
  23 #include <thread>
  24
  25 #include <folly/Likely.h>
  26 #include <folly/detail/Futex.h>
  27 #include <folly/detail/MemoryIdler.h>
  28 #include <folly/portability/Asm.h>
  29
  30 namespace folly {
  31
  32 /// A Baton allows a thread to block once and be awoken. The single
  33 /// poster version (with SinglePoster == true) captures a single
  34 /// handoff, and during its lifecycle (from construction/reset to
  35 /// destruction/reset) a baton must either be post()ed and wait()ed
  36 /// exactly once each, or not at all.
  37 ///
  38 /// The multi-poster version (SinglePoster == false) allows multiple
  39 /// concurrent handoff attempts, the first of which completes the
  40 /// handoff and the rest if any are idempotent.
  41 ///
  42 /// Baton includes no internal padding, and is only 4 bytes in size.
  43 /// Any alignment or padding to avoid false sharing is up to the user.
  44 ///
  45 /// This is basically a stripped-down semaphore that supports (only a
  46 /// single call to sem_post, when SinglePoster == true) and a single
  47 /// call to sem_wait.
  48 ///
  49 /// The non-blocking version (Blocking == false) provides more speed
  50 /// by using only load acquire and store release operations in the
  51 /// critical path, at the cost of disallowing blocking and timing out.
  52 ///
  53 /// The current posix semaphore sem_t isn't too bad, but this provides
  54 /// more a bit more speed, inlining, smaller size, a guarantee that
  55 /// the implementation won't change, and compatibility with
  56 /// DeterministicSchedule.  By having a much more restrictive
  57 /// lifecycle we can also add a bunch of assertions that can help to
  58 /// catch race conditions ahead of time.
  59 template <
  60     template <typename> class Atom = std::atomic,
  61     bool SinglePoster = true, //  single vs multiple posters
  62     bool Blocking = true> // blocking vs spinning
  63 struct Baton {
  64   constexpr Baton() : state_(INIT) {}
  65
  66   Baton(Baton const&) = delete;
  67   Baton& operator=(Baton const&) = delete;
  68
  69   /// It is an error to destroy a Baton on which a thread is currently
  70   /// wait()ing.  In practice this means that the waiter usually takes
  71   /// responsibility for destroying the Baton.
  72   ~Baton() {
  73     // The docblock for this function says that it can't be called when
  74     // there is a concurrent waiter.  We assume a strong version of this
  75     // requirement in which the caller must _know_ that this is true, they
  76     // are not allowed to be merely lucky.  If two threads are involved,
  77     // the destroying thread must actually have synchronized with the
  78     // waiting thread after wait() returned.  To convey causality the the
  79     // waiting thread must have used release semantics and the destroying
  80     // thread must have used acquire semantics for that communication,
  81     // so we are guaranteed to see the post-wait() value of state_,
  82     // which cannot be WAITING.
  83     //
  84     // Note that since we only care about a single memory location,
  85     // the only two plausible memory orders here are relaxed and seq_cst.
  86     assert(state_.load(std::memory_order_relaxed) != WAITING);
  87   }
  88
  89   /// Equivalent to destroying the Baton and creating a new one.  It is
  90   /// a bug to call this while there is a waiting thread, so in practice
  91   /// the waiter will be the one that resets the baton.
  92   void reset() {
  93     // See ~Baton for a discussion about why relaxed is okay here
  94     assert(state_.load(std::memory_order_relaxed) != WAITING);
  95
  96     // We use a similar argument to justify the use of a relaxed store
  97     // here.  Since both wait() and post() are required to be called
  98     // only once per lifetime, no thread can actually call those methods
  99     // correctly after a reset() unless it synchronizes with the thread
 100     // that performed the reset().  If a post() or wait() on another thread
 101     // didn't synchronize, then regardless of what operation we performed
 102     // here there would be a race on proper use of the Baton's spec
 103     // (although not on any particular load and store).  Put another way,
 104     // we don't need to synchronize here because anybody that might rely
 105     // on such synchronization is required by the baton rules to perform
 106     // an additional synchronization that has the desired effect anyway.
 107     //
 108     // There is actually a similar argument to be made about the
 109     // constructor, in which the fenceless constructor initialization
 110     // of state_ is piggybacked on whatever synchronization mechanism
 111     // distributes knowledge of the Baton's existence
 112     state_.store(INIT, std::memory_order_relaxed);
 113   }
 114
 115   /// Causes wait() to wake up.  For each lifetime of a Baton (where a
 116   /// lifetime starts at construction or reset() and ends at
 117   /// destruction or reset()) there can be at most one call to post(),
 118   /// in the single poster version.  Any thread may call post().
 119   void post() {
 120     if (!Blocking) {
 121       /// Non-blocking version
 122       ///
 123       assert([&] {
 124         auto state = state_.load(std::memory_order_relaxed);
 125         return (state == INIT || state == EARLY_DELIVERY);
 126       }());
 127       state_.store(EARLY_DELIVERY, std::memory_order_release);
 128       return;
 129     }
 130
 131     /// Blocking versions
 132     ///
 133     if (SinglePoster) {
 134       /// Single poster version
 135       ///
 136       uint32_t before = state_.load(std::memory_order_acquire);
 137
 138       assert(before == INIT || before == WAITING || before == TIMED_OUT);
 139
 140       if (before == INIT &&
 141           state_.compare_exchange_strong(before, EARLY_DELIVERY)) {
 142         return;
 143       }
 144
 145       assert(before == WAITING || before == TIMED_OUT);
 146
 147       if (before == TIMED_OUT) {
 148         return;
 149       }
 150
 151       assert(before == WAITING);
 152       state_.store(LATE_DELIVERY, std::memory_order_release);
 153       state_.futexWake(1);
 154     } else {
 155       /// Multi-poster version
 156       ///
 157       while (true) {
 158         uint32_t before = state_.load(std::memory_order_acquire);
 159
 160         if (before == INIT &&
 161             state_.compare_exchange_strong(before, EARLY_DELIVERY)) {
 162           return;
 163         }
 164
 165         if (before == TIMED_OUT) {
 166           return;
 167         }
 168
 169         if (before == EARLY_DELIVERY || before == LATE_DELIVERY) {
 170           // The reason for not simply returning (without the following
 171           // atomic operation) is to avoid the following case:
 172           //
 173           //  T1:             T2:             T3:
 174           //  local1.post();  local2.post();  global.wait();
 175           //  global.post();  global.post();  local1.try_wait() == true;
 176           //                                  local2.try_wait() == false;
 177           //
 178           if (state_.fetch_add(0) != before) {
 179             continue;
 180           }
 181           return;
 182         }
 183
 184         assert(before == WAITING);
 185         if (!state_.compare_exchange_weak(before, LATE_DELIVERY)) {
 186           continue;
 187         }
 188         state_.futexWake(1);
 189         return;
 190       }
 191     }
 192   }
 193
 194   /// Waits until post() has been called in the current Baton lifetime.
 195   /// May be called at most once during a Baton lifetime (construction
 196   /// |reset until destruction|reset).  If post is called before wait in
 197   /// the current lifetime then this method returns immediately.
 198   ///
 199   /// The restriction that there can be at most one wait() per lifetime
 200   /// could be relaxed somewhat without any perf or size regressions,
 201   /// but by making this condition very restrictive we can provide better
 202   /// checking in debug builds.
 203   FOLLY_ALWAYS_INLINE void wait() {
 204     if (try_wait()) {
 205       return;
 206     }
 207
 208     waitSlow();
 209   }
 210
 211   /// Similar to wait, but doesn't block the thread if it hasn't been posted.
 212   ///
 213   /// try_wait has the following semantics:
 214   /// - It is ok to call try_wait any number times on the same baton until
 215   ///   try_wait reports that the baton has been posted.
 216   /// - It is ok to call timed_wait or wait on the same baton if try_wait
 217   ///   reports that baton hasn't been posted.
 218   /// - If try_wait indicates that the baton has been posted, it is invalid to
 219   ///   call wait, try_wait or timed_wait on the same baton without resetting
 220   ///
 221   /// @return       true if baton has been posted, false othewise
 222   FOLLY_ALWAYS_INLINE bool try_wait() const {
 223     auto s = state_.load(std::memory_order_acquire);
 224     assert(s == INIT || s == EARLY_DELIVERY);
 225     return LIKELY(s == EARLY_DELIVERY);
 226   }
 227
 228   /// Similar to wait, but with a timeout. The thread is unblocked if the
 229   /// timeout expires.
 230   /// Note: Only a single call to wait/try_wait_for/try_wait_until is allowed
 231   /// during a baton's life-cycle (from ctor/reset to dtor/reset). In other
 232   /// words, after try_wait_for the caller can't invoke
 233   /// wait/try_wait/try_wait_for/try_wait_until
 234   /// again on the same baton without resetting it.
 235   ///
 236   /// @param  timeout       Time until which the thread can block
 237   /// @return               true if the baton was posted to before timeout,
 238   ///                       false otherwise
 239   template <typename Rep, typename Period>
 240   FOLLY_ALWAYS_INLINE bool try_wait_for(
 241       const std::chrono::duration<Rep, Period>& timeout) {
 242     static_assert(
 243         Blocking, "Non-blocking Baton does not support try_wait_for.");
 244
 245     if (try_wait()) {
 246       return true;
 247     }
 248
 249     auto deadline = std::chrono::steady_clock::now() + timeout;
 250     return tryWaitUntilSlow(deadline);
 251   }
 252
 253   /// Similar to wait, but with a deadline. The thread is unblocked if the
 254   /// deadline expires.
 255   /// Note: Only a single call to wait/try_wait_for/try_wait_until is allowed
 256   /// during a baton's life-cycle (from ctor/reset to dtor/reset). In other
 257   /// words, after try_wait_until the caller can't invoke
 258   /// wait/try_wait/try_wait_for/try_wait_until
 259   /// again on the same baton without resetting it.
 260   ///
 261   /// @param  deadline      Time until which the thread can block
 262   /// @return               true if the baton was posted to before deadline,
 263   ///                       false otherwise
 264   template <typename Clock, typename Duration>
 265   FOLLY_ALWAYS_INLINE bool try_wait_until(
 266       const std::chrono::time_point<Clock, Duration>& deadline) {
 267     static_assert(
 268         Blocking, "Non-blocking Baton does not support try_wait_until.");
 269
 270     if (try_wait()) {
 271       return true;
 272     }
 273
 274     return tryWaitUntilSlow(deadline);
 275   }
 276
 277   /// Alias to try_wait_for. Deprecated.
 278   template <typename Rep, typename Period>
 279   FOLLY_ALWAYS_INLINE bool timed_wait(
 280       const std::chrono::duration<Rep, Period>& timeout) {
 281     return try_wait_for(timeout);
 282   }
 283
 284   /// Alias to try_wait_until. Deprecated.
 285   template <typename Clock, typename Duration>
 286   FOLLY_ALWAYS_INLINE bool timed_wait(
 287       const std::chrono::time_point<Clock, Duration>& deadline) {
 288     return try_wait_until(deadline);
 289   }
 290
 291  private:
 292   enum State : uint32_t {
 293     INIT = 0,
 294     EARLY_DELIVERY = 1,
 295     WAITING = 2,
 296     LATE_DELIVERY = 3,
 297     TIMED_OUT = 4,
 298   };
 299
 300   enum {
 301     // Must be positive.  If multiple threads are actively using a
 302     // higher-level data structure that uses batons internally, it is
 303     // likely that the post() and wait() calls happen almost at the same
 304     // time.  In this state, we lose big 50% of the time if the wait goes
 305     // to sleep immediately.  On circa-2013 devbox hardware it costs about
 306     // 7 usec to FUTEX_WAIT and then be awoken (half the t/iter as the
 307     // posix_sem_pingpong test in BatonTests).  We can improve our chances
 308     // of EARLY_DELIVERY by spinning for a bit, although we have to balance
 309     // this against the loss if we end up sleeping any way.  Spins on this
 310     // hw take about 7 nanos (all but 0.5 nanos is the pause instruction).
 311     // We give ourself 300 spins, which is about 2 usec of waiting.  As a
 312     // partial consolation, since we are using the pause instruction we
 313     // are giving a speed boost to the colocated hyperthread.
 314     PreBlockAttempts = 300,
 315   };
 316
 317   // Spin for "some time" (see discussion on PreBlockAttempts) waiting
 318   // for a post.
 319   //
 320   // @return       true if we received an early delivery during the wait,
 321   //               false otherwise. If the function returns true then
 322   //               state_ is guaranteed to be EARLY_DELIVERY
 323   bool spinWaitForEarlyDelivery() {
 324     static_assert(
 325         PreBlockAttempts > 0,
 326         "isn't this assert clearer than an uninitialized variable warning?");
 327     for (int i = 0; i < PreBlockAttempts; ++i) {
 328       if (try_wait()) {
 329         return true;
 330       }
 331
 332       // The pause instruction is the polite way to spin, but it doesn't
 333       // actually affect correctness to omit it if we don't have it.
 334       // Pausing donates the full capabilities of the current core to
 335       // its other hyperthreads for a dozen cycles or so
 336       asm_volatile_pause();
 337     }
 338
 339     return false;
 340   }
 341
 342   FOLLY_NOINLINE void waitSlow() {
 343     if (spinWaitForEarlyDelivery()) {
 344       assert(state_.load(std::memory_order_acquire) == EARLY_DELIVERY);
 345       return;
 346     }
 347
 348     if (!Blocking) {
 349       while (!try_wait()) {
 350         std::this_thread::yield();
 351       }
 352       return;
 353     }
 354
 355     // guess we have to block :(
 356     uint32_t expected = INIT;
 357     if (!state_.compare_exchange_strong(expected, WAITING)) {
 358       // CAS failed, last minute reprieve
 359       assert(expected == EARLY_DELIVERY);
 360       return;
 361     }
 362
 363     while (true) {
 364       detail::MemoryIdler::futexWait(state_, WAITING);
 365
 366       // state_ is the truth even if FUTEX_WAIT reported a matching
 367       // FUTEX_WAKE, since we aren't using type-stable storage and we
 368       // don't guarantee reuse.  The scenario goes like this: thread
 369       // A's last touch of a Baton is a call to wake(), which stores
 370       // LATE_DELIVERY and gets an unlucky context switch before delivering
 371       // the corresponding futexWake.  Thread B sees LATE_DELIVERY
 372       // without consuming a futex event, because it calls futexWait
 373       // with an expected value of WAITING and hence doesn't go to sleep.
 374       // B returns, so the Baton's memory is reused and becomes another
 375       // Baton (or a reuse of this one).  B calls futexWait on the new
 376       // Baton lifetime, then A wakes up and delivers a spurious futexWake
 377       // to the same memory location.  B's futexWait will then report a
 378       // consumed wake event even though state_ is still WAITING.
 379       //
 380       // It would be possible to add an extra state_ dance to communicate
 381       // that the futexWake has been sent so that we can be sure to consume
 382       // it before returning, but that would be a perf and complexity hit.
 383       uint32_t s = state_.load(std::memory_order_acquire);
 384       assert(s == WAITING || s == LATE_DELIVERY);
 385
 386       if (s == LATE_DELIVERY) {
 387         return;
 388       }
 389       // retry
 390     }
 391   }
 392
 393   template <typename Clock, typename Duration>
 394   FOLLY_NOINLINE bool tryWaitUntilSlow(
 395       const std::chrono::time_point<Clock, Duration>& deadline) {
 396     if (spinWaitForEarlyDelivery()) {
 397       assert(state_.load(std::memory_order_acquire) == EARLY_DELIVERY);
 398       return true;
 399     }
 400
 401     // guess we have to block :(
 402     uint32_t expected = INIT;
 403     if (!state_.compare_exchange_strong(expected, WAITING)) {
 404       // CAS failed, last minute reprieve
 405       assert(expected == EARLY_DELIVERY);
 406       return true;
 407     }
 408
 409     while (true) {
 410       auto rv = state_.futexWaitUntil(WAITING, deadline);
 411       if (rv == folly::detail::FutexResult::TIMEDOUT) {
 412         state_.store(TIMED_OUT, std::memory_order_release);
 413         return false;
 414       }
 415
 416       uint32_t s = state_.load(std::memory_order_acquire);
 417       assert(s == WAITING || s == LATE_DELIVERY);
 418       if (s == LATE_DELIVERY) {
 419         return true;
 420       }
 421     }
 422   }
 423
 424   detail::Futex<Atom> state_;
 425 };
 426
 427 } // namespace folly