folly/synchronization/Baton.h

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <assert.h>
  20 #include <errno.h>
  21 #include <stdint.h>
  22 #include <atomic>
  23 #include <thread>
  24
  25 #include <folly/detail/Futex.h>
  26 #include <folly/detail/MemoryIdler.h>
  27 #include <folly/portability/Asm.h>
  28
  29 namespace folly {
  30
  31 /// A Baton allows a thread to block once and be awoken. The single
  32 /// poster version (with SinglePoster == true) captures a single
  33 /// handoff, and during its lifecycle (from construction/reset to
  34 /// destruction/reset) a baton must either be post()ed and wait()ed
  35 /// exactly once each, or not at all.
  36 ///
  37 /// The multi-poster version (SinglePoster == false) allows multiple
  38 /// concurrent handoff attempts, the first of which completes the
  39 /// handoff and the rest if any are idempotent.
  40 ///
  41 /// Baton includes no internal padding, and is only 4 bytes in size.
  42 /// Any alignment or padding to avoid false sharing is up to the user.
  43 ///
  44 /// This is basically a stripped-down semaphore that supports (only a
  45 /// single call to sem_post, when SinglePoster == true) and a single
  46 /// call to sem_wait.
  47 ///
  48 /// The non-blocking version (Blocking == false) provides more speed
  49 /// by using only load acquire and store release operations in the
  50 /// critical path, at the cost of disallowing blocking and timing out.
  51 ///
  52 /// The current posix semaphore sem_t isn't too bad, but this provides
  53 /// more a bit more speed, inlining, smaller size, a guarantee that
  54 /// the implementation won't change, and compatibility with
  55 /// DeterministicSchedule.  By having a much more restrictive
  56 /// lifecycle we can also add a bunch of assertions that can help to
  57 /// catch race conditions ahead of time.
  58 template <
  59     template <typename> class Atom = std::atomic,
  60     bool SinglePoster = true, //  single vs multiple posters
  61     bool Blocking = true> // blocking vs spinning
  62 struct Baton {
  63   constexpr Baton() : state_(INIT) {}
  64
  65   Baton(Baton const&) = delete;
  66   Baton& operator=(Baton const&) = delete;
  67
  68   /// It is an error to destroy a Baton on which a thread is currently
  69   /// wait()ing.  In practice this means that the waiter usually takes
  70   /// responsibility for destroying the Baton.
  71   ~Baton() {
  72     // The docblock for this function says that it can't be called when
  73     // there is a concurrent waiter.  We assume a strong version of this
  74     // requirement in which the caller must _know_ that this is true, they
  75     // are not allowed to be merely lucky.  If two threads are involved,
  76     // the destroying thread must actually have synchronized with the
  77     // waiting thread after wait() returned.  To convey causality the the
  78     // waiting thread must have used release semantics and the destroying
  79     // thread must have used acquire semantics for that communication,
  80     // so we are guaranteed to see the post-wait() value of state_,
  81     // which cannot be WAITING.
  82     //
  83     // Note that since we only care about a single memory location,
  84     // the only two plausible memory orders here are relaxed and seq_cst.
  85     assert(state_.load(std::memory_order_relaxed) != WAITING);
  86   }
  87
  88   /// Equivalent to destroying the Baton and creating a new one.  It is
  89   /// a bug to call this while there is a waiting thread, so in practice
  90   /// the waiter will be the one that resets the baton.
  91   void reset() {
  92     // See ~Baton for a discussion about why relaxed is okay here
  93     assert(state_.load(std::memory_order_relaxed) != WAITING);
  94
  95     // We use a similar argument to justify the use of a relaxed store
  96     // here.  Since both wait() and post() are required to be called
  97     // only once per lifetime, no thread can actually call those methods
  98     // correctly after a reset() unless it synchronizes with the thread
  99     // that performed the reset().  If a post() or wait() on another thread
 100     // didn't synchronize, then regardless of what operation we performed
 101     // here there would be a race on proper use of the Baton's spec
 102     // (although not on any particular load and store).  Put another way,
 103     // we don't need to synchronize here because anybody that might rely
 104     // on such synchronization is required by the baton rules to perform
 105     // an additional synchronization that has the desired effect anyway.
 106     //
 107     // There is actually a similar argument to be made about the
 108     // constructor, in which the fenceless constructor initialization
 109     // of state_ is piggybacked on whatever synchronization mechanism
 110     // distributes knowledge of the Baton's existence
 111     state_.store(INIT, std::memory_order_relaxed);
 112   }
 113
 114   /// Causes wait() to wake up.  For each lifetime of a Baton (where a
 115   /// lifetime starts at construction or reset() and ends at
 116   /// destruction or reset()) there can be at most one call to post(),
 117   /// in the single poster version.  Any thread may call post().
 118   void post() {
 119     if (!Blocking) {
 120       /// Non-blocking version
 121       ///
 122       assert([&] {
 123         auto state = state_.load(std::memory_order_relaxed);
 124         return (state == INIT || state == EARLY_DELIVERY);
 125       }());
 126       state_.store(EARLY_DELIVERY, std::memory_order_release);
 127       return;
 128     }
 129
 130     /// Blocking versions
 131     ///
 132     if (SinglePoster) {
 133       /// Single poster version
 134       ///
 135       uint32_t before = state_.load(std::memory_order_acquire);
 136
 137       assert(before == INIT || before == WAITING || before == TIMED_OUT);
 138
 139       if (before == INIT &&
 140           state_.compare_exchange_strong(before, EARLY_DELIVERY)) {
 141         return;
 142       }
 143
 144       assert(before == WAITING || before == TIMED_OUT);
 145
 146       if (before == TIMED_OUT) {
 147         return;
 148       }
 149
 150       assert(before == WAITING);
 151       state_.store(LATE_DELIVERY, std::memory_order_release);
 152       state_.futexWake(1);
 153     } else {
 154       /// Multi-poster version
 155       ///
 156       while (true) {
 157         uint32_t before = state_.load(std::memory_order_acquire);
 158
 159         if (before == INIT &&
 160             state_.compare_exchange_strong(before, EARLY_DELIVERY)) {
 161           return;
 162         }
 163
 164         if (before == TIMED_OUT) {
 165           return;
 166         }
 167
 168         if (before == EARLY_DELIVERY || before == LATE_DELIVERY) {
 169           // The reason for not simply returning (without the following
 170           // atomic operation) is to avoid the following case:
 171           //
 172           //  T1:             T2:             T3:
 173           //  local1.post();  local2.post();  global.wait();
 174           //  global.post();  global.post();  local1.try_wait() == true;
 175           //                                  local2.try_wait() == false;
 176           //
 177           if (state_.fetch_add(0) != before) {
 178             continue;
 179           }
 180           return;
 181         }
 182
 183         assert(before == WAITING);
 184         if (!state_.compare_exchange_weak(before, LATE_DELIVERY)) {
 185           continue;
 186         }
 187         state_.futexWake(1);
 188         return;
 189       }
 190     }
 191   }
 192
 193   /// Waits until post() has been called in the current Baton lifetime.
 194   /// May be called at most once during a Baton lifetime (construction
 195   /// |reset until destruction|reset).  If post is called before wait in
 196   /// the current lifetime then this method returns immediately.
 197   ///
 198   /// The restriction that there can be at most one wait() per lifetime
 199   /// could be relaxed somewhat without any perf or size regressions,
 200   /// but by making this condition very restrictive we can provide better
 201   /// checking in debug builds.
 202   void wait() {
 203     if (spinWaitForEarlyDelivery()) {
 204       assert(state_.load(std::memory_order_acquire) == EARLY_DELIVERY);
 205       return;
 206     }
 207
 208     if (!Blocking) {
 209       while (!try_wait()) {
 210         std::this_thread::yield();
 211       }
 212       return;
 213     }
 214
 215     // guess we have to block :(
 216     uint32_t expected = INIT;
 217     if (!state_.compare_exchange_strong(expected, WAITING)) {
 218       // CAS failed, last minute reprieve
 219       assert(expected == EARLY_DELIVERY);
 220       return;
 221     }
 222
 223     while (true) {
 224       detail::MemoryIdler::futexWait(state_, WAITING);
 225
 226       // state_ is the truth even if FUTEX_WAIT reported a matching
 227       // FUTEX_WAKE, since we aren't using type-stable storage and we
 228       // don't guarantee reuse.  The scenario goes like this: thread
 229       // A's last touch of a Baton is a call to wake(), which stores
 230       // LATE_DELIVERY and gets an unlucky context switch before delivering
 231       // the corresponding futexWake.  Thread B sees LATE_DELIVERY
 232       // without consuming a futex event, because it calls futexWait
 233       // with an expected value of WAITING and hence doesn't go to sleep.
 234       // B returns, so the Baton's memory is reused and becomes another
 235       // Baton (or a reuse of this one).  B calls futexWait on the new
 236       // Baton lifetime, then A wakes up and delivers a spurious futexWake
 237       // to the same memory location.  B's futexWait will then report a
 238       // consumed wake event even though state_ is still WAITING.
 239       //
 240       // It would be possible to add an extra state_ dance to communicate
 241       // that the futexWake has been sent so that we can be sure to consume
 242       // it before returning, but that would be a perf and complexity hit.
 243       uint32_t s = state_.load(std::memory_order_acquire);
 244       assert(s == WAITING || s == LATE_DELIVERY);
 245
 246       if (s == LATE_DELIVERY) {
 247         return;
 248       }
 249       // retry
 250     }
 251   }
 252
 253   /// Similar to wait, but doesn't block the thread if it hasn't been posted.
 254   ///
 255   /// try_wait has the following semantics:
 256   /// - It is ok to call try_wait any number times on the same baton until
 257   ///   try_wait reports that the baton has been posted.
 258   /// - It is ok to call timed_wait or wait on the same baton if try_wait
 259   ///   reports that baton hasn't been posted.
 260   /// - If try_wait indicates that the baton has been posted, it is invalid to
 261   ///   call wait, try_wait or timed_wait on the same baton without resetting
 262   ///
 263   /// @return       true if baton has been posted, false othewise
 264   bool try_wait() const {
 265     auto s = state_.load(std::memory_order_acquire);
 266     assert(s == INIT || s == EARLY_DELIVERY);
 267     return s == EARLY_DELIVERY;
 268   }
 269
 270   /// Similar to wait, but with a timeout. The thread is unblocked if the
 271   /// timeout expires.
 272   /// Note: Only a single call to wait/try_wait_for/try_wait_until is allowed
 273   /// during a baton's life-cycle (from ctor/reset to dtor/reset). In other
 274   /// words, after try_wait_for the caller can't invoke
 275   /// wait/try_wait/try_wait_for/try_wait_until
 276   /// again on the same baton without resetting it.
 277   ///
 278   /// @param  timeout       Time until which the thread can block
 279   /// @return               true if the baton was posted to before timeout,
 280   ///                       false otherwise
 281   template <typename Rep, typename Period>
 282   bool try_wait_for(const std::chrono::duration<Rep, Period>& timeout) {
 283     static_assert(
 284         Blocking, "Non-blocking Baton does not support try_wait_for.");
 285
 286     auto deadline = std::chrono::steady_clock::now() + timeout;
 287     return try_wait_until(deadline);
 288   }
 289
 290   /// Similar to wait, but with a deadline. The thread is unblocked if the
 291   /// deadline expires.
 292   /// Note: Only a single call to wait/try_wait_for/try_wait_until is allowed
 293   /// during a baton's life-cycle (from ctor/reset to dtor/reset). In other
 294   /// words, after try_wait_until the caller can't invoke
 295   /// wait/try_wait/try_wait_for/try_wait_until
 296   /// again on the same baton without resetting it.
 297   ///
 298   /// @param  deadline      Time until which the thread can block
 299   /// @return               true if the baton was posted to before deadline,
 300   ///                       false otherwise
 301   template <typename Clock, typename Duration>
 302   bool try_wait_until(
 303       const std::chrono::time_point<Clock, Duration>& deadline) {
 304     static_assert(
 305         Blocking, "Non-blocking Baton does not support try_wait_until.");
 306
 307     if (spinWaitForEarlyDelivery()) {
 308       assert(state_.load(std::memory_order_acquire) == EARLY_DELIVERY);
 309       return true;
 310     }
 311
 312     // guess we have to block :(
 313     uint32_t expected = INIT;
 314     if (!state_.compare_exchange_strong(expected, WAITING)) {
 315       // CAS failed, last minute reprieve
 316       assert(expected == EARLY_DELIVERY);
 317       return true;
 318     }
 319
 320     while (true) {
 321       auto rv = state_.futexWaitUntil(WAITING, deadline);
 322       if (rv == folly::detail::FutexResult::TIMEDOUT) {
 323         state_.store(TIMED_OUT, std::memory_order_release);
 324         return false;
 325       }
 326
 327       uint32_t s = state_.load(std::memory_order_acquire);
 328       assert(s == WAITING || s == LATE_DELIVERY);
 329       if (s == LATE_DELIVERY) {
 330         return true;
 331       }
 332     }
 333   }
 334
 335   /// Alias to try_wait_for. Deprecated.
 336   template <typename Rep, typename Period>
 337   bool timed_wait(const std::chrono::duration<Rep, Period>& timeout) {
 338     return try_wait_for(timeout);
 339   }
 340
 341   /// Alias to try_wait_until. Deprecated.
 342   template <typename Clock, typename Duration>
 343   bool timed_wait(const std::chrono::time_point<Clock, Duration>& deadline) {
 344     return try_wait_until(deadline);
 345   }
 346
 347  private:
 348   enum State : uint32_t {
 349     INIT = 0,
 350     EARLY_DELIVERY = 1,
 351     WAITING = 2,
 352     LATE_DELIVERY = 3,
 353     TIMED_OUT = 4
 354   };
 355
 356   enum {
 357     // Must be positive.  If multiple threads are actively using a
 358     // higher-level data structure that uses batons internally, it is
 359     // likely that the post() and wait() calls happen almost at the same
 360     // time.  In this state, we lose big 50% of the time if the wait goes
 361     // to sleep immediately.  On circa-2013 devbox hardware it costs about
 362     // 7 usec to FUTEX_WAIT and then be awoken (half the t/iter as the
 363     // posix_sem_pingpong test in BatonTests).  We can improve our chances
 364     // of EARLY_DELIVERY by spinning for a bit, although we have to balance
 365     // this against the loss if we end up sleeping any way.  Spins on this
 366     // hw take about 7 nanos (all but 0.5 nanos is the pause instruction).
 367     // We give ourself 300 spins, which is about 2 usec of waiting.  As a
 368     // partial consolation, since we are using the pause instruction we
 369     // are giving a speed boost to the colocated hyperthread.
 370     PreBlockAttempts = 300,
 371   };
 372
 373   // Spin for "some time" (see discussion on PreBlockAttempts) waiting
 374   // for a post.
 375   //
 376   // @return       true if we received an early delivery during the wait,
 377   //               false otherwise. If the function returns true then
 378   //               state_ is guaranteed to be EARLY_DELIVERY
 379   bool spinWaitForEarlyDelivery() {
 380
 381     static_assert(PreBlockAttempts > 0,
 382         "isn't this assert clearer than an uninitialized variable warning?");
 383     for (int i = 0; i < PreBlockAttempts; ++i) {
 384       if (try_wait()) {
 385         // hooray!
 386         return true;
 387       }
 388       // The pause instruction is the polite way to spin, but it doesn't
 389       // actually affect correctness to omit it if we don't have it.
 390       // Pausing donates the full capabilities of the current core to
 391       // its other hyperthreads for a dozen cycles or so
 392       asm_volatile_pause();
 393     }
 394
 395     return false;
 396   }
 397
 398   detail::Futex<Atom> state_;
 399 };
 400
 401 } // namespace folly