folly/Baton.h

   1 /*
   2  * Copyright 2014 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_BATON_H
  18 #define FOLLY_BATON_H
  19
  20 #include <stdint.h>
  21 #include <atomic>
  22 #include <boost/noncopyable.hpp>
  23 #include <errno.h>
  24 #include <assert.h>
  25
  26 #include <folly/detail/Futex.h>
  27
  28 namespace folly {
  29
  30 /// A Baton allows a thread to block once and be awoken: it captures
  31 /// a single handoff.  During its lifecycle (from construction/reset to
  32 /// destruction/reset) a baton must either be post()ed and wait()ed exactly
  33 /// once each, or not at all.
  34 ///
  35 /// Baton includes no internal padding, and is only 4 bytes in size.
  36 /// Any alignment or padding to avoid false sharing is up to the user.
  37 ///
  38 /// This is basically a stripped-down semaphore that supports only a
  39 /// single call to sem_post and a single call to sem_wait.  The current
  40 /// posix semaphore sem_t isn't too bad, but this provides more a bit more
  41 /// speed, inlining, smaller size, a guarantee that the implementation
  42 /// won't change, and compatibility with DeterministicSchedule.  By having
  43 /// a much more restrictive lifecycle we can also add a bunch of assertions
  44 /// that can help to catch race conditions ahead of time.
  45 template <template<typename> class Atom = std::atomic>
  46 struct Baton : boost::noncopyable {
  47   Baton() : state_(INIT) {}
  48
  49   /// It is an error to destroy a Baton on which a thread is currently
  50   /// wait()ing.  In practice this means that the waiter usually takes
  51   /// responsibility for destroying the Baton.
  52   ~Baton() {
  53     // The docblock for this function says that is can't be called when
  54     // there is a concurrent waiter.  We assume a strong version of this
  55     // requirement in which the caller must _know_ that this is true, they
  56     // are not allowed to be merely lucky.  If two threads are involved,
  57     // the destroying thread must actually have synchronized with the
  58     // waiting thread after wait() returned.  To convey causality the the
  59     // waiting thread must have used release semantics and the destroying
  60     // thread must have used acquire semantics for that communication,
  61     // so we are guaranteed to see the post-wait() value of state_,
  62     // which cannot be WAITING.
  63     //
  64     // Note that since we only care about a single memory location,
  65     // the only two plausible memory orders here are relaxed and seq_cst.
  66     assert(state_.load(std::memory_order_relaxed) != WAITING);
  67   }
  68
  69   /// Equivalent to destroying the Baton and creating a new one.  It is
  70   /// a bug to call this while there is a waiting thread, so in practice
  71   /// the waiter will be the one that resets the baton.
  72   void reset() {
  73     // See ~Baton for a discussion about why relaxed is okay here
  74     assert(state_.load(std::memory_order_relaxed) != WAITING);
  75
  76     // We use a similar argument to justify the use of a relaxed store
  77     // here.  Since both wait() and post() are required to be called
  78     // only once per lifetime, no thread can actually call those methods
  79     // correctly after a reset() unless it synchronizes with the thread
  80     // that performed the reset().  If a post() or wait() on another thread
  81     // didn't synchronize, then regardless of what operation we performed
  82     // here there would be a race on proper use of the Baton's spec
  83     // (although not on any particular load and store).  Put another way,
  84     // we don't need to synchronize here because anybody that might rely
  85     // on such synchronization is required by the baton rules to perform
  86     // an additional synchronization that has the desired effect anyway.
  87     //
  88     // There is actually a similar argument to be made about the
  89     // constructor, in which the fenceless constructor initialization
  90     // of state_ is piggybacked on whatever synchronization mechanism
  91     // distributes knowledge of the Baton's existence
  92     state_.store(INIT, std::memory_order_relaxed);
  93   }
  94
  95   /// Causes wait() to wake up.  For each lifetime of a Baton (where a
  96   /// lifetime starts at construction or reset() and ends at destruction
  97   /// or reset()) there can be at most one call to post().  Any thread
  98   /// may call post().
  99   ///
 100   /// Although we could implement a more generic semaphore semantics
 101   /// without any extra size or CPU overhead, the single-call limitation
 102   /// allows us to have better assert-ions during debug builds.
 103   void post() {
 104     uint32_t before = state_.load(std::memory_order_acquire);
 105     assert(before == INIT || before == WAITING);
 106     if (before != INIT ||
 107         !state_.compare_exchange_strong(before, EARLY_DELIVERY)) {
 108       // we didn't get to state_ before wait(), so we need to call futex()
 109       assert(before == WAITING);
 110
 111       state_.store(LATE_DELIVERY, std::memory_order_release);
 112       state_.futexWake(1);
 113     }
 114   }
 115
 116   /// Waits until post() has been called in the current Baton lifetime.
 117   /// May be called at most once during a Baton lifetime (construction
 118   /// |reset until destruction|reset).  If post is called before wait in
 119   /// the current lifetime then this method returns immediately.
 120   ///
 121   /// The restriction that there can be at most one wait() per lifetime
 122   /// could be relaxed somewhat without any perf or size regressions,
 123   /// but by making this condition very restrictive we can provide better
 124   /// checking in debug builds.
 125   void wait() {
 126     uint32_t before;
 127
 128     static_assert(PreBlockAttempts > 0,
 129         "isn't this assert clearer than an uninitialized variable warning?");
 130     for (int i = 0; i < PreBlockAttempts; ++i) {
 131       before = state_.load(std::memory_order_acquire);
 132       if (before == EARLY_DELIVERY) {
 133         // hooray!
 134         return;
 135       }
 136       assert(before == INIT);
 137 #ifdef __x86_64__
 138       // The pause instruction is the polite way to spin, but it doesn't
 139       // actually affect correctness to omit it if we don't have it.
 140       // Pausing donates the full capabilities of the current core to
 141       // its other hyperthreads for a dozen cycles or so
 142       asm volatile ("pause");
 143 #endif
 144     }
 145
 146     // guess we have to block :(
 147     if (!state_.compare_exchange_strong(before, WAITING)) {
 148       // CAS failed, last minute reprieve
 149       assert(before == EARLY_DELIVERY);
 150       return;
 151     }
 152
 153     while (true) {
 154       state_.futexWait(WAITING);
 155
 156       // state_ is the truth even if FUTEX_WAIT reported a matching
 157       // FUTEX_WAKE, since we aren't using type-stable storage and
 158       // we don't guarantee reuse
 159       uint32_t s = state_.load(std::memory_order_acquire);
 160       assert(s == WAITING || s == LATE_DELIVERY);
 161
 162       if (s == LATE_DELIVERY) {
 163         return;
 164       }
 165       // retry
 166     }
 167   }
 168
 169  private:
 170   enum State : uint32_t {
 171     INIT = 0,
 172     EARLY_DELIVERY = 1,
 173     WAITING = 2,
 174     LATE_DELIVERY = 3,
 175   };
 176
 177   enum {
 178     // Must be positive.  If multiple threads are actively using a
 179     // higher-level data structure that uses batons internally, it is
 180     // likely that the post() and wait() calls happen almost at the same
 181     // time.  In this state, we lose big 50% of the time if the wait goes
 182     // to sleep immediately.  On circa-2013 devbox hardware it costs about
 183     // 7 usec to FUTEX_WAIT and then be awoken (half the t/iter as the
 184     // posix_sem_pingpong test in BatonTests).  We can improve our chances
 185     // of EARLY_DELIVERY by spinning for a bit, although we have to balance
 186     // this against the loss if we end up sleeping any way.  Spins on this
 187     // hw take about 7 nanos (all but 0.5 nanos is the pause instruction).
 188     // We give ourself 300 spins, which is about 2 usec of waiting.  As a
 189     // partial consolation, since we are using the pause instruction we
 190     // are giving a speed boost to the colocated hyperthread.
 191     PreBlockAttempts = 300,
 192   };
 193
 194   detail::Futex<Atom> state_;
 195 };
 196
 197 } // namespace folly
 198
 199 #endif