From: James Sedgwick Date: Tue, 16 Jun 2015 17:30:18 +0000 (-0700) Subject: various perf improvements X-Git-Tag: v0.47.0~8 X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=ca1e87ed0a85daf3abaf2ac423266e869f3961ec;p=folly.git various perf improvements Summary: Three strategies 1. Optimistic locking 2. Acquire-release memory ordering instead of full sequential consistency 3. Some low-hanging branch miss optimizations Please review carefully; the dogscience is strong with this one ``` Before: ============================================================================ folly/futures/test/Benchmark.cpp relative time/iter iters/s ============================================================================ constantFuture 127.99ns 7.81M promiseAndFuture 94.89% 134.89ns 7.41M withThen 28.40% 450.63ns 2.22M ---------------------------------------------------------------------------- oneThen 446.68ns 2.24M twoThens 58.35% 765.55ns 1.31M fourThens 31.87% 1.40us 713.41K hundredThens 1.61% 27.78us 35.99K ---------------------------------------------------------------------------- no_contention 4.63ms 216.00 contention 80.79% 5.73ms 174.52 ---------------------------------------------------------------------------- throwAndCatch 10.91us 91.64K throwAndCatchWrapped 127.14% 8.58us 116.51K throwWrappedAndCatch 178.22% 6.12us 163.32K throwWrappedAndCatchWrapped 793.75% 1.37us 727.38K ---------------------------------------------------------------------------- throwAndCatchContended 1.35s 741.33m throwAndCatchWrappedContended 139.18% 969.23ms 1.03 throwWrappedAndCatchContended 169.51% 795.76ms 1.26 throwWrappedAndCatchWrappedContended 17742.23% 7.60ms 131.53 ---------------------------------------------------------------------------- complexUnit 127.50us 7.84K complexBlob4 100.14% 127.32us 7.85K complexBlob8 100.16% 127.30us 7.86K complexBlob64 96.45% 132.19us 7.57K complexBlob128 92.83% 137.35us 7.28K complexBlob256 87.79% 145.23us 6.89K complexBlob512 81.64% 156.18us 6.40K complexBlob1024 72.54% 175.76us 5.69K complexBlob2048 58.52% 217.89us 4.59K complexBlob4096 32.54% 391.78us 2.55K ============================================================================ After: ============================================================================ folly/futures/test/Benchmark.cpp relative time/iter iters/s ============================================================================ constantFuture 85.28ns 11.73M promiseAndFuture 88.63% 96.22ns 10.39M withThen 30.46% 279.99ns 3.57M ---------------------------------------------------------------------------- oneThen 231.18ns 4.33M twoThens 60.57% 381.70ns 2.62M fourThens 33.52% 689.71ns 1.45M hundredThens 1.49% 15.48us 64.58K ---------------------------------------------------------------------------- no_contention 3.84ms 260.19 contention 88.29% 4.35ms 229.73 ---------------------------------------------------------------------------- throwAndCatch 10.63us 94.06K throwAndCatchWrapped 127.17% 8.36us 119.61K throwWrappedAndCatch 179.83% 5.91us 169.15K throwWrappedAndCatchWrapped 1014.48% 1.05us 954.19K ---------------------------------------------------------------------------- throwAndCatchContended 1.34s 749.03m throwAndCatchWrappedContended 140.66% 949.16ms 1.05 throwWrappedAndCatchContended 164.87% 809.77ms 1.23 throwWrappedAndCatchWrappedContended 49406.39% 2.70ms 370.07 ---------------------------------------------------------------------------- complexUnit 86.83us 11.52K complexBlob4 97.42% 89.12us 11.22K complexBlob8 96.63% 89.85us 11.13K complexBlob64 92.53% 93.84us 10.66K complexBlob128 90.85% 95.57us 10.46K complexBlob256 82.56% 105.17us 9.51K complexBlob512 74.13% 117.12us 8.54K complexBlob1024 63.67% 136.37us 7.33K complexBlob2048 50.25% 172.79us 5.79K complexBlob4096 26.63% 326.05us 3.07K ============================================================================ ``` Reviewed By: @djwatson Differential Revision: D2139822 --- diff --git a/folly/futures/Future-inl.h b/folly/futures/Future-inl.h index 2189585b..0d6679d8 100644 --- a/folly/futures/Future-inl.h +++ b/folly/futures/Future-inl.h @@ -106,14 +106,12 @@ Future::thenImplementation(F func, detail::argResult) { // wrap these so we can move them into the lambda folly::MoveWrapper> p; - p->setInterruptHandler(core_->getInterruptHandler()); + p->core_->setInterruptHandlerNoLock(core_->getInterruptHandler()); folly::MoveWrapper funcm(std::forward(func)); // grab the Future now before we lose our handle on the Promise auto f = p->getFuture(); - if (getExecutor()) { - f.setExecutor(getExecutor()); - } + f.core_->setExecutorNoLock(getExecutor()); /* This is a bit tricky. @@ -174,13 +172,12 @@ Future::thenImplementation(F func, detail::argResult) { // wrap these so we can move them into the lambda folly::MoveWrapper> p; + p->core_->setInterruptHandlerNoLock(core_->getInterruptHandler()); folly::MoveWrapper funcm(std::forward(func)); // grab the Future now before we lose our handle on the Promise auto f = p->getFuture(); - if (getExecutor()) { - f.setExecutor(getExecutor()); - } + f.core_->setExecutorNoLock(getExecutor()); setCallback_( [p, funcm](Try&& t) mutable { diff --git a/folly/futures/Promise-inl.h b/folly/futures/Promise-inl.h index 245b0d6f..751d9b4c 100644 --- a/folly/futures/Promise-inl.h +++ b/folly/futures/Promise-inl.h @@ -44,16 +44,19 @@ Promise& Promise::operator=(Promise&& other) noexcept { template void Promise::throwIfFulfilled() { - if (!core_) + if (UNLIKELY(!core_)) { throw NoState(); - if (core_->ready()) + } + if (UNLIKELY(core_->ready())) { throw PromiseAlreadySatisfied(); + } } template void Promise::throwIfRetrieved() { - if (retrieved_) + if (UNLIKELY(retrieved_)) { throw FutureAlreadyRetrieved(); + } } template diff --git a/folly/futures/Promise.h b/folly/futures/Promise.h index 27b20bd8..c6af9168 100644 --- a/folly/futures/Promise.h +++ b/folly/futures/Promise.h @@ -103,6 +103,7 @@ public: private: typedef typename Future::corePtr corePtr; + template friend class Future; // Whether the Future has been retrieved (a one-time operation). bool retrieved_; diff --git a/folly/futures/Try-inl.h b/folly/futures/Try-inl.h index 22bc8779..fac9cc56 100644 --- a/folly/futures/Try-inl.h +++ b/folly/futures/Try-inl.h @@ -79,9 +79,9 @@ Try& Try::operator=(const Try& t) { template Try::~Try() { - if (contains_ == Contains::VALUE) { + if (LIKELY(contains_ == Contains::VALUE)) { value_.~T(); - } else if (contains_ == Contains::EXCEPTION) { + } else if (UNLIKELY(contains_ == Contains::EXCEPTION)) { e_.~unique_ptr(); } } diff --git a/folly/futures/detail/Core.h b/folly/futures/detail/Core.h index 27842b55..02651591 100644 --- a/folly/futures/detail/Core.h +++ b/folly/futures/detail/Core.h @@ -81,12 +81,12 @@ class Core { Core() {} explicit Core(Try&& t) - : fsm_(State::OnlyResult), - attached_(1), - result_(std::move(t)) {} + : result_(std::move(t)), + fsm_(State::OnlyResult), + attached_(1) {} ~Core() { - assert(attached_ == 0); + DCHECK(attached_ == 0); } // not copyable @@ -212,7 +212,7 @@ class Core { void detachPromise() { // detachPromise() and setResult() should never be called in parallel // so we don't need to protect this. - if (!result_) { + if (UNLIKELY(!result_)) { setResult(Try(exception_wrapper(BrokenPromise()))); } detachOne(); @@ -220,63 +220,89 @@ class Core { /// May call from any thread void deactivate() { - active_ = false; + active_.store(false, std::memory_order_release); } /// May call from any thread void activate() { - active_ = true; + active_.store(true, std::memory_order_release); maybeCallback(); } /// May call from any thread - bool isActive() { return active_; } + bool isActive() { return active_.load(std::memory_order_acquire); } /// Call only from Future thread - void setExecutor(Executor* x, int8_t priority) { - folly::MSLGuard g(executorLock_); + void setExecutor(Executor* x, int8_t priority = Executor::MID_PRI) { + if (!executorLock_.try_lock()) { + executorLock_.lock(); + } + executor_ = x; + priority_ = priority; + executorLock_.unlock(); + } + + void setExecutorNoLock(Executor* x, int8_t priority = Executor::MID_PRI) { executor_ = x; priority_ = priority; } Executor* getExecutor() { - folly::MSLGuard g(executorLock_); return executor_; } /// Call only from Future thread void raise(exception_wrapper e) { - folly::MSLGuard guard(interruptLock_); + if (!interruptLock_.try_lock()) { + interruptLock_.lock(); + } if (!interrupt_ && !hasResult()) { interrupt_ = folly::make_unique(std::move(e)); if (interruptHandler_) { interruptHandler_(*interrupt_); } } + interruptLock_.unlock(); } std::function getInterruptHandler() { - folly::MSLGuard guard(interruptLock_); - return interruptHandler_; + if (!interruptHandlerSet_.load(std::memory_order_acquire)) { + return nullptr; + } + if (!interruptLock_.try_lock()) { + interruptLock_.lock(); + } + auto handler = interruptHandler_; + interruptLock_.unlock(); + return handler; } /// Call only from Promise thread void setInterruptHandler(std::function fn) { - folly::MSLGuard guard(interruptLock_); + if (!interruptLock_.try_lock()) { + interruptLock_.lock(); + } if (!hasResult()) { if (interrupt_) { fn(*interrupt_); } else { - interruptHandler_ = std::move(fn); + setInterruptHandlerNoLock(std::move(fn)); } } + interruptLock_.unlock(); + } + + void setInterruptHandlerNoLock( + std::function fn) { + interruptHandlerSet_.store(true, std::memory_order_relaxed); + interruptHandler_ = std::move(fn); } protected: void maybeCallback() { FSM_START(fsm_) case State::Armed: - if (active_) { + if (active_.load(std::memory_order_acquire)) { FSM_UPDATE2(fsm_, State::Done, []{}, [this]{ this->doCallback(); }); } FSM_BREAK @@ -289,17 +315,20 @@ class Core { void doCallback() { RequestContext::setContext(context_); - // TODO(6115514) semantic race on reading executor_ and setExecutor() - Executor* x; + Executor* x = executor_; int8_t priority; - { - folly::MSLGuard g(executorLock_); + if (x) { + if (!executorLock_.try_lock()) { + executorLock_.lock(); + } x = executor_; priority = priority_; + executorLock_.unlock(); } if (x) { - ++attached_; // keep Core alive until executor did its thing + // keep Core alive until executor did its thing + ++attached_; try { if (LIKELY(x->getNumPriorities() == 1)) { x->add([this]() mutable { @@ -330,17 +359,21 @@ class Core { } } + // lambdaBuf occupies exactly one cache line + static constexpr size_t lambdaBufSize = 8 * sizeof(void*); + char lambdaBuf_[lambdaBufSize]; + // place result_ next to increase the likelihood that the value will be + // contained entirely in one cache line + folly::Optional> result_ {}; + std::function&&)> callback_ {nullptr}; FSM fsm_ {State::Start}; std::atomic attached_ {2}; std::atomic active_ {true}; + std::atomic interruptHandlerSet_ {false}; folly::MicroSpinLock interruptLock_ {0}; folly::MicroSpinLock executorLock_ {0}; int8_t priority_ {-1}; Executor* executor_ {nullptr}; - folly::Optional> result_ {}; - std::function&&)> callback_ {nullptr}; - static constexpr size_t lambdaBufSize = 8 * sizeof(void*); - char lambdaBuf_[lambdaBufSize]; std::shared_ptr context_ {nullptr}; std::unique_ptr interrupt_ {}; std::function interruptHandler_ {nullptr}; diff --git a/folly/futures/detail/FSM.h b/folly/futures/detail/FSM.h index 0d4ffbed..ce77c551 100644 --- a/folly/futures/detail/FSM.h +++ b/folly/futures/detail/FSM.h @@ -44,7 +44,7 @@ public: explicit FSM(Enum startState) : state_(startState) {} Enum getState() const { - return state_.load(std::memory_order_relaxed); + return state_.load(std::memory_order_acquire); } /// Atomically do a state transition with accompanying action. @@ -52,10 +52,16 @@ public: /// @returns true on success, false and action unexecuted otherwise template bool updateState(Enum A, Enum B, F const& action) { - std::lock_guard lock(mutex_); - if (state_ != A) return false; + if (!mutex_.try_lock()) { + mutex_.lock(); + } + if (state_.load(std::memory_order_relaxed) != A) { + mutex_.unlock(); + return false; + } action(); - state_ = B; + state_.store(B, std::memory_order_relaxed); + mutex_.unlock(); return true; } diff --git a/folly/futures/test/Benchmark.cpp b/folly/futures/test/Benchmark.cpp index 386e8b9d..8721622b 100644 --- a/folly/futures/test/Benchmark.cpp +++ b/folly/futures/test/Benchmark.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -290,6 +291,98 @@ BENCHMARK_RELATIVE(throwWrappedAndCatchWrappedContended) { contend(throwWrappedAndCatchWrappedImpl); } +InlineExecutor exe; + +template +Future fGen() { + Promise p; + auto f = p.getFuture() + .then([] (T&& t) { + return std::move(t); + }) + .then([] (T&& t) { + return makeFuture(std::move(t)); + }) + .via(&exe) + .then([] (T&& t) { + return std::move(t); + }) + .then([] (T&& t) { + return makeFuture(std::move(t)); + }); + p.setValue(T()); + return f; +} + +template +std::vector> fsGen() { + std::vector> fs; + for (auto i = 0; i < 10; i++) { + fs.push_back(fGen()); + } + return fs; +} + +template +void complexBenchmark() { + collect(fsGen()); + collectAll(fsGen()); + collectAny(fsGen()); + futures::map(fsGen(), [] (const T& t) { + return t; + }); + futures::map(fsGen(), [] (const T& t) { + return makeFuture(T(t)); + }); +} + +BENCHMARK_DRAW_LINE(); + +template +struct Blob { + char buf[S]; +}; + +BENCHMARK(complexUnit) { + complexBenchmark(); +} + +BENCHMARK_RELATIVE(complexBlob4) { + complexBenchmark>(); +} + +BENCHMARK_RELATIVE(complexBlob8) { + complexBenchmark>(); +} + +BENCHMARK_RELATIVE(complexBlob64) { + complexBenchmark>(); +} + +BENCHMARK_RELATIVE(complexBlob128) { + complexBenchmark>(); +} + +BENCHMARK_RELATIVE(complexBlob256) { + complexBenchmark>(); +} + +BENCHMARK_RELATIVE(complexBlob512) { + complexBenchmark>(); +} + +BENCHMARK_RELATIVE(complexBlob1024) { + complexBenchmark>(); +} + +BENCHMARK_RELATIVE(complexBlob2048) { + complexBenchmark>(); +} + +BENCHMARK_RELATIVE(complexBlob4096) { + complexBenchmark>(); +} + int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); folly::runBenchmarks();