folly/experimental/flat_combining/FlatCombining.h

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <folly/Baton.h>
  20 #include <folly/Function.h>
  21 #include <folly/IndexedMemPool.h>
  22 #include <folly/Portability.h>
  23 #include <folly/detail/CacheLocality.h>
  24
  25 #include <atomic>
  26 #include <cassert>
  27 #include <mutex>
  28
  29 namespace folly {
  30
  31 /// Flat combining (FC) was introduced in the SPAA 2010 paper Flat
  32 /// Combining and the Synchronization-Parallelism Tradeoff, by Danny
  33 /// Hendler, Itai Incze, Nir Shavit, and Moran Tzafrir.
  34 /// http://mcg.cs.tau.ac.il/projects/projects/flat-combining
  35 ///
  36 /// FC is an alternative to coarse-grained locking for making
  37 /// sequential data structures thread-safe while minimizing the
  38 /// synchronization overheads and cache coherence traffic associated
  39 /// with locking.
  40 ///
  41 /// Under FC, when a thread finds the lock contended, it can
  42 /// request (using a request record) that the lock holder execute its
  43 /// operation on the shared data structure. There can be a designated
  44 /// combiner thread or any thread can act as the combiner when it
  45 /// holds the lock.
  46 ///
  47 /// Potential advantages of FC include:
  48 /// - Reduced cache coherence traffic
  49 /// - Reduced synchronization overheads, as the overheads of releasing
  50 ///   and acquiring the lock are eliminated from the critical path of
  51 ///   operating on the data structure.
  52 /// - Opportunities for smart combining, where executing multiple
  53 ///   operations together may take less time than executing the
  54 ///   operations separately, e.g., K delete_min operations on a
  55 ///   priority queue may be combined to take O(K + log N) time instead
  56 ///   of O(K * log N).
  57 ///
  58 /// This implementation of flat combining supports:
  59 ///
  60 /// - A simple interface that requires minimal extra code by the
  61 ///   user. To use this interface efficiently the user-provided
  62 ///   functions must be copyable to folly::Function without dynamic
  63 ///   allocation. If this is impossible or inconvenient, the user is
  64 ///   encouraged to use the custom interface described below.
  65 /// - A custom interface that supports custom combining and custom
  66 ///   request structure, either for the sake of smart combining or for
  67 ///   efficiently supporting operations that are not copyable to
  68 ///   folly::Function without dynamic allocation.
  69 /// - Both synchronous and asynchronous operations.
  70 /// - Request records with and without thread-caching.
  71 /// - Combining with and without a dedicated combiner thread.
  72 ///
  73 /// This implementation differs from the algorithm in the SPAA 2010 paper:
  74 /// - It does not require thread caching of request records
  75 /// - It supports a dedicated combiner
  76 /// - It supports asynchronous operations
  77 ///
  78 /// The generic FC class template supports generic data structures and
  79 /// utilities with arbitrary operations. The template supports static
  80 /// polymorphism for the combining function to enable custom smart
  81 /// combining.
  82 ///
  83 /// A simple example of using the FC template:
  84 ///   class ConcurrentFoo : public FlatCombining<ConcurrentFoo> {
  85 ///     Foo foo_; // sequential data structure
  86 ///    public:
  87 ///     T bar(V v) { // thread-safe execution of foo_.bar(v)
  88 ///       T result;
  89 ///       // Note: fn must be copyable to folly::Function without dynamic
  90 ///       // allocation. Otherwise, it is recommended to use the custom
  91 ///       // interface and manage the function arguments and results
  92 ///       // explicitly in a custom request structure.
  93 ///       auto fn = [&] { result = foo_.bar(v); };
  94 ///       this->requestFC(fn);
  95 ///       return result;
  96 ///     }
  97 ///   };
  98 ///
  99 /// See test/FlatCombiningExamples.h for more examples. See the
 100 /// comments for requestFC() below for a list of simple and custom
 101 /// variants of that function.
 102
 103 template <
 104     typename T, // concurrent data structure using FC interface
 105     typename Mutex = std::mutex,
 106     template <typename> class Atom = std::atomic,
 107     typename Req = /* default dummy type */ bool>
 108 class FlatCombining {
 109   using SavedFn = folly::Function<void()>;
 110
 111  public:
 112   /// Combining request record.
 113   class Rec {
 114     FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
 115     folly::Baton<Atom, true, false> valid_;
 116     folly::Baton<Atom, true, false> done_;
 117     folly::Baton<Atom, true, false> disconnected_;
 118     size_t index_;
 119     size_t next_;
 120     uint64_t last_;
 121     Req req_;
 122     SavedFn fn_;
 123
 124    public:
 125     Rec() {
 126       setDone();
 127       setDisconnected();
 128     }
 129
 130     void setValid() {
 131       valid_.post();
 132     }
 133
 134     void clearValid() {
 135       valid_.reset();
 136     }
 137
 138     bool isValid() const {
 139       return valid_.try_wait();
 140     }
 141
 142     void setDone() {
 143       done_.post();
 144     }
 145
 146     void clearDone() {
 147       done_.reset();
 148     }
 149
 150     bool isDone() const {
 151       return done_.try_wait();
 152     }
 153
 154     void awaitDone() {
 155       done_.wait();
 156     }
 157
 158     void setDisconnected() {
 159       disconnected_.post();
 160     }
 161
 162     void clearDisconnected() {
 163       disconnected_.reset();
 164     }
 165
 166     bool isDisconnected() const {
 167       return disconnected_.try_wait();
 168     }
 169
 170     void setIndex(const size_t index) {
 171       index_ = index;
 172     }
 173
 174     size_t getIndex() const {
 175       return index_;
 176     }
 177
 178     void setNext(const size_t next) {
 179       next_ = next;
 180     }
 181
 182     size_t getNext() const {
 183       return next_;
 184     }
 185
 186     void setLast(const uint64_t pass) {
 187       last_ = pass;
 188     }
 189
 190     uint64_t getLast() const {
 191       return last_;
 192     }
 193
 194     Req& getReq() {
 195       return req_;
 196     }
 197
 198     template <typename Func>
 199     void setFn(Func&& fn) {
 200       static_assert(
 201           std::is_nothrow_constructible<
 202               folly::Function<void()>,
 203               _t<std::decay<Func>>>::value,
 204           "Try using a smaller function object that can fit in folly::Function "
 205           "without allocation, or use the custom interface of requestFC() to "
 206           "manage the requested function's arguments and results explicitly "
 207           "in a custom request structure without allocation.");
 208       fn_ = std::forward<Func>(fn);
 209       assert(fn_);
 210     }
 211
 212     void clearFn() {
 213       fn_ = {};
 214       assert(!fn_);
 215     }
 216
 217     SavedFn& getFn() {
 218       return fn_;
 219     }
 220
 221     void complete() {
 222       clearValid();
 223       assert(!isDone());
 224       setDone();
 225     }
 226   };
 227
 228   using Pool = folly::IndexedMemPool<Rec, 32, 4, Atom, false, false>;
 229
 230  public:
 231   /// The constructor takes three optional arguments:
 232   /// - Optional dedicated combiner thread (default true)
 233   /// - Number of records (if 0, then kDefaultNumRecs)
 234   /// - A hint for the max. number of combined operations per
 235   ///   combining session that is checked at the beginning of each pass
 236   ///   on the request records (if 0, then kDefaultMaxops)
 237   explicit FlatCombining(
 238       const bool dedicated = true,
 239       uint32_t numRecs = 0, // number of combining records
 240       const uint32_t maxOps = 0 // hint of max ops per combining session
 241       )
 242       : numRecs_(numRecs == 0 ? kDefaultNumRecs : numRecs),
 243         maxOps_(maxOps == 0 ? kDefaultMaxOps : maxOps),
 244         recs_(NULL_INDEX),
 245         dedicated_(dedicated),
 246         recsPool_(numRecs_) {
 247     if (dedicated_) {
 248       // dedicated combiner thread
 249       combiner_ = std::thread([this] { dedicatedCombining(); });
 250     }
 251   }
 252
 253   /// Destructor: If there is a dedicated combiner, the destructor
 254   /// flags it to shutdown. Otherwise, the destructor waits for all
 255   /// pending asynchronous requests to be completed.
 256   ~FlatCombining() {
 257     if (dedicated_) {
 258       shutdown();
 259       combiner_.join();
 260     } else {
 261       drainAll();
 262     }
 263   }
 264
 265   // Wait for all pending operations to complete. Useful primarily
 266   // when there are asynchronous operations without a dedicated
 267   // combiner.
 268   void drainAll() {
 269     for (size_t i = getRecsHead(); i != NULL_INDEX; i = nextIndex(i)) {
 270       Rec& rec = recsPool_[i];
 271       awaitDone(rec);
 272     }
 273   }
 274
 275   // Give the caller exclusive access.
 276   void acquireExclusive() {
 277     m_.lock();
 278   }
 279
 280   // Give the caller exclusive access through a lock holder.
 281   // No need for explicit release.
 282   template <typename LockHolder>
 283   void acquireExclusive(LockHolder& l) {
 284     l = LockHolder(m_);
 285   }
 286
 287   // Try to give the caller exclusive access. Returns true iff successful.
 288   bool tryExclusive() {
 289     return m_.try_lock();
 290   }
 291
 292   // Release exclusive access. The caller must have exclusive access.
 293   void releaseExclusive() {
 294     m_.unlock();
 295   }
 296
 297   // Execute an operation without combining
 298   template <typename OpFunc>
 299   void requestNoFC(OpFunc& opFn) {
 300     std::lock_guard<Mutex> guard(m_);
 301     opFn();
 302   }
 303
 304   // This function first tries to execute the operation without
 305   // combining. If unuccessful, it allocates a combining record if
 306   // needed. If there are no available records, it waits for exclusive
 307   // access and executes the operation. If a record is available and
 308   // ready for use, it fills the record and indicates that the request
 309   // is valid for combining. If the request is synchronous (by default
 310   // or necessity), it waits for the operation to be completed by a
 311   // combiner and optionally extracts the result, if any.
 312   //
 313   // This function can be called in several forms:
 314   //   Simple forms that do not require the user to define a Req structure
 315   //   or to override any request processing member functions:
 316   //     requestFC(opFn)
 317   //     requestFC(opFn, rec) // provides its own pre-allocated record
 318   //     requestFC(opFn, rec, syncop) // asynchronous if syncop == false
 319   //   Custom forms that require the user to define a Req structure and to
 320   //   override some request processing member functions:
 321   //     requestFC(opFn, fillFn)
 322   //     requestFC(opFn, fillFn, rec)
 323   //     requestFC(opFn, fillFn, rec, syncop)
 324   //     requestFC(opFn, fillFn, resFn)
 325   //     requestFC(opFn, fillFn, resFn, rec)
 326   template <typename OpFunc>
 327   void requestFC(OpFunc&& opFn, Rec* rec = nullptr, bool syncop = true) {
 328     auto dummy = [](Req&) {};
 329     requestOp(
 330         std::forward<OpFunc>(opFn),
 331         dummy /* fillFn */,
 332         dummy /* resFn */,
 333         rec,
 334         syncop,
 335         false /* simple */);
 336   }
 337   template <typename OpFunc, typename FillFunc>
 338   void requestFC(
 339       OpFunc&& opFn,
 340       const FillFunc& fillFn,
 341       Rec* rec = nullptr,
 342       bool syncop = true) {
 343     auto dummy = [](Req&) {};
 344     requestOp(
 345         std::forward<OpFunc>(opFn),
 346         fillFn,
 347         dummy /* resFn */,
 348         rec,
 349         syncop,
 350         true /* custom */);
 351   }
 352   template <typename OpFunc, typename FillFunc, typename ResFn>
 353   void requestFC(
 354       OpFunc&& opFn,
 355       const FillFunc& fillFn,
 356       const ResFn& resFn,
 357       Rec* rec = nullptr) {
 358     // must wait for result to execute resFn -- so it must be synchronous
 359     requestOp(
 360         std::forward<OpFunc>(opFn),
 361         fillFn,
 362         resFn,
 363         rec,
 364         true /* sync */,
 365         true /* custom*/);
 366   }
 367
 368   // Allocate a record.
 369   Rec* allocRec() {
 370     auto idx = recsPool_.allocIndex();
 371     if (idx == NULL_INDEX) {
 372       outOfSpaceCount_.fetch_add(1);
 373       return nullptr;
 374     }
 375     Rec& rec = recsPool_[idx];
 376     rec.setIndex(idx);
 377     return &rec;
 378   }
 379
 380   // Free a record
 381   void freeRec(Rec* rec) {
 382     if (rec == nullptr) {
 383       return;
 384     }
 385     auto idx = rec->getIndex();
 386     recsPool_.recycleIndex(idx);
 387   }
 388
 389   // Returns a count of the number of combined operations so far.
 390   uint64_t getCombinedOpCount() {
 391     std::lock_guard<Mutex> guard(m_);
 392     return combined_;
 393   }
 394
 395   // Returns a count of the number of combining passes so far.
 396   uint64_t getCombiningPasses() {
 397     std::lock_guard<Mutex> guard(m_);
 398     return passes_;
 399   }
 400
 401   uint64_t getOutOfSpaceCount() {
 402     return outOfSpaceCount_.load();
 403   }
 404
 405  protected:
 406   const size_t NULL_INDEX = 0;
 407   const uint32_t kDefaultMaxOps = 100;
 408   const uint64_t kDefaultNumRecs = 64;
 409   const uint64_t kIdleThreshold = 10;
 410
 411   FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
 412   Mutex m_;
 413
 414   FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
 415   folly::Baton<Atom, false, true> pending_;
 416   Atom<bool> shutdown_{false};
 417
 418   FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
 419   uint32_t numRecs_;
 420   uint32_t maxOps_;
 421   Atom<size_t> recs_;
 422   bool dedicated_;
 423   std::thread combiner_;
 424   Pool recsPool_;
 425
 426   FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
 427   uint64_t combined_ = 0;
 428   uint64_t passes_ = 0;
 429   uint64_t sessions_ = 0;
 430   Atom<uint64_t> outOfSpaceCount_{0};
 431
 432   template <typename OpFunc, typename FillFunc, typename ResFn>
 433   void requestOp(
 434       OpFunc&& opFn,
 435       const FillFunc& fillFn,
 436       const ResFn& resFn,
 437       Rec* rec,
 438       bool syncop,
 439       const bool custom) {
 440     std::unique_lock<Mutex> l(this->m_, std::defer_lock);
 441     if (l.try_lock()) {
 442       // No contention
 443       tryCombining();
 444       opFn();
 445       return;
 446     }
 447
 448     // Try FC
 449     bool tc = (rec != nullptr);
 450     if (!tc) {
 451       // if an async op doesn't have a thread-cached record then turn
 452       // it into a synchronous op.
 453       syncop = true;
 454       rec = allocRec();
 455     }
 456     if (rec == nullptr) {
 457       // Can't use FC - Must acquire lock
 458       l.lock();
 459       opFn();
 460       return;
 461     }
 462
 463     // Use FC
 464     // Wait if record is in use
 465     awaitDone(*rec);
 466     rec->clearDone();
 467     // Fill record
 468     if (custom) {
 469       // Fill the request (custom)
 470       Req& req = rec->getReq();
 471       fillFn(req);
 472       rec->clearFn();
 473     } else {
 474       rec->setFn(std::forward<OpFunc>(opFn));
 475     }
 476     // Indicate that record is valid
 477     assert(!rec->isValid());
 478     rec->setValid();
 479     // end of combining critical path
 480     setPending();
 481     // store-load order setValid before isDisconnected
 482     std::atomic_thread_fence(std::memory_order_seq_cst);
 483     if (rec->isDisconnected()) {
 484       rec->clearDisconnected();
 485       pushRec(rec->getIndex());
 486       setPending();
 487     }
 488     // If synchronous wait for the request to be completed
 489     if (syncop) {
 490       awaitDone(*rec);
 491       if (custom) {
 492         Req& req = rec->getReq();
 493         resFn(req); // Extract the result (custom)
 494       }
 495       if (!tc) {
 496         freeRec(rec); // Free the temporary record.
 497       }
 498     }
 499   }
 500
 501   void pushRec(size_t idx) {
 502     Rec& rec = recsPool_[idx];
 503     while (true) {
 504       auto head = recs_.load(std::memory_order_acquire);
 505       rec.setNext(head); // there shouldn't be a data race here
 506       if (recs_.compare_exchange_weak(head, idx)) {
 507         return;
 508       }
 509     }
 510   }
 511
 512   size_t getRecsHead() {
 513     return recs_.load(std::memory_order_acquire);
 514   }
 515
 516   size_t nextIndex(size_t idx) {
 517     return recsPool_[idx].getNext();
 518   }
 519
 520   void clearPending() {
 521     pending_.reset();
 522   }
 523
 524   void setPending() {
 525     pending_.post();
 526   }
 527
 528   bool isPending() const {
 529     return pending_.try_wait();
 530   }
 531
 532   void awaitPending() {
 533     pending_.wait();
 534   }
 535
 536   uint64_t combiningSession() {
 537     uint64_t combined = 0;
 538     do {
 539       uint64_t count = static_cast<T*>(this)->combiningPass();
 540       if (count == 0) {
 541         break;
 542       }
 543       combined += count;
 544       ++this->passes_;
 545     } while (combined < this->maxOps_);
 546     return combined;
 547   }
 548
 549   void tryCombining() {
 550     if (!dedicated_) {
 551       while (isPending()) {
 552         clearPending();
 553         combined_ += combiningSession();
 554       }
 555     }
 556   }
 557
 558   void dedicatedCombining() {
 559     while (true) {
 560       awaitPending();
 561       clearPending();
 562       if (shutdown_.load()) {
 563         break;
 564       }
 565       while (true) {
 566         uint64_t count;
 567         ++sessions_;
 568         {
 569           std::lock_guard<Mutex> guard(m_);
 570           count = combiningSession();
 571           combined_ += count;
 572         }
 573         if (count < maxOps_) {
 574           break;
 575         }
 576       }
 577     }
 578   }
 579
 580   void awaitDone(Rec& rec) {
 581     if (dedicated_) {
 582       rec.awaitDone();
 583     } else {
 584       awaitDoneTryLock(rec);
 585     }
 586   }
 587
 588   /// Waits for the request to be done and occasionally tries to
 589   /// acquire the lock and to do combining. Used only in the absence
 590   /// of a dedicated combiner.
 591   void awaitDoneTryLock(Rec& rec) {
 592     assert(!dedicated_);
 593     int count = 0;
 594     while (!rec.isDone()) {
 595       if (count == 0) {
 596         std::unique_lock<Mutex> l(m_, std::defer_lock);
 597         if (l.try_lock()) {
 598           setPending();
 599           tryCombining();
 600         }
 601       } else {
 602         folly::asm_volatile_pause();
 603         if (++count == 1000) {
 604           count = 0;
 605         }
 606       }
 607     }
 608   }
 609
 610   void shutdown() {
 611     shutdown_.store(true);
 612     setPending();
 613   }
 614
 615   /// The following member functions may be overridden for customization
 616
 617   void combinedOp(Req&) {
 618     throw std::runtime_error(
 619         "FlatCombining::combinedOp(Req&) must be overridden in the derived"
 620         " class if called.");
 621   }
 622
 623   void processReq(Rec& rec) {
 624     SavedFn& opFn = rec.getFn();
 625     if (opFn) {
 626       // simple interface
 627       opFn();
 628     } else {
 629       // custom interface
 630       Req& req = rec.getReq();
 631       static_cast<T*>(this)->combinedOp(req); // defined in derived class
 632     }
 633     rec.setLast(passes_);
 634     rec.complete();
 635   }
 636
 637   uint64_t combiningPass() {
 638     uint64_t count = 0;
 639     auto idx = getRecsHead();
 640     Rec* prev = nullptr;
 641     while (idx != NULL_INDEX) {
 642       Rec& rec = recsPool_[idx];
 643       auto next = rec.getNext();
 644       bool valid = rec.isValid();
 645       if (!valid && (passes_ - rec.getLast() > kIdleThreshold) &&
 646           (prev != nullptr)) {
 647         // Disconnect
 648         prev->setNext(next);
 649         rec.setDisconnected();
 650         // store-load order setDisconnected before isValid
 651         std::atomic_thread_fence(std::memory_order_seq_cst);
 652         valid = rec.isValid();
 653       } else {
 654         prev = &rec;
 655       }
 656       if (valid) {
 657         processReq(rec);
 658         ++count;
 659       }
 660       idx = next;
 661     }
 662     return count;
 663   }
 664 };
 665
 666 } // namespace folly {