folly/IndexedMemPool.h

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <type_traits>
  20 #include <assert.h>
  21 #include <errno.h>
  22 #include <stdint.h>
  23 #include <boost/noncopyable.hpp>
  24 #include <folly/AtomicStruct.h>
  25 #include <folly/detail/CacheLocality.h>
  26 #include <folly/portability/SysMman.h>
  27 #include <folly/portability/Unistd.h>
  28
  29 // Ignore shadowing warnings within this file, so includers can use -Wshadow.
  30 #pragma GCC diagnostic push
  31 #pragma GCC diagnostic ignored "-Wshadow"
  32
  33 namespace folly {
  34
namespace detail {
// Forward declaration only: IndexedMemPool's UniquePtr typedef names this
// type as its deleter before the definition, which appears later in this
// file.
template <typename Pool>
struct IndexedMemPoolRecycler;
}
  39
  40 /// Instances of IndexedMemPool dynamically allocate and then pool their
  41 /// element type (T), returning 4-byte integer indices that can be passed
  42 /// to the pool's operator[] method to access or obtain pointers to the
  43 /// actual elements.  The memory backing items returned from the pool
  44 /// will always be readable, even if items have been returned to the pool.
  45 /// These two features are useful for lock-free algorithms.  The indexing
  46 /// behavior makes it easy to build tagged pointer-like-things, since
  47 /// a large number of elements can be managed using fewer bits than a
  48 /// full pointer.  The access-after-free behavior makes it safe to read
  49 /// from T-s even after they have been recycled, since it is guaranteed
  50 /// that the memory won't have been returned to the OS and unmapped
  51 /// (the algorithm must still use a mechanism to validate that the read
  52 /// was correct, but it doesn't have to worry about page faults), and if
  53 /// the elements use internal sequence numbers it can be guaranteed that
  54 /// there won't be an ABA match due to the element being overwritten with
  55 /// a different type that has the same bit pattern.
  56 ///
  57 /// IndexedMemPool has two object lifecycle strategies.  The first
  58 /// is to construct objects when they are allocated from the pool and
  59 /// destroy them when they are recycled.  In this mode allocIndex and
  60 /// allocElem have emplace-like semantics.  In the second mode, objects
  61 /// are default-constructed the first time they are removed from the pool,
  62 /// and deleted when the pool itself is deleted.  By default the first
  63 /// mode is used for non-trivial T, and the second is used for trivial T.
  64 ///
  65 /// IMPORTANT: Space for extra elements is allocated to account for those
  66 /// that are inaccessible because they are in other local lists, so the
  67 /// actual number of items that can be allocated ranges from capacity to
  68 /// capacity + (NumLocalLists_-1)*LocalListLimit_.  This is important if
  69 /// you are trying to maximize the capacity of the pool while constraining
  70 /// the bit size of the resulting pointers, because the pointers will
  71 /// actually range up to the boosted capacity.  See maxIndexForCapacity
  72 /// and capacityForMaxIndex.
  73 ///
  74 /// To avoid contention, NumLocalLists_ free lists of limited (less than
  75 /// or equal to LocalListLimit_) size are maintained, and each thread
  76 /// retrieves and returns entries from its associated local list.  If the
  77 /// local list becomes too large then elements are placed in bulk in a
  78 /// global free list.  This allows items to be efficiently recirculated
  79 /// from consumers to producers.  AccessSpreader is used to access the
  80 /// local lists, so there is no performance advantage to having more
  81 /// local lists than L1 caches.
  82 ///
  83 /// The pool mmap-s the entire necessary address space when the pool is
  84 /// constructed, but delays element construction.  This means that only
  85 /// elements that are actually returned to the caller get paged into the
  86 /// process's resident set (RSS).
  87 template <
  88     typename T,
  89     uint32_t NumLocalLists_ = 32,
  90     uint32_t LocalListLimit_ = 200,
  91     template <typename> class Atom = std::atomic,
  92     bool EagerRecycleWhenTrivial = false,
  93     bool EagerRecycleWhenNotTrivial = true>
  94 struct IndexedMemPool : boost::noncopyable {
  95   typedef T value_type;
  96
  97   typedef std::unique_ptr<T, detail::IndexedMemPoolRecycler<IndexedMemPool>>
  98       UniquePtr;
  99
 100   static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits");
 101   enum {
 102     NumLocalLists = NumLocalLists_,
 103     LocalListLimit = LocalListLimit_
 104   };
 105
 106
 107   static constexpr bool eagerRecycle() {
 108     return std::is_trivial<T>::value
 109         ? EagerRecycleWhenTrivial : EagerRecycleWhenNotTrivial;
 110   }
 111
 112   // these are public because clients may need to reason about the number
 113   // of bits required to hold indices from a pool, given its capacity
 114
 115   static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) {
 116     // index of std::numeric_limits<uint32_t>::max() is reserved for isAllocated
 117     // tracking
 118     return uint32_t(std::min(
 119         uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit,
 120         uint64_t(std::numeric_limits<uint32_t>::max() - 1)));
 121   }
 122
 123   static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) {
 124     return maxIndex - (NumLocalLists - 1) * LocalListLimit;
 125   }
 126
 127
 128   /// Constructs a pool that can allocate at least _capacity_ elements,
 129   /// even if all the local lists are full
 130   explicit IndexedMemPool(uint32_t capacity)
 131     : actualCapacity_(maxIndexForCapacity(capacity))
 132     , size_(0)
 133     , globalHead_(TaggedPtr{})
 134   {
 135     const size_t needed = sizeof(Slot) * (actualCapacity_ + 1);
 136     size_t pagesize = size_t(sysconf(_SC_PAGESIZE));
 137     mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize;
 138     assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize);
 139     assert((mmapLength_ % pagesize) == 0);
 140
 141     slots_ = static_cast<Slot*>(mmap(nullptr, mmapLength_,
 142                                      PROT_READ | PROT_WRITE,
 143                                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
 144     if (slots_ == MAP_FAILED) {
 145       assert(errno == ENOMEM);
 146       throw std::bad_alloc();
 147     }
 148   }
 149
 150   /// Destroys all of the contained elements
 151   ~IndexedMemPool() {
 152     if (!eagerRecycle()) {
 153       // Take the minimum since it is possible that size_ > actualCapacity_.
 154       // This can happen if there are multiple concurrent requests
 155       // when size_ == actualCapacity_ - 1.
 156       uint32_t last = std::min(uint32_t(size_), uint32_t(actualCapacity_));
 157       for (uint32_t i = last; i > 0; --i) {
 158         slots_[i].~Slot();
 159       }
 160     }
 161     munmap(slots_, mmapLength_);
 162   }
 163
 164   /// Returns a lower bound on the number of elements that may be
 165   /// simultaneously allocated and not yet recycled.  Because of the
 166   /// local lists it is possible that more elements than this are returned
 167   /// successfully
 168   uint32_t capacity() {
 169     return capacityForMaxIndex(actualCapacity_);
 170   }
 171
 172   /// Finds a slot with a non-zero index, emplaces a T there if we're
 173   /// using the eager recycle lifecycle mode, and returns the index,
 174   /// or returns 0 if no elements are available.
 175   template <typename ...Args>
 176   uint32_t allocIndex(Args&&... args) {
 177     static_assert(sizeof...(Args) == 0 || eagerRecycle(),
 178         "emplace-style allocation requires eager recycle, "
 179         "which is defaulted only for non-trivial types");
 180     auto idx = localPop(localHead());
 181     if (idx != 0 && eagerRecycle()) {
 182       T* ptr = &slot(idx).elem;
 183       new (ptr) T(std::forward<Args>(args)...);
 184     }
 185     return idx;
 186   }
 187
 188   /// If an element is available, returns a std::unique_ptr to it that will
 189   /// recycle the element to the pool when it is reclaimed, otherwise returns
 190   /// a null (falsy) std::unique_ptr
 191   template <typename ...Args>
 192   UniquePtr allocElem(Args&&... args) {
 193     auto idx = allocIndex(std::forward<Args>(args)...);
 194     T* ptr = idx == 0 ? nullptr : &slot(idx).elem;
 195     return UniquePtr(ptr, typename UniquePtr::deleter_type(this));
 196   }
 197
 198   /// Gives up ownership previously granted by alloc()
 199   void recycleIndex(uint32_t idx) {
 200     assert(isAllocated(idx));
 201     if (eagerRecycle()) {
 202       slot(idx).elem.~T();
 203     }
 204     localPush(localHead(), idx);
 205   }
 206
 207   /// Provides access to the pooled element referenced by idx
 208   T& operator[](uint32_t idx) {
 209     return slot(idx).elem;
 210   }
 211
 212   /// Provides access to the pooled element referenced by idx
 213   const T& operator[](uint32_t idx) const {
 214     return slot(idx).elem;
 215   }
 216
 217   /// If elem == &pool[idx], then pool.locateElem(elem) == idx.  Also,
 218   /// pool.locateElem(nullptr) == 0
 219   uint32_t locateElem(const T* elem) const {
 220     if (!elem) {
 221       return 0;
 222     }
 223
 224     static_assert(std::is_standard_layout<Slot>::value, "offsetof needs POD");
 225
 226     auto slot = reinterpret_cast<const Slot*>(
 227         reinterpret_cast<const char*>(elem) - offsetof(Slot, elem));
 228     auto rv = uint32_t(slot - slots_);
 229
 230     // this assert also tests that rv is in range
 231     assert(elem == &(*this)[rv]);
 232     return rv;
 233   }
 234
 235   /// Returns true iff idx has been alloc()ed and not recycleIndex()ed
 236   bool isAllocated(uint32_t idx) const {
 237     return slot(idx).localNext.load(std::memory_order_relaxed) == uint32_t(-1);
 238   }
 239
 240
 241  private:
 242   ///////////// types
 243
 244   struct Slot {
 245     T elem;
 246     Atom<uint32_t> localNext;
 247     Atom<uint32_t> globalNext;
 248
 249     Slot() : localNext{}, globalNext{} {}
 250   };
 251
 252   struct TaggedPtr {
 253     uint32_t idx;
 254
 255     // size is bottom 8 bits, tag in top 24.  g++'s code generation for
 256     // bitfields seems to depend on the phase of the moon, plus we can
 257     // do better because we can rely on other checks to avoid masking
 258     uint32_t tagAndSize;
 259
 260     enum : uint32_t {
 261         SizeBits = 8,
 262         SizeMask = (1U << SizeBits) - 1,
 263         TagIncr = 1U << SizeBits,
 264     };
 265
 266     uint32_t size() const {
 267       return tagAndSize & SizeMask;
 268     }
 269
 270     TaggedPtr withSize(uint32_t repl) const {
 271       assert(repl <= LocalListLimit);
 272       return TaggedPtr{ idx, (tagAndSize & ~SizeMask) | repl };
 273     }
 274
 275     TaggedPtr withSizeIncr() const {
 276       assert(size() < LocalListLimit);
 277       return TaggedPtr{ idx, tagAndSize + 1 };
 278     }
 279
 280     TaggedPtr withSizeDecr() const {
 281       assert(size() > 0);
 282       return TaggedPtr{ idx, tagAndSize - 1 };
 283     }
 284
 285     TaggedPtr withIdx(uint32_t repl) const {
 286       return TaggedPtr{ repl, tagAndSize + TagIncr };
 287     }
 288
 289     TaggedPtr withEmpty() const {
 290       return withIdx(0).withSize(0);
 291     }
 292   };
 293
 294   struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING LocalList {
 295     AtomicStruct<TaggedPtr,Atom> head;
 296
 297     LocalList() : head(TaggedPtr{}) {}
 298   };
 299
 300   ////////// fields
 301
 302   /// the number of bytes allocated from mmap, which is a multiple of
 303   /// the page size of the machine
 304   size_t mmapLength_;
 305
 306   /// the actual number of slots that we will allocate, to guarantee
 307   /// that we will satisfy the capacity requested at construction time.
 308   /// They will be numbered 1..actualCapacity_ (note the 1-based counting),
 309   /// and occupy slots_[1..actualCapacity_].
 310   uint32_t actualCapacity_;
 311
 312   /// this records the number of slots that have actually been constructed.
 313   /// To allow use of atomic ++ instead of CAS, we let this overflow.
 314   /// The actual number of constructed elements is min(actualCapacity_,
 315   /// size_)
 316   Atom<uint32_t> size_;
 317
 318   /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are
 319   /// actually constructed.  Note that slots_[0] is not constructed or used
 320   FOLLY_ALIGN_TO_AVOID_FALSE_SHARING Slot* slots_;
 321
 322   /// use AccessSpreader to find your list.  We use stripes instead of
 323   /// thread-local to avoid the need to grow or shrink on thread start
 324   /// or join.   These are heads of lists chained with localNext
 325   LocalList local_[NumLocalLists];
 326
 327   /// this is the head of a list of node chained by globalNext, that are
 328   /// themselves each the head of a list chained by localNext
 329   FOLLY_ALIGN_TO_AVOID_FALSE_SHARING AtomicStruct<TaggedPtr,Atom> globalHead_;
 330
 331   ///////////// private methods
 332
 333   uint32_t slotIndex(uint32_t idx) const {
 334     assert(0 < idx &&
 335            idx <= actualCapacity_ &&
 336            idx <= size_.load(std::memory_order_acquire));
 337     return idx;
 338   }
 339
 340   Slot& slot(uint32_t idx) {
 341     return slots_[slotIndex(idx)];
 342   }
 343
 344   const Slot& slot(uint32_t idx) const {
 345     return slots_[slotIndex(idx)];
 346   }
 347
 348   // localHead references a full list chained by localNext.  s should
 349   // reference slot(localHead), it is passed as a micro-optimization
 350   void globalPush(Slot& s, uint32_t localHead) {
 351     while (true) {
 352       TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
 353       s.globalNext.store(gh.idx, std::memory_order_relaxed);
 354       if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) {
 355         // success
 356         return;
 357       }
 358     }
 359   }
 360
 361   // idx references a single node
 362   void localPush(AtomicStruct<TaggedPtr,Atom>& head, uint32_t idx) {
 363     Slot& s = slot(idx);
 364     TaggedPtr h = head.load(std::memory_order_acquire);
 365     while (true) {
 366       s.localNext.store(h.idx, std::memory_order_relaxed);
 367
 368       if (h.size() == LocalListLimit) {
 369         // push will overflow local list, steal it instead
 370         if (head.compare_exchange_strong(h, h.withEmpty())) {
 371           // steal was successful, put everything in the global list
 372           globalPush(s, idx);
 373           return;
 374         }
 375       } else {
 376         // local list has space
 377         if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) {
 378           // success
 379           return;
 380         }
 381       }
 382       // h was updated by failing CAS
 383     }
 384   }
 385
 386   // returns 0 if empty
 387   uint32_t globalPop() {
 388     while (true) {
 389       TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
 390       if (gh.idx == 0 ||
 391           globalHead_.compare_exchange_strong(
 392               gh,
 393               gh.withIdx(
 394                   slot(gh.idx).globalNext.load(std::memory_order_relaxed)))) {
 395         // global list is empty, or pop was successful
 396         return gh.idx;
 397       }
 398     }
 399   }
 400
 401   // returns 0 if allocation failed
 402   uint32_t localPop(AtomicStruct<TaggedPtr,Atom>& head) {
 403     while (true) {
 404       TaggedPtr h = head.load(std::memory_order_acquire);
 405       if (h.idx != 0) {
 406         // local list is non-empty, try to pop
 407         Slot& s = slot(h.idx);
 408         auto next = s.localNext.load(std::memory_order_relaxed);
 409         if (head.compare_exchange_strong(h, h.withIdx(next).withSizeDecr())) {
 410           // success
 411           s.localNext.store(uint32_t(-1), std::memory_order_relaxed);
 412           return h.idx;
 413         }
 414         continue;
 415       }
 416
 417       uint32_t idx = globalPop();
 418       if (idx == 0) {
 419         // global list is empty, allocate and construct new slot
 420         if (size_.load(std::memory_order_relaxed) >= actualCapacity_ ||
 421             (idx = ++size_) > actualCapacity_) {
 422           // allocation failed
 423           return 0;
 424         }
 425         // default-construct it now if we aren't going to construct and
 426         // destroy on each allocation
 427         if (!eagerRecycle()) {
 428           T* ptr = &slot(idx).elem;
 429           new (ptr) T();
 430         }
 431         slot(idx).localNext.store(uint32_t(-1), std::memory_order_relaxed);
 432         return idx;
 433       }
 434
 435       Slot& s = slot(idx);
 436       auto next = s.localNext.load(std::memory_order_relaxed);
 437       if (head.compare_exchange_strong(
 438               h, h.withIdx(next).withSize(LocalListLimit))) {
 439         // global list moved to local list, keep head for us
 440         s.localNext.store(uint32_t(-1), std::memory_order_relaxed);
 441         return idx;
 442       }
 443       // local bulk push failed, return idx to the global list and try again
 444       globalPush(s, idx);
 445     }
 446   }
 447
 448   AtomicStruct<TaggedPtr,Atom>& localHead() {
 449     auto stripe = detail::AccessSpreader<Atom>::current(NumLocalLists);
 450     return local_[stripe].head;
 451   }
 452 };
 453
 454 namespace detail {
 455
 456 /// This is a stateful Deleter functor, which allows std::unique_ptr
 457 /// to track elements allocated from an IndexedMemPool by tracking the
 458 /// associated pool.  See IndexedMemPool::allocElem.
 459 template <typename Pool>
 460 struct IndexedMemPoolRecycler {
 461   Pool* pool;
 462
 463   explicit IndexedMemPoolRecycler(Pool* pool) : pool(pool) {}
 464
 465   IndexedMemPoolRecycler(const IndexedMemPoolRecycler<Pool>& rhs)
 466       = default;
 467   IndexedMemPoolRecycler& operator= (const IndexedMemPoolRecycler<Pool>& rhs)
 468       = default;
 469
 470   void operator()(typename Pool::value_type* elem) const {
 471     pool->recycleIndex(pool->locateElem(elem));
 472   }
 473 };
 474
 475 }
 476
 477 } // namespace folly
 478
 479 # pragma GCC diagnostic pop