folly/detail/ThreadLocalDetail.h

   1 /*
   2  * Copyright 2015 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_DETAIL_THREADLOCALDETAIL_H_
  18 #define FOLLY_DETAIL_THREADLOCALDETAIL_H_
  19
  20 #include <limits.h>
  21 #include <pthread.h>
  22
  23 #include <mutex>
  24 #include <string>
  25 #include <vector>
  26
  27 #include <glog/logging.h>
  28
  29 #include <folly/Foreach.h>
  30 #include <folly/Exception.h>
  31 #include <folly/Malloc.h>
  32
  33 // In general, emutls cleanup is not guaranteed to play nice with the way
  34 // StaticMeta mixes direct pthread calls and the use of __thread. This has
  35 // caused problems on multiple platforms so don't use __thread there.
  36 //
  37 // XXX: Ideally we would instead determine if emutls is in use at runtime as it
  38 // is possible to configure glibc on Linux to use emutls regardless.
  39 #if !__APPLE__ && !__ANDROID__
  40 #define FOLLY_TLD_USE_FOLLY_TLS 1
  41 #else
  42 #undef FOLLY_TLD_USE_FOLLY_TLS
  43 #endif
  44
  45 namespace folly {
  46 namespace threadlocal_detail {
  47
  48 /**
  49  * Base class for deleters.
  50  */
  51 class DeleterBase {
  52  public:
  53   virtual ~DeleterBase() { }
  54   virtual void dispose(void* ptr, TLPDestructionMode mode) const = 0;
  55 };
  56
  57 /**
  58  * Simple deleter class that calls delete on the passed-in pointer.
  59  */
  60 template <class Ptr>
  61 class SimpleDeleter : public DeleterBase {
  62  public:
  63   virtual void dispose(void* ptr, TLPDestructionMode /*mode*/) const {
  64     delete static_cast<Ptr>(ptr);
  65   }
  66 };
  67
  68 /**
  69  * Custom deleter that calls a given callable.
  70  */
  71 template <class Ptr, class Deleter>
  72 class CustomDeleter : public DeleterBase {
  73  public:
  74   explicit CustomDeleter(Deleter d) : deleter_(d) { }
  75   virtual void dispose(void* ptr, TLPDestructionMode mode) const {
  76     deleter_(static_cast<Ptr>(ptr), mode);
  77   }
  78  private:
  79   Deleter deleter_;
  80 };
  81
  82
  83 /**
  84  * POD wrapper around an element (a void*) and an associated deleter.
  85  * This must be POD, as we memset() it to 0 and memcpy() it around.
  86  */
  87 struct ElementWrapper {
  88   bool dispose(TLPDestructionMode mode) {
  89     if (ptr == nullptr) {
  90       return false;
  91     }
  92
  93     DCHECK(deleter != nullptr);
  94     deleter->dispose(ptr, mode);
  95     cleanup();
  96     return true;
  97   }
  98
  99   void* release() {
 100     auto retPtr = ptr;
 101
 102     if (ptr != nullptr) {
 103       cleanup();
 104     }
 105
 106     return retPtr;
 107   }
 108
 109   template <class Ptr>
 110   void set(Ptr p) {
 111     DCHECK(ptr == nullptr);
 112     DCHECK(deleter == nullptr);
 113
 114     if (p) {
 115       // We leak a single object here but that is ok.  If we used an
 116       // object directly, there is a chance that the destructor will be
 117       // called on that static object before any of the ElementWrappers
 118       // are disposed and that isn't so nice.
 119       static auto d = new SimpleDeleter<Ptr>();
 120       ptr = p;
 121       deleter = d;
 122       ownsDeleter = false;
 123     }
 124   }
 125
 126   template <class Ptr, class Deleter>
 127   void set(Ptr p, Deleter d) {
 128     DCHECK(ptr == nullptr);
 129     DCHECK(deleter == nullptr);
 130     if (p) {
 131       ptr = p;
 132       deleter = new CustomDeleter<Ptr,Deleter>(d);
 133       ownsDeleter = true;
 134     }
 135   }
 136
 137   void cleanup() {
 138     if (ownsDeleter) {
 139       delete deleter;
 140     }
 141     ptr = nullptr;
 142     deleter = nullptr;
 143     ownsDeleter = false;
 144   }
 145
 146   void* ptr;
 147   DeleterBase* deleter;
 148   bool ownsDeleter;
 149 };
 150
 151 /**
 152  * Per-thread entry.  Each thread using a StaticMeta object has one.
 153  * This is written from the owning thread only (under the lock), read
 154  * from the owning thread (no lock necessary), and read from other threads
 155  * (under the lock).
 156  */
 157 struct ThreadEntry {
 158   ElementWrapper* elements;
 159   size_t elementsCapacity;
 160   ThreadEntry* next;
 161   ThreadEntry* prev;
 162 };
 163
 164 constexpr uint32_t kEntryIDInvalid = std::numeric_limits<uint32_t>::max();
 165
 166 struct PthreadKeyUnregisterTester;
 167
 168 /**
 169  * We want to disable onThreadExit call at the end of shutdown, we don't care
 170  * about leaking memory at that point.
 171  *
 172  * Otherwise if ThreadLocal is used in a shared library, onThreadExit may be
 173  * called after dlclose().
 174  *
 175  * This class has one single static instance; however since it's so widely used,
 176  * directly or indirectly, by so many classes, we need to take care to avoid
 177  * problems stemming from the Static Initialization/Destruction Order Fiascos.
 178  * Therefore this class needs to be constexpr-constructible, so as to avoid
 179  * the need for this to participate in init/destruction order.
 180  */
 181 class PthreadKeyUnregister {
 182  public:
 183   static constexpr size_t kMaxKeys = 1UL << 16;
 184
 185   ~PthreadKeyUnregister() {
 186     std::lock_guard<std::mutex> lg(mutex_);
 187     while (size_) {
 188       pthread_key_delete(keys_[--size_]);
 189     }
 190   }
 191
 192   static void registerKey(pthread_key_t key) {
 193     instance_.registerKeyImpl(key);
 194   }
 195
 196  private:
 197   /**
 198    * Only one global instance should exist, hence this is private.
 199    * See also the important note at the top of this class about `constexpr`
 200    * usage.
 201    */
 202   constexpr PthreadKeyUnregister() : mutex_(), size_(0), keys_() { }
 203   friend class folly::threadlocal_detail::PthreadKeyUnregisterTester;
 204
 205   void registerKeyImpl(pthread_key_t key) {
 206     std::lock_guard<std::mutex> lg(mutex_);
 207     CHECK_LT(size_, kMaxKeys);
 208     keys_[size_++] = key;
 209   }
 210
 211   std::mutex mutex_;
 212   size_t size_;
 213   pthread_key_t keys_[kMaxKeys];
 214
 215   static PthreadKeyUnregister instance_;
 216 };
 217
 218 // Held in a singleton to track our global instances.
 219 // We have one of these per "Tag", by default one for the whole system
 220 // (Tag=void).
 221 //
 222 // Creating and destroying ThreadLocalPtr objects, as well as thread exit
 223 // for threads that use ThreadLocalPtr objects collide on a lock inside
 224 // StaticMeta; you can specify multiple Tag types to break that lock.
 225 template <class Tag>
 226 struct StaticMeta {
 227   // Represents an ID of a thread local object. Initially set to the maximum
 228   // uint. This representation allows us to avoid a branch in accessing TLS data
 229   // (because if you test capacity > id if id = maxint then the test will always
 230   // fail). It allows us to keep a constexpr constructor and avoid SIOF.
 231   class EntryID {
 232    public:
 233     std::atomic<uint32_t> value;
 234
 235     constexpr EntryID() : value(kEntryIDInvalid) {
 236     }
 237
 238     EntryID(EntryID&& other) noexcept : value(other.value.load()) {
 239       other.value = kEntryIDInvalid;
 240     }
 241
 242     EntryID& operator=(EntryID&& other) {
 243       assert(this != &other);
 244       value = other.value.load();
 245       other.value = kEntryIDInvalid;
 246       return *this;
 247     }
 248
 249     EntryID(const EntryID& other) = delete;
 250     EntryID& operator=(const EntryID& other) = delete;
 251
 252     uint32_t getOrInvalid() {
 253       // It's OK for this to be relaxed, even though we're effectively doing
 254       // double checked locking in using this value. We only care about the
 255       // uniqueness of IDs, getOrAllocate does not modify any other memory
 256       // this thread will use.
 257       return value.load(std::memory_order_relaxed);
 258     }
 259
 260     uint32_t getOrAllocate() {
 261       uint32_t id = getOrInvalid();
 262       if (id != kEntryIDInvalid) {
 263         return id;
 264       }
 265       // The lock inside allocate ensures that a single value is allocated
 266       return instance().allocate(this);
 267     }
 268   };
 269
 270   static StaticMeta<Tag>& instance() {
 271     // Leak it on exit, there's only one per process and we don't have to
 272     // worry about synchronization with exiting threads.
 273     static bool constructed = (inst_ = new StaticMeta<Tag>());
 274     (void)constructed; // suppress unused warning
 275     return *inst_;
 276   }
 277
 278   uint32_t nextId_;
 279   std::vector<uint32_t> freeIds_;
 280   std::mutex lock_;
 281   pthread_key_t pthreadKey_;
 282   ThreadEntry head_;
 283
 284   void push_back(ThreadEntry* t) {
 285     t->next = &head_;
 286     t->prev = head_.prev;
 287     head_.prev->next = t;
 288     head_.prev = t;
 289   }
 290
 291   void erase(ThreadEntry* t) {
 292     t->next->prev = t->prev;
 293     t->prev->next = t->next;
 294     t->next = t->prev = t;
 295   }
 296
 297 #ifdef FOLLY_TLD_USE_FOLLY_TLS
 298   static FOLLY_TLS ThreadEntry threadEntry_;
 299 #endif
 300   static StaticMeta<Tag>* inst_;
 301
 302   StaticMeta() : nextId_(1) {
 303     head_.next = head_.prev = &head_;
 304     int ret = pthread_key_create(&pthreadKey_, &onThreadExit);
 305     checkPosixError(ret, "pthread_key_create failed");
 306     PthreadKeyUnregister::registerKey(pthreadKey_);
 307
 308 #if FOLLY_HAVE_PTHREAD_ATFORK
 309     ret = pthread_atfork(/*prepare*/ &StaticMeta::preFork,
 310                          /*parent*/ &StaticMeta::onForkParent,
 311                          /*child*/ &StaticMeta::onForkChild);
 312     checkPosixError(ret, "pthread_atfork failed");
 313 #elif !__ANDROID__ && !defined(_MSC_VER)
 314     // pthread_atfork is not part of the Android NDK at least as of n9d. If
 315     // something is trying to call native fork() directly at all with Android's
 316     // process management model, this is probably the least of the problems.
 317     //
 318     // But otherwise, this is a problem.
 319     #warning pthread_atfork unavailable
 320 #endif
 321   }
 322   ~StaticMeta() {
 323     LOG(FATAL) << "StaticMeta lives forever!";
 324   }
 325
 326   static ThreadEntry* getThreadEntry() {
 327 #ifdef FOLLY_TLD_USE_FOLLY_TLS
 328     return &threadEntry_;
 329 #else
 330     auto key = instance().pthreadKey_;
 331     ThreadEntry* threadEntry =
 332       static_cast<ThreadEntry*>(pthread_getspecific(key));
 333     if (!threadEntry) {
 334         threadEntry = new ThreadEntry();
 335         int ret = pthread_setspecific(key, threadEntry);
 336         checkPosixError(ret, "pthread_setspecific failed");
 337     }
 338     return threadEntry;
 339 #endif
 340   }
 341
 342   static void preFork(void) {
 343     instance().lock_.lock();  // Make sure it's created
 344   }
 345
 346   static void onForkParent(void) {
 347     inst_->lock_.unlock();
 348   }
 349
 350   static void onForkChild(void) {
 351     // only the current thread survives
 352     inst_->head_.next = inst_->head_.prev = &inst_->head_;
 353     ThreadEntry* threadEntry = getThreadEntry();
 354     // If this thread was in the list before the fork, add it back.
 355     if (threadEntry->elementsCapacity != 0) {
 356       inst_->push_back(threadEntry);
 357     }
 358     inst_->lock_.unlock();
 359   }
 360
 361   static void onThreadExit(void* ptr) {
 362     auto& meta = instance();
 363 #ifdef FOLLY_TLD_USE_FOLLY_TLS
 364     ThreadEntry* threadEntry = getThreadEntry();
 365
 366     DCHECK_EQ(ptr, &meta);
 367     DCHECK_GT(threadEntry->elementsCapacity, 0);
 368 #else
 369     // pthread sets the thread-specific value corresponding
 370     // to meta.pthreadKey_ to NULL before calling onThreadExit.
 371     // We need to set it back to ptr to enable the correct behaviour
 372     // of the subsequent calls of getThreadEntry
 373     // (which may happen in user-provided custom deleters)
 374     pthread_setspecific(meta.pthreadKey_, ptr);
 375     ThreadEntry* threadEntry = static_cast<ThreadEntry*>(ptr);
 376 #endif
 377     {
 378       std::lock_guard<std::mutex> g(meta.lock_);
 379       meta.erase(threadEntry);
 380       // No need to hold the lock any longer; the ThreadEntry is private to this
 381       // thread now that it's been removed from meta.
 382     }
 383     // NOTE: User-provided deleter / object dtor itself may be using ThreadLocal
 384     // with the same Tag, so dispose() calls below may (re)create some of the
 385     // elements or even increase elementsCapacity, thus multiple cleanup rounds
 386     // may be required.
 387     for (bool shouldRun = true; shouldRun; ) {
 388       shouldRun = false;
 389       FOR_EACH_RANGE(i, 0, threadEntry->elementsCapacity) {
 390         if (threadEntry->elements[i].dispose(TLPDestructionMode::THIS_THREAD)) {
 391           shouldRun = true;
 392         }
 393       }
 394     }
 395     free(threadEntry->elements);
 396     threadEntry->elements = nullptr;
 397     pthread_setspecific(meta.pthreadKey_, nullptr);
 398
 399 #ifndef FOLLY_TLD_USE_FOLLY_TLS
 400     // Allocated in getThreadEntry() when not using folly TLS; free it
 401     delete threadEntry;
 402 #endif
 403   }
 404
 405   static uint32_t allocate(EntryID* ent) {
 406     uint32_t id;
 407     auto & meta = instance();
 408     std::lock_guard<std::mutex> g(meta.lock_);
 409
 410     id = ent->value.load();
 411     if (id != kEntryIDInvalid) {
 412       return id;
 413     }
 414
 415     if (!meta.freeIds_.empty()) {
 416       id = meta.freeIds_.back();
 417       meta.freeIds_.pop_back();
 418     } else {
 419       id = meta.nextId_++;
 420     }
 421
 422     uint32_t old_id = ent->value.exchange(id);
 423     DCHECK_EQ(old_id, kEntryIDInvalid);
 424     return id;
 425   }
 426
 427   static void destroy(EntryID* ent) {
 428     try {
 429       auto & meta = instance();
 430       // Elements in other threads that use this id.
 431       std::vector<ElementWrapper> elements;
 432       {
 433         std::lock_guard<std::mutex> g(meta.lock_);
 434         uint32_t id = ent->value.exchange(kEntryIDInvalid);
 435         if (id == kEntryIDInvalid) {
 436           return;
 437         }
 438
 439         for (ThreadEntry* e = meta.head_.next; e != &meta.head_; e = e->next) {
 440           if (id < e->elementsCapacity && e->elements[id].ptr) {
 441             elements.push_back(e->elements[id]);
 442
 443             /*
 444              * Writing another thread's ThreadEntry from here is fine;
 445              * the only other potential reader is the owning thread --
 446              * from onThreadExit (which grabs the lock, so is properly
 447              * synchronized with us) or from get(), which also grabs
 448              * the lock if it needs to resize the elements vector.
 449              *
 450              * We can't conflict with reads for a get(id), because
 451              * it's illegal to call get on a thread local that's
 452              * destructing.
 453              */
 454             e->elements[id].ptr = nullptr;
 455             e->elements[id].deleter = nullptr;
 456             e->elements[id].ownsDeleter = false;
 457           }
 458         }
 459         meta.freeIds_.push_back(id);
 460       }
 461       // Delete elements outside the lock
 462       FOR_EACH(it, elements) {
 463         it->dispose(TLPDestructionMode::ALL_THREADS);
 464       }
 465     } catch (...) { // Just in case we get a lock error or something anyway...
 466       LOG(WARNING) << "Destructor discarding an exception that was thrown.";
 467     }
 468   }
 469
 470   /**
 471    * Reserve enough space in the ThreadEntry::elements for the item
 472    * @id to fit in.
 473    */
 474   static void reserve(EntryID* id) {
 475     auto& meta = instance();
 476     ThreadEntry* threadEntry = getThreadEntry();
 477     size_t prevCapacity = threadEntry->elementsCapacity;
 478
 479     uint32_t idval = id->getOrAllocate();
 480     if (prevCapacity > idval) {
 481       return;
 482     }
 483     // Growth factor < 2, see folly/docs/FBVector.md; + 5 to prevent
 484     // very slow start.
 485     size_t newCapacity = static_cast<size_t>((idval + 5) * 1.7);
 486     assert(newCapacity > prevCapacity);
 487     ElementWrapper* reallocated = nullptr;
 488
 489     // Need to grow. Note that we can't call realloc, as elements is
 490     // still linked in meta, so another thread might access invalid memory
 491     // after realloc succeeds. We'll copy by hand and update our ThreadEntry
 492     // under the lock.
 493     if (usingJEMalloc()) {
 494       bool success = false;
 495       size_t newByteSize = nallocx(newCapacity * sizeof(ElementWrapper), 0);
 496
 497       // Try to grow in place.
 498       //
 499       // Note that xallocx(MALLOCX_ZERO) will only zero newly allocated memory,
 500       // even if a previous allocation allocated more than we requested.
 501       // This is fine; we always use MALLOCX_ZERO with jemalloc and we
 502       // always expand our allocation to the real size.
 503       if (prevCapacity * sizeof(ElementWrapper) >=
 504           jemallocMinInPlaceExpandable) {
 505         success = (xallocx(threadEntry->elements, newByteSize, 0, MALLOCX_ZERO)
 506                    == newByteSize);
 507       }
 508
 509       // In-place growth failed.
 510       if (!success) {
 511         success = ((reallocated = static_cast<ElementWrapper*>(
 512                     mallocx(newByteSize, MALLOCX_ZERO))) != nullptr);
 513       }
 514
 515       if (success) {
 516         // Expand to real size
 517         assert(newByteSize / sizeof(ElementWrapper) >= newCapacity);
 518         newCapacity = newByteSize / sizeof(ElementWrapper);
 519       } else {
 520         throw std::bad_alloc();
 521       }
 522     } else {  // no jemalloc
 523       // calloc() is simpler than malloc() followed by memset(), and
 524       // potentially faster when dealing with a lot of memory, as it can get
 525       // already-zeroed pages from the kernel.
 526       reallocated = static_cast<ElementWrapper*>(
 527           calloc(newCapacity, sizeof(ElementWrapper)));
 528       if (!reallocated) {
 529         throw std::bad_alloc();
 530       }
 531     }
 532
 533     // Success, update the entry
 534     {
 535       std::lock_guard<std::mutex> g(meta.lock_);
 536
 537       if (prevCapacity == 0) {
 538         meta.push_back(threadEntry);
 539       }
 540
 541       if (reallocated) {
 542        /*
 543         * Note: we need to hold the meta lock when copying data out of
 544         * the old vector, because some other thread might be
 545         * destructing a ThreadLocal and writing to the elements vector
 546         * of this thread.
 547         */
 548         if (prevCapacity != 0) {
 549           memcpy(reallocated, threadEntry->elements,
 550                  sizeof(*reallocated) * prevCapacity);
 551         }
 552         std::swap(reallocated, threadEntry->elements);
 553       }
 554       threadEntry->elementsCapacity = newCapacity;
 555     }
 556
 557     free(reallocated);
 558
 559 #ifdef FOLLY_TLD_USE_FOLLY_TLS
 560     if (prevCapacity == 0) {
 561       pthread_setspecific(meta.pthreadKey_, &meta);
 562     }
 563 #endif
 564   }
 565
 566   static ElementWrapper& get(EntryID* ent) {
 567     ThreadEntry* threadEntry = getThreadEntry();
 568     uint32_t id = ent->getOrInvalid();
 569     // if id is invalid, it is equal to uint32_t's max value.
 570     // x <= max value is always true
 571     if (UNLIKELY(threadEntry->elementsCapacity <= id)) {
 572       reserve(ent);
 573       id = ent->getOrInvalid();
 574       assert(threadEntry->elementsCapacity > id);
 575     }
 576     return threadEntry->elements[id];
 577   }
 578 };
 579
 580 #ifdef FOLLY_TLD_USE_FOLLY_TLS
 581 template <class Tag>
 582 FOLLY_TLS ThreadEntry StaticMeta<Tag>::threadEntry_ = {nullptr, 0,
 583                                                        nullptr, nullptr};
 584 #endif
 585 template <class Tag> StaticMeta<Tag>* StaticMeta<Tag>::inst_ = nullptr;
 586
 587 }  // namespace threadlocal_detail
 588 }  // namespace folly
 589
 590 #endif /* FOLLY_DETAIL_THREADLOCALDETAIL_H_ */