folly/detail/ThreadLocalDetail.h

   1 /*
   2  * Copyright 2015 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_DETAIL_THREADLOCALDETAIL_H_
  18 #define FOLLY_DETAIL_THREADLOCALDETAIL_H_
  19
#include <limits.h>
#include <pthread.h>

#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <mutex>
#include <new>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include <glog/logging.h>

#include <folly/Exception.h>
#include <folly/Foreach.h>
#include <folly/Malloc.h>
#include <folly/MicroSpinLock.h>
  33
// In general, emutls cleanup is not guaranteed to play nice with the way
// StaticMeta mixes direct pthread calls and the use of __thread. This has
// caused problems on multiple platforms so don't use __thread there.
//
// XXX: Ideally we would instead determine if emutls is in use at runtime as it
// is possible to configure glibc on Linux to use emutls regardless.
//
// When FOLLY_TLD_USE_FOLLY_TLS is defined, StaticMeta keeps each thread's
// ThreadEntry in a FOLLY_TLS variable; when it is not defined, the
// ThreadEntry is heap-allocated and looked up through pthread_getspecific().
#if !__APPLE__ && !__ANDROID__
#define FOLLY_TLD_USE_FOLLY_TLS 1
#else
// Make sure a definition leaking in from elsewhere cannot enable the TLS path.
#undef FOLLY_TLD_USE_FOLLY_TLS
#endif
  45
  46 namespace folly {
  47 namespace threadlocal_detail {
  48
  49 /**
  50  * Base class for deleters.
  51  */
  52 class DeleterBase {
  53  public:
  54   virtual ~DeleterBase() { }
  55   virtual void dispose(void* ptr, TLPDestructionMode mode) const = 0;
  56 };
  57
  58 /**
  59  * Simple deleter class that calls delete on the passed-in pointer.
  60  */
  61 template <class Ptr>
  62 class SimpleDeleter : public DeleterBase {
  63  public:
  64   virtual void dispose(void* ptr, TLPDestructionMode /*mode*/) const {
  65     delete static_cast<Ptr>(ptr);
  66   }
  67 };
  68
  69 /**
  70  * Custom deleter that calls a given callable.
  71  */
  72 template <class Ptr, class Deleter>
  73 class CustomDeleter : public DeleterBase {
  74  public:
  75   explicit CustomDeleter(Deleter d) : deleter_(d) { }
  76   virtual void dispose(void* ptr, TLPDestructionMode mode) const {
  77     deleter_(static_cast<Ptr>(ptr), mode);
  78   }
  79  private:
  80   Deleter deleter_;
  81 };
  82
  83
  84 /**
  85  * POD wrapper around an element (a void*) and an associated deleter.
  86  * This must be POD, as we memset() it to 0 and memcpy() it around.
  87  */
  88 struct ElementWrapper {
  89   bool dispose(TLPDestructionMode mode) {
  90     if (ptr == nullptr) {
  91       return false;
  92     }
  93
  94     DCHECK(deleter != nullptr);
  95     deleter->dispose(ptr, mode);
  96     cleanup();
  97     return true;
  98   }
  99
 100   void* release() {
 101     auto retPtr = ptr;
 102
 103     if (ptr != nullptr) {
 104       cleanup();
 105     }
 106
 107     return retPtr;
 108   }
 109
 110   template <class Ptr>
 111   void set(Ptr p) {
 112     DCHECK(ptr == nullptr);
 113     DCHECK(deleter == nullptr);
 114
 115     if (p) {
 116       // We leak a single object here but that is ok.  If we used an
 117       // object directly, there is a chance that the destructor will be
 118       // called on that static object before any of the ElementWrappers
 119       // are disposed and that isn't so nice.
 120       static auto d = new SimpleDeleter<Ptr>();
 121       ptr = p;
 122       deleter = d;
 123       ownsDeleter = false;
 124     }
 125   }
 126
 127   template <class Ptr, class Deleter>
 128   void set(Ptr p, Deleter d) {
 129     DCHECK(ptr == nullptr);
 130     DCHECK(deleter == nullptr);
 131     if (p) {
 132       ptr = p;
 133       deleter = new CustomDeleter<Ptr,Deleter>(d);
 134       ownsDeleter = true;
 135     }
 136   }
 137
 138   void cleanup() {
 139     if (ownsDeleter) {
 140       delete deleter;
 141     }
 142     ptr = nullptr;
 143     deleter = nullptr;
 144     ownsDeleter = false;
 145   }
 146
 147   void* ptr;
 148   DeleterBase* deleter;
 149   bool ownsDeleter;
 150 };
 151
/**
 * Per-thread entry.  Each thread using a StaticMeta object has one.
 * This is written from the owning thread only (under the lock), read
 * from the owning thread (no lock necessary), and read from other threads
 * (under the lock).
 *
 * Note that the *contents* of the `elements` array may also be cleared by
 * another thread: StaticMeta::destroy() nulls out the slot of the id being
 * destroyed in every thread's array, while holding the lock.
 */
struct ThreadEntry {
  // Array of element slots for this thread, indexed by entry id.  Allocated
  // and grown by StaticMeta::reserve(); freed in StaticMeta::onThreadExit().
  ElementWrapper* elements;
  // Number of slots in `elements`; 0 until the first reserve() on this thread.
  size_t elementsCapacity;
  // Links in the circular doubly-linked list of entries whose sentinel node is
  // StaticMeta::head_ (see StaticMeta::push_back() / erase()).
  ThreadEntry* next;
  ThreadEntry* prev;
};
 164
 165 constexpr uint32_t kEntryIDInvalid = std::numeric_limits<uint32_t>::max();
 166
 167 class PthreadKeyUnregisterTester;
 168
 169 /**
 170  * We want to disable onThreadExit call at the end of shutdown, we don't care
 171  * about leaking memory at that point.
 172  *
 173  * Otherwise if ThreadLocal is used in a shared library, onThreadExit may be
 174  * called after dlclose().
 175  *
 176  * This class has one single static instance; however since it's so widely used,
 177  * directly or indirectly, by so many classes, we need to take care to avoid
 178  * problems stemming from the Static Initialization/Destruction Order Fiascos.
 179  * Therefore this class needs to be constexpr-constructible, so as to avoid
 180  * the need for this to participate in init/destruction order.
 181  */
 182 class PthreadKeyUnregister {
 183  public:
 184   static constexpr size_t kMaxKeys = 1UL << 16;
 185
 186   ~PthreadKeyUnregister() {
 187     MSLGuard lg(lock_);
 188     while (size_) {
 189       pthread_key_delete(keys_[--size_]);
 190     }
 191   }
 192
 193   static void registerKey(pthread_key_t key) {
 194     instance_.registerKeyImpl(key);
 195   }
 196
 197  private:
 198   /**
 199    * Only one global instance should exist, hence this is private.
 200    * See also the important note at the top of this class about `constexpr`
 201    * usage.
 202    */
 203   constexpr PthreadKeyUnregister() : lock_(), size_(0), keys_() { }
 204   friend class folly::threadlocal_detail::PthreadKeyUnregisterTester;
 205
 206   void registerKeyImpl(pthread_key_t key) {
 207     MSLGuard lg(lock_);
 208     if (size_ == kMaxKeys) {
 209       throw std::logic_error("pthread_key limit has already been reached");
 210     }
 211     keys_[size_++] = key;
 212   }
 213
 214   MicroSpinLock lock_;
 215   size_t size_;
 216   pthread_key_t keys_[kMaxKeys];
 217
 218   static PthreadKeyUnregister instance_;
 219 };
 220
 221 // Held in a singleton to track our global instances.
 222 // We have one of these per "Tag", by default one for the whole system
 223 // (Tag=void).
 224 //
 225 // Creating and destroying ThreadLocalPtr objects, as well as thread exit
 226 // for threads that use ThreadLocalPtr objects collide on a lock inside
 227 // StaticMeta; you can specify multiple Tag types to break that lock.
 228 template <class Tag>
 229 struct StaticMeta {
 230   // Represents an ID of a thread local object. Initially set to the maximum
 231   // uint. This representation allows us to avoid a branch in accessing TLS data
 232   // (because if you test capacity > id if id = maxint then the test will always
 233   // fail). It allows us to keep a constexpr constructor and avoid SIOF.
 234   class EntryID {
 235    public:
 236     std::atomic<uint32_t> value;
 237
 238     constexpr EntryID() : value(kEntryIDInvalid) {
 239     }
 240
 241     EntryID(EntryID&& other) noexcept : value(other.value.load()) {
 242       other.value = kEntryIDInvalid;
 243     }
 244
 245     EntryID& operator=(EntryID&& other) {
 246       assert(this != &other);
 247       value = other.value.load();
 248       other.value = kEntryIDInvalid;
 249       return *this;
 250     }
 251
 252     EntryID(const EntryID& other) = delete;
 253     EntryID& operator=(const EntryID& other) = delete;
 254
 255     uint32_t getOrInvalid() {
 256       // It's OK for this to be relaxed, even though we're effectively doing
 257       // double checked locking in using this value. We only care about the
 258       // uniqueness of IDs, getOrAllocate does not modify any other memory
 259       // this thread will use.
 260       return value.load(std::memory_order_relaxed);
 261     }
 262
 263     uint32_t getOrAllocate() {
 264       uint32_t id = getOrInvalid();
 265       if (id != kEntryIDInvalid) {
 266         return id;
 267       }
 268       // The lock inside allocate ensures that a single value is allocated
 269       return instance().allocate(this);
 270     }
 271   };
 272
 273   static StaticMeta<Tag>& instance() {
 274     // Leak it on exit, there's only one per process and we don't have to
 275     // worry about synchronization with exiting threads.
 276     static bool constructed = (inst_ = new StaticMeta<Tag>());
 277     (void)constructed; // suppress unused warning
 278     return *inst_;
 279   }
 280
 281   uint32_t nextId_;
 282   std::vector<uint32_t> freeIds_;
 283   std::mutex lock_;
 284   pthread_key_t pthreadKey_;
 285   ThreadEntry head_;
 286
 287   void push_back(ThreadEntry* t) {
 288     t->next = &head_;
 289     t->prev = head_.prev;
 290     head_.prev->next = t;
 291     head_.prev = t;
 292   }
 293
 294   void erase(ThreadEntry* t) {
 295     t->next->prev = t->prev;
 296     t->prev->next = t->next;
 297     t->next = t->prev = t;
 298   }
 299
 300 #ifdef FOLLY_TLD_USE_FOLLY_TLS
 301   static FOLLY_TLS ThreadEntry threadEntry_;
 302 #endif
 303   static StaticMeta<Tag>* inst_;
 304
 305   StaticMeta() : nextId_(1) {
 306     head_.next = head_.prev = &head_;
 307     int ret = pthread_key_create(&pthreadKey_, &onThreadExit);
 308     checkPosixError(ret, "pthread_key_create failed");
 309     PthreadKeyUnregister::registerKey(pthreadKey_);
 310
 311 #if FOLLY_HAVE_PTHREAD_ATFORK
 312     ret = pthread_atfork(/*prepare*/ &StaticMeta::preFork,
 313                          /*parent*/ &StaticMeta::onForkParent,
 314                          /*child*/ &StaticMeta::onForkChild);
 315     checkPosixError(ret, "pthread_atfork failed");
 316 #elif !__ANDROID__ && !defined(_MSC_VER)
 317     // pthread_atfork is not part of the Android NDK at least as of n9d. If
 318     // something is trying to call native fork() directly at all with Android's
 319     // process management model, this is probably the least of the problems.
 320     //
 321     // But otherwise, this is a problem.
 322     #warning pthread_atfork unavailable
 323 #endif
 324   }
 325   ~StaticMeta() {
 326     LOG(FATAL) << "StaticMeta lives forever!";
 327   }
 328
 329   static ThreadEntry* getThreadEntry() {
 330 #ifdef FOLLY_TLD_USE_FOLLY_TLS
 331     return &threadEntry_;
 332 #else
 333     auto key = instance().pthreadKey_;
 334     ThreadEntry* threadEntry =
 335       static_cast<ThreadEntry*>(pthread_getspecific(key));
 336     if (!threadEntry) {
 337         threadEntry = new ThreadEntry();
 338         int ret = pthread_setspecific(key, threadEntry);
 339         checkPosixError(ret, "pthread_setspecific failed");
 340     }
 341     return threadEntry;
 342 #endif
 343   }
 344
 345   static void preFork(void) {
 346     instance().lock_.lock();  // Make sure it's created
 347   }
 348
 349   static void onForkParent(void) {
 350     inst_->lock_.unlock();
 351   }
 352
 353   static void onForkChild(void) {
 354     // only the current thread survives
 355     inst_->head_.next = inst_->head_.prev = &inst_->head_;
 356     ThreadEntry* threadEntry = getThreadEntry();
 357     // If this thread was in the list before the fork, add it back.
 358     if (threadEntry->elementsCapacity != 0) {
 359       inst_->push_back(threadEntry);
 360     }
 361     inst_->lock_.unlock();
 362   }
 363
 364   static void onThreadExit(void* ptr) {
 365     auto& meta = instance();
 366 #ifdef FOLLY_TLD_USE_FOLLY_TLS
 367     ThreadEntry* threadEntry = getThreadEntry();
 368
 369     DCHECK_EQ(ptr, &meta);
 370     DCHECK_GT(threadEntry->elementsCapacity, 0);
 371 #else
 372     // pthread sets the thread-specific value corresponding
 373     // to meta.pthreadKey_ to NULL before calling onThreadExit.
 374     // We need to set it back to ptr to enable the correct behaviour
 375     // of the subsequent calls of getThreadEntry
 376     // (which may happen in user-provided custom deleters)
 377     pthread_setspecific(meta.pthreadKey_, ptr);
 378     ThreadEntry* threadEntry = static_cast<ThreadEntry*>(ptr);
 379 #endif
 380     {
 381       std::lock_guard<std::mutex> g(meta.lock_);
 382       meta.erase(threadEntry);
 383       // No need to hold the lock any longer; the ThreadEntry is private to this
 384       // thread now that it's been removed from meta.
 385     }
 386     // NOTE: User-provided deleter / object dtor itself may be using ThreadLocal
 387     // with the same Tag, so dispose() calls below may (re)create some of the
 388     // elements or even increase elementsCapacity, thus multiple cleanup rounds
 389     // may be required.
 390     for (bool shouldRun = true; shouldRun; ) {
 391       shouldRun = false;
 392       FOR_EACH_RANGE(i, 0, threadEntry->elementsCapacity) {
 393         if (threadEntry->elements[i].dispose(TLPDestructionMode::THIS_THREAD)) {
 394           shouldRun = true;
 395         }
 396       }
 397     }
 398     free(threadEntry->elements);
 399     threadEntry->elements = nullptr;
 400     pthread_setspecific(meta.pthreadKey_, nullptr);
 401
 402 #ifndef FOLLY_TLD_USE_FOLLY_TLS
 403     // Allocated in getThreadEntry() when not using folly TLS; free it
 404     delete threadEntry;
 405 #endif
 406   }
 407
 408   static uint32_t allocate(EntryID* ent) {
 409     uint32_t id;
 410     auto & meta = instance();
 411     std::lock_guard<std::mutex> g(meta.lock_);
 412
 413     id = ent->value.load();
 414     if (id != kEntryIDInvalid) {
 415       return id;
 416     }
 417
 418     if (!meta.freeIds_.empty()) {
 419       id = meta.freeIds_.back();
 420       meta.freeIds_.pop_back();
 421     } else {
 422       id = meta.nextId_++;
 423     }
 424
 425     uint32_t old_id = ent->value.exchange(id);
 426     DCHECK_EQ(old_id, kEntryIDInvalid);
 427     return id;
 428   }
 429
 430   static void destroy(EntryID* ent) {
 431     try {
 432       auto & meta = instance();
 433       // Elements in other threads that use this id.
 434       std::vector<ElementWrapper> elements;
 435       {
 436         std::lock_guard<std::mutex> g(meta.lock_);
 437         uint32_t id = ent->value.exchange(kEntryIDInvalid);
 438         if (id == kEntryIDInvalid) {
 439           return;
 440         }
 441
 442         for (ThreadEntry* e = meta.head_.next; e != &meta.head_; e = e->next) {
 443           if (id < e->elementsCapacity && e->elements[id].ptr) {
 444             elements.push_back(e->elements[id]);
 445
 446             /*
 447              * Writing another thread's ThreadEntry from here is fine;
 448              * the only other potential reader is the owning thread --
 449              * from onThreadExit (which grabs the lock, so is properly
 450              * synchronized with us) or from get(), which also grabs
 451              * the lock if it needs to resize the elements vector.
 452              *
 453              * We can't conflict with reads for a get(id), because
 454              * it's illegal to call get on a thread local that's
 455              * destructing.
 456              */
 457             e->elements[id].ptr = nullptr;
 458             e->elements[id].deleter = nullptr;
 459             e->elements[id].ownsDeleter = false;
 460           }
 461         }
 462         meta.freeIds_.push_back(id);
 463       }
 464       // Delete elements outside the lock
 465       FOR_EACH(it, elements) {
 466         it->dispose(TLPDestructionMode::ALL_THREADS);
 467       }
 468     } catch (...) { // Just in case we get a lock error or something anyway...
 469       LOG(WARNING) << "Destructor discarding an exception that was thrown.";
 470     }
 471   }
 472
 473   /**
 474    * Reserve enough space in the ThreadEntry::elements for the item
 475    * @id to fit in.
 476    */
 477   static void reserve(EntryID* id) {
 478     auto& meta = instance();
 479     ThreadEntry* threadEntry = getThreadEntry();
 480     size_t prevCapacity = threadEntry->elementsCapacity;
 481
 482     uint32_t idval = id->getOrAllocate();
 483     if (prevCapacity > idval) {
 484       return;
 485     }
 486     // Growth factor < 2, see folly/docs/FBVector.md; + 5 to prevent
 487     // very slow start.
 488     size_t newCapacity = static_cast<size_t>((idval + 5) * 1.7);
 489     assert(newCapacity > prevCapacity);
 490     ElementWrapper* reallocated = nullptr;
 491
 492     // Need to grow. Note that we can't call realloc, as elements is
 493     // still linked in meta, so another thread might access invalid memory
 494     // after realloc succeeds. We'll copy by hand and update our ThreadEntry
 495     // under the lock.
 496     if (usingJEMalloc()) {
 497       bool success = false;
 498       size_t newByteSize = nallocx(newCapacity * sizeof(ElementWrapper), 0);
 499
 500       // Try to grow in place.
 501       //
 502       // Note that xallocx(MALLOCX_ZERO) will only zero newly allocated memory,
 503       // even if a previous allocation allocated more than we requested.
 504       // This is fine; we always use MALLOCX_ZERO with jemalloc and we
 505       // always expand our allocation to the real size.
 506       if (prevCapacity * sizeof(ElementWrapper) >=
 507           jemallocMinInPlaceExpandable) {
 508         success = (xallocx(threadEntry->elements, newByteSize, 0, MALLOCX_ZERO)
 509                    == newByteSize);
 510       }
 511
 512       // In-place growth failed.
 513       if (!success) {
 514         success = ((reallocated = static_cast<ElementWrapper*>(
 515                     mallocx(newByteSize, MALLOCX_ZERO))) != nullptr);
 516       }
 517
 518       if (success) {
 519         // Expand to real size
 520         assert(newByteSize / sizeof(ElementWrapper) >= newCapacity);
 521         newCapacity = newByteSize / sizeof(ElementWrapper);
 522       } else {
 523         throw std::bad_alloc();
 524       }
 525     } else {  // no jemalloc
 526       // calloc() is simpler than malloc() followed by memset(), and
 527       // potentially faster when dealing with a lot of memory, as it can get
 528       // already-zeroed pages from the kernel.
 529       reallocated = static_cast<ElementWrapper*>(
 530           calloc(newCapacity, sizeof(ElementWrapper)));
 531       if (!reallocated) {
 532         throw std::bad_alloc();
 533       }
 534     }
 535
 536     // Success, update the entry
 537     {
 538       std::lock_guard<std::mutex> g(meta.lock_);
 539
 540       if (prevCapacity == 0) {
 541         meta.push_back(threadEntry);
 542       }
 543
 544       if (reallocated) {
 545        /*
 546         * Note: we need to hold the meta lock when copying data out of
 547         * the old vector, because some other thread might be
 548         * destructing a ThreadLocal and writing to the elements vector
 549         * of this thread.
 550         */
 551         if (prevCapacity != 0) {
 552           memcpy(reallocated, threadEntry->elements,
 553                  sizeof(*reallocated) * prevCapacity);
 554         }
 555         std::swap(reallocated, threadEntry->elements);
 556       }
 557       threadEntry->elementsCapacity = newCapacity;
 558     }
 559
 560     free(reallocated);
 561
 562 #ifdef FOLLY_TLD_USE_FOLLY_TLS
 563     if (prevCapacity == 0) {
 564       pthread_setspecific(meta.pthreadKey_, &meta);
 565     }
 566 #endif
 567   }
 568
 569   static ElementWrapper& get(EntryID* ent) {
 570     ThreadEntry* threadEntry = getThreadEntry();
 571     uint32_t id = ent->getOrInvalid();
 572     // if id is invalid, it is equal to uint32_t's max value.
 573     // x <= max value is always true
 574     if (UNLIKELY(threadEntry->elementsCapacity <= id)) {
 575       reserve(ent);
 576       id = ent->getOrInvalid();
 577       assert(threadEntry->elementsCapacity > id);
 578     }
 579     return threadEntry->elements[id];
 580   }
 581 };
 582
// Out-of-class definitions of StaticMeta's static data members.
#ifdef FOLLY_TLD_USE_FOLLY_TLS
// Per-thread entry; starts out empty: no element array, zero capacity, and
// null list links (elements, elementsCapacity, next, prev).
template <class Tag>
FOLLY_TLS ThreadEntry StaticMeta<Tag>::threadEntry_ = {nullptr, 0,
                                                       nullptr, nullptr};
#endif
// Singleton pointer; assigned on the first call to StaticMeta<Tag>::instance().
template <class Tag> StaticMeta<Tag>* StaticMeta<Tag>::inst_ = nullptr;
 589
 590 }  // namespace threadlocal_detail
 591 }  // namespace folly
 592
 593 #endif /* FOLLY_DETAIL_THREADLOCALDETAIL_H_ */