X-Git-Url: http://plrg.eecs.uci.edu/git/?p=folly.git;a=blobdiff_plain;f=folly%2Fdetail%2FThreadLocalDetail.h;h=a88bfbb1d36a9e8f14b5db9455be3d029093a0d1;hp=9efd8cab127a0e54151f21fe24f1ad84b18df74e;hb=d4aacd244f21e76dce685365acc281a9015897c1;hpb=22afce906d7e98d95f8c45c3301072d9fd891d41

diff --git a/folly/detail/ThreadLocalDetail.h b/folly/detail/ThreadLocalDetail.h
index 9efd8cab..a88bfbb1 100644
--- a/folly/detail/ThreadLocalDetail.h
+++ b/folly/detail/ThreadLocalDetail.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2014 Facebook, Inc.
+ * Copyright 2017 Facebook, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,111 +14,133 @@
  * limitations under the License.
  */
 
-#ifndef FOLLY_DETAIL_THREADLOCALDETAIL_H_
-#define FOLLY_DETAIL_THREADLOCALDETAIL_H_
+#pragma once
 
 #include
-#include
+#include
+#include
 #include
 #include
 #include
 #include
 
-#include "folly/Foreach.h"
-#include "folly/Exception.h"
-#include "folly/Malloc.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+// In general, emutls cleanup is not guaranteed to play nice with the way
+// StaticMeta mixes direct pthread calls and the use of __thread. This has
+// caused problems on multiple platforms so don't use __thread there.
+//
+// XXX: Ideally we would instead determine if emutls is in use at runtime as it
+// is possible to configure glibc on Linux to use emutls regardless.
+#if !FOLLY_MOBILE && !defined(__APPLE__) && !defined(_MSC_VER)
+#define FOLLY_TLD_USE_FOLLY_TLS 1
+#else
+#undef FOLLY_TLD_USE_FOLLY_TLS
+#endif
 
 namespace folly {
+
+enum class TLPDestructionMode { THIS_THREAD, ALL_THREADS };
+struct AccessModeStrict {};
+
 namespace threadlocal_detail {
 
 /**
- * Base class for deleters.
+ * POD wrapper around an element (a void*) and an associated deleter.
+ * This must be POD, as we memset() it to 0 and memcpy() it around.
  */
-class DeleterBase {
- public:
-  virtual ~DeleterBase() { }
-  virtual void dispose(void* ptr, TLPDestructionMode mode) const = 0;
-};
+struct ElementWrapper {
+  using DeleterFunType = void(void*, TLPDestructionMode);
 
-/**
- * Simple deleter class that calls delete on the passed-in pointer.
- */
-template <class Ptr>
-class SimpleDeleter : public DeleterBase {
- public:
-  virtual void dispose(void* ptr, TLPDestructionMode mode) const {
-    delete static_cast<Ptr>(ptr);
-  }
-};
+  bool dispose(TLPDestructionMode mode) {
+    if (ptr == nullptr) {
+      return false;
+    }
 
-/**
- * Custom deleter that calls a given callable.
- */
-template <class Ptr, class Deleter>
-class CustomDeleter : public DeleterBase {
- public:
-  explicit CustomDeleter(Deleter d) : deleter_(d) { }
-  virtual void dispose(void* ptr, TLPDestructionMode mode) const {
-    deleter_(static_cast<Ptr>(ptr), mode);
+    DCHECK(deleter1 != nullptr);
+    ownsDeleter ? (*deleter2)(ptr, mode) : (*deleter1)(ptr, mode);
+    cleanup();
+    return true;
   }
 
- private:
-  Deleter deleter_;
-};
+  void* release() {
+    auto retPtr = ptr;
 
-/**
- * POD wrapper around an element (a void*) and an associated deleter.
- * This must be POD, as we memset() it to 0 and memcpy() it around.
- */
-struct ElementWrapper {
-  void dispose(TLPDestructionMode mode) {
-    if (ptr != NULL) {
-      DCHECK(deleter != NULL);
-      deleter->dispose(ptr, mode);
-      if (ownsDeleter) {
-        delete deleter;
-      }
-      ptr = NULL;
-      deleter = NULL;
-      ownsDeleter = false;
+    if (ptr != nullptr) {
+      cleanup();
     }
+
+    return retPtr;
   }
 
   template <class Ptr>
   void set(Ptr p) {
-    DCHECK(ptr == NULL);
-    DCHECK(deleter == NULL);
+    auto guard = makeGuard([&] { delete p; });
+    DCHECK(ptr == nullptr);
+    DCHECK(deleter1 == nullptr);
+
     if (p) {
-      // We leak a single object here but that is ok.  If we used an
-      // object directly, there is a chance that the destructor will be
-      // called on that static object before any of the ElementWrappers
-      // are disposed and that isn't so nice.
-      static auto d = new SimpleDeleter<Ptr>();
       ptr = p;
-      deleter = d;
+      deleter1 = [](void* pt, TLPDestructionMode) {
+        delete static_cast<Ptr>(pt);
+      };
       ownsDeleter = false;
+      guard.dismiss();
     }
   }
 
   template <class Ptr, class Deleter>
-  void set(Ptr p, Deleter d) {
-    DCHECK(ptr == NULL);
-    DCHECK(deleter == NULL);
+  void set(Ptr p, const Deleter& d) {
+    auto guard = makeGuard([&] {
+      if (p) {
+        d(p, TLPDestructionMode::THIS_THREAD);
+      }
+    });
+    DCHECK(ptr == nullptr);
+    DCHECK(deleter2 == nullptr);
     if (p) {
       ptr = p;
-      deleter = new CustomDeleter<Ptr, Deleter>(d);
+      auto d2 = d; // gcc-4.8 doesn't decay types correctly in lambda captures
+      deleter2 = new std::function<DeleterFunType>(
+          [d2](void* pt, TLPDestructionMode mode) {
+            d2(static_cast<Ptr>(pt), mode);
+          });
       ownsDeleter = true;
+      guard.dismiss();
+    }
+  }
+
+  void cleanup() {
+    if (ownsDeleter) {
+      delete deleter2;
     }
+    ptr = nullptr;
+    deleter1 = nullptr;
+    ownsDeleter = false;
   }
 
   void* ptr;
-  DeleterBase* deleter;
+  union {
+    DeleterFunType* deleter1;
+    std::function<DeleterFunType>* deleter2;
+  };
   bool ownsDeleter;
 };
 
+struct StaticMetaBase;
+
 /**
  * Per-thread entry. Each thread using a StaticMeta object has one.
  * This is written from the owning thread only (under the lock), read
@@ -126,294 +148,254 @@ struct ElementWrapper {
  * (under the lock).
  */
 struct ThreadEntry {
-  ElementWrapper* elements;
-  size_t elementsCapacity;
-  ThreadEntry* next;
-  ThreadEntry* prev;
+  ElementWrapper* elements{nullptr};
+  size_t elementsCapacity{0};
+  ThreadEntry* next{nullptr};
+  ThreadEntry* prev{nullptr};
+  StaticMetaBase* meta{nullptr};
 };
 
-// Held in a singleton to track our global instances.
-// We have one of these per "Tag", by default one for the whole system
-// (Tag=void).
-//
-// Creating and destroying ThreadLocalPtr objects, as well as thread exit
-// for threads that use ThreadLocalPtr objects collide on a lock inside
-// StaticMeta; you can specify multiple Tag types to break that lock.
-template <class Tag>
-struct StaticMeta {
-  static StaticMeta<Tag>& instance() {
-    // Leak it on exit, there's only one per process and we don't have to
-    // worry about synchronization with exiting threads.
-    static bool constructed = (inst_ = new StaticMeta<Tag>());
-    (void)constructed; // suppress unused warning
-    return *inst_;
-  }
+constexpr uint32_t kEntryIDInvalid = std::numeric_limits<uint32_t>::max();
 
-  int nextId_;
-  std::vector<int> freeIds_;
-  std::mutex lock_;
-  pthread_key_t pthreadKey_;
-  ThreadEntry head_;
+struct PthreadKeyUnregisterTester;
 
-  void push_back(ThreadEntry* t) {
-    t->next = &head_;
-    t->prev = head_.prev;
-    head_.prev->next = t;
-    head_.prev = t;
-  }
+/**
+ * We want to disable onThreadExit call at the end of shutdown, we don't care
+ * about leaking memory at that point.
+ *
+ * Otherwise if ThreadLocal is used in a shared library, onThreadExit may be
+ * called after dlclose().
+ *
+ * This class has a single static instance; however, since it's so widely used,
+ * directly or indirectly, by so many classes, we need to take care to avoid
+ * problems stemming from the Static Initialization/Destruction Order Fiascos.
+ * Therefore this class needs to be constexpr-constructible, so as to avoid
+ * the need for this to participate in init/destruction order.
+ */
+class PthreadKeyUnregister {
+ public:
+  static constexpr size_t kMaxKeys = 1UL << 16;
+
+  ~PthreadKeyUnregister() {
+    // If static constructor priorities are not supported then
+    // ~PthreadKeyUnregister logic is not safe.
+#if !defined(__APPLE__) && !defined(_MSC_VER)
+    MSLGuard lg(lock_);
+    while (size_) {
+      pthread_key_delete(keys_[--size_]);
+    }
+#endif
+  }
 
-  void erase(ThreadEntry* t) {
-    t->next->prev = t->prev;
-    t->prev->next = t->next;
-    t->next = t->prev = t;
-  }
+  static void registerKey(pthread_key_t key) {
+    instance_.registerKeyImpl(key);
+  }
 
-#if !__APPLE__
-  static __thread ThreadEntry threadEntry_;
-#endif
-  static StaticMeta<Tag>* inst_;
-
-  StaticMeta() : nextId_(1) {
-    head_.next = head_.prev = &head_;
-    int ret = pthread_key_create(&pthreadKey_, &onThreadExit);
-    checkPosixError(ret, "pthread_key_create failed");
-
-    ret = pthread_atfork(/*prepare*/ &StaticMeta::preFork,
-                         /*parent*/ &StaticMeta::onForkParent,
-                         /*child*/ &StaticMeta::onForkChild);
-    checkPosixError(ret, "pthread_atfork failed");
-  }
-  ~StaticMeta() {
-    LOG(FATAL) << "StaticMeta lives forever!";
-  }
+ private:
+  /**
+   * Only one global instance should exist, hence this is private.
+   * See also the important note at the top of this class about `constexpr`
+   * usage.
+   */
+  constexpr PthreadKeyUnregister() : lock_(), size_(0), keys_() { }
+  friend struct folly::threadlocal_detail::PthreadKeyUnregisterTester;
 
-  static ThreadEntry* getThreadEntry() {
-#if !__APPLE__
-    return &threadEntry_;
-#else
-    ThreadEntry* threadEntry =
-        static_cast<ThreadEntry*>(pthread_getspecific(inst_->pthreadKey_));
-    if (!threadEntry) {
-      threadEntry = new ThreadEntry();
-      int ret = pthread_setspecific(inst_->pthreadKey_, threadEntry);
-      checkPosixError(ret, "pthread_setspecific failed");
-    }
-    return threadEntry;
-#endif
-  }
+  void registerKeyImpl(pthread_key_t key) {
+    MSLGuard lg(lock_);
+    if (size_ == kMaxKeys) {
+      throw std::logic_error("pthread_key limit has already been reached");
+    }
+    keys_[size_++] = key;
+  }
 
-  static void preFork(void) {
-    instance().lock_.lock();  // Make sure it's created
-  }
+  MicroSpinLock lock_;
+  size_t size_;
+  pthread_key_t keys_[kMaxKeys];
 
-  static void onForkParent(void) {
-    inst_->lock_.unlock();
-  }
+  static PthreadKeyUnregister instance_;
+};
 
-  static void onForkChild(void) {
-    // only the current thread survives
-    inst_->head_.next = inst_->head_.prev = &inst_->head_;
-    inst_->push_back(getThreadEntry());
-    inst_->lock_.unlock();
-  }
+struct StaticMetaBase {
+  // Represents an ID of a thread local object. Initially set to the maximum
+  // uint. This representation allows us to avoid a branch in accessing TLS
+  // data: while the id is still the maximum value, the test capacity > id
+  // always fails, so no separate "is this id allocated?" check is needed.
+  // It also allows us to keep a constexpr constructor and avoid SIOF.
+  class EntryID {
+   public:
+    std::atomic<uint32_t> value;
+
+    constexpr EntryID() : value(kEntryIDInvalid) {
+    }
+
+    EntryID(EntryID&& other) noexcept : value(other.value.load()) {
+      other.value = kEntryIDInvalid;
+    }
+
+    EntryID& operator=(EntryID&& other) {
+      assert(this != &other);
+      value = other.value.load();
+      other.value = kEntryIDInvalid;
+      return *this;
+    }
+
+    EntryID(const EntryID& other) = delete;
+    EntryID& operator=(const EntryID& other) = delete;
+
+    uint32_t getOrInvalid() {
+      // It's OK for this to be relaxed, even though we're effectively doing
+      // double checked locking in using this value. We only care about the
+      // uniqueness of IDs, getOrAllocate does not modify any other memory
+      // this thread will use.
+      return value.load(std::memory_order_relaxed);
+    }
 
-  static void onThreadExit(void* ptr) {
-    auto & meta = instance();
-#if !__APPLE__
-    ThreadEntry* threadEntry = getThreadEntry();
-
-    DCHECK_EQ(ptr, &meta);
-    DCHECK_GT(threadEntry->elementsCapacity, 0);
-#else
-    ThreadEntry* threadEntry = static_cast<ThreadEntry*>(ptr);
-#endif
-    {
-      std::lock_guard<std::mutex> g(meta.lock_);
-      meta.erase(threadEntry);
-      // No need to hold the lock any longer; the ThreadEntry is private to this
-      // thread now that it's been removed from meta.
-    }
-    FOR_EACH_RANGE(i, 0, threadEntry->elementsCapacity) {
-      threadEntry->elements[i].dispose(TLPDestructionMode::THIS_THREAD);
-    }
-    free(threadEntry->elements);
-    threadEntry->elements = NULL;
-    pthread_setspecific(meta.pthreadKey_, NULL);
-#if __APPLE__
-    // Allocated in getThreadEntry(); free it
-    delete threadEntry;
-#endif
-  }
+    uint32_t getOrAllocate(StaticMetaBase& meta) {
+      uint32_t id = getOrInvalid();
+      if (id != kEntryIDInvalid) {
+        return id;
+      }
+      // The lock inside allocate ensures that a single value is allocated
+      return meta.allocate(this);
+    }
+  };
 
-  static int create() {
-    int id;
-    auto & meta = instance();
-    std::lock_guard<std::mutex> g(meta.lock_);
-    if (!meta.freeIds_.empty()) {
-      id = meta.freeIds_.back();
-      meta.freeIds_.pop_back();
-    } else {
-      id = meta.nextId_++;
-    }
-    return id;
-  }
+  StaticMetaBase(ThreadEntry* (*threadEntry)(), bool strict);
+
+  [[noreturn]] ~StaticMetaBase() {
+    folly::assume_unreachable();
+  }
+
+  void push_back(ThreadEntry* t) {
+    t->next = &head_;
+    t->prev = head_.prev;
+    head_.prev->next = t;
+    head_.prev = t;
+  }
+
+  void erase(ThreadEntry* t) {
+    t->next->prev = t->prev;
+    t->prev->next = t->next;
+    t->next = t->prev = t;
+  }
+
+  static void onThreadExit(void* ptr);
+
+  uint32_t allocate(EntryID* ent);
+
+  void destroy(EntryID* ent);
 
-  static void destroy(int id) {
-    try {
-      auto & meta = instance();
-      // Elements in other threads that use this id.
-      std::vector<ElementWrapper> elements;
-      {
-        std::lock_guard<std::mutex> g(meta.lock_);
-        for (ThreadEntry* e = meta.head_.next; e != &meta.head_; e = e->next) {
-          if (id < e->elementsCapacity && e->elements[id].ptr) {
-            elements.push_back(e->elements[id]);
-
-            /*
-             * Writing another thread's ThreadEntry from here is fine;
-             * the only other potential reader is the owning thread --
-             * from onThreadExit (which grabs the lock, so is properly
-             * synchronized with us) or from get(), which also grabs
-             * the lock if it needs to resize the elements vector.
-             *
-             * We can't conflict with reads for a get(id), because
-             * it's illegal to call get on a thread local that's
-             * destructing.
-             */
-            e->elements[id].ptr = nullptr;
-            e->elements[id].deleter = nullptr;
-            e->elements[id].ownsDeleter = false;
-          }
-        }
-        meta.freeIds_.push_back(id);
-      }
-      // Delete elements outside the lock
-      FOR_EACH(it, elements) {
-        it->dispose(TLPDestructionMode::ALL_THREADS);
-      }
-    } catch (...) { // Just in case we get a lock error or something anyway...
-      LOG(WARNING) << "Destructor discarding an exception that was thrown.";
-    }
-  }
-
   /**
    * Reserve enough space in the ThreadEntry::elements for the item
    * @id to fit in.
    */
-  static void reserve(int id) {
-    auto& meta = instance();
-    ThreadEntry* threadEntry = getThreadEntry();
-    size_t prevCapacity = threadEntry->elementsCapacity;
-    // Growth factor < 2, see folly/docs/FBVector.md; + 5 to prevent
-    // very slow start.
-    size_t newCapacity = static_cast<size_t>((id + 5) * 1.7);
-    assert(newCapacity > prevCapacity);
-    ElementWrapper* reallocated = nullptr;
-
-    // Need to grow. Note that we can't call realloc, as elements is
-    // still linked in meta, so another thread might access invalid memory
-    // after realloc succeeds. We'll copy by hand and update our ThreadEntry
-    // under the lock.
-    if (usingJEMalloc()) {
-      bool success = false;
-      size_t newByteSize = newCapacity * sizeof(ElementWrapper);
-      size_t realByteSize = 0;
-
-      // Try to grow in place.
-      //
-      // Note that rallocm(ALLOCM_ZERO) will only zero newly allocated memory,
-      // even if a previous allocation allocated more than we requested.
-      // This is fine; we always use ALLOCM_ZERO with jemalloc and we
-      // always expand our allocation to the real size.
-      if (prevCapacity * sizeof(ElementWrapper) >=
-          jemallocMinInPlaceExpandable) {
-        success = (rallocm(reinterpret_cast<void**>(&threadEntry->elements),
-                           &realByteSize,
-                           newByteSize,
-                           0,
-                           ALLOCM_NO_MOVE | ALLOCM_ZERO) == ALLOCM_SUCCESS);
-      }
-
-      // In-place growth failed.
-      if (!success) {
-        // Note that, unlike calloc,allocm(... ALLOCM_ZERO) zeros all
-        // allocated bytes (*realByteSize) and not just the requested
-        // bytes (newByteSize)
-        success = (allocm(reinterpret_cast<void**>(&reallocated),
-                          &realByteSize,
-                          newByteSize,
-                          ALLOCM_ZERO) == ALLOCM_SUCCESS);
-      }
-
-      if (success) {
-        // Expand to real size
-        assert(realByteSize / sizeof(ElementWrapper) >= newCapacity);
-        newCapacity = realByteSize / sizeof(ElementWrapper);
-      } else {
-        throw std::bad_alloc();
-      }
-    } else {  // no jemalloc
-      // calloc() is simpler than malloc() followed by memset(), and
-      // potentially faster when dealing with a lot of memory, as it can get
-      // already-zeroed pages from the kernel.
-      reallocated = static_cast<ElementWrapper*>(
-          calloc(newCapacity, sizeof(ElementWrapper)));
-      if (!reallocated) {
-        throw std::bad_alloc();
-      }
-    }
+  void reserve(EntryID* id);
 
-    // Success, update the entry
-    {
-      std::lock_guard<std::mutex> g(meta.lock_);
-
-      if (prevCapacity == 0) {
-        meta.push_back(threadEntry);
-      }
-
-      if (reallocated) {
-        /*
-         * Note: we need to hold the meta lock when copying data out of
-         * the old vector, because some other thread might be
-         * destructing a ThreadLocal and writing to the elements vector
-         * of this thread.
-         */
-        memcpy(reallocated, threadEntry->elements,
-               sizeof(ElementWrapper) * prevCapacity);
-        using std::swap;
-        swap(reallocated, threadEntry->elements);
-      }
-      threadEntry->elementsCapacity = newCapacity;
-    }
-
-    free(reallocated);
+  ElementWrapper& get(EntryID* ent);
 
-#if !__APPLE__
-    if (prevCapacity == 0) {
-      pthread_setspecific(meta.pthreadKey_, &meta);
-    }
-#endif
-  }
+  static void initAtFork();
+  static void registerAtFork(
+      folly::Function<void()> prepare,
+      folly::Function<void()> parent,
+      folly::Function<void()> child);
 
-  static ElementWrapper& get(int id) {
-    ThreadEntry* threadEntry = getThreadEntry();
-    if (UNLIKELY(threadEntry->elementsCapacity <= id)) {
-      reserve(id);
-      assert(threadEntry->elementsCapacity > id);
-    }
-    return threadEntry->elements[id];
-  }
-};
+  uint32_t nextId_;
+  std::vector<uint32_t> freeIds_;
+  std::mutex lock_;
+  SharedMutex accessAllThreadsLock_;
+  pthread_key_t pthreadKey_;
+  ThreadEntry head_;
+  ThreadEntry* (*threadEntry_)();
+  bool strict_;
+};
 
-#if !__APPLE__
-template <class Tag> __thread ThreadEntry StaticMeta<Tag>::threadEntry_ = {0};
-#endif
-template <class Tag> StaticMeta<Tag>* StaticMeta<Tag>::inst_ = nullptr;
-
-} // namespace threadlocal_detail
-} // namespace folly
-
-#endif /* FOLLY_DETAIL_THREADLOCALDETAIL_H_ */
+// Held in a singleton to track our global instances.
+// We have one of these per "Tag", by default one for the whole system
+// (Tag=void).
+//
+// Creating and destroying ThreadLocalPtr objects, as well as thread exit
+// for threads that use ThreadLocalPtr objects collide on a lock inside
+// StaticMeta; you can specify multiple Tag types to break that lock.
+template <class Tag, class AccessMode>
+struct StaticMeta : StaticMetaBase {
+  StaticMeta()
+      : StaticMetaBase(
+            &StaticMeta::getThreadEntrySlow,
+            std::is_same<AccessMode, AccessModeStrict>::value) {
+    registerAtFork(
+        /*prepare*/ &StaticMeta::preFork,
+        /*parent*/ &StaticMeta::onForkParent,
+        /*child*/ &StaticMeta::onForkChild);
+  }
+
+  static StaticMeta<Tag, AccessMode>& instance() {
+    // Leak it on exit, there's only one per process and we don't have to
+    // worry about synchronization with exiting threads.
+    /* library-local */ static auto instance =
+        detail::createGlobal<StaticMeta<Tag, AccessMode>, void>();
+    return *instance;
+  }
+
+  ElementWrapper& get(EntryID* ent) {
+    ThreadEntry* threadEntry = getThreadEntry();
+    uint32_t id = ent->getOrInvalid();
+    // if id is invalid, it is equal to uint32_t's max value.
+    // x <= max value is always true
+    if (UNLIKELY(threadEntry->elementsCapacity <= id)) {
+      reserve(ent);
+      id = ent->getOrInvalid();
+      assert(threadEntry->elementsCapacity > id);
+    }
+    return threadEntry->elements[id];
+  }
+
+  static ThreadEntry* getThreadEntrySlow() {
+    auto& meta = instance();
+    auto key = meta.pthreadKey_;
+    ThreadEntry* threadEntry =
+        static_cast<ThreadEntry*>(pthread_getspecific(key));
+    if (!threadEntry) {
+#ifdef FOLLY_TLD_USE_FOLLY_TLS
+      static FOLLY_TLS ThreadEntry threadEntrySingleton;
+      threadEntry = &threadEntrySingleton;
+#else
+      threadEntry = new ThreadEntry();
+#endif
+      threadEntry->meta = &meta;
+      int ret = pthread_setspecific(key, threadEntry);
+      checkPosixError(ret, "pthread_setspecific failed");
+    }
+    return threadEntry;
+  }
+
+  inline static ThreadEntry* getThreadEntry() {
+#ifdef FOLLY_TLD_USE_FOLLY_TLS
+    static FOLLY_TLS ThreadEntry* threadEntryCache{nullptr};
+    if (UNLIKELY(threadEntryCache == nullptr)) {
+      threadEntryCache = instance().threadEntry_();
+    }
+    return threadEntryCache;
+#else
+    return instance().threadEntry_();
+#endif
+  }
+
+  static void preFork(void) {
+    instance().lock_.lock();  // Make sure it's created
+  }
+
+  static void onForkParent(void) { instance().lock_.unlock(); }
+
+  static void onForkChild(void) {
+    // only the current thread survives
+    instance().head_.next = instance().head_.prev = &instance().head_;
+    ThreadEntry* threadEntry = getThreadEntry();
+    // If this thread was in the list before the fork, add it back.
+    if (threadEntry->elementsCapacity != 0) {
+      instance().push_back(threadEntry);
+    }
+    instance().lock_.unlock();
+  }
+};
+} // namespace threadlocal_detail
+} // namespace folly
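
A minimal usage sketch of the new ElementWrapper, assuming only what the header above shows (ElementWrapper, TLPDestructionMode, and the set()/dispose() members). "Widget" and the lambda deleter are hypothetical, and in practice this type is driven by ThreadLocalPtr rather than used directly:

    // Sketch only: exercises the custom-deleter path of ElementWrapper.
    #include <cstring>

    #include <folly/detail/ThreadLocalDetail.h>

    struct Widget { int x = 0; }; // hypothetical payload type

    void elementWrapperSketch() {
      using folly::TLPDestructionMode;

      folly::threadlocal_detail::ElementWrapper w;
      std::memset(&w, 0, sizeof(w)); // the struct is POD and is expected to be zeroed first

      // set(Ptr, Deleter): the deleter is copied into a heap-allocated
      // std::function (deleter2), so ownsDeleter becomes true.
      w.set(new Widget(), [](Widget* p, TLPDestructionMode /* mode */) { delete p; });

      // dispose() runs the stored deleter, then cleanup() resets the wrapper.
      w.dispose(TLPDestructionMode::THIS_THREAD);
    }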
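
The EntryID comment above describes a double-checked allocation scheme: the id starts at uint32_t's maximum so the capacity check doubles as the "not yet allocated" check, the fast path is a relaxed atomic load, and the id is assigned exactly once under a lock. A self-contained sketch of that pattern with illustrative names (not folly's API):

    #include <stdint.h>

    #include <atomic>
    #include <limits>
    #include <mutex>

    constexpr uint32_t kInvalidId = std::numeric_limits<uint32_t>::max();

    struct IdRegistry {
      std::mutex lock;
      uint32_t nextId{0};

      uint32_t allocate(std::atomic<uint32_t>& slot) {
        std::lock_guard<std::mutex> g(lock);
        uint32_t id = slot.load(); // re-check under the lock
        if (id == kInvalidId) {
          id = nextId++;
          slot.store(id);
        }
        return id;
      }
    };

    struct EntryIdSketch {
      std::atomic<uint32_t> value{kInvalidId};

      uint32_t getOrAllocate(IdRegistry& reg) {
        uint32_t id = value.load(std::memory_order_relaxed); // fast path
        if (id != kInvalidId) {
          return id;
        }
        return reg.allocate(value); // slow path, serialized by the registry lock
      }
    };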
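
getThreadEntry() above layers a FOLLY_TLS pointer cache over the pthread_getspecific() slow path, so the common case is a single thread-local read. A simplified standalone sketch of that caching pattern, assuming GCC/Clang __thread and a pthread key created elsewhere (the names are illustrative; the real code also records ThreadEntry::meta and relies on the key's destructor for onThreadExit):

    #include <pthread.h>

    struct ThreadEntrySketch { int unused; };

    extern pthread_key_t gThreadEntryKey; // assume pthread_key_create() ran at startup

    ThreadEntrySketch* getThreadEntrySlowSketch() {
      auto* te = static_cast<ThreadEntrySketch*>(pthread_getspecific(gThreadEntryKey));
      if (te == nullptr) {
        te = new ThreadEntrySketch();
        // the destructor registered with the key reclaims this at thread exit
        pthread_setspecific(gThreadEntryKey, te);
      }
      return te;
    }

    ThreadEntrySketch* getThreadEntrySketch() {
      static __thread ThreadEntrySketch* cache = nullptr; // FOLLY_TLS in the real code
      if (cache == nullptr) {
        cache = getThreadEntrySlowSketch();
      }
      return cache;
    }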