X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=folly%2Fdetail%2FThreadLocalDetail.h;h=6769410be67e6ff3b4d6317abc5727330f488963;hb=320a9600f9cb11bbfd3f17dc99cb7b252132eb37;hp=ad53b9c9e671490260f858a915a42697901e2f6f;hpb=88ae9ac7f80c48c29b2e0ece6249973a0bef5184;p=folly.git

diff --git a/folly/detail/ThreadLocalDetail.h b/folly/detail/ThreadLocalDetail.h
index ad53b9c9..6769410b 100644
--- a/folly/detail/ThreadLocalDetail.h
+++ b/folly/detail/ThreadLocalDetail.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2013 Facebook, Inc.
+ * Copyright 2015 Facebook, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,19 +19,28 @@
 #include
 #include
-#include
+
+#include <mutex>
 #include
 #include
-#include
-#include
-#include
-
 #include
-#include "folly/Exception.h"
-#include "folly/Foreach.h"
-#include "folly/Malloc.h"
+#include <folly/Exception.h>
+#include <folly/Foreach.h>
+#include <folly/Malloc.h>
+
+// In general, emutls cleanup is not guaranteed to play nice with the way
+// StaticMeta mixes direct pthread calls and the use of __thread. This has
+// caused problems on multiple platforms so don't use __thread there.
+//
+// XXX: Ideally we would instead determine if emutls is in use at runtime as it
+// is possible to configure glibc on Linux to use emutls regardless.
+#if !__APPLE__ && !__ANDROID__
+#define FOLLY_TLD_USE_FOLLY_TLS 1
+#else
+#undef FOLLY_TLD_USE_FOLLY_TLS
+#endif
 
 namespace folly {
 namespace threadlocal_detail {
@@ -76,23 +85,31 @@ class CustomDeleter : public DeleterBase {
  * This must be POD, as we memset() it to 0 and memcpy() it around.
  */
 struct ElementWrapper {
-  void dispose(TLPDestructionMode mode) {
-    if (ptr != NULL) {
-      DCHECK(deleter != NULL);
-      deleter->dispose(ptr, mode);
-      if (ownsDeleter) {
-        delete deleter;
-      }
-      ptr = NULL;
-      deleter = NULL;
-      ownsDeleter = false;
+  bool dispose(TLPDestructionMode mode) {
+    if (ptr == nullptr) {
+      return false;
     }
+
+    DCHECK(deleter != nullptr);
+    deleter->dispose(ptr, mode);
+    cleanup();
+    return true;
+  }
+
+  void* release() {
+    auto retPtr = ptr;
+
+    if (ptr != nullptr) {
+      cleanup();
+    }
+
+    return retPtr;
   }
 
   template <class Ptr> void set(Ptr p) {
-    DCHECK(ptr == NULL);
-    DCHECK(deleter == NULL);
+    DCHECK(ptr == nullptr);
+    DCHECK(deleter == nullptr);
     if (p) {
       // We leak a single object here but that is ok.  If we used an
@@ -108,8 +125,8 @@ struct ElementWrapper {
 
   template <class Ptr, class Deleter> void set(Ptr p, Deleter d) {
-    DCHECK(ptr == NULL);
-    DCHECK(deleter == NULL);
+    DCHECK(ptr == nullptr);
+    DCHECK(deleter == nullptr);
     if (p) {
       ptr = p;
       deleter = new CustomDeleter<Deleter>(d);
@@ -117,6 +134,15 @@ struct ElementWrapper {
     }
   }
 
+  void cleanup() {
+    if (ownsDeleter) {
+      delete deleter;
+    }
+    ptr = nullptr;
+    deleter = nullptr;
+    ownsDeleter = false;
+  }
+
   void* ptr;
   DeleterBase* deleter;
   bool ownsDeleter;
@@ -152,9 +178,9 @@ struct StaticMeta {
     return *inst_;
   }
 
-  int nextId_;
-  std::vector<int> freeIds_;
-  boost::mutex lock_;
+  uint32_t nextId_;
+  std::vector<uint32_t> freeIds_;
+  std::mutex lock_;
   pthread_key_t pthreadKey_;
   ThreadEntry head_;
 
@@ -171,7 +197,9 @@ struct StaticMeta {
     t->next = t->prev = t;
   }
 
-  static __thread ThreadEntry threadEntry_;
+#ifdef FOLLY_TLD_USE_FOLLY_TLS
+  static FOLLY_TLS ThreadEntry threadEntry_;
+#endif
   static StaticMeta* inst_;
 
   StaticMeta() : nextId_(1) {
@@ -179,15 +207,39 @@
     int ret = pthread_key_create(&pthreadKey_, &onThreadExit);
     checkPosixError(ret, "pthread_key_create failed");
 
+#if FOLLY_HAVE_PTHREAD_ATFORK
     ret = pthread_atfork(/*prepare*/ &StaticMeta::preFork,
                          /*parent*/ &StaticMeta::onForkParent,
                          /*child*/ &StaticMeta::onForkChild);
     checkPosixError(ret, "pthread_atfork failed");
+#elif !__ANDROID__
+    // pthread_atfork is not part of the Android NDK at least as of n9d. If
+    // something is trying to call native fork() directly at all with Android's
+    // process management model, this is probably the least of the problems.
+    //
+    // But otherwise, this is a problem.
+    #warning pthread_atfork unavailable
+#endif
   }
 
   ~StaticMeta() {
     LOG(FATAL) << "StaticMeta lives forever!";
   }
 
+  static ThreadEntry* getThreadEntry() {
+#ifdef FOLLY_TLD_USE_FOLLY_TLS
+    return &threadEntry_;
+#else
+    ThreadEntry* threadEntry =
+        static_cast<ThreadEntry*>(pthread_getspecific(inst_->pthreadKey_));
+    if (!threadEntry) {
+      threadEntry = new ThreadEntry();
+      int ret = pthread_setspecific(inst_->pthreadKey_, threadEntry);
+      checkPosixError(ret, "pthread_setspecific failed");
+    }
+    return threadEntry;
+#endif
+  }
+
   static void preFork(void) {
     instance().lock_.lock();  // Make sure it's created
   }
@@ -197,34 +249,58 @@ struct StaticMeta {
   }
 
   static void onForkChild(void) {
+    // only the current thread survives
     inst_->head_.next = inst_->head_.prev = &inst_->head_;
-    inst_->push_back(&threadEntry_);  // only the current thread survives
+    ThreadEntry* threadEntry = getThreadEntry();
+    // If this thread was in the list before the fork, add it back.
+    if (threadEntry->elementsCapacity != 0) {
+      inst_->push_back(threadEntry);
+    }
     inst_->lock_.unlock();
   }
 
   static void onThreadExit(void* ptr) {
-    auto & meta = instance();
+    auto& meta = instance();
+#ifdef FOLLY_TLD_USE_FOLLY_TLS
+    ThreadEntry* threadEntry = getThreadEntry();
+
     DCHECK_EQ(ptr, &meta);
-    // We wouldn't call pthread_setspecific unless we actually called get()
-    DCHECK_NE(threadEntry_.elementsCapacity, 0);
+    DCHECK_GT(threadEntry->elementsCapacity, 0);
+#else
+    ThreadEntry* threadEntry = static_cast<ThreadEntry*>(ptr);
+#endif
     {
-      boost::lock_guard<boost::mutex> g(meta.lock_);
-      meta.erase(&threadEntry_);
-      // No need to hold the lock any longer; threadEntry_ is private to this
+      std::lock_guard<std::mutex> g(meta.lock_);
+      meta.erase(threadEntry);
+      // No need to hold the lock any longer; the ThreadEntry is private to this
       // thread now that it's been removed from meta.
     }
-    FOR_EACH_RANGE(i, 0, threadEntry_.elementsCapacity) {
-      threadEntry_.elements[i].dispose(TLPDestructionMode::THIS_THREAD);
+    // NOTE: User-provided deleter / object dtor itself may be using ThreadLocal
+    // with the same Tag, so dispose() calls below may (re)create some of the
+    // elements or even increase elementsCapacity, thus multiple cleanup rounds
+    // may be required.
+    for (bool shouldRun = true; shouldRun; ) {
+      shouldRun = false;
+      FOR_EACH_RANGE(i, 0, threadEntry->elementsCapacity) {
+        if (threadEntry->elements[i].dispose(TLPDestructionMode::THIS_THREAD)) {
+          shouldRun = true;
+        }
+      }
     }
-    free(threadEntry_.elements);
-    threadEntry_.elements = NULL;
-    pthread_setspecific(meta.pthreadKey_, NULL);
+    free(threadEntry->elements);
+    threadEntry->elements = nullptr;
+    pthread_setspecific(meta.pthreadKey_, nullptr);
+
+#ifndef FOLLY_TLD_USE_FOLLY_TLS
+    // Allocated in getThreadEntry() when not using folly TLS; free it
+    delete threadEntry;
+#endif
   }
 
-  static int create() {
-    int id;
+  static uint32_t create() {
+    uint32_t id;
     auto & meta = instance();
-    boost::lock_guard<boost::mutex> g(meta.lock_);
+    std::lock_guard<std::mutex> g(meta.lock_);
     if (!meta.freeIds_.empty()) {
       id = meta.freeIds_.back();
       meta.freeIds_.pop_back();
@@ -234,25 +310,31 @@ struct StaticMeta {
     return id;
   }
 
-  static void destroy(int id) {
+  static void destroy(uint32_t id) {
     try {
       auto & meta = instance();
       // Elements in other threads that use this id.
       std::vector<ElementWrapper> elements;
       {
-        boost::lock_guard<boost::mutex> g(meta.lock_);
+        std::lock_guard<std::mutex> g(meta.lock_);
         for (ThreadEntry* e = meta.head_.next; e != &meta.head_; e = e->next) {
           if (id < e->elementsCapacity && e->elements[id].ptr) {
             elements.push_back(e->elements[id]);
 
-            // Writing another thread's ThreadEntry from here is fine;
-            // the only other potential reader is the owning thread --
-            // from onThreadExit (which grabs the lock, so is properly
-            // synchronized with us) or from get() -- but using get() on a
-            // ThreadLocalPtr object that's being destroyed is a bug, so
-            // undefined behavior is fair game.
-            e->elements[id].ptr = NULL;
-            e->elements[id].deleter = NULL;
+            /*
+             * Writing another thread's ThreadEntry from here is fine;
+             * the only other potential reader is the owning thread --
+             * from onThreadExit (which grabs the lock, so is properly
+             * synchronized with us) or from get(), which also grabs
+             * the lock if it needs to resize the elements vector.
+             *
+             * We can't conflict with reads for a get(id), because
+             * it's illegal to call get on a thread local that's
+             * destructing.
+             */
+            e->elements[id].ptr = nullptr;
+            e->elements[id].deleter = nullptr;
+            e->elements[id].ownsDeleter = false;
           }
         }
         meta.freeIds_.push_back(id);
@@ -267,75 +349,113 @@ struct StaticMeta {
     }
   }
 
   /**
-   * Reserve enough space in the threadEntry_.elements for the item
+   * Reserve enough space in the ThreadEntry::elements for the item
    * @id to fit in.
    */
-  static void reserve(int id) {
-    size_t prevSize = threadEntry_.elementsCapacity;
-    size_t newSize = static_cast<size_t>((id + 5) * 1.7);
+  static void reserve(uint32_t id) {
     auto& meta = instance();
-    ElementWrapper* ptr = nullptr;
-    // Rely on jemalloc to zero the memory if possible -- maybe it knows
-    // it's already zeroed and saves us some work.
-    if (!usingJEMalloc() ||
-        prevSize < jemallocMinInPlaceExpandable ||
-        (rallocm(
-            static_cast<void**>(static_cast<void*>(&threadEntry_.elements)),
-            NULL, newSize * sizeof(ElementWrapper), 0,
-            ALLOCM_NO_MOVE | ALLOCM_ZERO) != ALLOCM_SUCCESS)) {
-      // Sigh, must realloc, but we can't call realloc here, as elements is
-      // still linked in meta, so another thread might access invalid memory
-      // after realloc succeeds. We'll copy by hand and update threadEntry_
-      // under the lock.
+    ThreadEntry* threadEntry = getThreadEntry();
+    size_t prevCapacity = threadEntry->elementsCapacity;
+    // Growth factor < 2, see folly/docs/FBVector.md; + 5 to prevent
+    // very slow start.
+    size_t newCapacity = static_cast<size_t>((id + 5) * 1.7);
+    assert(newCapacity > prevCapacity);
+    ElementWrapper* reallocated = nullptr;
+
+    // Need to grow. Note that we can't call realloc, as elements is
+    // still linked in meta, so another thread might access invalid memory
+    // after realloc succeeds. We'll copy by hand and update our ThreadEntry
+    // under the lock.
+    if (usingJEMalloc()) {
+      bool success = false;
+      size_t newByteSize = nallocx(newCapacity * sizeof(ElementWrapper), 0);
+
+      // Try to grow in place.
       //
-      // Note that we're using calloc instead of malloc in order to zero
-      // the entire region. rallocm (ALLOCM_ZERO) will only zero newly
-      // allocated memory, so if a previous allocation allocated more than
-      // we requested, it's our responsibility to guarantee that the tail
-      // is zeroed. calloc() is simpler than malloc() followed by memset(),
-      // and potentially faster when dealing with a lot of memory, as
-      // it can get already-zeroed pages from the kernel.
-      if ((ptr = static_cast<ElementWrapper*>(
-             calloc(newSize, sizeof(ElementWrapper)))) != nullptr) {
-        memcpy(ptr, threadEntry_.elements, sizeof(ElementWrapper) * prevSize);
+      // Note that xallocx(MALLOCX_ZERO) will only zero newly allocated memory,
+      // even if a previous allocation allocated more than we requested.
+      // This is fine; we always use MALLOCX_ZERO with jemalloc and we
+      // always expand our allocation to the real size.
+      if (prevCapacity * sizeof(ElementWrapper) >=
+          jemallocMinInPlaceExpandable) {
+        success = (xallocx(threadEntry->elements, newByteSize, 0, MALLOCX_ZERO)
+                   == newByteSize);
+      }
+
+      // In-place growth failed.
+      if (!success) {
+        success = ((reallocated = static_cast<ElementWrapper*>(
+                    mallocx(newByteSize, MALLOCX_ZERO))) != nullptr);
+      }
+
+      if (success) {
+        // Expand to real size
+        assert(newByteSize / sizeof(ElementWrapper) >= newCapacity);
+        newCapacity = newByteSize / sizeof(ElementWrapper);
       } else {
         throw std::bad_alloc();
       }
+    } else {  // no jemalloc
+      // calloc() is simpler than malloc() followed by memset(), and
+      // potentially faster when dealing with a lot of memory, as it can get
+      // already-zeroed pages from the kernel.
+      reallocated = static_cast<ElementWrapper*>(
+          calloc(newCapacity, sizeof(ElementWrapper)));
+      if (!reallocated) {
+        throw std::bad_alloc();
+      }
     }
 
     // Success, update the entry
     {
-      boost::lock_guard<boost::mutex> g(meta.lock_);
-      if (prevSize == 0) {
-        meta.push_back(&threadEntry_);
+      std::lock_guard<std::mutex> g(meta.lock_);
+
+      if (prevCapacity == 0) {
+        meta.push_back(threadEntry);
       }
-      if (ptr) {
+
+      if (reallocated) {
+        /*
+         * Note: we need to hold the meta lock when copying data out of
+         * the old vector, because some other thread might be
+         * destructing a ThreadLocal and writing to the elements vector
+         * of this thread.
+ */ + memcpy(reallocated, threadEntry->elements, + sizeof(ElementWrapper) * prevCapacity); using std::swap; - swap(ptr, threadEntry_.elements); + swap(reallocated, threadEntry->elements); } - threadEntry_.elementsCapacity = newSize; + threadEntry->elementsCapacity = newCapacity; } - free(ptr); + free(reallocated); - if (prevSize == 0) { +#ifdef FOLLY_TLD_USE_FOLLY_TLS + if (prevCapacity == 0) { pthread_setspecific(meta.pthreadKey_, &meta); } +#endif } - static ElementWrapper& get(int id) { - if (UNLIKELY(threadEntry_.elementsCapacity <= id)) { + static ElementWrapper& get(uint32_t id) { + ThreadEntry* threadEntry = getThreadEntry(); + if (UNLIKELY(threadEntry->elementsCapacity <= id)) { reserve(id); + assert(threadEntry->elementsCapacity > id); } - return threadEntry_.elements[id]; + return threadEntry->elements[id]; } }; -template __thread ThreadEntry StaticMeta::threadEntry_ = {0}; +#ifdef FOLLY_TLD_USE_FOLLY_TLS +template +FOLLY_TLS ThreadEntry StaticMeta::threadEntry_{nullptr, 0, + nullptr, nullptr}; +#endif template StaticMeta* StaticMeta::inst_ = nullptr; } // namespace threadlocal_detail } // namespace folly #endif /* FOLLY_DETAIL_THREADLOCALDETAIL_H_ */ -