/*
- * Copyright 2014 Facebook, Inc.
+ * Copyright 2016 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-#ifndef FOLLY_DETAIL_THREADLOCALDETAIL_H_
-#define FOLLY_DETAIL_THREADLOCALDETAIL_H_
+#pragma once
#include <limits.h>
#include <pthread.h>
+#include <atomic>
+#include <functional>
#include <mutex>
#include <string>
#include <vector>
#include <glog/logging.h>
-#include <folly/Foreach.h>
#include <folly/Exception.h>
+#include <folly/Foreach.h>
+#include <folly/Function.h>
#include <folly/Malloc.h>
+#include <folly/MicroSpinLock.h>
+#include <folly/Portability.h>
+#include <folly/ScopeGuard.h>
+#include <folly/SharedMutex.h>
-namespace folly {
-namespace threadlocal_detail {
+#include <folly/detail/StaticSingletonManager.h>
-/**
- * Base class for deleters.
- */
-class DeleterBase {
- public:
- virtual ~DeleterBase() { }
- virtual void dispose(void* ptr, TLPDestructionMode mode) const = 0;
-};
-
-/**
- * Simple deleter class that calls delete on the passed-in pointer.
- */
-template <class Ptr>
-class SimpleDeleter : public DeleterBase {
- public:
- virtual void dispose(void* ptr, TLPDestructionMode mode) const {
- delete static_cast<Ptr>(ptr);
- }
-};
-
-/**
- * Custom deleter that calls a given callable.
- */
-template <class Ptr, class Deleter>
-class CustomDeleter : public DeleterBase {
- public:
- explicit CustomDeleter(Deleter d) : deleter_(d) { }
- virtual void dispose(void* ptr, TLPDestructionMode mode) const {
- deleter_(static_cast<Ptr>(ptr), mode);
- }
- private:
- Deleter deleter_;
-};
+// In general, emutls cleanup is not guaranteed to play nice with the way
+// StaticMeta mixes direct pthread calls and the use of __thread. This has
+// caused problems on multiple platforms, so don't use __thread there.
+//
+// XXX: Ideally we would instead determine if emutls is in use at runtime as it
+// is possible to configure glibc on Linux to use emutls regardless.
+#if !FOLLY_MOBILE && !defined(__APPLE__) && !defined(_MSC_VER)
+#define FOLLY_TLD_USE_FOLLY_TLS 1
+#else
+#undef FOLLY_TLD_USE_FOLLY_TLS
+#endif
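+// Illustrative note: with FOLLY_TLD_USE_FOLLY_TLS defined, getThreadEntry()
+// below caches the ThreadEntry* in a FOLLY_TLS (i.e. __thread) variable;
+// without it, every access falls back to pthread_getspecific().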
+namespace folly {
+namespace threadlocal_detail {
/**
* POD wrapper around an element (a void*) and an associated deleter.
* This must be POD, as we memset() it to 0 and memcpy() it around.
*/
struct ElementWrapper {
- void dispose(TLPDestructionMode mode) {
- if (ptr != nullptr) {
- DCHECK(deleter != nullptr);
- deleter->dispose(ptr, mode);
+ using DeleterFunType = void(void*, TLPDestructionMode);
- cleanup();
+ bool dispose(TLPDestructionMode mode) {
+ if (ptr == nullptr) {
+ return false;
}
+
+ DCHECK(deleter1 != nullptr);
+ ownsDeleter ? (*deleter2)(ptr, mode) : (*deleter1)(ptr, mode);
+ cleanup();
+ return true;
}
void* release() {
auto retPtr = ptr;
if (ptr != nullptr) {
cleanup();
}
return retPtr;
}
template <class Ptr>
void set(Ptr p) {
+ auto guard = makeGuard([&] { delete p; });
DCHECK(ptr == nullptr);
- DCHECK(deleter == nullptr);
+ DCHECK(deleter1 == nullptr);
if (p) {
- // We leak a single object here but that is ok. If we used an
- // object directly, there is a chance that the destructor will be
- // called on that static object before any of the ElementWrappers
- // are disposed and that isn't so nice.
- static auto d = new SimpleDeleter<Ptr>();
ptr = p;
- deleter = d;
+ deleter1 = [](void* pt, TLPDestructionMode) {
+ delete static_cast<Ptr>(pt);
+ };
ownsDeleter = false;
+ guard.dismiss();
}
}
template <class Ptr, class Deleter>
- void set(Ptr p, Deleter d) {
+ void set(Ptr p, const Deleter& d) {
+ auto guard = makeGuard([&] {
+ if (p) {
+ d(p, TLPDestructionMode::THIS_THREAD);
+ }
+ });
DCHECK(ptr == nullptr);
- DCHECK(deleter == nullptr);
+ DCHECK(deleter2 == nullptr);
if (p) {
ptr = p;
- deleter = new CustomDeleter<Ptr,Deleter>(d);
+ auto d2 = d; // gcc-4.8 doesn't decay types correctly in lambda captures
+ deleter2 = new std::function<DeleterFunType>(
+ [d2](void* pt, TLPDestructionMode mode) {
+ d2(static_cast<Ptr>(pt), mode);
+ });
ownsDeleter = true;
+ guard.dismiss();
}
}
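+
+ // Note on the guards above: they make both set() overloads exception-safe.
+ // If anything throws before ownership is recorded (e.g. the heap allocation
+ // of the std::function for deleter2), the guard still destroys p rather
+ // than leaking it; guard.dismiss() cancels that once ownership is taken.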
void cleanup() {
if (ownsDeleter) {
- delete deleter;
+ delete deleter2;
}
ptr = nullptr;
- deleter = nullptr;
+ deleter1 = nullptr;
ownsDeleter = false;
}
void* ptr;
- DeleterBase* deleter;
+ union {
+ DeleterFunType* deleter1;
+ std::function<DeleterFunType>* deleter2;
+ };
bool ownsDeleter;
};
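+// Usage sketch (illustrative only): a zero-filled ElementWrapper is the empty
+// state; set() installs a pointer plus deleter, and dispose() tears it down:
+//
+//   ElementWrapper w;
+//   memset(&w, 0, sizeof(w));                    // POD: zero-fill is valid
+//   w.set(new std::string("hi"));                // plain delete via deleter1
+//   w.dispose(TLPDestructionMode::THIS_THREAD);  // frees the string, resets w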
+struct StaticMetaBase;
+
/**
* Per-thread entry. Each thread using a StaticMeta object has one.
* This is written from the owning thread only (under the lock), read
* from the owning thread (no lock necessary), and read from other threads
* (under the lock).
*/
struct ThreadEntry {
- ElementWrapper* elements;
- size_t elementsCapacity;
- ThreadEntry* next;
- ThreadEntry* prev;
+ ElementWrapper* elements{nullptr};
+ size_t elementsCapacity{0};
+ ThreadEntry* next{nullptr};
+ ThreadEntry* prev{nullptr};
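+ // Backpointer to the owning meta; set in getThreadEntrySlow() below,
+ // presumably so the out-of-line onThreadExit() can find its StaticMetaBase.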
+ StaticMetaBase* meta{nullptr};
};
-// Held in a singleton to track our global instances.
-// We have one of these per "Tag", by default one for the whole system
-// (Tag=void).
-//
-// Creating and destroying ThreadLocalPtr objects, as well as thread exit
-// for threads that use ThreadLocalPtr objects collide on a lock inside
-// StaticMeta; you can specify multiple Tag types to break that lock.
-template <class Tag>
-struct StaticMeta {
- static StaticMeta<Tag>& instance() {
- // Leak it on exit, there's only one per process and we don't have to
- // worry about synchronization with exiting threads.
- static bool constructed = (inst_ = new StaticMeta<Tag>());
- (void)constructed; // suppress unused warning
- return *inst_;
- }
+constexpr uint32_t kEntryIDInvalid = std::numeric_limits<uint32_t>::max();
- int nextId_;
- std::vector<int> freeIds_;
- std::mutex lock_;
- pthread_key_t pthreadKey_;
- ThreadEntry head_;
+struct PthreadKeyUnregisterTester;
- void push_back(ThreadEntry* t) {
- t->next = &head_;
- t->prev = head_.prev;
- head_.prev->next = t;
- head_.prev = t;
+/**
+ * We want to disable the onThreadExit call at the end of shutdown; we don't
+ * care about leaking memory at that point.
+ *
+ * Otherwise if ThreadLocal is used in a shared library, onThreadExit may be
+ * called after dlclose().
+ *
+ * This class has a single static instance; however, since it's so widely used,
+ * directly or indirectly, by so many classes, we need to take care to avoid
+ * problems stemming from the Static Initialization/Destruction Order Fiascos.
+ * Therefore this class needs to be constexpr-constructible, so as to avoid
+ * the need for this to participate in init/destruction order.
+ */
+class PthreadKeyUnregister {
+ public:
+ static constexpr size_t kMaxKeys = 1UL << 16;
+
+ ~PthreadKeyUnregister() {
+ // If static constructor priorities are not supported then
+ // ~PthreadKeyUnregister logic is not safe.
+#if !defined(__APPLE__) && !defined(_MSC_VER)
+ MSLGuard lg(lock_);
+ while (size_) {
+ pthread_key_delete(keys_[--size_]);
+ }
+#endif
}
- void erase(ThreadEntry* t) {
- t->next->prev = t->prev;
- t->prev->next = t->next;
- t->next = t->prev = t;
+ static void registerKey(pthread_key_t key) {
+ instance_.registerKeyImpl(key);
}
-#if !__APPLE__
- static FOLLY_TLS ThreadEntry threadEntry_;
-#endif
- static StaticMeta<Tag>* inst_;
-
- StaticMeta() : nextId_(1) {
- head_.next = head_.prev = &head_;
- int ret = pthread_key_create(&pthreadKey_, &onThreadExit);
- checkPosixError(ret, "pthread_key_create failed");
-
- ret = pthread_atfork(/*prepare*/ &StaticMeta::preFork,
- /*parent*/ &StaticMeta::onForkParent,
- /*child*/ &StaticMeta::onForkChild);
- checkPosixError(ret, "pthread_atfork failed");
- }
- ~StaticMeta() {
- LOG(FATAL) << "StaticMeta lives forever!";
- }
+ private:
+ /**
+ * Only one global instance should exist, hence this is private.
+ * See also the important note at the top of this class about `constexpr`
+ * usage.
+ */
+ constexpr PthreadKeyUnregister() : lock_(), size_(0), keys_() { }
+ friend struct folly::threadlocal_detail::PthreadKeyUnregisterTester;
- static ThreadEntry* getThreadEntry() {
-#if !__APPLE__
- return &threadEntry_;
-#else
- ThreadEntry* threadEntry =
- static_cast<ThreadEntry*>(pthread_getspecific(inst_->pthreadKey_));
- if (!threadEntry) {
- threadEntry = new ThreadEntry();
- int ret = pthread_setspecific(inst_->pthreadKey_, threadEntry);
- checkPosixError(ret, "pthread_setspecific failed");
+ void registerKeyImpl(pthread_key_t key) {
+ MSLGuard lg(lock_);
+ if (size_ == kMaxKeys) {
+ throw std::logic_error("pthread_key limit has already been reached");
}
- return threadEntry;
-#endif
+ keys_[size_++] = key;
}
- static void preFork(void) {
- instance().lock_.lock(); // Make sure it's created
- }
+ MicroSpinLock lock_;
+ size_t size_;
+ pthread_key_t keys_[kMaxKeys];
- static void onForkParent(void) {
- inst_->lock_.unlock();
- }
+ static PthreadKeyUnregister instance_;
+};
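+
+// Expected pairing (sketch; the StaticMetaBase constructor is defined out of
+// line): every pthread_key_create() should be followed by a registration,
+//
+//   int ret = pthread_key_create(&pthreadKey_, &onThreadExit);
+//   checkPosixError(ret, "pthread_key_create failed");
+//   PthreadKeyUnregister::registerKey(pthreadKey_);
+//
+// so the single static instance_ can delete all live keys at shutdown.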
- static void onForkChild(void) {
- // only the current thread survives
- inst_->head_.next = inst_->head_.prev = &inst_->head_;
- ThreadEntry* threadEntry = getThreadEntry();
- // If this thread was in the list before the fork, add it back.
- if (threadEntry->elementsCapacity != 0) {
- inst_->push_back(threadEntry);
- }
- inst_->lock_.unlock();
- }
+struct StaticMetaBase {
+ // Represents an ID of a thread local object. Initially set to the maximum
+ // uint32_t. This representation lets us avoid a branch when accessing TLS
+ // data: while id is still maxint, any test of the form `capacity > id`
+ // always fails, so unallocated IDs take the slow path. It also lets us keep
+ // a constexpr constructor and avoid SIOF.
+ class EntryID {
+ public:
+ std::atomic<uint32_t> value;
- static void onThreadExit(void* ptr) {
- auto & meta = instance();
-#if !__APPLE__
- ThreadEntry* threadEntry = getThreadEntry();
+ constexpr EntryID() : value(kEntryIDInvalid) {
+ }
- DCHECK_EQ(ptr, &meta);
- DCHECK_GT(threadEntry->elementsCapacity, 0);
-#else
- ThreadEntry* threadEntry = static_cast<ThreadEntry*>(ptr);
-#endif
- {
- std::lock_guard<std::mutex> g(meta.lock_);
- meta.erase(threadEntry);
- // No need to hold the lock any longer; the ThreadEntry is private to this
- // thread now that it's been removed from meta.
+ EntryID(EntryID&& other) noexcept : value(other.value.load()) {
+ other.value = kEntryIDInvalid;
}
- FOR_EACH_RANGE(i, 0, threadEntry->elementsCapacity) {
- threadEntry->elements[i].dispose(TLPDestructionMode::THIS_THREAD);
+
+ EntryID& operator=(EntryID&& other) {
+ assert(this != &other);
+ value = other.value.load();
+ other.value = kEntryIDInvalid;
+ return *this;
}
- free(threadEntry->elements);
- threadEntry->elements = nullptr;
- pthread_setspecific(meta.pthreadKey_, nullptr);
-#if __APPLE__
- // Allocated in getThreadEntry(); free it
- delete threadEntry;
-#endif
- }
+ EntryID(const EntryID& other) = delete;
+ EntryID& operator=(const EntryID& other) = delete;
- static int create() {
- int id;
- auto & meta = instance();
- std::lock_guard<std::mutex> g(meta.lock_);
- if (!meta.freeIds_.empty()) {
- id = meta.freeIds_.back();
- meta.freeIds_.pop_back();
- } else {
- id = meta.nextId_++;
+ uint32_t getOrInvalid() {
+ // It's OK for this to be relaxed, even though we're effectively doing
+ // double checked locking in using this value. We only care about the
+ // uniqueness of IDs; getOrAllocate does not modify any other memory
+ // this thread will use.
+ return value.load(std::memory_order_relaxed);
}
- return id;
- }
- static void destroy(size_t id) {
- try {
- auto & meta = instance();
- // Elements in other threads that use this id.
- std::vector<ElementWrapper> elements;
- {
- std::lock_guard<std::mutex> g(meta.lock_);
- for (ThreadEntry* e = meta.head_.next; e != &meta.head_; e = e->next) {
- if (id < e->elementsCapacity && e->elements[id].ptr) {
- elements.push_back(e->elements[id]);
-
- /*
- * Writing another thread's ThreadEntry from here is fine;
- * the only other potential reader is the owning thread --
- * from onThreadExit (which grabs the lock, so is properly
- * synchronized with us) or from get(), which also grabs
- * the lock if it needs to resize the elements vector.
- *
- * We can't conflict with reads for a get(id), because
- * it's illegal to call get on a thread local that's
- * destructing.
- */
- e->elements[id].ptr = nullptr;
- e->elements[id].deleter = nullptr;
- e->elements[id].ownsDeleter = false;
- }
- }
- meta.freeIds_.push_back(id);
- }
- // Delete elements outside the lock
- FOR_EACH(it, elements) {
- it->dispose(TLPDestructionMode::ALL_THREADS);
+ uint32_t getOrAllocate(StaticMetaBase& meta) {
+ uint32_t id = getOrInvalid();
+ if (id != kEntryIDInvalid) {
+ return id;
}
- } catch (...) { // Just in case we get a lock error or something anyway...
- LOG(WARNING) << "Destructor discarding an exception that was thrown.";
+ // The lock inside allocate ensures that a single value is allocated
+ return meta.allocate(this);
}
+ };
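+
+ // Illustration of the branchless-invalid trick: while value is still
+ // kEntryIDInvalid (2^32 - 1), any check of the form
+ // `elementsCapacity <= id` is necessarily true, so get() below funnels
+ // unallocated IDs into the reserve() slow path with no extra validity test.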
+
+ StaticMetaBase(ThreadEntry* (*threadEntry)(), bool strict);
+
+ [[noreturn]] ~StaticMetaBase() {
+ LOG(FATAL) << "StaticMeta lives forever!";
+ }
+
+ void push_back(ThreadEntry* t) {
+ t->next = &head_;
+ t->prev = head_.prev;
+ head_.prev->next = t;
+ head_.prev = t;
}
+ void erase(ThreadEntry* t) {
+ t->next->prev = t->prev;
+ t->prev->next = t->next;
+ t->next = t->prev = t;
+ }
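+
+ // head_ is the sentinel of a circular doubly-linked list of ThreadEntry;
+ // an empty list is head_.next == head_.prev == &head_ (the state that
+ // onForkChild() below re-establishes), so push_back/erase never see null.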
+
+ static void onThreadExit(void* ptr);
+
+ uint32_t allocate(EntryID* ent);
+
+ void destroy(EntryID* ent);
+
/**
* Reserve enough space in the ThreadEntry::elements for the item
* @id to fit in.
*/
- static void reserve(int id) {
- auto& meta = instance();
- ThreadEntry* threadEntry = getThreadEntry();
- size_t prevCapacity = threadEntry->elementsCapacity;
- // Growth factor < 2, see folly/docs/FBVector.md; + 5 to prevent
- // very slow start.
- size_t newCapacity = static_cast<size_t>((id + 5) * 1.7);
- assert(newCapacity > prevCapacity);
- ElementWrapper* reallocated = nullptr;
-
- // Need to grow. Note that we can't call realloc, as elements is
- // still linked in meta, so another thread might access invalid memory
- // after realloc succeeds. We'll copy by hand and update our ThreadEntry
- // under the lock.
- if (usingJEMalloc()) {
- bool success = false;
- size_t newByteSize = newCapacity * sizeof(ElementWrapper);
- size_t realByteSize = 0;
-
- // Try to grow in place.
- //
- // Note that rallocm(ALLOCM_ZERO) will only zero newly allocated memory,
- // even if a previous allocation allocated more than we requested.
- // This is fine; we always use ALLOCM_ZERO with jemalloc and we
- // always expand our allocation to the real size.
- if (prevCapacity * sizeof(ElementWrapper) >=
- jemallocMinInPlaceExpandable) {
- success = (rallocm(reinterpret_cast<void**>(&threadEntry->elements),
- &realByteSize,
- newByteSize,
- 0,
- ALLOCM_NO_MOVE | ALLOCM_ZERO) == ALLOCM_SUCCESS);
+ void reserve(EntryID* id);
- }
+ ElementWrapper& get(EntryID* ent);
- // In-place growth failed.
- if (!success) {
- // Note that, unlike calloc,allocm(... ALLOCM_ZERO) zeros all
- // allocated bytes (*realByteSize) and not just the requested
- // bytes (newByteSize)
- success = (allocm(reinterpret_cast<void**>(&reallocated),
- &realByteSize,
- newByteSize,
- ALLOCM_ZERO) == ALLOCM_SUCCESS);
- }
+ static void initAtFork();
+ static void registerAtFork(
+ folly::Function<void()> prepare,
+ folly::Function<void()> parent,
+ folly::Function<void()> child);
- if (success) {
- // Expand to real size
- assert(realByteSize / sizeof(ElementWrapper) >= newCapacity);
- newCapacity = realByteSize / sizeof(ElementWrapper);
- } else {
- throw std::bad_alloc();
- }
- } else { // no jemalloc
- // calloc() is simpler than malloc() followed by memset(), and
- // potentially faster when dealing with a lot of memory, as it can get
- // already-zeroed pages from the kernel.
- reallocated = static_cast<ElementWrapper*>(
- calloc(newCapacity, sizeof(ElementWrapper)));
- if (!reallocated) {
- throw std::bad_alloc();
- }
- }
+ uint32_t nextId_;
+ std::vector<uint32_t> freeIds_;
+ std::mutex lock_;
+ SharedMutex accessAllThreadsLock_;
+ pthread_key_t pthreadKey_;
+ ThreadEntry head_;
+ ThreadEntry* (*threadEntry_)();
+ bool strict_;
+};
- // Success, update the entry
- {
- std::lock_guard<std::mutex> g(meta.lock_);
+// Held in a singleton to track our global instances.
+// We have one of these per "Tag", by default one for the whole system
+// (Tag=void).
+//
+// Creating and destroying ThreadLocalPtr objects, as well as thread exit
+// for threads that use ThreadLocalPtr objects collide on a lock inside
+// StaticMeta; you can specify multiple Tag types to break that lock.
+template <class Tag, class AccessMode>
+struct StaticMeta : StaticMetaBase {
+ StaticMeta()
+ : StaticMetaBase(
+ &StaticMeta::getThreadEntrySlow,
+ std::is_same<AccessMode, AccessModeStrict>::value) {
+ registerAtFork(
+ /*prepare*/ &StaticMeta::preFork,
+ /*parent*/ &StaticMeta::onForkParent,
+ /*child*/ &StaticMeta::onForkChild);
+ }
- if (prevCapacity == 0) {
- meta.push_back(threadEntry);
- }
+ static StaticMeta<Tag, AccessMode>& instance() {
+ // Leak it on exit; there's only one per process, and we don't have to
+ // worry about synchronization with exiting threads.
+ /* library-local */ static auto instance =
+ detail::createGlobal<StaticMeta<Tag, AccessMode>, void>();
+ return *instance;
+ }
- if (reallocated) {
- /*
- * Note: we need to hold the meta lock when copying data out of
- * the old vector, because some other thread might be
- * destructing a ThreadLocal and writing to the elements vector
- * of this thread.
- */
- memcpy(reallocated, threadEntry->elements,
- sizeof(ElementWrapper) * prevCapacity);
- using std::swap;
- swap(reallocated, threadEntry->elements);
- }
- threadEntry->elementsCapacity = newCapacity;
+ ElementWrapper& get(EntryID* ent) {
+ ThreadEntry* threadEntry = getThreadEntry();
+ uint32_t id = ent->getOrInvalid();
+ // If id is invalid, it equals uint32_t's max value, so the test
+ // `elementsCapacity <= id` below is always true and we take the slow path.
+ if (UNLIKELY(threadEntry->elementsCapacity <= id)) {
+ reserve(ent);
+ id = ent->getOrInvalid();
+ assert(threadEntry->elementsCapacity > id);
}
+ return threadEntry->elements[id];
+ }
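+
+ // reserve(ent) presumably allocates the ID itself when it is still invalid
+ // (via EntryID::getOrAllocate), which is why id is re-read with
+ // getOrInvalid() after the call rather than reused.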
- free(reallocated);
+ static ThreadEntry* getThreadEntrySlow() {
+ auto& meta = instance();
+ auto key = meta.pthreadKey_;
+ ThreadEntry* threadEntry =
+ static_cast<ThreadEntry*>(pthread_getspecific(key));
+ if (!threadEntry) {
+#ifdef FOLLY_TLD_USE_FOLLY_TLS
+ static FOLLY_TLS ThreadEntry threadEntrySingleton;
+ threadEntry = &threadEntrySingleton;
+#else
+ threadEntry = new ThreadEntry();
+#endif
+ threadEntry->meta = &meta;
+ int ret = pthread_setspecific(key, threadEntry);
+ checkPosixError(ret, "pthread_setspecific failed");
+ }
+ return threadEntry;
+ }
-#if !__APPLE__
- if (prevCapacity == 0) {
- pthread_setspecific(meta.pthreadKey_, &meta);
+ inline static ThreadEntry* getThreadEntry() {
+#ifdef FOLLY_TLD_USE_FOLLY_TLS
+ static FOLLY_TLS ThreadEntry* threadEntryCache{nullptr};
+ if (UNLIKELY(threadEntryCache == nullptr)) {
+ threadEntryCache = instance().threadEntry_();
}
+ return threadEntryCache;
+#else
+ return instance().threadEntry_();
#endif
}
- static ElementWrapper& get(size_t id) {
+ static void preFork(void) {
+ instance().lock_.lock(); // Make sure it's created
+ }
+
+ static void onForkParent(void) { instance().lock_.unlock(); }
+
+ static void onForkChild(void) {
+ // only the current thread survives
+ instance().head_.next = instance().head_.prev = &instance().head_;
ThreadEntry* threadEntry = getThreadEntry();
- if (UNLIKELY(threadEntry->elementsCapacity <= id)) {
- reserve(id);
- assert(threadEntry->elementsCapacity > id);
+ // If this thread was in the list before the fork, add it back.
+ if (threadEntry->elementsCapacity != 0) {
+ instance().push_back(threadEntry);
}
- return threadEntry->elements[id];
+ instance().lock_.unlock();
}
};
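+// Tag usage sketch (illustrative; MyLibTag is a made-up example): each
+// distinct Tag gets its own leaked singleton, so
+//
+//   struct MyLibTag {};
+//   auto& meta = StaticMeta<MyLibTag, void>::instance();
+//
+// contends on a different lock_ than the default StaticMeta<void, void>.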
-#if !__APPLE__
-template <class Tag>
-FOLLY_TLS ThreadEntry StaticMeta<Tag>::threadEntry_{nullptr, 0,
- nullptr, nullptr};
-#endif
-template <class Tag> StaticMeta<Tag>* StaticMeta<Tag>::inst_ = nullptr;
-
} // namespace threadlocal_detail
} // namespace folly
-
-#endif /* FOLLY_DETAIL_THREADLOCALDETAIL_H_ */