Prevent leaks in ThreadLocalPtr initialization

[folly.git] / folly / detail / CacheLocality.cpp
diff --git a/folly/detail/CacheLocality.cpp b/folly/detail/CacheLocality.cpp

index ef562fa99f584e34ee98a7021df6ef8bba696cc5..e75120d6b39c2aa64863ef46a4304461ba4a96d0 100644
--- a/folly/detail/CacheLocality.cpp
+++ b/folly/detail/CacheLocality.cpp
@@ -1,5 +1,5 @@
  /*
- * Copyright 2014 Facebook, Inc.
+ * Copyright 2016 Facebook, Inc.
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
@@ -16,10 +16,11 @@
  
  #include <folly/detail/CacheLocality.h>
  
+#ifndef _MSC_VER
  #define _GNU_SOURCE 1 // for RTLD_NOLOAD
  #include <dlfcn.h>
+#endif
  #include <fstream>
-#include <mutex>
  
  #include <folly/Conv.h>
  #include <folly/Exception.h>
@@ -27,7 +28,10 @@
  #include <folly/Format.h>
  #include <folly/ScopeGuard.h>
  
-namespace folly { namespace detail {
+DECLARE_ACCESS_SPREADER_TYPE(std::atomic)
+
+namespace folly {
+namespace detail {
  
  ///////////// CacheLocality
  
@@ -37,7 +41,7 @@ static CacheLocality getSystemLocalityInfo() {
    try {
      return CacheLocality::readFromSysfs();
    } catch (...) {
-    // fall through to below if something goes wrong
+    // keep trying
    }
  #endif
  
@@ -58,8 +62,8 @@ static CacheLocality getSystemLocalityInfo() {
  
  template <>
  const CacheLocality& CacheLocality::system<std::atomic>() {
-  static CacheLocality cache(getSystemLocalityInfo());
-  return cache;
+  static auto* cache = new CacheLocality(getSystemLocalityInfo());
+  return *cache;
  }
  
  // Each level of cache has sharing sets, which are the set of cpus
@@ -83,11 +87,11 @@ const CacheLocality& CacheLocality::system<std::atomic>() {
  /// '\n', or eos.
  static size_t parseLeadingNumber(const std::string& line) {
    auto raw = line.c_str();
-  char *end;
+  char* end;
    unsigned long val = strtoul(raw, &end, 10);
-  if (end == raw || (*end != ',' && *end != '-' && *end != '\n')) {
-    throw std::runtime_error(to<std::string>(
-        "error parsing list '", line, "'").c_str());
+  if (end == raw || (*end != ',' && *end != '-' && *end != '\n' && *end != 0)) {
+    throw std::runtime_error(
+        to<std::string>("error parsing list '", line, "'").c_str());
    }
    return val;
  }
@@ -106,9 +110,9 @@ CacheLocality CacheLocality::readFromSysfsTree(
    while (true) {
      auto cpu = cpus.size();
      std::vector<size_t> levels;
-    for (size_t index = 0; ; ++index) {
-      auto dir = format("/sys/devices/system/cpu/cpu{}/cache/index{}/",
-                        cpu, index).str();
+    for (size_t index = 0;; ++index) {
+      auto dir =
+          sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
        auto cacheType = mapping(dir + "type");
        auto equivStr = mapping(dir + "shared_cpu_list");
        if (cacheType.size() == 0 || equivStr.size() == 0) {
@@ -145,22 +149,26 @@ CacheLocality CacheLocality::readFromSysfsTree(
      throw std::runtime_error("unable to load cache sharing info");
    }
  
-  std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool {
-    // sort first by equiv class of cache with highest index, direction
-    // doesn't matter.  If different cpus have different numbers of
-    // caches then this code might produce a sub-optimal ordering, but
-    // it won't crash
-    auto& lhsEquiv = equivClassesByCpu[lhs];
-    auto& rhsEquiv = equivClassesByCpu[rhs];
-    for (int i = std::min(lhsEquiv.size(), rhsEquiv.size()) - 1; i >= 0; --i) {
-      if (lhsEquiv[i] != rhsEquiv[i]) {
-        return lhsEquiv[i] < rhsEquiv[i];
-      }
-    }
-
-    // break ties deterministically by cpu
-    return lhs < rhs;
-  });
+  std::sort(cpus.begin(),
+            cpus.end(),
+            [&](size_t lhs, size_t rhs) -> bool {
+              // sort first by equiv class of cache with highest index,
+              // direction doesn't matter.  If different cpus have
+              // different numbers of caches then this code might produce
+              // a sub-optimal ordering, but it won't crash
+              auto& lhsEquiv = equivClassesByCpu[lhs];
+              auto& rhsEquiv = equivClassesByCpu[rhs];
+              for (int i = std::min(lhsEquiv.size(), rhsEquiv.size()) - 1;
+                   i >= 0;
+                   --i) {
+                if (lhsEquiv[i] != rhsEquiv[i]) {
+                  return lhsEquiv[i] < rhsEquiv[i];
+                }
+              }
+
+              // break ties deterministically by cpu
+              return lhs < rhs;
+            });
  
    // the cpus are now sorted by locality, with neighboring entries closer
    // to each other than entries that are far away.  For striping we want
@@ -171,7 +179,7 @@ CacheLocality CacheLocality::readFromSysfsTree(
    }
  
    return CacheLocality{
-      cpus.size(), std::move(numCachesByLevel), std::move(indexes) };
+      cpus.size(), std::move(numCachesByLevel), std::move(indexes)};
  }
  
  CacheLocality CacheLocality::readFromSysfs() {
@@ -183,7 +191,6 @@ CacheLocality CacheLocality::readFromSysfs() {
    });
  }
  
-
  CacheLocality CacheLocality::uniform(size_t numCpus) {
    CacheLocality rv;
  
@@ -202,136 +209,45 @@ CacheLocality CacheLocality::uniform(size_t numCpus) {
  
  ////////////// Getcpu
  
-#ifdef CLOCK_REALTIME_COARSE
-
-static std::once_flag gVdsoInitOnce;
-static Getcpu::Func gVdsoGetcpuFunc;
-static int64_t (*gVdsoGettimeNsFunc)(clockid_t);
-
-static int cachingVdsoGetcpu(unsigned* cpu, unsigned* unused_node,
-                             void* unused_tcache) {
-  static __thread unsigned tls_cpu;
-  static __thread int64_t tls_lastContextSwitchNanos;
-
-  auto lastContextSwitchNanos = gVdsoGettimeNsFunc(CLOCK_REALTIME_COARSE);
-  if (tls_lastContextSwitchNanos != lastContextSwitchNanos) {
-    int rv = gVdsoGetcpuFunc(&tls_cpu, nullptr, nullptr);
-    if (rv != 0) {
-      return rv;
-    }
-    tls_lastContextSwitchNanos = lastContextSwitchNanos;
+Getcpu::Func Getcpu::resolveVdsoFunc() {
+#if !FOLLY_HAVE_LINUX_VDSO
+  return nullptr;
+#else
+  void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+  if (h == nullptr) {
+    return nullptr;
    }
-  *cpu = tls_cpu;
-  return 0;
-}
-#endif
-
-/// Resolves the dynamically loaded symbol __vdso_getcpu and
-/// __vdso_clock_gettime_ns, returning a pair of nulls on failure.  Does a
-/// little bit of probing to make sure that the __vdso_clock_gettime_ns
-/// function isn't using the slow fallback path.
-Getcpu::Func Getcpu::vdsoFunc() {
-#ifdef CLOCK_REALTIME_COARSE
-  std::call_once(gVdsoInitOnce, []{
-    void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
-
-    typedef int64_t (*GettimeNsFunc)(clockid_t);
-
-    auto getcpuFunc = Getcpu::Func(
-        !h ? nullptr : dlsym(h, "__vdso_getcpu"));
-    auto gettimeNsFunc = GettimeNsFunc(
-        !h ? nullptr : dlsym(h, "__vdso_clock_gettime_ns"));
-
-    bool coarseGettimeDetected = false;
-    if (gettimeNsFunc != nullptr) {
-      // The TLS cache of getcpu results only is an optimization if the
-      // __vdso_clock_gettime_ns implementation is fast and actually
-      // coarse.  The slow fallback implementation is not coarse, so if
-      // we detect a coarse clock we are set.  If CLOCK_REALTIME_COARSE
-      // has the right properties, then so long as there is no context
-      // switch between two calls the returned time will be identical.
-      // Dynamically verify this.  An unlikely context switch while we're
-      // testing can lead to a false negative, but not a false positive,
-      // so we just run the test multiple times.  This ensures that we
-      // will get two calls to gettimeNsFunc in a row with no intervening
-      // context switch.
-      auto prev = gettimeNsFunc(CLOCK_REALTIME_COARSE);
-      for (int i = 0; i < 10 && !coarseGettimeDetected; ++i) {
-        auto next = gettimeNsFunc(CLOCK_REALTIME_COARSE);
-        coarseGettimeDetected = next == prev;
-        prev = next;
-      }
-    }
-
-    if (getcpuFunc == nullptr || !coarseGettimeDetected) {
-      // technically a null getcpuFunc could either be a failure or
-      // a successful lookup of a symbol with the null value, but the
-      // second can't actually happen for this symbol.  No point holding
-      // the handle forever if we don't need the code
-      if (h) {
-        dlclose(h);
-      }
-    } else {
-      gVdsoGetcpuFunc = getcpuFunc;
-      gVdsoGettimeNsFunc = gettimeNsFunc;
-    }
-  });
  
-  if (gVdsoGetcpuFunc != nullptr) {
-    return cachingVdsoGetcpu;
+  auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
+  if (func == nullptr) {
+    // technically a null result could either be a failure or a successful
+    // lookup of a symbol with the null value, but the second can't actually
+    // happen for this symbol.  No point holding the handle forever if
+    // we don't need the code
+    dlclose(h);
    }
-#endif
  
-  return nullptr;
+  return func;
+#endif
  }
  
+#ifdef FOLLY_TLS
  /////////////// SequentialThreadId
  
-template<>
+template <>
  std::atomic<size_t> SequentialThreadId<std::atomic>::prevId(0);
  
-template<>
+template <>
  FOLLY_TLS size_t SequentialThreadId<std::atomic>::currentId(0);
+#endif
  
  /////////////// AccessSpreader
  
-template<>
-const AccessSpreader<std::atomic>
-AccessSpreader<std::atomic>::stripeByCore(
-    CacheLocality::system<>().numCachesByLevel.front());
-
-template<>
-const AccessSpreader<std::atomic>
-AccessSpreader<std::atomic>::stripeByChip(
-    CacheLocality::system<>().numCachesByLevel.back());
-
-template<>
-AccessSpreaderArray<std::atomic,128>
-AccessSpreaderArray<std::atomic,128>::sharedInstance = {};
-
-/// Always claims to be on CPU zero, node zero
-static int degenerateGetcpu(unsigned* cpu, unsigned* node, void* unused) {
-  if (cpu != nullptr) {
-    *cpu = 0;
-  }
-  if (node != nullptr) {
-    *node = 0;
-  }
-  return 0;
-}
-
-template<>
-Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t numStripes) {
-  if (numStripes == 1) {
-    // there's no need to call getcpu if there is only one stripe.
-    // This should not be common, so we don't want to waste a test and
-    // branch in the main code path, but we might as well use a faster
-    // function pointer
-    return &degenerateGetcpu;
-  } else {
-    auto best = Getcpu::vdsoFunc();
-    return best ? best : &SequentialThreadId<std::atomic>::getcpu;
-  }
+template <>
+Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc() {
+  auto best = Getcpu::resolveVdsoFunc();
+  return best ? best : &FallbackGetcpuType::getcpu;
  }
  
-} } // namespace folly::detail
+} // namespace detail
+} // namespace folly