folly/detail/CacheLocality.cpp

   1 /*
   2  * Copyright 2014 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <folly/detail/CacheLocality.h>
  18
  19 #define _GNU_SOURCE 1 // for RTLD_NOLOAD
  20 #include <dlfcn.h>
  21 #include <fstream>
  22 #include <mutex>
  23
  24 #include <folly/Conv.h>
  25 #include <folly/Exception.h>
  26 #include <folly/FileUtil.h>
  27 #include <folly/Format.h>
  28 #include <folly/ScopeGuard.h>
  29
  30 namespace folly { namespace detail {
  31
  32 ///////////// CacheLocality
  33
  34 /// Returns the best real CacheLocality information available
  35 static CacheLocality getSystemLocalityInfo() {
  36 #ifdef __linux__
  37   try {
  38     return CacheLocality::readFromSysfs();
  39   } catch (...) {
  40     // fall through to below if something goes wrong
  41   }
  42 #endif
  43
  44   long numCpus = sysconf(_SC_NPROCESSORS_CONF);
  45   if (numCpus <= 0) {
  46     // This shouldn't happen, but if it does we should try to keep
  47     // going.  We are probably not going to be able to parse /sys on
  48     // this box either (although we will try), which means we are going
  49     // to fall back to the SequentialThreadId splitter.  On my 16 core
  50     // (x hyperthreading) dev box 16 stripes is enough to get pretty good
  51     // contention avoidance with SequentialThreadId, and there is little
  52     // improvement from going from 32 to 64.  This default gives us some
  53     // wiggle room
  54     numCpus = 32;
  55   }
  56   return CacheLocality::uniform(numCpus);
  57 }
  58
  59 template <>
  60 const CacheLocality& CacheLocality::system<std::atomic>() {
  61   static CacheLocality cache(getSystemLocalityInfo());
  62   return cache;
  63 }
  64
  65 // Each level of cache has sharing sets, which are the set of cpus
  66 // that share a common cache at that level.  These are available in a
  67 // hex bitset form (/sys/devices/system/cpu/cpu0/index0/shared_cpu_map,
  68 // for example).  They are also available in a human-readable list form,
  69 // as in /sys/devices/system/cpu/cpu0/index0/shared_cpu_list.  The list
  70 // is a comma-separated list of numbers and ranges, where the ranges are
  71 // a pair of decimal numbers separated by a '-'.
  72 //
  73 // To sort the cpus for optimum locality we don't really need to parse
  74 // the sharing sets, we just need a unique representative from the
  75 // equivalence class.  The smallest value works fine, and happens to be
  76 // the first decimal number in the file.  We load all of the equivalence
  77 // class information from all of the cpu*/index* directories, order the
  78 // cpus first by increasing last-level cache equivalence class, then by
  79 // the smaller caches.  Finally, we break ties with the cpu number itself.
  80
  81 /// Returns the first decimal number in the string, or throws an exception
  82 /// if the string does not start with a number terminated by ',', '-',
  83 /// '\n', or eos.
  84 static size_t parseLeadingNumber(const std::string& line) {
  85   auto raw = line.c_str();
  86   char *end;
  87   unsigned long val = strtoul(raw, &end, 10);
  88   if (end == raw || (*end != ',' && *end != '-' && *end != '\n')) {
  89     throw std::runtime_error(to<std::string>(
  90         "error parsing list '", line, "'").c_str());
  91   }
  92   return val;
  93 }
  94
  95 CacheLocality CacheLocality::readFromSysfsTree(
  96     const std::function<std::string(std::string)>& mapping) {
  97   // number of equivalence classes per level
  98   std::vector<size_t> numCachesByLevel;
  99
 100   // the list of cache equivalence classes, where equivalance classes
 101   // are named by the smallest cpu in the class
 102   std::vector<std::vector<size_t>> equivClassesByCpu;
 103
 104   std::vector<size_t> cpus;
 105
 106   while (true) {
 107     auto cpu = cpus.size();
 108     std::vector<size_t> levels;
 109     for (size_t index = 0; ; ++index) {
 110       auto dir = format("/sys/devices/system/cpu/cpu{}/cache/index{}/",
 111                         cpu, index).str();
 112       auto cacheType = mapping(dir + "type");
 113       auto equivStr = mapping(dir + "shared_cpu_list");
 114       if (cacheType.size() == 0 || equivStr.size() == 0) {
 115         // no more caches
 116         break;
 117       }
 118       if (cacheType[0] == 'I') {
 119         // cacheType in { "Data", "Instruction", "Unified" }. skip icache
 120         continue;
 121       }
 122       auto equiv = parseLeadingNumber(equivStr);
 123       auto level = levels.size();
 124       levels.push_back(equiv);
 125
 126       if (equiv == cpu) {
 127         // we only want to count the equiv classes once, so we do it when
 128         // we first encounter them
 129         while (numCachesByLevel.size() <= level) {
 130           numCachesByLevel.push_back(0);
 131         }
 132         numCachesByLevel[level]++;
 133       }
 134     }
 135
 136     if (levels.size() == 0) {
 137       // no levels at all for this cpu, we must be done
 138       break;
 139     }
 140     equivClassesByCpu.emplace_back(std::move(levels));
 141     cpus.push_back(cpu);
 142   }
 143
 144   if (cpus.size() == 0) {
 145     throw std::runtime_error("unable to load cache sharing info");
 146   }
 147
 148   std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool {
 149     // sort first by equiv class of cache with highest index, direction
 150     // doesn't matter.  If different cpus have different numbers of
 151     // caches then this code might produce a sub-optimal ordering, but
 152     // it won't crash
 153     auto& lhsEquiv = equivClassesByCpu[lhs];
 154     auto& rhsEquiv = equivClassesByCpu[rhs];
 155     for (int i = std::min(lhsEquiv.size(), rhsEquiv.size()) - 1; i >= 0; --i) {
 156       if (lhsEquiv[i] != rhsEquiv[i]) {
 157         return lhsEquiv[i] < rhsEquiv[i];
 158       }
 159     }
 160
 161     // break ties deterministically by cpu
 162     return lhs < rhs;
 163   });
 164
 165   // the cpus are now sorted by locality, with neighboring entries closer
 166   // to each other than entries that are far away.  For striping we want
 167   // the inverse map, since we are starting with the cpu
 168   std::vector<size_t> indexes(cpus.size());
 169   for (size_t i = 0; i < cpus.size(); ++i) {
 170     indexes[cpus[i]] = i;
 171   }
 172
 173   return CacheLocality{
 174       cpus.size(), std::move(numCachesByLevel), std::move(indexes) };
 175 }
 176
 177 CacheLocality CacheLocality::readFromSysfs() {
 178   return readFromSysfsTree([](std::string name) {
 179     std::ifstream xi(name.c_str());
 180     std::string rv;
 181     std::getline(xi, rv);
 182     return rv;
 183   });
 184 }
 185
 186
 187 CacheLocality CacheLocality::uniform(size_t numCpus) {
 188   CacheLocality rv;
 189
 190   rv.numCpus = numCpus;
 191
 192   // one cache shared by all cpus
 193   rv.numCachesByLevel.push_back(numCpus);
 194
 195   // no permutations in locality index mapping
 196   for (size_t cpu = 0; cpu < numCpus; ++cpu) {
 197     rv.localityIndexByCpu.push_back(cpu);
 198   }
 199
 200   return rv;
 201 }
 202
 203 ////////////// Getcpu
 204
 205 #ifdef CLOCK_REALTIME_COARSE
 206
 207 static std::once_flag gVdsoInitOnce;
 208 static Getcpu::Func gVdsoGetcpuFunc;
 209 static int64_t (*gVdsoGettimeNsFunc)(clockid_t);
 210
 211 static int cachingVdsoGetcpu(unsigned* cpu, unsigned* unused_node,
 212                              void* unused_tcache) {
 213   static __thread unsigned tls_cpu;
 214   static __thread int64_t tls_lastContextSwitchNanos;
 215
 216   auto lastContextSwitchNanos = gVdsoGettimeNsFunc(CLOCK_REALTIME_COARSE);
 217   if (tls_lastContextSwitchNanos != lastContextSwitchNanos) {
 218     int rv = gVdsoGetcpuFunc(&tls_cpu, nullptr, nullptr);
 219     if (rv != 0) {
 220       return rv;
 221     }
 222     tls_lastContextSwitchNanos = lastContextSwitchNanos;
 223   }
 224   *cpu = tls_cpu;
 225   return 0;
 226 }
 227 #endif
 228
 229 /// Resolves the dynamically loaded symbol __vdso_getcpu and
 230 /// __vdso_clock_gettime_ns, returning a pair of nulls on failure.  Does a
 231 /// little bit of probing to make sure that the __vdso_clock_gettime_ns
 232 /// function isn't using the slow fallback path.
 233 Getcpu::Func Getcpu::vdsoFunc() {
 234 #ifdef CLOCK_REALTIME_COARSE
 235   std::call_once(gVdsoInitOnce, []{
 236     void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
 237
 238     typedef int64_t (*GettimeNsFunc)(clockid_t);
 239
 240     auto getcpuFunc = Getcpu::Func(
 241         !h ? nullptr : dlsym(h, "__vdso_getcpu"));
 242     auto gettimeNsFunc = GettimeNsFunc(
 243         !h ? nullptr : dlsym(h, "__vdso_clock_gettime_ns"));
 244
 245     bool coarseGettimeDetected = false;
 246     if (gettimeNsFunc != nullptr) {
 247       // The TLS cache of getcpu results only is an optimization if the
 248       // __vdso_clock_gettime_ns implementation is fast and actually
 249       // coarse.  The slow fallback implementation is not coarse, so if
 250       // we detect a coarse clock we are set.  If CLOCK_REALTIME_COARSE
 251       // has the right properties, then so long as there is no context
 252       // switch between two calls the returned time will be identical.
 253       // Dynamically verify this.  An unlikely context switch while we're
 254       // testing can lead to a false negative, but not a false positive,
 255       // so we just run the test multiple times.  This ensures that we
 256       // will get two calls to gettimeNsFunc in a row with no intervening
 257       // context switch.
 258       auto prev = gettimeNsFunc(CLOCK_REALTIME_COARSE);
 259       for (int i = 0; i < 10 && !coarseGettimeDetected; ++i) {
 260         auto next = gettimeNsFunc(CLOCK_REALTIME_COARSE);
 261         coarseGettimeDetected = next == prev;
 262         prev = next;
 263       }
 264     }
 265
 266     if (getcpuFunc == nullptr || !coarseGettimeDetected) {
 267       // technically a null getcpuFunc could either be a failure or
 268       // a successful lookup of a symbol with the null value, but the
 269       // second can't actually happen for this symbol.  No point holding
 270       // the handle forever if we don't need the code
 271       if (h) {
 272         dlclose(h);
 273       }
 274     } else {
 275       gVdsoGetcpuFunc = getcpuFunc;
 276       gVdsoGettimeNsFunc = gettimeNsFunc;
 277     }
 278   });
 279
 280   if (gVdsoGetcpuFunc != nullptr) {
 281     return cachingVdsoGetcpu;
 282   }
 283 #endif
 284
 285   return nullptr;
 286 }
 287
 288 /////////////// SequentialThreadId
 289
 290 template<>
 291 std::atomic<size_t> SequentialThreadId<std::atomic>::prevId(0);
 292
 293 template<>
 294 FOLLY_TLS size_t SequentialThreadId<std::atomic>::currentId(0);
 295
 296 /////////////// AccessSpreader
 297
 298 template<>
 299 const AccessSpreader<std::atomic>
 300 AccessSpreader<std::atomic>::stripeByCore(
 301     CacheLocality::system<>().numCachesByLevel.front());
 302
 303 template<>
 304 const AccessSpreader<std::atomic>
 305 AccessSpreader<std::atomic>::stripeByChip(
 306     CacheLocality::system<>().numCachesByLevel.back());
 307
 308 template<>
 309 AccessSpreaderArray<std::atomic,128>
 310 AccessSpreaderArray<std::atomic,128>::sharedInstance = {};
 311
 312 /// Always claims to be on CPU zero, node zero
 313 static int degenerateGetcpu(unsigned* cpu, unsigned* node, void* unused) {
 314   if (cpu != nullptr) {
 315     *cpu = 0;
 316   }
 317   if (node != nullptr) {
 318     *node = 0;
 319   }
 320   return 0;
 321 }
 322
 323 template<>
 324 Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t numStripes) {
 325   if (numStripes == 1) {
 326     // there's no need to call getcpu if there is only one stripe.
 327     // This should not be common, so we don't want to waste a test and
 328     // branch in the main code path, but we might as well use a faster
 329     // function pointer
 330     return &degenerateGetcpu;
 331   } else {
 332     auto best = Getcpu::vdsoFunc();
 333     return best ? best : &SequentialThreadId<std::atomic>::getcpu;
 334   }
 335 }
 336
 337 } } // namespace folly::detail