folly/detail/CacheLocality.cpp

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <folly/detail/CacheLocality.h>
  18
  19 #ifndef _MSC_VER
  20 #define _GNU_SOURCE 1 // for RTLD_NOLOAD
  21 #include <dlfcn.h>
  22 #endif
  23 #include <fstream>
  24
  25 #include <folly/Conv.h>
  26 #include <folly/Exception.h>
  27 #include <folly/FileUtil.h>
  28 #include <folly/Format.h>
  29 #include <folly/ScopeGuard.h>
  30
  31 DECLARE_ACCESS_SPREADER_TYPE(std::atomic)
  32
  33 namespace folly {
  34 namespace detail {
  35
  36 ///////////// CacheLocality
  37
  38 /// Returns the best real CacheLocality information available
  39 static CacheLocality getSystemLocalityInfo() {
  40 #ifdef __linux__
  41   try {
  42     return CacheLocality::readFromSysfs();
  43   } catch (...) {
  44     // keep trying
  45   }
  46 #endif
  47
  48   long numCpus = sysconf(_SC_NPROCESSORS_CONF);
  49   if (numCpus <= 0) {
  50     // This shouldn't happen, but if it does we should try to keep
  51     // going.  We are probably not going to be able to parse /sys on
  52     // this box either (although we will try), which means we are going
  53     // to fall back to the SequentialThreadId splitter.  On my 16 core
  54     // (x hyperthreading) dev box 16 stripes is enough to get pretty good
  55     // contention avoidance with SequentialThreadId, and there is little
  56     // improvement from going from 32 to 64.  This default gives us some
  57     // wiggle room
  58     numCpus = 32;
  59   }
  60   return CacheLocality::uniform(numCpus);
  61 }
  62
  63 template <>
  64 const CacheLocality& CacheLocality::system<std::atomic>() {
  65   static auto* cache = new CacheLocality(getSystemLocalityInfo());
  66   return *cache;
  67 }
  68
  69 // Each level of cache has sharing sets, which are the set of cpus
  70 // that share a common cache at that level.  These are available in a
  71 // hex bitset form (/sys/devices/system/cpu/cpu0/index0/shared_cpu_map,
  72 // for example).  They are also available in a human-readable list form,
  73 // as in /sys/devices/system/cpu/cpu0/index0/shared_cpu_list.  The list
  74 // is a comma-separated list of numbers and ranges, where the ranges are
  75 // a pair of decimal numbers separated by a '-'.
  76 //
  77 // To sort the cpus for optimum locality we don't really need to parse
  78 // the sharing sets, we just need a unique representative from the
  79 // equivalence class.  The smallest value works fine, and happens to be
  80 // the first decimal number in the file.  We load all of the equivalence
  81 // class information from all of the cpu*/index* directories, order the
  82 // cpus first by increasing last-level cache equivalence class, then by
  83 // the smaller caches.  Finally, we break ties with the cpu number itself.
  84
  85 /// Returns the first decimal number in the string, or throws an exception
  86 /// if the string does not start with a number terminated by ',', '-',
  87 /// '\n', or eos.
  88 static size_t parseLeadingNumber(const std::string& line) {
  89   auto raw = line.c_str();
  90   char* end;
  91   unsigned long val = strtoul(raw, &end, 10);
  92   if (end == raw || (*end != ',' && *end != '-' && *end != '\n' && *end != 0)) {
  93     throw std::runtime_error(
  94         to<std::string>("error parsing list '", line, "'").c_str());
  95   }
  96   return val;
  97 }
  98
  99 CacheLocality CacheLocality::readFromSysfsTree(
 100     const std::function<std::string(std::string)>& mapping) {
 101   // number of equivalence classes per level
 102   std::vector<size_t> numCachesByLevel;
 103
 104   // the list of cache equivalence classes, where equivalance classes
 105   // are named by the smallest cpu in the class
 106   std::vector<std::vector<size_t>> equivClassesByCpu;
 107
 108   std::vector<size_t> cpus;
 109
 110   while (true) {
 111     auto cpu = cpus.size();
 112     std::vector<size_t> levels;
 113     for (size_t index = 0;; ++index) {
 114       auto dir =
 115           sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
 116       auto cacheType = mapping(dir + "type");
 117       auto equivStr = mapping(dir + "shared_cpu_list");
 118       if (cacheType.size() == 0 || equivStr.size() == 0) {
 119         // no more caches
 120         break;
 121       }
 122       if (cacheType[0] == 'I') {
 123         // cacheType in { "Data", "Instruction", "Unified" }. skip icache
 124         continue;
 125       }
 126       auto equiv = parseLeadingNumber(equivStr);
 127       auto level = levels.size();
 128       levels.push_back(equiv);
 129
 130       if (equiv == cpu) {
 131         // we only want to count the equiv classes once, so we do it when
 132         // we first encounter them
 133         while (numCachesByLevel.size() <= level) {
 134           numCachesByLevel.push_back(0);
 135         }
 136         numCachesByLevel[level]++;
 137       }
 138     }
 139
 140     if (levels.size() == 0) {
 141       // no levels at all for this cpu, we must be done
 142       break;
 143     }
 144     equivClassesByCpu.emplace_back(std::move(levels));
 145     cpus.push_back(cpu);
 146   }
 147
 148   if (cpus.size() == 0) {
 149     throw std::runtime_error("unable to load cache sharing info");
 150   }
 151
 152   std::sort(cpus.begin(),
 153             cpus.end(),
 154             [&](size_t lhs, size_t rhs) -> bool {
 155               // sort first by equiv class of cache with highest index,
 156               // direction doesn't matter.  If different cpus have
 157               // different numbers of caches then this code might produce
 158               // a sub-optimal ordering, but it won't crash
 159               auto& lhsEquiv = equivClassesByCpu[lhs];
 160               auto& rhsEquiv = equivClassesByCpu[rhs];
 161               for (int i = std::min(lhsEquiv.size(), rhsEquiv.size()) - 1;
 162                    i >= 0;
 163                    --i) {
 164                 if (lhsEquiv[i] != rhsEquiv[i]) {
 165                   return lhsEquiv[i] < rhsEquiv[i];
 166                 }
 167               }
 168
 169               // break ties deterministically by cpu
 170               return lhs < rhs;
 171             });
 172
 173   // the cpus are now sorted by locality, with neighboring entries closer
 174   // to each other than entries that are far away.  For striping we want
 175   // the inverse map, since we are starting with the cpu
 176   std::vector<size_t> indexes(cpus.size());
 177   for (size_t i = 0; i < cpus.size(); ++i) {
 178     indexes[cpus[i]] = i;
 179   }
 180
 181   return CacheLocality{
 182       cpus.size(), std::move(numCachesByLevel), std::move(indexes)};
 183 }
 184
 185 CacheLocality CacheLocality::readFromSysfs() {
 186   return readFromSysfsTree([](std::string name) {
 187     std::ifstream xi(name.c_str());
 188     std::string rv;
 189     std::getline(xi, rv);
 190     return rv;
 191   });
 192 }
 193
 194 CacheLocality CacheLocality::uniform(size_t numCpus) {
 195   CacheLocality rv;
 196
 197   rv.numCpus = numCpus;
 198
 199   // one cache shared by all cpus
 200   rv.numCachesByLevel.push_back(numCpus);
 201
 202   // no permutations in locality index mapping
 203   for (size_t cpu = 0; cpu < numCpus; ++cpu) {
 204     rv.localityIndexByCpu.push_back(cpu);
 205   }
 206
 207   return rv;
 208 }
 209
 210 ////////////// Getcpu
 211
 212 Getcpu::Func Getcpu::resolveVdsoFunc() {
 213 #if !FOLLY_HAVE_LINUX_VDSO
 214   return nullptr;
 215 #else
 216   void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
 217   if (h == nullptr) {
 218     return nullptr;
 219   }
 220
 221   auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
 222   if (func == nullptr) {
 223     // technically a null result could either be a failure or a successful
 224     // lookup of a symbol with the null value, but the second can't actually
 225     // happen for this symbol.  No point holding the handle forever if
 226     // we don't need the code
 227     dlclose(h);
 228   }
 229
 230   return func;
 231 #endif
 232 }
 233
 234 #ifdef FOLLY_TLS
 235 /////////////// SequentialThreadId
 236
 237 template <>
 238 std::atomic<size_t> SequentialThreadId<std::atomic>::prevId(0);
 239
 240 template <>
 241 FOLLY_TLS size_t SequentialThreadId<std::atomic>::currentId(0);
 242 #endif
 243
 244 /////////////// AccessSpreader
 245
 246 template <>
 247 Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc() {
 248   auto best = Getcpu::resolveVdsoFunc();
 249   return best ? best : &FallbackGetcpuType::getcpu;
 250 }
 251
 252 } // namespace detail
 253 } // namespace folly