folly/concurrency/CacheLocality.cpp

   1 /*
   2  * Copyright 2013-present Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <folly/concurrency/CacheLocality.h>
  18
  19 #ifndef _MSC_VER
  20 #define _GNU_SOURCE 1 // for RTLD_NOLOAD
  21 #include <dlfcn.h>
  22 #endif
  23 #include <fstream>
  24
  25 #include <folly/Conv.h>
  26 #include <folly/Exception.h>
  27 #include <folly/FileUtil.h>
  28 #include <folly/Format.h>
  29 #include <folly/ScopeGuard.h>
  30
  31 namespace folly {
  32
  33 ///////////// CacheLocality
  34
  35 /// Returns the best real CacheLocality information available
  36 static CacheLocality getSystemLocalityInfo() {
  37 #ifdef __linux__
  38   try {
  39     return CacheLocality::readFromSysfs();
  40   } catch (...) {
  41     // keep trying
  42   }
  43 #endif
  44
  45   long numCpus = sysconf(_SC_NPROCESSORS_CONF);
  46   if (numCpus <= 0) {
  47     // This shouldn't happen, but if it does we should try to keep
  48     // going.  We are probably not going to be able to parse /sys on
  49     // this box either (although we will try), which means we are going
  50     // to fall back to the SequentialThreadId splitter.  On my 16 core
  51     // (x hyperthreading) dev box 16 stripes is enough to get pretty good
  52     // contention avoidance with SequentialThreadId, and there is little
  53     // improvement from going from 32 to 64.  This default gives us some
  54     // wiggle room
  55     numCpus = 32;
  56   }
  57   return CacheLocality::uniform(size_t(numCpus));
  58 }
  59
  60 template <>
  61 const CacheLocality& CacheLocality::system<std::atomic>() {
  62   static auto* cache = new CacheLocality(getSystemLocalityInfo());
  63   return *cache;
  64 }
  65
  66 // Each level of cache has sharing sets, which are the set of cpus
  67 // that share a common cache at that level.  These are available in a
  68 // hex bitset form (/sys/devices/system/cpu/cpu0/index0/shared_cpu_map,
  69 // for example).  They are also available in a human-readable list form,
  70 // as in /sys/devices/system/cpu/cpu0/index0/shared_cpu_list.  The list
  71 // is a comma-separated list of numbers and ranges, where the ranges are
  72 // a pair of decimal numbers separated by a '-'.
  73 //
  74 // To sort the cpus for optimum locality we don't really need to parse
  75 // the sharing sets, we just need a unique representative from the
  76 // equivalence class.  The smallest value works fine, and happens to be
  77 // the first decimal number in the file.  We load all of the equivalence
  78 // class information from all of the cpu*/index* directories, order the
  79 // cpus first by increasing last-level cache equivalence class, then by
  80 // the smaller caches.  Finally, we break ties with the cpu number itself.
  81
  82 /// Returns the first decimal number in the string, or throws an exception
  83 /// if the string does not start with a number terminated by ',', '-',
  84 /// '\n', or eos.
  85 static size_t parseLeadingNumber(const std::string& line) {
  86   auto raw = line.c_str();
  87   char* end;
  88   unsigned long val = strtoul(raw, &end, 10);
  89   if (end == raw || (*end != ',' && *end != '-' && *end != '\n' && *end != 0)) {
  90     throw std::runtime_error(
  91         to<std::string>("error parsing list '", line, "'").c_str());
  92   }
  93   return val;
  94 }
  95
  96 CacheLocality CacheLocality::readFromSysfsTree(
  97     const std::function<std::string(std::string)>& mapping) {
  98   // number of equivalence classes per level
  99   std::vector<size_t> numCachesByLevel;
 100
 101   // the list of cache equivalence classes, where equivalance classes
 102   // are named by the smallest cpu in the class
 103   std::vector<std::vector<size_t>> equivClassesByCpu;
 104
 105   std::vector<size_t> cpus;
 106
 107   while (true) {
 108     auto cpu = cpus.size();
 109     std::vector<size_t> levels;
 110     for (size_t index = 0;; ++index) {
 111       auto dir =
 112           sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
 113       auto cacheType = mapping(dir + "type");
 114       auto equivStr = mapping(dir + "shared_cpu_list");
 115       if (cacheType.size() == 0 || equivStr.size() == 0) {
 116         // no more caches
 117         break;
 118       }
 119       if (cacheType[0] == 'I') {
 120         // cacheType in { "Data", "Instruction", "Unified" }. skip icache
 121         continue;
 122       }
 123       auto equiv = parseLeadingNumber(equivStr);
 124       auto level = levels.size();
 125       levels.push_back(equiv);
 126
 127       if (equiv == cpu) {
 128         // we only want to count the equiv classes once, so we do it when
 129         // we first encounter them
 130         while (numCachesByLevel.size() <= level) {
 131           numCachesByLevel.push_back(0);
 132         }
 133         numCachesByLevel[level]++;
 134       }
 135     }
 136
 137     if (levels.size() == 0) {
 138       // no levels at all for this cpu, we must be done
 139       break;
 140     }
 141     equivClassesByCpu.emplace_back(std::move(levels));
 142     cpus.push_back(cpu);
 143   }
 144
 145   if (cpus.size() == 0) {
 146     throw std::runtime_error("unable to load cache sharing info");
 147   }
 148
 149   std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool {
 150     // sort first by equiv class of cache with highest index,
 151     // direction doesn't matter.  If different cpus have
 152     // different numbers of caches then this code might produce
 153     // a sub-optimal ordering, but it won't crash
 154     auto& lhsEquiv = equivClassesByCpu[lhs];
 155     auto& rhsEquiv = equivClassesByCpu[rhs];
 156     for (ssize_t i = ssize_t(std::min(lhsEquiv.size(), rhsEquiv.size())) - 1;
 157          i >= 0;
 158          --i) {
 159       auto idx = size_t(i);
 160       if (lhsEquiv[idx] != rhsEquiv[idx]) {
 161         return lhsEquiv[idx] < rhsEquiv[idx];
 162       }
 163     }
 164
 165     // break ties deterministically by cpu
 166     return lhs < rhs;
 167   });
 168
 169   // the cpus are now sorted by locality, with neighboring entries closer
 170   // to each other than entries that are far away.  For striping we want
 171   // the inverse map, since we are starting with the cpu
 172   std::vector<size_t> indexes(cpus.size());
 173   for (size_t i = 0; i < cpus.size(); ++i) {
 174     indexes[cpus[i]] = i;
 175   }
 176
 177   return CacheLocality{
 178       cpus.size(), std::move(numCachesByLevel), std::move(indexes)};
 179 }
 180
 181 CacheLocality CacheLocality::readFromSysfs() {
 182   return readFromSysfsTree([](std::string name) {
 183     std::ifstream xi(name.c_str());
 184     std::string rv;
 185     std::getline(xi, rv);
 186     return rv;
 187   });
 188 }
 189
 190 CacheLocality CacheLocality::uniform(size_t numCpus) {
 191   CacheLocality rv;
 192
 193   rv.numCpus = numCpus;
 194
 195   // one cache shared by all cpus
 196   rv.numCachesByLevel.push_back(numCpus);
 197
 198   // no permutations in locality index mapping
 199   for (size_t cpu = 0; cpu < numCpus; ++cpu) {
 200     rv.localityIndexByCpu.push_back(cpu);
 201   }
 202
 203   return rv;
 204 }
 205
 206 ////////////// Getcpu
 207
 208 Getcpu::Func Getcpu::resolveVdsoFunc() {
 209 #if !FOLLY_HAVE_LINUX_VDSO
 210   return nullptr;
 211 #else
 212   void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
 213   if (h == nullptr) {
 214     return nullptr;
 215   }
 216
 217   auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
 218   if (func == nullptr) {
 219     // technically a null result could either be a failure or a successful
 220     // lookup of a symbol with the null value, but the second can't actually
 221     // happen for this symbol.  No point holding the handle forever if
 222     // we don't need the code
 223     dlclose(h);
 224   }
 225
 226   return func;
 227 #endif
 228 }
 229
 230 #ifdef FOLLY_TLS
 231 /////////////// SequentialThreadId
 232 template struct SequentialThreadId<std::atomic>;
 233 #endif
 234
 235 /////////////// AccessSpreader
 236 template struct AccessSpreader<std::atomic>;
 237
 238 SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz)
 239     : allocSize_{allocSize}, sz_(sz) {}
 240
 241 SimpleAllocator::~SimpleAllocator() {
 242   std::lock_guard<std::mutex> g(m_);
 243   for (auto& block : blocks_) {
 244     folly::aligned_free(block);
 245   }
 246 }
 247
 248 void* SimpleAllocator::allocateHard() {
 249   // Allocate a new slab.
 250   mem_ = static_cast<uint8_t*>(folly::aligned_malloc(allocSize_, allocSize_));
 251   if (!mem_) {
 252     std::__throw_bad_alloc();
 253   }
 254   end_ = mem_ + allocSize_;
 255   blocks_.push_back(mem_);
 256
 257   // Install a pointer to ourselves as the allocator.
 258   *reinterpret_cast<SimpleAllocator**>(mem_) = this;
 259   static_assert(max_align_v >= sizeof(SimpleAllocator*), "alignment too small");
 260   mem_ += std::min(sz_, max_align_v);
 261
 262   // New allocation.
 263   auto mem = mem_;
 264   mem_ += sz_;
 265   assert(intptr_t(mem) % 128 != 0);
 266   return mem;
 267 }
 268
 269 } // namespace folly