2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/detail/CacheLocality.h>
20 #define _GNU_SOURCE 1 // for RTLD_NOLOAD
25 #include <folly/Conv.h>
26 #include <folly/Exception.h>
27 #include <folly/FileUtil.h>
28 #include <folly/Format.h>
29 #include <folly/ScopeGuard.h>
34 ///////////// CacheLocality
36 /// Returns the best real CacheLocality information available
37 static CacheLocality getSystemLocalityInfo() {
40 return CacheLocality::readFromSysfs();
46 long numCpus = sysconf(_SC_NPROCESSORS_CONF);
48 // This shouldn't happen, but if it does we should try to keep
49 // going. We are probably not going to be able to parse /sys on
50 // this box either (although we will try), which means we are going
51 // to fall back to the SequentialThreadId splitter. On my 16 core
52 // (x hyperthreading) dev box 16 stripes is enough to get pretty good
53 // contention avoidance with SequentialThreadId, and there is little
54 // improvement from going from 32 to 64. This default gives us some
58 return CacheLocality::uniform(size_t(numCpus));
62 const CacheLocality& CacheLocality::system<std::atomic>() {
63 static auto* cache = new CacheLocality(getSystemLocalityInfo());
67 // Each level of cache has sharing sets, which are the set of cpus
68 // that share a common cache at that level. These are available in a
69 // hex bitset form (/sys/devices/system/cpu/cpu0/index0/shared_cpu_map,
70 // for example). They are also available in a human-readable list form,
71 // as in /sys/devices/system/cpu/cpu0/index0/shared_cpu_list. The list
72 // is a comma-separated list of numbers and ranges, where the ranges are
73 // a pair of decimal numbers separated by a '-'.
75 // To sort the cpus for optimum locality we don't really need to parse
76 // the sharing sets, we just need a unique representative from the
77 // equivalence class. The smallest value works fine, and happens to be
78 // the first decimal number in the file. We load all of the equivalence
79 // class information from all of the cpu*/index* directories, order the
80 // cpus first by increasing last-level cache equivalence class, then by
81 // the smaller caches. Finally, we break ties with the cpu number itself.
83 /// Returns the first decimal number in the string, or throws an exception
84 /// if the string does not start with a number terminated by ',', '-',
86 static size_t parseLeadingNumber(const std::string& line) {
87 auto raw = line.c_str();
89 unsigned long val = strtoul(raw, &end, 10);
90 if (end == raw || (*end != ',' && *end != '-' && *end != '\n' && *end != 0)) {
91 throw std::runtime_error(
92 to<std::string>("error parsing list '", line, "'").c_str());
97 CacheLocality CacheLocality::readFromSysfsTree(
98 const std::function<std::string(std::string)>& mapping) {
99 // number of equivalence classes per level
100 std::vector<size_t> numCachesByLevel;
102 // the list of cache equivalence classes, where equivalance classes
103 // are named by the smallest cpu in the class
104 std::vector<std::vector<size_t>> equivClassesByCpu;
106 std::vector<size_t> cpus;
109 auto cpu = cpus.size();
110 std::vector<size_t> levels;
111 for (size_t index = 0;; ++index) {
113 sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
114 auto cacheType = mapping(dir + "type");
115 auto equivStr = mapping(dir + "shared_cpu_list");
116 if (cacheType.size() == 0 || equivStr.size() == 0) {
120 if (cacheType[0] == 'I') {
121 // cacheType in { "Data", "Instruction", "Unified" }. skip icache
124 auto equiv = parseLeadingNumber(equivStr);
125 auto level = levels.size();
126 levels.push_back(equiv);
129 // we only want to count the equiv classes once, so we do it when
130 // we first encounter them
131 while (numCachesByLevel.size() <= level) {
132 numCachesByLevel.push_back(0);
134 numCachesByLevel[level]++;
138 if (levels.size() == 0) {
139 // no levels at all for this cpu, we must be done
142 equivClassesByCpu.emplace_back(std::move(levels));
146 if (cpus.size() == 0) {
147 throw std::runtime_error("unable to load cache sharing info");
150 std::sort(cpus.begin(),
152 [&](size_t lhs, size_t rhs) -> bool {
153 // sort first by equiv class of cache with highest index,
154 // direction doesn't matter. If different cpus have
155 // different numbers of caches then this code might produce
156 // a sub-optimal ordering, but it won't crash
157 auto& lhsEquiv = equivClassesByCpu[lhs];
158 auto& rhsEquiv = equivClassesByCpu[rhs];
159 for (ssize_t i = ssize_t(std::min(lhsEquiv.size(), rhsEquiv.size())) - 1;
162 auto idx = size_t(i);
163 if (lhsEquiv[idx] != rhsEquiv[idx]) {
164 return lhsEquiv[idx] < rhsEquiv[idx];
168 // break ties deterministically by cpu
172 // the cpus are now sorted by locality, with neighboring entries closer
173 // to each other than entries that are far away. For striping we want
174 // the inverse map, since we are starting with the cpu
175 std::vector<size_t> indexes(cpus.size());
176 for (size_t i = 0; i < cpus.size(); ++i) {
177 indexes[cpus[i]] = i;
180 return CacheLocality{
181 cpus.size(), std::move(numCachesByLevel), std::move(indexes)};
184 CacheLocality CacheLocality::readFromSysfs() {
185 return readFromSysfsTree([](std::string name) {
186 std::ifstream xi(name.c_str());
188 std::getline(xi, rv);
193 CacheLocality CacheLocality::uniform(size_t numCpus) {
196 rv.numCpus = numCpus;
198 // one cache shared by all cpus
199 rv.numCachesByLevel.push_back(numCpus);
201 // no permutations in locality index mapping
202 for (size_t cpu = 0; cpu < numCpus; ++cpu) {
203 rv.localityIndexByCpu.push_back(cpu);
209 ////////////// Getcpu
211 Getcpu::Func Getcpu::resolveVdsoFunc() {
212 #if !FOLLY_HAVE_LINUX_VDSO
215 void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
220 auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
221 if (func == nullptr) {
222 // technically a null result could either be a failure or a successful
223 // lookup of a symbol with the null value, but the second can't actually
224 // happen for this symbol. No point holding the handle forever if
225 // we don't need the code
234 /////////////// SequentialThreadId
235 template struct SequentialThreadId<std::atomic>;
238 /////////////// AccessSpreader
239 template struct AccessSpreader<std::atomic>;
241 SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz)
242 : allocSize_{allocSize}, sz_(sz) {}
244 SimpleAllocator::~SimpleAllocator() {
245 std::lock_guard<std::mutex> g(m_);
246 for (auto& block : blocks_) {
251 void* SimpleAllocator::allocateHard() {
252 // Allocate a new slab.
253 mem_ = static_cast<uint8_t*>(aligned_malloc(allocSize_, allocSize_));
255 std::__throw_bad_alloc();
257 end_ = mem_ + allocSize_;
258 blocks_.push_back(mem_);
260 // Install a pointer to ourselves as the allocator.
261 *reinterpret_cast<SimpleAllocator**>(mem_) = this;
263 alignof(std::max_align_t) >= sizeof(SimpleAllocator*),
264 "alignment too small");
265 mem_ += std::min(sz_, alignof(std::max_align_t));
270 assert(intptr_t(mem) % 128 != 0);
274 } // namespace detail