#include <sys/mman.h>
#include <unistd.h>
#include <map>
#include <iostream>
#include <string>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <numa.h>

#include "allocator.h"
#include "spinlock.h"
#include "lockguard.h"
#include "static_vector.h"
#include "counter.h"

using namespace util;

static event_counter evt_allocator_total_region_usage(
    "allocator_total_region_usage_bytes");

// page + allocation routines taken from masstree
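
// design overview (inferred from the code below): Initialize() reserves one
// large contiguous mapping, carved into per-cpu regions [region_begin,
// region_end). each region is a bump allocator handing out hugepages;
// AllocateArenas() additionally caches freed chunks in per-size free lists
// (pc.arenas), refilled one hugepage at a time via initialize_page()
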
#ifdef MEMCHECK_MAGIC
const allocator::pgmetadata *
allocator::PointerToPgMetadata(const void *p)
{
  static const size_t hugepgsize = GetHugepageSize();
  if (unlikely(!ManagesPointer(p)))
    return nullptr;
  const size_t cpu = PointerToCpu(p);
  const regionctx &pc = g_regions[cpu];
  if (p >= pc.region_begin)
    return nullptr;
  // round p down to the enclosing huge page; the metadata lives at page start
  p = (const void *) ((uintptr_t)p & ~(hugepgsize-1));
  const pgmetadata *pmd = (const pgmetadata *) p;
  ALWAYS_ASSERT((pmd->unit_ % AllocAlignment) == 0);
  ALWAYS_ASSERT((MAX_ARENAS * AllocAlignment) >= pmd->unit_);
  return pmd;
}
#endif
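
// returns the kernel's hugepage size in bytes by scanning /proc/meminfo for
// the "Hugepagesize:" line (reported in kB, hence the * 1024)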
size_t
allocator::GetHugepageSizeImpl()
{
  FILE *f = fopen("/proc/meminfo", "r");
  ALWAYS_ASSERT(f);
  char *linep = nullptr;
  size_t n = 0;
  static const char *key = "Hugepagesize:";
  static const int keylen = strlen(key);
  size_t size = 0;
  while (getline(&linep, &n, f) > 0) {
    if (strstr(linep, key) != linep)
      continue;
    size = atol(linep + keylen) * 1024;
    break;
  }
  free(linep); // getline() allocates the line buffer
  fclose(f);
  ALWAYS_ASSERT(size);
  return size;
}

size_t
allocator::GetPageSizeImpl()
{
  return sysconf(_SC_PAGESIZE);
}
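
// MADV_WILLNEED is used by default; setting the environment variable
// DISABLE_MADV_WILLNEED to "1" or "true" turns it off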
bool
allocator::UseMAdvWillNeed()
{
  static const char *px = getenv("DISABLE_MADV_WILLNEED");
  static const std::string s = px ? to_lower(px) : "";
  static const bool use_madv = !(s == "1" || s == "true");
  return use_madv;
}
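
// must be called once, before any allocation. maxpercore is the per-cpu
// region size in bytes (rounded up to a hugepage multiple). illustrative
// usage (the values below are examples, not prescribed defaults):
//
//   allocator::Initialize(nthreads, size_t(1) << 30); // 1GB per core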
void
allocator::Initialize(size_t ncpus, size_t maxpercore)
{
  static spinlock s_lock;
  static bool s_init = false;
  if (likely(s_init))
    return;
  lock_guard<spinlock> l(s_lock);
  if (s_init)
    return;

  ALWAYS_ASSERT(!g_memstart);
  ALWAYS_ASSERT(!g_memend);
  ALWAYS_ASSERT(!g_ncpus);
  ALWAYS_ASSERT(!g_maxpercore);

  static const size_t hugepgsize = GetHugepageSize();

  // round maxpercore up to the nearest multiple of the hugepage size
  maxpercore = slow_round_up(maxpercore, hugepgsize);

  g_ncpus = ncpus;
  g_maxpercore = maxpercore;

  // mmap() the entire region for now, but just as a marker
  // (this does not actually cause physical pages to be allocated)
  // note: we allocate an extra hugepgsize so we can guarantee alignment
  // of g_memstart to a huge page boundary

  void * const x = mmap(nullptr, g_ncpus * g_maxpercore + hugepgsize,
      PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (x == MAP_FAILED) {
    perror("mmap");
    ALWAYS_ASSERT(false);
  }

  void * const endpx = (void *) ((uintptr_t)x + g_ncpus * g_maxpercore + hugepgsize);
  std::cerr << "allocator::Initialize()" << std::endl
            << "  hugepgsize: " << hugepgsize << std::endl
            << "  use MADV_WILLNEED: " << UseMAdvWillNeed() << std::endl
            << "  mmap() region [" << x << ", " << endpx << ")" << std::endl;

  g_memstart = reinterpret_cast<void *>(util::iceil(uintptr_t(x), hugepgsize));
  g_memend = reinterpret_cast<char *>(g_memstart) + (g_ncpus * g_maxpercore);

  ALWAYS_ASSERT(!(reinterpret_cast<uintptr_t>(g_memstart) % hugepgsize));
  ALWAYS_ASSERT(reinterpret_cast<uintptr_t>(g_memend) <=
      (reinterpret_cast<uintptr_t>(x) + (g_ncpus * g_maxpercore + hugepgsize)));

  for (size_t i = 0; i < g_ncpus; i++) {
    g_regions[i].region_begin =
      reinterpret_cast<char *>(g_memstart) + (i * g_maxpercore);
    g_regions[i].region_end =
      reinterpret_cast<char *>(g_memstart) + ((i + 1) * g_maxpercore);
    std::cerr << "cpu" << i << " owns [" << g_regions[i].region_begin
              << ", " << g_regions[i].region_end << ")" << std::endl;
    ALWAYS_ASSERT(g_regions[i].region_begin < g_regions[i].region_end);
    ALWAYS_ASSERT(g_regions[i].region_begin >= x);
    ALWAYS_ASSERT(g_regions[i].region_end <= endpx);
  }

  s_init = true;
}
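
// note: "remaining" is region_end - region_begin, i.e. bytes not yet handed
// out, since region_begin advances as the bump allocator is consumed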
void
allocator::DumpStats()
{
  std::cerr << "[allocator] ncpus=" << g_ncpus << std::endl;
  for (size_t i = 0; i < g_ncpus; i++) {
    const bool f = g_regions[i].region_faulted;
    const size_t remaining =
      intptr_t(g_regions[i].region_end) -
      intptr_t(g_regions[i].region_begin);
    std::cerr << "[allocator] cpu=" << i << " fully_faulted?=" << f
              << " remaining=" << remaining << " bytes" << std::endl;
  }
}
static void *
initialize_page(void *page, const size_t pagesize, const size_t unit)
{
  INVARIANT(((uintptr_t)page % pagesize) == 0);

#ifdef MEMCHECK_MAGIC
  ::allocator::pgmetadata *pmd = (::allocator::pgmetadata *) page;
  pmd->unit_ = unit;
  page = (void *) ((uintptr_t)page + sizeof(*pmd));
#endif

  void *first = (void *)util::iceil((uintptr_t)page, (uintptr_t)unit);
  INVARIANT((uintptr_t)first + unit <= (uintptr_t)page + pagesize);
  void **p = (void **)first;
  void *next = (void *)((uintptr_t)p + unit);
  while ((uintptr_t)next + unit <= (uintptr_t)page + pagesize) {
    INVARIANT(((uintptr_t)p % unit) == 0);
    *p = next;
#ifdef MEMCHECK_MAGIC
    NDB_MEMSET(
        (char *) p + sizeof(void **),
        MEMCHECK_MAGIC, unit - sizeof(void **));
#endif
    p = (void **)next;
    next = (void *)((uintptr_t)next + unit);
  }
  // terminate the free list at the last chunk that fits
  INVARIANT(((uintptr_t)p % unit) == 0);
  *p = nullptr;
#ifdef MEMCHECK_MAGIC
  NDB_MEMSET(
      (char *) p + sizeof(void **),
      MEMCHECK_MAGIC, unit - sizeof(void **));
#endif
  return first;
}
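
// returns a free list of chunks of size (arena + 1) * AllocAlignment for the
// given cpu. fast path: claim the whole cached list under the spinlock; slow
// path: carve a fresh hugepage via initialize_page()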
void *
allocator::AllocateArenas(size_t cpu, size_t arena)
{
  INVARIANT(cpu < g_ncpus);
  INVARIANT(arena < MAX_ARENAS);
  INVARIANT(g_memstart);
  INVARIANT(g_maxpercore);
  static const size_t hugepgsize = GetHugepageSize();

  regionctx &pc = g_regions[cpu];
  pc.lock.lock();
  if (likely(pc.arenas[arena])) {
    // claim the cached free list
    void *ret = pc.arenas[arena];
    pc.arenas[arena] = nullptr;
    pc.lock.unlock();
    return ret;
  }

  void * const mypx = AllocateUnmanagedWithLock(pc, 1); // releases lock
  return initialize_page(mypx, hugepgsize, (arena + 1) * AllocAlignment);
}
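
// allocates nhugepgs raw hugepages from the cpu's region; "unmanaged" memory
// bypasses the arena free lists (there is no corresponding release path in
// this file)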
void *
allocator::AllocateUnmanaged(size_t cpu, size_t nhugepgs)
{
  regionctx &pc = g_regions[cpu];
  pc.lock.lock();
  return AllocateUnmanagedWithLock(pc, nhugepgs); // releases lock
}
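
// bump-pointer allocation: advance pc.region_begin by nhugepgs hugepages.
// precondition: pc.lock is held by the caller; it is released before the
// (possibly slow) mmap/madvise work so other threads are not blocked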
void *
allocator::AllocateUnmanagedWithLock(regionctx &pc, size_t nhugepgs)
{
  static const size_t hugepgsize = GetHugepageSize();

  void * const mypx = pc.region_begin;

  // the bump pointer must stay hugepage-aligned
  if (reinterpret_cast<uintptr_t>(mypx) % hugepgsize)
    ALWAYS_ASSERT(false);

  void * const mynewpx =
    reinterpret_cast<char *>(mypx) + nhugepgs * hugepgsize;

  if (unlikely(mynewpx > pc.region_end)) {
    std::cerr << "allocator::AllocateUnmanagedWithLock():" << std::endl
              << "  region ending at " << pc.region_end << " OOM" << std::endl;
    ALWAYS_ASSERT(false); // out of memory otherwise
  }

  const bool needs_mmap = !pc.region_faulted;
  pc.region_begin = mynewpx;
  pc.lock.unlock();

  evt_allocator_total_region_usage.inc(nhugepgs * hugepgsize);

  if (needs_mmap) {
    // map (and advise) all nhugepgs pages just claimed, not only the first
    void * const x = mmap(mypx, nhugepgs * hugepgsize, PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
    if (unlikely(x == MAP_FAILED)) {
      perror("mmap");
      ALWAYS_ASSERT(false);
    }
    INVARIANT(x == mypx);
    const int advice =
      UseMAdvWillNeed() ? MADV_HUGEPAGE | MADV_WILLNEED : MADV_HUGEPAGE;
    if (madvise(x, nhugepgs * hugepgsize, advice)) {
      perror("madvise");
      ALWAYS_ASSERT(false);
    }
  }

  return mypx;
}
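
// takes ownership of MAX_ARENAS free-list heads (arenas[i] chains chunks of
// size (i + 1) * AllocAlignment). chunks are first regrouped by owning cpu
// so each region lock is taken only once, then each batch is spliced onto
// the cpu's free list in O(1) using the recorded (head, tail) pair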
void
allocator::ReleaseArenas(void **arenas)
{
  // cpu -> [(head, tail)]
  // XXX: use a small_map here?
  std::map<size_t, static_vector<std::pair<void *, void *>, MAX_ARENAS>> m;
  for (size_t arena = 0; arena < MAX_ARENAS; arena++) {
    void *p = arenas[arena];
    while (p) {
      void * const pnext = *reinterpret_cast<void **>(p);
      const size_t cpu = PointerToCpu(p);
      auto it = m.find(cpu);
      if (it == m.end()) {
        auto &v = m[cpu];
        v.resize(MAX_ARENAS);
        *reinterpret_cast<void **>(p) = nullptr;
        v[arena].first = v[arena].second = p;
      } else {
        auto &v = it->second;
        if (!v[arena].second) {
          *reinterpret_cast<void **>(p) = nullptr;
          v[arena].first = v[arena].second = p;
        } else {
          *reinterpret_cast<void **>(p) = v[arena].first;
          v[arena].first = p;
        }
      }
      p = pnext;
    }
  }
  for (auto &p : m) {
    INVARIANT(!p.second.empty());
    regionctx &pc = g_regions[p.first];
    lock_guard<spinlock> l(pc.lock);
    for (size_t arena = 0; arena < MAX_ARENAS; arena++) {
      INVARIANT(bool(p.second[arena].first) == bool(p.second[arena].second));
      if (!p.second[arena].first)
        continue;
      *reinterpret_cast<void **>(p.second[arena].second) = pc.arenas[arena];
      pc.arenas[arena] = p.second[arena].first;
    }
  }
}
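
// hint that [px, px + sz) should live on the given NUMA node; interleaving
// across a single-node mask effectively pins the pages to that node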
static void
numa_hint_memory_placement(void *px, size_t sz, unsigned node)
{
  struct bitmask *bm = numa_allocate_nodemask();
  numa_bitmask_setbit(bm, node);
  numa_interleave_memory(px, sz, bm);
  numa_free_nodemask(bm);
}
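
// eagerly pre-faults the rest of cpu's region: remap it read/write, apply the
// hugepage/willneed advice, hint NUMA placement, then write one byte per
// cacheline so every page is physically allocated up front. double-checks
// region_faulted under pc.fault_lock so concurrent callers fault it only once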
void
allocator::FaultRegion(size_t cpu)
{
  static const size_t hugepgsize = GetHugepageSize();
  ALWAYS_ASSERT(cpu < g_ncpus);
  regionctx &pc = g_regions[cpu];
  if (pc.region_faulted)
    return;
  lock_guard<std::mutex> l1(pc.fault_lock);
  lock_guard<spinlock> l(pc.lock); // exclude other users of the allocator
  if (pc.region_faulted)
    return;
  // mmap the entire region + touch it to fault it in
  if (reinterpret_cast<uintptr_t>(pc.region_begin) % hugepgsize)
    ALWAYS_ASSERT(false);
  const size_t sz =
    reinterpret_cast<uintptr_t>(pc.region_end) -
    reinterpret_cast<uintptr_t>(pc.region_begin);
  void * const x = mmap(pc.region_begin, sz, PROT_READ | PROT_WRITE,
      MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
  if (unlikely(x == MAP_FAILED)) {
    perror("mmap");
    std::cerr << "  cpu" << cpu
              << " [" << pc.region_begin << ", " << pc.region_end << ")"
              << std::endl;
    ALWAYS_ASSERT(false);
  }
  ALWAYS_ASSERT(x == pc.region_begin);
  const int advice =
    UseMAdvWillNeed() ? MADV_HUGEPAGE | MADV_WILLNEED : MADV_HUGEPAGE;
  if (madvise(x, sz, advice)) {
    perror("madvise");
    ALWAYS_ASSERT(false);
  }
  numa_hint_memory_placement(
      pc.region_begin,
      (uintptr_t)pc.region_end - (uintptr_t)pc.region_begin,
      numa_node_of_cpu(cpu));
  const size_t nfaults =
    ((uintptr_t)pc.region_end - (uintptr_t)pc.region_begin) / hugepgsize;
  std::cerr << "cpu" << cpu << " starting faulting region ("
            << intptr_t(pc.region_end) - intptr_t(pc.region_begin)
            << " bytes / " << nfaults << " hugepgs)" << std::endl;
  timer t;
  for (char *px = (char *) pc.region_begin;
       px < (char *) pc.region_end;
       px += CACHELINE_SIZE)
    *px = 0;
  std::cerr << "cpu" << cpu << " finished faulting region in "
            << t.lap_ms() << " ms" << std::endl;
  pc.region_faulted = true;
}

void *allocator::g_memstart = nullptr;
void *allocator::g_memend = nullptr;
size_t allocator::g_ncpus = 0;
size_t allocator::g_maxpercore = 0;
percore<allocator::regionctx> allocator::g_regions;