2 * Copyright 2015 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/MemoryMapping.h>
18 #include <folly/Format.h>
19 #include <folly/Portability.h>
22 #include <folly/experimental/io/HugePages.h>
27 #include <sys/types.h>
28 #include <system_error>
29 #include <gflags/gflags.h>
// Command-line flag: upper bound, in bytes, for each individual
// mlock/munlock/munmap syscall issued by memOpInChunks() below; the value is
// rounded up to a whole number of pages by memOpChunkSize().
31 DEFINE_int64(mlock_chunk_size, 1 << 20, // 1MB
32 "Maximum bytes to mlock/munlock/munmap at once "
33 "(will be rounded up to PAGESIZE)");
// Fallback definition so `flags |= MAP_POPULATE` in init() compiles on
// platforms without MAP_POPULATE; 0 is a no-op when OR'd into mmap flags.
// NOTE(review): this is presumably inside an #ifndef MAP_POPULATE guard --
// the guard lines appear elided in this view; confirm against the full file.
36 #define MAP_POPULATE 0
// Move constructor: takes over the other mapping's file, mapped range,
// options and lock state. NOTE(review): the body is elided in this view;
// given swap() below, it presumably just swaps with `other` -- confirm.
41 MemoryMapping::MemoryMapping(MemoryMapping&& other) noexcept {
// Map a byte range of an already-open File. The heavy lifting (page-size
// resolution, rounding, mmap) happens in init(); this ctor only stores the
// file and options. NOTE(review): the options parameter line and the body
// are elided in this view.
45 MemoryMapping::MemoryMapping(File file, off_t offset, off_t length,
47 : file_(std::move(file)),
48 options_(std::move(options)) {
// Convenience overload: open the named file (read-write only when
// options.writable is set, read-only otherwise) and delegate to the
// File-based constructor above.
53 MemoryMapping::MemoryMapping(const char* name, off_t offset, off_t length,
55 : MemoryMapping(File(name, options.writable ? O_RDWR : O_RDONLY),
// Convenience overload for a raw fd: wraps it in a File (which takes
// ownership of the descriptor) and delegates to the File-based constructor.
60 MemoryMapping::MemoryMapping(int fd, off_t offset, off_t length,
62 : MemoryMapping(File(fd), offset, length, options) { }
// Anonymous (not file-backed) mapping of `length` bytes; file_ stays empty,
// which init() detects (anon = !file_) to add MAP_ANONYMOUS to the mmap
// flags. NOTE(review): the body is elided in this view.
64 MemoryMapping::MemoryMapping(AnonymousType, off_t length, Options options)
65 : options_(std::move(options)) {
// Per-device mapping parameters: if `device` is a hugetlbfs filesystem,
// report its huge-page size via `pageSize` and (per the comment in init())
// that files on it grow to page multiples without ftruncate (`autoExtend`).
// NOTE(review): both bodies are elided in this view; the two overloads are
// presumably split by an #ifdef (Linux vs. other platforms) -- confirm.
72 void getDeviceOptions(dev_t device, off_t& pageSize, bool& autoExtend) {
73 auto ps = getHugePageSizeForDevice(device);
// Fallback overload for platforms without hugepage support.
80 inline void getDeviceOptions(dev_t device, off_t& pageSize,
// Core setup shared by all constructors: determine the page size (regular or
// hugepage), round [offset, offset + length) out to page boundaries, grow the
// backing file if requested, mmap() the region, and point data_ at exactly
// the caller-requested subrange.
// NOTE(review): many lines of this function are elided in this view; the
// comments below describe only what the visible code shows.
86 void MemoryMapping::init(off_t offset, off_t length) {
87 const bool grow = options_.grow;
88 const bool anon = !file_;
// Growing is meaningless for an anonymous mapping -- no file to extend.
89 CHECK(!(grow && anon));
91 off_t& pageSize = options_.pageSize;
95 // On Linux, hugetlbfs file systems don't require ftruncate() to grow the
96 // file, and (on kernels before 2.6.24) don't even allow it. Also, the file
97 // size is always a multiple of the page size.
98 bool autoExtend = false;
// File-backed path: stat the file to learn its size/device and let the
// device pick the page size (hugetlbfs reports its huge-page size).
102 CHECK_ERR(fstat(file_.fd(), &st));
105 getDeviceOptions(st.st_dev, pageSize, autoExtend);
// Presumably the anonymous branch -- TODO confirm (branch header elided):
// offsets and caller-supplied page sizes are not supported here.
109 DCHECK_EQ(offset, 0);
110 CHECK_EQ(pageSize, 0);
// Default to the system page size when nothing set one above.
115 pageSize = sysconf(_SC_PAGESIZE);
118 CHECK_GT(pageSize, 0);
119 CHECK_EQ(pageSize & (pageSize - 1), 0); // power of two
// skipStart = leading bytes between the page-aligned mapping start and the
// byte the caller actually asked for; data_ skips them at the end.
122 // Round down the start of the mapped region
123 size_t skipStart = offset % pageSize;
// mapLength_ == -1 means "map to end of file"; resolved further down.
127 if (mapLength_ != -1) {
128 mapLength_ += skipStart;
130 // Round up the end of the mapped region
131 mapLength_ = (mapLength_ + pageSize - 1) / pageSize * pageSize;
// Bytes available from `offset` to the end of the file (or the requested
// length for anonymous memory).
134 off_t remaining = anon ? length : st.st_size - offset;
136 if (mapLength_ == -1) {
137 length = mapLength_ = remaining;
// Caller wants more than the file currently holds: grow it with
// ftruncate(), except on auto-extending filesystems (hugetlbfs).
139 if (length > remaining) {
142 PCHECK(0 == ftruncate(file_.fd(), offset + length))
143 << "ftruncate() failed, couldn't grow file to "
147 // Extend mapping to multiple of page size, don't use ftruncate
148 remaining = mapLength_;
// Never map beyond what is (now) available.
154 if (mapLength_ > remaining) {
155 mapLength_ = remaining;
// Translate Options into mmap flags and protection bits.
163 int flags = options_.shared ? MAP_SHARED : MAP_PRIVATE;
164 if (anon) flags |= MAP_ANONYMOUS;
165 if (options_.prefault) flags |= MAP_POPULATE;
167 // The standard doesn't actually require PROT_NONE to be zero...
168 int prot = PROT_NONE;
169 if (options_.readable || options_.writable) {
170 prot = ((options_.readable ? PROT_READ : 0) |
171 (options_.writable ? PROT_WRITE : 0));
// mmap the page-aligned region; PCHECK aborts with errno context on failure.
174 unsigned char* start = static_cast<unsigned char*>(
175 mmap(options_.address, mapLength_, prot, flags, file_.fd(), offset));
176 PCHECK(start != MAP_FAILED)
177 << " offset=" << offset
178 << " length=" << mapLength_;
// Expose exactly the caller-requested range, skipping the leading bytes
// introduced by the page-boundary rounding above.
180 data_.reset(start + skipStart, length);
// Chunk size used by memOpInChunks(): the whole `length` when chunking is
// disabled (FLAGS_mlock_chunk_size <= 0), otherwise the flag value rounded
// up to a multiple of `pageSize`. NOTE(review): the early-return for the
// disabled case and the `if (r)` guard around the round-up appear elided in
// this view.
186 off_t memOpChunkSize(off_t length, off_t pageSize) {
187 off_t chunkSize = length;
188 if (FLAGS_mlock_chunk_size <= 0) {
192 chunkSize = FLAGS_mlock_chunk_size;
// r = how far chunkSize overshoots the previous page boundary; adding
// (pageSize - r) rounds it up to the next boundary.
193 off_t r = chunkSize % pageSize;
195 chunkSize += (pageSize - r);
201 * Run @op in chunks over the buffer @mem of @bufSize length.
// @op is one of ::mlock / ::munlock / ::munmap (returns 0 on success).
204 * - success: true + amountSucceeded == bufSize (op success on whole buffer)
205 * - failure: false + amountSucceeded == nr bytes on which op succeeded.
207 bool memOpInChunks(std::function<int(void*, size_t)> op,
208 void* mem, size_t bufSize, off_t pageSize,
209 size_t& amountSucceeded) {
210 // unmap/mlock/munlock take a kernel semaphore and block other threads from
211 // doing other memory operations. If the size of the buffer is big the
212 // semaphore can be down for seconds (for benchmarks see
213 // http://kostja-osipov.livejournal.com/42963.html). Doing the operations in
214 // chunks breaks the locking into intervals and lets other threads do memory
215 // operations of their own.
217 size_t chunkSize = memOpChunkSize(bufSize, pageSize);
219 char* addr = static_cast<char*>(mem);
// NOTE(review): the initialization of amountSucceeded (presumably to 0) is
// elided in this view; callers also pre-zero it before calling.
// Apply op chunk by chunk; stop at the first failure so amountSucceeded
// reports exactly how many leading bytes the op succeeded on.
222 while (amountSucceeded < bufSize) {
223 size_t size = std::min(chunkSize, bufSize - amountSucceeded);
224 if (op(addr + amountSucceeded, size) != 0) {
227 amountSucceeded += size;
233 } // anonymous namespace
// Lock the mapped region into physical memory, in chunks (see
// memOpInChunks). On failure: TRY_LOCK tolerates EPERM/ENOMEM with a
// warning, and any partially-locked prefix is munlock'ed back. Returns
// locked_ (whether the whole mapping is now locked). NOTE(review): several
// control-flow lines (the early-return on success, MUST_LOCK handling) are
// elided in this view.
235 bool MemoryMapping::mlock(LockMode lock) {
236 size_t amountSucceeded = 0;
237 locked_ = memOpInChunks(::mlock, mapStart_, mapLength_, options_.pageSize,
// Build the failure message before errno-dependent branching below.
243 auto msg(folly::format(
244 "mlock({}) failed at {}",
245 mapLength_, amountSucceeded).str());
247 if (lock == LockMode::TRY_LOCK && (errno == EPERM || errno == ENOMEM)) {
248 PLOG(WARNING) << msg;
253 // only part of the buffer was mlocked, unlock it back
254 if (!memOpInChunks(::munlock, mapStart_, amountSucceeded, options_.pageSize,
256 PLOG(WARNING) << "munlock()";
// Undo mlock(): no-op if not locked; munlock in chunks (warn on failure),
// and optionally tell the kernel the pages are no longer needed
// (MADV_DONTNEED) when `dontneed` is set.
262 void MemoryMapping::munlock(bool dontneed) {
263 if (!locked_) return;
265 size_t amountSucceeded = 0;
266 if (!memOpInChunks(::munlock, mapStart_, mapLength_, options_.pageSize,
268 PLOG(WARNING) << "munlock()";
// Skip madvise entirely for an empty mapping.
270 if (mapLength_ && dontneed &&
271 ::madvise(mapStart_, mapLength_, MADV_DONTNEED)) {
272 PLOG(WARNING) << "madvise()";
// Hint to the kernel that the mapping will be read sequentially (enables
// aggressive readahead); used by mmapFileCopy() below.
277 void MemoryMapping::hintLinearScan() {
278 advise(MADV_SEQUENTIAL);
// Destructor: munmap the region in chunks. A failed munmap is unrecoverable
// (the address space is in an unknown state), hence PLOG(FATAL).
// NOTE(review): the guard around the munmap (presumably `if (mapLength_)`)
// is elided in this view.
281 MemoryMapping::~MemoryMapping() {
283 size_t amountSucceeded = 0;
284 if (!memOpInChunks(::munmap, mapStart_, mapLength_, options_.pageSize,
286 PLOG(FATAL) << folly::format(
287 "munmap({}) failed at {}",
288 mapLength_, amountSucceeded).str();
// Thin madvise() wrapper over the whole mapped region; skips empty mappings
// and only warns on failure (advice is best-effort).
293 void MemoryMapping::advise(int advice) const {
294 if (mapLength_ && ::madvise(mapStart_, mapLength_, advice)) {
295 PLOG(WARNING) << "madvise()";
// Copy-and-swap style assignment: `other` is taken by value, so this single
// operator serves as move-assignment. NOTE(review): the body is elided in
// this view; presumably `swap(other); return *this;` -- confirm.
299 MemoryMapping& MemoryMapping::operator=(MemoryMapping other) {
// Member-wise swap of all state; noexcept so it is safe to use from the
// move constructor and assignment operator. NOTE(review): a `using
// std::swap;` line (needed for the unqualified calls below) appears elided
// in this view.
304 void MemoryMapping::swap(MemoryMapping& other) noexcept {
306 swap(this->file_, other.file_);
307 swap(this->mapStart_, other.mapStart_);
308 swap(this->mapLength_, other.mapLength_);
309 swap(this->options_, other.options_);
310 swap(this->locked_, other.locked_);
311 swap(this->data_, other.data_);
314 void swap(MemoryMapping& a, MemoryMapping& b) noexcept { a.swap(b); }
// Forward-only memcpy for word-aligned src/dst: copies whole unsigned longs
// first, then (presumably) the remaining tail bytes one at a time.
// NOTE(review): the word-copy and byte-copy loop bodies are elided in this
// view -- only the setup and loop headers are visible.
316 void alignedForwardMemcpy(void* dst, const void* src, size_t size) {
// Both pointers must be aligned for word-sized loads/stores.
317 assert(reinterpret_cast<uintptr_t>(src) % alignof(unsigned long) == 0);
318 assert(reinterpret_cast<uintptr_t>(dst) % alignof(unsigned long) == 0);
320 auto srcl = static_cast<const unsigned long*>(src);
321 auto dstl = static_cast<unsigned long*>(dst);
// Bulk phase: copy word-by-word while at least one full word remains.
323 while (size >= sizeof(unsigned long)) {
325 size -= sizeof(unsigned long);
// Tail phase: reinterpret the cursors as byte pointers for the remainder.
328 auto srcc = reinterpret_cast<const unsigned char*>(srcl);
329 auto dstc = reinterpret_cast<unsigned char*>(dstl);
// Copy file `src` to `dest` (created/truncated with `mode`) by mmapping both
// and doing one aligned forward copy; the source mapping is hinted for
// sequential access to enable kernel readahead. NOTE(review): one argument
// line of the destMap constructor call (presumably the offset, 0) is elided
// in this view.
337 void mmapFileCopy(const char* src, const char* dest, mode_t mode) {
338 MemoryMapping srcMap(src);
339 srcMap.hintLinearScan();
// Writable destination mapping sized to the whole source range; the grow
// logic in init() extends the new (empty) file to that size.
341 MemoryMapping destMap(
342 File(dest, O_RDWR | O_CREAT | O_TRUNC, mode),
344 srcMap.range().size(),
345 MemoryMapping::writable());
// mmap'd regions are page-aligned, satisfying alignedForwardMemcpy's
// alignment preconditions.
347 alignedForwardMemcpy(destMap.writableRange().data(),
348 srcMap.range().data(),
349 srcMap.range().size());