* Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
+#include <linux/mmdebug.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/sched/sysctl.h>
#include <linux/notifier.h>
#include <linux/memory.h>
+#include <linux/printk.h>
+#include <linux/userfaultfd_k.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#define arch_rebalance_pgtables(addr, len) (addr)
#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
+const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
+int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
+const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
+int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
+#endif
+
+
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
* MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (yes) yes w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- *
+ *
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
}
EXPORT_SYMBOL(vm_get_page_prot);
+static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
+{
+ return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
+}
+
+/* Update vma->vm_page_prot to reflect vma->vm_flags. */
+void vma_set_page_prot(struct vm_area_struct *vma)
+{
+ unsigned long vm_flags = vma->vm_flags;
+
+ vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
+ if (vma_wants_writenotify(vma)) {
+ vm_flags &= ~VM_SHARED;
+ vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot,
+ vm_flags);
+ }
+}
+
+
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed, reserve;
+ long free, allowed, reserve;
+
+ VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
+ -(s64)vm_committed_as_batch * num_online_cpus(),
+ "memory commitment underflow");
vm_acct_memory(pages);
goto error;
}
- allowed = (totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100;
+ allowed = vm_commit_limit();
/*
* Reserve some for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed += total_swap_pages;
/*
* Don't let a single process grow so big a user can't recover
*/
if (mm) {
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min(mm->total_vm / 32, reserve);
+ allowed -= min_t(long, mm->total_vm / 32, reserve);
}
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
}
/*
- * Requires inode->i_mapping->i_mmap_mutex
+ * Requires inode->i_mapping->i_mmap_rwsem
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
if (vma->vm_flags & VM_DENYWRITE)
atomic_inc(&file_inode(file)->i_writecount);
if (vma->vm_flags & VM_SHARED)
- mapping->i_mmap_writable--;
+ mapping_unmap_writable(mapping);
flush_dcache_mmap_lock(mapping);
- if (unlikely(vma->vm_flags & VM_NONLINEAR))
- list_del_init(&vma->shared.nonlinear);
- else
- vma_interval_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
if (file) {
struct address_space *mapping = file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
__remove_shared_vm_struct(vma, file, mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
}
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
- unsigned long rlim, retval;
+ unsigned long retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
unsigned long min_brk;
* segment grow beyond its set limit the in case where the limit is
* not page aligned -Ram Gupta
*/
- rlim = rlimit(RLIMIT_DATA);
- if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
+ mm->end_data, mm->start_data))
goto out;
newbrk = PAGE_ALIGN(brk);
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
if (vma->vm_start < prev) {
- printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
+ pr_emerg("vm_start %lx < prev %lx\n",
+ vma->vm_start, prev);
bug = 1;
}
if (vma->vm_start < pend) {
- printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
+ pr_emerg("vm_start %lx < pend %lx\n",
+ vma->vm_start, pend);
bug = 1;
}
if (vma->vm_start > vma->vm_end) {
- printk("vm_end %lx < vm_start %lx\n",
- vma->vm_end, vma->vm_start);
+ pr_emerg("vm_start %lx > vm_end %lx\n",
+ vma->vm_start, vma->vm_end);
bug = 1;
}
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- printk("free gap %lx, correct %lx\n",
+ pr_emerg("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
for (nd = pn; nd; nd = rb_prev(nd))
j++;
if (i != j) {
- printk("backwards %d, forwards %d\n", j, i);
+ pr_emerg("backwards %d, forwards %d\n", j, i);
bug = 1;
}
return bug ? -1 : i;
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- BUG_ON(vma != ignore &&
- vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
+ VM_BUG_ON_VMA(vma != ignore &&
+ vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
+ vma);
}
}
-void validate_mm(struct mm_struct *mm)
+static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
unsigned long highest_address = 0;
struct vm_area_struct *vma = mm->mmap;
+
while (vma) {
struct anon_vma_chain *avc;
+
vma_lock_anon_vma(vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
i++;
}
if (i != mm->map_count) {
- printk("map_count %d vm_next %d\n", mm->map_count, i);
+ pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
bug = 1;
}
if (highest_address != mm->highest_vm_end) {
- printk("mm->highest_vm_end %lx, found %lx\n",
- mm->highest_vm_end, highest_address);
+ pr_emerg("mm->highest_vm_end %lx, found %lx\n",
+ mm->highest_vm_end, highest_address);
bug = 1;
}
i = browse_rb(&mm->mm_rb);
if (i != mm->map_count) {
- printk("map_count %d rb %d\n", mm->map_count, i);
+ if (i != -1)
+ pr_emerg("map_count %d rb %d\n", mm->map_count, i);
bug = 1;
}
- BUG_ON(bug);
+ VM_BUG_ON_MM(bug, mm);
}
#else
#define validate_mm_rb(root, ignore) do { } while (0)
if (vma->vm_flags & VM_DENYWRITE)
atomic_dec(&file_inode(file)->i_writecount);
if (vma->vm_flags & VM_SHARED)
- mapping->i_mmap_writable++;
+ atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
- if (unlikely(vma->vm_flags & VM_NONLINEAR))
- vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
- else
- vma_interval_tree_insert(vma, &mapping->i_mmap);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
}
{
struct address_space *mapping = NULL;
- if (vma->vm_file)
+ if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
-
- if (mapping)
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
+ }
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
if (mapping)
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
mm->map_count++;
validate_mm(mm);
prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
* split_vma inserting another: so it must be
* mprotect case 4 shifting the boundary down.
*/
- adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
+ adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
exporter = vma;
importer = next;
}
* shrinking vma had, to cover any anon pages imported.
*/
if (exporter && exporter->anon_vma && !importer->anon_vma) {
- if (anon_vma_clone(importer, exporter))
- return -ENOMEM;
+ int error;
+
importer->anon_vma = exporter->anon_vma;
+ error = anon_vma_clone(importer, exporter);
+ if (error)
+ return error;
}
}
if (file) {
mapping = file->f_mapping;
- if (!(vma->vm_flags & VM_NONLINEAR)) {
- root = &mapping->i_mmap;
- uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+ root = &mapping->i_mmap;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
- if (adjust_next)
- uprobe_munmap(next, next->vm_start,
- next->vm_end);
- }
+ if (adjust_next)
+ uprobe_munmap(next, next->vm_start, next->vm_end);
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
if (insert) {
/*
* Put into interval tree now, so instantiated pages
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
if (anon_vma) {
- VM_BUG_ON(adjust_next && next->anon_vma &&
- anon_vma != next->anon_vma);
+ VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
+ anon_vma != next->anon_vma, next);
anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
anon_vma_unlock_write(anon_vma);
}
if (mapping)
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
if (root) {
uprobe_mmap(vma);
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
- vma_set_policy(vma, vma_policy(next));
+ mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* per-vma resources, so we don't attempt to merge those.
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags)
+ struct file *file, unsigned long vm_flags,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
- if (vma->vm_flags ^ vm_flags)
+ /*
+ * VM_SOFTDIRTY should not prevent from VMA merging, if we
+ * match the flags but dirty bit -- the caller should mark
+ * merged VMA as dirty. If dirty bit won't be excluded from
+ * comparison, we increase pressue on the memory system forcing
+ * the kernel to generate new VMAs when old one could be
+ * extended instead.
+ */
+ if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
return 0;
if (vma->vm_file != file)
return 0;
if (vma->vm_ops && vma->vm_ops->close)
return 0;
+ if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
+ return 0;
+ if (vma_get_anon_name(vma) != anon_name)
+ return 0;
return 1;
}
*/
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
*/
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
- vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ vm_pglen = vma_pages(vma);
if (vma->vm_pgoff + vm_pglen == vm_pgoff)
return 1;
}
}
/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor. Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t pgoff, struct mempolicy *policy)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t pgoff, struct mempolicy *policy,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
* Can it merge with the predecessor?
*/
if (prev && prev->vm_end == addr &&
- mpol_equal(vma_policy(prev), policy) &&
+ mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
- anon_vma, file, pgoff)) {
+ anon_vma, file, pgoff,
+ vm_userfaultfd_ctx,
+ anon_name)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen) &&
+ anon_vma, file,
+ pgoff+pglen,
+ vm_userfaultfd_ctx,
+ anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
end, prev->vm_pgoff, NULL);
if (err)
return NULL;
- khugepaged_enter_vma_merge(prev);
+ khugepaged_enter_vma_merge(prev, vm_flags);
return prev;
}
* Can this new request be merged in front of next?
*/
if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
+ mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen)) {
+ anon_vma, file, pgoff+pglen,
+ vm_userfaultfd_ctx,
+ anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL);
next->vm_pgoff - pglen, NULL);
if (err)
return NULL;
- khugepaged_enter_vma_merge(area);
+ khugepaged_enter_vma_merge(area, vm_flags);
return area;
}
return a->vm_end == b->vm_start &&
mpol_equal(vma_policy(a), vma_policy(b)) &&
a->vm_file == b->vm_file &&
- !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}
* by another page fault trying to merge _that_. But that's ok: if it
* is being set up, that automatically means that it will be a singleton
* acceptable for merging, so we can do all of this optimistically. But
- * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
+ * we do that READ_ONCE() to make sure that we never re-load the pointer.
*
* IOW: that the "list_is_singular()" test on the anon_vma_chain only
* matters for the 'stable anon_vma' case (ie the thing we want to avoid
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
{
if (anon_vma_compatible(a, b)) {
- struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
+ struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
if (anon_vma && list_is_singular(&old->anon_vma_chain))
return anon_vma;
return hint;
}
+static inline int mlock_future_check(struct mm_struct *mm,
+ unsigned long flags,
+ unsigned long len)
+{
+ unsigned long locked, lock_limit;
+
+ /* mlock MCL_FUTURE? */
+ if (flags & VM_LOCKED) {
+ locked = len >> PAGE_SHIFT;
+ locked += mm->locked_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ return -EAGAIN;
+ }
+ return 0;
+}
+
/*
* The caller must hold down_write(¤t->mm->mmap_sem).
*/
-
-unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
- unsigned long flags, unsigned long pgoff,
- unsigned long *populate)
+ unsigned long flags, vm_flags_t vm_flags,
+ unsigned long pgoff, unsigned long *populate)
{
- struct mm_struct * mm = current->mm;
- struct inode *inode;
- vm_flags_t vm_flags;
+ struct mm_struct *mm = current->mm;
*populate = 0;
+ if (!len)
+ return -EINVAL;
+
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
* mounted, in which case we dont add PROT_EXEC.)
*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
+ if (!(file && path_noexec(&file->f_path)))
prot |= PROT_EXEC;
- if (!len)
- return -EINVAL;
-
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
- return -EOVERFLOW;
+ return -EOVERFLOW;
/* Too many mappings? */
if (mm->map_count > sysctl_max_map_count)
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+ vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
- /* mlock MCL_FUTURE? */
- if (vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
-
- inode = file ? file_inode(file) : NULL;
+ if (mlock_future_check(mm, vm_flags, len))
+ return -EAGAIN;
if (file) {
+ struct inode *inode = file_inode(file);
+
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
/*
* Make sure there are no mandatory locks on the file.
*/
- if (locks_verify_locked(inode))
+ if (locks_verify_locked(file))
return -EAGAIN;
vm_flags |= VM_SHARED | VM_MAYSHARE;
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
- if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+ if (path_noexec(&file->f_path)) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
- if (!file->f_op || !file->f_op->mmap)
+ if (!file->f_op->mmap)
return -ENODEV;
+ if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+ return -EINVAL;
break;
default:
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
+ if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+ return -EINVAL;
/*
* Ignore pgoff.
*/
unsigned long, fd, unsigned long, pgoff)
{
struct file *file = NULL;
- unsigned long retval = -EBADF;
+ unsigned long retval;
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
- if (unlikely(flags & MAP_HUGETLB))
- return -EINVAL;
file = fget(fd);
if (!file)
- goto out;
+ return -EBADF;
if (is_file_hugepages(file))
len = ALIGN(len, huge_page_size(hstate_file(file)));
+ retval = -EINVAL;
+ if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+ goto out_fput;
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
- struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) &
- SHM_HUGE_MASK);
+ struct hstate *hs;
+ hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
if (!hs)
return -EINVAL;
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+out_fput:
if (file)
fput(file);
-out:
return retval;
}
if (copy_from_user(&a, arg, sizeof(a)))
return -EFAULT;
- if (a.offset & ~PAGE_MASK)
+ if (offset_in_page(a.offset))
return -EINVAL;
return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
int vma_wants_writenotify(struct vm_area_struct *vma)
{
vm_flags_t vm_flags = vma->vm_flags;
+ const struct vm_operations_struct *vm_ops = vma->vm_ops;
/* If it was private or non-writable, the write bit is already clear */
if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
return 0;
/* The backer wishes to know when pages are first written to? */
- if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+ if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
return 1;
- /* The open routine did something to the protections already? */
+ /* The open routine did something to the protections that pgprot_modify
+ * won't preserve? */
if (pgprot_val(vma->vm_page_prot) !=
- pgprot_val(vm_get_page_prot(vm_flags)))
+ pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags)))
return 0;
+ /* Do we need to track softdirty? */
+ if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
+ return 1;
+
/* Specialty mapping? */
if (vm_flags & VM_PFNMAP)
return 0;
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
- int correct_wcount = 0;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
- struct inode *inode = file ? file_inode(file) : NULL;
/* Check against address space limit. */
if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
}
/* Clear old maps */
- error = -ENOMEM;
-munmap_back:
- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
+ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
+ &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
- goto munmap_back;
}
/*
/*
* Can we just expand an old mapping?
*/
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
+ NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
vma->vm_pgoff = pgoff;
INIT_LIST_HEAD(&vma->anon_vma_chain);
- error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
-
if (file) {
- if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
- goto free_vma;
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
- correct_wcount = 1;
}
+ if (vm_flags & VM_SHARED) {
+ error = mapping_map_writable(file->f_mapping);
+ if (error)
+ goto allow_write_and_free_vma;
+ }
+
+ /* ->mmap() can change vma->vm_file, but must guarantee that
+ * vma_link() below can deny write-access if VM_DENYWRITE is set
+ * and map writably if VM_SHARED is set. This usually means the
+ * new file must not have been exposed to user-space, yet.
+ */
vma->vm_file = get_file(file);
error = file->f_op->mmap(file, vma);
if (error)
WARN_ON_ONCE(addr != vma->vm_start);
addr = vma->vm_start;
- pgoff = vma->vm_pgoff;
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
- if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
- goto free_vma;
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
- if (vma_wants_writenotify(vma)) {
- pgprot_t pprot = vma->vm_page_prot;
-
- /* Can vma->vm_page_prot have changed??
- *
- * Answer: Yes, drivers may have changed it in their
- * f_op->mmap method.
- *
- * Ensures that vmas marked as uncached stay that way.
- */
- vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
- if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
- }
-
vma_link(mm, vma, prev, rb_link, rb_parent);
- file = vma->vm_file;
-
/* Once vma denies write, undo our temporary denial count */
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
+ if (file) {
+ if (vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+ if (vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
+ }
+ file = vma->vm_file;
out:
perf_event_mmap(vma);
vma == get_gate_vma(current->mm)))
mm->locked_vm += (len >> PAGE_SHIFT);
else
- vma->vm_flags &= ~VM_LOCKED;
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
}
if (file)
uprobe_mmap(vma);
+ /*
+ * New (or expanded) vma always get soft dirty status.
+ * Otherwise user-space soft-dirty page tracker won't
+ * be able to distinguish situation when vma area unmapped,
+ * then new mapped in-place (which must be aimed as
+ * a completely new data area).
+ */
+ vma->vm_flags |= VM_SOFTDIRTY;
+
+ vma_set_page_prot(vma);
+
return addr;
unmap_and_free_vma:
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
+ if (vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+allow_write_and_free_vma:
+ if (vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
free_vma:
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = 0;
info.length = len;
- info.low_limit = TASK_UNMAPPED_BASE;
+ info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
info.align_mask = 0;
return vm_unmapped_area(&info);
}
-#endif
-
-void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
-{
- /*
- * Is this a new hole at the lowest possible address?
- */
- if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
- mm->free_area_cache = addr;
-}
+#endif
/*
* This mmap-allocator allocates new areas top-down from below the
struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = PAGE_SIZE;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
info.high_limit = mm->mmap_base;
info.align_mask = 0;
addr = vm_unmapped_area(&info);
* can happen with large stack limits and large mmap()
* allocations.
*/
- if (addr & ~PAGE_MASK) {
+ if (offset_in_page(addr)) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
}
#endif
-void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
-{
- /*
- * Is this a new hole at the highest possible address?
- */
- if (addr > mm->free_area_cache)
- mm->free_area_cache = addr;
-
- /* dont allow allocations above current base */
- if (mm->free_area_cache > mm->mmap_base)
- mm->free_area_cache = mm->mmap_base;
-}
-
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
return -ENOMEM;
get_area = current->mm->get_unmapped_area;
- if (file && file->f_op && file->f_op->get_unmapped_area)
+ if (file && file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
addr = get_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
if (addr > TASK_SIZE - len)
return -ENOMEM;
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return -EINVAL;
addr = arch_rebalance_pgtables(addr, len);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = NULL;
+ struct rb_node *rb_node;
+ struct vm_area_struct *vma;
/* Check the cache first. */
- /* (Cache hit rate is typically around 35%.) */
- vma = ACCESS_ONCE(mm->mmap_cache);
- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node *rb_node;
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
+ rb_node = mm->mm_rb.rb_node;
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
-
- vma_tmp = rb_entry(rb_node,
- struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
- if (vma)
- mm->mmap_cache = vma;
+ while (rb_node) {
+ struct vm_area_struct *tmp;
+
+ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (tmp->vm_end > addr) {
+ vma = tmp;
+ if (tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
}
+
+ if (vma)
+ vmacache_update(addr, vma);
return vma;
}
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
- unsigned long new_start;
+ unsigned long new_start, actual_size;
/* address space limit tests */
if (!may_expand_vm(mm, grow))
return -ENOMEM;
/* Stack limit test */
- if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ actual_size = size;
+ if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
+ actual_size -= PAGE_SIZE;
+ if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
- limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
+ limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
return -ENOMEM;
if (security_vm_enough_memory_mm(mm, grow))
return -ENOMEM;
- /* Ok, everything looks good - let it rip */
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
return 0;
}
*/
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
+ struct mm_struct *mm = vma->vm_mm;
int error;
if (!(vma->vm_flags & VM_GROWSUP))
* So, we reuse mm->page_table_lock to guard
* against concurrent vma expansions.
*/
- spin_lock(&vma->vm_mm->page_table_lock);
+ spin_lock(&mm->page_table_lock);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags,
+ vma->vm_file, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
anon_vma_interval_tree_post_update_vma(vma);
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
- vma->vm_mm->highest_vm_end = address;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ mm->highest_vm_end = address;
+ spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
}
}
vma_unlock_anon_vma(vma);
- khugepaged_enter_vma_merge(vma);
- validate_mm(vma->vm_mm);
+ khugepaged_enter_vma_merge(vma, vma->vm_flags);
+ validate_mm(mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
+ struct mm_struct *mm = vma->vm_mm;
int error;
/*
* So, we reuse mm->page_table_lock to guard
* against concurrent vma expansions.
*/
- spin_lock(&vma->vm_mm->page_table_lock);
+ spin_lock(&mm->page_table_lock);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags,
+ vma->vm_file, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
anon_vma_interval_tree_post_update_vma(vma);
vma_gap_update(vma);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
}
}
vma_unlock_anon_vma(vma);
- khugepaged_enter_vma_merge(vma);
- validate_mm(vma->vm_mm);
+ khugepaged_enter_vma_merge(vma, vma->vm_flags);
+ validate_mm(mm);
return error;
}
if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
return prev;
}
#else
}
struct vm_area_struct *
-find_extend_vma(struct mm_struct * mm, unsigned long addr)
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct * vma;
+ struct vm_area_struct *vma;
unsigned long start;
addr &= PAGE_MASK;
- vma = find_vma(mm,addr);
+ vma = find_vma(mm, addr);
if (!vma)
return NULL;
if (vma->vm_start <= addr)
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(vma, addr, start, NULL);
+ populate_vma_page_range(vma, addr, start, NULL);
return vma;
}
#endif
+EXPORT_SYMBOL_GPL(find_extend_vma);
+
/*
* Ok - we have the memory areas we should free on the vma list,
* so release them, and do the vma updates.
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
+ struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
struct mmu_gather tlb;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
- unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
- if (mm->unmap_area == arch_unmap_area)
- addr = prev ? prev->vm_end : mm->mmap_base;
- else
- addr = vma ? vma->vm_start : mm->mmap_base;
- mm->unmap_area(mm, addr);
- mm->mmap_cache = NULL; /* Kill the cache. */
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
* __split_vma() bypasses sysctl_max_map_count checking. We use this on the
* munmap path where it doesn't make sense to fail.
*/
-static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
- struct mempolicy *pol;
struct vm_area_struct *new;
- int err = -ENOMEM;
+ int err;
if (is_vm_hugetlb_page(vma) && (addr &
~(huge_page_mask(hstate_vma(vma)))))
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
- goto out_err;
+ return -ENOMEM;
/* most fields are the same, copy all, and then fixup */
*new = *vma;
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
- pol = mpol_dup(vma_policy(vma));
- if (IS_ERR(pol)) {
- err = PTR_ERR(pol);
+ err = vma_dup_policy(vma, new);
+ if (err)
goto out_free_vma;
- }
- vma_set_policy(new, pol);
- if (anon_vma_clone(new, vma))
+ err = anon_vma_clone(new, vma);
+ if (err)
goto out_free_mpol;
if (new->vm_file)
fput(new->vm_file);
unlink_anon_vmas(new);
out_free_mpol:
- mpol_put(pol);
+ mpol_put(vma_policy(new));
out_free_vma:
kmem_cache_free(vm_area_cachep, new);
- out_err:
return err;
}
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
- if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
+ if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
- if ((len = PAGE_ALIGN(len)) == 0)
+ len = PAGE_ALIGN(len);
+ if (len == 0)
return -EINVAL;
/* Find the first overlapping VMA */
if (error)
return error;
}
- vma = prev? prev->vm_next: mm->mmap;
+ vma = prev ? prev->vm_next : mm->mmap;
/*
* unlock any mlock()ed ranges before detaching vmas
detach_vmas_to_be_unmapped(mm, vma, prev, end);
unmap_region(mm, vma, prev, start, end);
+ arch_unmap(mm, vma, start, end);
+
/* Fix up all other VM information */
remove_vma_list(mm, vma);
return vm_munmap(addr, len);
}
+
+/*
+ * Emulation of deprecated remap_file_pages() syscall.
+ */
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+ unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
+{
+
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long populate = 0;
+ unsigned long ret = -EINVAL;
+ struct file *file;
+
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
+ "See Documentation/vm/remap_file_pages.txt.\n",
+ current->comm, current->pid);
+
+ if (prot)
+ return ret;
+ start = start & PAGE_MASK;
+ size = size & PAGE_MASK;
+
+ if (start + size <= start)
+ return ret;
+
+ /* Does pgoff wrap? */
+ if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+ return ret;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+
+ if (!vma || !(vma->vm_flags & VM_SHARED))
+ goto out;
+
+ if (start < vma->vm_start || start + size > vma->vm_end)
+ goto out;
+
+ if (pgoff == linear_page_index(vma, start)) {
+ ret = 0;
+ goto out;
+ }
+
+ prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
+ prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+ prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+ flags &= MAP_NONBLOCK;
+ flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+ if (vma->vm_flags & VM_LOCKED) {
+ flags |= MAP_LOCKED;
+ /* drop PG_Mlocked flag for over-mapped range */
+ munlock_vma_pages_range(vma, start, start + size);
+ }
+
+ file = get_file(vma->vm_file);
+ ret = do_mmap_pgoff(vma->vm_file, start, size,
+ prot, flags, pgoff, &populate);
+ fput(file);
+out:
+ up_write(&mm->mmap_sem);
+ if (populate)
+ mm_populate(ret, populate);
+ if (!IS_ERR_VALUE(ret))
+ ret = 0;
+ return ret;
+}
+
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
#ifdef CONFIG_DEBUG_VM
*/
static unsigned long do_brk(unsigned long addr, unsigned long len)
{
- struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
unsigned long flags;
- struct rb_node ** rb_link, * rb_parent;
+ struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (error & ~PAGE_MASK)
+ if (offset_in_page(error))
return error;
- /*
- * mlock MCL_FUTURE?
- */
- if (mm->def_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
+ error = mlock_future_check(mm, mm->def_flags, len);
+ if (error)
+ return error;
/*
* mm->mmap_sem is required to protect against another thread
/*
* Clear old maps. this also does some error checking for us
*/
- munmap_back:
- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
+ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
+ &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
- goto munmap_back;
}
/* Check against address space limits *after* clearing old maps... */
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL);
+ NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
+ vma->vm_flags |= VM_SOFTDIRTY;
return addr;
}
lru_add_drain();
flush_cache_mm(mm);
- tlb_gather_mmu(&tlb, mm, 1);
+ tlb_gather_mmu(&tlb, mm, 0, -1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
vma = remove_vma(vma);
}
vm_unacct_memory(nr_accounted);
-
- WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_mutex is taken here.
+ * then i_mmap_rwsem is taken here.
*/
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
struct vm_area_struct *prev;
struct rb_node **rb_link, *rb_parent;
+ if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+ &prev, &rb_link, &rb_parent))
+ return -ENOMEM;
+ if ((vma->vm_flags & VM_ACCOUNT) &&
+ security_vm_enough_memory_mm(mm, vma_pages(vma)))
+ return -ENOMEM;
+
/*
* The vm_pgoff of a purely anonymous vma should be irrelevant
* until its first write fault, when page's anon_vma and index
* using the existing file pgoff checks and manipulations.
* Similarly in do_mmap_pgoff and in do_brk.
*/
- if (!vma->vm_file) {
+ if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
- return -ENOMEM;
- if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, vma_pages(vma)))
- return -ENOMEM;
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
struct rb_node **rb_link, *rb_parent;
- struct mempolicy *pol;
bool faulted_in_anon_vma = true;
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
*/
- if (unlikely(!vma->vm_file && !vma->anon_vma)) {
+ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
pgoff = addr >> PAGE_SHIFT;
faulted_in_anon_vma = false;
}
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
* safe. It is only safe to keep the vm_pgoff
* linear if there are no pages mapped yet.
*/
- VM_BUG_ON(faulted_in_anon_vma);
+ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
*vmap = vma = new_vma;
}
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
} else {
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (new_vma) {
- *new_vma = *vma;
- new_vma->vm_start = addr;
- new_vma->vm_end = addr + len;
- new_vma->vm_pgoff = pgoff;
- pol = mpol_dup(vma_policy(vma));
- if (IS_ERR(pol))
- goto out_free_vma;
- vma_set_policy(new_vma, pol);
- INIT_LIST_HEAD(&new_vma->anon_vma_chain);
- if (anon_vma_clone(new_vma, vma))
- goto out_free_mempol;
- if (new_vma->vm_file)
- get_file(new_vma->vm_file);
- if (new_vma->vm_ops && new_vma->vm_ops->open)
- new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
- *need_rmap_locks = false;
- }
+ if (!new_vma)
+ goto out;
+ *new_vma = *vma;
+ new_vma->vm_start = addr;
+ new_vma->vm_end = addr + len;
+ new_vma->vm_pgoff = pgoff;
+ if (vma_dup_policy(vma, new_vma))
+ goto out_free_vma;
+ INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+ if (anon_vma_clone(new_vma, vma))
+ goto out_free_mempol;
+ if (new_vma->vm_file)
+ get_file(new_vma->vm_file);
+ if (new_vma->vm_ops && new_vma->vm_ops->open)
+ new_vma->vm_ops->open(new_vma);
+ vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ *need_rmap_locks = false;
}
return new_vma;
- out_free_mempol:
- mpol_put(pol);
- out_free_vma:
+out_free_mempol:
+ mpol_put(vma_policy(new_vma));
+out_free_vma:
kmem_cache_free(vm_area_cachep, new_vma);
+out:
return NULL;
}
return 1;
}
+static int special_mapping_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf);
+
+/*
+ * Having a close hook prevents vma merging regardless of flags.
+ */
+static void special_mapping_close(struct vm_area_struct *vma)
+{
+}
+
+static const char *special_mapping_name(struct vm_area_struct *vma)
+{
+ return ((struct vm_special_mapping *)vma->vm_private_data)->name;
+}
+
+static const struct vm_operations_struct special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+ .name = special_mapping_name,
+};
+
+static const struct vm_operations_struct legacy_special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+};
static int special_mapping_fault(struct vm_area_struct *vma,
struct vm_fault *vmf)
pgoff_t pgoff;
struct page **pages;
- /*
- * special mappings have no vm_file, and in that case, the mm
- * uses vm_pgoff internally. So we have to subtract it from here.
- * We are allowed to do this because we are the mm; do not copy
- * this code into drivers!
- */
- pgoff = vmf->pgoff - vma->vm_pgoff;
+ if (vma->vm_ops == &legacy_special_mapping_vmops)
+ pages = vma->vm_private_data;
+ else
+ pages = ((struct vm_special_mapping *)vma->vm_private_data)->
+ pages;
- for (pages = vma->vm_private_data; pgoff && *pages; ++pages)
+ for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
pgoff--;
if (*pages) {
return VM_FAULT_SIGBUS;
}
-/*
- * Having a close hook prevents vma merging regardless of flags.
- */
-static void special_mapping_close(struct vm_area_struct *vma)
-{
-}
-
-static const struct vm_operations_struct special_mapping_vmops = {
- .close = special_mapping_close,
- .fault = special_mapping_fault,
-};
-
-/*
- * Called with mm->mmap_sem held for writing.
- * Insert a new vma covering the given region, with the given flags.
- * Its pages are supplied by the given array of struct page *.
- * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
- * The region past the last page supplied will always produce SIGBUS.
- * The array pointer and the pages it points to are assumed to stay alive
- * for as long as this mapping might exist.
- */
-int install_special_mapping(struct mm_struct *mm,
- unsigned long addr, unsigned long len,
- unsigned long vm_flags, struct page **pages)
+static struct vm_area_struct *__install_special_mapping(
+ struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, void *priv,
+ const struct vm_operations_struct *ops)
{
int ret;
struct vm_area_struct *vma;
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (unlikely(vma == NULL))
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+ vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
- vma->vm_ops = &special_mapping_vmops;
- vma->vm_private_data = pages;
+ vma->vm_ops = ops;
+ vma->vm_private_data = priv;
ret = insert_vm_struct(mm, vma);
if (ret)
perf_event_mmap(vma);
- return 0;
+ return vma;
out:
kmem_cache_free(vm_area_cachep, vma);
- return ret;
+ return ERR_PTR(ret);
+}
+
+/*
+ * Called with mm->mmap_sem held for writing.
+ * Insert a new vma covering the given region, with the given flags.
+ * Its pages are supplied by the given array of struct page *.
+ * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
+ * The region past the last page supplied will always produce SIGBUS.
+ * The array pointer and the pages it points to are assumed to stay alive
+ * for as long as this mapping might exist.
+ */
+struct vm_area_struct *_install_special_mapping(
+ struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, const struct vm_special_mapping *spec)
+{
+ return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
+ &special_mapping_vmops);
+}
+
+int install_special_mapping(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, struct page **pages)
+{
+ struct vm_area_struct *vma = __install_special_mapping(
+ mm, addr, len, vm_flags, (void *)pages,
+ &legacy_special_mapping_vmops);
+
+ return PTR_ERR_OR_ZERO(vma);
}
static DEFINE_MUTEX(mm_all_locks_mutex);
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
+ down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
}
}
*
* mmap_sem in write mode is required in order to block all operations
* that could modify pagetables and free pages without need of
- * altering the vma layout (for example populate_range() with
- * nonlinear vmas). It's also needed in write mode to avoid new
+ * altering the vma layout. It's also needed in write mode to avoid new
* anon_vmas to be associated with existing vmas.
*
* A single task can't take more than one mm_take_all_locks() in a row
* vma in this mm is backed by the same anon_vma or address_space.
*
* We can take all the locks in random order because the VM code
- * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
+ * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
* takes more than one of them in a row. Secondly we're protected
* against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
*
* AS_MM_ALL_LOCKS can't change to 0 from under us
* because we hold the mm_all_locks_mutex.
*/
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
&mapping->flags))
BUG();
{
int ret;
- ret = percpu_counter_init(&vm_committed_as, 0);
+ ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
}
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
}
-module_init(init_user_reserve)
+subsys_initcall(init_user_reserve);
/*
* Initialise sysctl_admin_reserve_kbytes.
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
}
-module_init(init_admin_reserve)
+subsys_initcall(init_admin_reserve);
/*
* Reinititalise user and admin reserves if memory is added or removed.
static int __meminit init_reserve_notifier(void)
{
if (register_hotmemory_notifier(&reserve_mem_nb))
- printk("Failed registering memory add/remove notifier for admin reserve");
+ pr_err("Failed registering memory add/remove notifier for admin reserve\n");
return 0;
}
-module_init(init_reserve_notifier)
+subsys_initcall(init_reserve_notifier);