mm: add a field to store names for private anonymous memory
authorColin Cross <ccross@android.com>
Tue, 27 Oct 2015 23:42:08 +0000 (16:42 -0700)
committerJohn Stultz <john.stultz@linaro.org>
Tue, 16 Feb 2016 21:54:13 +0000 (13:54 -0800)
Userspace processes often have multiple allocators that each do
anonymous mmaps to get memory.  When examining memory usage of
individual processes or systems as a whole, it is useful to be
able to break down the various heaps that were allocated by
each layer and examine their size, RSS, and physical memory
usage.

This patch adds a user pointer to the shared union in
vm_area_struct that points to a null terminated string inside
the user process containing a name for the vma.  vmas that
point to the same address will be merged, but vmas that
point to equivalent strings at different addresses will
not be merged.

Userspace can set the name for a region of memory by calling
prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
Setting the name to NULL clears it.

The names of named anonymous vmas are shown in /proc/pid/maps
as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
that is only present for named vmas.  If the userspace pointer
is no longer valid all or part of the name will be replaced
with "<fault>".

The idea to store a userspace pointer to reduce the complexity
within mm (at the expense of the complexity of reading
/proc/pid/mem) came from Dave Hansen.  This results in no
runtime overhead in the mm subsystem other than comparing
the anon_name pointers when considering vma merging.  The pointer
is stored in a union with fieds that are only used on file-backed
mappings, so it does not increase memory usage.

Includes fix from Jed Davis <jld@mozilla.com> for typo in
prctl_set_vma_anon_name, which could attempt to set the name
across two vmas at the same time due to a typo, which might
corrupt the vma list.  Fix it to use tmp instead of end to limit
the name setting to a single vma at a time.

Change-Id: I9aa7b6b5ef536cd780599ba4e2fba8ceebe8b59f
Signed-off-by: Dmitry Shmidt <dimitrysh@google.com>
Documentation/filesystems/proc.txt
fs/proc/task_mmu.c
include/linux/mm.h
include/linux/mm_types.h
include/uapi/linux/prctl.h
kernel/sys.c
mm/madvise.c
mm/mempolicy.c
mm/mlock.c
mm/mmap.c
mm/mprotect.c

index 402ab99e409fabb1c8ec8d4ba8b6fd248e295f14..0bcba2823d9dd50fc79db321f0909db41c695e0e 100644 (file)
@@ -381,6 +381,8 @@ is not associated with a file:
  [stack:1001]             = the stack of the thread with tid 1001
  [vdso]                   = the "virtual dynamic shared object",
                             the kernel system call handler
+ [anon:<name>]            = an anonymous mapping that has been
+                            named by userspace
 
  or if empty, the mapping is anonymous.
 
@@ -435,6 +437,7 @@ KernelPageSize:        4 kB
 MMUPageSize:           4 kB
 Locked:                0 kB
 VmFlags: rd ex mr mw me dw
+Name:           name from userspace
 
 the first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
@@ -497,6 +500,9 @@ Note that there is no guarantee that every flag and associated mnemonic will
 be present in all further kernel releases. Things get changed, the flags may
 be vanished or the reverse -- new added.
 
+The "Name" field will only be present on a mapping that has been named by
+userspace, and will show the name passed in by userspace.
+
 This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
 
index 187b3b5f242ef946658751d15580a6c7e6ac85d6..91698054b965d56ea5b64cd82992a5ddf149527f 100644 (file)
@@ -116,6 +116,56 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
 }
 #endif
 
+static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
+{
+       const char __user *name = vma_get_anon_name(vma);
+       struct mm_struct *mm = vma->vm_mm;
+
+       unsigned long page_start_vaddr;
+       unsigned long page_offset;
+       unsigned long num_pages;
+       unsigned long max_len = NAME_MAX;
+       int i;
+
+       page_start_vaddr = (unsigned long)name & PAGE_MASK;
+       page_offset = (unsigned long)name - page_start_vaddr;
+       num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
+
+       seq_puts(m, "[anon:");
+
+       for (i = 0; i < num_pages; i++) {
+               int len;
+               int write_len;
+               const char *kaddr;
+               long pages_pinned;
+               struct page *page;
+
+               pages_pinned = get_user_pages(current, mm, page_start_vaddr,
+                               1, 0, 0, &page, NULL);
+               if (pages_pinned < 1) {
+                       seq_puts(m, "<fault>]");
+                       return;
+               }
+
+               kaddr = (const char *)kmap(page);
+               len = min(max_len, PAGE_SIZE - page_offset);
+               write_len = strnlen(kaddr + page_offset, len);
+               seq_write(m, kaddr + page_offset, write_len);
+               kunmap(page);
+               put_page(page);
+
+               /* if strnlen hit a null terminator then we're done */
+               if (write_len != len)
+                       break;
+
+               max_len -= len;
+               page_offset = 0;
+               page_start_vaddr += PAGE_SIZE;
+       }
+
+       seq_putc(m, ']');
+}
+
 static void vma_stop(struct proc_maps_private *priv)
 {
        struct mm_struct *mm = priv->mm;
@@ -351,6 +401,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
                                seq_pad(m, ' ');
                                seq_printf(m, "[stack:%d]", tid);
                        }
+                       goto done;
+               }
+
+               if (vma_get_anon_name(vma)) {
+                       seq_pad(m, ' ');
+                       seq_print_vma_name(m, vma);
                }
        }
 
@@ -676,6 +732,12 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 
        show_map_vma(m, vma, is_pid);
 
+       if (vma_get_anon_name(vma)) {
+               seq_puts(m, "Name:           ");
+               seq_print_vma_name(m, vma);
+               seq_putc(m, '\n');
+       }
+
        seq_printf(m,
                   "Size:           %8lu kB\n"
                   "Rss:            %8lu kB\n"
index e976e180e873910774b6585b263122a3aae52f24..3ccb2cb40e84edfbba808a7497861f901fc46a17 100644 (file)
@@ -1866,7 +1866,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
        struct vm_area_struct *prev, unsigned long addr, unsigned long end,
        unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-       struct mempolicy *, struct vm_userfaultfd_ctx);
+       struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int split_vma(struct mm_struct *,
        struct vm_area_struct *, unsigned long addr, int new_below);
index f8d1492a114f4447d0d7f4dfd8136e854a4a7581..0a732c5e0de1a0a880ae10048566df7a0af9ae50 100644 (file)
@@ -323,11 +323,18 @@ struct vm_area_struct {
        /*
         * For areas with an address space and backing store,
         * linkage into the address_space->i_mmap interval tree.
+        *
+        * For private anonymous mappings, a pointer to a null terminated string
+        * in the user process containing the name given to the vma, or NULL
+        * if unnamed.
         */
-       struct {
-               struct rb_node rb;
-               unsigned long rb_subtree_last;
-       } shared;
+       union {
+               struct {
+                       struct rb_node rb;
+                       unsigned long rb_subtree_last;
+               } shared;
+               const char __user *anon_name;
+       };
 
        /*
         * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
@@ -591,4 +598,13 @@ typedef struct {
        unsigned long val;
 } swp_entry_t;
 
+/* Return the name for an anonymous mapping or NULL for a file-backed mapping */
+static inline const char __user *vma_get_anon_name(struct vm_area_struct *vma)
+{
+       if (vma->vm_file)
+               return NULL;
+
+       return vma->anon_name;
+}
+
 #endif /* _LINUX_MM_TYPES_H */
index 4f1a92a83f894ce12f458963fad00c9d5382d5cd..c1af9b3c27c4faae7efa546135d9788463476a46 100644 (file)
@@ -203,4 +203,7 @@ struct prctl_mm_map {
  */
 #define PR_SET_TIMERSLACK_PID  127
 
+#define PR_SET_VMA             0x53564d41
+# define PR_SET_VMA_ANON_NAME          0
+
 #endif /* _LINUX_PRCTL_H */
index 66b933a5c061f2880af255e45b1af455d8818170..d0cb632eef8b85c585f8475db8e2a8a83a4c32d0 100644 (file)
@@ -41,6 +41,8 @@
 #include <linux/syscore_ops.h>
 #include <linux/version.h>
 #include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -2072,6 +2074,153 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
 }
 #endif
 
+#ifdef CONFIG_MMU
+static int prctl_update_vma_anon_name(struct vm_area_struct *vma,
+               struct vm_area_struct **prev,
+               unsigned long start, unsigned long end,
+               const char __user *name_addr)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       int error = 0;
+       pgoff_t pgoff;
+
+       if (name_addr == vma_get_anon_name(vma)) {
+               *prev = vma;
+               goto out;
+       }
+
+       pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+       *prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma,
+                               vma->vm_file, pgoff, vma_policy(vma),
+                               name_addr);
+       if (*prev) {
+               vma = *prev;
+               goto success;
+       }
+
+       *prev = vma;
+
+       if (start != vma->vm_start) {
+               error = split_vma(mm, vma, start, 1);
+               if (error)
+                       goto out;
+       }
+
+       if (end != vma->vm_end) {
+               error = split_vma(mm, vma, end, 0);
+               if (error)
+                       goto out;
+       }
+
+success:
+       if (!vma->vm_file)
+               vma->anon_name = name_addr;
+
+out:
+       if (error == -ENOMEM)
+               error = -EAGAIN;
+       return error;
+}
+
+static int prctl_set_vma_anon_name(unsigned long start, unsigned long end,
+                       unsigned long arg)
+{
+       unsigned long tmp;
+       struct vm_area_struct *vma, *prev;
+       int unmapped_error = 0;
+       int error = -EINVAL;
+
+       /*
+        * If the interval [start,end) covers some unmapped address
+        * ranges, just ignore them, but return -ENOMEM at the end.
+        * - this matches the handling in madvise.
+        */
+       vma = find_vma_prev(current->mm, start, &prev);
+       if (vma && start > vma->vm_start)
+               prev = vma;
+
+       for (;;) {
+               /* Still start < end. */
+               error = -ENOMEM;
+               if (!vma)
+                       return error;
+
+               /* Here start < (end|vma->vm_end). */
+               if (start < vma->vm_start) {
+                       unmapped_error = -ENOMEM;
+                       start = vma->vm_start;
+                       if (start >= end)
+                               return error;
+               }
+
+               /* Here vma->vm_start <= start < (end|vma->vm_end) */
+               tmp = vma->vm_end;
+               if (end < tmp)
+                       tmp = end;
+
+               /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+               error = prctl_update_vma_anon_name(vma, &prev, start, tmp,
+                               (const char __user *)arg);
+               if (error)
+                       return error;
+               start = tmp;
+               if (prev && start < prev->vm_end)
+                       start = prev->vm_end;
+               error = unmapped_error;
+               if (start >= end)
+                       return error;
+               if (prev)
+                       vma = prev->vm_next;
+               else    /* madvise_remove dropped mmap_sem */
+                       vma = find_vma(current->mm, start);
+       }
+}
+
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+               unsigned long len_in, unsigned long arg)
+{
+       struct mm_struct *mm = current->mm;
+       int error;
+       unsigned long len;
+       unsigned long end;
+
+       if (start & ~PAGE_MASK)
+               return -EINVAL;
+       len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+       /* Check to see whether len was rounded up from small -ve to zero */
+       if (len_in && !len)
+               return -EINVAL;
+
+       end = start + len;
+       if (end < start)
+               return -EINVAL;
+
+       if (end == start)
+               return 0;
+
+       down_write(&mm->mmap_sem);
+
+       switch (opt) {
+       case PR_SET_VMA_ANON_NAME:
+               error = prctl_set_vma_anon_name(start, end, arg);
+               break;
+       default:
+               error = -EINVAL;
+       }
+
+       up_write(&mm->mmap_sem);
+
+       return error;
+}
+#else /* CONFIG_MMU */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+               unsigned long len_in, unsigned long arg)
+{
+       return -EINVAL;
+}
+#endif
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                unsigned long, arg4, unsigned long, arg5)
 {
@@ -2287,6 +2436,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
        case PR_GET_FP_MODE:
                error = GET_FP_MODE(me);
                break;
+       case PR_SET_VMA:
+               error = prctl_set_vma(arg2, arg3, arg4, arg5);
+               break;
        default:
                error = -EINVAL;
                break;
index c889fcbb530e98d8779ef75750e1fde08bf786cf..c154e107630307715da37d290417de8e2cd8adb7 100644 (file)
@@ -104,7 +104,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
-                         vma->vm_userfaultfd_ctx);
+                         vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
        if (*prev) {
                vma = *prev;
                goto success;
index 87a177917cb2e60a13b09e6a53836ccd9f9275bf..f20eb4e8c4ccd4566fe8c46cdfde91ad1115c6d8 100644 (file)
@@ -720,7 +720,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
                                 vma->anon_vma, vma->vm_file, pgoff,
-                                new_pol, vma->vm_userfaultfd_ctx);
+                                new_pol, vma->vm_userfaultfd_ctx,
+                                vma_get_anon_name(vma));
                if (prev) {
                        vma = prev;
                        next = vma->vm_next;
index 339d9e0949b6d63a1b81af1e121c1858af77f472..a6617735f2f48e35d5bb7996fb9899ec72317aae 100644 (file)
@@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
-                         vma->vm_userfaultfd_ctx);
+                         vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
        if (*prev) {
                vma = *prev;
                goto success;
index 2ce04a649f6b4977e54b76a29be9d5bac5e71dab..c00e1bde030e6f171cbaf688fe4112b7a29155a4 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -921,7 +921,8 @@ again:                      remove_next = 1 + (end > next->vm_end);
  */
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
                                struct file *file, unsigned long vm_flags,
-                               struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+                               struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+                               const char __user *anon_name)
 {
        /*
         * VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -939,6 +940,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
                return 0;
        if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
                return 0;
+       if (vma_get_anon_name(vma) != anon_name)
+               return 0;
        return 1;
 }
 
@@ -971,9 +974,10 @@ static int
 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
                     struct anon_vma *anon_vma, struct file *file,
                     pgoff_t vm_pgoff,
-                    struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+                    struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+                    const char __user *anon_name)
 {
-       if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+       if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                if (vma->vm_pgoff == vm_pgoff)
                        return 1;
@@ -992,9 +996,10 @@ static int
 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
                    struct anon_vma *anon_vma, struct file *file,
                    pgoff_t vm_pgoff,
-                   struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+                   struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+                   const char __user *anon_name)
 {
-       if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+       if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                pgoff_t vm_pglen;
                vm_pglen = vma_pages(vma);
@@ -1005,9 +1010,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 }
 
 /*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor.  Or both (it neatly fills a hole).
  *
  * In most cases - when called for mmap, brk or mremap - [addr,end) is
  * certain not to be mapped by the time vma_merge is called; but when
@@ -1038,7 +1043,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        unsigned long end, unsigned long vm_flags,
                        struct anon_vma *anon_vma, struct file *file,
                        pgoff_t pgoff, struct mempolicy *policy,
-                       struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+                       struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+                       const char __user *anon_name)
 {
        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
        struct vm_area_struct *area, *next;
@@ -1066,7 +1072,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        mpol_equal(vma_policy(prev), policy) &&
                        can_vma_merge_after(prev, vm_flags,
                                            anon_vma, file, pgoff,
-                                           vm_userfaultfd_ctx)) {
+                                           vm_userfaultfd_ctx,
+                                           anon_name)) {
                /*
                 * OK, it can.  Can we now merge in the successor as well?
                 */
@@ -1075,7 +1082,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                can_vma_merge_before(next, vm_flags,
                                                     anon_vma, file,
                                                     pgoff+pglen,
-                                                    vm_userfaultfd_ctx) &&
+                                                    vm_userfaultfd_ctx,
+                                                    anon_name) &&
                                is_mergeable_anon_vma(prev->anon_vma,
                                                      next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
@@ -1097,7 +1105,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        mpol_equal(policy, vma_policy(next)) &&
                        can_vma_merge_before(next, vm_flags,
                                             anon_vma, file, pgoff+pglen,
-                                            vm_userfaultfd_ctx)) {
+                                            vm_userfaultfd_ctx,
+                                            anon_name)) {
                if (prev && addr < prev->vm_end)        /* case 4 */
                        err = vma_adjust(prev, prev->vm_start,
                                addr, prev->vm_pgoff, NULL);
@@ -1581,7 +1590,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
         * Can we just expand an old mapping?
         */
        vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
-                       NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
+                       NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
        if (vma)
                goto out;
 
@@ -2771,7 +2780,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 
        /* Can we just expand an old private anonymous mapping? */
        vma = vma_merge(mm, prev, addr, addr + len, flags,
-                       NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
+                       NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
        if (vma)
                goto out;
 
@@ -2929,7 +2938,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                return NULL;    /* should never get here */
        new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
                            vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-                           vma->vm_userfaultfd_ctx);
+                           vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
index ef5be8eaab001792b469fac1bd5b43cb139d1b0b..bddb2c75492d06b4806c6ed91c0352c05a9555f3 100644 (file)
@@ -293,7 +293,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *pprev = vma_merge(mm, *pprev, start, end, newflags,
                           vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-                          vma->vm_userfaultfd_ctx);
+                          vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
        if (*pprev) {
                vma = *pprev;
                goto success;