perf: Fix mmap() accounting hole
author	Peter Zijlstra <peterz@infradead.org>
Tue, 4 Jun 2013 08:44:21 +0000 (10:44 +0200)
committer	Ingo Molnar <mingo@kernel.org>
Wed, 19 Jun 2013 10:44:13 +0000 (12:44 +0200)
Vince's fuzzer once again found holes. This time it spotted a leak in
the locked page accounting.

When an event had redirected output and its close() was the last
reference to the buffer, we didn't have a VM context in which to undo
the accounting.

Change the code to destroy the buffer on the last munmap() and detach
all redirected events at that time. This provides the right context in
which to undo the VM accounting.
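
To make the refcounting explicit: the ring buffer now carries two
counters, rb->refcount for the buffer's lifetime and rb->mmap_count for
the number of live mmap()s, and only the path that drops mmap_count to
zero undoes the locked-page accounting, because only it still has the
vma at hand. The following is a minimal userspace sketch of that
scheme, not the perf code itself; the buf type, the buf_munmap() and
buf_put() helpers and the locked_vm variable are invented purely for
illustration:

  #include <stdio.h>
  #include <stdlib.h>

  struct buf {
          int refcount;          /* buffer lifetime (events + mmaps)  */
          int mmap_count;        /* number of live mmap()s            */
          long accounted_pages;  /* pages charged against locked_vm   */
  };

  static long locked_vm;         /* stand-in for user->locked_vm      */

  static void buf_put(struct buf *b)
  {
          if (--b->refcount == 0)
                  free(b);       /* rb_free_rcu() in the kernel       */
  }

  /* perf_mmap_close() analogue: one call per munmap() of the buffer. */
  static void buf_munmap(struct buf *b)
  {
          if (--b->mmap_count == 0) {
                  /*
                   * Last munmap(): we still know what we charged, so
                   * this is the place to undo the accounting.
                   */
                  locked_vm -= b->accounted_pages;
          }
          buf_put(b);            /* drop the mmap's buffer reference  */
  }

  int main(void)
  {
          struct buf *b = calloc(1, sizeof(*b));

          b->refcount = 2;       /* one ref for the event, one mmap   */
          b->mmap_count = 1;
          b->accounted_pages = 4;
          locked_vm += b->accounted_pages;

          buf_munmap(b);         /* accounting undone here            */
          buf_put(b);            /* event's close(): buffer freed,    */
                                 /* no accounting left to undo        */

          printf("locked_vm = %ld\n", locked_vm); /* prints 0 */
          return 0;
  }

This mirrors why free_event() below can do a "could be last"
ring_buffer_put(): an event's final reference may well free the buffer,
but by then the accounting has already been undone by whoever performed
the last munmap().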

Reported-and-tested-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130604084421.GI8923@twins.programming.kicks-ass.net
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/events/core.c
kernel/events/internal.h

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ae752cd4a086ab82198380c08f1c7124da24cd2f..b391907d53520cb4a126bc8e3e0a2d5a64ade54d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-                              struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void)       { }
 
 extern __weak const char *perf_pmu_name(void)
@@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head)
        kfree(event);
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)
                if (has_branch_stack(event)) {
                        static_key_slow_dec_deferred(&perf_sched_events);
                        /* is system-wide event */
-                       if (!(event->attach_state & PERF_ATTACH_TASK))
+                       if (!(event->attach_state & PERF_ATTACH_TASK)) {
                                atomic_dec(&per_cpu(perf_branch_stack_events,
                                                    event->cpu));
+                       }
                }
        }
 
        if (event->rb) {
-               ring_buffer_put(event->rb);
-               event->rb = NULL;
+               struct ring_buffer *rb;
+
+               /*
+                * Can happen when we close an event with re-directed output.
+                *
+                * Since we have a 0 refcount, perf_mmap_close() will skip
+                * over us; possibly making our ring_buffer_put() the last.
+                */
+               mutex_lock(&event->mmap_mutex);
+               rb = event->rb;
+               if (rb) {
+                       rcu_assign_pointer(event->rb, NULL);
+                       ring_buffer_detach(event, rb);
+                       ring_buffer_put(rb); /* could be last */
+               }
+               mutex_unlock(&event->mmap_mutex);
        }
 
        if (is_cgroup_event(event))
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
        unsigned int events = POLL_HUP;
 
        /*
-        * Race between perf_event_set_output() and perf_poll(): perf_poll()
-        * grabs the rb reference but perf_event_set_output() overrides it.
-        * Here is the timeline for two threads T1, T2:
-        * t0: T1, rb = rcu_dereference(event->rb)
-        * t1: T2, old_rb = event->rb
-        * t2: T2, event->rb = new rb
-        * t3: T2, ring_buffer_detach(old_rb)
-        * t4: T1, ring_buffer_attach(rb1)
-        * t5: T1, poll_wait(event->waitq)
-        *
-        * To avoid this problem, we grab mmap_mutex in perf_poll()
-        * thereby ensuring that the assignment of the new ring buffer
-        * and the detachment of the old buffer appear atomic to perf_poll()
+        * Pin the event->rb by taking event->mmap_mutex; otherwise
+        * perf_event_set_output() can swizzle our rb and make us miss wakeups.
         */
        mutex_lock(&event->mmap_mutex);
-
-       rcu_read_lock();
-       rb = rcu_dereference(event->rb);
-       if (rb) {
-               ring_buffer_attach(event, rb);
+       rb = event->rb;
+       if (rb)
                events = atomic_xchg(&rb->poll, 0);
-       }
-       rcu_read_unlock();
-
        mutex_unlock(&event->mmap_mutex);
 
        poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,
                return;
 
        spin_lock_irqsave(&rb->event_lock, flags);
-       if (!list_empty(&event->rb_entry))
-               goto unlock;
-
-       list_add(&event->rb_entry, &rb->event_list);
-unlock:
+       if (list_empty(&event->rb_entry))
+               list_add(&event->rb_entry, &rb->event_list);
        spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-                              struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
        unsigned long flags;
 
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
        rcu_read_lock();
        rb = rcu_dereference(event->rb);
-       if (!rb)
-               goto unlock;
-
-       list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-               wake_up_all(&event->waitq);
-
-unlock:
+       if (rb) {
+               list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+                       wake_up_all(&event->waitq);
+       }
        rcu_read_unlock();
 }
 
@@ -3582,23 +3571,14 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
        return rb;
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb)
+static void ring_buffer_put(struct ring_buffer *rb)
 {
-       struct perf_event *event, *n;
-       unsigned long flags;
-
        if (!atomic_dec_and_test(&rb->refcount))
-               return false;
+               return;
 
-       spin_lock_irqsave(&rb->event_lock, flags);
-       list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-               list_del_init(&event->rb_entry);
-               wake_up_all(&event->waitq);
-       }
-       spin_unlock_irqrestore(&rb->event_lock, flags);
+       WARN_ON_ONCE(!list_empty(&rb->event_list));
 
        call_rcu(&rb->rcu_head, rb_free_rcu);
-       return true;
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3606,28 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
        struct perf_event *event = vma->vm_file->private_data;
 
        atomic_inc(&event->mmap_count);
+       atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
        struct perf_event *event = vma->vm_file->private_data;
 
-       if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-               struct ring_buffer *rb = event->rb;
-               struct user_struct *mmap_user = rb->mmap_user;
-               int mmap_locked = rb->mmap_locked;
-               unsigned long size = perf_data_size(rb);
+       struct ring_buffer *rb = event->rb;
+       struct user_struct *mmap_user = rb->mmap_user;
+       int mmap_locked = rb->mmap_locked;
+       unsigned long size = perf_data_size(rb);
 
-               rcu_assign_pointer(event->rb, NULL);
-               ring_buffer_detach(event, rb);
-               mutex_unlock(&event->mmap_mutex);
+       atomic_dec(&rb->mmap_count);
+
+       if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+               return;
+
+       /* Detach current event from the buffer. */
+       rcu_assign_pointer(event->rb, NULL);
+       ring_buffer_detach(event, rb);
+       mutex_unlock(&event->mmap_mutex);
+
+       /* If there's still other mmap()s of this buffer, we're done. */
+       if (atomic_read(&rb->mmap_count)) {
+               ring_buffer_put(rb); /* can't be last */
+               return;
+       }
 
-               if (ring_buffer_put(rb)) {
-                       atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-                       vma->vm_mm->pinned_vm -= mmap_locked;
-                       free_uid(mmap_user);
+       /*
+        * No other mmap()s, detach from all other events that might redirect
+        * into the now unreachable buffer. Somewhat complicated by the
+        * fact that rb::event_lock otherwise nests inside mmap_mutex.
+        */
+again:
+       rcu_read_lock();
+       list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+               if (!atomic_long_inc_not_zero(&event->refcount)) {
+                       /*
+                        * This event is en-route to free_event() which will
+                        * detach it and remove it from the list.
+                        */
+                       continue;
                }
+               rcu_read_unlock();
+
+               mutex_lock(&event->mmap_mutex);
+               /*
+                * Check we didn't race with perf_event_set_output() which can
+                * swizzle the rb from under us while we were waiting to
+                * acquire mmap_mutex.
+                *
+                * If we find a different rb; ignore this event, a next
+                * iteration will no longer find it on the list. We have to
+                * still restart the iteration to make sure we're not now
+                * iterating the wrong list.
+                */
+               if (event->rb == rb) {
+                       rcu_assign_pointer(event->rb, NULL);
+                       ring_buffer_detach(event, rb);
+                       ring_buffer_put(rb); /* can't be last, we still have one */
+               }
+               mutex_unlock(&event->mmap_mutex);
+               put_event(event);
+
+               /*
+                * Restart the iteration; either we're on the wrong list or
+                * destroyed its integrity by doing a deletion.
+                */
+               goto again;
        }
+       rcu_read_unlock();
+
+       /*
+        * It could be there's still a few 0-ref events on the list; they'll
+        * get cleaned up by free_event() -- they'll also still have their
+        * ref on the rb and will free it whenever they are done with it.
+        *
+        * Aside from that, this buffer is 'fully' detached and unmapped,
+        * undo the VM accounting.
+        */
+
+       atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+       vma->vm_mm->pinned_vm -= mmap_locked;
+       free_uid(mmap_user);
+
+       ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                return -EINVAL;
 
        WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
        mutex_lock(&event->mmap_mutex);
        if (event->rb) {
-               if (event->rb->nr_pages != nr_pages)
+               if (event->rb->nr_pages != nr_pages) {
                        ret = -EINVAL;
+                       goto unlock;
+               }
+
+               if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+                       /*
+                        * Raced against perf_mmap_close() through
+                        * perf_event_set_output(). Try again, hope for better
+                        * luck.
+                        */
+                       mutex_unlock(&event->mmap_mutex);
+                       goto again;
+               }
+
                goto unlock;
        }
 
@@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                goto unlock;
        }
 
+       atomic_set(&rb->mmap_count, 1);
        rb->mmap_locked = extra;
        rb->mmap_user = get_current_user();
 
        atomic_long_add(user_extra, &user->locked_vm);
        vma->vm_mm->pinned_vm += extra;
 
+       ring_buffer_attach(event, rb);
        rcu_assign_pointer(event->rb, rb);
 
        perf_event_update_userpage(event);
@@ -3737,6 +3805,10 @@ unlock:
                atomic_inc(&event->mmap_count);
        mutex_unlock(&event->mmap_mutex);
 
+       /*
+        * Since pinned accounting is per vm we cannot allow fork() to copy our
+        * vma.
+        */
        vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &perf_mmap_vmops;
 
@@ -6415,6 +6487,8 @@ set:
        if (atomic_read(&event->mmap_count))
                goto unlock;
 
+       old_rb = event->rb;
+
        if (output_event) {
                /* get the rb we want to redirect to */
                rb = ring_buffer_get(output_event);
@@ -6422,16 +6496,28 @@ set:
                        goto unlock;
        }
 
-       old_rb = event->rb;
-       rcu_assign_pointer(event->rb, rb);
        if (old_rb)
                ring_buffer_detach(event, old_rb);
+
+       if (rb)
+               ring_buffer_attach(event, rb);
+
+       rcu_assign_pointer(event->rb, rb);
+
+       if (old_rb) {
+               ring_buffer_put(old_rb);
+               /*
+                * Since we detached before setting the new rb, so that we
+                * could attach the new rb, we could have missed a wakeup.
+                * Provide it now.
+                */
+               wake_up_all(&event->waitq);
+       }
+
        ret = 0;
 unlock:
        mutex_unlock(&event->mmap_mutex);
 
-       if (old_rb)
-               ring_buffer_put(old_rb);
 out:
        return ret;
 }
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 5bc6c8e9b8515716736a9717a4e3dad413e17f5c..ca6599723be5624cdeb4a3cad5ef5dd10a4e1c7b 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,7 +31,8 @@ struct ring_buffer {
        spinlock_t                      event_lock;
        struct list_head                event_list;
 
-       int                             mmap_locked;
+       atomic_t                        mmap_count;
+       unsigned long                   mmap_locked;
        struct user_struct              *mmap_user;
 
        struct perf_event_mmap_page     *user_page;