[firefly-linux-kernel-4.4.55.git] commit "ARM: dts: rk3288: support AP6335 wifi for rk3288-evb"

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f3569747d6295043ca3c8a853361a48c2ecf2869..7dad84913abfb06df2495fae555e7f8bcac2b104 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -19,7 +19,7 @@
  * Authors:
  *     Srikar Dronamraju
  *     Jim Keniston
- * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
  */
 
 #include <linux/kernel.h>
@@ -35,6 +35,8 @@
 #include <linux/kdebug.h>      /* notifier mechanism */
 #include "../../mm/internal.h" /* munlock_vma_page */
 #include <linux/percpu-rwsem.h>
+#include <linux/task_work.h>
+#include <linux/shmem_fs.h>
 
 #include <linux/uprobes.h>
 
@@ -59,8 +61,6 @@ static struct percpu_rw_semaphore dup_mmap_sem;
 
 /* Have a copy of original instruction */
 #define UPROBE_COPY_INSN       0
-/* Can skip singlestep */
-#define UPROBE_SKIP_SSTEP      1
 
 struct uprobe {
        struct rb_node          rb_node;        /* node in the rb tree */
@@ -72,16 +72,42 @@ struct uprobe {
        struct inode            *inode;         /* Also hold a ref to inode */
        loff_t                  offset;
        unsigned long           flags;
+
+       /*
+        * The generic code assumes that it has two members of unknown type
+        * owned by the arch-specific code:
+        *
+        *      insn -  copy_insn() saves the original instruction here for
+        *              arch_uprobe_analyze_insn().
+        *
+        *      ixol -  potentially modified instruction to execute out of
+        *              line, copied to xol_area by xol_get_insn_slot().
+        */
        struct arch_uprobe      arch;
 };
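
Aside, not part of the patch: the insn/ixol split documented above is arch-defined.
A minimal sketch of how an architecture might lay out struct arch_uprobe, assuming
the common case where the probed copy and the out-of-line copy share storage (x86,
for instance, overlays them in a union); anything beyond insn/ixol is illustrative:

    /* Illustrative only -- the real layout lives in arch/<arch>/include/asm/uprobes.h */
    struct arch_uprobe {
            union {
                    u8      insn[MAX_UINSN_BYTES];  /* filled by copy_insn() */
                    u8      ixol[MAX_UINSN_BYTES];  /* copied into the XOL slot */
            };
            /* ...plus whatever decode state arch_uprobe_analyze_insn() wants to keep */
    };
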
 
-struct return_instance {
-       struct uprobe           *uprobe;
-       unsigned long           func;
-       unsigned long           orig_ret_vaddr; /* original return address */
-       bool                    chained;        /* true, if instance is nested */
+/*
+ * Execute out of line area: anonymous executable mapping installed
+ * by the probed task to execute the copy of the original instruction
+ * mangled by set_swbp().
+ *
+ * On a breakpoint hit, the thread contends for a slot.  It frees the
+ * slot after the singlestep. Currently a fixed number of slots are
+ * allocated.
+ */
+struct xol_area {
+       wait_queue_head_t               wq;             /* if all slots are busy */
+       atomic_t                        slot_count;     /* number of in-use slots */
+       unsigned long                   *bitmap;        /* 0 = free slot */
 
-       struct return_instance  *next;          /* keep as stack */
+       struct vm_special_mapping       xol_mapping;
+       struct page                     *pages[2];
+       /*
+        * We keep the vma's vm_start rather than a pointer to the vma
+        * itself.  The probed process or a naughty kernel module could make
+        * the vma go away, and we must handle that reasonably gracefully.
+        */
+       unsigned long                   vaddr;          /* Page(s) of instruction slots */
 };
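
Aside, not part of the patch: slot bookkeeping is a bitmap over fixed-size slices of
the single XOL page, UINSNS_PER_PAGE (PAGE_SIZE / UPROBE_XOL_SLOT_BYTES) slots in all.
A sketch of the index-to-address mapping the allocator relies on, assuming the
UPROBE_XOL_SLOT_BYTES slot size from include/linux/uprobes.h:

    /* Sketch: how a bitmap index becomes a user address (cf. xol_take_insn_slot()). */
    static unsigned long slot_to_vaddr(struct xol_area *area, int slot_nr)
    {
            return area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES;
    }
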
 
 /*
@@ -94,7 +120,7 @@ struct return_instance {
  */
 static bool valid_vma(struct vm_area_struct *vma, bool is_register)
 {
-       vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
+       vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
 
        if (is_register)
                flags |= VM_WRITE;
@@ -133,6 +159,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
        /* For mmu_notifiers */
        const unsigned long mmun_start = addr;
        const unsigned long mmun_end   = addr + PAGE_SIZE;
+       struct mem_cgroup *memcg;
+
+       err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+       if (err)
+               return err;
 
        /* For try_to_free_swap() and munlock_vma_page() below */
        lock_page(page);
@@ -145,6 +176,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
        get_page(kpage);
        page_add_new_anon_rmap(kpage, vma, addr);
+       mem_cgroup_commit_charge(kpage, memcg, false);
+       lru_cache_add_active_or_unevictable(kpage, vma);
 
        if (!PageAnon(page)) {
                dec_mm_counter(mm, MM_FILEPAGES);
@@ -152,7 +185,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
        }
 
        flush_cache_page(vma, addr, pte_pfn(*ptep));
-       ptep_clear_flush(vma, addr, ptep);
+       ptep_clear_flush_notify(vma, addr, ptep);
        set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 
        page_remove_rmap(page);
@@ -166,6 +199,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
        err = 0;
  unlock:
+       mem_cgroup_cancel_charge(kpage, memcg);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        unlock_page(page);
        return err;
@@ -244,23 +278,18 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
 * the architecture. If an arch has variable length instructions and the
 * breakpoint instruction is not of the smallest instruction length
  * supported by that architecture then we need to modify is_trap_at_addr and
- * write_opcode accordingly. This would never be a problem for archs that
- * have fixed length instructions.
- */
-
-/*
- * write_opcode - write the opcode at a given virtual address.
+ * uprobe_write_opcode accordingly. This would never be a problem for archs
+ * that have fixed length instructions.
+ *
+ * uprobe_write_opcode - write the opcode at a given virtual address.
  * @mm: the probed process address space.
  * @vaddr: the virtual address to store the opcode.
  * @opcode: opcode to be written at @vaddr.
  *
- * Called with mm->mmap_sem held (for read and with a reference to
- * mm).
- *
- * For mm @mm, write the opcode at @vaddr.
+ * Called with mm->mmap_sem held for write.
  * Return 0 (success) or a negative errno.
  */
-static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
+int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
                        uprobe_opcode_t opcode)
 {
        struct page *old_page, *new_page;
@@ -277,23 +306,20 @@ retry:
        if (ret <= 0)
                goto put_old;
 
+       ret = anon_vma_prepare(vma);
+       if (ret)
+               goto put_old;
+
        ret = -ENOMEM;
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
        if (!new_page)
                goto put_old;
 
        __SetPageUptodate(new_page);
-
        copy_highpage(new_page, old_page);
        copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 
-       ret = anon_vma_prepare(vma);
-       if (ret)
-               goto put_new;
-
        ret = __replace_page(vma, vaddr, old_page, new_page);
-
-put_new:
        page_cache_release(new_page);
 put_old:
        put_page(old_page);
@@ -314,7 +340,7 @@ put_old:
  */
 int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-       return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+       return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
 }
 
 /**
@@ -329,7 +355,19 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
 int __weak
 set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-       return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+       return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
+}
+
+static struct uprobe *get_uprobe(struct uprobe *uprobe)
+{
+       atomic_inc(&uprobe->ref);
+       return uprobe;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+       if (atomic_dec_and_test(&uprobe->ref))
+               kfree(uprobe);
 }
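
Aside, not part of the patch: these two helpers centralize the reference protocol that
the rest of the file previously open-coded with atomic_inc()/atomic_dec_and_test().
A hypothetical caller, shown only to illustrate the pairing:

    /* Hypothetical: every successful find_uprobe() must be balanced by put_uprobe(). */
    static void example_use(struct inode *inode, loff_t offset)
    {
            struct uprobe *u = find_uprobe(inode, offset);  /* returns with a reference */

            if (!u)
                    return;
            /* ... use u under the appropriate locks ... */
            put_uprobe(u);                                  /* may kfree() the uprobe */
    }
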
 
 static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -359,10 +397,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
        while (n) {
                uprobe = rb_entry(n, struct uprobe, rb_node);
                match = match_uprobe(&u, uprobe);
-               if (!match) {
-                       atomic_inc(&uprobe->ref);
-                       return uprobe;
-               }
+               if (!match)
+                       return get_uprobe(uprobe);
 
                if (match < 0)
                        n = n->rb_left;
@@ -398,10 +434,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
                parent = *p;
                u = rb_entry(parent, struct uprobe, rb_node);
                match = match_uprobe(uprobe, u);
-               if (!match) {
-                       atomic_inc(&u->ref);
-                       return u;
-               }
+               if (!match)
+                       return get_uprobe(u);
 
                if (match < 0)
                        p = &parent->rb_left;
@@ -438,12 +472,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
        return u;
 }
 
-static void put_uprobe(struct uprobe *uprobe)
-{
-       if (atomic_dec_and_test(&uprobe->ref))
-               kfree(uprobe);
-}
-
 static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 {
        struct uprobe *uprobe, *cur_uprobe;
@@ -456,12 +484,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
        uprobe->offset = offset;
        init_rwsem(&uprobe->register_rwsem);
        init_rwsem(&uprobe->consumer_rwsem);
-       /* For now assume that the instruction need not be single-stepped */
-       __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
 
        /* add to uprobes_tree, sorted on inode:offset */
        cur_uprobe = insert_uprobe(uprobe);
-
        /* a uprobe exists for this inode:offset combination */
        if (cur_uprobe) {
                kfree(uprobe);
@@ -503,19 +528,19 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
        return ret;
 }
 
-static int
-__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
-                       unsigned long nbytes, loff_t offset)
+static int __copy_insn(struct address_space *mapping, struct file *filp,
+                       void *insn, int nbytes, loff_t offset)
 {
        struct page *page;
-
-       if (!mapping->a_ops->readpage)
-               return -EIO;
        /*
-        * Ensure that the page that has the original instruction is
-        * populated and in page-cache.
+        * Ensure that the page that has the original instruction is populated
+        * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+        * see uprobe_register().
         */
-       page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+       if (mapping->a_ops->readpage)
+               page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+       else
+               page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
        if (IS_ERR(page))
                return PTR_ERR(page);
 
@@ -527,28 +552,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
 
 static int copy_insn(struct uprobe *uprobe, struct file *filp)
 {
-       struct address_space *mapping;
-       unsigned long nbytes;
-       int bytes;
-
-       nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
-       mapping = uprobe->inode->i_mapping;
+       struct address_space *mapping = uprobe->inode->i_mapping;
+       loff_t offs = uprobe->offset;
+       void *insn = &uprobe->arch.insn;
+       int size = sizeof(uprobe->arch.insn);
+       int len, err = -EIO;
 
-       /* Instruction at end of binary; copy only available bytes */
-       if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
-               bytes = uprobe->inode->i_size - uprobe->offset;
-       else
-               bytes = MAX_UINSN_BYTES;
+       /* Copy only available bytes, -EIO if nothing was read */
+       do {
+               if (offs >= i_size_read(uprobe->inode))
+                       break;
 
-       /* Instruction at the page-boundary; copy bytes in second page */
-       if (nbytes < bytes) {
-               int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
-                               bytes - nbytes, uprobe->offset + nbytes);
+               len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
+               err = __copy_insn(mapping, filp, insn, len, offs);
                if (err)
-                       return err;
-               bytes = nbytes;
-       }
-       return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
+                       break;
+
+               insn += len;
+               offs += len;
+               size -= len;
+       } while (size);
+
+       return err;
 }
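
Worked example, not part of the patch: with 4096-byte pages and a 16-byte arch.insn
buffer (as on x86), an instruction starting 4 bytes before a page boundary is copied
in two passes by the loop above:

    /*
     * offs = 0x1ffc, size = 16:
     *   pass 1: len = min(16, 4096 - 0xffc) = 4   -> bytes 0..3  from the first page
     *   pass 2: len = min(12, 4096 - 0x000) = 12  -> bytes 4..15 from the second page
     * If offs starts at or beyond i_size, no pass runs and err stays -EIO.
     */
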
 
 static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
@@ -569,14 +594,14 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
                goto out;
 
        ret = -ENOTSUPP;
-       if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
+       if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
                goto out;
 
        ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
        if (ret)
                goto out;
 
-       /* write_opcode() assumes we don't cross page boundary */
+       /* uprobe_write_opcode() assumes we don't cross page boundary */
        BUG_ON((uprobe->offset & ~PAGE_MASK) +
                        UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
 
@@ -693,14 +718,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
        int more = 0;
 
  again:
-       mutex_lock(&mapping->i_mmap_mutex);
+       i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                if (!valid_vma(vma, is_register))
                        continue;
 
                if (!prev && !more) {
                        /*
-                        * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+                        * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
                         * reclaim. This is optimistic, no harm done if it fails.
                         */
                        prev = kmalloc(sizeof(struct map_info),
@@ -724,7 +749,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
                info->mm = vma->vm_mm;
                info->vaddr = offset_to_vaddr(vma, offset);
        }
-       mutex_unlock(&mapping->i_mmap_mutex);
+       i_mmap_unlock_read(mapping);
 
        if (!more)
                goto out;
@@ -816,7 +841,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
 {
        int err;
 
-       if (!consumer_del(uprobe, uc))  /* WARN? */
+       if (WARN_ON(!consumer_del(uprobe, uc)))
                return;
 
        err = register_for_each_vma(uprobe, NULL);
@@ -851,6 +876,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
        if (!uc->handler && !uc->ret_handler)
                return -EINVAL;
 
+       /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+       if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+               return -EIO;
        /* Racy, just to catch the obvious mistakes */
        if (offset > i_size_read(inode))
                return -EINVAL;
@@ -894,7 +922,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
        int ret = -ENOENT;
 
        uprobe = find_uprobe(inode, offset);
-       if (!uprobe)
+       if (WARN_ON(!uprobe))
                return ret;
 
        down_write(&uprobe->register_rwsem);
@@ -919,7 +947,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
        struct uprobe *uprobe;
 
        uprobe = find_uprobe(inode, offset);
-       if (!uprobe)
+       if (WARN_ON(!uprobe))
                return;
 
        down_write(&uprobe->register_rwsem);
@@ -1005,14 +1033,14 @@ static void build_probe_list(struct inode *inode,
                        if (u->inode != inode || u->offset < min)
                                break;
                        list_add(&u->pending_list, head);
-                       atomic_inc(&u->ref);
+                       get_uprobe(u);
                }
                for (t = n; (t = rb_next(t)); ) {
                        u = rb_entry(t, struct uprobe, rb_node);
                        if (u->inode != inode || u->offset > max)
                                break;
                        list_add(&u->pending_list, head);
-                       atomic_inc(&u->ref);
+                       get_uprobe(u);
                }
        }
        spin_unlock(&uprobes_treelock);
@@ -1096,54 +1124,51 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
 }
 
 /* Slot allocation for XOL */
-static int xol_add_vma(struct xol_area *area)
+static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
 {
-       struct mm_struct *mm = current->mm;
-       int ret = -EALREADY;
+       struct vm_area_struct *vma;
+       int ret;
 
        down_write(&mm->mmap_sem);
-       if (mm->uprobes_state.xol_area)
+       if (mm->uprobes_state.xol_area) {
+               ret = -EALREADY;
                goto fail;
+       }
 
-       ret = -ENOMEM;
-       /* Try to map as high as possible, this is only a hint. */
-       area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
-       if (area->vaddr & ~PAGE_MASK) {
-               ret = area->vaddr;
-               goto fail;
+       if (!area->vaddr) {
+               /* Try to map as high as possible, this is only a hint. */
+               area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
+                                               PAGE_SIZE, 0, 0);
+               if (area->vaddr & ~PAGE_MASK) {
+                       ret = area->vaddr;
+                       goto fail;
+               }
        }
 
-       ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
-                               VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
-       if (ret)
+       vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+                               VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
+                               &area->xol_mapping);
+       if (IS_ERR(vma)) {
+               ret = PTR_ERR(vma);
                goto fail;
+       }
 
+       ret = 0;
        smp_wmb();      /* pairs with get_xol_area() */
        mm->uprobes_state.xol_area = area;
-       ret = 0;
  fail:
        up_write(&mm->mmap_sem);
 
        return ret;
 }
 
-/*
- * get_xol_area - Allocate process's xol_area if necessary.
- * This area will be used for storing instructions for execution out of line.
- *
- * Returns the allocated area or NULL.
- */
-static struct xol_area *get_xol_area(void)
+static struct xol_area *__create_xol_area(unsigned long vaddr)
 {
        struct mm_struct *mm = current->mm;
-       struct xol_area *area;
        uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+       struct xol_area *area;
 
-       area = mm->uprobes_state.xol_area;
-       if (area)
-               goto ret;
-
-       area = kzalloc(sizeof(*area), GFP_KERNEL);
+       area = kmalloc(sizeof(*area), GFP_KERNEL);
        if (unlikely(!area))
                goto out;
 
@@ -1151,28 +1176,48 @@ static struct xol_area *get_xol_area(void)
        if (!area->bitmap)
                goto free_area;
 
-       area->page = alloc_page(GFP_HIGHUSER);
-       if (!area->page)
+       area->xol_mapping.name = "[uprobes]";
+       area->xol_mapping.pages = area->pages;
+       area->pages[0] = alloc_page(GFP_HIGHUSER);
+       if (!area->pages[0])
                goto free_bitmap;
+       area->pages[1] = NULL;
 
-       /* allocate first slot of task's xol_area for the return probes */
+       area->vaddr = vaddr;
+       init_waitqueue_head(&area->wq);
+       /* Reserve the 1st slot for get_trampoline_vaddr() */
        set_bit(0, area->bitmap);
-       copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
        atomic_set(&area->slot_count, 1);
-       init_waitqueue_head(&area->wq);
+       copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
 
-       if (!xol_add_vma(area))
+       if (!xol_add_vma(mm, area))
                return area;
 
-       __free_page(area->page);
+       __free_page(area->pages[0]);
  free_bitmap:
        kfree(area->bitmap);
  free_area:
        kfree(area);
  out:
+       return NULL;
+}
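
Aside, not part of the patch: pages[] is sized 2 and NULL-terminated because the
special-mapping fault handler walks the array until it hits NULL, and the name set
here is what the vma shows up as in /proc/<pid>/maps. In short:

    /* struct vm_special_mapping fields used above (see include/linux/mm_types.h):
     *   .name  -> "[uprobes]" line in /proc/<pid>/maps for this vma
     *   .pages -> NULL-terminated array of pages backing faults on the mapping
     */
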
+
+/*
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *get_xol_area(void)
+{
+       struct mm_struct *mm = current->mm;
+       struct xol_area *area;
+
+       if (!mm->uprobes_state.xol_area)
+               __create_xol_area(0);
+
        area = mm->uprobes_state.xol_area;
- ret:
-       smp_read_barrier_depends();     /* pairs with wmb in xol_add_vma() */
+       smp_read_barrier_depends();     /* pairs with wmb in xol_add_vma() */
        return area;
 }
 
@@ -1186,7 +1231,7 @@ void uprobe_clear_state(struct mm_struct *mm)
        if (!area)
                return;
 
-       put_page(area->page);
+       put_page(area->pages[0]);
        kfree(area->bitmap);
        kfree(area);
 }
@@ -1255,13 +1300,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
        if (unlikely(!xol_vaddr))
                return 0;
 
-       /* Initialize the slot */
-       copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
-       /*
-        * We probably need flush_icache_user_range() but it needs vma.
-        * This should work on supported architectures too.
-        */
-       flush_dcache_page(area->page);
+       arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
+                             &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
 
        return xol_vaddr;
 }
@@ -1297,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
 
                clear_bit(slot_nr, area->bitmap);
                atomic_dec(&area->slot_count);
+               smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
                if (waitqueue_active(&area->wq))
                        wake_up(&area->wq);
 
@@ -1304,6 +1345,21 @@ static void xol_free_insn_slot(struct task_struct *tsk)
        }
 }
 
+void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
+                                 void *src, unsigned long len)
+{
+       /* Initialize the slot */
+       copy_to_page(page, vaddr, src, len);
+
+       /*
+        * We probably need flush_icache_user_range() but it needs vma.
+        * This should work on most of architectures by default. If
+        * architecture needs to do something different it can define
+        * its own version of the function.
+        */
+       flush_dcache_page(page);
+}
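
Aside, not part of the patch: an architecture with a non-coherent instruction cache
would typically override this weak default. A rough, hypothetical sketch (real
overrides live under arch/, and the exact cache-maintenance call is arch-specific,
so it is only indicated by a comment):

    /* Hypothetical arch override of the weak default above. */
    void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
                               void *src, unsigned long len)
    {
            void *kaddr = kmap_atomic(page);
            void *dst = kaddr + (vaddr & ~PAGE_MASK);

            memcpy(dst, src, len);
            /* arch-specific D-cache clean + I-cache invalidate of [dst, dst + len) */
            kunmap_atomic(kaddr);
    }
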
+
 /**
  * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
  * @regs: Reflects the saved state of the task after it has hit a breakpoint
@@ -1315,6 +1371,24 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
        return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
 }
 
+unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
+{
+       struct uprobe_task *utask = current->utask;
+
+       if (unlikely(utask && utask->active_uprobe))
+               return utask->vaddr;
+
+       return instruction_pointer(regs);
+}
+
+static struct return_instance *free_ret_instance(struct return_instance *ri)
+{
+       struct return_instance *next = ri->next;
+       put_uprobe(ri->uprobe);
+       kfree(ri);
+       return next;
+}
+
 /*
  * Called with no locks held.
  * Called in the context of an exiting or an exec-ing thread.
@@ -1322,7 +1396,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
 void uprobe_free_utask(struct task_struct *t)
 {
        struct uprobe_task *utask = t->utask;
-       struct return_instance *ri, *tmp;
+       struct return_instance *ri;
 
        if (!utask)
                return;
@@ -1331,27 +1405,14 @@ void uprobe_free_utask(struct task_struct *t)
                put_uprobe(utask->active_uprobe);
 
        ri = utask->return_instances;
-       while (ri) {
-               tmp = ri;
-               ri = ri->next;
-
-               put_uprobe(tmp->uprobe);
-               kfree(tmp);
-       }
+       while (ri)
+               ri = free_ret_instance(ri);
 
        xol_free_insn_slot(t);
        kfree(utask);
        t->utask = NULL;
 }
 
-/*
- * Called in context of a new clone/fork from copy_process.
- */
-void uprobe_copy_process(struct task_struct *t)
-{
-       t->utask = NULL;
-}
-
 /*
  * Allocate a uprobe_task object for the task if necessary.
  * Called when the thread hits a breakpoint.
@@ -1367,6 +1428,82 @@ static struct uprobe_task *get_utask(void)
        return current->utask;
 }
 
+static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
+{
+       struct uprobe_task *n_utask;
+       struct return_instance **p, *o, *n;
+
+       n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+       if (!n_utask)
+               return -ENOMEM;
+       t->utask = n_utask;
+
+       p = &n_utask->return_instances;
+       for (o = o_utask->return_instances; o; o = o->next) {
+               n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+               if (!n)
+                       return -ENOMEM;
+
+               *n = *o;
+               get_uprobe(n->uprobe);
+               n->next = NULL;
+
+               *p = n;
+               p = &n->next;
+               n_utask->depth++;
+       }
+
+       return 0;
+}
+
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+       pr_warn("uprobe: %s:%d failed to %s\n",
+                       current->comm, current->pid, msg);
+}
+
+static void dup_xol_work(struct callback_head *work)
+{
+       if (current->flags & PF_EXITING)
+               return;
+
+       if (!__create_xol_area(current->utask->dup_xol_addr))
+               uprobe_warn(current, "dup xol area");
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+{
+       struct uprobe_task *utask = current->utask;
+       struct mm_struct *mm = current->mm;
+       struct xol_area *area;
+
+       t->utask = NULL;
+
+       if (!utask || !utask->return_instances)
+               return;
+
+       if (mm == t->mm && !(flags & CLONE_VFORK))
+               return;
+
+       if (dup_utask(t, utask))
+               return uprobe_warn(t, "dup ret instances");
+
+       /* The task can fork() after dup_xol_work() fails */
+       area = mm->uprobes_state.xol_area;
+       if (!area)
+               return uprobe_warn(t, "dup xol area");
+
+       if (mm == t->mm)
+               return;
+
+       t->utask->dup_xol_addr = area->vaddr;
+       init_task_work(&t->utask->dup_xol_work, dup_xol_work);
+       task_work_add(t, &t->utask->dup_xol_work, true);
+}
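
Aside, not part of the patch: dup_xol_work() runs via task_work in the child's own
context on its way back to user space, which is why uprobe_copy_process() only
records dup_xol_addr and queues the callback instead of mapping the XOL vma into the
child's mm directly. The generic shape of that mechanism, with made-up names:

    #include <linux/task_work.h>

    struct my_state {
            struct callback_head    work;
            unsigned long           arg;
    };

    static void my_callback(struct callback_head *head)
    {
            /* runs in the target task's context, just before it returns to user mode */
            struct my_state *s = container_of(head, struct my_state, work);

            (void)s->arg;
    }

    static int queue_my_work(struct task_struct *task, struct my_state *s)
    {
            init_task_work(&s->work, my_callback);
            /* 'true' asks for TIF_NOTIFY_RESUME so the callback runs promptly;
             * this kernel's task_work_add() takes a bool notify argument. */
            return task_work_add(task, &s->work, true);
    }
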
+
 /*
  * The current area->vaddr notion assumes the trampoline address is always
  * equal to area->vaddr.
@@ -1386,12 +1523,25 @@ static unsigned long get_trampoline_vaddr(void)
        return trampoline_vaddr;
 }
 
+static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
+                                       struct pt_regs *regs)
+{
+       struct return_instance *ri = utask->return_instances;
+       enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
+
+       while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
+               ri = free_ret_instance(ri);
+               utask->depth--;
+       }
+       utask->return_instances = ri;
+}
+
 static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
 {
        struct return_instance *ri;
        struct uprobe_task *utask;
        unsigned long orig_ret_vaddr, trampoline_vaddr;
-       bool chained = false;
+       bool chained;
 
        if (!get_xol_area())
                return;
@@ -1407,49 +1557,47 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
                return;
        }
 
-       ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+       ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
        if (!ri)
-               goto fail;
+               return;
 
        trampoline_vaddr = get_trampoline_vaddr();
        orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
        if (orig_ret_vaddr == -1)
                goto fail;
 
+       /* drop the entries invalidated by longjmp() */
+       chained = (orig_ret_vaddr == trampoline_vaddr);
+       cleanup_return_instances(utask, chained, regs);
+
        /*
         * We don't want to keep the trampoline address on the stack; rather, keep
         * the original return address of the first caller through all the
         * consequent instances. This also makes breakpoint unwrapping easier.
         */
-       if (orig_ret_vaddr == trampoline_vaddr) {
+       if (chained) {
                if (!utask->return_instances) {
                        /*
                         * This situation is not possible. Likely we have an
                         * attack from user-space.
                         */
-                       pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
-                                               current->pid, current->tgid);
+                       uprobe_warn(current, "handle tail call");
                        goto fail;
                }
-
-               chained = true;
                orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
        }
 
-       atomic_inc(&uprobe->ref);
-       ri->uprobe = uprobe;
+       ri->uprobe = get_uprobe(uprobe);
        ri->func = instruction_pointer(regs);
+       ri->stack = user_stack_pointer(regs);
        ri->orig_ret_vaddr = orig_ret_vaddr;
        ri->chained = chained;
 
        utask->depth++;
-
-       /* add instance to the stack */
        ri->next = utask->return_instances;
        utask->return_instances = ri;
 
        return;
-
  fail:
        kfree(ri);
 }
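
Aside, not part of the patch: the shape of utask->return_instances after a normal
nest of probed calls A() -> B() -> C() is a simple stack, newest first:

    /*
     * head -> { func = C, orig_ret_vaddr = ret addr into B, chained = false }
     *      -> { func = B, orig_ret_vaddr = ret addr into A, chained = false }
     *      -> { func = A, orig_ret_vaddr = A's real caller, chained = false }
     *
     * A frame pushed when the stacked return address already points at the
     * trampoline records chained = true and reuses the orig_ret_vaddr of the
     * frame below it (the "chained" branch above).
     */
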
@@ -1511,27 +1659,12 @@ bool uprobe_deny_signal(void)
                if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
                        utask->state = UTASK_SSTEP_TRAPPED;
                        set_tsk_thread_flag(t, TIF_UPROBE);
-                       set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
                }
        }
 
        return true;
 }
 
-/*
- * Avoid singlestepping the original instruction if the original instruction
- * is a NOP or can be emulated.
- */
-static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
-{
-       if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
-               if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
-                       return true;
-               clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
-       }
-       return false;
-}
-
 static void mmf_recalc_uprobes(struct mm_struct *mm)
 {
        struct vm_area_struct *vma;
@@ -1652,47 +1785,68 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
        up_read(&uprobe->register_rwsem);
 }
 
-static bool handle_trampoline(struct pt_regs *regs)
+static struct return_instance *find_next_ret_chain(struct return_instance *ri)
 {
-       struct uprobe_task *utask;
-       struct return_instance *ri, *tmp;
        bool chained;
 
+       do {
+               chained = ri->chained;
+               ri = ri->next;  /* can't be NULL if chained */
+       } while (chained);
+
+       return ri;
+}
+
+static void handle_trampoline(struct pt_regs *regs)
+{
+       struct uprobe_task *utask;
+       struct return_instance *ri, *next;
+       bool valid;
+
        utask = current->utask;
        if (!utask)
-               return false;
+               goto sigill;
 
        ri = utask->return_instances;
        if (!ri)
-               return false;
+               goto sigill;
 
-       /*
-        * TODO: we should throw out return_instance's invalidated by
-        * longjmp(), currently we assume that the probed function always
-        * returns.
-        */
-       instruction_pointer_set(regs, ri->orig_ret_vaddr);
-
-       for (;;) {
-               handle_uretprobe_chain(ri, regs);
-
-               chained = ri->chained;
-               put_uprobe(ri->uprobe);
-
-               tmp = ri;
-               ri = ri->next;
-               kfree(tmp);
+       do {
+               /*
+                * We should throw out the frames invalidated by longjmp().
+                * If this chain is valid, then the next one should be alive
+                * or NULL; the latter case means that nobody but ri->func
+                * could hit this trampoline on return. TODO: sigaltstack().
+                */
+               next = find_next_ret_chain(ri);
+               valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
+
+               instruction_pointer_set(regs, ri->orig_ret_vaddr);
+               do {
+                       if (valid)
+                               handle_uretprobe_chain(ri, regs);
+                       ri = free_ret_instance(ri);
+                       utask->depth--;
+               } while (ri != next);
+       } while (!valid);
 
-               if (!chained)
-                       break;
+       utask->return_instances = ri;
+       return;
 
-               utask->depth--;
+ sigill:
+       uprobe_warn(current, "handle uretprobe, sending SIGILL.");
+       force_sig_info(SIGILL, SEND_SIG_FORCED, current);
 
-               BUG_ON(!ri);
-       }
+}
 
-       utask->return_instances = ri;
+bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
+{
+       return false;
+}
 
+bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+                                       struct pt_regs *regs)
+{
        return true;
 }
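
Aside, not part of the patch: the weak default above treats every frame as alive,
i.e. no longjmp() detection. An architecture that can compare stack pointers
overrides it; a sketch of the x86-style check, reproduced from memory, so treat the
details as approximate:

    /* Sketch: a frame is dead once the stack has been unwound past its saved SP. */
    bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
                                 struct pt_regs *regs)
    {
            if (ctx == RP_CHECK_CALL)   /* SP was just decremented by the call insn */
                    return regs->sp < ret->stack;

            return regs->sp <= ret->stack;
    }
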
 
@@ -1707,13 +1861,8 @@ static void handle_swbp(struct pt_regs *regs)
        int uninitialized_var(is_swbp);
 
        bp_vaddr = uprobe_get_swbp_addr(regs);
-       if (bp_vaddr == get_trampoline_vaddr()) {
-               if (handle_trampoline(regs))
-                       return;
-
-               pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
-                                               current->pid, current->tgid);
-       }
+       if (bp_vaddr == get_trampoline_vaddr())
+               return handle_trampoline(regs);
 
        uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
        if (!uprobe) {
@@ -1746,14 +1895,22 @@ static void handle_swbp(struct pt_regs *regs)
        if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
                goto out;
 
+       /* Tracing handlers use ->utask to communicate with fetch methods */
+       if (!get_utask())
+               goto out;
+
+       if (arch_uprobe_ignore(&uprobe->arch, regs))
+               goto out;
+
        handler_chain(uprobe, regs);
-       if (can_skip_sstep(uprobe, regs))
+
+       if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
                goto out;
 
        if (!pre_ssout(uprobe, regs, bp_vaddr))
                return;
 
-       /* can_skip_sstep() succeeded, or restart if can't singlestep */
+       /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
 out:
        put_uprobe(uprobe);
 }
@@ -1765,10 +1922,11 @@ out:
 static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
 {
        struct uprobe *uprobe;
+       int err = 0;
 
        uprobe = utask->active_uprobe;
        if (utask->state == UTASK_SSTEP_ACK)
-               arch_uprobe_post_xol(&uprobe->arch, regs);
+               err = arch_uprobe_post_xol(&uprobe->arch, regs);
        else if (utask->state == UTASK_SSTEP_TRAPPED)
                arch_uprobe_abort_xol(&uprobe->arch, regs);
        else
@@ -1782,6 +1940,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
        spin_lock_irq(&current->sighand->siglock);
        recalc_sigpending(); /* see uprobe_deny_signal() */
        spin_unlock_irq(&current->sighand->siglock);
+
+       if (unlikely(err)) {
+               uprobe_warn(current, "execute the probed insn, sending SIGILL.");
+               force_sig_info(SIGILL, SEND_SIG_FORCED, current);
+       }
 }
 
 /*
@@ -1859,9 +2022,4 @@ static int __init init_uprobes(void)
 
        return register_die_notifier(&uprobe_exception_nb);
 }
-module_init(init_uprobes);
-
-static void __exit exit_uprobes(void)
-{
-}
-module_exit(exit_uprobes);
+__initcall(init_uprobes);