ARM64: dts: rockchip: enable tsadc node for rk3366-tb
[firefly-linux-kernel-4.4.55.git] / kernel / futex.c
index b26dcfc02c9489b3ca00bfd076f2208bb4964366..9d8163afd87ca7605ef85d2ca64d3c4521838fae 100644 (file)
 #include <linux/nsproxy.h>
 #include <linux/ptrace.h>
 #include <linux/sched/rt.h>
+#include <linux/hugetlb.h>
+#include <linux/freezer.h>
+#include <linux/bootmem.h>
+#include <linux/fault-inject.h>
 
 #include <asm/futex.h>
 
-#include "rtmutex_common.h"
+#include "locking/rtmutex_common.h"
 
-int __read_mostly futex_cmpxchg_enabled;
+/*
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, the waker
+ * can avoid taking the hb spinlock and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0                               CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *   uval = *futex;
+ *                                     *futex = newval;
+ *                                     sys_futex(WAKE, futex);
+ *                                       futex_wake(futex);
+ *                                       if (queue_empty())
+ *                                         return;
+ *   if (uval == val)
+ *     lock(hash_bucket(futex));
+ *     queue();
+ *     unlock(hash_bucket(futex));
+ *     schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0                                 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *
+ *   waiters++; (a)
+ *   mb(); (A) <-- paired with -.
+ *                              |
+ *   lock(hash_bucket(futex));  |
+ *                              |
+ *   uval = *futex;             |
+ *                              |        *futex = newval;
+ *                              |        sys_futex(WAKE, futex);
+ *                              |          futex_wake(futex);
+ *                              |
+ *                              `------->  mb(); (B)
+ *   if (uval == val)
+ *     queue();
+ *     unlock(hash_bucket(futex));
+ *     schedule();                         if (waiters)
+ *                                           lock(hash_bucket(futex));
+ *   else                                    wake_waiters(futex);
+ *     waiters--; (b)                        unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read -- this is done by the barriers for both
+ * shared and private futexes in get_futex_key_refs().
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ *     X = Y = 0
+ *
+ *     w[X]=1          w[Y]=1
+ *     MB              MB
+ *     r[Y]=y          r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in queue_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. We then decrement them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
+ */
 
-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
+int __read_mostly futex_cmpxchg_enabled;
+#endif
 
 /*
  * Futex flags used to encode options to functions and preserve them across
@@ -145,11 +250,128 @@ static const struct futex_q futex_q_init = {
  * waiting on a futex.
  */
 struct futex_hash_bucket {
+       atomic_t waiters;       /* waiter count; see hb_waiters_inc()/hb_waiters_dec() */
        spinlock_t lock;        /* held while @chain is walked or modified */
        struct plist_head chain;        /* plist of the futex_q entries queued on this bucket */
+} ____cacheline_aligned_in_smp;
+
+/*
+ * The base of the bucket array and its size are always used together
+ * (after initialization only in hash_futex()), so ensure that they
+ * reside in the same cacheline.
+ *
+ * hash_futex() indexes the array with (hash & (futex_hashsize - 1)).
+ * NOTE(review): that masking presumably relies on hashsize being a power
+ * of two; the initialization code is not visible here -- confirm.
+ */
+static struct {
+       struct futex_hash_bucket *queues;
+       unsigned long            hashsize;
+} __futex_data __read_mostly __aligned(2*sizeof(long));
+#define futex_queues   (__futex_data.queues)
+#define futex_hashsize (__futex_data.hashsize)
+
+
+/*
+ * Fault injections for futexes.
+ */
+#ifdef CONFIG_FAIL_FUTEX
+
+static struct {
+       struct fault_attr attr;         /* attributes handed to should_fail() */
+
+       bool ignore_private;            /* when true, never inject faults for non-shared futexes */
+} fail_futex = {
+       .attr = FAULT_ATTR_INITIALIZER,
+       .ignore_private = false,
 };
 
-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+static int __init setup_fail_futex(char *str)
+{
+       return setup_fault_attr(&fail_futex.attr, str); /* configure attr from the option string */
+}
+__setup("fail_futex=", setup_fail_futex);       /* handler for the "fail_futex=" option */
+
+static bool should_fail_futex(bool fshared)     /* true => caller should simulate a failure */
+{
+       if (fail_futex.ignore_private && !fshared)      /* private futexes exempted on request */
+               return false;
+
+       return should_fail(&fail_futex.attr, 1);        /* otherwise defer to should_fail() */
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_futex_debugfs(void)      /* expose the fail_futex knobs in debugfs */
+{
+       umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;     /* regular file, owner read/write */
+       struct dentry *dir;
+
+       dir = fault_create_debugfs_attr("fail_futex", NULL,
+                                       &fail_futex.attr);
+       if (IS_ERR(dir))
+               return PTR_ERR(dir);
+
+       if (!debugfs_create_bool("ignore-private", mode, dir,
+                                &fail_futex.ignore_private)) {
+               debugfs_remove_recursive(dir);  /* undo the directory created above */
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+late_initcall(fail_futex_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else
+static inline bool should_fail_futex(bool fshared)      /* stub used when CONFIG_FAIL_FUTEX is off */
+{
+       return false;   /* fault injection compiled out: never report a failure */
+}
+#endif /* CONFIG_FAIL_FUTEX */
+
+static inline void futex_get_mm(union futex_key *key)
+{
+       atomic_inc(&key->private.mm->mm_count); /* take a reference on the key's mm */
+       /*
+        * Ensure futex_get_mm() implies a full barrier such that
+        * get_futex_key() implies a full barrier. This is relied upon
+        * as full barrier (B), see the ordering comment above.
+        */
+       smp_mb__after_atomic();
+}
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ *
+ * Bumps hb->waiters and follows the increment with a full barrier.
+ * Compiles to an empty function when CONFIG_SMP is not set.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+       atomic_inc(&hb->waiters);
+       /*
+        * Full barrier (A), see the ordering comment above.
+        */
+       smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ *
+ * Drops hb->waiters without issuing a barrier of its own. Compiles to
+ * an empty function when CONFIG_SMP is not set.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+       atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+       return atomic_read(&hb->waiters);       /* non-zero while waiters are accounted on @hb */
+#else
+       return 1;       /* no counter is kept on UP: always report possible waiters */
+#endif
+}
 
 /*
  * We hash on the keys returned from get_futex_key (see below).
@@ -159,7 +381,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
        u32 hash = jhash2((u32*)&key->both.word,
                          (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
                          key->both.offset);
-       return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+       return &futex_queues[hash & (futex_hashsize - 1)];
 }
 
 /*
@@ -185,17 +407,26 @@ static void get_futex_key_refs(union futex_key *key)
 
        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
        case FUT_OFF_INODE:
-               ihold(key->shared.inode);
+               ihold(key->shared.inode); /* implies MB (B) */
                break;
        case FUT_OFF_MMSHARED:
-               atomic_inc(&key->private.mm->mm_count);
+               futex_get_mm(key); /* implies MB (B) */
                break;
+       default:
+               /*
+                * Private futexes do not hold a reference on an inode or
+                * mm, therefore the only purpose of calling get_futex_key_refs
+                * is to provide the barrier for the lockless waiter check.
+                */
+               smp_mb(); /* explicit MB (B) */
        }
 }
 
 /*
  * Drop a reference to the resource addressed by a key.
- * The hash bucket spinlock must not be held.
+ * The hash bucket spinlock must not be held. This is
+ * a no-op for private futexes, see comment in the get
+ * counterpart.
  */
 static void drop_futex_key_refs(union futex_key *key)
 {
@@ -249,6 +480,12 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
                return -EINVAL;
        address -= key->both.offset;
 
+       if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+               return -EFAULT;
+
+       if (unlikely(should_fail_futex(fshared)))
+               return -EFAULT;
+
        /*
         * PROCESS_PRIVATE futexes are fast.
         * As the mm cannot disappear under us and the 'key' only needs
@@ -257,15 +494,17 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
         *        but access_ok() should be faster than find_vma()
         */
        if (!fshared) {
-               if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
-                       return -EFAULT;
                key->private.mm = mm;
                key->private.address = address;
-               get_futex_key_refs(key);
+               get_futex_key_refs(key);  /* implies MB (B) */
                return 0;
        }
 
 again:
+       /* Ignore any VERIFY_READ mapping (futex common case) */
+       if (unlikely(should_fail_futex(fshared)))
+               return -EFAULT;
+
        err = get_user_pages_fast(address, 1, 1, &page);
        /*
         * If write access is not required (eg. FUTEX_WAIT), try
@@ -286,7 +525,7 @@ again:
                put_page(page);
                /* serialize against __split_huge_page_splitting() */
                local_irq_disable();
-               if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+               if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
                        page_head = compound_head(page);
                        /*
                         * page_head is valid pointer but we must pin
@@ -354,7 +593,7 @@ again:
                 * A RO anonymous page will never change and thus doesn't make
                 * sense for futex operations.
                 */
-               if (ro) {
+               if (unlikely(should_fail_futex(fshared)) || ro) {
                        err = -EFAULT;
                        goto out;
                }
@@ -365,10 +604,10 @@ again:
        } else {
                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
                key->shared.inode = page_head->mapping->host;
-               key->shared.pgoff = page_head->index;
+               key->shared.pgoff = basepage_index(page);
        }
 
-       get_futex_key_refs(key);
+       get_futex_key_refs(key); /* implies MB (B) */
 
 out:
        unlock_page(page_head);
@@ -485,8 +724,14 @@ static struct futex_pi_state * alloc_pi_state(void)
        return pi_state;
 }
 
+/*
+ * Must be called with the hb lock held.
+ */
 static void free_pi_state(struct futex_pi_state *pi_state)
 {
+       if (!pi_state)
+               return;
+
        if (!atomic_dec_and_test(&pi_state->refcount))
                return;
 
@@ -589,62 +834,142 @@ void exit_pi_state_list(struct task_struct *curr)
        raw_spin_unlock_irq(&curr->pi_lock);
 }
 
-static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
-               union futex_key *key, struct futex_pi_state **ps)
+/*
+ * We need to check the following states:
+ *
+ *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
+ *
+ * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
+ * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
+ *
+ * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
+ *
+ * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
+ * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
+ *
+ * [6]  Found  | Found    | task      | 0         | 1      | Valid
+ *
+ * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
+ *
+ * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
+ * [9]  Found  | Found    | task      | 0         | 0      | Invalid
+ * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ *     came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ *      thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ *     value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ *     and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ *     the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ *     except exit_robust_list(), but this is indicated by the
+ *     FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ *     TID out of sync.
+ */
+
+/*
+ * Validate that the existing waiter has a pi_state and sanity check
+ * the pi_state against the user space value. If correct, attach to
+ * it.
+ *
+ * Returns 0 after taking a reference on @pi_state and storing it in
+ * @ps, or -EINVAL when @pi_state is NULL or inconsistent with @uval.
+ */
+static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+                             struct futex_pi_state **ps)
 {
-       struct futex_pi_state *pi_state = NULL;
-       struct futex_q *this, *next;
-       struct plist_head *head;
-       struct task_struct *p;
        pid_t pid = uval & FUTEX_TID_MASK;
 
-       head = &hb->chain;
+       /*
+        * Userspace might have messed up non-PI and PI futexes [3]
+        */
+       if (unlikely(!pi_state))
+               return -EINVAL;
 
-       plist_for_each_entry_safe(this, next, head, list) {
-               if (match_futex(&this->key, key)) {
-                       /*
-                        * Another waiter already exists - bump up
-                        * the refcount and return its pi_state:
-                        */
-                       pi_state = this->pi_state;
+       WARN_ON(!atomic_read(&pi_state->refcount));
+
+       /*
+        * Handle the owner died case:
+        */
+       if (uval & FUTEX_OWNER_DIED) {
+               /*
+                * exit_pi_state_list sets owner to NULL and wakes the
+                * topmost waiter. The task which acquires the
+                * pi_state->rt_mutex will fixup owner.
+                */
+               if (!pi_state->owner) {
                        /*
-                        * Userspace might have messed up non-PI and PI futexes
+                        * No pi state owner, but the user space TID
+                        * is not 0. Inconsistent state. [5]
                         */
-                       if (unlikely(!pi_state))
+                       if (pid)
                                return -EINVAL;
-
-                       WARN_ON(!atomic_read(&pi_state->refcount));
-
                        /*
-                        * When pi_state->owner is NULL then the owner died
-                        * and another waiter is on the fly. pi_state->owner
-                        * is fixed up by the task which acquires
-                        * pi_state->rt_mutex.
-                        *
-                        * We do not check for pid == 0 which can happen when
-                        * the owner died and robust_list_exit() cleared the
-                        * TID.
+                        * Take a ref on the state and return success. [4]
                         */
-                       if (pid && pi_state->owner) {
-                               /*
-                                * Bail out if user space manipulated the
-                                * futex value.
-                                */
-                               if (pid != task_pid_vnr(pi_state->owner))
-                                       return -EINVAL;
-                       }
-
-                       atomic_inc(&pi_state->refcount);
-                       *ps = pi_state;
-
-                       return 0;
+                       goto out_state;
                }
+
+               /*
+                * If TID is 0, then either the dying owner has not
+                * yet executed exit_pi_state_list() or some waiter
+                * acquired the rtmutex in the pi state, but did not
+                * yet fixup the TID in user space.
+                *
+                * Take a ref on the state and return success. [6]
+                */
+               if (!pid)
+                       goto out_state;
+       } else {
+               /*
+                * If the owner died bit is not set, then the pi_state
+                * must have an owner. [7]
+                */
+               if (!pi_state->owner)
+                       return -EINVAL;
        }
 
+       /*
+        * Bail out if user space manipulated the futex value. If pi
+        * state exists then the owner TID must be the same as the
+        * user space TID. [9/10]
+        */
+       if (pid != task_pid_vnr(pi_state->owner))
+               return -EINVAL;
+out_state:
+       atomic_inc(&pi_state->refcount);        /* reference handed to the caller through *ps */
+       *ps = pi_state;
+       return 0;
+}
+
+/*
+ * Lookup the task for the TID provided from user space and attach to
+ * it after doing proper sanity checks.
+ */
+static int attach_to_pi_owner(u32 uval, union futex_key *key,
+                             struct futex_pi_state **ps)
+{
+       pid_t pid = uval & FUTEX_TID_MASK;
+       struct futex_pi_state *pi_state;
+       struct task_struct *p;
+
        /*
         * We are the first waiter - try to look up the real owner and attach
-        * the new pi_state to it, but bail out when TID = 0
+        * the new pi_state to it, but bail out when TID = 0 [1]
         */
        if (!pid)
                return -ESRCH;
@@ -652,6 +977,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
        if (!p)
                return -ESRCH;
 
+       if (unlikely(p->flags & PF_KTHREAD)) {
+               put_task_struct(p);
+               return -EPERM;
+       }
+
        /*
         * We need to look at the task state flags to figure out,
         * whether the task is exiting. To protect against the do_exit
@@ -672,10 +1002,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                return ret;
        }
 
+       /*
+        * No existing pi state. First waiter. [2]
+        */
        pi_state = alloc_pi_state();
 
        /*
-        * Initialize the pi_mutex in locked state and make 'p'
+        * Initialize the pi_mutex in locked state and make @p
         * the owner of it:
         */
        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
@@ -695,6 +1028,39 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
        return 0;
 }
 
+static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+                          union futex_key *key, struct futex_pi_state **ps)
+{
+       struct futex_q *match = futex_top_waiter(hb, key);
+
+       /*
+        * If there is a waiter on that futex, validate it and
+        * attach to the pi_state when the validation succeeds.
+        * The helper's return value is passed back unchanged.
+        */
+       if (match)
+               return attach_to_pi_state(uval, match->pi_state, ps);
+
+       /*
+        * We are the first waiter - try to look up the owner based on
+        * @uval and attach to it. Again the helper's return value is
+        * passed back unchanged.
+        */
+       return attach_to_pi_owner(uval, key, ps);
+}
+
+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+{
+       u32 uninitialized_var(curval);
+
+       if (unlikely(should_fail_futex(true)))  /* fault injection: report a fault */
+               return -EFAULT;
+
+       if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
+               return -EFAULT;
+
+       /*
+        * If the user space value changed, let the caller retry (-EAGAIN);
+        * otherwise the value read back matched @uval and we return 0.
+        */
+       return curval != uval ? -EAGAIN : 0;
+}
+
 /**
  * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
  * @uaddr:             the pi futex user address
@@ -718,105 +1084,75 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                                struct futex_pi_state **ps,
                                struct task_struct *task, int set_waiters)
 {
-       int lock_taken, ret, force_take = 0;
-       u32 uval, newval, curval, vpid = task_pid_vnr(task);
-
-retry:
-       ret = lock_taken = 0;
+       u32 uval, newval, vpid = task_pid_vnr(task);
+       struct futex_q *match;
+       int ret;
 
        /*
-        * To avoid races, we attempt to take the lock here again
-        * (by doing a 0 -> TID atomic cmpxchg), while holding all
-        * the locks. It will most likely not succeed.
+        * Read the user space value first so we can validate a few
+        * things before proceeding further.
         */
-       newval = vpid;
-       if (set_waiters)
-               newval |= FUTEX_WAITERS;
+       if (get_futex_value_locked(&uval, uaddr))
+               return -EFAULT;
 
-       if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
+       if (unlikely(should_fail_futex(true)))
                return -EFAULT;
 
        /*
         * Detect deadlocks.
         */
-       if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
+       if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
                return -EDEADLK;
 
-       /*
-        * Surprise - we got the lock. Just return to userspace:
-        */
-       if (unlikely(!curval))
-               return 1;
-
-       uval = curval;
+       if ((unlikely(should_fail_futex(true))))
+               return -EDEADLK;
 
        /*
-        * Set the FUTEX_WAITERS flag, so the owner will know it has someone
-        * to wake at the next unlock.
+        * Lookup existing state first. If it exists, try to attach to
+        * its pi_state.
         */
-       newval = curval | FUTEX_WAITERS;
+       match = futex_top_waiter(hb, key);
+       if (match)
+               return attach_to_pi_state(uval, match->pi_state, ps);
 
        /*
-        * Should we force take the futex? See below.
+        * No waiter and user TID is 0. We are here because the
+        * waiters or the owner died bit is set or called from
+        * requeue_cmp_pi or for whatever reason something took the
+        * syscall.
         */
-       if (unlikely(force_take)) {
+       if (!(uval & FUTEX_TID_MASK)) {
                /*
-                * Keep the OWNER_DIED and the WAITERS bit and set the
-                * new TID value.
+                * We take over the futex. No other waiters and the user space
+                * TID is 0. We preserve the owner died bit.
                 */
-               newval = (curval & ~FUTEX_TID_MASK) | vpid;
-               force_take = 0;
-               lock_taken = 1;
-       }
+               newval = uval & FUTEX_OWNER_DIED;
+               newval |= vpid;
 
-       if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
-               return -EFAULT;
-       if (unlikely(curval != uval))
-               goto retry;
+               /* The futex requeue_pi code can enforce the waiters bit */
+               if (set_waiters)
+                       newval |= FUTEX_WAITERS;
+
+               ret = lock_pi_update_atomic(uaddr, uval, newval);
+               /* If the take over worked, return 1 */
+               return ret < 0 ? ret : 1;
+       }
 
        /*
-        * We took the lock due to forced take over.
+        * First waiter. Set the waiters bit before attaching ourself to
+        * the owner. If owner tries to unlock, it will be forced into
+        * the kernel and blocked on hb->lock.
         */
-       if (unlikely(lock_taken))
-               return 1;
-
+       newval = uval | FUTEX_WAITERS;
+       ret = lock_pi_update_atomic(uaddr, uval, newval);
+       if (ret)
+               return ret;
        /*
-        * We dont have the lock. Look up the PI state (or create it if
-        * we are the first waiter):
+        * If the update of the user space value succeeded, we try to
+        * attach to the owner. If that fails, no harm done, we only
+        * set the FUTEX_WAITERS bit in the user space variable.
         */
-       ret = lookup_pi_state(uval, hb, key, ps);
-
-       if (unlikely(ret)) {
-               switch (ret) {
-               case -ESRCH:
-                       /*
-                        * We failed to find an owner for this
-                        * futex. So we have no pi_state to block
-                        * on. This can happen in two cases:
-                        *
-                        * 1) The owner died
-                        * 2) A stale FUTEX_WAITERS bit
-                        *
-                        * Re-read the futex value.
-                        */
-                       if (get_futex_value_locked(&curval, uaddr))
-                               return -EFAULT;
-
-                       /*
-                        * If the owner died or we have a stale
-                        * WAITERS bit the owner TID in the user space
-                        * futex is 0.
-                        */
-                       if (!(curval & FUTEX_TID_MASK)) {
-                               force_take = 1;
-                               goto retry;
-                       }
-               default:
-                       break;
-               }
-       }
-
-       return ret;
+       return attach_to_pi_owner(uval, key, ps);
 }
 
 /**
@@ -835,13 +1171,16 @@ static void __unqueue_futex(struct futex_q *q)
 
        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
        plist_del(&q->list, &hb->chain);
+       hb_waiters_dec(hb);
 }
 
 /*
  * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
  */
-static void wake_futex(struct futex_q *q)
+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
 {
        struct task_struct *p = q->task;
 
@@ -849,14 +1188,10 @@ static void wake_futex(struct futex_q *q)
                return;
 
        /*
-        * We set q->lock_ptr = NULL _before_ we wake up the task. If
-        * a non-futex wake up happens on another CPU then the task
-        * might exit and p would dereference a non-existing task
-        * struct. Prevent this by holding a reference on p across the
-        * wake up.
+        * Queue the task for wakeup after we've released the hb->lock.
+        * wake_q_add() grabs a reference to p.
         */
-       get_task_struct(p);
-
+       wake_q_add(wake_q, p);
        __unqueue_futex(q);
        /*
         * The waiting task can free the futex_q as soon as
@@ -866,16 +1201,17 @@ static void wake_futex(struct futex_q *q)
         */
        smp_wmb();
        q->lock_ptr = NULL;
-
-       wake_up_state(p, TASK_NORMAL);
-       put_task_struct(p);
 }
 
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
+                        struct futex_hash_bucket *hb)
 {
        struct task_struct *new_owner;
        struct futex_pi_state *pi_state = this->pi_state;
        u32 uninitialized_var(curval), newval;
+       WAKE_Q(wake_q);
+       bool deboost;
+       int ret = 0;
 
        if (!pi_state)
                return -EINVAL;
@@ -899,23 +1235,32 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
                new_owner = this->task;
 
        /*
-        * We pass it to the next owner. (The WAITERS bit is always
-        * kept enabled while there is PI state around. We must also
-        * preserve the owner died bit.)
+        * We pass it to the next owner. The WAITERS bit is always
+        * kept enabled while there is PI state around. We clean up the
+        * owner died bit, because we are the owner.
         */
-       if (!(uval & FUTEX_OWNER_DIED)) {
-               int ret = 0;
+       newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
-               newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+       if (unlikely(should_fail_futex(true)))
+               ret = -EFAULT;
 
-               if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
-                       ret = -EFAULT;
-               else if (curval != uval)
+       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
+               ret = -EFAULT;
+       } else if (curval != uval) {
+               /*
+                * If an unconditional UNLOCK_PI operation (user space did not
+                * try the TID->0 transition) raced with a waiter setting the
+                * FUTEX_WAITERS flag between get_user() and locking the hash
+                * bucket lock, retry the operation.
+                */
+               if ((FUTEX_TID_MASK & curval) == uval)
+                       ret = -EAGAIN;
+               else
                        ret = -EINVAL;
-               if (ret) {
-                       raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-                       return ret;
-               }
+       }
+       if (ret) {
+               raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+               return ret;
        }
 
        raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -930,23 +1275,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
        raw_spin_unlock_irq(&new_owner->pi_lock);
 
        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-       rt_mutex_unlock(&pi_state->pi_mutex);
 
-       return 0;
-}
-
-static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
-{
-       u32 uninitialized_var(oldval);
+       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
 
        /*
-        * There is no waiter, so we unlock the futex. The owner died
-        * bit has not to be preserved here. We are the owner:
+        * First unlock HB so the waiter does not spin on it once he got woken
+        * up. Second wake up the waiter before the priority is adjusted. If we
+        * deboost first (and lose our higher priority), then the task might get
+        * scheduled away before the wake up can take place.
         */
-       if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
-               return -EFAULT;
-       if (oldval != uval)
-               return -EAGAIN;
+       spin_unlock(&hb->lock);
+       wake_up_q(&wake_q);
+       if (deboost)
+               rt_mutex_adjust_prio(current);
 
        return 0;
 }
@@ -983,9 +1324,9 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
-       struct plist_head *head;
        union futex_key key = FUTEX_KEY_INIT;
        int ret;
+       WAKE_Q(wake_q);
 
        if (!bitset)
                return -EINVAL;
@@ -995,10 +1336,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
                goto out;
 
        hb = hash_futex(&key);
+
+       /* Make sure we really have tasks to wakeup */
+       if (!hb_waiters_pending(hb))
+               goto out_put_key;
+
        spin_lock(&hb->lock);
-       head = &hb->chain;
 
-       plist_for_each_entry_safe(this, next, head, list) {
+       plist_for_each_entry_safe(this, next, &hb->chain, list) {
                if (match_futex (&this->key, &key)) {
                        if (this->pi_state || this->rt_waiter) {
                                ret = -EINVAL;
@@ -1009,13 +1354,15 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
                        if (!(this->bitset & bitset))
                                continue;
 
-                       wake_futex(this);
+                       mark_wake_futex(&wake_q, this);
                        if (++ret >= nr_wake)
                                break;
                }
        }
 
        spin_unlock(&hb->lock);
+       wake_up_q(&wake_q);
+out_put_key:
        put_futex_key(&key);
 out:
        return ret;
@@ -1031,9 +1378,9 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 {
        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
        struct futex_hash_bucket *hb1, *hb2;
-       struct plist_head *head;
        struct futex_q *this, *next;
        int ret, op_ret;
+       WAKE_Q(wake_q);
 
 retry:
        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1079,31 +1426,27 @@ retry_private:
                goto retry;
        }
 
-       head = &hb1->chain;
-
-       plist_for_each_entry_safe(this, next, head, list) {
+       plist_for_each_entry_safe(this, next, &hb1->chain, list) {
                if (match_futex (&this->key, &key1)) {
                        if (this->pi_state || this->rt_waiter) {
                                ret = -EINVAL;
                                goto out_unlock;
                        }
-                       wake_futex(this);
+                       mark_wake_futex(&wake_q, this);
                        if (++ret >= nr_wake)
                                break;
                }
        }
 
        if (op_ret > 0) {
-               head = &hb2->chain;
-
                op_ret = 0;
-               plist_for_each_entry_safe(this, next, head, list) {
+               plist_for_each_entry_safe(this, next, &hb2->chain, list) {
                        if (match_futex (&this->key, &key2)) {
                                if (this->pi_state || this->rt_waiter) {
                                        ret = -EINVAL;
                                        goto out_unlock;
                                }
-                               wake_futex(this);
+                               mark_wake_futex(&wake_q, this);
                                if (++op_ret >= nr_wake2)
                                        break;
                        }
@@ -1113,6 +1456,7 @@ retry_private:
 
 out_unlock:
        double_unlock_hb(hb1, hb2);
+       wake_up_q(&wake_q);
 out_put_keys:
        put_futex_key(&key2);
 out_put_key1:
@@ -1139,6 +1483,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
         */
        if (likely(&hb1->chain != &hb2->chain)) {
                plist_del(&q->list, &hb1->chain);
+               hb_waiters_dec(hb1);
+               hb_waiters_inc(hb2);
                plist_add(&q->list, &hb2->chain);
                q->lock_ptr = &hb2->lock;
        }
@@ -1194,7 +1540,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  *
  * Return:
  *  0 - failed to acquire the lock atomically;
- *  1 - acquired the lock;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
  * <0 - error
  */
 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1205,11 +1551,14 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 {
        struct futex_q *top_waiter = NULL;
        u32 curval;
-       int ret;
+       int ret, vpid;
 
        if (get_futex_value_locked(&curval, pifutex))
                return -EFAULT;
 
+       if (unlikely(should_fail_futex(true)))
+               return -EFAULT;
+
        /*
         * Find the top_waiter and determine if there are additional waiters.
         * If the caller intends to requeue more than 1 waiter to pifutex,
@@ -1233,11 +1582,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
         * the contended case or if set_waiters is 1.  The pi_state is returned
         * in ps in contended cases.
         */
+       vpid = task_pid_vnr(top_waiter->task);
        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
                                   set_waiters);
-       if (ret == 1)
+       if (ret == 1) {
                requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+               return vpid;
+       }
        return ret;
 }
 
@@ -1267,11 +1618,17 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
        int drop_count = 0, task_count = 0, ret;
        struct futex_pi_state *pi_state = NULL;
        struct futex_hash_bucket *hb1, *hb2;
-       struct plist_head *head1;
        struct futex_q *this, *next;
-       u32 curval2;
+       WAKE_Q(wake_q);
 
        if (requeue_pi) {
+               /*
+                * Requeue PI only works on two distinct uaddrs. This
+                * check is only valid for private futexes. See below.
+                */
+               if (uaddr1 == uaddr2)
+                       return -EINVAL;
+
                /*
                 * requeue_pi requires a pi_state, try to allocate it now
                 * without any locks in case it fails.
@@ -1293,15 +1650,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
        }
 
 retry:
-       if (pi_state != NULL) {
-               /*
-                * We will have to lookup the pi_state again, so free this one
-                * to keep the accounting correct.
-                */
-               free_pi_state(pi_state);
-               pi_state = NULL;
-       }
-
        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
        if (unlikely(ret != 0))
                goto out;
@@ -1310,10 +1658,20 @@ retry:
        if (unlikely(ret != 0))
                goto out_put_key1;
 
+       /*
+        * The check above which compares uaddrs is not sufficient for
+        * shared futexes. We need to compare the keys:
+        */
+       if (requeue_pi && match_futex(&key1, &key2)) {
+               ret = -EINVAL;
+               goto out_put_keys;
+       }
+
        hb1 = hash_futex(&key1);
        hb2 = hash_futex(&key2);
 
 retry_private:
+       hb_waiters_inc(hb2);
        double_lock_hb(hb1, hb2);
 
        if (likely(cmpval != NULL)) {
@@ -1323,6 +1681,7 @@ retry_private:
 
                if (unlikely(ret)) {
                        double_unlock_hb(hb1, hb2);
+                       hb_waiters_dec(hb2);
 
                        ret = get_user(curval, uaddr1);
                        if (ret)
@@ -1355,23 +1714,35 @@ retry_private:
                 * At this point the top_waiter has either taken uaddr2 or is
                 * waiting on it.  If the former, then the pi_state will not
                 * exist yet, look it up one more time to ensure we have a
-                * reference to it.
+                * reference to it. If the lock was taken, ret contains the
+                * vpid of the top waiter task.
                 */
-               if (ret == 1) {
+               if (ret > 0) {
                        WARN_ON(pi_state);
                        drop_count++;
                        task_count++;
-                       ret = get_futex_value_locked(&curval2, uaddr2);
-                       if (!ret)
-                               ret = lookup_pi_state(curval2, hb2, &key2,
-                                                     &pi_state);
+                       /*
+                        * If we acquired the lock, then the user
+                        * space value of uaddr2 should be vpid. It
+                        * cannot be changed by the top waiter as it
+                        * is blocked on hb2 lock if it tries to do
+                        * so. If something fiddled with it behind our
+                        * back the pi state lookup might unearth
+                        * it. So we rather use the known value than
+                        * rereading and handing potential crap to
+                        * lookup_pi_state.
+                        */
+                       ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
                }
 
                switch (ret) {
                case 0:
                        break;
                case -EFAULT:
+                       free_pi_state(pi_state);
+                       pi_state = NULL;
                        double_unlock_hb(hb1, hb2);
+                       hb_waiters_dec(hb2);
                        put_futex_key(&key2);
                        put_futex_key(&key1);
                        ret = fault_in_user_writeable(uaddr2);
@@ -1379,8 +1750,16 @@ retry_private:
                                goto retry;
                        goto out;
                case -EAGAIN:
-                       /* The owner was exiting, try again. */
+                       /*
+                        * Two reasons for this:
+                        * - Owner is exiting and we just wait for the
+                        *   exit to complete.
+                        * - The user space value changed.
+                        */
+                       free_pi_state(pi_state);
+                       pi_state = NULL;
                        double_unlock_hb(hb1, hb2);
+                       hb_waiters_dec(hb2);
                        put_futex_key(&key2);
                        put_futex_key(&key1);
                        cond_resched();
@@ -1390,8 +1769,7 @@ retry_private:
                }
        }
 
-       head1 = &hb1->chain;
-       plist_for_each_entry_safe(this, next, head1, list) {
+       plist_for_each_entry_safe(this, next, &hb1->chain, list) {
                if (task_count - nr_wake >= nr_requeue)
                        break;
 
@@ -1418,7 +1796,7 @@ retry_private:
                 * woken by futex_unlock_pi().
                 */
                if (++task_count <= nr_wake && !requeue_pi) {
-                       wake_futex(this);
+                       mark_wake_futex(&wake_q, this);
                        continue;
                }
 
@@ -1438,7 +1816,7 @@ retry_private:
                        this->pi_state = pi_state;
                        ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
                                                        this->rt_waiter,
-                                                       this->task, 1);
+                                                       this->task);
                        if (ret == 1) {
                                /* We got the lock. */
                                requeue_pi_wake_futex(this, &key2, hb2);
@@ -1456,7 +1834,10 @@ retry_private:
        }
 
 out_unlock:
+       free_pi_state(pi_state);
        double_unlock_hb(hb1, hb2);
+       wake_up_q(&wake_q);
+       hb_waiters_dec(hb2);
 
        /*
         * drop_futex_key_refs() must be called outside the spinlocks. During
@@ -1472,8 +1853,6 @@ out_put_keys:
 out_put_key1:
        put_futex_key(&key1);
 out:
-       if (pi_state != NULL)
-               free_pi_state(pi_state);
        return ret ? ret : task_count;
 }
 
@@ -1484,17 +1863,29 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
        struct futex_hash_bucket *hb;
 
        hb = hash_futex(&q->key);
+
+       /*
+        * Increment the counter before taking the lock so that
+        * a potential waker won't miss a to-be-slept task that is
+        * waiting for the spinlock. This is safe as all queue_lock()
+        * users end up calling queue_me(). Similarly, for housekeeping,
+        * decrement the counter at queue_unlock() when some error has
+        * occurred and we don't end up adding the task to the list.
+        */
+       hb_waiters_inc(hb);
+
        q->lock_ptr = &hb->lock;
 
-       spin_lock(&hb->lock);
+       spin_lock(&hb->lock); /* implies MB (A) */
        return hb;
 }
 
 static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+queue_unlock(struct futex_hash_bucket *hb)
        __releases(&hb->lock)
 {
        spin_unlock(&hb->lock);
+       hb_waiters_dec(hb);
 }
 
 /**
@@ -1782,7 +2173,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 {
        /*
         * The task state is guaranteed to be set before another task can
-        * wake it. set_current_state() is implemented using set_mb() and
+        * wake it. set_current_state() is implemented using smp_store_mb() and
         * queue_me() calls spin_unlock() upon completion, both serializing
         * access to the hash list and forcing another memory barrier.
         */
@@ -1790,11 +2181,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
        queue_me(q, hb);
 
        /* Arm the timer */
-       if (timeout) {
+       if (timeout)
                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-               if (!hrtimer_active(&timeout->timer))
-                       timeout->task = NULL;
-       }
 
        /*
         * If we have been removed from the hash list, then another task
@@ -1807,7 +2195,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
                 * is no timeout, or if it has yet to expire.
                 */
                if (!timeout || timeout->task)
-                       schedule();
+                       freezable_schedule();
        }
        __set_current_state(TASK_RUNNING);
 }
@@ -1864,7 +2252,7 @@ retry_private:
        ret = get_futex_value_locked(&uval, uaddr);
 
        if (ret) {
-               queue_unlock(q, *hb);
+               queue_unlock(*hb);
 
                ret = get_user(uval, uaddr);
                if (ret)
@@ -1878,7 +2266,7 @@ retry_private:
        }
 
        if (uval != val) {
-               queue_unlock(q, *hb);
+               queue_unlock(*hb);
                ret = -EWOULDBLOCK;
        }
 
@@ -1944,7 +2332,7 @@ retry:
        if (!abs_time)
                goto out;
 
-       restart = &current_thread_info()->restart_block;
+       restart = &current->restart_block;
        restart->fn = futex_wait_restart;
        restart->futex.uaddr = uaddr;
        restart->futex.val = val;
@@ -1982,10 +2370,13 @@ static long futex_wait_restart(struct restart_block *restart)
 /*
  * Userspace tried a 0 -> TID atomic transition of the futex value
  * and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block, it does PI, etc. (Due to
- * races the kernel might see a 0 value of the futex too.)
+ * if there are waiters then it will block as a consequence of relying
+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
+ * a 0 value of the futex too.)
+ *
+ * Also implements futex trylock_pi(), with the corresponding semantics.
  */
-static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
                         ktime_t *time, int trylock)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
@@ -2014,6 +2405,10 @@ retry_private:
 
        ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
        if (unlikely(ret)) {
+               /*
+                * Atomic work succeeded and we got the lock,
+                * or failed. Either way, we do _not_ block.
+                */
                switch (ret) {
                case 1:
                        /* We got the lock. */
@@ -2023,10 +2418,12 @@ retry_private:
                        goto uaddr_faulted;
                case -EAGAIN:
                        /*
-                        * Task is exiting and we just wait for the
-                        * exit to complete.
+                        * Two reasons for this:
+                        * - Task is exiting and we just wait for the
+                        *   exit to complete.
+                        * - The user space value changed.
                         */
-                       queue_unlock(&q, hb);
+                       queue_unlock(hb);
                        put_futex_key(&q.key);
                        cond_resched();
                        goto retry;
@@ -2044,9 +2441,9 @@ retry_private:
        /*
         * Block on the PI mutex:
         */
-       if (!trylock)
-               ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
-       else {
+       if (!trylock) {
+               ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+       } else {
                ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
                /* Fixup the trylock return value: */
                ret = ret ? 0 : -EWOULDBLOCK;
@@ -2078,7 +2475,7 @@ retry_private:
        goto out_put_key;
 
 out_unlock_put_key:
-       queue_unlock(&q, hb);
+       queue_unlock(hb);
 
 out_put_key:
        put_futex_key(&q.key);
@@ -2088,7 +2485,7 @@ out:
        return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
-       queue_unlock(&q, hb);
+       queue_unlock(hb);
 
        ret = fault_in_user_writeable(uaddr);
        if (ret)
@@ -2108,11 +2505,10 @@ uaddr_faulted:
  */
 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
-       struct futex_hash_bucket *hb;
-       struct futex_q *this, *next;
-       struct plist_head *head;
+       u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
        union futex_key key = FUTEX_KEY_INIT;
-       u32 uval, vpid = task_pid_vnr(current);
+       struct futex_hash_bucket *hb;
+       struct futex_q *match;
        int ret;
 
 retry:
@@ -2125,60 +2521,67 @@ retry:
                return -EPERM;
 
        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
-       if (unlikely(ret != 0))
-               goto out;
+       if (ret)
+               return ret;
 
        hb = hash_futex(&key);
        spin_lock(&hb->lock);
 
        /*
-        * To avoid races, try to do the TID -> 0 atomic transition
-        * again. If it succeeds then we can return without waking
-        * anyone else up:
-        */
-       if (!(uval & FUTEX_OWNER_DIED) &&
-           cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
-               goto pi_faulted;
-       /*
-        * Rare case: we managed to release the lock atomically,
-        * no need to wake anyone else up:
-        */
-       if (unlikely(uval == vpid))
-               goto out_unlock;
-
-       /*
-        * Ok, other tasks may need to be woken up - check waiters
-        * and do the wakeup if necessary:
+        * Check waiters first. We do not trust user space values at
+        * all and we at least want to know if user space fiddled
+        * with the futex value instead of blindly unlocking.
         */
-       head = &hb->chain;
-
-       plist_for_each_entry_safe(this, next, head, list) {
-               if (!match_futex (&this->key, &key))
-                       continue;
-               ret = wake_futex_pi(uaddr, uval, this);
+       match = futex_top_waiter(hb, &key);
+       if (match) {
+               ret = wake_futex_pi(uaddr, uval, match, hb);
                /*
-                * The atomic access to the futex value
-                * generated a pagefault, so retry the
-                * user-access and the wakeup:
+                * In case of success wake_futex_pi dropped the hash
+                * bucket lock.
+                */
+               if (!ret)
+                       goto out_putkey;
+               /*
+                * The atomic access to the futex value generated a
+                * pagefault, so retry the user-access and the wakeup:
                 */
                if (ret == -EFAULT)
                        goto pi_faulted;
+               /*
+                * An unconditional UNLOCK_PI op raced against a waiter
+                * setting the FUTEX_WAITERS bit. Try again.
+                */
+               if (ret == -EAGAIN) {
+                       spin_unlock(&hb->lock);
+                       put_futex_key(&key);
+                       goto retry;
+               }
+               /*
+                * wake_futex_pi has detected invalid state. Tell user
+                * space.
+                */
                goto out_unlock;
        }
+
        /*
-        * No waiters - kernel unlocks the futex:
+        * We have no kernel internal state, i.e. no waiters in the
+        * kernel. Waiters which are about to queue themselves are stuck
+        * on hb->lock. So we can safely ignore them. We preserve
+        * neither the WAITERS bit nor the OWNER_DIED one. We are the
+        * owner.
         */
-       if (!(uval & FUTEX_OWNER_DIED)) {
-               ret = unlock_futex_pi(uaddr, uval);
-               if (ret == -EFAULT)
-                       goto pi_faulted;
-       }
+       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+               goto pi_faulted;
+
+       /*
+        * If uval has changed, let user space handle it.
+        */
+       ret = (curval == uval) ? 0 : -EAGAIN;
 
 out_unlock:
        spin_unlock(&hb->lock);
+out_putkey:
        put_futex_key(&key);
-
-out:
        return ret;
 
 pi_faulted:
@@ -2229,6 +2632,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
                 * Unqueue the futex_q and determine which it was.
                 */
                plist_del(&q->list, &hb->chain);
+               hb_waiters_dec(hb);
 
                /* Handle spurious wakeups gracefully */
                ret = -EWOULDBLOCK;
@@ -2244,7 +2648,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
  * @uaddr:     the futex we initially wait on (non-pi)
  * @flags:     futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
- *             the same type, no requeueing from private to shared, etc.
+ *             the same type, no requeueing from private to shared, etc.
  * @val:       the expected value of uaddr
  * @abs_time:  absolute timeout
  * @bitset:    32 bit wakeup bitset set by userspace, defaults to all
@@ -2313,6 +2717,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
         * code while we sleep on uaddr.
         */
        debug_rt_mutex_init_waiter(&rt_waiter);
+       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+       RB_CLEAR_NODE(&rt_waiter.tree_entry);
        rt_waiter.task = NULL;
 
        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -2331,6 +2737,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
        if (ret)
                goto out_key2;
 
+       /*
+        * The check above which compares uaddrs is not sufficient for
+        * shared futexes. We need to compare the keys:
+        */
+       if (match_futex(&q.key, &key2)) {
+               queue_unlock(hb);
+               ret = -EINVAL;
+               goto out_put_keys;
+       }
+
        /* Queue the futex_q, drop the hb lock, wait for wakeup. */
        futex_wait_queue_me(hb, &q, to);
 
@@ -2358,6 +2774,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                if (q.pi_state && (q.pi_state->owner != current)) {
                        spin_lock(q.lock_ptr);
                        ret = fixup_pi_state_owner(uaddr2, &q, current);
+                       /*
+                        * Drop the reference to the pi state which
+                        * the requeue_pi() code acquired for us.
+                        */
+                       free_pi_state(q.pi_state);
                        spin_unlock(q.lock_ptr);
                }
        } else {
@@ -2368,7 +2789,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                 */
                WARN_ON(!q.pi_state);
                pi_mutex = &q.pi_state->pi_mutex;
-               ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+               ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
                debug_rt_mutex_free_waiter(&rt_waiter);
 
                spin_lock(q.lock_ptr);
@@ -2484,7 +2905,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
        }
 
        ret = -EPERM;
-       if (!ptrace_may_access(p, PTRACE_MODE_READ))
+       if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
                goto err_unlock;
 
        head = p->robust_list;
@@ -2679,11 +3100,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
        case FUTEX_WAKE_OP:
                return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
        case FUTEX_LOCK_PI:
-               return futex_lock_pi(uaddr, flags, val, timeout, 0);
+               return futex_lock_pi(uaddr, flags, timeout, 0);
        case FUTEX_UNLOCK_PI:
                return futex_unlock_pi(uaddr, flags);
        case FUTEX_TRYLOCK_PI:
-               return futex_lock_pi(uaddr, flags, 0, timeout, 1);
+               return futex_lock_pi(uaddr, flags, NULL, 1);
        case FUTEX_WAIT_REQUEUE_PI:
                val3 = FUTEX_BITSET_MATCH_ANY;
                return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
@@ -2707,6 +3128,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
                      cmd == FUTEX_WAIT_BITSET ||
                      cmd == FUTEX_WAIT_REQUEUE_PI)) {
+               if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
+                       return -EFAULT;
                if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
                        return -EFAULT;
                if (!timespec_valid(&ts))
@@ -2728,10 +3151,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
 {
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
        u32 curval;
-       int i;
 
        /*
         * This will fail and we want it. Some arch implementations do
@@ -2745,8 +3168,31 @@ static int __init futex_init(void)
         */
        if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
                futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
+       unsigned int futex_shift;
+       unsigned long i;
+
+#if CONFIG_BASE_SMALL
+       futex_hashsize = 16;
+#else
+       futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+       futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+                                              futex_hashsize, 0,
+                                              futex_hashsize < 256 ? HASH_SMALL : 0,
+                                              &futex_shift, NULL,
+                                              futex_hashsize, futex_hashsize);
+       futex_hashsize = 1UL << futex_shift;
+
+       futex_detect_cmpxchg();
 
-       for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+       for (i = 0; i < futex_hashsize; i++) {
+               atomic_set(&futex_queues[i].waiters, 0);
                plist_head_init(&futex_queues[i].chain);
                spin_lock_init(&futex_queues[i].lock);
        }