1 /*
2  *  Fast Userspace Mutexes (which I call "Futexes!").
3  *  (C) Rusty Russell, IBM 2002
4  *
5  *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
6  *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
7  *
8  *  Removed page pinning, fixed privately mapped COW pages and other cleanups
9  *  (C) Copyright 2003, 2004 Jamie Lokier
10  *
11  *  Robust futex support started by Ingo Molnar
12  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14  *
15  *  PI-futex support started by Ingo Molnar and Thomas Gleixner
16  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17  *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18  *
19  *  PRIVATE futexes by Eric Dumazet
20  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21  *
22  *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23  *  Copyright (C) IBM Corporation, 2009
24  *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
25  *
26  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
27  *  enough at me, Linus for the original (flawed) idea, Matthew
28  *  Kirkwood for proof-of-concept implementation.
29  *
30  *  "The futexes are also cursed."
31  *  "But they come in a choice of three flavours!"
32  *
33  *  This program is free software; you can redistribute it and/or modify
34  *  it under the terms of the GNU General Public License as published by
35  *  the Free Software Foundation; either version 2 of the License, or
36  *  (at your option) any later version.
37  *
38  *  This program is distributed in the hope that it will be useful,
39  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
40  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
41  *  GNU General Public License for more details.
42  *
43  *  You should have received a copy of the GNU General Public License
44  *  along with this program; if not, write to the Free Software
45  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
46  */
47 #include <linux/slab.h>
48 #include <linux/poll.h>
49 #include <linux/fs.h>
50 #include <linux/file.h>
51 #include <linux/jhash.h>
52 #include <linux/init.h>
53 #include <linux/futex.h>
54 #include <linux/mount.h>
55 #include <linux/pagemap.h>
56 #include <linux/syscalls.h>
57 #include <linux/signal.h>
58 #include <linux/export.h>
59 #include <linux/magic.h>
60 #include <linux/pid.h>
61 #include <linux/nsproxy.h>
62 #include <linux/ptrace.h>
63 #include <linux/sched/rt.h>
64 #include <linux/hugetlb.h>
65 #include <linux/freezer.h>
66
67 #include <asm/futex.h>
68
69 #include "rtmutex_common.h"
70
71 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
72 int __read_mostly futex_cmpxchg_enabled;
73 #endif
74
75 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
76
77 /*
78  * Futex flags used to encode options to functions and preserve them across
79  * restarts.
80  */
81 #define FLAGS_SHARED            0x01
82 #define FLAGS_CLOCKRT           0x02
83 #define FLAGS_HAS_TIMEOUT       0x04
84
85 /*
86  * Priority Inheritance state:
87  */
88 struct futex_pi_state {
89         /*
90          * list of 'owned' pi_state instances - these have to be
91          * cleaned up in do_exit() if the task exits prematurely:
92          */
93         struct list_head list;
94
95         /*
96          * The PI object:
97          */
98         struct rt_mutex pi_mutex;
99
100         struct task_struct *owner;
101         atomic_t refcount;
102
103         union futex_key key;
104 };
105
106 /**
107  * struct futex_q - The hashed futex queue entry, one per waiting task
108  * @list:               priority-sorted list of tasks waiting on this futex
109  * @task:               the task waiting on the futex
110  * @lock_ptr:           the hash bucket lock
111  * @key:                the key the futex is hashed on
112  * @pi_state:           optional priority inheritance state
113  * @rt_waiter:          rt_waiter storage for use with requeue_pi
114  * @requeue_pi_key:     the requeue_pi target futex key
115  * @bitset:             bitset for the optional bitmasked wakeup
116  *
117  * We use this hashed waitqueue, instead of a normal wait_queue_t, so
118  * we can wake only the relevant ones (hashed queues may be shared).
119  *
120  * A futex_q has a woken state, just like tasks have TASK_RUNNING.
121  * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
122  * The order of wakeup is always to make the first condition true, then
123  * the second.
124  *
125  * PI futexes are typically woken before they are removed from the hash list via
126  * the rt_mutex code. See unqueue_me_pi().
127  */
128 struct futex_q {
129         struct plist_node list;
130
131         struct task_struct *task;
132         spinlock_t *lock_ptr;
133         union futex_key key;
134         struct futex_pi_state *pi_state;
135         struct rt_mutex_waiter *rt_waiter;
136         union futex_key *requeue_pi_key;
137         u32 bitset;
138 };
139
140 static const struct futex_q futex_q_init = {
141         /* list gets initialized in queue_me() */
142         .key = FUTEX_KEY_INIT,
143         .bitset = FUTEX_BITSET_MATCH_ANY
144 };
145
146 /*
147  * Hash buckets are shared by all the futex_keys that hash to the same
148  * location.  Each key may have multiple futex_q structures, one for each task
149  * waiting on a futex.
150  */
151 struct futex_hash_bucket {
152         spinlock_t lock;
153         struct plist_head chain;
154 };
155
156 static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
157
158 /*
159  * We hash on the keys returned from get_futex_key (see below).
160  */
161 static struct futex_hash_bucket *hash_futex(union futex_key *key)
162 {
163         u32 hash = jhash2((u32*)&key->both.word,
164                           (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
165                           key->both.offset);
166         return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
167 }
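/*
 * A minimal sketch of the bucket selection above, assuming the common
 * !CONFIG_BASE_SMALL case where FUTEX_HASHBITS is 8 (256 buckets).  The
 * 0xdeadbeef value stands in for the jhash2() result over the futex key;
 * jhash2() is kernel-internal, so this block is illustrative only and is
 * not compiled as part of this file.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_HASHBITS 8	/* mirrors FUTEX_HASHBITS when !CONFIG_BASE_SMALL */

/* Collapse a 32-bit hash to one of 1 << EXAMPLE_HASHBITS bucket indices. */
static unsigned int example_bucket(uint32_t hash)
{
	return hash & ((1u << EXAMPLE_HASHBITS) - 1);
}

int main(void)
{
	printf("bucket %u of %u\n", example_bucket(0xdeadbeef),
	       1u << EXAMPLE_HASHBITS);
	return 0;
}
#endif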
168
169 /*
170  * Return 1 if two futex_keys are equal, 0 otherwise.
171  */
172 static inline int match_futex(union futex_key *key1, union futex_key *key2)
173 {
174         return (key1 && key2
175                 && key1->both.word == key2->both.word
176                 && key1->both.ptr == key2->both.ptr
177                 && key1->both.offset == key2->both.offset);
178 }
179
180 /*
181  * Take a reference to the resource addressed by a key.
182  * Can be called while holding spinlocks.
183  *
184  */
185 static void get_futex_key_refs(union futex_key *key)
186 {
187         if (!key->both.ptr)
188                 return;
189
190         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
191         case FUT_OFF_INODE:
192                 ihold(key->shared.inode);
193                 break;
194         case FUT_OFF_MMSHARED:
195                 atomic_inc(&key->private.mm->mm_count);
196                 break;
197         }
198 }
199
200 /*
201  * Drop a reference to the resource addressed by a key.
202  * The hash bucket spinlock must not be held.
203  */
204 static void drop_futex_key_refs(union futex_key *key)
205 {
206         if (!key->both.ptr) {
207                 /* If we're here then we tried to put a key we failed to get */
208                 WARN_ON_ONCE(1);
209                 return;
210         }
211
212         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
213         case FUT_OFF_INODE:
214                 iput(key->shared.inode);
215                 break;
216         case FUT_OFF_MMSHARED:
217                 mmdrop(key->private.mm);
218                 break;
219         }
220 }
221
222 /**
223  * get_futex_key() - Get parameters which are the keys for a futex
224  * @uaddr:      virtual address of the futex
225  * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
226  * @key:        address where result is stored.
227  * @rw:         mapping needs to be read/write (values: VERIFY_READ,
228  *              VERIFY_WRITE)
229  *
230  * Return: a negative error code or 0
231  *
232  * The key words are stored in *key on success.
233  *
234  * For shared mappings, it's (page->index, file_inode(vma->vm_file),
235  * offset_within_page).  For private mappings, it's (uaddr, current->mm).
236  * We can usually work out the index without swapping in the page.
237  *
238  * lock_page() might sleep, the caller should not hold a spinlock.
239  */
240 static int
241 get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
242 {
243         unsigned long address = (unsigned long)uaddr;
244         struct mm_struct *mm = current->mm;
245         struct page *page, *page_head;
246         int err, ro = 0;
247
248         /*
249          * The futex address must be "naturally" aligned.
250          */
251         key->both.offset = address % PAGE_SIZE;
252         if (unlikely((address % sizeof(u32)) != 0))
253                 return -EINVAL;
254         address -= key->both.offset;
255
256         /*
257          * PROCESS_PRIVATE futexes are fast.
258          * As the mm cannot disappear under us and the 'key' only needs the
259          * virtual address, we don't even have to find the underlying vma.
260          * Note: We do have to check that 'uaddr' is a valid user address,
261          *       but access_ok() should be faster than find_vma().
262          */
263         if (!fshared) {
264                 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
265                         return -EFAULT;
266                 key->private.mm = mm;
267                 key->private.address = address;
268                 get_futex_key_refs(key);
269                 return 0;
270         }
271
272 again:
273         err = get_user_pages_fast(address, 1, 1, &page);
274         /*
275          * If write access is not required (e.g. FUTEX_WAIT), try
276          * and get read-only access.
277          */
278         if (err == -EFAULT && rw == VERIFY_READ) {
279                 err = get_user_pages_fast(address, 1, 0, &page);
280                 ro = 1;
281         }
282         if (err < 0)
283                 return err;
284         else
285                 err = 0;
286
287 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
288         page_head = page;
289         if (unlikely(PageTail(page))) {
290                 put_page(page);
291                 /* serialize against __split_huge_page_splitting() */
292                 local_irq_disable();
293                 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
294                         page_head = compound_head(page);
295                         /*
296                          * page_head is a valid pointer, but we must pin
297                          * it before taking the PG_lock and/or
298                          * PG_compound_lock. The moment we re-enable
299                          * irqs __split_huge_page_splitting() can
300                          * return and the head page can be freed from
301                          * under us. We can't take the PG_lock and/or
302                          * PG_compound_lock on a page that could be
303                          * freed from under us.
304                          */
305                         if (page != page_head) {
306                                 get_page(page_head);
307                                 put_page(page);
308                         }
309                         local_irq_enable();
310                 } else {
311                         local_irq_enable();
312                         goto again;
313                 }
314         }
315 #else
316         page_head = compound_head(page);
317         if (page != page_head) {
318                 get_page(page_head);
319                 put_page(page);
320         }
321 #endif
322
323         lock_page(page_head);
324
325         /*
326          * If page_head->mapping is NULL, then it cannot be a PageAnon
327          * page; but it might be the ZERO_PAGE or in the gate area or
328          * in a special mapping (all cases which we are happy to fail);
329          * or it may have been a good file page when get_user_pages_fast
330          * found it, but truncated or holepunched or subjected to
331          * invalidate_complete_page2 before we got the page lock (also
332          * cases which we are happy to fail).  And we hold a reference,
333          * so refcount care in invalidate_complete_page's remove_mapping
334          * prevents drop_caches from setting mapping to NULL beneath us.
335          *
336          * The case we do have to guard against is when memory pressure made
337          * shmem_writepage move it from filecache to swapcache beneath us:
338          * an unlikely race, but we do need to retry for page_head->mapping.
339          */
340         if (!page_head->mapping) {
341                 int shmem_swizzled = PageSwapCache(page_head);
342                 unlock_page(page_head);
343                 put_page(page_head);
344                 if (shmem_swizzled)
345                         goto again;
346                 return -EFAULT;
347         }
348
349         /*
350          * Private mappings are handled in a simple way.
351          *
352          * NOTE: When userspace waits on a MAP_SHARED mapping, even if
353          * it's a read-only handle, it's expected that futexes attach to
354          * the object, not the particular process.
355          */
356         if (PageAnon(page_head)) {
357                 /*
358                  * A RO anonymous page will never change and thus doesn't make
359                  * sense for futex operations.
360                  */
361                 if (ro) {
362                         err = -EFAULT;
363                         goto out;
364                 }
365
366                 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
367                 key->private.mm = mm;
368                 key->private.address = address;
369         } else {
370                 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
371                 key->shared.inode = page_head->mapping->host;
372                 key->shared.pgoff = basepage_index(page);
373         }
374
375         get_futex_key_refs(key);
376
377 out:
378         unlock_page(page_head);
379         put_page(page_head);
380         return err;
381 }
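/*
 * The fast path above is what FUTEX_PRIVATE_FLAG buys: private futexes key
 * on (mm, address) without pinning the page, while shared ones fall through
 * to the inode/pgoff path.  Below is a hedged userspace sketch of how a
 * caller opts into the private variants; there is no glibc wrapper, so the
 * raw syscall is used.  Illustrative only, not compiled as part of this file.
 */
#if 0
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

/* Wait while *uaddr == expected, using the cheap process-private key. */
static long futex_wait_private(uint32_t *uaddr, uint32_t expected)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, expected,
		       NULL, NULL, 0);
}

/* Wake up to nr_wake waiters queued on the same private key. */
static long futex_wake_private(uint32_t *uaddr, int nr_wake)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, nr_wake,
		       NULL, NULL, 0);
}
#endif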
382
383 static inline void put_futex_key(union futex_key *key)
384 {
385         drop_futex_key_refs(key);
386 }
387
388 /**
389  * fault_in_user_writeable() - Fault in user address and verify RW access
390  * @uaddr:      pointer to faulting user space address
391  *
392  * Slow path to fixup the fault we just took in the atomic write
393  * access to @uaddr.
394  *
395  * We have no generic implementation of a non-destructive write to the
396  * user address. We know that we faulted in the atomic, pagefault-disabled
397  * section, so we might as well avoid the #PF overhead by calling
398  * fixup_user_fault() right away.
399  */
400 static int fault_in_user_writeable(u32 __user *uaddr)
401 {
402         struct mm_struct *mm = current->mm;
403         int ret;
404
405         down_read(&mm->mmap_sem);
406         ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
407                                FAULT_FLAG_WRITE);
408         up_read(&mm->mmap_sem);
409
410         return ret < 0 ? ret : 0;
411 }
412
413 /**
414  * futex_top_waiter() - Return the highest priority waiter on a futex
415  * @hb:         the hash bucket the futex_q's reside in
416  * @key:        the futex key (to distinguish it from other futex_q's)
417  *
418  * Must be called with the hb lock held.
419  */
420 static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
421                                         union futex_key *key)
422 {
423         struct futex_q *this;
424
425         plist_for_each_entry(this, &hb->chain, list) {
426                 if (match_futex(&this->key, key))
427                         return this;
428         }
429         return NULL;
430 }
431
432 static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
433                                       u32 uval, u32 newval)
434 {
435         int ret;
436
437         pagefault_disable();
438         ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
439         pagefault_enable();
440
441         return ret;
442 }
443
444 static int get_futex_value_locked(u32 *dest, u32 __user *from)
445 {
446         int ret;
447
448         pagefault_disable();
449         ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
450         pagefault_enable();
451
452         return ret ? -EFAULT : 0;
453 }
454
455
456 /*
457  * PI code:
458  */
459 static int refill_pi_state_cache(void)
460 {
461         struct futex_pi_state *pi_state;
462
463         if (likely(current->pi_state_cache))
464                 return 0;
465
466         pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
467
468         if (!pi_state)
469                 return -ENOMEM;
470
471         INIT_LIST_HEAD(&pi_state->list);
472         /* pi_mutex gets initialized later */
473         pi_state->owner = NULL;
474         atomic_set(&pi_state->refcount, 1);
475         pi_state->key = FUTEX_KEY_INIT;
476
477         current->pi_state_cache = pi_state;
478
479         return 0;
480 }
481
482 static struct futex_pi_state *alloc_pi_state(void)
483 {
484         struct futex_pi_state *pi_state = current->pi_state_cache;
485
486         WARN_ON(!pi_state);
487         current->pi_state_cache = NULL;
488
489         return pi_state;
490 }
491
492 static void free_pi_state(struct futex_pi_state *pi_state)
493 {
494         if (!atomic_dec_and_test(&pi_state->refcount))
495                 return;
496
497         /*
498          * If pi_state->owner is NULL, the owner is most probably dying
499          * and has cleaned up the pi_state already
500          */
501         if (pi_state->owner) {
502                 raw_spin_lock_irq(&pi_state->owner->pi_lock);
503                 list_del_init(&pi_state->list);
504                 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
505
506                 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
507         }
508
509         if (current->pi_state_cache)
510                 kfree(pi_state);
511         else {
512                 /*
513                  * pi_state->list is already empty.
514                  * clear pi_state->owner.
515                  * refcount is at 0 - put it back to 1.
516                  */
517                 pi_state->owner = NULL;
518                 atomic_set(&pi_state->refcount, 1);
519                 current->pi_state_cache = pi_state;
520         }
521 }
522
523 /*
524  * Look up the task based on what TID userspace gave us.
525  * We don't trust it.
526  */
527 static struct task_struct *futex_find_get_task(pid_t pid)
528 {
529         struct task_struct *p;
530
531         rcu_read_lock();
532         p = find_task_by_vpid(pid);
533         if (p)
534                 get_task_struct(p);
535
536         rcu_read_unlock();
537
538         return p;
539 }
540
541 /*
542  * This task is holding PI mutexes at exit time => bad.
543  * Kernel cleans up PI-state, but userspace is likely hosed.
544  * (Robust-futex cleanup is separate and might save the day for userspace.)
545  */
546 void exit_pi_state_list(struct task_struct *curr)
547 {
548         struct list_head *next, *head = &curr->pi_state_list;
549         struct futex_pi_state *pi_state;
550         struct futex_hash_bucket *hb;
551         union futex_key key = FUTEX_KEY_INIT;
552
553         if (!futex_cmpxchg_enabled)
554                 return;
555         /*
556          * We are a ZOMBIE and nobody can enqueue itself on
557          * pi_state_list anymore, but we have to be careful
558          * versus waiters unqueueing themselves:
559          */
560         raw_spin_lock_irq(&curr->pi_lock);
561         while (!list_empty(head)) {
562
563                 next = head->next;
564                 pi_state = list_entry(next, struct futex_pi_state, list);
565                 key = pi_state->key;
566                 hb = hash_futex(&key);
567                 raw_spin_unlock_irq(&curr->pi_lock);
568
569                 spin_lock(&hb->lock);
570
571                 raw_spin_lock_irq(&curr->pi_lock);
572                 /*
573                  * We dropped the pi-lock, so re-check whether this
574                  * task still owns the PI-state:
575                  */
576                 if (head->next != next) {
577                         spin_unlock(&hb->lock);
578                         continue;
579                 }
580
581                 WARN_ON(pi_state->owner != curr);
582                 WARN_ON(list_empty(&pi_state->list));
583                 list_del_init(&pi_state->list);
584                 pi_state->owner = NULL;
585                 raw_spin_unlock_irq(&curr->pi_lock);
586
587                 rt_mutex_unlock(&pi_state->pi_mutex);
588
589                 spin_unlock(&hb->lock);
590
591                 raw_spin_lock_irq(&curr->pi_lock);
592         }
593         raw_spin_unlock_irq(&curr->pi_lock);
594 }
595
596 static int
597 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
598                 union futex_key *key, struct futex_pi_state **ps)
599 {
600         struct futex_pi_state *pi_state = NULL;
601         struct futex_q *this, *next;
602         struct plist_head *head;
603         struct task_struct *p;
604         pid_t pid = uval & FUTEX_TID_MASK;
605
606         head = &hb->chain;
607
608         plist_for_each_entry_safe(this, next, head, list) {
609                 if (match_futex(&this->key, key)) {
610                         /*
611                          * Another waiter already exists - bump up
612                          * the refcount and return its pi_state:
613                          */
614                         pi_state = this->pi_state;
615                         /*
616                          * Userspace might have messed up non-PI and PI futexes
617                          */
618                         if (unlikely(!pi_state))
619                                 return -EINVAL;
620
621                         WARN_ON(!atomic_read(&pi_state->refcount));
622
623                         /*
624                          * When pi_state->owner is NULL then the owner died
625                          * and another waiter is on the fly. pi_state->owner
626                          * is fixed up by the task which acquires
627                          * pi_state->rt_mutex.
628                          * pi_state->pi_mutex.
629                          * We do not check for pid == 0 which can happen when
630                          * the owner died and robust_list_exit() cleared the
631                          * TID.
632                          */
633                         if (pid && pi_state->owner) {
634                                 /*
635                                  * Bail out if user space manipulated the
636                                  * futex value.
637                                  */
638                                 if (pid != task_pid_vnr(pi_state->owner))
639                                         return -EINVAL;
640                         }
641
642                         atomic_inc(&pi_state->refcount);
643                         *ps = pi_state;
644
645                         return 0;
646                 }
647         }
648
649         /*
650          * We are the first waiter - try to look up the real owner and attach
651          * the new pi_state to it, but bail out when TID = 0
652          */
653         if (!pid)
654                 return -ESRCH;
655         p = futex_find_get_task(pid);
656         if (!p)
657                 return -ESRCH;
658
659         /*
660          * We need to look at the task state flags to figure out
661          * whether the task is exiting. To protect against do_exit()
662          * changing the task flags, we do this while holding
663          * p->pi_lock:
664          */
665         raw_spin_lock_irq(&p->pi_lock);
666         if (unlikely(p->flags & PF_EXITING)) {
667                 /*
668                  * The task is on the way out. When PF_EXITPIDONE is
669                  * set, we know that the task has finished the
670                  * cleanup:
671                  */
672                 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
673
674                 raw_spin_unlock_irq(&p->pi_lock);
675                 put_task_struct(p);
676                 return ret;
677         }
678
679         pi_state = alloc_pi_state();
680
681         /*
682          * Initialize the pi_mutex in locked state and make 'p'
683          * the owner of it:
684          */
685         rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
686
687         /* Store the key for possible exit cleanups: */
688         pi_state->key = *key;
689
690         WARN_ON(!list_empty(&pi_state->list));
691         list_add(&pi_state->list, &p->pi_state_list);
692         pi_state->owner = p;
693         raw_spin_unlock_irq(&p->pi_lock);
694
695         put_task_struct(p);
696
697         *ps = pi_state;
698
699         return 0;
700 }
701
702 /**
703  * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
704  * @uaddr:              the pi futex user address
705  * @hb:                 the pi futex hash bucket
706  * @key:                the futex key associated with uaddr and hb
707  * @ps:                 the pi_state pointer where we store the result of the
708  *                      lookup
709  * @task:               the task to perform the atomic lock work for.  This will
710  *                      be "current" except in the case of requeue pi.
711  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
712  *
713  * Return:
714  *  0 - ready to wait;
715  *  1 - acquired the lock;
716  * <0 - error
717  *
718  * The hb->lock and futex_key refs shall be held by the caller.
719  */
720 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
721                                 union futex_key *key,
722                                 struct futex_pi_state **ps,
723                                 struct task_struct *task, int set_waiters)
724 {
725         int lock_taken, ret, force_take = 0;
726         u32 uval, newval, curval, vpid = task_pid_vnr(task);
727
728 retry:
729         ret = lock_taken = 0;
730
731         /*
732          * To avoid races, we attempt to take the lock here again
733          * (by doing a 0 -> TID atomic cmpxchg), while holding all
734          * the locks. It will most likely not succeed.
735          */
736         newval = vpid;
737         if (set_waiters)
738                 newval |= FUTEX_WAITERS;
739
740         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
741                 return -EFAULT;
742
743         /*
744          * Detect deadlocks.
745          */
746         if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
747                 return -EDEADLK;
748
749         /*
750          * Surprise - we got the lock. Just return to userspace:
751          */
752         if (unlikely(!curval))
753                 return 1;
754
755         uval = curval;
756
757         /*
758          * Set the FUTEX_WAITERS flag, so the owner will know it has someone
759          * to wake at the next unlock.
760          */
761         newval = curval | FUTEX_WAITERS;
762
763         /*
764          * Should we force take the futex? See below.
765          */
766         if (unlikely(force_take)) {
767                 /*
768                  * Keep the OWNER_DIED and the WAITERS bit and set the
769                  * new TID value.
770                  */
771                 newval = (curval & ~FUTEX_TID_MASK) | vpid;
772                 force_take = 0;
773                 lock_taken = 1;
774         }
775
776         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
777                 return -EFAULT;
778         if (unlikely(curval != uval))
779                 goto retry;
780
781         /*
782          * We took the lock due to forced take over.
783          */
784         if (unlikely(lock_taken))
785                 return 1;
786
787         /*
788          * We don't have the lock. Look up the PI state (or create it if
789          * we are the first waiter):
790          */
791         ret = lookup_pi_state(uval, hb, key, ps);
792
793         if (unlikely(ret)) {
794                 switch (ret) {
795                 case -ESRCH:
796                         /*
797                          * We failed to find an owner for this
798                          * futex. So we have no pi_state to block
799                          * on. This can happen in two cases:
800                          *
801                          * 1) The owner died
802                          * 2) A stale FUTEX_WAITERS bit
803                          *
804                          * Re-read the futex value.
805                          */
806                         if (get_futex_value_locked(&curval, uaddr))
807                                 return -EFAULT;
808
809                         /*
810                          * If the owner died or we have a stale
811                          * WAITERS bit, the owner TID in the user space
812                          * futex is 0.
813                          */
814                         if (!(curval & FUTEX_TID_MASK)) {
815                                 force_take = 1;
816                                 goto retry;
817                         }
818                 default:
819                         break;
820                 }
821         }
822
823         return ret;
824 }
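/*
 * futex_lock_pi_atomic() is the kernel half of the PI-futex protocol: the
 * futex word holds the owner's TID, with FUTEX_WAITERS/FUTEX_OWNER_DIED in
 * the high bits.  Below is a hedged sketch of the matching userspace fast
 * path (roughly what a PI mutex implementation does); the helper names are
 * made up, error handling is omitted, and the block is illustrative only,
 * not compiled as part of this file.
 */
#if 0
#include <linux/futex.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdint.h>

static pid_t my_tid(void)
{
	return (pid_t)syscall(SYS_gettid);
}

/* Uncontended: 0 -> TID cmpxchg.  Contended: let the kernel queue us. */
static void pi_lock(uint32_t *futex_word)
{
	uint32_t expected = 0;

	if (__atomic_compare_exchange_n(futex_word, &expected, (uint32_t)my_tid(),
					0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return;
	syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

/* Uncontended: TID -> 0.  If FUTEX_WAITERS is set, the kernel must hand off. */
static void pi_unlock(uint32_t *futex_word)
{
	uint32_t expected = (uint32_t)my_tid();

	if (__atomic_compare_exchange_n(futex_word, &expected, 0,
					0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return;
	syscall(SYS_futex, futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}
#endif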
825
826 /**
827  * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
828  * @q:  The futex_q to unqueue
829  *
830  * The q->lock_ptr must not be NULL and must be held by the caller.
831  */
832 static void __unqueue_futex(struct futex_q *q)
833 {
834         struct futex_hash_bucket *hb;
835
836         if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
837             || WARN_ON(plist_node_empty(&q->list)))
838                 return;
839
840         hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
841         plist_del(&q->list, &hb->chain);
842 }
843
844 /*
845  * The hash bucket lock must be held when this is called.
846  * Afterwards, the futex_q must not be accessed.
847  */
848 static void wake_futex(struct futex_q *q)
849 {
850         struct task_struct *p = q->task;
851
852         if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
853                 return;
854
855         /*
856          * We set q->lock_ptr = NULL _before_ we wake up the task. If
857          * a non-futex wake up happens on another CPU then the task
858          * might exit and p would dereference a non-existing task
859          * struct. Prevent this by holding a reference on p across the
860          * wake up.
861          */
862         get_task_struct(p);
863
864         __unqueue_futex(q);
865         /*
866          * The waiting task can free the futex_q as soon as
867          * q->lock_ptr = NULL is written, without taking any locks. A
868          * memory barrier is required here to prevent the following
869          * store to lock_ptr from getting ahead of the plist_del.
870          */
871         smp_wmb();
872         q->lock_ptr = NULL;
873
874         wake_up_state(p, TASK_NORMAL);
875         put_task_struct(p);
876 }
877
878 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
879 {
880         struct task_struct *new_owner;
881         struct futex_pi_state *pi_state = this->pi_state;
882         u32 uninitialized_var(curval), newval;
883
884         if (!pi_state)
885                 return -EINVAL;
886
887         /*
888          * If current does not own the pi_state then the futex is
889          * inconsistent and user space fiddled with the futex value.
890          */
891         if (pi_state->owner != current)
892                 return -EINVAL;
893
894         raw_spin_lock(&pi_state->pi_mutex.wait_lock);
895         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
896
897         /*
898          * It is possible that the next waiter (the one that brought
899          * this owner to the kernel) timed out and is no longer
900          * waiting on the lock.
901          */
902         if (!new_owner)
903                 new_owner = this->task;
904
905         /*
906          * We pass it to the next owner. (The WAITERS bit is always
907          * kept enabled while there is PI state around. We must also
908          * preserve the owner died bit.)
909          */
910         if (!(uval & FUTEX_OWNER_DIED)) {
911                 int ret = 0;
912
913                 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
914
915                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
916                         ret = -EFAULT;
917                 else if (curval != uval)
918                         ret = -EINVAL;
919                 if (ret) {
920                         raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
921                         return ret;
922                 }
923         }
924
925         raw_spin_lock_irq(&pi_state->owner->pi_lock);
926         WARN_ON(list_empty(&pi_state->list));
927         list_del_init(&pi_state->list);
928         raw_spin_unlock_irq(&pi_state->owner->pi_lock);
929
930         raw_spin_lock_irq(&new_owner->pi_lock);
931         WARN_ON(!list_empty(&pi_state->list));
932         list_add(&pi_state->list, &new_owner->pi_state_list);
933         pi_state->owner = new_owner;
934         raw_spin_unlock_irq(&new_owner->pi_lock);
935
936         raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
937         rt_mutex_unlock(&pi_state->pi_mutex);
938
939         return 0;
940 }
941
942 static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
943 {
944         u32 uninitialized_var(oldval);
945
946         /*
947          * There is no waiter, so we unlock the futex. The owner died
948          * bit does not need to be preserved here. We are the owner:
949          */
950         if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
951                 return -EFAULT;
952         if (oldval != uval)
953                 return -EAGAIN;
954
955         return 0;
956 }
957
958 /*
959  * Express the locking dependencies for lockdep:
960  */
961 static inline void
962 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
963 {
964         if (hb1 <= hb2) {
965                 spin_lock(&hb1->lock);
966                 if (hb1 < hb2)
967                         spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
968         } else { /* hb1 > hb2 */
969                 spin_lock(&hb2->lock);
970                 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
971         }
972 }
973
974 static inline void
975 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
976 {
977         spin_unlock(&hb1->lock);
978         if (hb1 != hb2)
979                 spin_unlock(&hb2->lock);
980 }
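/*
 * double_lock_hb() avoids ABBA deadlocks by always taking the lower-addressed
 * bucket lock first, and only one lock when both keys hash to the same
 * bucket.  A minimal generic sketch of the same ordering rule with pthread
 * mutexes; the names are illustrative only and the block is not compiled as
 * part of this file.
 */
#if 0
#include <pthread.h>

/* Acquire two locks in a globally consistent (address) order. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	if (a > b) {			/* lower address always goes first */
		pthread_mutex_t *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}
#endif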
981
982 /*
983  * Wake up waiters matching bitset queued on this futex (uaddr).
984  */
985 static int
986 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
987 {
988         struct futex_hash_bucket *hb;
989         struct futex_q *this, *next;
990         struct plist_head *head;
991         union futex_key key = FUTEX_KEY_INIT;
992         int ret;
993
994         if (!bitset)
995                 return -EINVAL;
996
997         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
998         if (unlikely(ret != 0))
999                 goto out;
1000
1001         hb = hash_futex(&key);
1002         spin_lock(&hb->lock);
1003         head = &hb->chain;
1004
1005         plist_for_each_entry_safe(this, next, head, list) {
1006                 if (match_futex(&this->key, &key)) {
1007                         if (this->pi_state || this->rt_waiter) {
1008                                 ret = -EINVAL;
1009                                 break;
1010                         }
1011
1012                         /* Check if one of the bits is set in both bitsets */
1013                         if (!(this->bitset & bitset))
1014                                 continue;
1015
1016                         wake_futex(this);
1017                         if (++ret >= nr_wake)
1018                                 break;
1019                 }
1020         }
1021
1022         spin_unlock(&hb->lock);
1023         put_futex_key(&key);
1024 out:
1025         return ret;
1026 }
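/*
 * futex_wake() above is the kernel side of FUTEX_WAKE/FUTEX_WAKE_BITSET: it
 * walks one hash bucket and wakes up to nr_wake waiters whose bitset
 * intersects the caller's.  Below is a hedged sketch of the classic userspace
 * pairing; the re-check of the word before sleeping is what the in-kernel
 * value check on the wait side pairs with.  Illustrative only, not compiled
 * as part of this file.
 */
#if 0
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <errno.h>

/* Block while *uaddr still holds 'expected'; spurious wakeups are possible. */
static void wait_on(uint32_t *uaddr, uint32_t expected)
{
	while (__atomic_load_n(uaddr, __ATOMIC_ACQUIRE) == expected) {
		long ret = syscall(SYS_futex, uaddr, FUTEX_WAIT, expected,
				   NULL, NULL, 0);

		if (ret == -1 && errno != EAGAIN && errno != EINTR)
			break;
	}
}

/* Publish a new value, then wake at most one waiter. */
static void post(uint32_t *uaddr, uint32_t newval)
{
	__atomic_store_n(uaddr, newval, __ATOMIC_RELEASE);
	syscall(SYS_futex, uaddr, FUTEX_WAKE, 1, NULL, NULL, 0);
}
#endif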
1027
1028 /*
1029  * Wake up all waiters hashed on the physical page that is mapped
1030  * to this virtual address:
1031  */
1032 static int
1033 futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1034               int nr_wake, int nr_wake2, int op)
1035 {
1036         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1037         struct futex_hash_bucket *hb1, *hb2;
1038         struct plist_head *head;
1039         struct futex_q *this, *next;
1040         int ret, op_ret;
1041
1042 retry:
1043         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1044         if (unlikely(ret != 0))
1045                 goto out;
1046         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
1047         if (unlikely(ret != 0))
1048                 goto out_put_key1;
1049
1050         hb1 = hash_futex(&key1);
1051         hb2 = hash_futex(&key2);
1052
1053 retry_private:
1054         double_lock_hb(hb1, hb2);
1055         op_ret = futex_atomic_op_inuser(op, uaddr2);
1056         if (unlikely(op_ret < 0)) {
1057
1058                 double_unlock_hb(hb1, hb2);
1059
1060 #ifndef CONFIG_MMU
1061                 /*
1062                  * we don't get EFAULT from MMU faults if we don't have an MMU,
1063                  * but we might get them from range checking
1064                  */
1065                 ret = op_ret;
1066                 goto out_put_keys;
1067 #endif
1068
1069                 if (unlikely(op_ret != -EFAULT)) {
1070                         ret = op_ret;
1071                         goto out_put_keys;
1072                 }
1073
1074                 ret = fault_in_user_writeable(uaddr2);
1075                 if (ret)
1076                         goto out_put_keys;
1077
1078                 if (!(flags & FLAGS_SHARED))
1079                         goto retry_private;
1080
1081                 put_futex_key(&key2);
1082                 put_futex_key(&key1);
1083                 goto retry;
1084         }
1085
1086         head = &hb1->chain;
1087
1088         plist_for_each_entry_safe(this, next, head, list) {
1089                 if (match_futex(&this->key, &key1)) {
1090                         if (this->pi_state || this->rt_waiter) {
1091                                 ret = -EINVAL;
1092                                 goto out_unlock;
1093                         }
1094                         wake_futex(this);
1095                         if (++ret >= nr_wake)
1096                                 break;
1097                 }
1098         }
1099
1100         if (op_ret > 0) {
1101                 head = &hb2->chain;
1102
1103                 op_ret = 0;
1104                 plist_for_each_entry_safe(this, next, head, list) {
1105                         if (match_futex(&this->key, &key2)) {
1106                                 if (this->pi_state || this->rt_waiter) {
1107                                         ret = -EINVAL;
1108                                         goto out_unlock;
1109                                 }
1110                                 wake_futex(this);
1111                                 if (++op_ret >= nr_wake2)
1112                                         break;
1113                         }
1114                 }
1115                 ret += op_ret;
1116         }
1117
1118 out_unlock:
1119         double_unlock_hb(hb1, hb2);
1120 out_put_keys:
1121         put_futex_key(&key2);
1122 out_put_key1:
1123         put_futex_key(&key1);
1124 out:
1125         return ret;
1126 }
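/*
 * futex_wake_op() atomically applies an encoded operation to *uaddr2, wakes
 * waiters on uaddr1, and also wakes waiters on uaddr2 if the old value
 * satisfies the encoded comparison; glibc historically used this to signal a
 * condition variable and release its internal lock in a single syscall.  A
 * hedged sketch of the call from userspace: the second wake count travels in
 * the timeout slot of futex(2), and FUTEX_OP() builds the encoded operation.
 * Illustrative only, not compiled as part of this file.
 */
#if 0
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

/*
 * Wake one waiter on uaddr1; set *uaddr2 = 1 and, if its old value was 0,
 * also wake one waiter on uaddr2.
 */
static long wake_op_example(uint32_t *uaddr1, uint32_t *uaddr2)
{
	unsigned long nr_wake2 = 1;
	int op = FUTEX_OP(FUTEX_OP_SET, 1, FUTEX_OP_CMP_EQ, 0);

	return syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, 1,
		       (void *)nr_wake2, uaddr2, op);
}
#endif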
1127
1128 /**
1129  * requeue_futex() - Requeue a futex_q from one hb to another
1130  * @q:          the futex_q to requeue
1131  * @hb1:        the source hash_bucket
1132  * @hb2:        the target hash_bucket
1133  * @key2:       the new key for the requeued futex_q
1134  */
1135 static inline
1136 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1137                    struct futex_hash_bucket *hb2, union futex_key *key2)
1138 {
1139
1140         /*
1141          * If key1 and key2 hash to the same bucket, no need to
1142          * requeue.
1143          */
1144         if (likely(&hb1->chain != &hb2->chain)) {
1145                 plist_del(&q->list, &hb1->chain);
1146                 plist_add(&q->list, &hb2->chain);
1147                 q->lock_ptr = &hb2->lock;
1148         }
1149         get_futex_key_refs(key2);
1150         q->key = *key2;
1151 }
1152
1153 /**
1154  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1155  * @q:          the futex_q
1156  * @key:        the key of the requeue target futex
1157  * @hb:         the hash_bucket of the requeue target futex
1158  *
1159  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1160  * target futex if it is uncontended or via a lock steal.  Set the futex_q key
1161  * to the requeue target futex so the waiter can detect the wakeup on the right
1162  * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1163  * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
1164  * to protect access to the pi_state to fixup the owner later.  Must be called
1165  * with both q->lock_ptr and hb->lock held.
1166  */
1167 static inline
1168 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1169                            struct futex_hash_bucket *hb)
1170 {
1171         get_futex_key_refs(key);
1172         q->key = *key;
1173
1174         __unqueue_futex(q);
1175
1176         WARN_ON(!q->rt_waiter);
1177         q->rt_waiter = NULL;
1178
1179         q->lock_ptr = &hb->lock;
1180
1181         wake_up_state(q->task, TASK_NORMAL);
1182 }
1183
1184 /**
1185  * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1186  * @pifutex:            the user address of the to futex
1187  * @hb1:                the from futex hash bucket, must be locked by the caller
1188  * @hb2:                the to futex hash bucket, must be locked by the caller
1189  * @key1:               the from futex key
1190  * @key2:               the to futex key
1191  * @ps:                 address to store the pi_state pointer
1192  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1193  *
1194  * Try and get the lock on behalf of the top waiter if we can do it atomically.
1195  * Wake the top waiter if we succeed.  If the caller specified set_waiters,
1196  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1197  * hb1 and hb2 must be held by the caller.
1198  *
1199  * Return:
1200  *  0 - failed to acquire the lock atomically;
1201  *  1 - acquired the lock;
1202  * <0 - error
1203  */
1204 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1205                                  struct futex_hash_bucket *hb1,
1206                                  struct futex_hash_bucket *hb2,
1207                                  union futex_key *key1, union futex_key *key2,
1208                                  struct futex_pi_state **ps, int set_waiters)
1209 {
1210         struct futex_q *top_waiter = NULL;
1211         u32 curval;
1212         int ret;
1213
1214         if (get_futex_value_locked(&curval, pifutex))
1215                 return -EFAULT;
1216
1217         /*
1218          * Find the top_waiter and determine if there are additional waiters.
1219          * If the caller intends to requeue more than 1 waiter to pifutex,
1220          * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1221          * as we have means to handle the possible fault.  If not, don't set
1222          * the bit unnecessarily as it will force the subsequent unlock to enter
1223          * the kernel.
1224          */
1225         top_waiter = futex_top_waiter(hb1, key1);
1226
1227         /* There are no waiters, nothing for us to do. */
1228         if (!top_waiter)
1229                 return 0;
1230
1231         /* Ensure we requeue to the expected futex. */
1232         if (!match_futex(top_waiter->requeue_pi_key, key2))
1233                 return -EINVAL;
1234
1235         /*
1236          * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
1237          * the contended case or if set_waiters is 1.  The pi_state is returned
1238          * in ps in contended cases.
1239          */
1240         ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1241                                    set_waiters);
1242         if (ret == 1)
1243                 requeue_pi_wake_futex(top_waiter, key2, hb2);
1244
1245         return ret;
1246 }
1247
1248 /**
1249  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1250  * @uaddr1:     source futex user address
1251  * @flags:      futex flags (FLAGS_SHARED, etc.)
1252  * @uaddr2:     target futex user address
1253  * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
1254  * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1255  * @cmpval:     @uaddr1 expected value (or %NULL)
1256  * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1257  *              pi futex (pi to pi requeue is not supported)
1258  *
1259  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1260  * uaddr2 atomically on behalf of the top waiter.
1261  *
1262  * Return:
1263  * >=0 - on success, the number of tasks requeued or woken;
1264  *  <0 - on error
1265  */
1266 static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1267                          u32 __user *uaddr2, int nr_wake, int nr_requeue,
1268                          u32 *cmpval, int requeue_pi)
1269 {
1270         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1271         int drop_count = 0, task_count = 0, ret;
1272         struct futex_pi_state *pi_state = NULL;
1273         struct futex_hash_bucket *hb1, *hb2;
1274         struct plist_head *head1;
1275         struct futex_q *this, *next;
1276         u32 curval2;
1277
1278         if (requeue_pi) {
1279                 /*
1280                  * requeue_pi requires a pi_state, try to allocate it now
1281                  * without any locks in case it fails.
1282                  */
1283                 if (refill_pi_state_cache())
1284                         return -ENOMEM;
1285                 /*
1286                  * requeue_pi must wake as many tasks as it can, up to nr_wake
1287                  * + nr_requeue, since it acquires the rt_mutex prior to
1288                  * returning to userspace, so as to not leave the rt_mutex with
1289                  * waiters and no owner.  However, second and third wake-ups
1290                  * cannot be predicted as they involve race conditions with the
1291                  * first wake and a fault while looking up the pi_state.  Both
1292                  * pthread_cond_signal() and pthread_cond_broadcast() should
1293                  * use nr_wake=1.
1294                  */
1295                 if (nr_wake != 1)
1296                         return -EINVAL;
1297         }
1298
1299 retry:
1300         if (pi_state != NULL) {
1301                 /*
1302                  * We will have to lookup the pi_state again, so free this one
1303                  * to keep the accounting correct.
1304                  */
1305                 free_pi_state(pi_state);
1306                 pi_state = NULL;
1307         }
1308
1309         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1310         if (unlikely(ret != 0))
1311                 goto out;
1312         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1313                             requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1314         if (unlikely(ret != 0))
1315                 goto out_put_key1;
1316
1317         hb1 = hash_futex(&key1);
1318         hb2 = hash_futex(&key2);
1319
1320 retry_private:
1321         double_lock_hb(hb1, hb2);
1322
1323         if (likely(cmpval != NULL)) {
1324                 u32 curval;
1325
1326                 ret = get_futex_value_locked(&curval, uaddr1);
1327
1328                 if (unlikely(ret)) {
1329                         double_unlock_hb(hb1, hb2);
1330
1331                         ret = get_user(curval, uaddr1);
1332                         if (ret)
1333                                 goto out_put_keys;
1334
1335                         if (!(flags & FLAGS_SHARED))
1336                                 goto retry_private;
1337
1338                         put_futex_key(&key2);
1339                         put_futex_key(&key1);
1340                         goto retry;
1341                 }
1342                 if (curval != *cmpval) {
1343                         ret = -EAGAIN;
1344                         goto out_unlock;
1345                 }
1346         }
1347
1348         if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1349                 /*
1350                  * Attempt to acquire uaddr2 and wake the top waiter. If we
1351                  * intend to requeue waiters, force setting the FUTEX_WAITERS
1352                  * bit.  We force this here where we are able to easily handle
1353                  * faults rather than in the requeue loop below.
1354                  */
1355                 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1356                                                  &key2, &pi_state, nr_requeue);
1357
1358                 /*
1359                  * At this point the top_waiter has either taken uaddr2 or is
1360                  * waiting on it.  If the former, then the pi_state will not
1361                  * exist yet, look it up one more time to ensure we have a
1362                  * reference to it.
1363                  */
1364                 if (ret == 1) {
1365                         WARN_ON(pi_state);
1366                         drop_count++;
1367                         task_count++;
1368                         ret = get_futex_value_locked(&curval2, uaddr2);
1369                         if (!ret)
1370                                 ret = lookup_pi_state(curval2, hb2, &key2,
1371                                                       &pi_state);
1372                 }
1373
1374                 switch (ret) {
1375                 case 0:
1376                         break;
1377                 case -EFAULT:
1378                         double_unlock_hb(hb1, hb2);
1379                         put_futex_key(&key2);
1380                         put_futex_key(&key1);
1381                         ret = fault_in_user_writeable(uaddr2);
1382                         if (!ret)
1383                                 goto retry;
1384                         goto out;
1385                 case -EAGAIN:
1386                         /* The owner was exiting, try again. */
1387                         double_unlock_hb(hb1, hb2);
1388                         put_futex_key(&key2);
1389                         put_futex_key(&key1);
1390                         cond_resched();
1391                         goto retry;
1392                 default:
1393                         goto out_unlock;
1394                 }
1395         }
1396
1397         head1 = &hb1->chain;
1398         plist_for_each_entry_safe(this, next, head1, list) {
1399                 if (task_count - nr_wake >= nr_requeue)
1400                         break;
1401
1402                 if (!match_futex(&this->key, &key1))
1403                         continue;
1404
1405                 /*
1406                  * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1407                  * be paired with each other and no other futex ops.
1408                  *
1409                  * We should never be requeueing a futex_q with a pi_state,
1410                  * which is awaiting a futex_unlock_pi().
1411                  */
1412                 if ((requeue_pi && !this->rt_waiter) ||
1413                     (!requeue_pi && this->rt_waiter) ||
1414                     this->pi_state) {
1415                         ret = -EINVAL;
1416                         break;
1417                 }
1418
1419                 /*
1420                  * Wake nr_wake waiters.  For requeue_pi, if we acquired the
1421                  * lock, we already woke the top_waiter.  If not, it will be
1422                  * woken by futex_unlock_pi().
1423                  */
1424                 if (++task_count <= nr_wake && !requeue_pi) {
1425                         wake_futex(this);
1426                         continue;
1427                 }
1428
1429                 /* Ensure we requeue to the expected futex for requeue_pi. */
1430                 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1431                         ret = -EINVAL;
1432                         break;
1433                 }
1434
1435                 /*
1436                  * Requeue nr_requeue waiters and possibly one more in the case
1437                  * of requeue_pi if we couldn't acquire the lock atomically.
1438                  */
1439                 if (requeue_pi) {
1440                         /* Prepare the waiter to take the rt_mutex. */
1441                         atomic_inc(&pi_state->refcount);
1442                         this->pi_state = pi_state;
1443                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1444                                                         this->rt_waiter,
1445                                                         this->task, 1);
1446                         if (ret == 1) {
1447                                 /* We got the lock. */
1448                                 requeue_pi_wake_futex(this, &key2, hb2);
1449                                 drop_count++;
1450                                 continue;
1451                         } else if (ret) {
1452                                 /* -EDEADLK */
1453                                 this->pi_state = NULL;
1454                                 free_pi_state(pi_state);
1455                                 goto out_unlock;
1456                         }
1457                 }
1458                 requeue_futex(this, hb1, hb2, &key2);
1459                 drop_count++;
1460         }
1461
1462 out_unlock:
1463         double_unlock_hb(hb1, hb2);
1464
1465         /*
1466          * drop_futex_key_refs() must be called outside the spinlocks. During
1467          * the requeue we moved futex_q's from the hash bucket at key1 to the
1468          * one at key2 and updated their key pointer.  We no longer need to
1469          * hold the references to key1.
1470          */
1471         while (--drop_count >= 0)
1472                 drop_futex_key_refs(&key1);
1473
1474 out_put_keys:
1475         put_futex_key(&key2);
1476 out_put_key1:
1477         put_futex_key(&key1);
1478 out:
1479         if (pi_state != NULL)
1480                 free_pi_state(pi_state);
1481         return ret ? ret : task_count;
1482 }
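/*
 * futex_requeue() backs FUTEX_CMP_REQUEUE (and its PI variant): wake nr_wake
 * waiters on uaddr1 and move up to nr_requeue of the remaining ones onto
 * uaddr2's bucket, bailing out with -EAGAIN if *uaddr1 no longer matches
 * cmpval.  Below is a hedged sketch of the non-PI call roughly as a
 * condition-variable "broadcast" would issue it; nr_requeue again rides in
 * the timeout slot of futex(2).  Illustrative only, not compiled as part of
 * this file.
 */
#if 0
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <limits.h>

/*
 * Wake one waiter on the condition word and requeue every other waiter onto
 * the mutex word, provided the condition word still holds cond_val (otherwise
 * the kernel returns EAGAIN and the caller should re-read and retry).
 */
static long cond_broadcast_requeue(uint32_t *cond_word, uint32_t cond_val,
				   uint32_t *mutex_word)
{
	unsigned long nr_requeue = INT_MAX;

	return syscall(SYS_futex, cond_word, FUTEX_CMP_REQUEUE, 1,
		       (void *)nr_requeue, mutex_word, cond_val);
}
#endif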
1483
1484 /* The key must be already stored in q->key. */
1485 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1486         __acquires(&hb->lock)
1487 {
1488         struct futex_hash_bucket *hb;
1489
1490         hb = hash_futex(&q->key);
1491         q->lock_ptr = &hb->lock;
1492
1493         spin_lock(&hb->lock);
1494         return hb;
1495 }
1496
1497 static inline void
1498 queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1499         __releases(&hb->lock)
1500 {
1501         spin_unlock(&hb->lock);
1502 }
1503
1504 /**
1505  * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1506  * @q:  The futex_q to enqueue
1507  * @hb: The destination hash bucket
1508  *
1509  * The hb->lock must be held by the caller, and is released here. A call to
1510  * queue_me() is typically paired with exactly one call to unqueue_me().  The
1511  * exceptions involve the PI-related operations, which may use unqueue_me_pi()
1512  * or nothing if the unqueue is done as part of the wake process and the unqueue
1513  * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
1514  * an example).
1515  */
1516 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1517         __releases(&hb->lock)
1518 {
1519         int prio;
1520
1521         /*
1522          * The priority used to register this element is
1523          * - either the real thread-priority for the real-time threads
1524          * (i.e. threads with a priority lower than MAX_RT_PRIO)
1525          * - or MAX_RT_PRIO for non-RT threads.
1526          * Thus, all RT-threads are woken first in priority order, and
1527          * the others are woken last, in FIFO order.
1528          */
1529         prio = min(current->normal_prio, MAX_RT_PRIO);
1530
1531         plist_node_init(&q->list, prio);
1532         plist_add(&q->list, &hb->chain);
1533         q->task = current;
1534         spin_unlock(&hb->lock);
1535 }
1536
1537 /**
1538  * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1539  * @q:  The futex_q to unqueue
1540  *
1541  * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1542  * be paired with exactly one earlier call to queue_me().
1543  *
1544  * Return:
1545  *   1 - if the futex_q was still queued (and we unqueued it);
1546  *   0 - if the futex_q was already removed by the waking thread
1547  */
1548 static int unqueue_me(struct futex_q *q)
1549 {
1550         spinlock_t *lock_ptr;
1551         int ret = 0;
1552
1553         /* In the common case we don't take the spinlock, which is nice. */
1554 retry:
1555         lock_ptr = q->lock_ptr;
1556         barrier();
1557         if (lock_ptr != NULL) {
1558                 spin_lock(lock_ptr);
1559                 /*
1560                  * q->lock_ptr can change between reading it and
1561                  * spin_lock(), causing us to take the wrong lock.  This
1562                  * corrects the race condition.
1563                  *
1564                  * Reasoning goes like this: if we have the wrong lock,
1565                  * q->lock_ptr must have changed (maybe several times)
1566                  * between reading it and the spin_lock().  It can
1567                  * change again after the spin_lock() but only if it was
1568                  * already changed before the spin_lock().  It cannot,
1569                  * however, change back to the original value.  Therefore
1570                  * we can detect whether we acquired the correct lock.
1571                  */
1572                 if (unlikely(lock_ptr != q->lock_ptr)) {
1573                         spin_unlock(lock_ptr);
1574                         goto retry;
1575                 }
1576                 __unqueue_futex(q);
1577
1578                 BUG_ON(q->pi_state);
1579
1580                 spin_unlock(lock_ptr);
1581                 ret = 1;
1582         }
1583
1584         drop_futex_key_refs(&q->key);
1585         return ret;
1586 }
1587
1588 /*
1589  * PI futexes cannot be requeued and must remove themselves from the
1590  * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1591  * and dropped here.
1592  */
1593 static void unqueue_me_pi(struct futex_q *q)
1594         __releases(q->lock_ptr)
1595 {
1596         __unqueue_futex(q);
1597
1598         BUG_ON(!q->pi_state);
1599         free_pi_state(q->pi_state);
1600         q->pi_state = NULL;
1601
1602         spin_unlock(q->lock_ptr);
1603 }
1604
1605 /*
1606  * Fixup the pi_state owner with the new owner.
1607  *
1608  * Must be called with the hash bucket lock held and mm->sem held for
1609  * non-private futexes.
1610  */
1611 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1612                                 struct task_struct *newowner)
1613 {
1614         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1615         struct futex_pi_state *pi_state = q->pi_state;
1616         struct task_struct *oldowner = pi_state->owner;
1617         u32 uval, uninitialized_var(curval), newval;
1618         int ret;
1619
1620         /* Owner died? */
1621         if (!pi_state->owner)
1622                 newtid |= FUTEX_OWNER_DIED;
1623
1624         /*
1625          * We are here either because we stole the rtmutex from the
1626          * previous highest priority waiter or we are the highest priority
1627          * waiter but failed to get the rtmutex the first time.
1628          * We have to replace the newowner TID in the user space variable.
1629          * This must be atomic as we have to preserve the owner died bit here.
1630          *
1631          * Note: We write the user space value _before_ changing the pi_state
1632          * because we can fault here. Imagine swapped-out pages or a fork
1633          * that marked all the anonymous memory read-only for COW.
1634          *
1635          * Modifying pi_state _before_ the user space value would
1636          * leave the pi_state in an inconsistent state when we fault
1637          * here, because we need to drop the hash bucket lock to
1638          * handle the fault. This might be observed in the PID check
1639          * in lookup_pi_state.
1640          */
1641 retry:
1642         if (get_futex_value_locked(&uval, uaddr))
1643                 goto handle_fault;
1644
1645         while (1) {
1646                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1647
1648                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1649                         goto handle_fault;
1650                 if (curval == uval)
1651                         break;
1652                 uval = curval;
1653         }
1654
1655         /*
1656          * We fixed up user space. Now we need to fix the pi_state
1657          * itself.
1658          */
1659         if (pi_state->owner != NULL) {
1660                 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1661                 WARN_ON(list_empty(&pi_state->list));
1662                 list_del_init(&pi_state->list);
1663                 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1664         }
1665
1666         pi_state->owner = newowner;
1667
1668         raw_spin_lock_irq(&newowner->pi_lock);
1669         WARN_ON(!list_empty(&pi_state->list));
1670         list_add(&pi_state->list, &newowner->pi_state_list);
1671         raw_spin_unlock_irq(&newowner->pi_lock);
1672         return 0;
1673
1674         /*
1675          * To handle the page fault we need to drop the hash bucket
1676          * lock here. That gives the other task (either the highest priority
1677          * waiter itself or the task which stole the rtmutex) the
1678          * chance to try the fixup of the pi_state. So once we are
1679          * back from handling the fault we need to check the pi_state
1680          * after reacquiring the hash bucket lock and before trying to
1681          * do another fixup. When the fixup has been done already we
1682          * simply return.
1683          */
1684 handle_fault:
1685         spin_unlock(q->lock_ptr);
1686
1687         ret = fault_in_user_writeable(uaddr);
1688
1689         spin_lock(q->lock_ptr);
1690
1691         /*
1692          * Check if someone else fixed it for us:
1693          */
1694         if (pi_state->owner != oldowner)
1695                 return 0;
1696
1697         if (ret)
1698                 return ret;
1699
1700         goto retry;
1701 }
1702
1703 static long futex_wait_restart(struct restart_block *restart);
1704
1705 /**
1706  * fixup_owner() - Post lock pi_state and corner case management
1707  * @uaddr:      user address of the futex
1708  * @q:          futex_q (contains pi_state and access to the rt_mutex)
1709  * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
1710  *
1711  * After attempting to lock an rt_mutex, this function is called to cleanup
1712  * the pi_state owner as well as handle race conditions that may allow us to
1713  * acquire the lock. Must be called with the hb lock held.
1714  *
1715  * Return:
1716  *  1 - success, lock taken;
1717  *  0 - success, lock not taken;
1718  * <0 - on error (-EFAULT)
1719  */
1720 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1721 {
1722         struct task_struct *owner;
1723         int ret = 0;
1724
1725         if (locked) {
1726                 /*
1727                  * Got the lock. We might not be the anticipated owner if we
1728                  * did a lock-steal - fix up the PI-state in that case:
1729                  */
1730                 if (q->pi_state->owner != current)
1731                         ret = fixup_pi_state_owner(uaddr, q, current);
1732                 goto out;
1733         }
1734
1735         /*
1736          * Catch the rare case, where the lock was released when we were on the
1737          * way back before we locked the hash bucket.
1738          */
1739         if (q->pi_state->owner == current) {
1740                 /*
1741                  * Try to get the rt_mutex now. This might fail as some other
1742                  * task acquired the rt_mutex after we removed ourself from the
1743                  * rt_mutex waiters list.
1744                  */
1745                 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1746                         locked = 1;
1747                         goto out;
1748                 }
1749
1750                 /*
1751                  * pi_state is incorrect, some other task did a lock steal and
1752                  * we returned due to timeout or signal without taking the
1753                  * rt_mutex. Too late.
1754                  */
1755                 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1756                 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1757                 if (!owner)
1758                         owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1759                 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1760                 ret = fixup_pi_state_owner(uaddr, q, owner);
1761                 goto out;
1762         }
1763
1764         /*
1765          * Paranoia check. If we did not take the lock, then we should not be
1766          * the owner of the rt_mutex.
1767          */
1768         if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1769                 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1770                                 "pi-state %p\n", ret,
1771                                 q->pi_state->pi_mutex.owner,
1772                                 q->pi_state->owner);
1773
1774 out:
1775         return ret ? ret : locked;
1776 }
1777
1778 /**
1779  * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1780  * @hb:         the futex hash bucket, must be locked by the caller
1781  * @q:          the futex_q to queue up on
1782  * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
1783  */
1784 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1785                                 struct hrtimer_sleeper *timeout)
1786 {
1787         /*
1788          * The task state is guaranteed to be set before another task can
1789          * wake it. set_current_state() is implemented using set_mb() and
1790          * queue_me() calls spin_unlock() upon completion, both serializing
1791          * access to the hash list and forcing another memory barrier.
1792          */
1793         set_current_state(TASK_INTERRUPTIBLE);
1794         queue_me(q, hb);
1795
1796         /* Arm the timer */
1797         if (timeout) {
1798                 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1799                 if (!hrtimer_active(&timeout->timer))
1800                         timeout->task = NULL;
1801         }
1802
1803         /*
1804          * If we have been removed from the hash list, then another task
1805          * has tried to wake us, and we can skip the call to schedule().
1806          */
1807         if (likely(!plist_node_empty(&q->list))) {
1808                 /*
1809                  * If the timer has already expired, current will already be
1810                  * flagged for rescheduling. Only call schedule if there
1811                  * is no timeout, or if it has yet to expire.
1812                  */
1813                 if (!timeout || timeout->task)
1814                         freezable_schedule();
1815         }
1816         __set_current_state(TASK_RUNNING);
1817 }
1818
1819 /**
1820  * futex_wait_setup() - Prepare to wait on a futex
1821  * @uaddr:      the futex userspace address
1822  * @val:        the expected value
1823  * @flags:      futex flags (FLAGS_SHARED, etc.)
1824  * @q:          the associated futex_q
1825  * @hb:         storage for hash_bucket pointer to be returned to caller
1826  *
1827  * Setup the futex_q and locate the hash_bucket.  Get the futex value and
1828  * compare it with the expected value.  Handle atomic faults internally.
1829  * Return with the hb lock held and a q.key reference on success, and unlocked
1830  * with no q.key reference on failure.
1831  *
1832  * Return:
1833  *  0 - uaddr contains val and hb has been locked;
1834 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1835  */
1836 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1837                            struct futex_q *q, struct futex_hash_bucket **hb)
1838 {
1839         u32 uval;
1840         int ret;
1841
1842         /*
1843          * Access the page AFTER the hash-bucket is locked.
1844          * Order is important:
1845          *
1846          *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
1847          *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
1848          *
1849          * The basic logical guarantee of a futex is that it blocks ONLY
1850          * if cond(var) is known to be true at the time of blocking, for
1851          * any cond.  If we locked the hash-bucket after testing *uaddr, that
1852          * would open a race condition where we could block indefinitely with
1853          * cond(var) false, which would violate the guarantee.
1854          *
1855          * On the other hand, we insert q and release the hash-bucket only
1856          * after testing *uaddr.  This guarantees that futex_wait() will NOT
1857          * absorb a wakeup if *uaddr does not match the desired value
1858          * while the syscall executes.
1859          */
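        /*
         * Illustrative userspace sketch of the waiter/waker protocol above
         * (a hedged example with hypothetical names, not part of this file;
         * assumes <stdatomic.h>, <unistd.h>, <sys/syscall.h> and
         * <linux/futex.h>, with an _Atomic uint32_t futex_var shared
         * between the threads):
         *
         *   // waiter: block only while the condition still holds
         *   uint32_t val = atomic_load(&futex_var);
         *   while (val == 0) {
         *           syscall(SYS_futex, &futex_var, FUTEX_WAIT, val,
         *                   NULL, NULL, 0);
         *           val = atomic_load(&futex_var);
         *   }
         *
         *   // waker: change the variable first, then wake
         *   atomic_store(&futex_var, 1);
         *   syscall(SYS_futex, &futex_var, FUTEX_WAKE, 1, NULL, NULL, 0);
         *
         * A -EWOULDBLOCK return from FUTEX_WAIT just means the waker got
         * there first, which is exactly the ordering guarantee above.
         */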
1860 retry:
1861         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
1862         if (unlikely(ret != 0))
1863                 return ret;
1864
1865 retry_private:
1866         *hb = queue_lock(q);
1867
1868         ret = get_futex_value_locked(&uval, uaddr);
1869
1870         if (ret) {
1871                 queue_unlock(q, *hb);
1872
1873                 ret = get_user(uval, uaddr);
1874                 if (ret)
1875                         goto out;
1876
1877                 if (!(flags & FLAGS_SHARED))
1878                         goto retry_private;
1879
1880                 put_futex_key(&q->key);
1881                 goto retry;
1882         }
1883
1884         if (uval != val) {
1885                 queue_unlock(q, *hb);
1886                 ret = -EWOULDBLOCK;
1887         }
1888
1889 out:
1890         if (ret)
1891                 put_futex_key(&q->key);
1892         return ret;
1893 }
1894
1895 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1896                       ktime_t *abs_time, u32 bitset)
1897 {
1898         struct hrtimer_sleeper timeout, *to = NULL;
1899         struct restart_block *restart;
1900         struct futex_hash_bucket *hb;
1901         struct futex_q q = futex_q_init;
1902         int ret;
1903
1904         if (!bitset)
1905                 return -EINVAL;
1906         q.bitset = bitset;
1907
1908         if (abs_time) {
1909                 to = &timeout;
1910
1911                 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1912                                       CLOCK_REALTIME : CLOCK_MONOTONIC,
1913                                       HRTIMER_MODE_ABS);
1914                 hrtimer_init_sleeper(to, current);
1915                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1916                                              current->timer_slack_ns);
1917         }
1918
1919 retry:
1920         /*
1921          * Prepare to wait on uaddr. On success, holds hb lock and increments
1922          * q.key refs.
1923          */
1924         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1925         if (ret)
1926                 goto out;
1927
1928         /* queue_me and wait for wakeup, timeout, or a signal. */
1929         futex_wait_queue_me(hb, &q, to);
1930
1931         /* If we were woken (and unqueued), we succeeded, whatever. */
1932         ret = 0;
1933         /* unqueue_me() drops q.key ref */
1934         if (!unqueue_me(&q))
1935                 goto out;
1936         ret = -ETIMEDOUT;
1937         if (to && !to->task)
1938                 goto out;
1939
1940         /*
1941          * We expect signal_pending(current), but we might be the
1942          * victim of a spurious wakeup as well.
1943          */
1944         if (!signal_pending(current))
1945                 goto retry;
1946
1947         ret = -ERESTARTSYS;
1948         if (!abs_time)
1949                 goto out;
1950
1951         restart = &current_thread_info()->restart_block;
1952         restart->fn = futex_wait_restart;
1953         restart->futex.uaddr = uaddr;
1954         restart->futex.val = val;
1955         restart->futex.time = abs_time->tv64;
1956         restart->futex.bitset = bitset;
1957         restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
1958
1959         ret = -ERESTART_RESTARTBLOCK;
1960
1961 out:
1962         if (to) {
1963                 hrtimer_cancel(&to->timer);
1964                 destroy_hrtimer_on_stack(&to->timer);
1965         }
1966         return ret;
1967 }
1968
1969
1970 static long futex_wait_restart(struct restart_block *restart)
1971 {
1972         u32 __user *uaddr = restart->futex.uaddr;
1973         ktime_t t, *tp = NULL;
1974
1975         if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
1976                 t.tv64 = restart->futex.time;
1977                 tp = &t;
1978         }
1979         restart->fn = do_no_restart_syscall;
1980
1981         return (long)futex_wait(uaddr, restart->futex.flags,
1982                                 restart->futex.val, tp, restart->futex.bitset);
1983 }
1984
1985
1986 /*
1987  * Userspace tried a 0 -> TID atomic transition of the futex value
1988  * and failed. The kernel side here does the whole locking operation:
1989  * if there are waiters then it will block, it does PI, etc. (Due to
1990  * races the kernel might see a 0 value of the futex too.)
1991  */
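/*
 * Illustrative userspace fast path that precedes this slow path (a hedged
 * sketch with hypothetical names, not part of this file): the uncontended
 * acquire is a single cmpxchg in userspace and only contention enters the
 * kernel.
 *
 *   uint32_t expected = 0;
 *   if (!atomic_compare_exchange_strong(&lock_word, &expected, my_tid))
 *           syscall(SYS_futex, &lock_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *
 * Here lock_word is an _Atomic uint32_t and my_tid is the caller's TID as
 * returned by gettid(); on contention the kernel sets FUTEX_WAITERS in
 * lock_word and blocks the caller on the pi_state rt_mutex below.
 */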
1992 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1993                          ktime_t *time, int trylock)
1994 {
1995         struct hrtimer_sleeper timeout, *to = NULL;
1996         struct futex_hash_bucket *hb;
1997         struct futex_q q = futex_q_init;
1998         int res, ret;
1999
2000         if (refill_pi_state_cache())
2001                 return -ENOMEM;
2002
2003         if (time) {
2004                 to = &timeout;
2005                 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
2006                                       HRTIMER_MODE_ABS);
2007                 hrtimer_init_sleeper(to, current);
2008                 hrtimer_set_expires(&to->timer, *time);
2009         }
2010
2011 retry:
2012         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
2013         if (unlikely(ret != 0))
2014                 goto out;
2015
2016 retry_private:
2017         hb = queue_lock(&q);
2018
2019         ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
2020         if (unlikely(ret)) {
2021                 switch (ret) {
2022                 case 1:
2023                         /* We got the lock. */
2024                         ret = 0;
2025                         goto out_unlock_put_key;
2026                 case -EFAULT:
2027                         goto uaddr_faulted;
2028                 case -EAGAIN:
2029                         /*
2030                          * Task is exiting and we just wait for the
2031                          * exit to complete.
2032                          */
2033                         queue_unlock(&q, hb);
2034                         put_futex_key(&q.key);
2035                         cond_resched();
2036                         goto retry;
2037                 default:
2038                         goto out_unlock_put_key;
2039                 }
2040         }
2041
2042         /*
2043          * Only actually queue now that the atomic ops are done:
2044          */
2045         queue_me(&q, hb);
2046
2047         WARN_ON(!q.pi_state);
2048         /*
2049          * Block on the PI mutex:
2050          */
2051         if (!trylock)
2052                 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
2053         else {
2054                 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2055                 /* Fixup the trylock return value: */
2056                 ret = ret ? 0 : -EWOULDBLOCK;
2057         }
2058
2059         spin_lock(q.lock_ptr);
2060         /*
2061          * Fixup the pi_state owner and possibly acquire the lock if we
2062          * haven't already.
2063          */
2064         res = fixup_owner(uaddr, &q, !ret);
2065         /*
2066          * If fixup_owner() returned an error, propagate that.  If it acquired
2067          * the lock, clear our -ETIMEDOUT or -EINTR.
2068          */
2069         if (res)
2070                 ret = (res < 0) ? res : 0;
2071
2072         /*
2073          * If fixup_owner() faulted and was unable to handle the fault, unlock
2074          * it and return the fault to userspace.
2075          */
2076         if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
2077                 rt_mutex_unlock(&q.pi_state->pi_mutex);
2078
2079         /* Unqueue and drop the lock */
2080         unqueue_me_pi(&q);
2081
2082         goto out_put_key;
2083
2084 out_unlock_put_key:
2085         queue_unlock(&q, hb);
2086
2087 out_put_key:
2088         put_futex_key(&q.key);
2089 out:
2090         if (to)
2091                 destroy_hrtimer_on_stack(&to->timer);
2092         return ret != -EINTR ? ret : -ERESTARTNOINTR;
2093
2094 uaddr_faulted:
2095         queue_unlock(&q, hb);
2096
2097         ret = fault_in_user_writeable(uaddr);
2098         if (ret)
2099                 goto out_put_key;
2100
2101         if (!(flags & FLAGS_SHARED))
2102                 goto retry_private;
2103
2104         put_futex_key(&q.key);
2105         goto retry;
2106 }
2107
2108 /*
2109  * Userspace attempted a TID -> 0 atomic transition, and failed.
2110  * This is the in-kernel slowpath: we look up the PI state (if any),
2111  * and do the rt-mutex unlock.
2112  */
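/*
 * Matching userspace unlock fast path (a hedged sketch, hypothetical names):
 * when the TID -> 0 cmpxchg fails because FUTEX_WAITERS is set, userspace
 * calls in here so the kernel can hand the lock to the top waiter.
 *
 *   uint32_t expected = my_tid;
 *   if (!atomic_compare_exchange_strong(&lock_word, &expected, 0))
 *           syscall(SYS_futex, &lock_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 */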
2113 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2114 {
2115         struct futex_hash_bucket *hb;
2116         struct futex_q *this, *next;
2117         struct plist_head *head;
2118         union futex_key key = FUTEX_KEY_INIT;
2119         u32 uval, vpid = task_pid_vnr(current);
2120         int ret;
2121
2122 retry:
2123         if (get_user(uval, uaddr))
2124                 return -EFAULT;
2125         /*
2126          * We release only a lock we actually own:
2127          */
2128         if ((uval & FUTEX_TID_MASK) != vpid)
2129                 return -EPERM;
2130
2131         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2132         if (unlikely(ret != 0))
2133                 goto out;
2134
2135         hb = hash_futex(&key);
2136         spin_lock(&hb->lock);
2137
2138         /*
2139          * To avoid races, try to do the TID -> 0 atomic transition
2140          * again. If it succeeds then we can return without waking
2141          * anyone else up:
2142          */
2143         if (!(uval & FUTEX_OWNER_DIED) &&
2144             cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2145                 goto pi_faulted;
2146         /*
2147          * Rare case: we managed to release the lock atomically,
2148          * no need to wake anyone else up:
2149          */
2150         if (unlikely(uval == vpid))
2151                 goto out_unlock;
2152
2153         /*
2154          * Ok, other tasks may need to be woken up - check waiters
2155          * and do the wakeup if necessary:
2156          */
2157         head = &hb->chain;
2158
2159         plist_for_each_entry_safe(this, next, head, list) {
2160                 if (!match_futex(&this->key, &key))
2161                         continue;
2162                 ret = wake_futex_pi(uaddr, uval, this);
2163                 /*
2164                  * The atomic access to the futex value
2165                  * generated a pagefault, so retry the
2166                  * user-access and the wakeup:
2167                  */
2168                 if (ret == -EFAULT)
2169                         goto pi_faulted;
2170                 goto out_unlock;
2171         }
2172         /*
2173          * No waiters - kernel unlocks the futex:
2174          */
2175         if (!(uval & FUTEX_OWNER_DIED)) {
2176                 ret = unlock_futex_pi(uaddr, uval);
2177                 if (ret == -EFAULT)
2178                         goto pi_faulted;
2179         }
2180
2181 out_unlock:
2182         spin_unlock(&hb->lock);
2183         put_futex_key(&key);
2184
2185 out:
2186         return ret;
2187
2188 pi_faulted:
2189         spin_unlock(&hb->lock);
2190         put_futex_key(&key);
2191
2192         ret = fault_in_user_writeable(uaddr);
2193         if (!ret)
2194                 goto retry;
2195
2196         return ret;
2197 }
2198
2199 /**
2200  * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2201  * @hb:         the hash_bucket futex_q was originally enqueued on
2202  * @q:          the futex_q woken while waiting to be requeued
2203  * @key2:       the futex_key of the requeue target futex
2204  * @timeout:    the timeout associated with the wait (NULL if none)
2205  *
2206  * Detect if the task was woken on the initial futex as opposed to the requeue
2207  * target futex.  If so, determine if it was a timeout or a signal that caused
2208  * the wakeup and return the appropriate error code to the caller.  Must be
2209  * called with the hb lock held.
2210  *
2211  * Return:
2212  *  0 - no early wakeup detected;
2213  * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2214  */
2215 static inline
2216 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2217                                    struct futex_q *q, union futex_key *key2,
2218                                    struct hrtimer_sleeper *timeout)
2219 {
2220         int ret = 0;
2221
2222         /*
2223          * With the hb lock held, we avoid races while we process the wakeup.
2224          * We only need to hold hb (and not hb2) to ensure atomicity as the
2225          * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2226          * It can't be requeued from uaddr2 to something else since we don't
2227          * support a PI aware source futex for requeue.
2228          */
2229         if (!match_futex(&q->key, key2)) {
2230                 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2231                 /*
2232                  * We were woken prior to requeue by a timeout or a signal.
2233                  * Unqueue the futex_q and determine which it was.
2234                  */
2235                 plist_del(&q->list, &hb->chain);
2236
2237                 /* Handle spurious wakeups gracefully */
2238                 ret = -EWOULDBLOCK;
2239                 if (timeout && !timeout->task)
2240                         ret = -ETIMEDOUT;
2241                 else if (signal_pending(current))
2242                         ret = -ERESTARTNOINTR;
2243         }
2244         return ret;
2245 }
2246
2247 /**
2248  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2249  * @uaddr:      the futex we initially wait on (non-pi)
2250  * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2251  *              the same type, no requeueing from private to shared, etc.
2252  * @val:        the expected value of uaddr
2253  * @abs_time:   absolute timeout
2254  * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
2255  * @uaddr2:     the pi futex we will take prior to returning to user-space
2256  *
2257  * The caller will wait on uaddr and will be requeued by futex_requeue() to
2258  * uaddr2, which must be PI-aware and distinct from uaddr.  Normal wakeup will wake
2259  * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2260  * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
2261  * without one, the pi logic would not know which task to boost/deboost, if
2262  * there was a need to.
2263  *
2264  * We call schedule() in futex_wait_queue_me() when we enqueue and return there
2265  * via the following:
2266  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2267  * 2) wakeup on uaddr2 after a requeue
2268  * 3) signal
2269  * 4) timeout
2270  *
2271  * If 3, cleanup and return -ERESTARTNOINTR.
2272  *
2273  * If 2, we may then block on trying to take the rt_mutex and return via:
2274  * 5) successful lock
2275  * 6) signal
2276  * 7) timeout
2277  * 8) other lock acquisition failure
2278  *
2279  * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2280  *
2281  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2282  *
2283  * Return:
2284  *  0 - On success;
2285  * <0 - On error
2286  */
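/*
 * Illustrative userspace pairing for this operation (a hedged sketch with
 * hypothetical names, not part of this file), roughly the shape a PI-aware
 * condition variable would use:
 *
 *   // waiter: wait on &cond, then be requeued onto and acquire &mutex
 *   syscall(SYS_futex, &cond, FUTEX_WAIT_REQUEUE_PI, cond_val,
 *           NULL, &mutex, 0);
 *
 *   // signaller: wake one waiter with &mutex handed to it and requeue
 *   // the rest onto &mutex
 *   syscall(SYS_futex, &cond, FUTEX_CMP_REQUEUE_PI, 1,
 *           (void *)(unsigned long)INT_MAX, &mutex, cond_val);
 *
 * For FUTEX_CMP_REQUEUE_PI the timeout argument slot carries nr_requeue
 * (INT_MAX here, i.e. a broadcast) and val3 carries the expected value of
 * &cond; see the argument plumbing in do_futex() and sys_futex() below.
 */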
2287 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2288                                  u32 val, ktime_t *abs_time, u32 bitset,
2289                                  u32 __user *uaddr2)
2290 {
2291         struct hrtimer_sleeper timeout, *to = NULL;
2292         struct rt_mutex_waiter rt_waiter;
2293         struct rt_mutex *pi_mutex = NULL;
2294         struct futex_hash_bucket *hb;
2295         union futex_key key2 = FUTEX_KEY_INIT;
2296         struct futex_q q = futex_q_init;
2297         int res, ret;
2298
2299         if (uaddr == uaddr2)
2300                 return -EINVAL;
2301
2302         if (!bitset)
2303                 return -EINVAL;
2304
2305         if (abs_time) {
2306                 to = &timeout;
2307                 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2308                                       CLOCK_REALTIME : CLOCK_MONOTONIC,
2309                                       HRTIMER_MODE_ABS);
2310                 hrtimer_init_sleeper(to, current);
2311                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2312                                              current->timer_slack_ns);
2313         }
2314
2315         /*
2316          * The waiter is allocated on our stack, manipulated by the requeue
2317          * code while we sleep on uaddr.
2318          */
2319         debug_rt_mutex_init_waiter(&rt_waiter);
2320         rt_waiter.task = NULL;
2321
2322         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2323         if (unlikely(ret != 0))
2324                 goto out;
2325
2326         q.bitset = bitset;
2327         q.rt_waiter = &rt_waiter;
2328         q.requeue_pi_key = &key2;
2329
2330         /*
2331          * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2332          * count.
2333          */
2334         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2335         if (ret)
2336                 goto out_key2;
2337
2338         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2339         futex_wait_queue_me(hb, &q, to);
2340
2341         spin_lock(&hb->lock);
2342         ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2343         spin_unlock(&hb->lock);
2344         if (ret)
2345                 goto out_put_keys;
2346
2347         /*
2348          * In order for us to be here, we know our q.key == key2, and since
2349          * we took the hb->lock above, we also know that futex_requeue() has
2350          * completed and we no longer have to concern ourselves with a wakeup
2351          * race with the atomic proxy lock acquisition by the requeue code. The
2352          * futex_requeue dropped our key1 reference and incremented our key2
2353          * reference count.
2354          */
2355
2356         /* Check if the requeue code acquired the second futex for us. */
2357         if (!q.rt_waiter) {
2358                 /*
2359                  * Got the lock. We might not be the anticipated owner if we
2360                  * did a lock-steal - fix up the PI-state in that case.
2361                  */
2362                 if (q.pi_state && (q.pi_state->owner != current)) {
2363                         spin_lock(q.lock_ptr);
2364                         ret = fixup_pi_state_owner(uaddr2, &q, current);
2365                         spin_unlock(q.lock_ptr);
2366                 }
2367         } else {
2368                 /*
2369                  * We have been woken up by futex_unlock_pi(), a timeout, or a
2370                  * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
2371                  * the pi_state.
2372                  */
2373                 WARN_ON(!q.pi_state);
2374                 pi_mutex = &q.pi_state->pi_mutex;
2375                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2376                 debug_rt_mutex_free_waiter(&rt_waiter);
2377
2378                 spin_lock(q.lock_ptr);
2379                 /*
2380                  * Fixup the pi_state owner and possibly acquire the lock if we
2381                  * haven't already.
2382                  */
2383                 res = fixup_owner(uaddr2, &q, !ret);
2384                 /*
2385          * If fixup_owner() returned an error, propagate that.  If it
2386                  * acquired the lock, clear -ETIMEDOUT or -EINTR.
2387                  */
2388                 if (res)
2389                         ret = (res < 0) ? res : 0;
2390
2391                 /* Unqueue and drop the lock. */
2392                 unqueue_me_pi(&q);
2393         }
2394
2395         /*
2396          * If fixup_pi_state_owner() faulted and was unable to handle the
2397          * fault, unlock the rt_mutex and return the fault to userspace.
2398          */
2399         if (ret == -EFAULT) {
2400                 if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2401                         rt_mutex_unlock(pi_mutex);
2402         } else if (ret == -EINTR) {
2403                 /*
2404                  * We've already been requeued, but cannot restart by calling
2405                  * futex_lock_pi() directly. We could restart this syscall, but
2406                  * it would detect that the user space "val" changed and return
2407                  * -EWOULDBLOCK.  Save the overhead of the restart and return
2408                  * -EWOULDBLOCK directly.
2409                  */
2410                 ret = -EWOULDBLOCK;
2411         }
2412
2413 out_put_keys:
2414         put_futex_key(&q.key);
2415 out_key2:
2416         put_futex_key(&key2);
2417
2418 out:
2419         if (to) {
2420                 hrtimer_cancel(&to->timer);
2421                 destroy_hrtimer_on_stack(&to->timer);
2422         }
2423         return ret;
2424 }
2425
2426 /*
2427  * Support for robust futexes: the kernel cleans up held futexes at
2428  * thread exit time.
2429  *
2430  * Implementation: user-space maintains a per-thread list of locks it
2431  * is holding. Upon do_exit(), the kernel carefully walks this list,
2432  * and marks all locks that are owned by this thread with the
2433  * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
2434  * always manipulated with the lock held, so the list is private and
2435  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
2436  * field, to allow the kernel to clean up if the thread dies after
2437  * acquiring the lock, but just before it could have added itself to
2438  * the list. There can only be one such pending lock.
2439  */
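/*
 * Illustrative userspace side of this contract (a hedged sketch with
 * hypothetical names, not part of this file): each thread registers one
 * list head and then links every robust lock it holds into that list.
 *
 *   static __thread struct robust_list_head head;
 *
 *   head.list.next = &head.list;    // empty, self-referencing list
 *   head.futex_offset = offsetof(struct my_robust_lock, futex_word);
 *   head.list_op_pending = NULL;
 *   syscall(SYS_set_robust_list, &head, sizeof(head));
 *
 * struct my_robust_lock is a hypothetical userspace type embedding a
 * struct robust_list node and the 32-bit futex word; futex_offset lets
 * exit_robust_list() below find the futex word from each list entry.
 */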
2440
2441 /**
2442  * sys_set_robust_list() - Set the robust-futex list head of a task
2443  * @head:       pointer to the list-head
2444  * @len:        length of the list-head, as userspace expects
2445  */
2446 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2447                 size_t, len)
2448 {
2449         if (!futex_cmpxchg_enabled)
2450                 return -ENOSYS;
2451         /*
2452          * The kernel knows only one size for now:
2453          */
2454         if (unlikely(len != sizeof(*head)))
2455                 return -EINVAL;
2456
2457         current->robust_list = head;
2458
2459         return 0;
2460 }
2461
2462 /**
2463  * sys_get_robust_list() - Get the robust-futex list head of a task
2464  * @pid:        pid of the process [zero for current task]
2465  * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
2466  * @len_ptr:    pointer to a length field, the kernel fills in the header size
2467  */
2468 SYSCALL_DEFINE3(get_robust_list, int, pid,
2469                 struct robust_list_head __user * __user *, head_ptr,
2470                 size_t __user *, len_ptr)
2471 {
2472         struct robust_list_head __user *head;
2473         unsigned long ret;
2474         struct task_struct *p;
2475
2476         if (!futex_cmpxchg_enabled)
2477                 return -ENOSYS;
2478
2479         rcu_read_lock();
2480
2481         ret = -ESRCH;
2482         if (!pid)
2483                 p = current;
2484         else {
2485                 p = find_task_by_vpid(pid);
2486                 if (!p)
2487                         goto err_unlock;
2488         }
2489
2490         ret = -EPERM;
2491         if (!ptrace_may_access(p, PTRACE_MODE_READ))
2492                 goto err_unlock;
2493
2494         head = p->robust_list;
2495         rcu_read_unlock();
2496
2497         if (put_user(sizeof(*head), len_ptr))
2498                 return -EFAULT;
2499         return put_user(head, head_ptr);
2500
2501 err_unlock:
2502         rcu_read_unlock();
2503
2504         return ret;
2505 }
2506
2507 /*
2508  * Process a futex-list entry, check whether it's owned by the
2509  * dying task, and do notification if so:
2510  */
2511 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2512 {
2513         u32 uval, uninitialized_var(nval), mval;
2514
2515 retry:
2516         if (get_user(uval, uaddr))
2517                 return -1;
2518
2519         if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
2520                 /*
2521                  * Ok, this dying thread is truly holding a futex
2522                  * of interest. Set the OWNER_DIED bit atomically
2523                  * via cmpxchg, and if the value had FUTEX_WAITERS
2524                  * set, wake up a waiter (if any). (We have to do a
2525                  * futex_wake() even if OWNER_DIED is already set -
2526                  * to handle the rare but possible case of recursive
2527                  * thread-death.) The rest of the cleanup is done in
2528                  * userspace.
2529                  */
2530                 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2531                 /*
2532                  * We are not holding a lock here, but we want to have
2533                  * the pagefault_disable/enable() protection because
2534                  * we want to handle the fault gracefully. If the
2535                  * access fails we try to fault in the futex with R/W
2536                  * verification via get_user_pages. get_user() above
2537                  * does not guarantee R/W access. If that fails we
2538                  * give up and leave the futex locked.
2539                  */
2540                 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2541                         if (fault_in_user_writeable(uaddr))
2542                                 return -1;
2543                         goto retry;
2544                 }
2545                 if (nval != uval)
2546                         goto retry;
2547
2548                 /*
2549                  * Wake robust non-PI futexes here. The wakeup of
2550                  * PI futexes happens in exit_pi_state():
2551                  */
2552                 if (!pi && (uval & FUTEX_WAITERS))
2553                         futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
2554         }
2555         return 0;
2556 }
2557
2558 /*
2559  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
2560  */
2561 static inline int fetch_robust_entry(struct robust_list __user **entry,
2562                                      struct robust_list __user * __user *head,
2563                                      unsigned int *pi)
2564 {
2565         unsigned long uentry;
2566
2567         if (get_user(uentry, (unsigned long __user *)head))
2568                 return -EFAULT;
2569
2570         *entry = (void __user *)(uentry & ~1UL);
2571         *pi = uentry & 1;
2572
2573         return 0;
2574 }
2575
2576 /*
2577  * Walk curr->robust_list (very carefully, it's a userspace list!)
2578  * and mark any locks found there dead, and notify any waiters.
2579  *
2580  * We silently return on any sign of list-walking problem.
2581  */
2582 void exit_robust_list(struct task_struct *curr)
2583 {
2584         struct robust_list_head __user *head = curr->robust_list;
2585         struct robust_list __user *entry, *next_entry, *pending;
2586         unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2587         unsigned int uninitialized_var(next_pi);
2588         unsigned long futex_offset;
2589         int rc;
2590
2591         if (!futex_cmpxchg_enabled)
2592                 return;
2593
2594         /*
2595          * Fetch the list head (which was registered earlier, via
2596          * sys_set_robust_list()):
2597          */
2598         if (fetch_robust_entry(&entry, &head->list.next, &pi))
2599                 return;
2600         /*
2601          * Fetch the relative futex offset:
2602          */
2603         if (get_user(futex_offset, &head->futex_offset))
2604                 return;
2605         /*
2606          * Fetch any possibly pending lock-add first, and handle it
2607          * if it exists:
2608          */
2609         if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
2610                 return;
2611
2612         next_entry = NULL;      /* avoid warning with gcc */
2613         while (entry != &head->list) {
2614                 /*
2615                  * Fetch the next entry in the list before calling
2616                  * handle_futex_death:
2617                  */
2618                 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
2619                 /*
2620                  * A pending lock might already be on the list, so
2621                  * don't process it twice:
2622                  */
2623                 if (entry != pending)
2624                         if (handle_futex_death((void __user *)entry + futex_offset,
2625                                                 curr, pi))
2626                                 return;
2627                 if (rc)
2628                         return;
2629                 entry = next_entry;
2630                 pi = next_pi;
2631                 /*
2632                  * Avoid excessively long or circular lists:
2633                  */
2634                 if (!--limit)
2635                         break;
2636
2637                 cond_resched();
2638         }
2639
2640         if (pending)
2641                 handle_futex_death((void __user *)pending + futex_offset,
2642                                    curr, pip);
2643 }
2644
2645 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2646                 u32 __user *uaddr2, u32 val2, u32 val3)
2647 {
2648         int cmd = op & FUTEX_CMD_MASK;
2649         unsigned int flags = 0;
2650
2651         if (!(op & FUTEX_PRIVATE_FLAG))
2652                 flags |= FLAGS_SHARED;
2653
2654         if (op & FUTEX_CLOCK_REALTIME) {
2655                 flags |= FLAGS_CLOCKRT;
2656                 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2657                         return -ENOSYS;
2658         }
2659
2660         switch (cmd) {
2661         case FUTEX_LOCK_PI:
2662         case FUTEX_UNLOCK_PI:
2663         case FUTEX_TRYLOCK_PI:
2664         case FUTEX_WAIT_REQUEUE_PI:
2665         case FUTEX_CMP_REQUEUE_PI:
2666                 if (!futex_cmpxchg_enabled)
2667                         return -ENOSYS;
2668         }
2669
2670         switch (cmd) {
2671         case FUTEX_WAIT:
2672                 val3 = FUTEX_BITSET_MATCH_ANY;
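                /* fall through */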
2673         case FUTEX_WAIT_BITSET:
2674                 return futex_wait(uaddr, flags, val, timeout, val3);
2675         case FUTEX_WAKE:
2676                 val3 = FUTEX_BITSET_MATCH_ANY;
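                /* fall through */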
2677         case FUTEX_WAKE_BITSET:
2678                 return futex_wake(uaddr, flags, val, val3);
2679         case FUTEX_REQUEUE:
2680                 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2681         case FUTEX_CMP_REQUEUE:
2682                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2683         case FUTEX_WAKE_OP:
2684                 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2685         case FUTEX_LOCK_PI:
2686                 return futex_lock_pi(uaddr, flags, val, timeout, 0);
2687         case FUTEX_UNLOCK_PI:
2688                 return futex_unlock_pi(uaddr, flags);
2689         case FUTEX_TRYLOCK_PI:
2690                 return futex_lock_pi(uaddr, flags, 0, timeout, 1);
2691         case FUTEX_WAIT_REQUEUE_PI:
2692                 val3 = FUTEX_BITSET_MATCH_ANY;
2693                 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2694                                              uaddr2);
2695         case FUTEX_CMP_REQUEUE_PI:
2696                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2697         }
2698         return -ENOSYS;
2699 }
2700
2701
2702 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2703                 struct timespec __user *, utime, u32 __user *, uaddr2,
2704                 u32, val3)
2705 {
2706         struct timespec ts;
2707         ktime_t t, *tp = NULL;
2708         u32 val2 = 0;
2709         int cmd = op & FUTEX_CMD_MASK;
2710
2711         if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2712                       cmd == FUTEX_WAIT_BITSET ||
2713                       cmd == FUTEX_WAIT_REQUEUE_PI)) {
2714                 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2715                         return -EFAULT;
2716                 if (!timespec_valid(&ts))
2717                         return -EINVAL;
2718
2719                 t = timespec_to_ktime(ts);
2720                 if (cmd == FUTEX_WAIT)
2721                         t = ktime_add_safe(ktime_get(), t);
2722                 tp = &t;
2723         }
2724         /*
2725          * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
2726          * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2727          */
2728         if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
2729             cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
2730                 val2 = (u32) (unsigned long) utime;
2731
2732         return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2733 }
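/*
 * Illustrative calls showing the argument overloading handled above (a
 * hedged sketch with hypothetical names, not part of this file; assumes
 * <limits.h>, <linux/futex.h>, <sys/syscall.h> and <unistd.h>):
 *
 *   // FUTEX_WAIT: utime is a *relative* timespec, converted above into an
 *   // absolute expiry with ktime_add_safe()
 *   struct timespec rel = { .tv_sec = 1 };
 *   syscall(SYS_futex, &futex_var, FUTEX_WAIT, expected, &rel, NULL, 0);
 *
 *   // FUTEX_CMP_REQUEUE: no timeout; the utime slot carries nr_requeue
 *   syscall(SYS_futex, &futex_var, FUTEX_CMP_REQUEUE, 1,
 *           (void *)(unsigned long)INT_MAX, &target_var, expected);
 */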
2734
2735 static void __init futex_detect_cmpxchg(void)
2736 {
2737 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
2738         u32 curval;
2739
2740         /*
2741          * This will fail and we want it. Some arch implementations do
2742          * runtime detection of the futex_atomic_cmpxchg_inatomic()
2743          * functionality. We want to know that before we call in any
2744          * of the complex code paths. Also we want to prevent
2745          * registration of robust lists in that case. NULL is
2746          * guaranteed to fault and we get -EFAULT on functional
2747          * implementation, the non-functional ones will return
2748          * -ENOSYS.
2749          */
2750         if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2751                 futex_cmpxchg_enabled = 1;
2752 #endif
2753 }
2754
2755 static int __init futex_init(void)
2756 {
2757         int i;
2758
2759         futex_detect_cmpxchg();
2760
2761         for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2762                 plist_head_init(&futex_queues[i].chain);
2763                 spin_lock_init(&futex_queues[i].lock);
2764         }
2765
2766         return 0;
2767 }
2768 __initcall(futex_init);