X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=fs%2Feventpoll.c;h=4c999ce7e73add0ef74b91177af53b3e95422738;hb=649f8c0d9aa08d8c9a04cc1a1412cdf2a4836bca;hp=db10e00c971a71add3be6b140a40b4f9d68580f1;hpb=91dc5646819e7fd4a60f5c6dd45030478dadf77b;p=firefly-linux-kernel-4.4.55.git diff --git a/fs/eventpoll.c b/fs/eventpoll.c index db10e00c971a..4c999ce7e73a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -42,6 +42,7 @@ #include #include #include +#include /* * LOCKING: @@ -134,8 +135,12 @@ struct nested_calls { * of these on a server and we do not want this to take another cache line. */ struct epitem { - /* RB tree node used to link this structure to the eventpoll RB tree */ - struct rb_node rbn; + union { + /* RB tree node links this structure to the eventpoll RB tree */ + struct rb_node rbn; + /* Used to free the struct epitem */ + struct rcu_head rcu; + }; /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; @@ -289,7 +294,7 @@ static LIST_HEAD(tfile_check_list); static long zero; static long long_max = LONG_MAX; -ctl_table epoll_table[] = { +struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, @@ -581,14 +586,14 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) * @sproc: Pointer to the scan callback. * @priv: Private opaque data passed to the @sproc callback. * @depth: The current depth of recursive f_op->poll calls. + * @ep_locked: caller already holds ep->mtx * * Returns: The same integer error code returned by the @sproc callback. */ static int ep_scan_ready_list(struct eventpoll *ep, int (*sproc)(struct eventpoll *, struct list_head *, void *), - void *priv, - int depth) + void *priv, int depth, bool ep_locked) { int error, pwake = 0; unsigned long flags; @@ -599,7 +604,9 @@ static int ep_scan_ready_list(struct eventpoll *ep, * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ - mutex_lock_nested(&ep->mtx, depth); + + if (!ep_locked) + mutex_lock_nested(&ep->mtx, depth); /* * Steal the ready list, and re-init the original one to the @@ -663,7 +670,8 @@ static int ep_scan_ready_list(struct eventpoll *ep, } spin_unlock_irqrestore(&ep->lock, flags); - mutex_unlock(&ep->mtx); + if (!ep_locked) + mutex_unlock(&ep->mtx); /* We have to call this outside the lock */ if (pwake) @@ -672,6 +680,12 @@ static int ep_scan_ready_list(struct eventpoll *ep, return error; } +static void epi_rcu_free(struct rcu_head *head) +{ + struct epitem *epi = container_of(head, struct epitem, rcu); + kmem_cache_free(epi_cache, epi); +} + /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. @@ -693,8 +707,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); - if (ep_is_linked(&epi->fllink)) - list_del_init(&epi->fllink); + list_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); rb_erase(&epi->rbn, &ep->rbr); @@ -705,9 +718,14 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) spin_unlock_irqrestore(&ep->lock, flags); wakeup_source_unregister(ep_wakeup_source(epi)); - - /* At this point it is safe to free the eventpoll item */ - kmem_cache_free(epi_cache, epi); + /* + * At this point it is safe to free the eventpoll item. Use the union + * field epi->rcu, since we are trying to minimize the size of + * 'struct epitem'. The 'rbn' field is no longer in use. Protected by + * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make + * use of the rbn field. + */ + call_rcu(&epi->rcu, epi_rcu_free); atomic_long_dec(&ep->user->epoll_watches); @@ -740,6 +758,7 @@ static void ep_free(struct eventpoll *ep) epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); + cond_resched(); } /* @@ -754,6 +773,7 @@ static void ep_free(struct eventpoll *ep) while ((rbp = rb_first(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); + cond_resched(); } mutex_unlock(&ep->mtx); @@ -806,15 +826,34 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, return 0; } +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt); + +struct readyevents_arg { + struct eventpoll *ep; + bool locked; +}; + static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) { - return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1); + struct readyevents_arg *arg = priv; + + return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL, + call_nests + 1, arg->locked); } static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { int pollflags; struct eventpoll *ep = file->private_data; + struct readyevents_arg arg; + + /* + * During ep_insert() we already hold the ep->mtx for the tfile. + * Prevent re-aquisition. + */ + arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc); + arg.ep = ep; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); @@ -826,31 +865,28 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) * could re-enter here. */ pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, - ep_poll_readyevents_proc, ep, ep, current); + ep_poll_readyevents_proc, &arg, ep, current); return pollflags != -1 ? pollflags : 0; } #ifdef CONFIG_PROC_FS -static int ep_show_fdinfo(struct seq_file *m, struct file *f) +static void ep_show_fdinfo(struct seq_file *m, struct file *f) { struct eventpoll *ep = f->private_data; struct rb_node *rbp; - int ret = 0; mutex_lock(&ep->mtx); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); - ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n", - epi->ffd.fd, epi->event.events, - (long long)epi->event.data); - if (ret) + seq_printf(m, "tfd: %8d events: %8x data: %16llx\n", + epi->ffd.fd, epi->event.events, + (long long)epi->event.data); + if (seq_has_overflowed(m)) break; } mutex_unlock(&ep->mtx); - - return ret; } #endif @@ -871,9 +907,8 @@ static const struct file_operations eventpoll_fops = { */ void eventpoll_release_file(struct file *file) { - struct list_head *lsthead = &file->f_ep_links; struct eventpoll *ep; - struct epitem *epi; + struct epitem *epi, *next; /* * We don't want to get "file->f_lock" because it is not @@ -889,17 +924,12 @@ void eventpoll_release_file(struct file *file) * Besides, ep_remove() acquires the lock, so we can't hold it here. */ mutex_lock(&epmutex); - - while (!list_empty(lsthead)) { - epi = list_first_entry(lsthead, struct epitem, fllink); - + list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) { ep = epi->ep; - list_del_init(&epi->fllink); mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); mutex_unlock(&ep->mtx); } - mutex_unlock(&epmutex); } @@ -1137,7 +1167,9 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) struct file *child_file; struct epitem *epi; - list_for_each_entry(epi, &file->f_ep_links, fllink) { + /* CTL_DEL can remove links here, but that can't increase our count */ + rcu_read_lock(); + list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { child_file = epi->ep->file; if (is_file_epoll(child_file)) { if (list_empty(&child_file->f_ep_links)) { @@ -1159,6 +1191,7 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) "file is not an ep!\n"); } } + rcu_read_unlock(); return error; } @@ -1230,7 +1263,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi) * Must be called with "mtx" held. */ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, - struct file *tfile, int fd) + struct file *tfile, int fd, int full_check) { int error, revents, pwake = 0; unsigned long flags; @@ -1285,7 +1318,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* Add the current item to the list of active epoll hook for this file */ spin_lock(&tfile->f_lock); - list_add_tail(&epi->fllink, &tfile->f_ep_links); + list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); spin_unlock(&tfile->f_lock); /* @@ -1296,7 +1329,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* now check if we've created too many backpaths */ error = -EINVAL; - if (reverse_path_check()) + if (full_check && reverse_path_check()) goto error_remove_epi; /* We have to drop the new item inside our item list to keep track of it */ @@ -1326,8 +1359,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, error_remove_epi: spin_lock(&tfile->f_lock); - if (ep_is_linked(&epi->fllink)) - list_del_init(&epi->fllink); + list_del_rcu(&epi->fllink); spin_unlock(&tfile->f_lock); rb_erase(&epi->rbn, &ep->rbr); @@ -1520,7 +1552,7 @@ static int ep_send_events(struct eventpoll *ep, esed.maxevents = maxevents; esed.events = events; - return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0); + return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); } static inline struct timespec ep_set_mstimeout(long ms) @@ -1609,9 +1641,9 @@ fetch_events: spin_lock_irqsave(&ep->lock, flags); } - __remove_wait_queue(&ep->wq, &wait); - set_current_state(TASK_RUNNING); + __remove_wait_queue(&ep->wq, &wait); + __set_current_state(TASK_RUNNING); } check_events: /* Is it worth to try to dig for events ? */ @@ -1791,35 +1823,36 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { int error; - int did_lock_epmutex = 0; - struct file *file, *tfile; + int full_check = 0; + struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct epoll_event epds; + struct eventpoll *tep = NULL; error = -EFAULT; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) goto error_return; - /* Get the "struct file *" for the eventpoll file */ error = -EBADF; - file = fget(epfd); - if (!file) + f = fdget(epfd); + if (!f.file) goto error_return; /* Get the "struct file *" for the target file */ - tfile = fget(fd); - if (!tfile) + tf = fdget(fd); + if (!tf.file) goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; - if (!tfile->f_op || !tfile->f_op->poll) + if (!tf.file->f_op->poll) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ - ep_take_care_of_epollwakeup(&epds); + if (ep_op_has_event(op)) + ep_take_care_of_epollwakeup(&epds); /* * We have to check that the file structure underneath the file descriptor @@ -1827,14 +1860,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * adding an epoll file descriptor inside itself. */ error = -EINVAL; - if (file == tfile || !is_file_epoll(file)) + if (f.file == tf.file || !is_file_epoll(f.file)) goto error_tgt_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ - ep = file->private_data; + ep = f.file->private_data; /* * When we insert an epoll file descriptor, inside another epoll file @@ -1844,43 +1877,54 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * and hang them on the tfile_check_list, so we can check that we * haven't created too many possible wakeup paths. * - * We need to hold the epmutex across both ep_insert and ep_remove - * b/c we want to make sure we are looking at a coherent view of - * epoll network. + * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when + * the epoll file descriptor is attaching directly to a wakeup source, + * unless the epoll file descriptor is nested. The purpose of taking the + * 'epmutex' on add is to prevent complex toplogies such as loops and + * deep wakeup paths from forming in parallel through multiple + * EPOLL_CTL_ADD operations. */ - if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { - mutex_lock(&epmutex); - did_lock_epmutex = 1; - } + mutex_lock_nested(&ep->mtx, 0); if (op == EPOLL_CTL_ADD) { - if (is_file_epoll(tfile)) { - error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) { - clear_tfile_check_list(); - goto error_tgt_fput; + if (!list_empty(&f.file->f_ep_links) || + is_file_epoll(tf.file)) { + full_check = 1; + mutex_unlock(&ep->mtx); + mutex_lock(&epmutex); + if (is_file_epoll(tf.file)) { + error = -ELOOP; + if (ep_loop_check(ep, tf.file) != 0) { + clear_tfile_check_list(); + goto error_tgt_fput; + } + } else + list_add(&tf.file->f_tfile_llink, + &tfile_check_list); + mutex_lock_nested(&ep->mtx, 0); + if (is_file_epoll(tf.file)) { + tep = tf.file->private_data; + mutex_lock_nested(&tep->mtx, 1); } - } else - list_add(&tfile->f_tfile_llink, &tfile_check_list); + } } - mutex_lock_nested(&ep->mtx, 0); - /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ - epi = ep_find(ep, tfile, fd); + epi = ep_find(ep, tf.file, fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; - error = ep_insert(ep, &epds, tfile, fd); + error = ep_insert(ep, &epds, tf.file, fd, full_check); } else error = -EEXIST; - clear_tfile_check_list(); + if (full_check) + clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -1896,15 +1940,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, error = -ENOENT; break; } + if (tep != NULL) + mutex_unlock(&tep->mtx); mutex_unlock(&ep->mtx); error_tgt_fput: - if (did_lock_epmutex) + if (full_check) mutex_unlock(&epmutex); - fput(tfile); + fdput(tf); error_fput: - fput(file); + fdput(f); error_return: return error; @@ -1976,8 +2022,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, return -EINVAL; if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) return -EFAULT; - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + sigsaved = current->blocked; + set_current_blocked(&ksigmask); } error = sys_epoll_wait(epfd, events, maxevents, timeout); @@ -1994,7 +2040,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, sizeof(sigsaved)); set_restore_sigmask(); } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + set_current_blocked(&sigsaved); } return error; @@ -2021,8 +2067,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) return -EFAULT; sigset_from_compat(&ksigmask, &csigmask); - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + sigsaved = current->blocked; + set_current_blocked(&ksigmask); } err = sys_epoll_wait(epfd, events, maxevents, timeout); @@ -2039,7 +2085,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, sizeof(sigsaved)); set_restore_sigmask(); } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + set_current_blocked(&sigsaved); } return err;