vfs: fix dentry RCU to refcounting possibly sleeping dput()
authorLinus Torvalds <torvalds@linux-foundation.org>
Mon, 9 Sep 2013 01:13:49 +0000 (18:13 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Mon, 9 Sep 2013 01:13:49 +0000 (18:13 -0700)
This is the fix that the last two commits indirectly led up to - making
sure that we don't call dput() in a bad context on the dentries we've
looked up in RCU mode after the sequence count validation fails.

This basically expands d_rcu_to_refcount() into the callers, and then
fixes the callers to delay the dput() in the failure case until _after_
we've dropped all locks and are no longer in an RCU-locked region.

The case of 'complete_walk()' was trivial, since its failure case did
the unlock_rcu_walk() directly after the call to d_rcu_to_refcount(),
and as such that is just a pure expansion of the function with a trivial
movement of the resulting dput() to after 'unlock_rcu_walk()'.

In contrast, the unlazy_walk() case was much more complicated, because
not only does convert two different dentries from RCU to be reference
counted, but it used to not call unlock_rcu_walk() at all, and instead
just returned an error and let the caller clean everything up in
"terminate_walk()".

Happily, one of the dentries in question (called "parent" inside
unlazy_walk()) is the dentry of "nd->path", which terminate_walk() wants
a refcount to anyway for the non-RCU case.

So what the new and improved unlazy_walk() does is to first turn that
dentry into a refcounted one, and once that is set up, the error cases
can continue to use the terminate_walk() helper for cleanup, but for the
non-RCU case.  Which makes it possible to drop out of RCU mode if we
actually hit the sequence number failure case.

Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/namei.c

index cc4bcfaa8624172ff82b13789c871b470de78d3c..56e4f4d537d0589912bea3ee629390e894842336 100644 (file)
@@ -494,37 +494,6 @@ static inline void unlock_rcu_walk(void)
        br_read_unlock(&vfsmount_lock);
 }
 
-/*
- * When we move over from the RCU domain to properly refcounted
- * long-lived dentries, we need to check the sequence numbers
- * we got before lookup very carefully.
- *
- * We cannot blindly increment a dentry refcount - even if it
- * is not locked - if it is zero, because it may have gone
- * through the final d_kill() logic already.
- *
- * So for a zero refcount, we need to get the spinlock (which is
- * safe even for a dead dentry because the de-allocation is
- * RCU-delayed), and check the sequence count under the lock.
- *
- * Once we have checked the sequence count, we know it is live,
- * and since we hold the spinlock it cannot die from under us.
- *
- * In contrast, if the reference count wasn't zero, we can just
- * increment the lockref without having to take the spinlock.
- * Even if the sequence number ends up being stale, we haven't
- * gone through the final dput() and killed the dentry yet.
- */
-static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq)
-{
-       if (likely(lockref_get_not_dead(&dentry->d_lockref))) {
-               if (!read_seqcount_retry(validate, seq))
-                               return 0;
-               dput(dentry);
-       }
-       return -ECHILD;
-}
-
 /**
  * unlazy_walk - try to switch to ref-walk mode.
  * @nd: nameidata pathwalk data
@@ -539,16 +508,29 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 {
        struct fs_struct *fs = current->fs;
        struct dentry *parent = nd->path.dentry;
-       int want_root = 0;
 
        BUG_ON(!(nd->flags & LOOKUP_RCU));
-       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-               want_root = 1;
-               spin_lock(&fs->lock);
-               if (nd->root.mnt != fs->root.mnt ||
-                               nd->root.dentry != fs->root.dentry)
-                       goto err_root;
-       }
+
+       /*
+        * Get a reference to the parent first: we're
+        * going to make "path_put(nd->path)" valid in
+        * non-RCU context for "terminate_walk()".
+        *
+        * If this doesn't work, return immediately with
+        * RCU walking still active (and then we will do
+        * the RCU walk cleanup in terminate_walk()).
+        */
+       if (!lockref_get_not_dead(&parent->d_lockref))
+               return -ECHILD;
+
+       /*
+        * After the mntget(), we terminate_walk() will do
+        * the right thing for non-RCU mode, and all our
+        * subsequent exit cases should unlock_rcu_walk()
+        * before returning.
+        */
+       mntget(nd->path.mnt);
+       nd->flags &= ~LOOKUP_RCU;
 
        /*
         * For a negative lookup, the lookup sequence point is the parents
@@ -562,30 +544,39 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
         * be valid if the child sequence number is still valid.
         */
        if (!dentry) {
-               if (d_rcu_to_refcount(parent, &parent->d_seq, nd->seq) < 0)
-                       goto err_root;
+               if (read_seqcount_retry(&parent->d_seq, nd->seq))
+                       goto out;
                BUG_ON(nd->inode != parent->d_inode);
        } else {
-               if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0)
-                       goto err_root;
-               if (d_rcu_to_refcount(parent, &dentry->d_seq, nd->seq) < 0)
-                       goto err_parent;
+               if (!lockref_get_not_dead(&dentry->d_lockref))
+                       goto out;
+               if (read_seqcount_retry(&dentry->d_seq, nd->seq))
+                       goto drop_dentry;
        }
-       if (want_root) {
+
+       /*
+        * Sequence counts matched. Now make sure that the root is
+        * still valid and get it if required.
+        */
+       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+               spin_lock(&fs->lock);
+               if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
+                       goto unlock_and_drop_dentry;
                path_get(&nd->root);
                spin_unlock(&fs->lock);
        }
-       mntget(nd->path.mnt);
 
        unlock_rcu_walk();
-       nd->flags &= ~LOOKUP_RCU;
        return 0;
 
-err_parent:
+unlock_and_drop_dentry:
+       spin_unlock(&fs->lock);
+drop_dentry:
+       unlock_rcu_walk();
        dput(dentry);
-err_root:
-       if (want_root)
-               spin_unlock(&fs->lock);
+       return -ECHILD;
+out:
+       unlock_rcu_walk();
        return -ECHILD;
 }
 
@@ -614,10 +605,15 @@ static int complete_walk(struct nameidata *nd)
                if (!(nd->flags & LOOKUP_ROOT))
                        nd->root.mnt = NULL;
 
-               if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) {
+               if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
                        unlock_rcu_walk();
                        return -ECHILD;
                }
+               if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
+                       unlock_rcu_walk();
+                       dput(dentry);
+                       return -ECHILD;
+               }
                mntget(nd->path.mnt);
                unlock_rcu_walk();
        }