change sb_writers to use percpu_rw_semaphore
authorOleg Nesterov <oleg@redhat.com>
Tue, 11 Aug 2015 15:05:04 +0000 (17:05 +0200)
committerOleg Nesterov <oleg@redhat.com>
Sat, 15 Aug 2015 11:52:13 +0000 (13:52 +0200)
We can remove everything from struct sb_writers except frozen
and add the array of percpu_rw_semaphore's instead.

This patch doesn't remove sb_writers->wait_unfrozen yet, we keep
it for get_super_thawed(). We will probably remove it later.

This change tries to address the following problems:

- Firstly, __sb_start_write() looks simply buggy. It does
  __sb_end_write() if it sees ->frozen, but if it migrates
  to another CPU before percpu_counter_dec(), sb_wait_write()
  can wrongly succeed if there is another task which holds
  the same "semaphore": sb_wait_write() can miss the result
  of the previous percpu_counter_inc() but see the result
  of this percpu_counter_dec().

- As Dave Hansen reports, it is suboptimal. The trivial
  microbenchmark that writes to a tmpfs file in a loop runs
  12% faster if we change this code to rely on RCU and kill
  the memory barriers.

- This code doesn't look simple. It would be better to rely
  on the generic locking code.

  According to Dave, this change adds the same performance
  improvement.

Note: with this change both freeze_super() and thaw_super() will do
synchronize_sched_expedited() 3 times. This is just ugly. But:

- This will be "fixed" by the rcu_sync changes we are going
  to merge. After that freeze_super()->percpu_down_write()
  will use synchronize_sched(), and thaw_super() won't use
  synchronize() at all.

  This doesn't need any changes in fs/super.c.

- Once we merge rcu_sync changes, we can also change super.c
  so that all wb_write->rw_sem's will share the single ->rss
  in struct sb_writes, then freeze_super() will need only one
  synchronize_sched().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Jan Kara <jack@suse.com>
fs/super.c
include/linux/fs.h

index c937bd7b4d33ba57da6156821b0b4413963f5d99..767b1e10f6ad3277918f0cd1fb69ddef1a24ea00 100644 (file)
@@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work)
        int i;
 
        for (i = 0; i < SB_FREEZE_LEVELS; i++)
-               percpu_counter_destroy(&s->s_writers.counter[i]);
+               percpu_free_rwsem(&s->s_writers.rw_sem[i]);
        kfree(s);
 }
 
@@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
                goto fail;
 
        for (i = 0; i < SB_FREEZE_LEVELS; i++) {
-               if (percpu_counter_init(&s->s_writers.counter[i], 0,
-                                       GFP_KERNEL) < 0)
+               if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
+                                       sb_writers_name[i],
+                                       &type->s_writers_key[i]))
                        goto fail;
-               lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
-                                &type->s_writers_key[i], 0);
        }
-       init_waitqueue_head(&s->s_writers.wait);
        init_waitqueue_head(&s->s_writers.wait_unfrozen);
        s->s_bdi = &noop_backing_dev_info;
        s->s_flags = flags;
@@ -1161,47 +1159,10 @@ out:
  */
 void __sb_end_write(struct super_block *sb, int level)
 {
-       percpu_counter_dec(&sb->s_writers.counter[level-1]);
-       /*
-        * Make sure s_writers are updated before we wake up waiters in
-        * freeze_super().
-        */
-       smp_mb();
-       if (waitqueue_active(&sb->s_writers.wait))
-               wake_up(&sb->s_writers.wait);
-       rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+       percpu_up_read(sb->s_writers.rw_sem + level-1);
 }
 EXPORT_SYMBOL(__sb_end_write);
 
-static int do_sb_start_write(struct super_block *sb, int level, bool wait,
-                               unsigned long ip)
-{
-       if (wait)
-               rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip);
-retry:
-       if (unlikely(sb->s_writers.frozen >= level)) {
-               if (!wait)
-                       return 0;
-               wait_event(sb->s_writers.wait_unfrozen,
-                          sb->s_writers.frozen < level);
-       }
-
-       percpu_counter_inc(&sb->s_writers.counter[level-1]);
-       /*
-        * Make sure counter is updated before we check for frozen.
-        * freeze_super() first sets frozen and then checks the counter.
-        */
-       smp_mb();
-       if (unlikely(sb->s_writers.frozen >= level)) {
-               __sb_end_write(sb, level);
-               goto retry;
-       }
-
-       if (!wait)
-               rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip);
-       return 1;
-}
-
 /*
  * This is an internal function, please use sb_start_{write,pagefault,intwrite}
  * instead.
@@ -1209,7 +1170,7 @@ retry:
 int __sb_start_write(struct super_block *sb, int level, bool wait)
 {
        bool force_trylock = false;
-       int ret;
+       int ret = 1;
 
 #ifdef CONFIG_LOCKDEP
        /*
@@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait)
                int i;
 
                for (i = 0; i < level - 1; i++)
-                       if (lock_is_held(&sb->s_writers.lock_map[i])) {
+                       if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
                                force_trylock = true;
                                break;
                        }
        }
 #endif
-       ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_);
+       if (wait && !force_trylock)
+               percpu_down_read(sb->s_writers.rw_sem + level-1);
+       else
+               ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
+
        WARN_ON(force_trylock & !ret);
        return ret;
 }
@@ -1243,15 +1208,11 @@ EXPORT_SYMBOL(__sb_start_write);
  * @level: type of writers we wait for (normal vs page fault)
  *
  * This function waits until there are no writers of given type to given file
- * system. Caller of this function should make sure there can be no new writers
- * of type @level before calling this function. Otherwise this function can
- * livelock.
+ * system.
  */
 static void sb_wait_write(struct super_block *sb, int level)
 {
-       s64 writers;
-
-       rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
+       percpu_down_write(sb->s_writers.rw_sem + level-1);
        /*
         * We are going to return to userspace and forget about this lock, the
         * ownership goes to the caller of thaw_super() which does unlock.
@@ -1262,24 +1223,18 @@ static void sb_wait_write(struct super_block *sb, int level)
         * this leads to lockdep false-positives, so currently we do the early
         * release right after acquire.
         */
-       rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
-
-       do {
-               DEFINE_WAIT(wait);
+       percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
+}
 
-               /*
-                * We use a barrier in prepare_to_wait() to separate setting
-                * of frozen and checking of the counter
-                */
-               prepare_to_wait(&sb->s_writers.wait, &wait,
-                               TASK_UNINTERRUPTIBLE);
+static void sb_freeze_unlock(struct super_block *sb)
+{
+       int level;
 
-               writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
-               if (writers)
-                       schedule();
+       for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+               percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
 
-               finish_wait(&sb->s_writers.wait, &wait);
-       } while (writers);
+       for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+               percpu_up_write(sb->s_writers.rw_sem + level);
 }
 
 /**
@@ -1338,20 +1293,14 @@ int freeze_super(struct super_block *sb)
                return 0;
        }
 
-       /* From now on, no new normal writers can start */
        sb->s_writers.frozen = SB_FREEZE_WRITE;
-       smp_wmb();
-
        /* Release s_umount to preserve sb_start_write -> s_umount ordering */
        up_write(&sb->s_umount);
-
        sb_wait_write(sb, SB_FREEZE_WRITE);
+       down_write(&sb->s_umount);
 
        /* Now we go and block page faults... */
-       down_write(&sb->s_umount);
        sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
-       smp_wmb();
-
        sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
 
        /* All writers are done so after syncing there won't be dirty data */
@@ -1359,7 +1308,6 @@ int freeze_super(struct super_block *sb)
 
        /* Now wait for internal filesystem counter */
        sb->s_writers.frozen = SB_FREEZE_FS;
-       smp_wmb();
        sb_wait_write(sb, SB_FREEZE_FS);
 
        if (sb->s_op->freeze_fs) {
@@ -1368,7 +1316,7 @@ int freeze_super(struct super_block *sb)
                        printk(KERN_ERR
                                "VFS:Filesystem freeze failed\n");
                        sb->s_writers.frozen = SB_UNFROZEN;
-                       smp_wmb();
+                       sb_freeze_unlock(sb);
                        wake_up(&sb->s_writers.wait_unfrozen);
                        deactivate_locked_super(sb);
                        return ret;
@@ -1400,8 +1348,10 @@ int thaw_super(struct super_block *sb)
                return -EINVAL;
        }
 
-       if (sb->s_flags & MS_RDONLY)
+       if (sb->s_flags & MS_RDONLY) {
+               sb->s_writers.frozen = SB_UNFROZEN;
                goto out;
+       }
 
        if (sb->s_op->unfreeze_fs) {
                error = sb->s_op->unfreeze_fs(sb);
@@ -1413,12 +1363,11 @@ int thaw_super(struct super_block *sb)
                }
        }
 
-out:
        sb->s_writers.frozen = SB_UNFROZEN;
-       smp_wmb();
+       sb_freeze_unlock(sb);
+out:
        wake_up(&sb->s_writers.wait_unfrozen);
        deactivate_locked_super(sb);
-
        return 0;
 }
 EXPORT_SYMBOL(thaw_super);
index 4bed78966c6bbc75d4a5879c1c21703be55e739b..ce356f66cc2a41de1e452b9f9dc89d8dc0d5bd30 100644 (file)
@@ -1,7 +1,6 @@
 #ifndef _LINUX_FS_H
 #define _LINUX_FS_H
 
-
 #include <linux/linkage.h>
 #include <linux/wait.h>
 #include <linux/kdev_t.h>
@@ -31,6 +30,7 @@
 #include <linux/percpu-rwsem.h>
 #include <linux/blk_types.h>
 #include <linux/workqueue.h>
+#include <linux/percpu-rwsem.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1275,16 +1275,9 @@ enum {
 #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
 
 struct sb_writers {
-       /* Counters for counting writers at each level */
-       struct percpu_counter   counter[SB_FREEZE_LEVELS];
-       wait_queue_head_t       wait;           /* queue for waiting for
-                                                  writers / faults to finish */
-       int                     frozen;         /* Is sb frozen? */
-       wait_queue_head_t       wait_unfrozen;  /* queue for waiting for
-                                                  sb to be thawed */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-       struct lockdep_map      lock_map[SB_FREEZE_LEVELS];
-#endif
+       int                             frozen;         /* Is sb frozen? */
+       wait_queue_head_t               wait_unfrozen;  /* for get_super_thawed() */
+       struct percpu_rw_semaphore      rw_sem[SB_FREEZE_LEVELS];
 };
 
 struct super_block {
@@ -1393,9 +1386,9 @@ void __sb_end_write(struct super_block *sb, int level);
 int __sb_start_write(struct super_block *sb, int level, bool wait);
 
 #define __sb_writers_acquired(sb, lev) \
-       rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_)
+       percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
 #define __sb_writers_release(sb, lev)  \
-       rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_)
+       percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
 
 /**
  * sb_end_write - drop write access to a superblock