UPSTREAM: clk: rockchip: release io resource when failing to init clk
[firefly-linux-kernel-4.4.55.git] / fs / super.c
index 11afd67925bbd07bad0c25060ad32cec297de60b..1014e7cc355fbac57fc1aa3073853a7954b77e38 100644 (file)
@@ -22,7 +22,6 @@
 
 #include <linux/export.h>
 #include <linux/slab.h>
-#include <linux/acct.h>
 #include <linux/blkdev.h>
 #include <linux/mount.h>
 #include <linux/security.h>
@@ -37,8 +36,8 @@
 #include "internal.h"
 
 
-LIST_HEAD(super_blocks);
-DEFINE_SPINLOCK(sb_lock);
+static LIST_HEAD(super_blocks);
+static DEFINE_SPINLOCK(sb_lock);
 
 static char *sb_writers_name[SB_FREEZE_LEVELS] = {
        "sb_writers",
@@ -53,11 +52,15 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
  * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
  * take a passive reference to the superblock to avoid this from occurring.
  */
-static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long super_cache_scan(struct shrinker *shrink,
+                                     struct shrink_control *sc)
 {
        struct super_block *sb;
-       int     fs_objects = 0;
-       int     total_objects;
+       long    fs_objects = 0;
+       long    total_objects;
+       long    freed = 0;
+       long    dentries;
+       long    inodes;
 
        sb = container_of(shrink, struct super_block, s_shrink);
 
@@ -65,79 +68,106 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
         * Deadlock avoidance.  We may hold various FS locks, and we don't want
         * to recurse into the FS that called us in clear_inode() and friends..
         */
-       if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
-               return -1;
+       if (!(sc->gfp_mask & __GFP_FS))
+               return SHRINK_STOP;
 
-       if (!grab_super_passive(sb))
-               return -1;
+       if (!trylock_super(sb))
+               return SHRINK_STOP;
 
-       if (sb->s_op && sb->s_op->nr_cached_objects)
-               fs_objects = sb->s_op->nr_cached_objects(sb);
+       if (sb->s_op->nr_cached_objects)
+               fs_objects = sb->s_op->nr_cached_objects(sb, sc);
 
-       total_objects = sb->s_nr_dentry_unused +
-                       sb->s_nr_inodes_unused + fs_objects + 1;
+       inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+       dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
+       total_objects = dentries + inodes + fs_objects + 1;
        if (!total_objects)
                total_objects = 1;
 
-       if (sc->nr_to_scan) {
-               int     dentries;
-               int     inodes;
-
-               /* proportion the scan between the caches */
-               dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) /
-                                                       total_objects;
-               inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) /
-                                                       total_objects;
-               if (fs_objects)
-                       fs_objects = (sc->nr_to_scan * fs_objects) /
-                                                       total_objects;
-               /*
-                * prune the dcache first as the icache is pinned by it, then
-                * prune the icache, followed by the filesystem specific caches
-                */
-               prune_dcache_sb(sb, dentries);
-               prune_icache_sb(sb, inodes);
+       /* proportion the scan between the caches */
+       dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
+       inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+       fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
 
-               if (fs_objects && sb->s_op->free_cached_objects) {
-                       sb->s_op->free_cached_objects(sb, fs_objects);
-                       fs_objects = sb->s_op->nr_cached_objects(sb);
-               }
-               total_objects = sb->s_nr_dentry_unused +
-                               sb->s_nr_inodes_unused + fs_objects;
+       /*
+        * prune the dcache first as the icache is pinned by it, then
+        * prune the icache, followed by the filesystem specific caches
+        *
+        * Ensure that we always scan at least one object - memcg kmem
+        * accounting uses this to fully empty the caches.
+        */
+       sc->nr_to_scan = dentries + 1;
+       freed = prune_dcache_sb(sb, sc);
+       sc->nr_to_scan = inodes + 1;
+       freed += prune_icache_sb(sb, sc);
+
+       if (fs_objects) {
+               sc->nr_to_scan = fs_objects + 1;
+               freed += sb->s_op->free_cached_objects(sb, sc);
        }
 
-       total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure;
-       drop_super(sb);
-       return total_objects;
+       up_read(&sb->s_umount);
+       return freed;
 }
 
-static int init_sb_writers(struct super_block *s, struct file_system_type *type)
+static unsigned long super_cache_count(struct shrinker *shrink,
+                                      struct shrink_control *sc)
 {
-       int err;
-       int i;
+       struct super_block *sb;
+       long    total_objects = 0;
 
-       for (i = 0; i < SB_FREEZE_LEVELS; i++) {
-               err = percpu_counter_init(&s->s_writers.counter[i], 0);
-               if (err < 0)
-                       goto err_out;
-               lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
-                                &type->s_writers_key[i], 0);
-       }
-       init_waitqueue_head(&s->s_writers.wait);
-       init_waitqueue_head(&s->s_writers.wait_unfrozen);
-       return 0;
-err_out:
-       while (--i >= 0)
-               percpu_counter_destroy(&s->s_writers.counter[i]);
-       return err;
+       sb = container_of(shrink, struct super_block, s_shrink);
+
+       /*
+        * Don't call trylock_super as it is a potential
+        * scalability bottleneck. The counts could get updated
+        * between super_cache_count and super_cache_scan anyway.
+        * Call to super_cache_count with shrinker_rwsem held
+        * ensures the safety of call to list_lru_shrink_count() and
+        * s_op->nr_cached_objects().
+        */
+       if (sb->s_op && sb->s_op->nr_cached_objects)
+               total_objects = sb->s_op->nr_cached_objects(sb, sc);
+
+       total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+       total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
+
+       total_objects = vfs_pressure_ratio(total_objects);
+       return total_objects;
 }
 
-static void destroy_sb_writers(struct super_block *s)
+static void destroy_super_work(struct work_struct *work)
 {
+       struct super_block *s = container_of(work, struct super_block,
+                                                       destroy_work);
        int i;
 
        for (i = 0; i < SB_FREEZE_LEVELS; i++)
-               percpu_counter_destroy(&s->s_writers.counter[i]);
+               percpu_free_rwsem(&s->s_writers.rw_sem[i]);
+       kfree(s);
+}
+
+static void destroy_super_rcu(struct rcu_head *head)
+{
+       struct super_block *s = container_of(head, struct super_block, rcu);
+       INIT_WORK(&s->destroy_work, destroy_super_work);
+       schedule_work(&s->destroy_work);
+}
+
+/**
+ *     destroy_super   -       frees a superblock
+ *     @s: superblock to free
+ *
+ *     Frees a superblock.
+ */
+static void destroy_super(struct super_block *s)
+{
+       list_lru_destroy(&s->s_dentry_lru);
+       list_lru_destroy(&s->s_inode_lru);
+       security_sb_free(s);
+       WARN_ON(!list_empty(&s->s_mounts));
+       kfree(s->s_subtype);
+       kfree(s->s_options);
+       call_rcu(&s->rcu, destroy_super_rcu);
 }
 
 /**
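
The destroy_super_rcu()/destroy_super_work() pair above is a two-stage teardown: an RCU callback runs from softirq context and must not sleep, but percpu_free_rwsem() can sleep, so the RCU callback only queues a work item and the sleeping part runs later in process context. A generic sketch of the same idiom (the demo_* names are made up):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_obj {
        struct rcu_head rcu;
        struct work_struct work;
        /* ... payload ... */
};

static void demo_free_work(struct work_struct *work)
{
        struct demo_obj *obj = container_of(work, struct demo_obj, work);

        /* Process context: sleeping teardown (like percpu_free_rwsem()) is fine here. */
        kfree(obj);
}

static void demo_free_rcu(struct rcu_head *head)
{
        struct demo_obj *obj = container_of(head, struct demo_obj, rcu);

        /* Softirq context: must not sleep, so hand the rest off to a workqueue. */
        INIT_WORK(&obj->work, demo_free_work);
        schedule_work(&obj->work);
}

/* Callers release an object with: call_rcu(&obj->rcu, demo_free_rcu); */
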
@@ -152,86 +182,75 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 {
        struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
        static const struct super_operations default_op;
+       int i;
 
-       if (s) {
-               if (security_sb_alloc(s)) {
-                       /*
-                        * We cannot call security_sb_free() without
-                        * security_sb_alloc() succeeding. So bail out manually
-                        */
-                       kfree(s);
-                       s = NULL;
-                       goto out;
-               }
-               if (init_sb_writers(s, type))
-                       goto err_out;
-               s->s_flags = flags;
-               s->s_bdi = &default_backing_dev_info;
-               INIT_HLIST_NODE(&s->s_instances);
-               INIT_HLIST_BL_HEAD(&s->s_anon);
-               INIT_LIST_HEAD(&s->s_inodes);
-               INIT_LIST_HEAD(&s->s_dentry_lru);
-               INIT_LIST_HEAD(&s->s_inode_lru);
-               spin_lock_init(&s->s_inode_lru_lock);
-               INIT_LIST_HEAD(&s->s_mounts);
-               init_rwsem(&s->s_umount);
-               lockdep_set_class(&s->s_umount, &type->s_umount_key);
-               /*
-                * sget() can have s_umount recursion.
-                *
-                * When it cannot find a suitable sb, it allocates a new
-                * one (this one), and tries again to find a suitable old
-                * one.
-                *
-                * In case that succeeds, it will acquire the s_umount
-                * lock of the old one. Since these are clearly distrinct
-                * locks, and this object isn't exposed yet, there's no
-                * risk of deadlocks.
-                *
-                * Annotate this by putting this lock in a different
-                * subclass.
-                */
-               down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
-               s->s_count = 1;
-               atomic_set(&s->s_active, 1);
-               mutex_init(&s->s_vfs_rename_mutex);
-               lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
-               mutex_init(&s->s_dquot.dqio_mutex);
-               mutex_init(&s->s_dquot.dqonoff_mutex);
-               init_rwsem(&s->s_dquot.dqptr_sem);
-               s->s_maxbytes = MAX_NON_LFS;
-               s->s_op = &default_op;
-               s->s_time_gran = 1000000000;
-               s->cleancache_poolid = -1;
-
-               s->s_shrink.seeks = DEFAULT_SEEKS;
-               s->s_shrink.shrink = prune_super;
-               s->s_shrink.batch = 1024;
+       if (!s)
+               return NULL;
+
+       INIT_LIST_HEAD(&s->s_mounts);
+
+       if (security_sb_alloc(s))
+               goto fail;
+
+       for (i = 0; i < SB_FREEZE_LEVELS; i++) {
+               if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
+                                       sb_writers_name[i],
+                                       &type->s_writers_key[i]))
+                       goto fail;
        }
-out:
+       init_waitqueue_head(&s->s_writers.wait_unfrozen);
+       s->s_bdi = &noop_backing_dev_info;
+       s->s_flags = flags;
+       INIT_HLIST_NODE(&s->s_instances);
+       INIT_HLIST_BL_HEAD(&s->s_anon);
+       mutex_init(&s->s_sync_lock);
+       INIT_LIST_HEAD(&s->s_inodes);
+       spin_lock_init(&s->s_inode_list_lock);
+
+       if (list_lru_init_memcg(&s->s_dentry_lru))
+               goto fail;
+       if (list_lru_init_memcg(&s->s_inode_lru))
+               goto fail;
+
+       init_rwsem(&s->s_umount);
+       lockdep_set_class(&s->s_umount, &type->s_umount_key);
+       /*
+        * sget() can have s_umount recursion.
+        *
+        * When it cannot find a suitable sb, it allocates a new
+        * one (this one), and tries again to find a suitable old
+        * one.
+        *
+        * In case that succeeds, it will acquire the s_umount
+        * lock of the old one. Since these are clearly distinct
+        * locks, and this object isn't exposed yet, there's no
+        * risk of deadlocks.
+        *
+        * Annotate this by putting this lock in a different
+        * subclass.
+        */
+       down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
+       s->s_count = 1;
+       atomic_set(&s->s_active, 1);
+       mutex_init(&s->s_vfs_rename_mutex);
+       lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
+       mutex_init(&s->s_dquot.dqio_mutex);
+       mutex_init(&s->s_dquot.dqonoff_mutex);
+       s->s_maxbytes = MAX_NON_LFS;
+       s->s_op = &default_op;
+       s->s_time_gran = 1000000000;
+       s->cleancache_poolid = CLEANCACHE_NO_POOL;
+
+       s->s_shrink.seeks = DEFAULT_SEEKS;
+       s->s_shrink.scan_objects = super_cache_scan;
+       s->s_shrink.count_objects = super_cache_count;
+       s->s_shrink.batch = 1024;
+       s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
        return s;
-err_out:
-       security_sb_free(s);
-       destroy_sb_writers(s);
-       kfree(s);
-       s = NULL;
-       goto out;
-}
 
-/**
- *     destroy_super   -       frees a superblock
- *     @s: superblock to free
- *
- *     Frees a superblock.
- */
-static inline void destroy_super(struct super_block *s)
-{
-       destroy_sb_writers(s);
-       security_sb_free(s);
-       WARN_ON(!list_empty(&s->s_mounts));
-       kfree(s->s_subtype);
-       kfree(s->s_options);
-       kfree(s);
+fail:
+       destroy_super(s);
+       return NULL;
 }
 
 /* Superblock refcounting  */
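
The rewritten alloc_super() above funnels every failure through a single fail: label that just calls destroy_super(). That only works because the superblock is kzalloc()ed and everything destroy_super() does is harmless on zeroed, partially initialised state (kfree(NULL), list_lru_destroy() on an unused lru, security_sb_free() after a failed security_sb_alloc()), which is also why INIT_LIST_HEAD(&s->s_mounts) is now done before the first failure point: destroy_super()'s WARN_ON(!list_empty(&s->s_mounts)) needs an initialised list head. A compact sketch of the idiom, with hypothetical names:

#include <linux/bug.h>
#include <linux/list.h>
#include <linux/slab.h>

struct demo {
        struct list_head list;
        void *buf;
};

static void demo_destroy(struct demo *d)
{
        WARN_ON(!list_empty(&d->list)); /* relies on the list being set up early */
        kfree(d->buf);                  /* kfree(NULL) is a no-op */
        kfree(d);
}

static struct demo *demo_alloc(void)
{
        struct demo *d = kzalloc(sizeof(*d), GFP_KERNEL);

        if (!d)
                return NULL;

        INIT_LIST_HEAD(&d->list);       /* before the first possible goto fail */

        d->buf = kmalloc(64, GFP_KERNEL);
        if (!d->buf)
                goto fail;

        return d;
fail:
        demo_destroy(d);                /* one unwind path for every failure */
        return NULL;
}
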
@@ -278,10 +297,17 @@ void deactivate_locked_super(struct super_block *s)
        struct file_system_type *fs = s->s_type;
        if (atomic_dec_and_test(&s->s_active)) {
                cleancache_invalidate_fs(s);
+               unregister_shrinker(&s->s_shrink);
                fs->kill_sb(s);
 
-               /* caches are now gone, we can safely kill the shrinker now */
-               unregister_shrinker(&s->s_shrink);
+               /*
+                * Since list_lru_destroy() may sleep, we cannot call it from
+                * put_super(), where we hold the sb_lock. Therefore we destroy
+                * the lru lists right now.
+                */
+               list_lru_destroy(&s->s_dentry_lru);
+               list_lru_destroy(&s->s_inode_lru);
+
                put_filesystem(fs);
                put_super(s);
        } else {
@@ -337,35 +363,31 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
 }
 
 /*
- *     grab_super_passive - acquire a passive reference
+ *     trylock_super - try to grab ->s_umount shared
  *     @sb: reference we are trying to grab
  *
- *     Tries to acquire a passive reference. This is used in places where we
+ *     Try to prevent fs shutdown.  This is used in places where we
  *     cannot take an active reference but we need to ensure that the
- *     superblock does not go away while we are working on it. It returns
- *     false if a reference was not gained, and returns true with the s_umount
- *     lock held in read mode if a reference is gained. On successful return,
- *     the caller must drop the s_umount lock and the passive reference when
- *     done.
+ *     filesystem is not shut down while we are working on it. It returns
+ *     false if we cannot acquire s_umount or if we lose the race and
+ *     filesystem already got into shutdown, and returns true with the s_umount
+ *     lock held in read mode in case of success. On successful return,
+ *     the caller must drop the s_umount lock when done.
+ *
+ *     Note that unlike get_super() et al. this one does *not* bump ->s_count.
+ *     The reason why it's safe is that we are OK with doing trylock instead
+ *     of down_read().  There's a couple of places that are OK with that, but
+ *     it's very much not a general-purpose interface.
  */
-bool grab_super_passive(struct super_block *sb)
+bool trylock_super(struct super_block *sb)
 {
-       spin_lock(&sb_lock);
-       if (hlist_unhashed(&sb->s_instances)) {
-               spin_unlock(&sb_lock);
-               return false;
-       }
-
-       sb->s_count++;
-       spin_unlock(&sb_lock);
-
        if (down_read_trylock(&sb->s_umount)) {
-               if (sb->s_root && (sb->s_flags & MS_BORN))
+               if (!hlist_unhashed(&sb->s_instances) &&
+                   sb->s_root && (sb->s_flags & MS_BORN))
                        return true;
                up_read(&sb->s_umount);
        }
 
-       put_super(sb);
        return false;
 }
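
The caller contract is deliberately thinner than grab_super_passive()'s: on success you hold ->s_umount shared and the filesystem is born and not shutting down; there is no passive reference to put, so the only counterpart is up_read(). A hypothetical shrinker-side caller would look like this (trylock_super() itself is fs-internal, declared in fs/internal.h):

#include <linux/fs.h>
/* #include "internal.h" for the trylock_super() declaration */

static void demo_scan_sb(struct super_block *sb)
{
        if (!trylock_super(sb))
                return;         /* racing with umount/shutdown, just skip this sb */

        /* ... walk per-sb caches; the sb cannot be shut down while s_umount is held ... */

        up_read(&sb->s_umount); /* no put_super()/drop_super() counterpart any more */
}
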
 
@@ -392,10 +414,15 @@ void generic_shutdown_super(struct super_block *sb)
                sync_filesystem(sb);
                sb->s_flags &= ~MS_ACTIVE;
 
-               fsnotify_unmount_inodes(&sb->s_inodes);
+               fsnotify_unmount_inodes(sb);
 
                evict_inodes(sb);
 
+               if (sb->s_dio_done_wq) {
+                       destroy_workqueue(sb->s_dio_done_wq);
+                       sb->s_dio_done_wq = NULL;
+               }
+
                if (sop->put_super)
                        sop->put_super(sb);
 
@@ -696,13 +723,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
                return -EACCES;
 #endif
 
-       if (flags & MS_RDONLY)
-               acct_auto_close(sb);
-       shrink_dcache_sb(sb);
-       sync_filesystem(sb);
-
        remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
 
+       if (remount_ro) {
+               if (!hlist_empty(&sb->s_pins)) {
+                       up_write(&sb->s_umount);
+                       group_pin_kill(&sb->s_pins);
+                       down_write(&sb->s_umount);
+                       if (!sb->s_root)
+                               return 0;
+                       if (sb->s_writers.frozen != SB_UNFROZEN)
+                               return -EBUSY;
+                       remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+               }
+       }
+       shrink_dcache_sb(sb);
+
        /* If we are remounting RDONLY and current sb is read/write,
           make sure there are no rw files opened */
        if (remount_ro) {
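
group_pin_kill() can block (it replaces the old acct_auto_close() call on remount read-only), so do_remount_sb() has to give up s_umount around it, and anything decided before the unlock may be stale afterwards. The hunk above therefore re-validates in this order; the sketch below only restates it:

/*
 * up_write(&sb->s_umount);
 * group_pin_kill(&sb->s_pins);                         -- can block
 * down_write(&sb->s_umount);
 *
 * then re-check, in order:
 *   !sb->s_root                          -> fs was shut down meanwhile, return 0
 *   sb->s_writers.frozen != SB_UNFROZEN  -> got frozen meanwhile, return -EBUSY
 *   remount_ro                           -> recompute, sb->s_flags may have changed
 */
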
@@ -797,7 +833,10 @@ void emergency_remount(void)
 
 static DEFINE_IDA(unnamed_dev_ida);
 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
-static int unnamed_dev_start = 0; /* don't bother trying below it */
+/* Many userspace utilities consider an FSID of 0 invalid.
+ * Always return at least 1 from get_anon_bdev.
+ */
+static int unnamed_dev_start = 1;
 
 int get_anon_bdev(dev_t *p)
 {
@@ -818,7 +857,7 @@ int get_anon_bdev(dev_t *p)
        else if (error)
                return -EAGAIN;
 
-       if (dev == (1 << MINORBITS)) {
+       if (dev >= (1 << MINORBITS)) {
                spin_lock(&unnamed_dev_lock);
                ida_remove(&unnamed_dev_ida, dev);
                if (unnamed_dev_start > dev)
@@ -844,10 +883,7 @@ EXPORT_SYMBOL(free_anon_bdev);
 
 int set_anon_super(struct super_block *s, void *data)
 {
-       int error = get_anon_bdev(&s->s_dev);
-       if (!error)
-               s->s_bdi = &noop_backing_dev_info;
-       return error;
+       return get_anon_bdev(&s->s_dev);
 }
 
 EXPORT_SYMBOL(set_anon_super);
@@ -1092,7 +1128,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
        sb = root->d_sb;
        BUG_ON(!sb);
        WARN_ON(!sb->s_bdi);
-       WARN_ON(sb->s_bdi == &default_backing_dev_info);
        sb->s_flags |= MS_BORN;
 
        error = security_sb_kern_mount(sb, flags, secdata);
@@ -1126,72 +1161,46 @@ out:
  */
 void __sb_end_write(struct super_block *sb, int level)
 {
-       percpu_counter_dec(&sb->s_writers.counter[level-1]);
-       /*
-        * Make sure s_writers are updated before we wake up waiters in
-        * freeze_super().
-        */
-       smp_mb();
-       if (waitqueue_active(&sb->s_writers.wait))
-               wake_up(&sb->s_writers.wait);
-       rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+       percpu_up_read(sb->s_writers.rw_sem + level-1);
 }
 EXPORT_SYMBOL(__sb_end_write);
 
-#ifdef CONFIG_LOCKDEP
-/*
- * We want lockdep to tell us about possible deadlocks with freezing but
- * it's it bit tricky to properly instrument it. Getting a freeze protection
- * works as getting a read lock but there are subtle problems. XFS for example
- * gets freeze protection on internal level twice in some cases, which is OK
- * only because we already hold a freeze protection also on higher level. Due
- * to these cases we have to tell lockdep we are doing trylock when we
- * already hold a freeze protection for a higher freeze level.
- */
-static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
-                               unsigned long ip)
-{
-       int i;
-
-       if (!trylock) {
-               for (i = 0; i < level - 1; i++)
-                       if (lock_is_held(&sb->s_writers.lock_map[i])) {
-                               trylock = true;
-                               break;
-                       }
-       }
-       rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
-}
-#endif
-
 /*
  * This is an internal function, please use sb_start_{write,pagefault,intwrite}
  * instead.
  */
 int __sb_start_write(struct super_block *sb, int level, bool wait)
 {
-retry:
-       if (unlikely(sb->s_writers.frozen >= level)) {
-               if (!wait)
-                       return 0;
-               wait_event(sb->s_writers.wait_unfrozen,
-                          sb->s_writers.frozen < level);
-       }
+       bool force_trylock = false;
+       int ret = 1;
 
 #ifdef CONFIG_LOCKDEP
-       acquire_freeze_lock(sb, level, !wait, _RET_IP_);
-#endif
-       percpu_counter_inc(&sb->s_writers.counter[level-1]);
        /*
-        * Make sure counter is updated before we check for frozen.
-        * freeze_super() first sets frozen and then checks the counter.
+        * We want lockdep to tell us about possible deadlocks with freezing
+        * but it's a bit tricky to properly instrument it. Getting a freeze
+        * protection works as getting a read lock but there are subtle
+        * problems. XFS for example gets freeze protection on internal level
+        * twice in some cases, which is OK only because we already hold a
+        * freeze protection also on higher level. Due to these cases we have
+        * to use wait == false (trylock mode) which must not fail.
         */
-       smp_mb();
-       if (unlikely(sb->s_writers.frozen >= level)) {
-               __sb_end_write(sb, level);
-               goto retry;
+       if (wait) {
+               int i;
+
+               for (i = 0; i < level - 1; i++)
+                       if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
+                               force_trylock = true;
+                               break;
+                       }
        }
-       return 1;
+#endif
+       if (wait && !force_trylock)
+               percpu_down_read(sb->s_writers.rw_sem + level-1);
+       else
+               ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
+
+       WARN_ON(force_trylock & !ret);
+       return ret;
 }
 EXPORT_SYMBOL(__sb_start_write);
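
With the per-cpu counters and the waitqueue gone, freeze protection is now literally the read side of a per-level percpu rwsem: __sb_start_write() boils down to percpu_down_read() and __sb_end_write() to percpu_up_read(). A hypothetical filesystem write path then looks like the sketch below; do_the_actual_write() is a made-up stand-in, while sb_start_write()/sb_end_write() are the real helpers from <linux/fs.h>:

#include <linux/fs.h>

/* Stand-in for the filesystem's real write-out logic. */
static ssize_t do_the_actual_write(struct super_block *sb, const char *buf,
                                   size_t len)
{
        return len;
}

static ssize_t demo_write(struct super_block *sb, const char *buf, size_t len)
{
        ssize_t ret;

        sb_start_write(sb);     /* blocks while freeze_super() holds the write side */
        ret = do_the_actual_write(sb, buf, len);
        sb_end_write(sb);       /* a cheap per-cpu operation when nobody is freezing */

        return ret;
}
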
 
@@ -1201,37 +1210,33 @@ EXPORT_SYMBOL(__sb_start_write);
  * @level: type of writers we wait for (normal vs page fault)
  *
  * This function waits until there are no writers of given type to given file
- * system. Caller of this function should make sure there can be no new writers
- * of type @level before calling this function. Otherwise this function can
- * livelock.
+ * system.
  */
 static void sb_wait_write(struct super_block *sb, int level)
 {
-       s64 writers;
-
+       percpu_down_write(sb->s_writers.rw_sem + level-1);
        /*
-        * We just cycle-through lockdep here so that it does not complain
-        * about returning with lock to userspace
+        * We are going to return to userspace and forget about this lock, the
+        * ownership goes to the caller of thaw_super() which does unlock.
+        *
+        * FIXME: we should do this before return from freeze_super() after we
+        * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super()
+        * should re-acquire these locks before s_op->unfreeze_fs(sb). However
+        * this leads to lockdep false-positives, so currently we do the early
+        * release right after acquire.
         */
-       rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
-       rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
-
-       do {
-               DEFINE_WAIT(wait);
+       percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
+}
 
-               /*
-                * We use a barrier in prepare_to_wait() to separate setting
-                * of frozen and checking of the counter
-                */
-               prepare_to_wait(&sb->s_writers.wait, &wait,
-                               TASK_UNINTERRUPTIBLE);
+static void sb_freeze_unlock(struct super_block *sb)
+{
+       int level;
 
-               writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
-               if (writers)
-                       schedule();
+       for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+               percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
 
-               finish_wait(&sb->s_writers.wait, &wait);
-       } while (writers);
+       for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+               percpu_up_write(sb->s_writers.rw_sem + level);
 }
 
 /**
@@ -1290,20 +1295,14 @@ int freeze_super(struct super_block *sb)
                return 0;
        }
 
-       /* From now on, no new normal writers can start */
        sb->s_writers.frozen = SB_FREEZE_WRITE;
-       smp_wmb();
-
        /* Release s_umount to preserve sb_start_write -> s_umount ordering */
        up_write(&sb->s_umount);
-
        sb_wait_write(sb, SB_FREEZE_WRITE);
+       down_write(&sb->s_umount);
 
        /* Now we go and block page faults... */
-       down_write(&sb->s_umount);
        sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
-       smp_wmb();
-
        sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
 
        /* All writers are done so after syncing there won't be dirty data */
@@ -1311,7 +1310,6 @@ int freeze_super(struct super_block *sb)
 
        /* Now wait for internal filesystem counter */
        sb->s_writers.frozen = SB_FREEZE_FS;
-       smp_wmb();
        sb_wait_write(sb, SB_FREEZE_FS);
 
        if (sb->s_op->freeze_fs) {
@@ -1320,7 +1318,7 @@ int freeze_super(struct super_block *sb)
                        printk(KERN_ERR
                                "VFS:Filesystem freeze failed\n");
                        sb->s_writers.frozen = SB_UNFROZEN;
-                       smp_wmb();
+                       sb_freeze_unlock(sb);
                        wake_up(&sb->s_writers.wait_unfrozen);
                        deactivate_locked_super(sb);
                        return ret;
@@ -1352,8 +1350,10 @@ int thaw_super(struct super_block *sb)
                return -EINVAL;
        }
 
-       if (sb->s_flags & MS_RDONLY)
+       if (sb->s_flags & MS_RDONLY) {
+               sb->s_writers.frozen = SB_UNFROZEN;
                goto out;
+       }
 
        if (sb->s_op->unfreeze_fs) {
                error = sb->s_op->unfreeze_fs(sb);
@@ -1365,12 +1365,11 @@ int thaw_super(struct super_block *sb)
                }
        }
 
-out:
        sb->s_writers.frozen = SB_UNFROZEN;
-       smp_wmb();
+       sb_freeze_unlock(sb);
+out:
        wake_up(&sb->s_writers.wait_unfrozen);
        deactivate_locked_super(sb);
-
        return 0;
 }
 EXPORT_SYMBOL(thaw_super);
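
Putting the freeze pieces together (a restatement of the code above and its unchanged surroundings, not new behaviour): each freeze level is now a percpu rwsem that stays write-locked for as long as the filesystem is frozen, with lockdep ownership handed over across the return to userspace and taken back on thaw:

/*
 * freeze_super():
 *      s_writers.frozen = SB_FREEZE_WRITE;     sb_wait_write() -> percpu_down_write(rw_sem[0])
 *      s_writers.frozen = SB_FREEZE_PAGEFAULT; sb_wait_write() -> percpu_down_write(rw_sem[1])
 *      sync_filesystem(sb);
 *      s_writers.frozen = SB_FREEZE_FS;        sb_wait_write() -> percpu_down_write(rw_sem[2])
 *      s_op->freeze_fs(sb);                    on failure: sb_freeze_unlock() and bail out
 *      s_writers.frozen = SB_FREEZE_COMPLETE;
 *
 * thaw_super():
 *      s_op->unfreeze_fs(sb);
 *      s_writers.frozen = SB_UNFROZEN;
 *      sb_freeze_unlock() -> percpu_up_write(rw_sem[2]..rw_sem[0]), reverse order
 *
 * The percpu_rwsem_release()/percpu_rwsem_acquire() calls in sb_wait_write()
 * and sb_freeze_unlock() only transfer lockdep ownership across the userspace
 * round trip; the semaphores themselves stay write-locked while frozen.
 */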