Btrfs: introduce btrfs_{start, end}_nocow_write() for each subvolume
authorMiao Xie <miaox@cn.fujitsu.com>
Thu, 6 Mar 2014 05:38:19 +0000 (13:38 +0800)
committerJosef Bacik <jbacik@fb.com>
Mon, 10 Mar 2014 19:17:22 +0000 (15:17 -0400)
If the snapshot creation happened after the nocow write but before the dirty
data flush, we would fail to flush the dirty data because of no space.

So we must keep track of when those nocow write operations start and when they
end, if there are nocow writers, the snapshot creators must wait. In order
to implement this function, I introduce btrfs_{start, end}_nocow_write(),
which is similar to mnt_{want,drop}_write().

These two functions are only used for nocow file write operations.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/file.c
fs/btrfs/ioctl.c

index b4d2e957b89fb0cee501b37bbdf8566619d41708..374bb2f8ccd9bd1a1307d558ce1f3a7201e4cc89 100644 (file)
@@ -1681,6 +1681,11 @@ struct btrfs_fs_info {
        unsigned int update_uuid_tree_gen:1;
 };
 
+struct btrfs_subvolume_writers {
+       struct percpu_counter   counter;
+       wait_queue_head_t       wait;
+};
+
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
@@ -1829,6 +1834,8 @@ struct btrfs_root {
         * manipulation with the read-only status via SUBVOL_SETFLAGS
         */
        int send_in_progress;
+       struct btrfs_subvolume_writers *subv_writers;
+       atomic_t will_be_snapshoted;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -3353,6 +3360,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
+
+int btrfs_start_nocow_write(struct btrfs_root *root);
+void btrfs_end_nocow_write(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
index f7d84d955764838f846b393c09e911640af91980..7d09ca48c347d8944bb1b1e66a3a2cc095ab45e1 100644 (file)
@@ -1149,6 +1149,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        }
 }
 
+static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
+{
+       struct btrfs_subvolume_writers *writers;
+       int ret;
+
+       writers = kmalloc(sizeof(*writers), GFP_NOFS);
+       if (!writers)
+               return ERR_PTR(-ENOMEM);
+
+       ret = percpu_counter_init(&writers->counter, 0);
+       if (ret < 0) {
+               kfree(writers);
+               return ERR_PTR(ret);
+       }
+
+       init_waitqueue_head(&writers->wait);
+       return writers;
+}
+
+static void
+btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
+{
+       percpu_counter_destroy(&writers->counter);
+       kfree(writers);
+}
+
 static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
                         u32 stripesize, struct btrfs_root *root,
                         struct btrfs_fs_info *fs_info,
@@ -1205,6 +1231,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        atomic_set(&root->log_batch, 0);
        atomic_set(&root->orphan_inodes, 0);
        atomic_set(&root->refs, 1);
+       atomic_set(&root->will_be_snapshoted, 0);
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
@@ -1502,6 +1529,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
 int btrfs_init_fs_root(struct btrfs_root *root)
 {
        int ret;
+       struct btrfs_subvolume_writers *writers;
 
        root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
        root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1511,6 +1539,13 @@ int btrfs_init_fs_root(struct btrfs_root *root)
                goto fail;
        }
 
+       writers = btrfs_alloc_subvolume_writers();
+       if (IS_ERR(writers)) {
+               ret = PTR_ERR(writers);
+               goto fail;
+       }
+       root->subv_writers = writers;
+
        btrfs_init_free_ino_ctl(root);
        mutex_init(&root->fs_commit_mutex);
        spin_lock_init(&root->cache_lock);
@@ -1518,8 +1553,11 @@ int btrfs_init_fs_root(struct btrfs_root *root)
 
        ret = get_anon_bdev(&root->anon_dev);
        if (ret)
-               goto fail;
+               goto free_writers;
        return 0;
+
+free_writers:
+       btrfs_free_subvolume_writers(root->subv_writers);
 fail:
        kfree(root->free_ino_ctl);
        kfree(root->free_ino_pinned);
@@ -3459,6 +3497,8 @@ static void free_fs_root(struct btrfs_root *root)
        root->orphan_block_rsv = NULL;
        if (root->anon_dev)
                free_anon_bdev(root->anon_dev);
+       if (root->subv_writers)
+               btrfs_free_subvolume_writers(root->subv_writers);
        free_extent_buffer(root->node);
        free_extent_buffer(root->commit_root);
        kfree(root->free_ino_ctl);
index 19ea8ad70c671661da0b1c5677e48be4f14086a5..6b821c64b37b715f2ad1d6e4e23ae0e1cddf04ad 100644 (file)
@@ -8938,3 +8938,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
        range->len = trimmed;
        return ret;
 }
+
+/*
+ * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
+ * they are used to prevent the some tasks writing data into the page cache
+ * by nocow before the subvolume is snapshoted, but flush the data into
+ * the disk after the snapshot creation.
+ */
+void btrfs_end_nocow_write(struct btrfs_root *root)
+{
+       percpu_counter_dec(&root->subv_writers->counter);
+       /*
+        * Make sure counter is updated before we wake up
+        * waiters.
+        */
+       smp_mb();
+       if (waitqueue_active(&root->subv_writers->wait))
+               wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_nocow_write(struct btrfs_root *root)
+{
+       if (unlikely(atomic_read(&root->will_be_snapshoted)))
+               return 0;
+
+       percpu_counter_inc(&root->subv_writers->counter);
+       /*
+        * Make sure counter is updated before we check for snapshot creation.
+        */
+       smp_mb();
+       if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+               btrfs_end_nocow_write(root);
+               return 0;
+       }
+       return 1;
+}
index d88f2dc4d045d757e39befe387f8d9a07095ad83..006cf9f0e8520b5ee935387a97aa956b44dd2311 100644 (file)
@@ -1410,6 +1410,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
        u64 num_bytes;
        int ret;
 
+       ret = btrfs_start_nocow_write(root);
+       if (!ret)
+               return -ENOSPC;
+
        lockstart = round_down(pos, root->sectorsize);
        lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
 
@@ -1427,11 +1431,13 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
 
        num_bytes = lockend - lockstart + 1;
        ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
-       if (ret <= 0)
+       if (ret <= 0) {
                ret = 0;
-       else
+               btrfs_end_nocow_write(root);
+       } else {
                *write_bytes = min_t(size_t, *write_bytes ,
                                     num_bytes - pos + lockstart);
+       }
 
        unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
 
@@ -1520,6 +1526,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                        if (!only_release_metadata)
                                btrfs_free_reserved_data_space(inode,
                                                               reserve_bytes);
+                       else
+                               btrfs_end_nocow_write(root);
                        break;
                }
 
@@ -1608,6 +1616,9 @@ again:
                }
 
                release_bytes = 0;
+               if (only_release_metadata)
+                       btrfs_end_nocow_write(root);
+
                if (only_release_metadata && copied > 0) {
                        u64 lockstart = round_down(pos, root->sectorsize);
                        u64 lockend = lockstart +
@@ -1634,10 +1645,12 @@ again:
        kfree(pages);
 
        if (release_bytes) {
-               if (only_release_metadata)
+               if (only_release_metadata) {
+                       btrfs_end_nocow_write(root);
                        btrfs_delalloc_release_metadata(inode, release_bytes);
-               else
+               } else {
                        btrfs_delalloc_release_space(inode, release_bytes);
+               }
        }
 
        return num_written ? num_written : ret;
index 1ae45bd9d27d51d15a511b85ad5a520e54c1dd65..57bc9f33fa3c6d5b3f6d7ffc0d50821ac183d436 100644 (file)
@@ -611,6 +611,23 @@ fail:
        return ret;
 }
 
+static void btrfs_wait_nocow_write(struct btrfs_root *root)
+{
+       s64 writers;
+       DEFINE_WAIT(wait);
+
+       do {
+               prepare_to_wait(&root->subv_writers->wait, &wait,
+                               TASK_UNINTERRUPTIBLE);
+
+               writers = percpu_counter_sum(&root->subv_writers->counter);
+               if (writers)
+                       schedule();
+
+               finish_wait(&root->subv_writers->wait, &wait);
+       } while (writers);
+}
+
 static int create_snapshot(struct btrfs_root *root, struct inode *dir,
                           struct dentry *dentry, char *name, int namelen,
                           u64 *async_transid, bool readonly,
@@ -624,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
        if (!root->ref_cows)
                return -EINVAL;
 
+       atomic_inc(&root->will_be_snapshoted);
+       smp_mb__after_atomic_inc();
+       btrfs_wait_nocow_write(root);
+
        ret = btrfs_start_delalloc_inodes(root, 0);
        if (ret)
-               return ret;
+               goto out;
 
        btrfs_wait_ordered_extents(root, -1);
 
        pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-       if (!pending_snapshot)
-               return -ENOMEM;
+       if (!pending_snapshot) {
+               ret = -ENOMEM;
+               goto out;
+       }
 
        btrfs_init_block_rsv(&pending_snapshot->block_rsv,
                             BTRFS_BLOCK_RSV_TEMP);
@@ -649,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
                                        &pending_snapshot->qgroup_reserved,
                                        false);
        if (ret)
-               goto out;
+               goto free;
 
        pending_snapshot->dentry = dentry;
        pending_snapshot->root = root;
@@ -700,8 +723,10 @@ fail:
        btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
                                         &pending_snapshot->block_rsv,
                                         pending_snapshot->qgroup_reserved);
-out:
+free:
        kfree(pending_snapshot);
+out:
+       atomic_dec(&root->will_be_snapshoted);
        return ret;
 }