Merge branch 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason...
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 7 Nov 2015 01:17:13 +0000 (17:17 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 7 Nov 2015 01:17:13 +0000 (17:17 -0800)
Pull btrfs updates from Chris Mason:
 "We have a lot of subvolume quota improvements in here, along with big
  piles of cleanups from Dave Sterba and Anand Jain and others.

  Josef pitched in a batch of allocator fixes based on production use
  here at FB.  We found that mount -o ssd_spread greatly improved our
  performance on hardware raid5/6, but it exposed some CPU bottlenecks
  in the allocator.  These patches make a huge difference"
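
For reference, ssd_spread is an ordinary btrfs mount option; below is a minimal userspace sketch of setting it via mount(2). The device and mount point are placeholders, not taken from this merge.

/* illustrative only: /dev/sdX and /mnt are hypothetical */
#include <sys/mount.h>

int enable_ssd_spread(void)
{
	return mount("/dev/sdX", "/mnt", "btrfs", 0, "ssd_spread");
}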

* 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (100 commits)
  Btrfs: fix hole punching when using the no-holes feature
  Btrfs: find_free_extent: Do not erroneously skip LOOP_CACHING_WAIT state
  btrfs: Fix a data space underflow warning
  btrfs: qgroup: Fix a rebase bug which will cause qgroup double free
  btrfs: qgroup: Fix a race in delayed_ref which leads to abort trans
  btrfs: clear PF_NOFREEZE in cleaner_kthread()
  btrfs: qgroup: Don't copy extent buffer to do qgroup rescan
  btrfs: add balance filters limits, stripes and usage to supported mask
  btrfs: extend balance filter usage to take minimum and maximum
  btrfs: add balance filter for stripes
  btrfs: extend balance filter limit to take minimum and maximum
  btrfs: fix use after free iterating extrefs
  btrfs: check unsupported filters in balance arguments
  Btrfs: fix regression running delayed references when using qgroups
  Btrfs: fix regression when running delayed references
  Btrfs: don't do extra bitmap search in one bit case
  Btrfs: keep track of largest extent in bitmaps
  Btrfs: don't keep trying to build clusters if we are fragmented
  Btrfs: cut down on loops through the allocator
  Btrfs: don't continue setting up space cache when enospc
  ...

44 files changed:
fs/btrfs/backref.c
fs/btrfs/check-integrity.c
fs/btrfs/compression.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/free-space-cache.h
fs/btrfs/inode-item.c
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/locking.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/props.c
fs/btrfs/qgroup.c
fs/btrfs/qgroup.h
fs/btrfs/raid56.c
fs/btrfs/reada.c
fs/btrfs/relocation.c
fs/btrfs/root-tree.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/super.c
fs/btrfs/sysfs.c
fs/btrfs/sysfs.h
fs/btrfs/tests/free-space-tests.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
include/trace/events/btrfs.h
include/uapi/linux/btrfs.h

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 9a2ec79e8cfb6c4ad26a5578e39f62f8fa80226b..6dcdb2ec921185ae350aa0f2ff49cd65eaa0d500 100644
@@ -362,6 +362,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                goto out;
        }
 
+       if (btrfs_test_is_dummy_root(root)) {
+               srcu_read_unlock(&fs_info->subvol_srcu, index);
+               ret = -ENOENT;
+               goto out;
+       }
+
        if (path->search_commit_root)
                root_level = btrfs_header_level(root->commit_root);
        else if (time_seq == (u64)-1)
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 541fbfaed2763a836da04ce8076d43af359558bb..0340c57bf37778f6983645803c3f381ae1a27a2a 100644
@@ -667,7 +667,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
        selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
        if (NULL == selected_super) {
                printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
-               return -1;
+               return -ENOMEM;
        }
 
        list_for_each_entry(device, dev_head, dev_list) {
@@ -845,8 +845,8 @@ static int btrfsic_process_superblock_dev_mirror(
                superblock_tmp->never_written = 0;
                superblock_tmp->mirror_num = 1 + superblock_mirror_num;
                if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
-                       printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
-                                    " @%llu (%s/%llu/%d)\n",
+                       btrfs_info_in_rcu(device->dev_root->fs_info,
+                               "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
                                     superblock_bdev,
                                     rcu_str_deref(device->name), dev_bytenr,
                                     dev_state->name, dev_bytenr,
@@ -1660,7 +1660,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
                                          sizeof(*block_ctx->pagev)) *
                                         num_pages, GFP_NOFS);
        if (!block_ctx->mem_to_free)
-               return -1;
+               return -ENOMEM;
        block_ctx->datav = block_ctx->mem_to_free;
        block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
        for (i = 0; i < num_pages; i++) {
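
The check-integrity changes above replace bare -1 returns with -ENOMEM, so callers see a standard negative-errno value. A tiny userspace sketch of that convention (the function and names are illustrative, not from the patch):

#include <errno.h>
#include <stdlib.h>

/* return 0 on success or a negative errno, kernel-style */
static int alloc_ctx(void **out, size_t len)
{
	void *p = calloc(1, len);

	if (!p)
		return -ENOMEM;
	*out = p;
	return 0;
}
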
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 57ee8ca29b0601060fae924f43b7897f2c4c7c7c..97b049ad0594cc3a3b79f1b8222def5e132d73ae 100644
@@ -745,11 +745,13 @@ out:
        return ret;
 }
 
-static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
-static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
-static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
-static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
-static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+static struct {
+       struct list_head idle_ws;
+       spinlock_t ws_lock;
+       int num_ws;
+       atomic_t alloc_ws;
+       wait_queue_head_t ws_wait;
+} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
 
 static const struct btrfs_compress_op * const btrfs_compress_op[] = {
        &btrfs_zlib_compress,
@@ -761,10 +763,10 @@ void __init btrfs_init_compress(void)
        int i;
 
        for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
-               INIT_LIST_HEAD(&comp_idle_workspace[i]);
-               spin_lock_init(&comp_workspace_lock[i]);
-               atomic_set(&comp_alloc_workspace[i], 0);
-               init_waitqueue_head(&comp_workspace_wait[i]);
+               INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
+               spin_lock_init(&btrfs_comp_ws[i].ws_lock);
+               atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+               init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
        }
 }
 
@@ -778,38 +780,38 @@ static struct list_head *find_workspace(int type)
        int cpus = num_online_cpus();
        int idx = type - 1;
 
-       struct list_head *idle_workspace        = &comp_idle_workspace[idx];
-       spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
-       atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
-       wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
-       int *num_workspace                      = &comp_num_workspace[idx];
+       struct list_head *idle_ws       = &btrfs_comp_ws[idx].idle_ws;
+       spinlock_t *ws_lock             = &btrfs_comp_ws[idx].ws_lock;
+       atomic_t *alloc_ws              = &btrfs_comp_ws[idx].alloc_ws;
+       wait_queue_head_t *ws_wait      = &btrfs_comp_ws[idx].ws_wait;
+       int *num_ws                     = &btrfs_comp_ws[idx].num_ws;
 again:
-       spin_lock(workspace_lock);
-       if (!list_empty(idle_workspace)) {
-               workspace = idle_workspace->next;
+       spin_lock(ws_lock);
+       if (!list_empty(idle_ws)) {
+               workspace = idle_ws->next;
                list_del(workspace);
-               (*num_workspace)--;
-               spin_unlock(workspace_lock);
+               (*num_ws)--;
+               spin_unlock(ws_lock);
                return workspace;
 
        }
-       if (atomic_read(alloc_workspace) > cpus) {
+       if (atomic_read(alloc_ws) > cpus) {
                DEFINE_WAIT(wait);
 
-               spin_unlock(workspace_lock);
-               prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
-               if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+               spin_unlock(ws_lock);
+               prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+               if (atomic_read(alloc_ws) > cpus && !*num_ws)
                        schedule();
-               finish_wait(workspace_wait, &wait);
+               finish_wait(ws_wait, &wait);
                goto again;
        }
-       atomic_inc(alloc_workspace);
-       spin_unlock(workspace_lock);
+       atomic_inc(alloc_ws);
+       spin_unlock(ws_lock);
 
        workspace = btrfs_compress_op[idx]->alloc_workspace();
        if (IS_ERR(workspace)) {
-               atomic_dec(alloc_workspace);
-               wake_up(workspace_wait);
+               atomic_dec(alloc_ws);
+               wake_up(ws_wait);
        }
        return workspace;
 }
@@ -821,27 +823,30 @@ again:
 static void free_workspace(int type, struct list_head *workspace)
 {
        int idx = type - 1;
-       struct list_head *idle_workspace        = &comp_idle_workspace[idx];
-       spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
-       atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
-       wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
-       int *num_workspace                      = &comp_num_workspace[idx];
-
-       spin_lock(workspace_lock);
-       if (*num_workspace < num_online_cpus()) {
-               list_add(workspace, idle_workspace);
-               (*num_workspace)++;
-               spin_unlock(workspace_lock);
+       struct list_head *idle_ws       = &btrfs_comp_ws[idx].idle_ws;
+       spinlock_t *ws_lock             = &btrfs_comp_ws[idx].ws_lock;
+       atomic_t *alloc_ws              = &btrfs_comp_ws[idx].alloc_ws;
+       wait_queue_head_t *ws_wait      = &btrfs_comp_ws[idx].ws_wait;
+       int *num_ws                     = &btrfs_comp_ws[idx].num_ws;
+
+       spin_lock(ws_lock);
+       if (*num_ws < num_online_cpus()) {
+               list_add(workspace, idle_ws);
+               (*num_ws)++;
+               spin_unlock(ws_lock);
                goto wake;
        }
-       spin_unlock(workspace_lock);
+       spin_unlock(ws_lock);
 
        btrfs_compress_op[idx]->free_workspace(workspace);
-       atomic_dec(alloc_workspace);
+       atomic_dec(alloc_ws);
 wake:
+       /*
+        * Make sure counter is updated before we wake up waiters.
+        */
        smp_mb();
-       if (waitqueue_active(workspace_wait))
-               wake_up(workspace_wait);
+       if (waitqueue_active(ws_wait))
+               wake_up(ws_wait);
 }
 
 /*
@@ -853,11 +858,11 @@ static void free_workspaces(void)
        int i;
 
        for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
-               while (!list_empty(&comp_idle_workspace[i])) {
-                       workspace = comp_idle_workspace[i].next;
+               while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
+                       workspace = btrfs_comp_ws[i].idle_ws.next;
                        list_del(workspace);
                        btrfs_compress_op[i]->free_workspace(workspace);
-                       atomic_dec(&comp_alloc_workspace[i]);
+                       atomic_dec(&btrfs_comp_ws[i].alloc_ws);
                }
        }
 }
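
The compression.c rework above folds five parallel per-type arrays (idle list, lock, counter, alloc counter, waitqueue) into a single array of structs. A small sketch of the same refactor pattern in plain C, with pthread primitives standing in for the kernel ones and made-up names:

#include <pthread.h>

#define NR_TYPES 2

static struct {
	pthread_mutex_t lock;	/* previously a separate lock[] array */
	int num_idle;		/* previously a separate counter array */
} workspace[NR_TYPES];

static void init_workspaces(void)
{
	for (int i = 0; i < NR_TYPES; i++) {
		pthread_mutex_init(&workspace[i].lock, NULL);
		workspace[i].num_idle = 0;
	}
}
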
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5f745eadf77dd07454ccb0c1d201124738cb4edc..5b8e235c4b6d6299183ebba91976f0d53612c16c 100644
@@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                        return ret;
                if (refs == 0) {
                        ret = -EROFS;
-                       btrfs_std_error(root->fs_info, ret);
+                       btrfs_std_error(root->fs_info, ret, NULL);
                        return ret;
                }
        } else {
@@ -1927,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                child = read_node_slot(root, mid, 0);
                if (!child) {
                        ret = -EROFS;
-                       btrfs_std_error(root->fs_info, ret);
+                       btrfs_std_error(root->fs_info, ret, NULL);
                        goto enospc;
                }
 
@@ -2030,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                 */
                if (!left) {
                        ret = -EROFS;
-                       btrfs_std_error(root->fs_info, ret);
+                       btrfs_std_error(root->fs_info, ret, NULL);
                        goto enospc;
                }
                wret = balance_node_right(trans, root, mid, left);
@@ -4940,8 +4940,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
        struct extent_buffer *leaf;
        struct btrfs_item *item;
-       int last_off;
-       int dsize = 0;
+       u32 last_off;
+       u32 dsize = 0;
        int ret = 0;
        int wret;
        int i;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 938efe33be809240bc19bdf81d854c623191a3d8..a2e73f6053a853698bef78b8d5ed776a695933c5 100644
@@ -823,8 +823,18 @@ struct btrfs_disk_balance_args {
         */
        __le64 profiles;
 
-       /* usage filter */
-       __le64 usage;
+       /*
+        * usage filter
+        * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
+        * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
+        */
+       union {
+               __le64 usage;
+               struct {
+                       __le32 usage_min;
+                       __le32 usage_max;
+               };
+       };
 
        /* devid filter */
        __le64 devid;
@@ -846,10 +856,27 @@ struct btrfs_disk_balance_args {
        /* BTRFS_BALANCE_ARGS_* */
        __le64 flags;
 
-       /* BTRFS_BALANCE_ARGS_LIMIT value */
-       __le64 limit;
+       /*
+        * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
+        * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum
+        * and maximum
+        */
+       union {
+               __le64 limit;
+               struct {
+                       __le32 limit_min;
+                       __le32 limit_max;
+               };
+       };
 
-       __le64 unused[7];
+       /*
+        * Process chunks that cross stripes_min..stripes_max devices,
+        * BTRFS_BALANCE_ARGS_STRIPES_RANGE
+        */
+       __le32 stripes_min;
+       __le32 stripes_max;
+
+       __le64 unused[6];
 } __attribute__ ((__packed__));
 
 /*
@@ -1154,6 +1181,10 @@ struct btrfs_space_info {
                                   delalloc/allocations */
        u64 bytes_readonly;     /* total bytes that are read only */
 
+       u64 max_extent_size;    /* This will hold the maximum extent size of
+                                  the space info if we had an ENOSPC in the
+                                  allocator. */
+
        unsigned int full:1;    /* indicates that we cannot allocate any more
                                   chunks for this space */
        unsigned int chunk_alloc:1;     /* set if we are allocating a chunk */
@@ -1228,6 +1259,9 @@ struct btrfs_free_cluster {
        /* first extent starting offset */
        u64 window_start;
 
+       /* We did a full search and couldn't create a cluster */
+       bool fragmented;
+
        struct btrfs_block_group_cache *block_group;
        /*
         * when a cluster is allocated from a block group, we put the
@@ -1943,6 +1977,9 @@ struct btrfs_root {
        int send_in_progress;
        struct btrfs_subvolume_writers *subv_writers;
        atomic_t will_be_snapshoted;
+
+       /* For qgroup metadata space reserve */
+       atomic_t qgroup_meta_rsv;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -2145,6 +2182,8 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR       (1 << 22)
 #define BTRFS_MOUNT_RESCAN_UUID_TREE   (1 << 23)
+#define BTRFS_MOUNT_FRAGMENT_DATA      (1 << 24)
+#define BTRFS_MOUNT_FRAGMENT_METADATA  (1 << 25)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL  (30)
 #define BTRFS_DEFAULT_MAX_INLINE       (8192)
@@ -2169,6 +2208,18 @@ struct btrfs_ioctl_defrag_range_args {
        btrfs_clear_opt(root->fs_info->mount_opt, opt);                 \
 }
 
+#ifdef CONFIG_BTRFS_DEBUG
+static inline int
+btrfs_should_fragment_free_space(struct btrfs_root *root,
+                                struct btrfs_block_group_cache *block_group)
+{
+       return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+               block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+              (btrfs_test_opt(root, FRAGMENT_DATA) &&
+               block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
+
 /*
  * Requests for changes that need to be done during transaction commit.
  *
@@ -3379,7 +3430,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 root_objectid, u64 owner,
-                                    u64 offset, struct btrfs_key *ins);
+                                    u64 offset, u64 ram_bytes,
+                                    struct btrfs_key *ins);
 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   u64 root_objectid, u64 owner, u64 offset,
@@ -3398,7 +3450,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-                     u64 owner, u64 offset, int no_quota);
+                     u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
                               int delalloc);
@@ -3411,7 +3463,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset, int no_quota);
+                        u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root);
@@ -3449,8 +3501,11 @@ enum btrfs_reserve_flush_enum {
        BTRFS_RESERVE_FLUSH_ALL,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+                                           u64 len);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -3466,8 +3521,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
                                      u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
                                              unsigned short type);
@@ -4004,8 +4059,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 /* sysfs.c */
 int btrfs_init_sysfs(void);
 void btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
 
 /* xattr.c */
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -4039,14 +4094,102 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 #define btrfs_info(fs_info, fmt, args...) \
        btrfs_printk(fs_info, KERN_INFO fmt, ##args)
 
+/*
+ * Wrappers that use printk_in_rcu
+ */
+#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk_in_rcu
+ */
+#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk
+ */
+#define btrfs_emerg_rl(fs_info, fmt, args...) \
+       btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl(fs_info, fmt, args...) \
+       btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl(fs_info, fmt, args...) \
+       btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl(fs_info, fmt, args...) \
+       btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl(fs_info, fmt, args...) \
+       btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl(fs_info, fmt, args...) \
+       btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl(fs_info, fmt, args...) \
+       btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
 #ifdef DEBUG
 #define btrfs_debug(fs_info, fmt, args...) \
        btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+       btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+       btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
 #else
 #define btrfs_debug(fs_info, fmt, args...) \
     no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+       no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+       no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+       no_printk(KERN_DEBUG fmt, ##args)
 #endif
 
+#define btrfs_printk_in_rcu(fs_info, fmt, args...)     \
+do {                                                   \
+       rcu_read_lock();                                \
+       btrfs_printk(fs_info, fmt, ##args);             \
+       rcu_read_unlock();                              \
+} while (0)
+
+#define btrfs_printk_ratelimited(fs_info, fmt, args...)                \
+do {                                                           \
+       static DEFINE_RATELIMIT_STATE(_rs,                      \
+               DEFAULT_RATELIMIT_INTERVAL,                     \
+               DEFAULT_RATELIMIT_BURST);                       \
+       if (__ratelimit(&_rs))                                  \
+               btrfs_printk(fs_info, fmt, ##args);             \
+} while (0)
+
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...)          \
+do {                                                           \
+       rcu_read_lock();                                        \
+       btrfs_printk_ratelimited(fs_info, fmt, ##args);         \
+       rcu_read_unlock();                                      \
+} while (0)
+
 #ifdef CONFIG_BTRFS_ASSERT
 
 __cold
@@ -4127,14 +4270,7 @@ do {                                                             \
                                  __LINE__, (errno));           \
 } while (0)
 
-#define btrfs_std_error(fs_info, errno)                                \
-do {                                                           \
-       if ((errno))                                            \
-               __btrfs_std_error((fs_info), __func__,          \
-                                  __LINE__, (errno), NULL);    \
-} while (0)
-
-#define btrfs_error(fs_info, errno, fmt, args...)              \
+#define btrfs_std_error(fs_info, errno, fmt, args...)          \
 do {                                                           \
        __btrfs_std_error((fs_info), __func__, __LINE__,        \
                          (errno), fmt, ##args);                \
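
One detail worth noting in the ctree.h hunks: the extended balance filters stay format-compatible because the usage and limit fields overlay their existing __le64 slots with min/max __le32 pairs, and the stripes range takes one slot from the unused[] padding, so struct btrfs_disk_balance_args keeps its size. A standalone sketch of the overlay trick, using plain C types instead of the on-disk __le types:

#include <stdint.h>

struct usage_filter {
	union {
		uint64_t usage;			/* single-value form: 0..N */
		struct {
			uint32_t usage_min;	/* range form: min..max */
			uint32_t usage_max;
		};
	};
};

_Static_assert(sizeof(struct usage_filter) == sizeof(uint64_t),
	       "the range form must not change the structure size");
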
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a2ae42720a6afe92701de402b837d56dd66d03b4..e0941fbb913c2e99740b9df34d06efbc8cffdea1 100644
@@ -463,6 +463,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
 static void finish_one_item(struct btrfs_delayed_root *delayed_root)
 {
        int seq = atomic_inc_return(&delayed_root->items_seq);
+
+       /*
+        * atomic_dec_return implies a barrier for waitqueue_active
+        */
        if ((atomic_dec_return(&delayed_root->items) <
            BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
            waitqueue_active(&delayed_root->wait))
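
The new comment in finish_one_item() records why no explicit barrier is needed there: the atomic read-modify-write already orders the counter update against the waitqueue_active() check. A userspace sketch of the same ordering idea with C11 atomics (the threshold and the wakeup helper are stand-ins, not real APIs):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int pending;
static atomic_bool have_waiters;

static void wake_waiters(void)
{
	/* in real code: signal a condition variable or futex */
}

static void finish_one(void)
{
	/* the seq_cst fetch_sub is a full barrier, so the counter update
	 * is visible before the waiter check below */
	int remaining = atomic_fetch_sub(&pending, 1) - 1;

	if (remaining < 16 && atomic_load(&have_waiters))
		wake_waiters();
}
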
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ac3e81da6d4edc8e33856840349cc88750771546..e06dd75ad13f98999682c00128ae852e0a886e14 100644
@@ -197,6 +197,119 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
                trans->delayed_ref_updates--;
 }
 
+static bool merge_ref(struct btrfs_trans_handle *trans,
+                     struct btrfs_delayed_ref_root *delayed_refs,
+                     struct btrfs_delayed_ref_head *head,
+                     struct btrfs_delayed_ref_node *ref,
+                     u64 seq)
+{
+       struct btrfs_delayed_ref_node *next;
+       bool done = false;
+
+       next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+                               list);
+       while (!done && &next->list != &head->ref_list) {
+               int mod;
+               struct btrfs_delayed_ref_node *next2;
+
+               next2 = list_next_entry(next, list);
+
+               if (next == ref)
+                       goto next;
+
+               if (seq && next->seq >= seq)
+                       goto next;
+
+               if (next->type != ref->type)
+                       goto next;
+
+               if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+                    ref->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+                   comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref),
+                                  btrfs_delayed_node_to_tree_ref(next),
+                                  ref->type))
+                       goto next;
+               if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY ||
+                    ref->type == BTRFS_SHARED_DATA_REF_KEY) &&
+                   comp_data_refs(btrfs_delayed_node_to_data_ref(ref),
+                                  btrfs_delayed_node_to_data_ref(next)))
+                       goto next;
+
+               if (ref->action == next->action) {
+                       mod = next->ref_mod;
+               } else {
+                       if (ref->ref_mod < next->ref_mod) {
+                               swap(ref, next);
+                               done = true;
+                       }
+                       mod = -next->ref_mod;
+               }
+
+               drop_delayed_ref(trans, delayed_refs, head, next);
+               ref->ref_mod += mod;
+               if (ref->ref_mod == 0) {
+                       drop_delayed_ref(trans, delayed_refs, head, ref);
+                       done = true;
+               } else {
+                       /*
+                        * Can't have multiples of the same ref on a tree block.
+                        */
+                       WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+                               ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+               }
+next:
+               next = next2;
+       }
+
+       return done;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info,
+                             struct btrfs_delayed_ref_root *delayed_refs,
+                             struct btrfs_delayed_ref_head *head)
+{
+       struct btrfs_delayed_ref_node *ref;
+       u64 seq = 0;
+
+       assert_spin_locked(&head->lock);
+
+       if (list_empty(&head->ref_list))
+               return;
+
+       /* We don't have too many refs to merge for data. */
+       if (head->is_data)
+               return;
+
+       spin_lock(&fs_info->tree_mod_seq_lock);
+       if (!list_empty(&fs_info->tree_mod_seq_list)) {
+               struct seq_list *elem;
+
+               elem = list_first_entry(&fs_info->tree_mod_seq_list,
+                                       struct seq_list, list);
+               seq = elem->seq;
+       }
+       spin_unlock(&fs_info->tree_mod_seq_lock);
+
+       ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+                              list);
+       while (&ref->list != &head->ref_list) {
+               if (seq && ref->seq >= seq)
+                       goto next;
+
+               if (merge_ref(trans, delayed_refs, head, ref, seq)) {
+                       if (list_empty(&head->ref_list))
+                               break;
+                       ref = list_first_entry(&head->ref_list,
+                                              struct btrfs_delayed_ref_node,
+                                              list);
+                       continue;
+               }
+next:
+               ref = list_next_entry(ref, list);
+       }
+}
+
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
                            struct btrfs_delayed_ref_root *delayed_refs,
                            u64 seq)
@@ -292,8 +405,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
        exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
                           list);
        /* No need to compare bytenr nor is_head */
-       if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
-           exist->seq != ref->seq)
+       if (exist->type != ref->type || exist->seq != ref->seq)
                goto add_tail;
 
        if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -423,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
                     struct btrfs_trans_handle *trans,
                     struct btrfs_delayed_ref_node *ref,
                     struct btrfs_qgroup_extent_record *qrecord,
-                    u64 bytenr, u64 num_bytes, int action, int is_data)
+                    u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
+                    int action, int is_data)
 {
        struct btrfs_delayed_ref_head *existing;
        struct btrfs_delayed_ref_head *head_ref = NULL;
@@ -432,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
        int count_mod = 1;
        int must_insert_reserved = 0;
 
+       /* If reserved is provided, it must be a data extent. */
+       BUG_ON(!is_data && reserved);
+
        /*
         * the head node stores the sum of all the mods, so dropping a ref
         * should drop the sum in the head node by one.
@@ -476,9 +592,16 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
        INIT_LIST_HEAD(&head_ref->ref_list);
        head_ref->processing = 0;
        head_ref->total_ref_mod = count_mod;
+       head_ref->qgroup_reserved = 0;
+       head_ref->qgroup_ref_root = 0;
 
        /* Record qgroup extent info if provided */
        if (qrecord) {
+               if (ref_root && reserved) {
+                       head_ref->qgroup_ref_root = ref_root;
+                       head_ref->qgroup_reserved = reserved;
+               }
+
                qrecord->bytenr = bytenr;
                qrecord->num_bytes = num_bytes;
                qrecord->old_roots = NULL;
@@ -497,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
        existing = htree_insert(&delayed_refs->href_root,
                                &head_ref->href_node);
        if (existing) {
+               WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+                       && existing->qgroup_reserved);
                update_existing_head_ref(delayed_refs, &existing->node, ref);
                /*
                 * we've updated the existing ref, free the newly
@@ -524,7 +649,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
                     struct btrfs_delayed_ref_head *head_ref,
                     struct btrfs_delayed_ref_node *ref, u64 bytenr,
                     u64 num_bytes, u64 parent, u64 ref_root, int level,
-                    int action, int no_quota)
+                    int action)
 {
        struct btrfs_delayed_tree_ref *full_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
@@ -546,7 +671,6 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
        ref->action = action;
        ref->is_head = 0;
        ref->in_tree = 1;
-       ref->no_quota = no_quota;
        ref->seq = seq;
 
        full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -579,7 +703,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
                     struct btrfs_delayed_ref_head *head_ref,
                     struct btrfs_delayed_ref_node *ref, u64 bytenr,
                     u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
-                    u64 offset, int action, int no_quota)
+                    u64 offset, int action)
 {
        struct btrfs_delayed_data_ref *full_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
@@ -602,7 +726,6 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        ref->action = action;
        ref->is_head = 0;
        ref->in_tree = 1;
-       ref->no_quota = no_quota;
        ref->seq = seq;
 
        full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -633,17 +756,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
                               struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes, u64 parent,
                               u64 ref_root,  int level, int action,
-                              struct btrfs_delayed_extent_op *extent_op,
-                              int no_quota)
+                              struct btrfs_delayed_extent_op *extent_op)
 {
        struct btrfs_delayed_tree_ref *ref;
        struct btrfs_delayed_ref_head *head_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_qgroup_extent_record *record = NULL;
 
-       if (!is_fstree(ref_root) || !fs_info->quota_enabled)
-               no_quota = 0;
-
        BUG_ON(extent_op && extent_op->is_data);
        ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
        if (!ref)
@@ -669,11 +788,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
         * the spin lock
         */
        head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
-                                       bytenr, num_bytes, action, 0);
+                                       bytenr, num_bytes, 0, 0, action, 0);
 
        add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
-                                  num_bytes, parent, ref_root, level, action,
-                                  no_quota);
+                            num_bytes, parent, ref_root, level, action);
        spin_unlock(&delayed_refs->lock);
 
        return 0;
@@ -693,18 +811,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
                               struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes,
                               u64 parent, u64 ref_root,
-                              u64 owner, u64 offset, int action,
-                              struct btrfs_delayed_extent_op *extent_op,
-                              int no_quota)
+                              u64 owner, u64 offset, u64 reserved, int action,
+                              struct btrfs_delayed_extent_op *extent_op)
 {
        struct btrfs_delayed_data_ref *ref;
        struct btrfs_delayed_ref_head *head_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_qgroup_extent_record *record = NULL;
 
-       if (!is_fstree(ref_root) || !fs_info->quota_enabled)
-               no_quota = 0;
-
        BUG_ON(extent_op && !extent_op->is_data);
        ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
        if (!ref)
@@ -736,16 +850,44 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
         * the spin lock
         */
        head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
-                                       bytenr, num_bytes, action, 1);
+                                       bytenr, num_bytes, ref_root, reserved,
+                                       action, 1);
 
        add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
                                   num_bytes, parent, ref_root, owner, offset,
-                                  action, no_quota);
+                                  action);
        spin_unlock(&delayed_refs->lock);
 
        return 0;
 }
 
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+                                    struct btrfs_trans_handle *trans,
+                                    u64 ref_root, u64 bytenr, u64 num_bytes)
+{
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_head *ref_head;
+       int ret = 0;
+
+       if (!fs_info->quota_enabled || !is_fstree(ref_root))
+               return 0;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+
+       spin_lock(&delayed_refs->lock);
+       ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
+       if (!ref_head) {
+               ret = -ENOENT;
+               goto out;
+       }
+       WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
+       ref_head->qgroup_ref_root = ref_root;
+       ref_head->qgroup_reserved = num_bytes;
+out:
+       spin_unlock(&delayed_refs->lock);
+       return ret;
+}
+
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
@@ -764,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
        spin_lock(&delayed_refs->lock);
 
        add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
-                            num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+                            num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
                             extent_op->is_data);
 
        spin_unlock(&delayed_refs->lock);
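
The new merge_ref()/btrfs_merge_delayed_refs() code above collapses pairs of delayed refs by summing their mod counts: same-action refs accumulate, opposite actions subtract, and a result of zero means both entries can be dropped. A toy sketch of that arithmetic (it leaves out the swap that keeps the larger-count entry as the survivor):

#include <stdbool.h>
#include <stdio.h>

enum ref_action { REF_ADD, REF_DROP };

struct toy_ref {
	enum ref_action action;
	int ref_mod;
};

/* merge 'next' into 'keep'; returns true if they cancelled out entirely */
static bool merge_two(struct toy_ref *keep, const struct toy_ref *next)
{
	int mod = (keep->action == next->action) ? next->ref_mod
						 : -next->ref_mod;

	keep->ref_mod += mod;
	return keep->ref_mod == 0;
}

int main(void)
{
	struct toy_ref add = { REF_ADD, 1 };
	struct toy_ref drop = { REF_DROP, 1 };

	printf("cancelled: %s\n", merge_two(&add, &drop) ? "yes" : "no");
	return 0;
}
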
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 13fb5e6090fe0efc7f55dc9b09e9faac57a3c91b..00ed02cbf3e903a708dd55a4c232a911241da2a8 100644
@@ -68,7 +68,6 @@ struct btrfs_delayed_ref_node {
 
        unsigned int action:8;
        unsigned int type:8;
-       unsigned int no_quota:1;
        /* is this node still in the rbtree? */
        unsigned int is_head:1;
        unsigned int in_tree:1;
@@ -112,6 +111,17 @@ struct btrfs_delayed_ref_head {
         */
        int total_ref_mod;
 
+       /*
+        * For qgroup reserved space freeing.
+        *
+        * ref_root and reserved will be recorded after
+        * BTRFS_ADD_DELAYED_EXTENT is called.
+        * And will be used to free reserved qgroup space at
+        * run_delayed_refs() time.
+        */
+       u64 qgroup_ref_root;
+       u64 qgroup_reserved;
+
        /*
         * when a new extent is allocated, it is just reserved in memory
         * The actual extent isn't inserted into the extent allocation tree
@@ -233,15 +243,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
                               struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes, u64 parent,
                               u64 ref_root, int level, int action,
-                              struct btrfs_delayed_extent_op *extent_op,
-                              int no_quota);
+                              struct btrfs_delayed_extent_op *extent_op);
 int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
                               struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes,
                               u64 parent, u64 ref_root,
-                              u64 owner, u64 offset, int action,
-                              struct btrfs_delayed_extent_op *extent_op,
-                              int no_quota);
+                              u64 owner, u64 offset, u64 reserved, int action,
+                              struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+                                    struct btrfs_trans_handle *trans,
+                                    u64 ref_root, u64 bytenr, u64 num_bytes);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e54dd5905cee177912e03c915a8d471762bfc0cd..1e668fb7dd4c73dbc03b7f642660f2af20bd2ddb 100644
@@ -327,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
            args->start.tgtdev_name[0] == '\0')
                return -EINVAL;
 
-       /*
-        * Here we commit the transaction to make sure commit_total_bytes
-        * of all the devices are updated.
-        */
-       trans = btrfs_attach_transaction(root);
-       if (!IS_ERR(trans)) {
-               ret = btrfs_commit_transaction(trans, root);
-               if (ret)
-                       return ret;
-       } else if (PTR_ERR(trans) != -ENOENT) {
-               return PTR_ERR(trans);
-       }
-
        /* the disk copy procedure reuses the scrub code */
        mutex_lock(&fs_info->volume_mutex);
        ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
@@ -356,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
        if (ret)
                return ret;
 
+       /*
+        * Here we commit the transaction to make sure commit_total_bytes
+        * of all the devices are updated.
+        */
+       trans = btrfs_attach_transaction(root);
+       if (!IS_ERR(trans)) {
+               ret = btrfs_commit_transaction(trans, root);
+               if (ret)
+                       return ret;
+       } else if (PTR_ERR(trans) != -ENOENT) {
+               return PTR_ERR(trans);
+       }
+
        btrfs_dev_replace_lock(dev_replace);
        switch (dev_replace->replace_state) {
        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -375,12 +375,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
        WARN_ON(!tgt_device);
        dev_replace->tgtdev = tgt_device;
 
-       ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
-       if (ret)
-               btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
-
-       printk_in_rcu(KERN_INFO
-                     "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
+       btrfs_info_in_rcu(root->fs_info,
+                     "dev_replace from %s (devid %llu) to %s started",
                      src_device->missing ? "<missing disk>" :
                        rcu_str_deref(src_device->name),
                      src_device->devid,
@@ -401,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
        btrfs_dev_replace_unlock(dev_replace);
 
+       ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
+       if (ret)
+               btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+
        btrfs_wait_ordered_roots(root->fs_info, -1);
 
        /* force writing the updated state information to disk */
@@ -454,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
 static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
 {
        clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
-       if (waitqueue_active(&fs_info->replace_wait))
-               wake_up(&fs_info->replace_wait);
+       wake_up(&fs_info->replace_wait);
 }
 
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -523,8 +522,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                                                                src_device,
                                                                tgt_device);
        } else {
-               printk_in_rcu(KERN_ERR
-                             "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+               btrfs_err_in_rcu(root->fs_info,
+                             "btrfs_scrub_dev(%s, %llu, %s) failed %d",
                              src_device->missing ? "<missing disk>" :
                                rcu_str_deref(src_device->name),
                              src_device->devid,
@@ -540,8 +539,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                return scrub_ret;
        }
 
-       printk_in_rcu(KERN_INFO
-                     "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
+       btrfs_info_in_rcu(root->fs_info,
+                     "dev_replace from %s (devid %llu) to %s finished",
                      src_device->missing ? "<missing disk>" :
                        rcu_str_deref(src_device->name),
                      src_device->devid,
@@ -586,7 +585,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
        mutex_unlock(&uuid_mutex);
 
        /* replace the sysfs entry */
-       btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
+       btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
        btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
 
        /* write back the superblocks */
@@ -809,8 +808,8 @@ static int btrfs_dev_replace_kthread(void *data)
                progress = status_args->status.progress_1000;
                kfree(status_args);
                progress = div_u64(progress, 10);
-               printk_in_rcu(KERN_INFO
-                       "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+               btrfs_info_in_rcu(fs_info,
+                       "continuing dev_replace from %s (devid %llu) to %s @%u%%",
                        dev_replace->srcdev->missing ? "<missing disk>" :
                        rcu_str_deref(dev_replace->srcdev->name),
                        dev_replace->srcdev->devid,
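
The dev-replace.c hunks (and the disk-io.c ones below) convert raw printk_in_rcu/printk_ratelimited calls to the btrfs_*_in_rcu and btrfs_*_rl wrappers added in ctree.h above. A rough userspace sketch of the per-call-site rate limiting those wrappers rely on; the five-second interval and the fprintf backend are placeholders, and the kernel's __ratelimit() also tracks a burst count that this omits:

#include <stdio.h>
#include <time.h>

/* at most one message per call site every 5 seconds (coarse, illustrative) */
#define pr_warn_rl(fmt, args...)				\
do {								\
	static time_t __last;					\
	time_t __now = time(NULL);				\
	if (__now - __last >= 5) {				\
		__last = __now;					\
		fprintf(stderr, fmt, ##args);			\
	}							\
} while (0)
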
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1e60d00d4ea7c42104614ede9e203a1f56e6408a..2d4667594681133ba4e4c4c80ec583ce212e3787 100644
@@ -319,9 +319,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
                        memcpy(&found, result, csum_size);
 
                        read_extent_buffer(buf, &val, 0, csum_size);
-                       printk_ratelimited(KERN_WARNING
-                               "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
-                               "level %d\n",
+                       btrfs_warn_rl(fs_info,
+                               "%s checksum verify failed on %llu wanted %X found %X "
+                               "level %d",
                                fs_info->sb->s_id, buf->start,
                                val, found, btrfs_header_level(buf));
                        if (result != (char *)&inline_result)
@@ -368,9 +368,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
                ret = 0;
                goto out;
        }
-       printk_ratelimited(KERN_ERR
-           "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
-                       eb->fs_info->sb->s_id, eb->start,
+       btrfs_err_rl(eb->fs_info,
+               "parent transid verify failed on %llu wanted %llu found %llu",
+                       eb->start,
                        parent_transid, btrfs_header_generation(eb));
        ret = 1;
 
@@ -629,15 +629,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
        found_start = btrfs_header_bytenr(eb);
        if (found_start != eb->start) {
-               printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
-                              "%llu %llu\n",
-                              eb->fs_info->sb->s_id, found_start, eb->start);
+               btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
+                              found_start, eb->start);
                ret = -EIO;
                goto err;
        }
        if (check_tree_block_fsid(root->fs_info, eb)) {
-               printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
-                              eb->fs_info->sb->s_id, eb->start);
+               btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
+                              eb->start);
                ret = -EIO;
                goto err;
        }
@@ -802,6 +801,9 @@ static void run_one_async_done(struct btrfs_work *work)
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;
 
+       /*
+        * atomic_dec_return implies a barrier for waitqueue_active
+        */
        if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
            waitqueue_active(&fs_info->async_submit_wait))
                wake_up(&fs_info->async_submit_wait);
@@ -1265,6 +1267,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
        atomic_set(&root->orphan_inodes, 0);
        atomic_set(&root->refs, 1);
        atomic_set(&root->will_be_snapshoted, 0);
+       atomic_set(&root->qgroup_meta_rsv, 0);
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
@@ -1759,6 +1762,7 @@ static int cleaner_kthread(void *arg)
        int again;
        struct btrfs_trans_handle *trans;
 
+       set_freezable();
        do {
                again = 0;
 
@@ -2348,8 +2352,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
        u64 bytenr = btrfs_super_log_root(disk_super);
 
        if (fs_devices->rw_devices == 0) {
-               printk(KERN_WARNING "BTRFS: log replay required "
-                      "on RO media\n");
+               btrfs_warn(fs_info, "log replay required on RO media");
                return -EIO;
        }
 
@@ -2364,12 +2367,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
        log_tree_root->node = read_tree_block(tree_root, bytenr,
                        fs_info->generation + 1);
        if (IS_ERR(log_tree_root->node)) {
-               printk(KERN_ERR "BTRFS: failed to read log tree\n");
+               btrfs_warn(fs_info, "failed to read log tree");
                ret = PTR_ERR(log_tree_root->node);
                kfree(log_tree_root);
                return ret;
        } else if (!extent_buffer_uptodate(log_tree_root->node)) {
-               printk(KERN_ERR "BTRFS: failed to read log tree\n");
+               btrfs_err(fs_info, "failed to read log tree");
                free_extent_buffer(log_tree_root->node);
                kfree(log_tree_root);
                return -EIO;
@@ -2377,7 +2380,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
        /* returns with log_tree_root freed on success */
        ret = btrfs_recover_log_trees(log_tree_root);
        if (ret) {
-               btrfs_error(tree_root->fs_info, ret,
+               btrfs_std_error(tree_root->fs_info, ret,
                            "Failed to recover log tree");
                free_extent_buffer(log_tree_root->node);
                kfree(log_tree_root);
@@ -2653,8 +2656,8 @@ int open_ctree(struct super_block *sb,
         * Read super block and check the signature bytes only
         */
        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-       if (!bh) {
-               err = -EINVAL;
+       if (IS_ERR(bh)) {
+               err = PTR_ERR(bh);
                goto fail_alloc;
        }
 
@@ -2937,7 +2940,7 @@ retry_root_backup:
                goto fail_fsdev_sysfs;
        }
 
-       ret = btrfs_sysfs_add_one(fs_info);
+       ret = btrfs_sysfs_add_mounted(fs_info);
        if (ret) {
                pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
                goto fail_fsdev_sysfs;
@@ -3117,7 +3120,7 @@ fail_cleaner:
        filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 
 fail_sysfs:
-       btrfs_sysfs_remove_one(fs_info);
+       btrfs_sysfs_remove_mounted(fs_info);
 
 fail_fsdev_sysfs:
        btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -3179,8 +3182,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
                struct btrfs_device *device = (struct btrfs_device *)
                        bh->b_private;
 
-               printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
-                                         "I/O error on %s\n",
+               btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
+                               "lost page write due to IO error on %s",
                                          rcu_str_deref(device->name));
                /* note, we dont' set_buffer_write_io_error because we have
                 * our own ways of dealing with the IO errors
@@ -3192,6 +3195,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
        put_bh(bh);
 }
 
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+                       struct buffer_head **bh_ret)
+{
+       struct buffer_head *bh;
+       struct btrfs_super_block *super;
+       u64 bytenr;
+
+       bytenr = btrfs_sb_offset(copy_num);
+       if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+               return -EINVAL;
+
+       bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
+       /*
+        * If we fail to read from the underlying devices, as of now
+        * the best option we have is to mark it EIO.
+        */
+       if (!bh)
+               return -EIO;
+
+       super = (struct btrfs_super_block *)bh->b_data;
+       if (btrfs_super_bytenr(super) != bytenr ||
+                   btrfs_super_magic(super) != BTRFS_MAGIC) {
+               brelse(bh);
+               return -EINVAL;
+       }
+
+       *bh_ret = bh;
+       return 0;
+}
+
+
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 {
        struct buffer_head *bh;
@@ -3199,7 +3233,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
        struct btrfs_super_block *super;
        int i;
        u64 transid = 0;
-       u64 bytenr;
+       int ret = -EINVAL;
 
        /* we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
@@ -3207,21 +3241,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        for (i = 0; i < 1; i++) {
-               bytenr = btrfs_sb_offset(i);
-               if (bytenr + BTRFS_SUPER_INFO_SIZE >=
-                                       i_size_read(bdev->bd_inode))
-                       break;
-               bh = __bread(bdev, bytenr / 4096,
-                                       BTRFS_SUPER_INFO_SIZE);
-               if (!bh)
+               ret = btrfs_read_dev_one_super(bdev, i, &bh);
+               if (ret)
                        continue;
 
                super = (struct btrfs_super_block *)bh->b_data;
-               if (btrfs_super_bytenr(super) != bytenr ||
-                   btrfs_super_magic(super) != BTRFS_MAGIC) {
-                       brelse(bh);
-                       continue;
-               }
 
                if (!latest || btrfs_super_generation(super) > transid) {
                        brelse(latest);
@@ -3231,6 +3255,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
                        brelse(bh);
                }
        }
+
+       if (!latest)
+               return ERR_PTR(ret);
+
        return latest;
 }
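
The two hunks above split the per-copy read and validation out into btrfs_read_dev_one_super() and make btrfs_read_dev_super() return ERR_PTR() on failure instead of NULL. Below is a minimal user-space sketch of the same pattern: scan the available copies, keep the readable one with the highest generation, and propagate the last error when none can be read. The struct and helper names are invented for illustration; the real loop deliberately scans only the first copy, as the in-code comment explains.

/* Hypothetical analogue of the mirror scan in btrfs_read_dev_super(). */
#include <stdio.h>
#include <errno.h>

struct fake_super {
    unsigned long long generation;
    int valid;                          /* 0 means this copy fails to read */
};

/* Stand-in for btrfs_read_dev_one_super(): 0 and *out on success,
 * negative errno on failure. */
static int read_one_copy(const struct fake_super *copies, int i,
                         const struct fake_super **out)
{
    if (!copies[i].valid)
        return -EIO;
    *out = &copies[i];
    return 0;
}

int main(void)
{
    const struct fake_super copies[3] = {
        { .generation = 100, .valid = 1 },
        { .generation = 0,   .valid = 0 },   /* unreadable mirror */
        { .generation = 102, .valid = 1 },
    };
    const struct fake_super *latest = NULL, *cur;
    int i, ret = -EINVAL;

    for (i = 0; i < 3; i++) {
        ret = read_one_copy(copies, i, &cur);
        if (ret)
            continue;                        /* skip unreadable copies */
        if (!latest || cur->generation > latest->generation)
            latest = cur;                    /* keep the newest copy */
    }

    if (!latest) {
        printf("no readable super block, error %d\n", ret);
        return 1;
    }
    printf("picked copy with generation %llu\n", latest->generation);
    return 0;
}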
 
@@ -3299,8 +3327,9 @@ static int write_dev_supers(struct btrfs_device *device,
                        bh = __getblk(device->bdev, bytenr / 4096,
                                      BTRFS_SUPER_INFO_SIZE);
                        if (!bh) {
-                               printk(KERN_ERR "BTRFS: couldn't get super "
-                                      "buffer head for bytenr %Lu\n", bytenr);
+                               btrfs_err(device->dev_root->fs_info,
+                                   "couldn't get super buffer head for bytenr %llu",
+                                   bytenr);
                                errors++;
                                continue;
                        }
@@ -3449,22 +3478,31 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
 {
-       if ((flags & (BTRFS_BLOCK_GROUP_DUP |
-                     BTRFS_BLOCK_GROUP_RAID0 |
-                     BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
-           ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
-               return 0;
+       int raid_type;
+       int min_tolerated = INT_MAX;
 
-       if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-                    BTRFS_BLOCK_GROUP_RAID5 |
-                    BTRFS_BLOCK_GROUP_RAID10))
-               return 1;
+       if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
+           (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
+               min_tolerated = min(min_tolerated,
+                                   btrfs_raid_array[BTRFS_RAID_SINGLE].
+                                   tolerated_failures);
 
-       if (flags & BTRFS_BLOCK_GROUP_RAID6)
-               return 2;
+       for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+               if (raid_type == BTRFS_RAID_SINGLE)
+                       continue;
+               if (!(flags & btrfs_raid_group[raid_type]))
+                       continue;
+               min_tolerated = min(min_tolerated,
+                                   btrfs_raid_array[raid_type].
+                                   tolerated_failures);
+       }
 
-       pr_warn("BTRFS: unknown raid type: %llu\n", flags);
-       return 0;
+       if (min_tolerated == INT_MAX) {
+               pr_warn("BTRFS: unknown raid flag: %llu\n", flags);
+               min_tolerated = 0;
+       }
+
+       return min_tolerated;
 }
 
 int btrfs_calc_num_tolerated_disk_barrier_failures(
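
The rewritten btrfs_get_num_tolerated_disk_barrier_failures() above replaces the hard-coded profile checks with a table walk: every RAID profile present in the flags contributes its tolerated_failures value, and the minimum wins (falling back to 0 for unknown flags). A standalone sketch of that reduction follows, using an invented profile table rather than the kernel's btrfs_raid_array:

#include <stdio.h>
#include <limits.h>

enum { RAID_SINGLE, RAID_DUP, RAID1, RAID10, RAID5, RAID6, NR_RAID_TYPES };

static const struct {
    unsigned long long flag;            /* bit used in block group flags */
    int tolerated_failures;
} raid_table[NR_RAID_TYPES] = {
    [RAID_SINGLE] = { 0,      0 },
    [RAID_DUP]    = { 1 << 0, 0 },
    [RAID1]       = { 1 << 1, 1 },
    [RAID10]      = { 1 << 2, 1 },
    [RAID5]       = { 1 << 3, 1 },
    [RAID6]       = { 1 << 4, 2 },
};

static int min_tolerated_failures(unsigned long long flags)
{
    int i, min_tol = INT_MAX;

    /* A block group with no profile bit set behaves like SINGLE. */
    if (flags == 0)
        min_tol = raid_table[RAID_SINGLE].tolerated_failures;

    for (i = 0; i < NR_RAID_TYPES; i++) {
        if (i == RAID_SINGLE || !(flags & raid_table[i].flag))
            continue;
        if (raid_table[i].tolerated_failures < min_tol)
            min_tol = raid_table[i].tolerated_failures;
    }
    return min_tol == INT_MAX ? 0 : min_tol;    /* unknown flags: be safe */
}

int main(void)
{
    /* RAID1 | RAID5 tolerates one failed device, RAID6 alone tolerates two. */
    printf("%d\n", min_tolerated_failures((1 << 1) | (1 << 3)));   /* 1 */
    printf("%d\n", min_tolerated_failures(1 << 4));                /* 2 */
    return 0;
}
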
@@ -3548,7 +3586,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
                if (ret) {
                        mutex_unlock(
                                &root->fs_info->fs_devices->device_list_mutex);
-                       btrfs_error(root->fs_info, ret,
+                       btrfs_std_error(root->fs_info, ret,
                                    "errors while submitting device barriers.");
                        return ret;
                }
@@ -3588,7 +3626,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
                mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
                /* FUA is masked off if unsupported and can't be the reason */
-               btrfs_error(root->fs_info, -EIO,
+               btrfs_std_error(root->fs_info, -EIO,
                            "%d errors while writing supers", total_errors);
                return -EIO;
        }
@@ -3606,7 +3644,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
        }
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
        if (total_errors > max_errors) {
-               btrfs_error(root->fs_info, -EIO,
+               btrfs_std_error(root->fs_info, -EIO,
                            "%d errors while writing supers", total_errors);
                return -EIO;
        }
@@ -3792,7 +3830,7 @@ void close_ctree(struct btrfs_root *root)
                       percpu_counter_sum(&fs_info->delalloc_bytes));
        }
 
-       btrfs_sysfs_remove_one(fs_info);
+       btrfs_sysfs_remove_mounted(fs_info);
        btrfs_sysfs_remove_fsid(fs_info->fs_devices);
 
        btrfs_free_fs_roots(fs_info);
@@ -4290,25 +4328,6 @@ again:
        return 0;
 }
 
-static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
-                                      struct btrfs_fs_info *fs_info)
-{
-       struct btrfs_ordered_extent *ordered;
-
-       spin_lock(&fs_info->trans_lock);
-       while (!list_empty(&cur_trans->pending_ordered)) {
-               ordered = list_first_entry(&cur_trans->pending_ordered,
-                                          struct btrfs_ordered_extent,
-                                          trans_list);
-               list_del_init(&ordered->trans_list);
-               spin_unlock(&fs_info->trans_lock);
-
-               btrfs_put_ordered_extent(ordered);
-               spin_lock(&fs_info->trans_lock);
-       }
-       spin_unlock(&fs_info->trans_lock);
-}
-
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                   struct btrfs_root *root)
 {
@@ -4320,7 +4339,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
        cur_trans->state = TRANS_STATE_UNBLOCKED;
        wake_up(&root->fs_info->transaction_wait);
 
-       btrfs_free_pending_ordered(cur_trans, root->fs_info);
        btrfs_destroy_delayed_inodes(root);
        btrfs_assert_delayed_root_empty(root);
 
index bdfb479ea85955112305d0c30a17af9c7647daed..adeb31830b9cc1d49d145fdaa4f476c23dc81aa7 100644 (file)
@@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+                       struct buffer_head **bh_ret);
 int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
                                            u64 bytenr);
index 601d7d45d164a7e91477748a900bbef8cf67d0b0..99a8e57da8a11fbbdcbc0d833a687d79629200cc 100644 (file)
@@ -95,8 +95,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
-                                    int level, struct btrfs_key *ins,
-                                    int no_quota);
+                                    int level, struct btrfs_key *ins);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags,
                          int force);
@@ -332,6 +331,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
                kfree(ctl);
 }
 
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_root *root,
+                               struct btrfs_block_group_cache *block_group)
+{
+       u64 start = block_group->key.objectid;
+       u64 len = block_group->key.offset;
+       u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+               root->nodesize : root->sectorsize;
+       u64 step = chunk << 1;
+
+       while (len > chunk) {
+               btrfs_remove_free_space(block_group, start, chunk);
+               start += step;
+               if (len < step)
+                       len = 0;
+               else
+                       len -= step;
+       }
+}
+#endif
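
The debug-only fragment_free_space() added above walks the block group in strides of two chunks and removes one chunk of free space per stride, leaving the group deliberately fragmented for testing. A standalone sketch of the same stepping loop, with made-up sizes and a printf standing in for btrfs_remove_free_space():

#include <stdio.h>

int main(void)
{
    unsigned long long start = 0;
    unsigned long long len = 64 * 1024;      /* pretend block group size  */
    unsigned long long chunk = 16 * 1024;    /* nodesize or sectorsize    */
    unsigned long long step = chunk * 2;

    while (len > chunk) {
        /* In the kernel this is btrfs_remove_free_space(bg, start, chunk). */
        printf("remove free space [%llu, %llu)\n", start, start + chunk);
        start += step;
        len = len < step ? 0 : len - step;
    }
    return 0;
}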
+
 /*
  * this is only called by cache_block_group, since we could have freed extents
  * we need to check the pinned_extents for any extents that can't be used yet
@@ -388,6 +408,7 @@ static noinline void caching_thread(struct btrfs_work *work)
        u64 last = 0;
        u32 nritems;
        int ret = -ENOMEM;
+       bool wakeup = true;
 
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
@@ -400,6 +421,15 @@ static noinline void caching_thread(struct btrfs_work *work)
 
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       /*
+        * If we're fragmenting we don't want to make anybody think we can
+        * allocate from this block group until we've had a chance to fragment
+        * the free space.
+        */
+       if (btrfs_should_fragment_free_space(extent_root, block_group))
+               wakeup = false;
+#endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
@@ -441,7 +471,8 @@ next:
 
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
-                               caching_ctl->progress = last;
+                               if (wakeup)
+                                       caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
@@ -464,7 +495,8 @@ next:
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
 
-                       caching_ctl->progress = last;
+                       if (wakeup)
+                               caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }
@@ -491,7 +523,8 @@ next:
 
                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
-                               wake_up(&caching_ctl->wait);
+                               if (wakeup)
+                                       wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
@@ -501,13 +534,27 @@ next:
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
-       caching_ctl->progress = (u64)-1;
-
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+               u64 bytes_used;
+
+               spin_lock(&block_group->space_info->lock);
+               spin_lock(&block_group->lock);
+               bytes_used = block_group->key.offset -
+                       btrfs_block_group_used(&block_group->item);
+               block_group->space_info->bytes_used += bytes_used >> 1;
+               spin_unlock(&block_group->lock);
+               spin_unlock(&block_group->space_info->lock);
+               fragment_free_space(extent_root, block_group);
+       }
+#endif
+
+       caching_ctl->progress = (u64)-1;
 err:
        btrfs_free_path(path);
        up_read(&fs_info->commit_root_sem);
@@ -607,6 +654,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                        }
                }
                spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+               if (ret == 1 &&
+                   btrfs_should_fragment_free_space(fs_info->extent_root,
+                                                    cache)) {
+                       u64 bytes_used;
+
+                       spin_lock(&cache->space_info->lock);
+                       spin_lock(&cache->lock);
+                       bytes_used = cache->key.offset -
+                               btrfs_block_group_used(&cache->item);
+                       cache->space_info->bytes_used += bytes_used >> 1;
+                       spin_unlock(&cache->lock);
+                       spin_unlock(&cache->space_info->lock);
+                       fragment_free_space(fs_info->extent_root, cache);
+               }
+#endif
                mutex_unlock(&caching_ctl->mutex);
 
                wake_up(&caching_ctl->wait);
@@ -2009,8 +2072,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset,
-                        int no_quota)
+                        u64 root_objectid, u64 owner, u64 offset)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -2022,12 +2084,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+                                       BTRFS_ADD_DELAYED_REF, NULL);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
-                                       num_bytes,
-                                       parent, root_objectid, owner, offset,
-                                       BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+                                       num_bytes, parent, root_objectid,
+                                       owner, offset, 0,
+                                       BTRFS_ADD_DELAYED_REF, NULL);
        }
        return ret;
 }
@@ -2048,15 +2110,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        u64 num_bytes = node->num_bytes;
        u64 refs;
        int ret;
-       int no_quota = node->no_quota;
 
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 
-       if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
-               no_quota = 1;
-
        path->reada = 1;
        path->leave_spinning = 1;
        /* this will setup the path even if it fails to insert the back ref */
@@ -2291,8 +2349,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
                                                parent, ref_root,
                                                extent_op->flags_to_set,
                                                &extent_op->key,
-                                               ref->level, &ins,
-                                               node->no_quota);
+                                               ref->level, &ins);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
                ret = __btrfs_inc_extent_ref(trans, root, node,
                                             parent, ref_root,
@@ -2345,6 +2402,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                                                      node->num_bytes);
                        }
                }
+
+               /* Also free its reserved qgroup space */
+               btrfs_qgroup_free_delayed_ref(root->fs_info,
+                                             head->qgroup_ref_root,
+                                             head->qgroup_reserved);
                return ret;
        }
 
@@ -2433,7 +2495,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                        }
                }
 
+               /*
+                * We need to try and merge add/drops of the same ref since we
+                * can run into issues with relocate dropping the implicit ref
+                * and then it being added back again before the drop can
+                * finish.  If we merged anything we need to re-loop so we can
+                * get a good ref.
+                * Or we can get node references of the same type that weren't
+                * merged when created due to bumps in the tree mod seq, and
+                * we need to merge them to prevent adding an inline extent
+                * backref before dropping it (triggering a BUG_ON at
+                * insert_inline_extent_backref()).
+                */
                spin_lock(&locked_ref->lock);
+               btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+                                        locked_ref);
 
                /*
                 * locked_ref is the head node, so we have to go one
@@ -3109,7 +3185,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
        int level;
        int ret = 0;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-                           u64, u64, u64, u64, u64, u64, int);
+                           u64, u64, u64, u64, u64, u64);
 
 
        if (btrfs_test_is_dummy_root(root))
@@ -3150,15 +3226,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
-                                          key.offset, 1);
+                                          key.offset);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = root->nodesize;
                        ret = process_func(trans, root, bytenr, num_bytes,
-                                          parent, ref_root, level - 1, 0,
-                                          1);
+                                          parent, ref_root, level - 1, 0);
                        if (ret)
                                goto fail;
                }
@@ -3338,6 +3413,15 @@ again:
        }
        spin_unlock(&block_group->lock);
 
+       /*
+        * We hit an ENOSPC when setting up the cache in this transaction, so just
+        * skip doing the setup; we've already cleared the cache, so we're safe.
+        */
+       if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+               ret = -ENOSPC;
+               goto out_put;
+       }
+
        /*
         * Try to preallocate enough space based on how big the block group is.
         * Keep in mind this has to include any pinned space which could end up
@@ -3351,16 +3435,26 @@ again:
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
 
-       ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+       ret = btrfs_check_data_free_space(inode, 0, num_pages);
        if (ret)
                goto out_put;
 
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
+       /*
+        * Our cache requires contiguous chunks so that we don't modify a bunch
+        * of metadata or split extents when writing the cache out, which means
+        * we can hit ENOSPC if we are heavily fragmented, in addition to normal
+        * out of space conditions.  So if we hit this just skip setting up any
+        * other block groups for this transaction, maybe we'll unpin enough
+        * space the next time around.
+        */
        if (!ret)
                dcs = BTRFS_DC_SETUP;
-       btrfs_free_reserved_data_space(inode, num_pages);
+       else if (ret == -ENOSPC)
+               set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+       btrfs_free_reserved_data_space(inode, 0, num_pages);
 
 out_put:
        iput(inode);
@@ -3746,6 +3840,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->bytes_readonly = 0;
        found->bytes_may_use = 0;
        found->full = 0;
+       found->max_extent_size = 0;
        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
        found->chunk_alloc = 0;
        found->flush = 0;
@@ -3822,7 +3917,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        u64 num_devices = root->fs_info->fs_devices->rw_devices;
        u64 target;
-       u64 tmp;
+       u64 raid_type;
+       u64 allowed = 0;
 
        /*
         * see if restripe for this chunk_type is in progress, if so
@@ -3840,31 +3936,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        spin_unlock(&root->fs_info->balance_lock);
 
        /* First, mask out the RAID levels which aren't possible */
-       if (num_devices == 1)
-               flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
-                          BTRFS_BLOCK_GROUP_RAID5);
-       if (num_devices < 3)
-               flags &= ~BTRFS_BLOCK_GROUP_RAID6;
-       if (num_devices < 4)
-               flags &= ~BTRFS_BLOCK_GROUP_RAID10;
-
-       tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
-                      BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
-                      BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
-       flags &= ~tmp;
-
-       if (tmp & BTRFS_BLOCK_GROUP_RAID6)
-               tmp = BTRFS_BLOCK_GROUP_RAID6;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
-               tmp = BTRFS_BLOCK_GROUP_RAID5;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
-               tmp = BTRFS_BLOCK_GROUP_RAID10;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
-               tmp = BTRFS_BLOCK_GROUP_RAID1;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
-               tmp = BTRFS_BLOCK_GROUP_RAID0;
-
-       return extended_to_chunk(flags | tmp);
+       for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+               if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+                       allowed |= btrfs_raid_group[raid_type];
+       }
+       allowed &= flags;
+
+       if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+               allowed = BTRFS_BLOCK_GROUP_RAID6;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+               allowed = BTRFS_BLOCK_GROUP_RAID5;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+               allowed = BTRFS_BLOCK_GROUP_RAID10;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+               allowed = BTRFS_BLOCK_GROUP_RAID1;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+               allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+       flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       return extended_to_chunk(flags | allowed);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
@@ -3903,11 +3994,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
        return ret;
 }
 
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4006,7 +4093,8 @@ commit_trans:
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
                        if (have_pinned_space >= 0 ||
-                           trans->transaction->have_free_bgs ||
+                           test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+                                    &trans->transaction->flags) ||
                            need_commit > 0) {
                                ret = btrfs_commit_transaction(trans, root);
                                if (ret)
@@ -4028,38 +4116,86 @@ commit_trans:
                                              data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
-       ret = btrfs_qgroup_reserve(root, write_bytes);
-       if (ret)
-               goto out;
        data_sinfo->bytes_may_use += bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
                                      data_sinfo->flags, bytes, 1);
-out:
        spin_unlock(&data_sinfo->lock);
 
        return ret;
 }
 
 /*
- * Called if we need to clear a data reservation for this inode.
+ * New check_data_free_space() with the ability to do precise data reservation.
+ * It will replace the old btrfs_check_data_free_space(); to keep the patch split
+ * manageable, the new function is added first and the old one replaced later.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+
+       /* align the range */
+       len = round_up(start + len, root->sectorsize) -
+             round_down(start, root->sectorsize);
+       start = round_down(start, root->sectorsize);
+
+       ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * Use the new btrfs_qgroup_reserve_data() to reserve precise data space
+        *
+        * TODO: Find a good way to avoid reserving data space for NOCOW
+        * ranges without hurting performance when quotas are disabled.
+        */
+       ret = btrfs_qgroup_reserve_data(inode, start, len);
+       return ret;
+}
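
The new btrfs_check_data_free_space() above first widens the requested byte range outward to sector boundaries before reserving data and qgroup space. A standalone sketch of that alignment, assuming a 4096-byte sector size and local round helpers in place of the kernel's round_up()/round_down():

#include <stdio.h>

#define SECTORSIZE 4096ULL

static unsigned long long round_down_u64(unsigned long long x, unsigned long long a)
{
    return x - (x % a);
}

static unsigned long long round_up_u64(unsigned long long x, unsigned long long a)
{
    return round_down_u64(x + a - 1, a);
}

int main(void)
{
    unsigned long long start = 5000, len = 100;
    unsigned long long aligned_start, aligned_len;

    aligned_len = round_up_u64(start + len, SECTORSIZE) -
                  round_down_u64(start, SECTORSIZE);
    aligned_start = round_down_u64(start, SECTORSIZE);

    /* [5000, 5100) grows to the covering sector range [4096, 8192). */
    printf("start %llu len %llu\n", aligned_start, aligned_len);
    return 0;
}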
+
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one will *NOT* use the accurate qgroup reserved space API; it is only
+ * for callers that cannot sleep and are sure the qgroup reserved space is
+ * not affected, e.g. clear_bit_hook().
  */
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+                                           u64 len)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_space_info *data_sinfo;
 
-       /* make sure bytes are sectorsize aligned */
-       bytes = ALIGN(bytes, root->sectorsize);
+       /* Make sure the range is aligned to sectorsize */
+       len = round_up(start + len, root->sectorsize) -
+             round_down(start, root->sectorsize);
+       start = round_down(start, root->sectorsize);
 
        data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
-       WARN_ON(data_sinfo->bytes_may_use < bytes);
-       data_sinfo->bytes_may_use -= bytes;
+       if (WARN_ON(data_sinfo->bytes_may_use < len))
+               data_sinfo->bytes_may_use = 0;
+       else
+               data_sinfo->bytes_may_use -= len;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
-                                     data_sinfo->flags, bytes, 0);
+                                     data_sinfo->flags, len, 0);
        spin_unlock(&data_sinfo->lock);
 }
 
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one will handle the per-inode data rsv map for the accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+       btrfs_free_reserved_data_space_noquota(inode, start, len);
+       btrfs_qgroup_free_data(inode, start, len);
+}
+
 static void force_metadata_allocation(struct btrfs_fs_info *info)
 {
        struct list_head *head = &info->space_info;
@@ -4891,13 +5027,9 @@ static struct btrfs_block_rsv *get_block_rsv(
 {
        struct btrfs_block_rsv *block_rsv = NULL;
 
-       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
-               block_rsv = trans->block_rsv;
-
-       if (root == root->fs_info->csum_root && trans->adding_csums)
-               block_rsv = trans->block_rsv;
-
-       if (root == root->fs_info->uuid_root)
+       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+           (root == root->fs_info->csum_root && trans->adding_csums) ||
+            (root == root->fs_info->uuid_root))
                block_rsv = trans->block_rsv;
 
        if (!block_rsv)
@@ -5340,7 +5472,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (root->fs_info->quota_enabled) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * root->nodesize;
-               ret = btrfs_qgroup_reserve(root, num_bytes);
+               ret = btrfs_qgroup_reserve_meta(root, num_bytes);
                if (ret)
                        return ret;
        } else {
@@ -5358,10 +5490,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (ret == -ENOSPC && use_global_rsv)
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
 
-       if (ret) {
-               if (*qgroup_reserved)
-                       btrfs_qgroup_free(root, *qgroup_reserved);
-       }
+       if (ret && *qgroup_reserved)
+               btrfs_qgroup_free_meta(root, *qgroup_reserved);
 
        return ret;
 }
@@ -5522,15 +5652,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        spin_unlock(&BTRFS_I(inode)->lock);
 
        if (root->fs_info->quota_enabled) {
-               ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+               ret = btrfs_qgroup_reserve_meta(root,
+                               nr_extents * root->nodesize);
                if (ret)
                        goto out_fail;
        }
 
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (unlikely(ret)) {
-               if (root->fs_info->quota_enabled)
-                       btrfs_qgroup_free(root, nr_extents * root->nodesize);
+               btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
                goto out_fail;
        }
 
@@ -5653,41 +5783,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 }
 
 /**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
  * @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
+ * @start: start of the range we are writing to
+ * @len: length of the range we are writing to
+ *
+ * TODO: This function will eventually replace the old btrfs_delalloc_reserve_space()
  *
  * This will do the following things
  *
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
+ * o reserve space in the data space info for the number of bytes
+ *   and reserve the corresponding precise qgroup space
+ *   (Done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
  *   extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
+ *   also reserve metadata space in a per-root over-reserve method.
+ * o add to the inode's ->delalloc_bytes
  * o add it to the fs_info's delalloc inodes list.
+ *   (Above 3 all done in delalloc_reserve_metadata)
  *
- * This will return 0 for success and -ENOSPC if there is no space left.
+ * Return 0 for success
+ * Return <0 for error (-ENOSPC or -EDQUOT)
  */
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
 {
        int ret;
 
-       ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
-       if (ret)
-               return ret;
-
-       ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
-       if (ret) {
-               btrfs_free_reserved_data_space(inode, num_bytes);
+       ret = btrfs_check_data_free_space(inode, start, len);
+       if (ret < 0)
                return ret;
-       }
-
-       return 0;
+       ret = btrfs_delalloc_reserve_metadata(inode, len);
+       if (ret < 0)
+               btrfs_free_reserved_data_space(inode, start, len);
+       return ret;
 }
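
btrfs_delalloc_reserve_space() now takes a byte range and keeps the two reservations balanced: data (plus qgroup) space is reserved first, and if the subsequent metadata reservation fails, the data reservation is released again. A standalone sketch of that reserve-then-roll-back ordering, with stand-in reserve functions and a forced metadata failure:

#include <stdio.h>
#include <errno.h>

static int reserve_data(unsigned long long start, unsigned long long len)
{
    printf("reserve data  [%llu, +%llu)\n", start, len);
    return 0;
}

static void free_data(unsigned long long start, unsigned long long len)
{
    printf("release data  [%llu, +%llu)\n", start, len);
}

static int reserve_metadata(unsigned long long len)
{
    /* Pretend the metadata reservation hits ENOSPC. */
    (void)len;
    return -ENOSPC;
}

static int delalloc_reserve_space(unsigned long long start, unsigned long long len)
{
    int ret = reserve_data(start, len);

    if (ret < 0)
        return ret;
    ret = reserve_metadata(len);
    if (ret < 0)
        free_data(start, len);       /* roll back on failure */
    return ret;
}

int main(void)
{
    printf("ret = %d\n", delalloc_reserve_space(0, 4096));
    return 0;
}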
 
 /**
  * btrfs_delalloc_release_space - release data and metadata space for delalloc
  * @inode: inode we're releasing space for
- * @num_bytes: the number of bytes we want to free up
+ * @start: start position of the space already reserved
+ * @len: the len of the space already reserved
  *
  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
  * called in the case that we don't need the metadata AND data reservations
@@ -5696,11 +5833,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
  * This function will release the metadata space that was not used and will
  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
  * list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
  */
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
 {
-       btrfs_delalloc_release_metadata(inode, num_bytes);
-       btrfs_free_reserved_data_space(inode, num_bytes);
+       btrfs_delalloc_release_metadata(inode, len);
+       btrfs_free_reserved_data_space(inode, start, len);
 }
 
 static int update_block_group(struct btrfs_trans_handle *trans,
@@ -6065,6 +6203,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
        update_global_block_rsv(fs_info);
 }
 
+/*
+ * Returns the free cluster for the given space info and sets empty_cluster to
+ * what it should be based on the mount options.
+ */
+static struct btrfs_free_cluster *
+fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+                  u64 *empty_cluster)
+{
+       struct btrfs_free_cluster *ret = NULL;
+       bool ssd = btrfs_test_opt(root, SSD);
+
+       *empty_cluster = 0;
+       if (btrfs_mixed_space_info(space_info))
+               return ret;
+
+       if (ssd)
+               *empty_cluster = 2 * 1024 * 1024;
+       if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+               ret = &root->fs_info->meta_alloc_cluster;
+               if (!ssd)
+                       *empty_cluster = 64 * 1024;
+       } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+               ret = &root->fs_info->data_alloc_cluster;
+       }
+
+       return ret;
+}
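
fetch_cluster_info() above centralizes the policy that used to live inline in find_free_extent(): no cluster for mixed block groups, always a metadata cluster (64K empty size on rotational media, 2M with -o ssd), and a data cluster only with -o ssd. A standalone sketch of that selection, with illustrative flag values and strings standing in for the kernel's cluster pointers:

#include <stdio.h>

#define GROUP_DATA     (1 << 0)
#define GROUP_METADATA (1 << 1)
#define GROUP_MIXED    (GROUP_DATA | GROUP_METADATA)

static const char *pick_cluster(unsigned flags, int ssd,
                                unsigned long long *empty_cluster)
{
    *empty_cluster = 0;
    if ((flags & GROUP_MIXED) == GROUP_MIXED)
        return NULL;                         /* mixed block groups: no cluster */

    if (ssd)
        *empty_cluster = 2ULL * 1024 * 1024;
    if (flags & GROUP_METADATA) {
        if (!ssd)
            *empty_cluster = 64 * 1024;
        return "meta_alloc_cluster";
    }
    if ((flags & GROUP_DATA) && ssd)
        return "data_alloc_cluster";
    return NULL;
}

int main(void)
{
    unsigned long long empty;
    const char *c;

    c = pick_cluster(GROUP_METADATA, 0, &empty);
    printf("%s empty=%llu\n", c ? c : "none", empty);    /* meta, 65536 */
    c = pick_cluster(GROUP_DATA, 0, &empty);
    printf("%s empty=%llu\n", c ? c : "none", empty);    /* none, 0 */
    return 0;
}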
+
 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                              const bool return_free_space)
 {
@@ -6072,7 +6238,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
        struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_space_info *space_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+       struct btrfs_free_cluster *cluster = NULL;
        u64 len;
+       u64 total_unpinned = 0;
+       u64 empty_cluster = 0;
        bool readonly;
 
        while (start <= end) {
@@ -6081,8 +6250,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
                                btrfs_put_block_group(cache);
+                       total_unpinned = 0;
                        cache = btrfs_lookup_block_group(fs_info, start);
                        BUG_ON(!cache); /* Logic error */
+
+                       cluster = fetch_cluster_info(root,
+                                                    cache->space_info,
+                                                    &empty_cluster);
+                       empty_cluster <<= 1;
                }
 
                len = cache->key.objectid + cache->key.offset - start;
@@ -6095,12 +6270,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                }
 
                start += len;
+               total_unpinned += len;
                space_info = cache->space_info;
 
+               /*
+                * If this space cluster has been marked as fragmented and we've
+                * unpinned enough in this block group to potentially allow a
+                * cluster to be created inside of it, go ahead and clear the
+                * fragmented flag.
+                */
+               if (cluster && cluster->fragmented &&
+                   total_unpinned > empty_cluster) {
+                       spin_lock(&cluster->lock);
+                       cluster->fragmented = 0;
+                       spin_unlock(&cluster->lock);
+               }
+
                spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
                space_info->bytes_pinned -= len;
+               space_info->max_extent_size = 0;
                percpu_counter_add(&space_info->total_bytes_pinned, -len);
                if (cache->ro) {
                        space_info->bytes_readonly += len;
@@ -6233,7 +6423,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        int extent_slot = 0;
        int found_extent = 0;
        int num_to_del = 1;
-       int no_quota = node->no_quota;
        u32 item_size;
        u64 refs;
        u64 bytenr = node->bytenr;
@@ -6242,9 +6431,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                 SKINNY_METADATA);
 
-       if (!info->quota_enabled || !is_fstree(root_objectid))
-               no_quota = 1;
-
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -6570,7 +6756,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                                        buf->start, buf->len,
                                        parent, root->root_key.objectid,
                                        btrfs_header_level(buf),
-                                       BTRFS_DROP_DELAYED_REF, NULL, 0);
+                                       BTRFS_DROP_DELAYED_REF, NULL);
                BUG_ON(ret); /* -ENOMEM */
        }
 
@@ -6618,7 +6804,7 @@ out:
 /* Can return -ENOMEM */
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-                     u64 owner, u64 offset, int no_quota)
+                     u64 owner, u64 offset)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6641,13 +6827,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_DROP_DELAYED_REF, NULL, no_quota);
+                                       BTRFS_DROP_DELAYED_REF, NULL);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
                                                num_bytes,
                                                parent, root_objectid, owner,
-                                               offset, BTRFS_DROP_DELAYED_REF,
-                                               NULL, no_quota);
+                                               offset, 0,
+                                               BTRFS_DROP_DELAYED_REF, NULL);
        }
        return ret;
 }
@@ -6833,7 +7019,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        struct btrfs_block_group_cache *block_group = NULL;
        u64 search_start = 0;
        u64 max_extent_size = 0;
-       int empty_cluster = 2 * 1024 * 1024;
+       u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = __get_raid_index(flags);
@@ -6843,6 +7029,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        bool failed_alloc = false;
        bool use_cluster = true;
        bool have_caching_bg = false;
+       bool orig_have_caching_bg = false;
+       bool full_search = false;
 
        WARN_ON(num_bytes < root->sectorsize);
        ins->type = BTRFS_EXTENT_ITEM_KEY;
@@ -6858,36 +7046,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        }
 
        /*
-        * If the space info is for both data and metadata it means we have a
-        * small filesystem and we can't use the clustering stuff.
+        * If our free space is heavily fragmented we may not be able to make
+        * big contiguous allocations, so instead of doing the expensive search
+        * for free space, simply return ENOSPC with our max_extent_size so we
+        * can go ahead and search for a more manageable chunk.
+        *
+        * If our max_extent_size is large enough for our allocation simply
+        * disable clustering since we will likely not be able to find enough
+        * space to create a cluster and induce latency trying.
         */
-       if (btrfs_mixed_space_info(space_info))
-               use_cluster = false;
-
-       if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
-               last_ptr = &root->fs_info->meta_alloc_cluster;
-               if (!btrfs_test_opt(root, SSD))
-                       empty_cluster = 64 * 1024;
-       }
-
-       if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
-           btrfs_test_opt(root, SSD)) {
-               last_ptr = &root->fs_info->data_alloc_cluster;
+       if (unlikely(space_info->max_extent_size)) {
+               spin_lock(&space_info->lock);
+               if (space_info->max_extent_size &&
+                   num_bytes > space_info->max_extent_size) {
+                       ins->offset = space_info->max_extent_size;
+                       spin_unlock(&space_info->lock);
+                       return -ENOSPC;
+               } else if (space_info->max_extent_size) {
+                       use_cluster = false;
+               }
+               spin_unlock(&space_info->lock);
        }
 
+       last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
        if (last_ptr) {
                spin_lock(&last_ptr->lock);
                if (last_ptr->block_group)
                        hint_byte = last_ptr->window_start;
+               if (last_ptr->fragmented) {
+                       /*
+                        * We still set window_start so we can keep track of the
+                        * last place we found an allocation to try and save
+                        * some time.
+                        */
+                       hint_byte = last_ptr->window_start;
+                       use_cluster = false;
+               }
                spin_unlock(&last_ptr->lock);
        }
 
        search_start = max(search_start, first_logical_byte(root, 0));
        search_start = max(search_start, hint_byte);
-
-       if (!last_ptr)
-               empty_cluster = 0;
-
        if (search_start == hint_byte) {
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
@@ -6922,6 +7121,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        }
 search:
        have_caching_bg = false;
+       if (index == 0 || index == __get_raid_index(flags))
+               full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
                            list) {
@@ -6955,6 +7156,7 @@ search:
 have_block_group:
                cached = block_group_cache_done(block_group);
                if (unlikely(!cached)) {
+                       have_caching_bg = true;
                        ret = cache_block_group(block_group, 0);
                        BUG_ON(ret < 0);
                        ret = 0;
@@ -6969,7 +7171,7 @@ have_block_group:
                 * Ok we want to try and use the cluster allocator, so
                 * lets look there
                 */
-               if (last_ptr) {
+               if (last_ptr && use_cluster) {
                        struct btrfs_block_group_cache *used_block_group;
                        unsigned long aligned_cluster;
                        /*
@@ -7095,6 +7297,16 @@ refill_cluster:
                }
 
 unclustered_alloc:
+               /*
+                * We are doing an unclustered alloc, set the fragmented flag so
+                * we don't bother trying to set up a cluster again until we get
+                * more space.
+                */
+               if (unlikely(last_ptr)) {
+                       spin_lock(&last_ptr->lock);
+                       last_ptr->fragmented = 1;
+                       spin_unlock(&last_ptr->lock);
+               }
                spin_lock(&block_group->free_space_ctl->tree_lock);
                if (cached &&
                    block_group->free_space_ctl->free_space <
@@ -7127,8 +7339,6 @@ unclustered_alloc:
                        failed_alloc = true;
                        goto have_block_group;
                } else if (!offset) {
-                       if (!cached)
-                               have_caching_bg = true;
                        goto loop;
                }
 checks:
@@ -7169,6 +7379,10 @@ loop:
        }
        up_read(&space_info->groups_sem);
 
+       if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
+               && !orig_have_caching_bg)
+               orig_have_caching_bg = true;
+
        if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
                goto search;
 
@@ -7185,7 +7399,20 @@ loop:
         */
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
                index = 0;
-               loop++;
+               if (loop == LOOP_CACHING_NOWAIT) {
+                       /*
+                        * We want to skip the LOOP_CACHING_WAIT step if we
+                        * don't have any uncached bgs and we've already done a
+                        * full search through.
+                        */
+                       if (orig_have_caching_bg || !full_search)
+                               loop = LOOP_CACHING_WAIT;
+                       else
+                               loop = LOOP_ALLOC_CHUNK;
+               } else {
+                       loop++;
+               }
+
                if (loop == LOOP_ALLOC_CHUNK) {
                        struct btrfs_trans_handle *trans;
                        int exist = 0;
@@ -7203,6 +7430,15 @@ loop:
 
                        ret = do_chunk_alloc(trans, root, flags,
                                             CHUNK_ALLOC_FORCE);
+
+                       /*
+                        * If we can't allocate a new chunk, we've already looped
+                        * through at least once, so move on to the NO_EMPTY_SIZE
+                        * case.
+                        */
+                       if (ret == -ENOSPC)
+                               loop = LOOP_NO_EMPTY_SIZE;
+
                        /*
                         * Do not bail out on ENOSPC since we
                         * can do more things.
@@ -7219,6 +7455,15 @@ loop:
                }
 
                if (loop == LOOP_NO_EMPTY_SIZE) {
+                       /*
+                        * Don't loop again if we already have no empty_size and
+                        * no empty_cluster.
+                        */
+                       if (empty_size == 0 &&
+                           empty_cluster == 0) {
+                               ret = -ENOSPC;
+                               goto out;
+                       }
                        empty_size = 0;
                        empty_cluster = 0;
                }
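
The loop handling above lets the allocator skip LOOP_CACHING_WAIT when nothing was caching on the first pass and that pass was already a full search, jumping straight to LOOP_ALLOC_CHUNK; it also drops to LOOP_NO_EMPTY_SIZE as soon as chunk allocation returns ENOSPC. A standalone sketch of just the NOWAIT transition, with a simplified enum and flags:

#include <stdio.h>

enum loop_state {
    LOOP_CACHING_NOWAIT,
    LOOP_CACHING_WAIT,
    LOOP_ALLOC_CHUNK,
    LOOP_NO_EMPTY_SIZE,
};

static enum loop_state next_loop(enum loop_state loop,
                                 int have_caching_bg, int full_search)
{
    if (loop == LOOP_CACHING_NOWAIT) {
        if (have_caching_bg || !full_search)
            return LOOP_CACHING_WAIT;
        return LOOP_ALLOC_CHUNK;          /* nothing left to wait for */
    }
    return loop + 1;
}

int main(void)
{
    /* All block groups cached and a full search already done:
     * skip the wait step entirely. */
    printf("%d\n", next_loop(LOOP_CACHING_NOWAIT, 0, 1));  /* 2 = ALLOC_CHUNK  */
    printf("%d\n", next_loop(LOOP_CACHING_NOWAIT, 1, 1));  /* 1 = CACHING_WAIT */
    return 0;
}
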
@@ -7227,11 +7472,20 @@ loop:
        } else if (!ins->objectid) {
                ret = -ENOSPC;
        } else if (ins->objectid) {
+               if (!use_cluster && last_ptr) {
+                       spin_lock(&last_ptr->lock);
+                       last_ptr->window_start = ins->objectid;
+                       spin_unlock(&last_ptr->lock);
+               }
                ret = 0;
        }
 out:
-       if (ret == -ENOSPC)
+       if (ret == -ENOSPC) {
+               spin_lock(&space_info->lock);
+               space_info->max_extent_size = max_extent_size;
+               spin_unlock(&space_info->lock);
                ins->offset = max_extent_size;
+       }
        return ret;
 }
 
@@ -7280,7 +7534,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
                         u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc)
 {
-       bool final_tried = false;
+       bool final_tried = num_bytes == min_alloc_size;
        u64 flags;
        int ret;
 
@@ -7429,8 +7683,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
-                                    int level, struct btrfs_key *ins,
-                                    int no_quota)
+                                    int level, struct btrfs_key *ins)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -7511,7 +7764,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 root_objectid, u64 owner,
-                                    u64 offset, struct btrfs_key *ins)
+                                    u64 offset, u64 ram_bytes,
+                                    struct btrfs_key *ins)
 {
        int ret;
 
@@ -7520,7 +7774,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
                                         ins->offset, 0,
                                         root_objectid, owner, offset,
-                                        BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
+                                        ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
+                                        NULL);
        return ret;
 }
 
@@ -7734,7 +7989,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                                                 ins.objectid, ins.offset,
                                                 parent, root_objectid, level,
                                                 BTRFS_ADD_DELAYED_EXTENT,
-                                                extent_op, 0);
+                                                extent_op);
                if (ret)
                        goto out_free_delayed;
        }
@@ -8275,14 +8530,15 @@ skip:
                        ret = account_shared_subtree(trans, root, next,
                                                     generation, level - 1);
                        if (ret) {
-                               printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+                               btrfs_err_rl(root->fs_info,
+                                       "Error "
                                        "%d accounting shared subtree. Quota "
-                                       "is out of sync, rescan required.\n",
-                                       root->fs_info->sb->s_id, ret);
+                                       "is out of sync, rescan required.",
+                                       ret);
                        }
                }
                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-                               root->root_key.objectid, level - 1, 0, 0);
+                               root->root_key.objectid, level - 1, 0);
                BUG_ON(ret); /* -ENOMEM */
        }
        btrfs_tree_unlock(next);
@@ -8367,10 +8623,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                        BUG_ON(ret); /* -ENOMEM */
                        ret = account_leaf_items(trans, root, eb);
                        if (ret) {
-                               printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+                               btrfs_err_rl(root->fs_info,
+                                       "error "
                                        "%d accounting leaf items. Quota "
-                                       "is out of sync, rescan required.\n",
-                                       root->fs_info->sb->s_id, ret);
+                                       "is out of sync, rescan required.",
+                                       ret);
                        }
                }
                /* make block locked assertion in clean_tree_block happy */
@@ -8692,7 +8949,7 @@ out:
        if (!for_reloc && root_dropped == false)
                btrfs_add_dead_root(root);
        if (err && err != -EAGAIN)
-               btrfs_std_error(root->fs_info, err);
+               btrfs_std_error(root->fs_info, err, NULL);
        return err;
 }
 
@@ -8880,7 +9137,7 @@ again:
         * back off and let this transaction commit
         */
        mutex_lock(&root->fs_info->ro_block_group_mutex);
-       if (trans->transaction->dirty_bg_run) {
+       if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
                u64 transid = trans->transid;
 
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@ -9630,6 +9887,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
        free_excluded_extents(root, cache);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(root, cache)) {
+               u64 new_bytes_used = size - bytes_used;
+
+               bytes_used += new_bytes_used >> 1;
+               fragment_free_space(root, cache);
+       }
+#endif
        /*
         * Call to ensure the corresponding space_info object is created and
         * assigned to our block group, but don't update its counters just yet.
@@ -10370,8 +10635,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 {
        percpu_counter_dec(&root->subv_writers->counter);
        /*
-        * Make sure counter is updated before we wake up
-        * waiters.
+        * Make sure counter is updated before we wake up waiters.
         */
        smp_mb();
        if (waitqueue_active(&root->subv_writers->wait))
index 3915c9473e9445d4aeada81c8fb96af7fb521f2c..33a01ea414651d217349940ca6c0e2ed7bea6f4c 100644 (file)
@@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
        inode = tree->mapping->host;
        isize = i_size_read(inode);
        if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
-               printk_ratelimited(KERN_DEBUG
-                   "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+               btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+                   "%s: ino %llu isize %llu odd range [%llu,%llu]",
                                caller, btrfs_ino(inode), isize, start, end);
        }
 }
@@ -131,6 +131,25 @@ struct extent_page_data {
        unsigned int sync_io:1;
 };
 
+static void add_extent_changeset(struct extent_state *state, unsigned bits,
+                                struct extent_changeset *changeset,
+                                int set)
+{
+       int ret;
+
+       if (!changeset)
+               return;
+       if (set && (state->state & bits) == bits)
+               return;
+       if (!set && (state->state & bits) == 0)
+               return;
+       changeset->bytes_changed += state->end - state->start + 1;
+       ret = ulist_add(changeset->range_changed, state->start, state->end,
+                       GFP_ATOMIC);
+       /* ENOMEM */
+       BUG_ON(ret < 0);
+}
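The helper above only charges a range to the changeset when the operation actually flips at least one of the requested bits. A minimal standalone sketch of that rule, using simplified stand-in types rather than the real extent_state:

/*
 * Standalone sketch (not kernel code): mirrors the accounting rule in
 * add_extent_changeset() -- a range only contributes to bytes_changed
 * when the set/clear actually changes at least one of the requested bits.
 * Types and names are simplified stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_state { uint64_t start, end; unsigned bits; };

static uint64_t account_change(const struct toy_state *s, unsigned bits, int set)
{
	if (set && (s->bits & bits) == bits)
		return 0;               /* all bits already set: no change */
	if (!set && (s->bits & bits) == 0)
		return 0;               /* none of the bits set: nothing to clear */
	return s->end - s->start + 1;   /* inclusive range length, as in btrfs */
}

int main(void)
{
	struct toy_state s = { .start = 0, .end = 4095, .bits = 0x1 };

	printf("%llu\n", (unsigned long long)account_change(&s, 0x1, 1)); /* 0    */
	printf("%llu\n", (unsigned long long)account_change(&s, 0x2, 1)); /* 4096 */
	return 0;
}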
+
 static noinline void flush_write_bio(void *data);
 static inline struct btrfs_fs_info *
 tree_fs_info(struct extent_io_tree *tree)
@@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree,
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
-                          struct extent_state *state, unsigned *bits);
+                          struct extent_state *state, unsigned *bits,
+                          struct extent_changeset *changeset);
 
 /*
  * insert an extent_state struct into the tree.  'bits' are set on the
@@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree,
                        struct extent_state *state, u64 start, u64 end,
                        struct rb_node ***p,
                        struct rb_node **parent,
-                       unsigned *bits)
+                       unsigned *bits, struct extent_changeset *changeset)
 {
        struct rb_node *node;
 
@@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree,
        state->start = start;
        state->end = end;
 
-       set_state_bits(tree, state, bits);
+       set_state_bits(tree, state, bits, changeset);
 
        node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
        if (node) {
@@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state)
  */
 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                                            struct extent_state *state,
-                                           unsigned *bits, int wake)
+                                           unsigned *bits, int wake,
+                                           struct extent_changeset *changeset)
 {
        struct extent_state *next;
        unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
@@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                tree->dirty_bytes -= range;
        }
        clear_state_cb(tree, state, bits);
+       add_extent_changeset(state, bits_to_clear, changeset, 0);
        state->state &= ~bits_to_clear;
        if (wake)
                wake_up(&state->wq);
@@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
  *
  * This takes the tree lock, and returns 0 on success and < 0 on error.
  */
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    unsigned bits, int wake, int delete,
-                    struct extent_state **cached_state,
-                    gfp_t mask)
+static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                             unsigned bits, int wake, int delete,
+                             struct extent_state **cached_state,
+                             gfp_t mask, struct extent_changeset *changeset)
 {
        struct extent_state *state;
        struct extent_state *cached;
@@ -671,7 +693,8 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       state = clear_state_bit(tree, state, &bits, wake);
+                       state = clear_state_bit(tree, state, &bits, wake,
+                                               changeset);
                        goto next;
                }
                goto search_again;
@@ -692,13 +715,13 @@ hit_next:
                if (wake)
                        wake_up(&state->wq);
 
-               clear_state_bit(tree, prealloc, &bits, wake);
+               clear_state_bit(tree, prealloc, &bits, wake, changeset);
 
                prealloc = NULL;
                goto out;
        }
 
-       state = clear_state_bit(tree, state, &bits, wake);
+       state = clear_state_bit(tree, state, &bits, wake, changeset);
 next:
        if (last_end == (u64)-1)
                goto out;
@@ -789,7 +812,7 @@ out:
 
 static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state,
-                          unsigned *bits)
+                          unsigned *bits, struct extent_changeset *changeset)
 {
        unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
 
@@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree,
                u64 range = state->end - state->start + 1;
                tree->dirty_bytes += range;
        }
+       add_extent_changeset(state, bits_to_set, changeset, 1);
        state->state |= bits_to_set;
 }
 
@@ -835,7 +859,7 @@ static int __must_check
 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                 unsigned bits, unsigned exclusive_bits,
                 u64 *failed_start, struct extent_state **cached_state,
-                gfp_t mask)
+                gfp_t mask, struct extent_changeset *changeset)
 {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
@@ -873,7 +897,7 @@ again:
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = insert_state(tree, prealloc, start, end,
-                                  &p, &parent, &bits);
+                                  &p, &parent, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);
 
@@ -899,7 +923,7 @@ hit_next:
                        goto out;
                }
 
-               set_state_bits(tree, state, &bits);
+               set_state_bits(tree, state, &bits, changeset);
                cache_state(state, cached_state);
                merge_state(tree, state);
                if (last_end == (u64)-1)
@@ -945,7 +969,7 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       set_state_bits(tree, state, &bits);
+                       set_state_bits(tree, state, &bits, changeset);
                        cache_state(state, cached_state);
                        merge_state(tree, state);
                        if (last_end == (u64)-1)
@@ -980,7 +1004,7 @@ hit_next:
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
-                                  NULL, NULL, &bits);
+                                  NULL, NULL, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);
 
@@ -1008,7 +1032,7 @@ hit_next:
                if (err)
                        extent_io_tree_panic(tree, err);
 
-               set_state_bits(tree, prealloc, &bits);
+               set_state_bits(tree, prealloc, &bits, changeset);
                cache_state(prealloc, cached_state);
                merge_state(tree, prealloc);
                prealloc = NULL;
@@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   struct extent_state **cached_state, gfp_t mask)
 {
        return __set_extent_bit(tree, start, end, bits, 0, failed_start,
-                               cached_state, mask);
+                               cached_state, mask, NULL);
 }
 
 
@@ -1111,7 +1135,7 @@ again:
                        goto out;
                }
                err = insert_state(tree, prealloc, start, end,
-                                  &p, &parent, &bits);
+                                  &p, &parent, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
@@ -1130,9 +1154,9 @@ hit_next:
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
-               set_state_bits(tree, state, &bits);
+               set_state_bits(tree, state, &bits, NULL);
                cache_state(state, cached_state);
-               state = clear_state_bit(tree, state, &clear_bits, 0);
+               state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
                if (last_end == (u64)-1)
                        goto out;
                start = last_end + 1;
@@ -1171,9 +1195,10 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       set_state_bits(tree, state, &bits);
+                       set_state_bits(tree, state, &bits, NULL);
                        cache_state(state, cached_state);
-                       state = clear_state_bit(tree, state, &clear_bits, 0);
+                       state = clear_state_bit(tree, state, &clear_bits, 0,
+                                               NULL);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
@@ -1208,7 +1233,7 @@ hit_next:
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
-                                  NULL, NULL, &bits);
+                                  NULL, NULL, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
@@ -1233,9 +1258,9 @@ hit_next:
                if (err)
                        extent_io_tree_panic(tree, err);
 
-               set_state_bits(tree, prealloc, &bits);
+               set_state_bits(tree, prealloc, &bits, NULL);
                cache_state(prealloc, cached_state);
-               clear_state_bit(tree, prealloc, &clear_bits, 0);
+               clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
                prealloc = NULL;
                goto out;
        }
@@ -1274,6 +1299,30 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                              NULL, mask);
 }
 
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                          unsigned bits, gfp_t mask,
+                          struct extent_changeset *changeset)
+{
+       /*
+        * We don't support EXTENT_LOCKED yet, as the current changeset will
+        * record any bit changed, so in the EXTENT_LOCKED case it would
+        * either fail with -EEXIST or the changeset would record the whole
+        * range.
+        */
+       BUG_ON(bits & EXTENT_LOCKED);
+
+       return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+                               changeset);
+}
+
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                    unsigned bits, int wake, int delete,
+                    struct extent_state **cached, gfp_t mask)
+{
+       return __clear_extent_bit(tree, start, end, bits, wake, delete,
+                                 cached, mask, NULL);
+}
+
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      unsigned bits, gfp_t mask)
 {
@@ -1285,6 +1334,20 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
        return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
 }
 
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                            unsigned bits, gfp_t mask,
+                            struct extent_changeset *changeset)
+{
+       /*
+        * The EXTENT_LOCKED case is not supported, for the same reason as in
+        * set_record_extent_bits().
+        */
+       BUG_ON(bits & EXTENT_LOCKED);
+
+       return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+                                 changeset);
+}
+
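The *_record_* variants above exist so a caller can learn exactly how many bytes actually changed state. A hedged, compile-time sketch of one plausible caller, assuming only the extent_changeset and set_record_extent_bits() signatures introduced in this series; qgroup_account_bytes() is a hypothetical stand-in, not a real btrfs function:

/*
 * Sketch only: reserve a byte range in the io_tree and account just the
 * bytes that were not already reserved.  Not the actual qgroup code.
 */
static int reserve_range_sketch(struct inode *inode, u64 start, u64 len)
{
	struct extent_changeset changeset;
	int ret;

	changeset.bytes_changed = 0;
	changeset.range_changed = ulist_alloc(GFP_NOFS);
	if (!changeset.range_changed)
		return -ENOMEM;

	ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
				     start + len - 1, EXTENT_QGROUP_RESERVED,
				     GFP_NOFS, &changeset);
	if (!ret)
		/* hypothetical helper: charge only the newly reserved bytes */
		ret = qgroup_account_bytes(inode, changeset.bytes_changed);

	ulist_free(changeset.range_changed);
	return ret;
}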
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask)
 {
@@ -1343,7 +1406,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
        while (1) {
                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
                                       EXTENT_LOCKED, &failed_start,
-                                      cached_state, GFP_NOFS);
+                                      cached_state, GFP_NOFS, NULL);
                if (err == -EEXIST) {
                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
                        start = failed_start;
@@ -1365,7 +1428,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
        u64 failed_start;
 
        err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
-                              &failed_start, NULL, GFP_NOFS);
+                              &failed_start, NULL, GFP_NOFS, NULL);
        if (err == -EEXIST) {
                if (failed_start > start)
                        clear_extent_bit(tree, start, failed_start - 1,
@@ -2078,8 +2141,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
                return -EIO;
        }
 
-       printk_ratelimited_in_rcu(KERN_INFO
-                                 "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+       btrfs_info_rl_in_rcu(fs_info,
+               "read error corrected: ino %llu off %llu (dev %s sector %llu)",
                                  btrfs_ino(inode), start,
                                  rcu_str_deref(dev->name), sector);
        bio_put(bio);
@@ -3070,8 +3133,12 @@ static int __do_readpage(struct extent_io_tree *tree,
 
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
-                       unlock_extent_cached(tree, cur, cur + iosize - 1,
-                                            &cached, GFP_NOFS);
+                       if (parent_locked)
+                               free_extent_state(cached);
+                       else
+                               unlock_extent_cached(tree, cur,
+                                                    cur + iosize - 1,
+                                                    &cached, GFP_NOFS);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@ -5566,13 +5633,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
        unsigned long src_i;
 
        if (src_offset + len > dst->len) {
-               printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
-                      "len %lu dst len %lu\n", src_offset, len, dst->len);
+               btrfs_err(dst->fs_info,
+                       "memmove bogus src_offset %lu move "
+                      "len %lu dst len %lu", src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
-               printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
-                      "len %lu dst len %lu\n", dst_offset, len, dst->len);
+               btrfs_err(dst->fs_info,
+                       "memmove bogus dst_offset %lu move "
+                      "len %lu dst len %lu", dst_offset, len, dst->len);
                BUG_ON(1);
        }
 
@@ -5612,13 +5681,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
        unsigned long src_i;
 
        if (src_offset + len > dst->len) {
-               printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
-                      "len %lu len %lu\n", src_offset, len, dst->len);
+               btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
+                      "len %lu len %lu", src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
-               printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
-                      "len %lu len %lu\n", dst_offset, len, dst->len);
+               btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
+                      "len %lu len %lu", dst_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset < src_offset) {
index c668f36898d3a4c2dd2e0e18456dd2cfa6c55747..f4c1ae11855f0b613894ea44026faf143021b613 100644 (file)
@@ -2,6 +2,7 @@
 #define __EXTENTIO__
 
 #include <linux/rbtree.h>
+#include "ulist.h"
 
 /* bits for the extent state */
 #define EXTENT_DIRTY           (1U << 0)
@@ -18,6 +19,7 @@
 #define EXTENT_NEED_WAIT       (1U << 13)
 #define EXTENT_DAMAGED         (1U << 14)
 #define EXTENT_NORESERVE       (1U << 15)
+#define EXTENT_QGROUP_RESERVED (1U << 16)
 #define EXTENT_IOBITS          (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS         (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
@@ -161,6 +163,17 @@ struct extent_buffer {
 #endif
 };
 
+/*
+ * Structure to record how many bytes and which ranges are set/cleared
+ */
+struct extent_changeset {
+       /* How many bytes are set/cleared in this operation */
+       u64 bytes_changed;
+
+       /* Changed ranges */
+       struct ulist *range_changed;
+};
+
 static inline void extent_set_compress_type(unsigned long *bio_flags,
                                            int compress_type)
 {
@@ -210,11 +223,17 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      unsigned bits, gfp_t mask);
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                            unsigned bits, gfp_t mask,
+                            struct extent_changeset *changeset);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                     unsigned bits, int wake, int delete,
                     struct extent_state **cached, gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    unsigned bits, gfp_t mask);
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                          unsigned bits, gfp_t mask,
+                          struct extent_changeset *changeset);
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   unsigned bits, u64 *failed_start,
                   struct extent_state **cached_state, gfp_t mask);
index 8c6f247ba81d4e84c6a7d3d00d0348c974c1b049..6bd5ce9d75f0ab8eaca5a2f808bfaf5a66132742 100644 (file)
@@ -847,7 +847,7 @@ next_slot:
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
-                                               start - extent_offset, 1);
+                                               start - extent_offset);
                                BUG_ON(ret); /* -ENOMEM */
                        }
                        key.offset = start;
@@ -925,7 +925,7 @@ delete_extent_item:
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                key.objectid, key.offset -
-                                               extent_offset, 0);
+                                               extent_offset);
                                BUG_ON(ret); /* -ENOMEM */
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
@@ -1204,7 +1204,7 @@ again:
 
                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                                           root->root_key.objectid,
-                                          ino, orig_offset, 1);
+                                          ino, orig_offset);
                BUG_ON(ret); /* -ENOMEM */
 
                if (split == start) {
@@ -1231,7 +1231,7 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                       ino, orig_offset, 0);
+                                       ino, orig_offset);
                BUG_ON(ret); /* -ENOMEM */
        }
        other_start = 0;
@@ -1248,7 +1248,7 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                       ino, orig_offset, 0);
+                                       ino, orig_offset);
                BUG_ON(ret); /* -ENOMEM */
        }
        if (del_nr == 0) {
@@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
        u64 release_bytes = 0;
        u64 lockstart;
        u64 lockend;
-       unsigned long first_index;
        size_t num_written = 0;
        int nrptrs;
        int ret = 0;
@@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
        if (!pages)
                return -ENOMEM;
 
-       first_index = pos >> PAGE_CACHE_SHIFT;
-
        while (iov_iter_count(i) > 0) {
                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
                size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                }
 
                reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-               ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
-               if (ret == -ENOSPC &&
-                   (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
-                                             BTRFS_INODE_PREALLOC))) {
+
+               if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+                                            BTRFS_INODE_PREALLOC)) {
                        ret = check_can_nocow(inode, pos, &write_bytes);
+                       if (ret < 0)
+                               break;
                        if (ret > 0) {
+                               /*
+                                * For the nodatacow case, there is no need
+                                * to reserve data space.
+                                */
                                only_release_metadata = true;
                                /*
                                 * our prealloc extent may be smaller than
@@ -1524,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                num_pages = DIV_ROUND_UP(write_bytes + offset,
                                                         PAGE_CACHE_SIZE);
                                reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-                               ret = 0;
-                       } else {
-                               ret = -ENOSPC;
+                               goto reserve_metadata;
                        }
                }
-
-               if (ret)
+               ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+               if (ret < 0)
                        break;
 
+reserve_metadata:
                ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
                if (ret) {
                        if (!only_release_metadata)
-                               btrfs_free_reserved_data_space(inode,
-                                                              reserve_bytes);
+                               btrfs_free_reserved_data_space(inode, pos,
+                                                              write_bytes);
                        else
                                btrfs_end_write_no_snapshoting(root);
                        break;
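The rework above splits the reservation decision: a NOCOW/prealloc write that can actually proceed without COW reserves only metadata (only_release_metadata = true), while everything else reserves data space first and metadata afterwards. A small standalone sketch of that decision, with hypothetical names:

/*
 * Standalone sketch (not kernel code) of the reservation choice made in
 * __btrfs_buffered_write() above.  All identifiers are stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

enum rsv { RSV_METADATA_ONLY, RSV_DATA_AND_METADATA };

static enum rsv pick_reservation(bool nocow_or_prealloc, bool can_nocow_now)
{
	if (nocow_or_prealloc && can_nocow_now)
		return RSV_METADATA_ONLY;      /* data blocks already exist */
	return RSV_DATA_AND_METADATA;          /* normal COW write */
}

int main(void)
{
	printf("%d\n", pick_reservation(true, true));   /* 0: metadata only */
	printf("%d\n", pick_reservation(true, false));  /* 1: data + metadata */
	return 0;
}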
@@ -1603,12 +1604,17 @@ again:
                                BTRFS_I(inode)->outstanding_extents++;
                                spin_unlock(&BTRFS_I(inode)->lock);
                        }
-                       if (only_release_metadata)
+                       if (only_release_metadata) {
                                btrfs_delalloc_release_metadata(inode,
                                                                release_bytes);
-                       else
-                               btrfs_delalloc_release_space(inode,
+                       } else {
+                               u64 __pos;
+
+                               __pos = round_down(pos, root->sectorsize) +
+                                       (dirty_pages << PAGE_CACHE_SHIFT);
+                               btrfs_delalloc_release_space(inode, __pos,
                                                             release_bytes);
+                       }
                }
 
                release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
@@ -1660,7 +1666,7 @@ again:
                        btrfs_end_write_no_snapshoting(root);
                        btrfs_delalloc_release_metadata(inode, release_bytes);
                } else {
-                       btrfs_delalloc_release_space(inode, release_bytes);
+                       btrfs_delalloc_release_space(inode, pos, release_bytes);
                }
        }
 
@@ -2266,7 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        u64 drop_end;
        int ret = 0;
        int err = 0;
-       int rsv_count;
+       unsigned int rsv_count;
        bool same_page;
        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
        u64 ino_size;
@@ -2487,6 +2493,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        }
 
        trans->block_rsv = &root->fs_info->trans_block_rsv;
+       /*
+        * If we are using the NO_HOLES feature we might already have had a
+        * hole that overlaps a part of the region [lockstart, lockend] and
+        * ends at (or beyond) lockend. Since we have no file extent items to
+        * represent holes, drop_end can be less than lockend and so we must
+        * make sure we have an extent map representing the existing hole (the
+        * call to __btrfs_drop_extents() might have dropped the existing extent
+        * map representing the existing hole), otherwise the fast fsync path
+        * will not record the existence of the hole region
+        * [existing_hole_start, lockend].
+        */
+       if (drop_end <= lockend)
+               drop_end = lockend + 1;
        /*
         * Don't insert file hole extent item if it's for a range beyond eof
         * (because it's useless) or if it represents a 0 bytes range (when
@@ -2541,17 +2560,61 @@ out_only_mutex:
        return err;
 }
 
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+       struct list_head list;
+       u64 start;
+       u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * The caller should have locked the larger extent range containing
+ * [start, len).
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+       struct falloc_range *prev = NULL;
+       struct falloc_range *range = NULL;
+
+       if (list_empty(head))
+               goto insert;
+
+       /*
+        * As fallocate iterates by bytenr order, we only need to check
+        * the last range.
+        */
+       prev = list_entry(head->prev, struct falloc_range, list);
+       if (prev->start + prev->len == start) {
+               prev->len += len;
+               return 0;
+       }
+insert:
+       range = kmalloc(sizeof(*range), GFP_NOFS);
+       if (!range)
+               return -ENOMEM;
+       range->start = start;
+       range->len = len;
+       list_add_tail(&range->list, head);
+       return 0;
+}
+
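Because fallocate walks the file in increasing offset order, add_falloc_range() only ever has to compare a new range against the last one it recorded, merging the two when they touch. A standalone sketch of that coalescing rule (simplified types, not kernel code):

/*
 * Standalone sketch of the merge rule used by add_falloc_range() above:
 * a new hole either extends the most recently recorded range or starts a
 * new entry.  Names and types are stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, len; };

/* Returns 1 if the new range was merged into *last, 0 if a new entry is needed. */
static int merge_or_append(struct range *last, int have_last,
			   uint64_t start, uint64_t len)
{
	if (have_last && last->start + last->len == start) {
		last->len += len;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct range last = { 0, 4096 };

	/* 4096..8191 touches the tail of the previous range, so it merges. */
	printf("merged=%d len=%llu\n",
	       merge_or_append(&last, 1, 4096, 4096),
	       (unsigned long long)last.len);          /* merged=1 len=8192 */
	return 0;
}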
 static long btrfs_fallocate(struct file *file, int mode,
                            loff_t offset, loff_t len)
 {
        struct inode *inode = file_inode(file);
        struct extent_state *cached_state = NULL;
+       struct falloc_range *range;
+       struct falloc_range *tmp;
+       struct list_head reserve_list;
        u64 cur_offset;
        u64 last_byte;
        u64 alloc_start;
        u64 alloc_end;
        u64 alloc_hint = 0;
        u64 locked_end;
+       u64 actual_end = 0;
        struct extent_map *em;
        int blocksize = BTRFS_I(inode)->root->sectorsize;
        int ret;
@@ -2567,11 +2630,12 @@ static long btrfs_fallocate(struct file *file, int mode,
                return btrfs_punch_hole(inode, offset, len);
 
        /*
-        * Make sure we have enough space before we do the
-        * allocation.
+        * Only trigger the disk allocation; don't trigger the qgroup reserve.
+        *
+        * Qgroup space will be checked later.
         */
-       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
-       if (ret)
+       ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+       if (ret < 0)
                return ret;
 
        mutex_lock(&inode->i_mutex);
@@ -2579,6 +2643,13 @@ static long btrfs_fallocate(struct file *file, int mode,
        if (ret)
                goto out;
 
+       /*
+        * TODO: Move these two operations after we have checked the
+        * accurate reserved space, or fallocate can still fail but
+        * leave the page truncated or the size expanded.
+        *
+        * That is a minor problem, though, and won't do much harm.
+        */
        if (alloc_start > inode->i_size) {
                ret = btrfs_cont_expand(inode, i_size_read(inode),
                                        alloc_start);
@@ -2637,10 +2708,10 @@ static long btrfs_fallocate(struct file *file, int mode,
                }
        }
 
+       /* First, check if we exceed the qgroup limit */
+       INIT_LIST_HEAD(&reserve_list);
        cur_offset = alloc_start;
        while (1) {
-               u64 actual_end;
-
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      alloc_end - cur_offset, 0);
                if (IS_ERR_OR_NULL(em)) {
@@ -2653,57 +2724,82 @@ static long btrfs_fallocate(struct file *file, int mode,
                last_byte = min(extent_map_end(em), alloc_end);
                actual_end = min_t(u64, extent_map_end(em), offset + len);
                last_byte = ALIGN(last_byte, blocksize);
-
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-                       ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-                                                       last_byte - cur_offset,
-                                                       1 << inode->i_blkbits,
-                                                       offset + len,
-                                                       &alloc_hint);
-               } else if (actual_end > inode->i_size &&
-                          !(mode & FALLOC_FL_KEEP_SIZE)) {
-                       struct btrfs_trans_handle *trans;
-                       struct btrfs_root *root = BTRFS_I(inode)->root;
-
-                       /*
-                        * We didn't need to allocate any more space, but we
-                        * still extended the size of the file so we need to
-                        * update i_size and the inode item.
-                        */
-                       trans = btrfs_start_transaction(root, 1);
-                       if (IS_ERR(trans)) {
-                               ret = PTR_ERR(trans);
-                       } else {
-                               inode->i_ctime = CURRENT_TIME;
-                               i_size_write(inode, actual_end);
-                               btrfs_ordered_update_i_size(inode, actual_end,
-                                                           NULL);
-                               ret = btrfs_update_inode(trans, root, inode);
-                               if (ret)
-                                       btrfs_end_transaction(trans, root);
-                               else
-                                       ret = btrfs_end_transaction(trans,
-                                                                   root);
+                       ret = add_falloc_range(&reserve_list, cur_offset,
+                                              last_byte - cur_offset);
+                       if (ret < 0) {
+                               free_extent_map(em);
+                               break;
                        }
+                       ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+                                       last_byte - cur_offset);
+                       if (ret < 0)
+                               break;
                }
                free_extent_map(em);
-               if (ret < 0)
-                       break;
-
                cur_offset = last_byte;
-               if (cur_offset >= alloc_end) {
-                       ret = 0;
+               if (cur_offset >= alloc_end)
                        break;
+       }
+
+       /*
+        * If ret is still 0, it means we're OK to fallocate.
+        * Otherwise just clean up the list and exit.
+        */
+       list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+               if (!ret)
+                       ret = btrfs_prealloc_file_range(inode, mode,
+                                       range->start,
+                                       range->len, 1 << inode->i_blkbits,
+                                       offset + len, &alloc_hint);
+               list_del(&range->list);
+               kfree(range);
+       }
+       if (ret < 0)
+               goto out_unlock;
+
+       if (actual_end > inode->i_size &&
+           !(mode & FALLOC_FL_KEEP_SIZE)) {
+               struct btrfs_trans_handle *trans;
+               struct btrfs_root *root = BTRFS_I(inode)->root;
+
+               /*
+                * We didn't need to allocate any more space, but we
+                * still extended the size of the file so we need to
+                * update i_size and the inode item.
+                */
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+               } else {
+                       inode->i_ctime = CURRENT_TIME;
+                       i_size_write(inode, actual_end);
+                       btrfs_ordered_update_i_size(inode, actual_end, NULL);
+                       ret = btrfs_update_inode(trans, root, inode);
+                       if (ret)
+                               btrfs_end_transaction(trans, root);
+                       else
+                               ret = btrfs_end_transaction(trans, root);
                }
        }
+out_unlock:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                             &cached_state, GFP_NOFS);
 out:
+       /*
+        * As we have waited on the extent range, the data_rsv_map must be
+        * empty in the range, since any written data range will have been
+        * released from it. A preallocated extent will also be released
+        * when its metadata is written.
+        * So this call is purely cleanup.
+        */
+       btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
        mutex_unlock(&inode->i_mutex);
        /* Let go of our reservation. */
-       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+       btrfs_free_reserved_data_space(inode, alloc_start,
+                                      alloc_end - alloc_start);
        return ret;
 }
 
index abe3a66bd3ba6d31f9fb072c6d4d6c6be42c174e..0948d34cb84a5f24cb2221f54b1fe5a426f451eb 100644 (file)
@@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 
        gen = io_ctl->cur;
        if (le64_to_cpu(*gen) != generation) {
-               printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
-                                  "(%Lu) does not match inode (%Lu)\n", *gen,
-                                  generation);
+               btrfs_err_rl(io_ctl->root->fs_info,
+                       "space cache generation (%llu) does not match inode (%llu)",
+                               *gen, generation);
                io_ctl_unmap_page(io_ctl);
                return -EIO;
        }
@@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
                              PAGE_CACHE_SIZE - offset);
        btrfs_csum_final(crc, (char *)&crc);
        if (val != crc) {
-               printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
-                                  "space cache\n");
+               btrfs_err_rl(io_ctl->root->fs_info,
+                       "csum mismatch on free space cache");
                io_ctl_unmap_page(io_ctl);
                return -EIO;
        }
@@ -1215,7 +1215,7 @@ out:
  * @offset - the offset for the key we'll insert
  *
  * This function writes out a free space cache struct to disk for quick recovery
- * on mount.  This will return 0 if it was successfull in writing the cache out,
+ * on mount.  This will return 0 if it was successful in writing the cache out,
  * or an errno if it was not.
  */
 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
@@ -1730,7 +1730,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
  */
 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
                         struct btrfs_free_space *bitmap_info, u64 *offset,
-                        u64 *bytes)
+                        u64 *bytes, bool for_alloc)
 {
        unsigned long found_bits = 0;
        unsigned long max_bits = 0;
@@ -1738,11 +1738,26 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
        unsigned long next_zero;
        unsigned long extent_bits;
 
+       /*
+        * Skip searching the bitmap if we don't have a contiguous section that
+        * is large enough for this allocation.
+        */
+       if (for_alloc &&
+           bitmap_info->max_extent_size &&
+           bitmap_info->max_extent_size < *bytes) {
+               *bytes = bitmap_info->max_extent_size;
+               return -1;
+       }
+
        i = offset_to_bit(bitmap_info->offset, ctl->unit,
                          max_t(u64, *offset, bitmap_info->offset));
        bits = bytes_to_bits(*bytes, ctl->unit);
 
        for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
+               if (for_alloc && bits == 1) {
+                       found_bits = 1;
+                       break;
+               }
                next_zero = find_next_zero_bit(bitmap_info->bitmap,
                                               BITS_PER_BITMAP, i);
                extent_bits = next_zero - i;
@@ -1762,6 +1777,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
        }
 
        *bytes = (u64)(max_bits) * ctl->unit;
+       bitmap_info->max_extent_size = *bytes;
        return -1;
 }
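The new for_alloc path caches the largest contiguous run a failed search found in max_extent_size, so later allocations that need more than that can skip the bitmap scan entirely; setting bits resets the cache (see add_bytes_to_bitmap() further down). A standalone sketch of that idea with stand-in types:

/*
 * Standalone sketch (not kernel code) of the max_extent_size caching added
 * to search_bitmap() above.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_bitmap {
	uint64_t max_extent_size;	/* 0 means "unknown" */
};

/* Mirrors the early-out at the top of search_bitmap() for allocations. */
static int can_skip_search(const struct toy_bitmap *b, uint64_t want)
{
	return b->max_extent_size && b->max_extent_size < want;
}

int main(void)
{
	struct toy_bitmap b = { .max_extent_size = 0 };

	printf("%d\n", can_skip_search(&b, 8192));	/* 0: unknown, must scan  */
	b.max_extent_size = 4096;			/* cached by a failed search */
	printf("%d\n", can_skip_search(&b, 8192));	/* 1: cannot fit, skip     */
	b.max_extent_size = 0;				/* bits were set, forget it  */
	printf("%d\n", can_skip_search(&b, 8192));	/* 0: must scan again      */
	return 0;
}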
 
@@ -1813,7 +1829,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
                if (entry->bitmap) {
                        u64 size = *bytes;
 
-                       ret = search_bitmap(ctl, entry, &tmp, &size);
+                       ret = search_bitmap(ctl, entry, &tmp, &size, true);
                        if (!ret) {
                                *offset = tmp;
                                *bytes = size;
@@ -1874,7 +1890,8 @@ again:
        search_start = *offset;
        search_bytes = ctl->unit;
        search_bytes = min(search_bytes, end - search_start + 1);
-       ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
+       ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
+                           false);
        if (ret < 0 || search_start != *offset)
                return -EINVAL;
 
@@ -1919,7 +1936,7 @@ again:
                search_start = *offset;
                search_bytes = ctl->unit;
                ret = search_bitmap(ctl, bitmap_info, &search_start,
-                                   &search_bytes);
+                                   &search_bytes, false);
                if (ret < 0 || search_start != *offset)
                        return -EAGAIN;
 
@@ -1943,6 +1960,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
 
        bitmap_set_bits(ctl, info, offset, bytes_to_set);
 
+       /*
+        * We set some bytes, so we no longer know what the max extent
+        * size is.
+        */
+       info->max_extent_size = 0;
+
        return bytes_to_set;
 
 }
@@ -1951,12 +1974,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
                      struct btrfs_free_space *info)
 {
        struct btrfs_block_group_cache *block_group = ctl->private;
+       bool forced = false;
+
+#ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root,
+                                            block_group))
+               forced = true;
+#endif
 
        /*
         * If we are below the extents threshold then we can add this as an
         * extent, and don't have to deal with the bitmap
         */
-       if (ctl->free_extents < ctl->extents_thresh) {
+       if (!forced && ctl->free_extents < ctl->extents_thresh) {
                /*
                 * If this block group has some small extents we don't want to
                 * use up all of our free slots in the cache with them, we want
@@ -2661,7 +2691,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
        search_start = min_start;
        search_bytes = bytes;
 
-       err = search_bitmap(ctl, entry, &search_start, &search_bytes);
+       err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
        if (err) {
                if (search_bytes > *max_extent_size)
                        *max_extent_size = search_bytes;
@@ -2775,6 +2805,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
        unsigned long want_bits;
        unsigned long min_bits;
        unsigned long found_bits;
+       unsigned long max_bits = 0;
        unsigned long start = 0;
        unsigned long total_found = 0;
        int ret;
@@ -2784,6 +2815,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
        want_bits = bytes_to_bits(bytes, ctl->unit);
        min_bits = bytes_to_bits(min_bytes, ctl->unit);
 
+       /*
+        * Don't bother looking for a cluster in this bitmap if it's heavily
+        * fragmented.
+        */
+       if (entry->max_extent_size &&
+           entry->max_extent_size < cont1_bytes)
+               return -ENOSPC;
 again:
        found_bits = 0;
        for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
@@ -2791,13 +2829,19 @@ again:
                                               BITS_PER_BITMAP, i);
                if (next_zero - i >= min_bits) {
                        found_bits = next_zero - i;
+                       if (found_bits > max_bits)
+                               max_bits = found_bits;
                        break;
                }
+               if (next_zero - i > max_bits)
+                       max_bits = next_zero - i;
                i = next_zero;
        }
 
-       if (!found_bits)
+       if (!found_bits) {
+               entry->max_extent_size = (u64)max_bits * ctl->unit;
                return -ENOSPC;
+       }
 
        if (!total_found) {
                start = i;
@@ -3056,6 +3100,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
        spin_lock_init(&cluster->refill_lock);
        cluster->root = RB_ROOT;
        cluster->max_size = 0;
+       cluster->fragmented = false;
        INIT_LIST_HEAD(&cluster->block_group_list);
        cluster->block_group = NULL;
 }
@@ -3223,7 +3268,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
                }
 
                bytes = minlen;
-               ret2 = search_bitmap(ctl, entry, &start, &bytes);
+               ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
                if (ret2 || start >= end) {
                        spin_unlock(&ctl->tree_lock);
                        mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3376,7 +3421,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
                u64 count = 1;
                int ret;
 
-               ret = search_bitmap(ctl, entry, &offset, &count);
+               ret = search_bitmap(ctl, entry, &offset, &count, true);
                /* Logic error; Should be empty if it can't find anything */
                ASSERT(!ret);
 
@@ -3532,6 +3577,7 @@ again:
                spin_lock(&ctl->tree_lock);
                info->offset = offset;
                info->bytes = bytes;
+               info->max_extent_size = 0;
                ret = link_free_space(ctl, info);
                spin_unlock(&ctl->tree_lock);
                if (ret)
@@ -3559,6 +3605,7 @@ again:
        }
 
        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+
        bytes -= bytes_added;
        offset += bytes_added;
        spin_unlock(&ctl->tree_lock);
@@ -3602,7 +3649,7 @@ have_info:
 
                bit_off = offset;
                bit_bytes = ctl->unit;
-               ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+               ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
                if (!ret) {
                        if (bit_off == offset) {
                                ret = 1;
index a16a029ad3b128dcd7723e6d657f43a66a26ed34..f251865eb6f3a628540740a93055e8cdc5aed08c 100644 (file)
@@ -23,6 +23,7 @@ struct btrfs_free_space {
        struct rb_node offset_index;
        u64 offset;
        u64 bytes;
+       u64 max_extent_size;
        unsigned long *bitmap;
        struct list_head list;
 };
index 265e03c73f4daaea4ac1b69ab0a606c63f50895f..be4d22a5022fa09974488a001d859e574f686645 100644 (file)
@@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
         */
        if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
                                            name, name_len, &extref)) {
-               btrfs_std_error(root->fs_info, -ENOENT);
+               btrfs_std_error(root->fs_info, -ENOENT, NULL);
                ret = -EROFS;
                goto out;
        }
index d4a582ac3f730f82299e997d0866e3caecacac7b..767a6056ac45afce29844e469f582baec8ef28cd 100644 (file)
@@ -488,17 +488,17 @@ again:
        /* Just to make sure we have enough space */
        prealloc += 8 * PAGE_CACHE_SIZE;
 
-       ret = btrfs_delalloc_reserve_space(inode, prealloc);
+       ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
        if (ret)
                goto out_put;
 
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
                                              prealloc, prealloc, &alloc_hint);
        if (ret) {
-               btrfs_delalloc_release_space(inode, prealloc);
+               btrfs_delalloc_release_space(inode, 0, prealloc);
                goto out_put;
        }
-       btrfs_free_reserved_data_space(inode, prealloc);
+       btrfs_free_reserved_data_space(inode, 0, prealloc);
 
        ret = btrfs_write_out_ino_cache(root, trans, path, inode);
 out_put:
index 611b66d73e80ba0e5f97b4415595d20f8ae35e1a..4439fbb4ff451bbad4fa03f864ec5ed1ae8625be 100644 (file)
@@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 out:
+       /*
+        * Don't forget to free the reserved space: an inlined extent
+        * won't count as a data extent, so free it directly here.
+        * And at reserve time it's always aligned to the page size, so
+        * just free one page here.
+        */
+       btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans, root);
        return ret;
@@ -1096,6 +1103,9 @@ static noinline void async_cow_submit(struct btrfs_work *work)
        nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
                PAGE_CACHE_SHIFT;
 
+       /*
+        * atomic_sub_return implies a barrier for waitqueue_active
+        */
        if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
            5 * 1024 * 1024 &&
            waitqueue_active(&root->fs_info->async_submit_wait))
@@ -1766,7 +1776,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
                    && do_list && !(state->state & EXTENT_NORESERVE))
-                       btrfs_free_reserved_data_space(inode, len);
+                       btrfs_free_reserved_data_space_noquota(inode,
+                                       state->start, len);
 
                __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
                                     root->fs_info->delalloc_batch);
@@ -1861,15 +1872,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                          u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
        int ret = 0;
        int skip_sum;
-       int metadata = 0;
        int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
        if (btrfs_is_free_space_inode(inode))
-               metadata = 2;
+               metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
 
        if (!(rw & REQ_WRITE)) {
                ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
@@ -1989,7 +2000,8 @@ again:
                goto again;
        }
 
-       ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       ret = btrfs_delalloc_reserve_space(inode, page_start,
+                                          PAGE_CACHE_SIZE);
        if (ret) {
                mapping_set_error(page->mapping, ret);
                end_extent_writepage(page, ret, page_start, page_end);
@@ -2115,7 +2127,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        ins.type = BTRFS_EXTENT_ITEM_KEY;
        ret = btrfs_alloc_reserved_file_extent(trans, root,
                                        root->root_key.objectid,
-                                       btrfs_ino(inode), file_pos, &ins);
+                                       btrfs_ino(inode), file_pos,
+                                       ram_bytes, &ins);
+       /*
+        * Release the reserved range from the inode dirty range map, as it
+        * has already been moved into the delayed_ref_head.
+        */
+       btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
 out:
        btrfs_free_path(path);
 
@@ -2573,7 +2591,7 @@ again:
        ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
                        new->disk_len, 0,
                        backref->root_id, backref->inum,
-                       new->file_pos, 0);      /* start - extent_offset */
+                       new->file_pos); /* start - extent_offset */
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                goto out_free_path;
@@ -2599,7 +2617,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
                return;
 
        list_for_each_entry_safe(old, tmp, &new->head, list) {
-               list_del(&old->list);
                kfree(old);
        }
        kfree(new);
@@ -2824,6 +2841,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+
+               /*
+                * For the mwrite (mmap + memset to write) case, we still
+                * reserve space for the NOCOW range.
+                * As NOCOW won't cause a new delayed ref, just free the space.
+                */
+               btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+                                      ordered_extent->len);
                btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (nolock)
                        trans = btrfs_join_transaction_nolock(root);
@@ -3018,8 +3043,6 @@ static int __readpage_endio_check(struct inode *inode,
        char *kaddr;
        u32 csum_expected;
        u32 csum = ~(u32)0;
-       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-                                     DEFAULT_RATELIMIT_BURST);
 
        csum_expected = *(((u32 *)io_bio->csum) + icsum);
 
@@ -3032,9 +3055,8 @@ static int __readpage_endio_check(struct inode *inode,
        kunmap_atomic(kaddr);
        return 0;
 zeroit:
-       if (__ratelimit(&_rs))
-               btrfs_warn(BTRFS_I(inode)->root->fs_info,
-                          "csum failed ino %llu off %llu csum %u expected csum %u",
+       btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
+               "csum failed ino %llu off %llu csum %u expected csum %u",
                           btrfs_ino(inode), start, csum, csum_expected);
        memset(kaddr + pgoff, 1, len);
        flush_dcache_page(page);
@@ -4217,6 +4239,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
 
 }
 
+static int truncate_inline_extent(struct inode *inode,
+                                 struct btrfs_path *path,
+                                 struct btrfs_key *found_key,
+                                 const u64 item_end,
+                                 const u64 new_size)
+{
+       struct extent_buffer *leaf = path->nodes[0];
+       int slot = path->slots[0];
+       struct btrfs_file_extent_item *fi;
+       u32 size = (u32)(new_size - found_key->offset);
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
+       fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+       if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
+               loff_t offset = new_size;
+               loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
+
+               /*
+                * Zero out the remainder of the last page of our inline extent,
+                * instead of directly truncating our inline extent here - that
+                * would be much more complex (decompressing all the data, then
+                * compressing the truncated data, which might be bigger than
+                * the size of the inline extent, resizing the extent, etc).
+                * We release the path because to get the page we might need to
+                * read the extent item from disk (data not in the page cache).
+                */
+               btrfs_release_path(path);
+               return btrfs_truncate_page(inode, offset, page_end - offset, 0);
+       }
+
+       btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+       size = btrfs_file_extent_calc_inline_size(size);
+       btrfs_truncate_item(root, path, size, 1);
+
+       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+               inode_sub_bytes(inode, item_end + 1 - new_size);
+
+       return 0;
+}
+
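For a compressed inline extent, the function above zeroes from the new size to the end of its page rather than re-compressing the data. A standalone sketch of the range that btrfs_truncate_page() would be asked to zero (ALIGN_UP and PAGE_SZ are stand-ins for the kernel's ALIGN and PAGE_CACHE_SIZE):

/*
 * Standalone sketch (not kernel code) of the zeroed range chosen by the
 * compressed branch of truncate_inline_extent() above.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ 4096ULL
#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
	uint64_t new_size = 1000;                  /* truncate target inside the page */
	uint64_t page_end = ALIGN_UP(new_size, PAGE_SZ);

	/* btrfs_truncate_page(inode, new_size, page_end - new_size, 0)
	 * would zero this many bytes: */
	printf("zeroed bytes: %llu\n",
	       (unsigned long long)(page_end - new_size)); /* 3096 */
	return 0;
}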
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -4411,27 +4474,40 @@ search_again:
                         * special encodings
                         */
                        if (!del_item &&
-                           btrfs_file_extent_compression(leaf, fi) == 0 &&
                            btrfs_file_extent_encryption(leaf, fi) == 0 &&
                            btrfs_file_extent_other_encoding(leaf, fi) == 0) {
-                               u32 size = new_size - found_key.offset;
-
-                               if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
-                                       inode_sub_bytes(inode, item_end + 1 -
-                                                       new_size);
 
                                /*
-                                * update the ram bytes to properly reflect
-                                * the new size of our item
+                                * Need to release path in order to truncate a
+                                * compressed extent. So delete any accumulated
+                                * extent items so far.
                                 */
-                               btrfs_set_file_extent_ram_bytes(leaf, fi, size);
-                               size =
-                                   btrfs_file_extent_calc_inline_size(size);
-                               btrfs_truncate_item(root, path, size, 1);
+                               if (btrfs_file_extent_compression(leaf, fi) !=
+                                   BTRFS_COMPRESS_NONE && pending_del_nr) {
+                                       err = btrfs_del_items(trans, root, path,
+                                                             pending_del_slot,
+                                                             pending_del_nr);
+                                       if (err) {
+                                               btrfs_abort_transaction(trans,
+                                                                       root,
+                                                                       err);
+                                               goto error;
+                                       }
+                                       pending_del_nr = 0;
+                               }
+
+                               err = truncate_inline_extent(inode, path,
+                                                            &found_key,
+                                                            item_end,
+                                                            new_size);
+                               if (err) {
+                                       btrfs_abort_transaction(trans,
+                                                               root, err);
+                                       goto error;
+                               }
                        } else if (test_bit(BTRFS_ROOT_REF_COWS,
                                            &root->state)) {
-                               inode_sub_bytes(inode, item_end + 1 -
-                                               found_key.offset);
+                               inode_sub_bytes(inode, item_end + 1 - new_size);
                        }
                }
 delete:
@@ -4461,7 +4537,7 @@ delete:
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
                                                btrfs_header_owner(leaf),
-                                               ino, extent_offset, 0);
+                                               ino, extent_offset);
                        BUG_ON(ret);
                        if (btrfs_should_throttle_delayed_refs(trans, root))
                                btrfs_async_run_delayed_refs(root,
@@ -4575,14 +4651,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
        if ((offset & (blocksize - 1)) == 0 &&
            (!len || ((len & (blocksize - 1)) == 0)))
                goto out;
-       ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       ret = btrfs_delalloc_reserve_space(inode,
+                       round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
        if (ret)
                goto out;
 
 again:
        page = find_or_create_page(mapping, index, mask);
        if (!page) {
-               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+               btrfs_delalloc_release_space(inode,
+                               round_down(from, PAGE_CACHE_SIZE),
+                               PAGE_CACHE_SIZE);
                ret = -ENOMEM;
                goto out;
        }
@@ -4650,7 +4729,8 @@ again:
 
 out_unlock:
        if (ret)
-               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+               btrfs_delalloc_release_space(inode, page_start,
+                                            PAGE_CACHE_SIZE);
        unlock_page(page);
        page_cache_release(page);
 out:
@@ -5048,6 +5128,18 @@ static void evict_inode_truncate_pages(struct inode *inode)
                spin_unlock(&io_tree->lock);
 
                lock_extent_bits(io_tree, start, end, 0, &cached_state);
+
+               /*
+                * If still has DELALLOC flag, the extent didn't reach disk,
+                * If the extent state still has the DELALLOC flag set, the
+                * extent never reached disk and its reserved space won't be
+                * freed by the delayed_ref handler, so we must free it here.
+                *
+                * Note, end is the bytenr of last byte, so we need + 1 here.
+                */
+               if (state->state & EXTENT_DELALLOC)
+                       btrfs_qgroup_free_data(inode, start, end - start + 1);
+
                clear_extent_bit(io_tree, start, end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
@@ -7581,7 +7673,7 @@ unlock:
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
 
-               btrfs_free_reserved_data_space(inode, len);
+               btrfs_free_reserved_data_space(inode, start, len);
                WARN_ON(dio_data->reserve < len);
                dio_data->reserve -= len;
                current->journal_info = dio_data;
@@ -8371,7 +8463,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                        mutex_unlock(&inode->i_mutex);
                        relock = true;
                }
-               ret = btrfs_delalloc_reserve_space(inode, count);
+               ret = btrfs_delalloc_reserve_space(inode, offset, count);
                if (ret)
                        goto out;
                dio_data.outstanding_extents = div64_u64(count +
@@ -8400,10 +8492,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                current->journal_info = NULL;
                if (ret < 0 && ret != -EIOCBQUEUED) {
                        if (dio_data.reserve)
-                               btrfs_delalloc_release_space(inode,
-                                                       dio_data.reserve);
+                               btrfs_delalloc_release_space(inode, offset,
+                                                            dio_data.reserve);
                } else if (ret >= 0 && (size_t)ret < count)
-                       btrfs_delalloc_release_space(inode,
+                       btrfs_delalloc_release_space(inode, offset,
                                                     count - (size_t)ret);
        }
 out:
@@ -8562,6 +8654,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
                }
        }
 
+       /*
+        * Qgroup reserved space handler
+        * The page here will be in one of two states:
+        * 1) Already written to disk
+        *    In this case, its reserved space was already released from the
+        *    data rsv map and will eventually be freed by the delayed_ref
+        *    handler, so even if we call qgroup_free_data() it won't decrease
+        *    the reserved space.
+        * 2) Not yet written to disk
+        *    This means the reserved space should be freed here.
+        */
+       btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
        if (!inode_evicting) {
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8612,7 +8716,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        u64 page_end;
 
        sb_start_pagefault(inode->i_sb);
-       ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       page_start = page_offset(page);
+       page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+       ret = btrfs_delalloc_reserve_space(inode, page_start,
+                                          PAGE_CACHE_SIZE);
        if (!ret) {
                ret = file_update_time(vma->vm_file);
                reserved = 1;
@@ -8631,8 +8739,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 again:
        lock_page(page);
        size = i_size_read(inode);
-       page_start = page_offset(page);
-       page_end = page_start + PAGE_CACHE_SIZE - 1;
 
        if ((page->mapping != inode->i_mapping) ||
            (page_start >= size)) {
@@ -8709,7 +8815,7 @@ out_unlock:
        }
        unlock_page(page);
 out:
-       btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+       btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
 out_noreserve:
        sb_end_pagefault(inode->i_sb);
        return ret;
@@ -8998,6 +9104,7 @@ void btrfs_destroy_inode(struct inode *inode)
                        btrfs_put_ordered_extent(ordered);
                }
        }
+       btrfs_qgroup_check_reserved_leak(inode);
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
@@ -9634,6 +9741,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
        u64 cur_offset = start;
        u64 i_size;
        u64 cur_bytes;
+       u64 last_alloc = (u64)-1;
        int ret = 0;
        bool own_trans = true;
 
@@ -9650,6 +9758,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 
                cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
                cur_bytes = max(cur_bytes, min_size);
+               /*
+                * If we are severely fragmented we could end up with really
+                * small allocations, so if the allocator is returning small
+                * chunks let's make its job easier by only searching for
+                * chunks of that size.
+                */
+               cur_bytes = min(cur_bytes, last_alloc);
                ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
                                           *alloc_hint, &ins, 1, 0);
                if (ret) {
@@ -9658,6 +9773,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                        break;
                }
 
+               last_alloc = ins.offset;
                ret = insert_reserved_file_extent(trans, inode,
                                                  cur_offset, ins.objectid,
                                                  ins.offset, ins.offset,
index 8d20f3b1cab0abfa7b5f2bc4b8646c094a9353a5..da94138eb85eb3f15f127b08c54a113715956686 100644 (file)
@@ -1120,7 +1120,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
        page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
        ret = btrfs_delalloc_reserve_space(inode,
-                                          page_cnt << PAGE_CACHE_SHIFT);
+                       start_index << PAGE_CACHE_SHIFT,
+                       page_cnt << PAGE_CACHE_SHIFT);
        if (ret)
                return ret;
        i_done = 0;
@@ -1210,7 +1211,8 @@ again:
                BTRFS_I(inode)->outstanding_extents++;
                spin_unlock(&BTRFS_I(inode)->lock);
                btrfs_delalloc_release_space(inode,
-                                    (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+                               start_index << PAGE_CACHE_SHIFT,
+                               (page_cnt - i_done) << PAGE_CACHE_SHIFT);
        }
 
 
@@ -1235,7 +1237,9 @@ out:
                unlock_page(pages[i]);
                page_cache_release(pages[i]);
        }
-       btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
+       btrfs_delalloc_release_space(inode,
+                       start_index << PAGE_CACHE_SHIFT,
+                       page_cnt << PAGE_CACHE_SHIFT);
        return ret;
 
 }
@@ -1342,7 +1346,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
                        break;
 
                if (btrfs_defrag_cancelled(root->fs_info)) {
-                       printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
+                       btrfs_debug(root->fs_info, "defrag_file cancelled");
                        ret = -EAGAIN;
                        break;
                }
@@ -1579,7 +1583,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
        new_size = div_u64(new_size, root->sectorsize);
        new_size *= root->sectorsize;
 
-       printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
+       btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu",
                      rcu_str_deref(device->name), new_size);
 
        if (new_size > old_size) {
@@ -2081,7 +2085,7 @@ static noinline int search_ioctl(struct inode *inode,
                key.offset = (u64)-1;
                root = btrfs_read_fs_root_no_name(info, &key);
                if (IS_ERR(root)) {
-                       printk(KERN_ERR "BTRFS: could not find root %llu\n",
+                       btrfs_err(info, "could not find root %llu",
                               sk->tree_id);
                        btrfs_free_path(path);
                        return -ENOENT;
@@ -2221,7 +2225,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
        key.offset = (u64)-1;
        root = btrfs_read_fs_root_no_name(info, &key);
        if (IS_ERR(root)) {
-               printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
+               btrfs_err(info, "could not find root %llu", tree_id);
                ret = -ENOENT;
                goto out;
        }
@@ -2699,7 +2703,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
 {
        struct btrfs_ioctl_fs_info_args *fi_args;
        struct btrfs_device *device;
-       struct btrfs_device *next;
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        int ret = 0;
 
@@ -2711,7 +2714,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
        fi_args->num_devices = fs_devices->num_devices;
        memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
 
-       list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->devid > fi_args->max_id)
                        fi_args->max_id = device->devid;
        }
@@ -3203,41 +3206,6 @@ out:
        return ret;
 }
 
-/* Helper to check and see if this root currently has a ref on the given disk
- * bytenr.  If it does then we need to update the quota for this root.  This
- * doesn't do anything if quotas aren't enabled.
- */
-static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                    u64 disko)
-{
-       struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
-       struct ulist *roots;
-       struct ulist_iterator uiter;
-       struct ulist_node *root_node = NULL;
-       int ret;
-
-       if (!root->fs_info->quota_enabled)
-               return 1;
-
-       btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
-       ret = btrfs_find_all_roots(trans, root->fs_info, disko,
-                                  tree_mod_seq_elem.seq, &roots);
-       if (ret < 0)
-               goto out;
-       ret = 0;
-       ULIST_ITER_INIT(&uiter);
-       while ((root_node = ulist_next(roots, &uiter))) {
-               if (root_node->val == root->objectid) {
-                       ret = 1;
-                       break;
-               }
-       }
-       ulist_free(roots);
-out:
-       btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
-       return ret;
-}
-
 static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
                                     struct inode *inode,
                                     u64 endoff,
@@ -3328,6 +3296,150 @@ static void clone_update_extent_map(struct inode *inode,
                        &BTRFS_I(inode)->runtime_flags);
 }
 
+/*
+ * Make sure we do not end up inserting an inline extent into a file that
+ * already has other (non-inline) extents. If a file has an inline extent it
+ * cannot have any other extents and the (single) inline extent must start at
+ * file offset 0. Failing to respect these rules will lead to file corruption,
+ * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc.
+ *
+ * We can have extents that have been already written to disk or we can have
+ * dirty ranges still in delalloc, in which case the extent maps and items are
+ * created only when we run delalloc, and the delalloc ranges might fall outside
+ * the range we are currently locking in the inode's io tree. So we check the
+ * inode's i_size because of that (i_size updates are done while holding the
+ * i_mutex, which we are holding here).
+ * We also check to see if the inode has a size not greater than "datal" but has
+ * extents beyond it, due to a fallocate call with FALLOC_FL_KEEP_SIZE (and we are
+ * protected against such concurrent fallocate calls by the i_mutex).
+ *
+ * If the file has no extents but a size greater than datal, do not allow the
+ * copy because we would need to turn the inline extent into a non-inline one (even
+ * with NO_HOLES enabled). If we find our destination inode only has one inline
+ * extent, just overwrite it with the source inline extent if its size is less
+ * than the source extent's size, or we could copy the source inline extent's
+ * data into the destination inode's inline extent if the latter is greater than
+ * the former.
+ */
+static int clone_copy_inline_extent(struct inode *src,
+                                   struct inode *dst,
+                                   struct btrfs_trans_handle *trans,
+                                   struct btrfs_path *path,
+                                   struct btrfs_key *new_key,
+                                   const u64 drop_start,
+                                   const u64 datal,
+                                   const u64 skip,
+                                   const u64 size,
+                                   char *inline_data)
+{
+       struct btrfs_root *root = BTRFS_I(dst)->root;
+       const u64 aligned_end = ALIGN(new_key->offset + datal,
+                                     root->sectorsize);
+       int ret;
+       struct btrfs_key key;
+
+       if (new_key->offset > 0)
+               return -EOPNOTSUPP;
+
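+       /* Look up the destination inode's first file extent item, if any. */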
+       key.objectid = btrfs_ino(dst);
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = 0;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0) {
+               return ret;
+       } else if (ret > 0) {
+               if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               return ret;
+                       else if (ret > 0)
+                               goto copy_inline_extent;
+               }
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+               if (key.objectid == btrfs_ino(dst) &&
+                   key.type == BTRFS_EXTENT_DATA_KEY) {
+                       ASSERT(key.offset > 0);
+                       return -EOPNOTSUPP;
+               }
+       } else if (i_size_read(dst) <= datal) {
+               struct btrfs_file_extent_item *ei;
+               u64 ext_len;
+
+               /*
+                * If the file size is <= datal, make sure there are no other
+                * extents following (can happen due to a fallocate call with
+                * the flag FALLOC_FL_KEEP_SIZE).
+                */
+               ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                   struct btrfs_file_extent_item);
+               /*
+                * If it's an inline extent, it can not have other extents
+                * following it.
+                */
+               if (btrfs_file_extent_type(path->nodes[0], ei) ==
+                   BTRFS_FILE_EXTENT_INLINE)
+                       goto copy_inline_extent;
+
+               ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+               if (ext_len > aligned_end)
+                       return -EOPNOTSUPP;
+
+               ret = btrfs_next_item(root, path);
+               if (ret < 0) {
+                       return ret;
+               } else if (ret == 0) {
+                       btrfs_item_key_to_cpu(path->nodes[0], &key,
+                                             path->slots[0]);
+                       if (key.objectid == btrfs_ino(dst) &&
+                           key.type == BTRFS_EXTENT_DATA_KEY)
+                               return -EOPNOTSUPP;
+               }
+       }
+
+copy_inline_extent:
+       /*
+        * We have no extent items, or we have an extent at offset 0 which may
+        * or may not be inlined. All these cases are dealt with in the same way.
+        */
+       if (i_size_read(dst) > datal) {
+               /*
+                * If the destination inode has an inline extent...
+                * This would require copying the data from the source inline
+                * extent into the beginning of the destination's inline extent.
+                * But this is really complex: both extents can be compressed,
+                * or just one of them, which would require decompressing and
+                * re-compressing the data (and that could increase the new
+                * compressed size, not allowing the compressed data to fit
+                * anymore in an inline extent).
+                * So just don't support this case for now (it should be rare,
+                * we are not really saving space when cloning inline extents).
+                */
+               return -EOPNOTSUPP;
+       }
+
+       btrfs_release_path(path);
+       ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+       if (ret)
+               return ret;
+       ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+       if (ret)
+               return ret;
+
+       if (skip) {
+               const u32 start = btrfs_file_extent_calc_inline_size(0);
+
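+               /*
+                * Drop the leading "skip" bytes of the inline data, which fall
+                * before the start of the cloned range.
+                */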
+               memmove(inline_data + start, inline_data + start + skip, datal);
+       }
+
+       write_extent_buffer(path->nodes[0], inline_data,
+                           btrfs_item_ptr_offset(path->nodes[0],
+                                                 path->slots[0]),
+                           size);
+       inode_add_bytes(dst, datal);
+
+       return 0;
+}
+
 /**
  * btrfs_clone() - clone a range from inode file to another
  *
@@ -3352,9 +3464,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
        u32 nritems;
        int slot;
        int ret;
-       int no_quota;
        const u64 len = olen_aligned;
-       u64 last_disko = 0;
        u64 last_dest_end = destoff;
 
        ret = -ENOMEM;
@@ -3400,7 +3510,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 
                nritems = btrfs_header_nritems(path->nodes[0]);
 process_slot:
-               no_quota = 1;
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
                        if (ret < 0)
@@ -3552,35 +3661,13 @@ process_slot:
                                btrfs_set_file_extent_num_bytes(leaf, extent,
                                                                datal);
 
-                               /*
-                                * We need to look up the roots that point at
-                                * this bytenr and see if the new root does.  If
-                                * it does not we need to make sure we update
-                                * quotas appropriately.
-                                */
-                               if (disko && root != BTRFS_I(src)->root &&
-                                   disko != last_disko) {
-                                       no_quota = check_ref(trans, root,
-                                                            disko);
-                                       if (no_quota < 0) {
-                                               btrfs_abort_transaction(trans,
-                                                                       root,
-                                                                       ret);
-                                               btrfs_end_transaction(trans,
-                                                                     root);
-                                               ret = no_quota;
-                                               goto out;
-                                       }
-                               }
-
                                if (disko) {
                                        inode_add_bytes(inode, datal);
                                        ret = btrfs_inc_extent_ref(trans, root,
                                                        disko, diskl, 0,
                                                        root->root_key.objectid,
                                                        btrfs_ino(inode),
-                                                       new_key.offset - datao,
-                                                       no_quota);
+                                                       new_key.offset - datao);
                                        if (ret) {
                                                btrfs_abort_transaction(trans,
                                                                        root,
@@ -3594,21 +3681,6 @@ process_slot:
                        } else if (type == BTRFS_FILE_EXTENT_INLINE) {
                                u64 skip = 0;
                                u64 trim = 0;
-                               u64 aligned_end = 0;
-
-                               /*
-                                * Don't copy an inline extent into an offset
-                                * greater than zero. Having an inline extent
-                                * at such an offset results in chaos as btrfs
-                                * isn't prepared for such cases. Just skip
-                                * this case for the same reasons as commented
-                                * at btrfs_ioctl_clone().
-                                */
-                               if (last_dest_end > 0) {
-                                       ret = -EOPNOTSUPP;
-                                       btrfs_end_transaction(trans, root);
-                                       goto out;
-                               }
 
                                if (off > key.offset) {
                                        skip = off - key.offset;
@@ -3626,42 +3698,22 @@ process_slot:
                                size -= skip + trim;
                                datal -= skip + trim;
 
-                               aligned_end = ALIGN(new_key.offset + datal,
-                                                   root->sectorsize);
-                               ret = btrfs_drop_extents(trans, root, inode,
-                                                        drop_start,
-                                                        aligned_end,
-                                                        1);
+                               ret = clone_copy_inline_extent(src, inode,
+                                                              trans, path,
+                                                              &new_key,
+                                                              drop_start,
+                                                              datal,
+                                                              skip, size, buf);
                                if (ret) {
                                        if (ret != -EOPNOTSUPP)
                                                btrfs_abort_transaction(trans,
-                                                       root, ret);
-                                       btrfs_end_transaction(trans, root);
-                                       goto out;
-                               }
-
-                               ret = btrfs_insert_empty_item(trans, root, path,
-                                                             &new_key, size);
-                               if (ret) {
-                                       btrfs_abort_transaction(trans, root,
-                                                               ret);
+                                                                       root,
+                                                                       ret);
                                        btrfs_end_transaction(trans, root);
                                        goto out;
                                }
-
-                               if (skip) {
-                                       u32 start =
-                                         btrfs_file_extent_calc_inline_size(0);
-                                       memmove(buf+start, buf+start+skip,
-                                               datal);
-                               }
-
                                leaf = path->nodes[0];
                                slot = path->slots[0];
-                               write_extent_buffer(leaf, buf,
-                                           btrfs_item_ptr_offset(leaf, slot),
-                                           size);
-                               inode_add_bytes(inode, datal);
                        }
 
                        /* If we have an implicit hole (NO_HOLES feature). */
@@ -4814,7 +4866,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
        /* update qgroup status and info */
        err = btrfs_run_qgroups(trans, root->fs_info);
        if (err < 0)
-               btrfs_error(root->fs_info, ret,
+               btrfs_std_error(root->fs_info, ret,
                            "failed to update qgroup status and info\n");
        err = btrfs_end_transaction(trans, root);
        if (err && !ret)
index d7e6baf1b2054f1998608902385bc5016020c2cf..8077461fc56ab256d8d02838ac3c345408edf5a4 100644 (file)
@@ -79,6 +79,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
                write_lock(&eb->lock);
                WARN_ON(atomic_read(&eb->spinning_writers));
                atomic_inc(&eb->spinning_writers);
+               /*
+                * atomic_dec_and_test implies a barrier for waitqueue_active
+                */
                if (atomic_dec_and_test(&eb->blocking_writers) &&
                    waitqueue_active(&eb->write_lock_wq))
                        wake_up(&eb->write_lock_wq);
@@ -86,6 +89,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
                BUG_ON(atomic_read(&eb->blocking_readers) == 0);
                read_lock(&eb->lock);
                atomic_inc(&eb->spinning_readers);
+               /*
+                * atomic_dec_and_test implies a barrier for waitqueue_active
+                */
                if (atomic_dec_and_test(&eb->blocking_readers) &&
                    waitqueue_active(&eb->read_lock_wq))
                        wake_up(&eb->read_lock_wq);
@@ -229,6 +235,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
        }
        btrfs_assert_tree_read_locked(eb);
        WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+       /*
+        * atomic_dec_and_test implies a barrier for waitqueue_active
+        */
        if (atomic_dec_and_test(&eb->blocking_readers) &&
            waitqueue_active(&eb->read_lock_wq))
                wake_up(&eb->read_lock_wq);
@@ -280,6 +289,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
        if (blockers) {
                WARN_ON(atomic_read(&eb->spinning_writers));
                atomic_dec(&eb->blocking_writers);
+               /*
+                * Make sure counter is updated before we wake up waiters.
+                */
                smp_mb();
                if (waitqueue_active(&eb->write_lock_wq))
                        wake_up(&eb->write_lock_wq);
index 52170cf1757e3d25192a6cf6211a9d2edaa52751..8c27292ea9ea09361d94389c5a7f7a146bb8a925 100644 (file)
@@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
 
        if (entry->bytes_left == 0) {
                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+               /*
+                * Implicit memory barrier after test_and_set_bit
+                */
                if (waitqueue_active(&entry->wait))
                        wake_up(&entry->wait);
        } else {
@@ -409,6 +412,9 @@ have_entry:
 
        if (entry->bytes_left == 0) {
                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+               /*
+                * Implicit memory barrier after test_and_set_bit
+                */
                if (waitqueue_active(&entry->wait))
                        wake_up(&entry->wait);
        } else {
@@ -484,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
 
        spin_lock_irq(&log->log_extents_lock[index]);
        while (!list_empty(&log->logged_list[index])) {
+               struct inode *inode;
                ordered = list_first_entry(&log->logged_list[index],
                                           struct btrfs_ordered_extent,
                                           log_list);
                list_del_init(&ordered->log_list);
+               inode = ordered->inode;
                spin_unlock_irq(&log->log_extents_lock[index]);
 
                if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
                    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
-                       struct inode *inode = ordered->inode;
                        u64 start = ordered->file_offset;
                        u64 end = ordered->file_offset + ordered->len - 1;
 
@@ -503,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
                                                   &ordered->flags));
 
                /*
-                * If our ordered extent completed it means it updated the
-                * fs/subvol and csum trees already, so no need to make the
-                * current transaction's commit wait for it, as we end up
-                * holding memory unnecessarily and delaying the inode's iput
-                * until the transaction commit (we schedule an iput for the
-                * inode when the ordered extent's refcount drops to 0), which
-                * prevents it from being evictable until the transaction
-                * commits.
+                * In order to keep us from losing our ordered extent
+                * information when committing the transaction we have to make
+                * sure that any logged extents are completed when we go to
+                * commit the transaction.  To do this we simply increase the
+                * current transaction's pending_ordered counter and decrement it
+                * when the ordered extent completes.
                 */
-               if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
-                       btrfs_put_ordered_extent(ordered);
-               else
-                       list_add_tail(&ordered->trans_list, &trans->ordered);
-
+               if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+                       struct btrfs_ordered_inode_tree *tree;
+
+                       tree = &BTRFS_I(inode)->ordered_tree;
+                       spin_lock_irq(&tree->lock);
+                       if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+                               set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+                               atomic_inc(&trans->transaction->pending_ordered);
+                       }
+                       spin_unlock_irq(&tree->lock);
+               }
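+               /* Done with this logged extent, drop our reference. */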
+               btrfs_put_ordered_extent(ordered);
                spin_lock_irq(&log->log_extents_lock[index]);
        }
        spin_unlock_irq(&log->log_extents_lock[index]);
@@ -578,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
        struct btrfs_ordered_inode_tree *tree;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct rb_node *node;
+       bool dec_pending_ordered = false;
 
        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
@@ -587,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode,
        if (tree->last == node)
                tree->last = NULL;
        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
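+       /* Remember if a committing transaction is waiting on this extent. */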
+       if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
+               dec_pending_ordered = true;
        spin_unlock_irq(&tree->lock);
 
+       /*
+        * The currently running transaction is waiting on us; we need to let
+        * it know that we're complete and wake it up.
+        */
+       if (dec_pending_ordered) {
+               struct btrfs_transaction *trans;
+
+               /*
+                * The check for trans is just a formality: it should be set,
+                * but if it isn't we don't want to deref/assert under the
+                * spin lock, so be nice and check it, but ASSERT() so that a
+                * developer will notice if it is ever unset.
+                */
+               spin_lock(&root->fs_info->trans_lock);
+               trans = root->fs_info->running_transaction;
+               if (trans)
+                       atomic_inc(&trans->use_count);
+               spin_unlock(&root->fs_info->trans_lock);
+
+               ASSERT(trans);
+               if (trans) {
+                       if (atomic_dec_and_test(&trans->pending_ordered))
+                               wake_up(&trans->pending_wait);
+                       btrfs_put_transaction(trans);
+               }
+       }
+
        spin_lock(&root->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);
        root->nr_ordered_extents--;
index 7176cc0fe43f7074ffa3c959bb3a7eae0d3b90e2..23c96059cef26a6292847a051b893f254cc472b3 100644 (file)
@@ -73,6 +73,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
                                 * in the logging code. */
+#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to
+                                 * complete in the current transaction. */
 struct btrfs_ordered_extent {
        /* logical offset in the file */
        u64 file_offset;
index dca137b04095fc509390b2c02d4ca6ad1752ee74..f9e60231f6854545d514d8df53b8ba21be92762e 100644 (file)
@@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = {
                .extract = prop_compression_extract,
                .inheritable = 1
        },
-       {
-               .xattr_name = NULL
-       }
 };
 
 void __init btrfs_props_init(void)
 {
-       struct prop_handler *p;
+       int i;
 
        hash_init(prop_handlers_ht);
 
-       for (p = &prop_handlers[0]; p->xattr_name; p++) {
+       for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+               struct prop_handler *p = &prop_handlers[i];
                u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
 
                hash_add(prop_handlers_ht, &p->node, h);
@@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans,
                         struct inode *inode,
                         struct inode *parent)
 {
-       const struct prop_handler *h;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;
+       int i;
 
        if (!test_bit(BTRFS_INODE_HAS_PROPS,
                      &BTRFS_I(parent)->runtime_flags))
                return 0;
 
-       for (h = &prop_handlers[0]; h->xattr_name; h++) {
+       for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+               const struct prop_handler *h = &prop_handlers[i];
                const char *value;
                u64 num_bytes;
 
index d904ee1c53497034ed6685b8e12f40d0880aaaca..46476c226395e777045a2303a1ada87b9e4b4a78 100644 (file)
@@ -1652,10 +1652,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
                        }
                }
 
-               /* For exclusive extent, free its reserved bytes too */
-               if (nr_old_roots == 0 && nr_new_roots == 1 &&
-                   cur_new_count == nr_new_roots)
-                       qg->reserved -= num_bytes;
                if (dirty)
                        qgroup_dirty(fs_info, qg);
        }
@@ -2035,7 +2031,7 @@ out:
        return ret;
 }
 
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 {
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
@@ -2116,14 +2112,13 @@ out:
        return ret;
 }
 
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+                              u64 ref_root, u64 num_bytes)
 {
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
-       struct btrfs_fs_info *fs_info = root->fs_info;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
-       u64 ref_root = root->root_key.objectid;
        int ret = 0;
 
        if (!is_fstree(ref_root))
@@ -2169,6 +2164,11 @@ out:
        spin_unlock(&fs_info->qgroup_lock);
 }
 
+static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+       return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+                                        num_bytes);
+}
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
 {
        if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
@@ -2188,10 +2188,10 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
  */
 static int
 qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
-                  struct btrfs_trans_handle *trans,
-                  struct extent_buffer *scratch_leaf)
+                  struct btrfs_trans_handle *trans)
 {
        struct btrfs_key found;
+       struct extent_buffer *scratch_leaf = NULL;
        struct ulist *roots = NULL;
        struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
        u64 num_bytes;
@@ -2229,7 +2229,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
        fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
 
        btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
-       memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+       scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
+       if (!scratch_leaf) {
+               ret = -ENOMEM;
+               mutex_unlock(&fs_info->qgroup_rescan_lock);
+               goto out;
+       }
+       extent_buffer_get(scratch_leaf);
+       btrfs_tree_read_lock(scratch_leaf);
+       btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
        slot = path->slots[0];
        btrfs_release_path(path);
        mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2255,6 +2263,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                        goto out;
        }
 out:
+       if (scratch_leaf) {
+               btrfs_tree_read_unlock_blocking(scratch_leaf);
+               free_extent_buffer(scratch_leaf);
+       }
        btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 
        return ret;
@@ -2266,16 +2278,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
                                                     qgroup_rescan_work);
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans = NULL;
-       struct extent_buffer *scratch_leaf = NULL;
        int err = -ENOMEM;
        int ret = 0;
 
        path = btrfs_alloc_path();
        if (!path)
                goto out;
-       scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
-       if (!scratch_leaf)
-               goto out;
 
        err = 0;
        while (!err) {
@@ -2287,8 +2295,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
                if (!fs_info->quota_enabled) {
                        err = -EINTR;
                } else {
-                       err = qgroup_rescan_leaf(fs_info, path, trans,
-                                                scratch_leaf);
+                       err = qgroup_rescan_leaf(fs_info, path, trans);
                }
                if (err > 0)
                        btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2297,7 +2304,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
        }
 
 out:
-       kfree(scratch_leaf);
        btrfs_free_path(path);
 
        mutex_lock(&fs_info->qgroup_rescan_lock);
@@ -2486,3 +2492,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
                btrfs_queue_work(fs_info->qgroup_rescan_workers,
                                 &fs_info->qgroup_rescan_work);
 }
+
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from the related qgroups or do
+ * nothing if the range is already reserved.
+ *
+ * Return 0 for successful reserve
+ * Return <0 for error (including -EDQUOT)
+ *
+ * NOTE: this function may sleep for memory allocation.
+ */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_changeset changeset;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       int ret;
+
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+           len == 0)
+               return 0;
+
+       changeset.bytes_changed = 0;
+       changeset.range_changed = ulist_alloc(GFP_NOFS);
+       if (!changeset.range_changed)
+               return -ENOMEM;
+
+       ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+                       start + len - 1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+                       &changeset);
+       trace_btrfs_qgroup_reserve_data(inode, start, len,
+                                       changeset.bytes_changed,
+                                       QGROUP_RESERVE);
+       if (ret < 0)
+               goto cleanup;
+       ret = qgroup_reserve(root, changeset.bytes_changed);
+       if (ret < 0)
+               goto cleanup;
+
+       ulist_free(changeset.range_changed);
+       return ret;
+
+cleanup:
+       /* cleanup already reserved ranges */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(changeset.range_changed, &uiter)))
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
+                                unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
+                                GFP_NOFS);
+       ulist_free(changeset.range_changed);
+       return ret;
+}
+
+static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
+                                      int free)
+{
+       struct extent_changeset changeset;
+       int trace_op = QGROUP_RELEASE;
+       int ret;
+
+       changeset.bytes_changed = 0;
+       changeset.range_changed = ulist_alloc(GFP_NOFS);
+       if (!changeset.range_changed)
+               return -ENOMEM;
+
+       ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+                       start + len - 1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+                       &changeset);
+       if (ret < 0)
+               goto out;
+
+       if (free) {
+               qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+               trace_op = QGROUP_FREE;
+       }
+       trace_btrfs_qgroup_release_data(inode, start, len,
+                                       changeset.bytes_changed, trace_op);
+out:
+       ulist_free(changeset.range_changed);
+       return ret;
+}
+
+/*
+ * Free a reserved space range from io_tree and related qgroups
+ *
+ * Should be called when a range of pages gets invalidated before reaching
+ * disk, or for the error cleanup case.
+ *
+ * For data written to disk, use btrfs_qgroup_release_data().
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+{
+       return __btrfs_qgroup_release_data(inode, start, len, 1);
+}
+
+/*
+ * Release a reserved space range from io_tree only.
+ *
+ * Should be called when a range of pages gets written to disk and the
+ * corresponding file extent item has been inserted into the root.
+ *
+ * Since the new qgroup accounting framework only updates qgroup numbers at
+ * commit_transaction() time, the reserved space shouldn't be freed from the
+ * related qgroups.
+ *
+ * But we should release the range from io_tree, to allow further write to be
+ * COWed.
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+{
+       return __btrfs_qgroup_release_data(inode, start, len, 0);
+}
+
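+/*
+ * Reserve qgroup space for metadata and track it in the per-root
+ * qgroup_meta_rsv counter, so it can be freed later by
+ * btrfs_qgroup_free_meta() or btrfs_qgroup_free_meta_all().
+ */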
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
+{
+       int ret;
+
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+           num_bytes == 0)
+               return 0;
+
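+       /* Metadata reservations must be aligned to the node size. */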
+       BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+       ret = qgroup_reserve(root, num_bytes);
+       if (ret < 0)
+               return ret;
+       atomic_add(num_bytes, &root->qgroup_meta_rsv);
+       return ret;
+}
+
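+/*
+ * Free all metadata space this root has reserved via
+ * btrfs_qgroup_reserve_meta() but not yet freed.
+ */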
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+{
+       int reserved;
+
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+               return;
+
+       reserved = atomic_xchg(&root->qgroup_meta_rsv, 0);
+       if (reserved == 0)
+               return;
+       qgroup_free(root, reserved);
+}
+
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+{
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+               return;
+
+       BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+       WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
+       atomic_sub(num_bytes, &root->qgroup_meta_rsv);
+       qgroup_free(root, num_bytes);
+}
+
+/*
+ * Check for leaked qgroup reserved space, normally at inode destroy
+ * time.
+ */
+void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+{
+       struct extent_changeset changeset;
+       struct ulist_node *unode;
+       struct ulist_iterator iter;
+       int ret;
+
+       changeset.bytes_changed = 0;
+       changeset.range_changed = ulist_alloc(GFP_NOFS);
+       if (WARN_ON(!changeset.range_changed))
+               return;
+
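+       /*
+        * Clear any EXTENT_QGROUP_RESERVED bits still set in the inode's
+        * io_tree and record them; anything left at this point was leaked.
+        */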
+       ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+                       EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
+
+       WARN_ON(ret < 0);
+       if (WARN_ON(changeset.bytes_changed)) {
+               ULIST_ITER_INIT(&iter);
+               while ((unode = ulist_next(changeset.range_changed, &iter))) {
+                       btrfs_warn(BTRFS_I(inode)->root->fs_info,
+                               "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
+                               inode->i_ino, unode->val, unode->aux);
+               }
+               qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+       }
+       ulist_free(changeset.range_changed);
+}
index 6387dcfa354c6ecac672a543bf1d6b8e587e911b..ecb2c143ef756bd0356e3968b3f9f13fe5ac21cd 100644 (file)
@@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record {
        struct ulist *old_roots;
 };
 
+/*
+ * For qgroup event trace points only
+ */
+#define QGROUP_RESERVE         (1<<0)
+#define QGROUP_RELEASE         (1<<1)
+#define QGROUP_FREE            (1<<2)
+
 int btrfs_quota_enable(struct btrfs_trans_handle *trans,
                       struct btrfs_fs_info *fs_info);
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
@@ -71,9 +78,18 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
                         struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
                         struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+                              u64 ref_root, u64 num_bytes);
+/*
+ * TODO: Add a proper trace point for this; btrfs_qgroup_free() is called
+ * from everywhere, so we can't provide a good trace for the delayed ref case.
+ */
+static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
+                                                u64 ref_root, u64 num_bytes)
+{
+       btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
+       trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes);
+}
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -81,4 +97,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
                               u64 rfer, u64 excl);
 #endif
 
+/* New io_tree based accurate qgroup reserve API */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_check_reserved_leak(struct inode *inode);
 #endif /* __BTRFS_QGROUP__ */
index fcf7265ca46fd84a65b647619f4e07ba996d7a9f..1a33d3eb36de184e4cbcf72efb26687857c70051 100644 (file)
@@ -810,7 +810,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
                        }
 
                        goto done_nolock;
-               } else  if (waitqueue_active(&h->wait)) {
+                       /*
+                        * The barrier for this waitqueue_active is not needed,
+                        * we're protected by h->lock and can't miss a wakeup.
+                        */
+               } else if (waitqueue_active(&h->wait)) {
                        spin_unlock(&rbio->bio_list_lock);
                        spin_unlock_irqrestore(&h->lock, flags);
                        wake_up(&h->wait);
index 4645cd16d5ba22d94396eec0f00e1243371eeb7c..619f92963e27102fb47a6f119b65451bdf959bef 100644 (file)
@@ -569,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
        rec = kzalloc(sizeof(*rec), GFP_NOFS);
        if (!rec) {
                reada_extent_put(root->fs_info, re);
-               return -1;
+               return -ENOMEM;
        }
 
        rec->rc = rc;
@@ -918,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
        u64 start;
        u64 generation;
        int level;
+       int ret;
        struct extent_buffer *node;
        static struct btrfs_key max_key = {
                .objectid = (u64)-1,
@@ -943,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
        generation = btrfs_header_generation(node);
        free_extent_buffer(node);
 
-       if (reada_add_block(rc, start, &max_key, level, generation)) {
+       ret = reada_add_block(rc, start, &max_key, level, generation);
+       if (ret) {
                kfree(rc);
-               return ERR_PTR(-ENOMEM);
+               return ERR_PTR(ret);
        }
 
        reada_start_machine(root->fs_info);
index 303babeef50579e66f48cb94d9668cb8a1731a6b..b4ca5454ef1a7ebc4fcb8eec476701bc8496a050 100644 (file)
@@ -1716,7 +1716,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
                ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
                                           num_bytes, parent,
                                           btrfs_header_owner(leaf),
-                                          key.objectid, key.offset, 1);
+                                          key.objectid, key.offset);
                if (ret) {
                        btrfs_abort_transaction(trans, root, ret);
                        break;
@@ -1724,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        parent, btrfs_header_owner(leaf),
-                                       key.objectid, key.offset, 1);
+                                       key.objectid, key.offset);
                if (ret) {
                        btrfs_abort_transaction(trans, root, ret);
                        break;
@@ -1900,23 +1900,21 @@ again:
 
                ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
                                        path->nodes[level]->start,
-                                       src->root_key.objectid, level - 1, 0,
-                                       1);
+                                       src->root_key.objectid, level - 1, 0);
                BUG_ON(ret);
                ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
                                        0, dest->root_key.objectid, level - 1,
-                                       0, 1);
+                                       0);
                BUG_ON(ret);
 
                ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
                                        path->nodes[level]->start,
-                                       src->root_key.objectid, level - 1, 0,
-                                       1);
+                                       src->root_key.objectid, level - 1, 0);
                BUG_ON(ret);
 
                ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
                                        0, dest->root_key.objectid, level - 1,
-                                       0, 1);
+                                       0);
                BUG_ON(ret);
 
                btrfs_unlock_up_safe(path, 0);
@@ -2418,7 +2416,7 @@ again:
        }
 out:
        if (ret) {
-               btrfs_std_error(root->fs_info, ret);
+               btrfs_std_error(root->fs_info, ret, NULL);
                if (!list_empty(&reloc_roots))
                        free_reloc_roots(&reloc_roots);
 
@@ -2745,7 +2743,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                                                node->eb->start, blocksize,
                                                upper->eb->start,
                                                btrfs_header_owner(upper->eb),
-                                               node->level, 0, 1);
+                                               node->level, 0);
                        BUG_ON(ret);
 
                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -3034,8 +3032,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
        BUG_ON(cluster->start != cluster->boundary[0]);
        mutex_lock(&inode->i_mutex);
 
-       ret = btrfs_check_data_free_space(inode, cluster->end +
-                                         1 - cluster->start, 0);
+       ret = btrfs_check_data_free_space(inode, cluster->start,
+                                         cluster->end + 1 - cluster->start);
        if (ret)
                goto out;
 
@@ -3056,8 +3054,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
                        break;
                nr++;
        }
-       btrfs_free_reserved_data_space(inode, cluster->end +
-                                      1 - cluster->start);
+       btrfs_free_reserved_data_space(inode, cluster->start,
+                                      cluster->end + 1 - cluster->start);
 out:
        mutex_unlock(&inode->i_mutex);
        return ret;
index 360a728a639fec403893c6847e1f7859a47dff13..7cf8509deda7c0ea6e98af02a8ab0acafbe016b2 100644 (file)
@@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
        if (!need_reset && btrfs_root_generation(item)
                != btrfs_root_generation_v2(item)) {
                if (btrfs_root_generation_v2(item) != 0) {
-                       printk(KERN_WARNING "BTRFS: mismatching "
+                       btrfs_warn(eb->fs_info,
+                                       "mismatching "
                                        "generation and generation_v2 "
                                        "found in root item. This root "
                                        "was probably mounted with an "
                                        "older kernel. Resetting all "
-                                       "new fields.\n");
+                                       "new fields.");
                }
                need_reset = 1;
        }
@@ -141,7 +142,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
        int ret;
        int slot;
        unsigned long ptr;
-       int old_len;
+       u32 old_len;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -283,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                        trans = btrfs_join_transaction(tree_root);
                        if (IS_ERR(trans)) {
                                err = PTR_ERR(trans);
-                               btrfs_error(tree_root->fs_info, err,
+                               btrfs_std_error(tree_root->fs_info, err,
                                            "Failed to start trans to delete "
                                            "orphan item");
                                break;
@@ -292,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                                                    root_key.objectid);
                        btrfs_end_transaction(trans, tree_root);
                        if (err) {
-                               btrfs_error(tree_root->fs_info, err,
+                               btrfs_std_error(tree_root->fs_info, err,
                                            "Failed to delete root orphan "
                                            "item");
                                break;
index a39f5d1144e8e0fe90b459f3c5528d5672be0d4a..550de89a8661af8fdb1c08783ffadde680c138ec 100644 (file)
@@ -580,9 +580,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
         * hold all of the paths here
         */
        for (i = 0; i < ipath->fspath->elem_cnt; ++i)
-               printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+               btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
                        "%s, sector %llu, root %llu, inode %llu, offset %llu, "
-                       "length %llu, links %u (path: %s)\n", swarn->errstr,
+                       "length %llu, links %u (path: %s)", swarn->errstr,
                        swarn->logical, rcu_str_deref(swarn->dev->name),
                        (unsigned long long)swarn->sector, root, inum, offset,
                        min(isize - offset, (u64)PAGE_SIZE), nlink,
@@ -592,9 +592,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
        return 0;
 
 err:
-       printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+       btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
                "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
-               "resolving failed with ret=%d\n", swarn->errstr,
+               "resolving failed with ret=%d", swarn->errstr,
                swarn->logical, rcu_str_deref(swarn->dev->name),
                (unsigned long long)swarn->sector, root, inum, offset, ret);
 
@@ -649,10 +649,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
                        ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
                                                      item_size, &ref_root,
                                                      &ref_level);
-                       printk_in_rcu(KERN_WARNING
-                               "BTRFS: %s at logical %llu on dev %s, "
+                       btrfs_warn_in_rcu(fs_info,
+                               "%s at logical %llu on dev %s, "
                                "sector %llu: metadata %s (level %d) in tree "
-                               "%llu\n", errstr, swarn.logical,
+                               "%llu", errstr, swarn.logical,
                                rcu_str_deref(dev->name),
                                (unsigned long long)swarn.sector,
                                ref_level ? "node" : "leaf",
@@ -850,8 +850,8 @@ out:
                btrfs_dev_replace_stats_inc(
                        &sctx->dev_root->fs_info->dev_replace.
                        num_uncorrectable_read_errors);
-               printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
-                   "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+               btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+                   "unable to fixup (nodatasum) error at logical %llu on dev %s",
                        fixup->logical, rcu_str_deref(fixup->dev->name));
        }
 
@@ -1230,8 +1230,8 @@ corrected_error:
                        sctx->stat.corrected_errors++;
                        sblock_to_check->data_corrected = 1;
                        spin_unlock(&sctx->stat_lock);
-                       printk_ratelimited_in_rcu(KERN_ERR
-                               "BTRFS: fixed up error at logical %llu on dev %s\n",
+                       btrfs_err_rl_in_rcu(fs_info,
+                               "fixed up error at logical %llu on dev %s",
                                logical, rcu_str_deref(dev->name));
                }
        } else {
@@ -1239,8 +1239,8 @@ did_not_correct_error:
                spin_lock(&sctx->stat_lock);
                sctx->stat.uncorrectable_errors++;
                spin_unlock(&sctx->stat_lock);
-               printk_ratelimited_in_rcu(KERN_ERR
-                       "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
+               btrfs_err_rl_in_rcu(fs_info,
+                       "unable to fixup (regular) error at logical %llu on dev %s",
                        logical, rcu_str_deref(dev->name));
        }
 
@@ -1626,9 +1626,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
                int ret;
 
                if (!page_bad->dev->bdev) {
-                       printk_ratelimited(KERN_WARNING "BTRFS: "
+                       btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info,
                                "scrub_repair_page_from_good_copy(bdev == NULL) "
-                               "is unexpected!\n");
+                               "is unexpected");
                        return -EIO;
                }
 
@@ -2201,15 +2201,15 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
                spin_lock(&sctx->stat_lock);
                sctx->stat.read_errors++;
                spin_unlock(&sctx->stat_lock);
-               printk_ratelimited_in_rcu(KERN_ERR
-                       "BTRFS: I/O error rebulding logical %llu for dev %s\n",
+               btrfs_err_rl_in_rcu(fs_info,
+                       "IO error rebuilding logical %llu for dev %s",
                        logical, rcu_str_deref(dev->name));
        } else if (sblock->header_error || sblock->checksum_error) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.uncorrectable_errors++;
                spin_unlock(&sctx->stat_lock);
-               printk_ratelimited_in_rcu(KERN_ERR
-                       "BTRFS: failed to rebuild valid logical %llu for dev %s\n",
+               btrfs_err_rl_in_rcu(fs_info,
+                       "failed to rebuild valid logical %llu for dev %s",
                        logical, rcu_str_deref(dev->name));
        } else {
                scrub_write_block_to_dev_replace(sblock);
@@ -4375,8 +4375,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
        if (!dev)
                return -EIO;
        if (!dev->bdev) {
-               printk_ratelimited(KERN_WARNING
-                       "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+               btrfs_warn_rl(dev->dev_root->fs_info,
+                       "scrub write_page_nocow(bdev == NULL) is unexpected");
                return -EIO;
        }
        bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
index a739b825bdd364cfa9cbf16edc9f978a68feb95f..355a458cba1abe29efb3410a6ae93261052ba1e9 100644 (file)
@@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
        }
 
        if (cur_clone_root) {
-               if (compressed != BTRFS_COMPRESS_NONE) {
-                       /*
-                        * Offsets given by iterate_extent_inodes() are relative
-                        * to the start of the extent, we need to add logical
-                        * offset from the file extent item.
-                        * (See why at backref.c:check_extent_in_eb())
-                        */
-                       cur_clone_root->offset += btrfs_file_extent_offset(eb,
-                                                                          fi);
-               }
                *found = cur_clone_root;
                ret = 0;
        } else {
@@ -2353,8 +2343,14 @@ static int send_subvol_begin(struct send_ctx *sctx)
        }
 
        TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
-       TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
-                       sctx->send_root->root_item.uuid);
+
+       if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
+               TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+                           sctx->send_root->root_item.received_uuid);
+       else
+               TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+                           sctx->send_root->root_item.uuid);
+
        TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
                    le64_to_cpu(sctx->send_root->root_item.ctransid));
        if (parent_root) {
@@ -2564,7 +2560,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
        } else if (S_ISSOCK(mode)) {
                cmd = BTRFS_SEND_C_MKSOCK;
        } else {
-               printk(KERN_WARNING "btrfs: unexpected inode type %o",
+               btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
                                (int)(mode & S_IFMT));
                ret = -ENOTSUPP;
                goto out;
@@ -4687,6 +4683,171 @@ tlv_put_failure:
        return ret;
 }
 
+static int send_extent_data(struct send_ctx *sctx,
+                           const u64 offset,
+                           const u64 len)
+{
+       u64 sent = 0;
+
+       if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+               return send_update_extent(sctx, offset, len);
+
+       while (sent < len) {
+               u64 size = len - sent;
+               int ret;
+
+               if (size > BTRFS_SEND_READ_SIZE)
+                       size = BTRFS_SEND_READ_SIZE;
+               ret = send_write(sctx, offset + sent, size);
+               if (ret < 0)
+                       return ret;
+               if (!ret)
+                       break;
+               sent += ret;
+       }
+       return 0;
+}
+
+static int clone_range(struct send_ctx *sctx,
+                      struct clone_root *clone_root,
+                      const u64 disk_byte,
+                      u64 data_offset,
+                      u64 offset,
+                      u64 len)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int ret;
+
+       path = alloc_path_for_send();
+       if (!path)
+               return -ENOMEM;
+
+       /*
+        * We can't send a clone operation for the entire range if we find
+        * extent items in the respective range in the source file that
+        * refer to different extents or if we find holes.
+        * So check for that and do a mix of clone and regular write/copy
+        * operations if needed.
+        *
+        * Example:
+        *
+        * mkfs.btrfs -f /dev/sda
+        * mount /dev/sda /mnt
+        * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
+        * cp --reflink=always /mnt/foo /mnt/bar
+        * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
+        * btrfs subvolume snapshot -r /mnt /mnt/snap
+        *
+        * If when we send the snapshot and we are processing file bar (which
+        * has a higher inode number than foo) we blindly send a clone operation
+        * for the [0, 100K[ range from foo to bar, the receiver ends up getting
+        * a file bar that matches the content of file foo - iow, doesn't match
+        * the content from bar in the original filesystem.
+        */
+       key.objectid = clone_root->ino;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = clone_root->offset;
+       ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+       if (ret > 0 && path->slots[0] > 0) {
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+               if (key.objectid == clone_root->ino &&
+                   key.type == BTRFS_EXTENT_DATA_KEY)
+                       path->slots[0]--;
+       }
+
+       while (true) {
+               struct extent_buffer *leaf = path->nodes[0];
+               int slot = path->slots[0];
+               struct btrfs_file_extent_item *ei;
+               u8 type;
+               u64 ext_len;
+               u64 clone_len;
+
+               if (slot >= btrfs_header_nritems(leaf)) {
+                       ret = btrfs_next_leaf(clone_root->root, path);
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
+                               break;
+                       continue;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+
+               /*
+                * We might have an implicit trailing hole (NO_HOLES feature
+                * enabled). We deal with it after leaving this loop.
+                */
+               if (key.objectid != clone_root->ino ||
+                   key.type != BTRFS_EXTENT_DATA_KEY)
+                       break;
+
+               ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+               type = btrfs_file_extent_type(leaf, ei);
+               if (type == BTRFS_FILE_EXTENT_INLINE) {
+                       ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
+                       ext_len = PAGE_CACHE_ALIGN(ext_len);
+               } else {
+                       ext_len = btrfs_file_extent_num_bytes(leaf, ei);
+               }
+
+               if (key.offset + ext_len <= clone_root->offset)
+                       goto next;
+
+               if (key.offset > clone_root->offset) {
+                       /* Implicit hole, NO_HOLES feature enabled. */
+                       u64 hole_len = key.offset - clone_root->offset;
+
+                       if (hole_len > len)
+                               hole_len = len;
+                       ret = send_extent_data(sctx, offset, hole_len);
+                       if (ret < 0)
+                               goto out;
+
+                       len -= hole_len;
+                       if (len == 0)
+                               break;
+                       offset += hole_len;
+                       clone_root->offset += hole_len;
+                       data_offset += hole_len;
+               }
+
+               if (key.offset >= clone_root->offset + len)
+                       break;
+
+               clone_len = min_t(u64, ext_len, len);
+
+               if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
+                   btrfs_file_extent_offset(leaf, ei) == data_offset)
+                       ret = send_clone(sctx, offset, clone_len, clone_root);
+               else
+                       ret = send_extent_data(sctx, offset, clone_len);
+
+               if (ret < 0)
+                       goto out;
+
+               len -= clone_len;
+               if (len == 0)
+                       break;
+               offset += clone_len;
+               clone_root->offset += clone_len;
+               data_offset += clone_len;
+next:
+               path->slots[0]++;
+       }
+
+       if (len > 0)
+               ret = send_extent_data(sctx, offset, len);
+       else
+               ret = 0;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
 static int send_write_or_clone(struct send_ctx *sctx,
                               struct btrfs_path *path,
                               struct btrfs_key *key,
@@ -4695,9 +4856,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
        int ret = 0;
        struct btrfs_file_extent_item *ei;
        u64 offset = key->offset;
-       u64 pos = 0;
        u64 len;
-       u32 l;
        u8 type;
        u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
 
@@ -4725,22 +4884,15 @@ static int send_write_or_clone(struct send_ctx *sctx,
        }
 
        if (clone_root && IS_ALIGNED(offset + len, bs)) {
-               ret = send_clone(sctx, offset, len, clone_root);
-       } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
-               ret = send_update_extent(sctx, offset, len);
+               u64 disk_byte;
+               u64 data_offset;
+
+               disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
+               data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
+               ret = clone_range(sctx, clone_root, disk_byte, data_offset,
+                                 offset, len);
        } else {
-               while (pos < len) {
-                       l = len - pos;
-                       if (l > BTRFS_SEND_READ_SIZE)
-                               l = BTRFS_SEND_READ_SIZE;
-                       ret = send_write(sctx, pos + offset, l);
-                       if (ret < 0)
-                               goto out;
-                       if (!ret)
-                               break;
-                       pos += ret;
-               }
-               ret = 0;
+               ret = send_extent_data(sctx, offset, len);
        }
 out:
        return ret;
index 11d1eab9234dc818244d1c1bbecd6d25981f4890..24154e422945167f474557887c62acaf6ed0779c 100644 (file)
@@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
        }
 }
 
-#ifdef CONFIG_PRINTK
 /*
  * __btrfs_std_error decodes expected errors from the caller and
  * invokes the appropriate error response.
@@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
                       unsigned int line, int errno, const char *fmt, ...)
 {
        struct super_block *sb = fs_info->sb;
+#ifdef CONFIG_PRINTK
        const char *errstr;
+#endif
 
        /*
         * Special case: if the error is EROFS, and we're already
@@ -149,6 +150,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
        if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
                return;
 
+#ifdef CONFIG_PRINTK
        errstr = btrfs_decode_error(errno);
        if (fmt) {
                struct va_format vaf;
@@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
                printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
                        sb->s_id, function, line, errno, errstr);
        }
+#endif
 
        /* Don't go through full error handling during mount */
        save_error_info(fs_info);
@@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
                btrfs_handle_error(fs_info);
 }
 
+#ifdef CONFIG_PRINTK
 static const char * const logtypes[] = {
        "emergency",
        "alert",
@@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 
        va_end(args);
 }
-
-#else
-
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
-                      unsigned int line, int errno, const char *fmt, ...)
-{
-       struct super_block *sb = fs_info->sb;
-
-       /*
-        * Special case: if the error is EROFS, and we're already
-        * under MS_RDONLY, then it is safe here.
-        */
-       if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
-               return;
-
-       /* Don't go through full error handling during mount */
-       if (sb->s_flags & MS_BORN) {
-               save_error_info(fs_info);
-               btrfs_handle_error(fs_info);
-       }
-}
 #endif
 
 /*
@@ -320,6 +303,9 @@ enum {
        Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
        Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
        Opt_datasum, Opt_treelog, Opt_noinode_cache,
+#ifdef CONFIG_BTRFS_DEBUG
+       Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+#endif
        Opt_err,
 };
 
@@ -372,6 +358,11 @@ static match_table_t tokens = {
        {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
        {Opt_fatal_errors, "fatal_errors=%s"},
        {Opt_commit_interval, "commit=%d"},
+#ifdef CONFIG_BTRFS_DEBUG
+       {Opt_fragment_data, "fragment=data"},
+       {Opt_fragment_metadata, "fragment=metadata"},
+       {Opt_fragment_all, "fragment=all"},
+#endif
        {Opt_err, NULL},
 };
 
@@ -738,6 +729,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                                info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
                        }
                        break;
+#ifdef CONFIG_BTRFS_DEBUG
+               case Opt_fragment_all:
+                       btrfs_info(root->fs_info, "fragmenting all space");
+                       btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+                       btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+                       break;
+               case Opt_fragment_metadata:
+                       btrfs_info(root->fs_info, "fragmenting metadata");
+                       btrfs_set_opt(info->mount_opt,
+                                     FRAGMENT_METADATA);
+                       break;
+               case Opt_fragment_data:
+                       btrfs_info(root->fs_info, "fragmenting data");
+                       btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+                       break;
+#endif
                case Opt_err:
                        btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
                        ret = -EINVAL;
@@ -1189,6 +1196,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
                seq_puts(seq, ",fatal_errors=panic");
        if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
                seq_printf(seq, ",commit=%d", info->commit_interval);
+#ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_test_opt(root, FRAGMENT_DATA))
+               seq_puts(seq, ",fragment=data");
+       if (btrfs_test_opt(root, FRAGMENT_METADATA))
+               seq_puts(seq, ",fragment=metadata");
+#endif
        seq_printf(seq, ",subvolid=%llu",
                  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
        seq_puts(seq, ",subvol=");
index 603b0cc2b9bbf627f0b07fec3dad216164e6ddf3..e0ac85949067c30191ce3635e2a65edbe8a56361 100644 (file)
@@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = {
        NULL,
 };
 
-static void btrfs_release_super_kobj(struct kobject *kobj)
+static void btrfs_release_fsid_kobj(struct kobject *kobj)
 {
        struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
 
-       memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+       memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject));
        complete(&fs_devs->kobj_unregister);
 }
 
 static struct kobj_type btrfs_ktype = {
        .sysfs_ops      = &kobj_sysfs_ops,
-       .release        = btrfs_release_super_kobj,
+       .release        = btrfs_release_fsid_kobj,
 };
 
 static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
 {
        if (kobj->ktype != &btrfs_ktype)
                return NULL;
-       return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+       return container_of(kobj, struct btrfs_fs_devices, fsid_kobj);
 }
 
 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
@@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
                        attrs[0] = &fa->kobj_attr.attr;
                        if (add) {
                                int ret;
-                               ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
+                               ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj,
                                                        &agroup);
                                if (ret)
                                        return ret;
                        } else
-                               sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
+                               sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj,
                                                    &agroup);
                }
 
@@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
                fs_devs->device_dir_kobj = NULL;
        }
 
-       if (fs_devs->super_kobj.state_initialized) {
-               kobject_del(&fs_devs->super_kobj);
-               kobject_put(&fs_devs->super_kobj);
+       if (fs_devs->fsid_kobj.state_initialized) {
+               kobject_del(&fs_devs->fsid_kobj);
+               kobject_put(&fs_devs->fsid_kobj);
                wait_for_completion(&fs_devs->kobj_unregister);
        }
 }
@@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
        }
 }
 
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
 {
        btrfs_reset_fs_info_ptr(fs_info);
 
@@ -555,9 +555,9 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
                kobject_put(fs_info->space_info_kobj);
        }
        addrm_unknown_feature_attrs(fs_info, false);
-       sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
-       sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
-       btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
+       sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
+       sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
+       btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
 }
 
 const char * const btrfs_feature_set_names[3] = {
@@ -637,7 +637,7 @@ static void init_feature_attrs(void)
 
 /* when one_device is NULL, it removes all device links */
 
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
                struct btrfs_device *one_device)
 {
        struct hd_struct *disk;
@@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
 {
        if (!fs_devs->device_dir_kobj)
                fs_devs->device_dir_kobj = kobject_create_and_add("devices",
-                                               &fs_devs->super_kobj);
+                                               &fs_devs->fsid_kobj);
 
        if (!fs_devs->device_dir_kobj)
                return -ENOMEM;
@@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
        return 0;
 }
 
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
                                struct btrfs_device *one_device)
 {
        int error = 0;
@@ -730,31 +730,31 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
        int error;
 
        init_completion(&fs_devs->kobj_unregister);
-       fs_devs->super_kobj.kset = btrfs_kset;
-       error = kobject_init_and_add(&fs_devs->super_kobj,
+       fs_devs->fsid_kobj.kset = btrfs_kset;
+       error = kobject_init_and_add(&fs_devs->fsid_kobj,
                                &btrfs_ktype, parent, "%pU", fs_devs->fsid);
        return error;
 }
 
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
 {
        int error;
        struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
-       struct kobject *super_kobj = &fs_devs->super_kobj;
+       struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
 
        btrfs_set_fs_info_ptr(fs_info);
 
-       error = btrfs_kobj_add_device(fs_devs, NULL);
+       error = btrfs_sysfs_add_device_link(fs_devs, NULL);
        if (error)
                return error;
 
-       error = sysfs_create_files(super_kobj, btrfs_attrs);
+       error = sysfs_create_files(fsid_kobj, btrfs_attrs);
        if (error) {
-               btrfs_kobj_rm_device(fs_devs, NULL);
+               btrfs_sysfs_rm_device_link(fs_devs, NULL);
                return error;
        }
 
-       error = sysfs_create_group(super_kobj,
+       error = sysfs_create_group(fsid_kobj,
                                   &btrfs_feature_attr_group);
        if (error)
                goto failure;
@@ -764,7 +764,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
                goto failure;
 
        fs_info->space_info_kobj = kobject_create_and_add("allocation",
-                                                 super_kobj);
+                                                 fsid_kobj);
        if (!fs_info->space_info_kobj) {
                error = -ENOMEM;
                goto failure;
@@ -776,7 +776,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 
        return 0;
 failure:
-       btrfs_sysfs_remove_one(fs_info);
+       btrfs_sysfs_remove_mounted(fs_info);
        return error;
 }
 
index 6392527bcc15d5e56b66d39a3faac72fba730e2f..9c09522125a6b26d0577e139c38ebad7ca5c8331 100644 (file)
@@ -82,9 +82,9 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
 extern const char * const btrfs_feature_set_names[3];
 extern struct kobj_type space_info_ktype;
 extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
                struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
                 struct btrfs_device *one_device);
 int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
                                struct kobject *parent);
index 2299bfde39eec666fe1b0876733746a76673f5a5..c8c3d70c31ffad4e02acd04e0f7dcaa54ad0fe2b 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include "btrfs-tests.h"
 #include "../ctree.h"
+#include "../disk-io.h"
 #include "../free-space-cache.h"
 
 #define BITS_PER_BITMAP                (PAGE_CACHE_SIZE * 8)
@@ -35,6 +36,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
                kfree(cache);
                return NULL;
        }
+       cache->fs_info = btrfs_alloc_dummy_fs_info();
+       if (!cache->fs_info) {
+               kfree(cache->free_space_ctl);
+               kfree(cache);
+               return NULL;
+       }
 
        cache->key.objectid = 0;
        cache->key.offset = 1024 * 1024 * 1024;
@@ -879,7 +886,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 int btrfs_test_free_space_cache(void)
 {
        struct btrfs_block_group_cache *cache;
-       int ret;
+       struct btrfs_root *root = NULL;
+       int ret = -ENOMEM;
 
        test_msg("Running btrfs free space cache tests\n");
 
@@ -889,6 +897,17 @@ int btrfs_test_free_space_cache(void)
                return 0;
        }
 
+       root = btrfs_alloc_dummy_root();
+       if (!root)
+               goto out;
+
+       root->fs_info = btrfs_alloc_dummy_fs_info();
+       if (!root->fs_info)
+               goto out;
+
+       root->fs_info->extent_root = root;
+       cache->fs_info = root->fs_info;
+
        ret = test_extents(cache);
        if (ret)
                goto out;
@@ -904,6 +923,7 @@ out:
        __btrfs_remove_free_space_cache(cache->free_space_ctl);
        kfree(cache->free_space_ctl);
        kfree(cache);
+       btrfs_free_dummy_root(root);
        test_msg("Free space cache tests finished\n");
        return ret;
 }
index a5b06442f0bf9d1630f201da3e0eb5c0422e8cc9..418c6a2ad7d88658f8624d99a1ba0e9e84c13d45 100644 (file)
@@ -82,6 +82,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 static void clear_btree_io_tree(struct extent_io_tree *tree)
 {
        spin_lock(&tree->lock);
+       /*
+        * Do a single barrier for the waitqueue_active check here, the state
+        * of the waitqueue should not change once clear_btree_io_tree is
+        * called.
+        */
+       smp_mb();
        while (!RB_EMPTY_ROOT(&tree->state)) {
                struct rb_node *node;
                struct extent_state *state;
@@ -226,25 +232,22 @@ loop:
        extwriter_counter_init(cur_trans, type);
        init_waitqueue_head(&cur_trans->writer_wait);
        init_waitqueue_head(&cur_trans->commit_wait);
+       init_waitqueue_head(&cur_trans->pending_wait);
        cur_trans->state = TRANS_STATE_RUNNING;
        /*
         * One for this trans handle, one so it will live on until we
         * commit the transaction.
         */
        atomic_set(&cur_trans->use_count, 2);
-       cur_trans->have_free_bgs = 0;
+       atomic_set(&cur_trans->pending_ordered, 0);
+       cur_trans->flags = 0;
        cur_trans->start_time = get_seconds();
-       cur_trans->dirty_bg_run = 0;
+
+       memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
 
        cur_trans->delayed_refs.href_root = RB_ROOT;
        cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
        atomic_set(&cur_trans->delayed_refs.num_entries, 0);
-       cur_trans->delayed_refs.num_heads_ready = 0;
-       cur_trans->delayed_refs.pending_csums = 0;
-       cur_trans->delayed_refs.num_heads = 0;
-       cur_trans->delayed_refs.flushing = 0;
-       cur_trans->delayed_refs.run_delayed_start = 0;
-       cur_trans->delayed_refs.qgroup_to_skip = 0;
 
        /*
         * although the tree mod log is per file system and not per transaction,
@@ -264,7 +267,6 @@ loop:
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        INIT_LIST_HEAD(&cur_trans->pending_chunks);
        INIT_LIST_HEAD(&cur_trans->switch_commits);
-       INIT_LIST_HEAD(&cur_trans->pending_ordered);
        INIT_LIST_HEAD(&cur_trans->dirty_bgs);
        INIT_LIST_HEAD(&cur_trans->io_bgs);
        INIT_LIST_HEAD(&cur_trans->dropped_roots);
@@ -447,8 +449,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
 }
 
 static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
-                 enum btrfs_reserve_flush_enum flush)
+start_transaction(struct btrfs_root *root, unsigned int num_items,
+                 unsigned int type, enum btrfs_reserve_flush_enum flush)
 {
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
@@ -478,13 +480,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
         * the appropriate flushing if need be.
         */
        if (num_items > 0 && root != root->fs_info->chunk_root) {
-               if (root->fs_info->quota_enabled &&
-                   is_fstree(root->root_key.objectid)) {
-                       qgroup_reserved = num_items * root->nodesize;
-                       ret = btrfs_qgroup_reserve(root, qgroup_reserved);
-                       if (ret)
-                               return ERR_PTR(ret);
-               }
+               qgroup_reserved = num_items * root->nodesize;
+               ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+               if (ret)
+                       return ERR_PTR(ret);
 
                num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
                /*
@@ -502,7 +501,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
                        goto reserve_fail;
        }
 again:
-       h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+       h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h) {
                ret = -ENOMEM;
                goto alloc_fail;
@@ -543,26 +542,13 @@ again:
 
        h->transid = cur_trans->transid;
        h->transaction = cur_trans;
-       h->blocks_used = 0;
-       h->bytes_reserved = 0;
-       h->chunk_bytes_reserved = 0;
        h->root = root;
-       h->delayed_ref_updates = 0;
        h->use_count = 1;
-       h->adding_csums = 0;
-       h->block_rsv = NULL;
-       h->orig_rsv = NULL;
-       h->aborted = 0;
-       h->qgroup_reserved = 0;
-       h->delayed_ref_elem.seq = 0;
+
        h->type = type;
-       h->allocating_chunk = false;
        h->can_flush_pending_bgs = true;
-       h->reloc_reserved = false;
-       h->sync = false;
        INIT_LIST_HEAD(&h->qgroup_ref_list);
        INIT_LIST_HEAD(&h->new_bgs);
-       INIT_LIST_HEAD(&h->ordered);
 
        smp_mb();
        if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -579,7 +565,6 @@ again:
                h->bytes_reserved = num_bytes;
                h->reloc_reserved = reloc_reserved;
        }
-       h->qgroup_reserved = qgroup_reserved;
 
 got_it:
        btrfs_record_root_in_trans(h, root);
@@ -597,20 +582,20 @@ alloc_fail:
                btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
                                        num_bytes);
 reserve_fail:
-       if (qgroup_reserved)
-               btrfs_qgroup_free(root, qgroup_reserved);
+       btrfs_qgroup_free_meta(root, qgroup_reserved);
        return ERR_PTR(ret);
 }
 
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-                                                  int num_items)
+                                                  unsigned int num_items)
 {
        return start_transaction(root, num_items, TRANS_START,
                                 BTRFS_RESERVE_FLUSH_ALL);
 }
 
 struct btrfs_trans_handle *btrfs_start_transaction_lflush(
-                                       struct btrfs_root *root, int num_items)
+                                       struct btrfs_root *root,
+                                       unsigned int num_items)
 {
        return start_transaction(root, num_items, TRANS_START,
                                 BTRFS_RESERVE_FLUSH_LIMIT);
@@ -794,12 +779,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
 
-       if (!list_empty(&trans->ordered)) {
-               spin_lock(&info->trans_lock);
-               list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
-               spin_unlock(&info->trans_lock);
-       }
-
        trans->delayed_ref_updates = 0;
        if (!trans->sync) {
                must_run_delayed_refs =
@@ -815,15 +794,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                        must_run_delayed_refs = 2;
        }
 
-       if (trans->qgroup_reserved) {
-               /*
-                * the same root has to be passed here between start_transaction
-                * and end_transaction. Subvolume quota depends on this.
-                */
-               btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
-
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
 
@@ -856,6 +826,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        atomic_dec(&cur_trans->num_writers);
        extwriter_counter_dec(cur_trans, trans->type);
 
+       /*
+        * Make sure counter is updated before we wake up waiters.
+        */
        smp_mb();
        if (waitqueue_active(&cur_trans->writer_wait))
                wake_up(&cur_trans->writer_wait);
@@ -1238,6 +1211,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        spin_lock(&fs_info->fs_roots_radix_lock);
                        if (err)
                                break;
+                       btrfs_qgroup_free_meta_all(root);
                }
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1795,25 +1769,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
 }
 
 static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
-                          struct btrfs_fs_info *fs_info)
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
 {
-       struct btrfs_ordered_extent *ordered;
-
-       spin_lock(&fs_info->trans_lock);
-       while (!list_empty(&cur_trans->pending_ordered)) {
-               ordered = list_first_entry(&cur_trans->pending_ordered,
-                                          struct btrfs_ordered_extent,
-                                          trans_list);
-               list_del_init(&ordered->trans_list);
-               spin_unlock(&fs_info->trans_lock);
-
-               wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
-                                                  &ordered->flags));
-               btrfs_put_ordered_extent(ordered);
-               spin_lock(&fs_info->trans_lock);
-       }
-       spin_unlock(&fs_info->trans_lock);
+       wait_event(cur_trans->pending_wait,
+                  atomic_read(&cur_trans->pending_ordered) == 0);
 }
 
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -1842,10 +1801,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
-       if (trans->qgroup_reserved) {
-               btrfs_qgroup_free(root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
 
        cur_trans = trans->transaction;
 
@@ -1865,7 +1820,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                return ret;
        }
 
-       if (!cur_trans->dirty_bg_run) {
+       if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
                int run_it = 0;
 
                /* this mutex is also taken before trying to set
@@ -1874,18 +1829,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                 * after a extents from that block group have been
                 * allocated for cache files.  btrfs_set_block_group_ro
                 * will wait for the transaction to commit if it
-                * finds dirty_bg_run = 1
+                * finds BTRFS_TRANS_DIRTY_BG_RUN set.
                 *
-                * The dirty_bg_run flag is also used to make sure only
-                * one process starts all the block group IO.  It wouldn't
+                * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+                * only one process starts all the block group IO.  It wouldn't
                 * hurt to have more than one go through, but there's no
                 * real advantage to it either.
                 */
                mutex_lock(&root->fs_info->ro_block_group_mutex);
-               if (!cur_trans->dirty_bg_run) {
+               if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+                                     &cur_trans->flags))
                        run_it = 1;
-                       cur_trans->dirty_bg_run = 1;
-               }
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
 
                if (run_it)
@@ -1897,7 +1851,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        }
 
        spin_lock(&root->fs_info->trans_lock);
-       list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
        if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
                spin_unlock(&root->fs_info->trans_lock);
                atomic_inc(&cur_trans->use_count);
@@ -1956,7 +1909,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_wait_delalloc_flush(root->fs_info);
 
-       btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+       btrfs_wait_pending_ordered(cur_trans);
 
        btrfs_scrub_pause(root);
        /*
@@ -2136,7 +2089,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        ret = btrfs_write_and_wait_transaction(trans, root);
        if (ret) {
-               btrfs_error(root->fs_info, ret,
+               btrfs_std_error(root->fs_info, ret,
                            "Error while writing out transaction");
                mutex_unlock(&root->fs_info->tree_log_mutex);
                goto scrub_continue;
@@ -2156,7 +2109,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_finish_extent_commit(trans, root);
 
-       if (cur_trans->have_free_bgs)
+       if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
                btrfs_clear_space_info_full(root->fs_info);
 
        root->fs_info->last_trans_committed = cur_trans->transid;
@@ -2198,10 +2151,6 @@ cleanup_transaction:
        btrfs_trans_release_metadata(trans, root);
        btrfs_trans_release_chunk_metadata(trans);
        trans->block_rsv = NULL;
-       if (trans->qgroup_reserved) {
-               btrfs_qgroup_free(root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
        btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
        if (current->journal_info == trans)
                current->journal_info = NULL;
index a994bb097ee59c12bb0d5599f10c8a64b1954f43..b05b2f64d9133313f1af231c4a4254ff16972f00 100644 (file)
@@ -32,6 +32,10 @@ enum btrfs_trans_state {
        TRANS_STATE_MAX                 = 6,
 };
 
+#define BTRFS_TRANS_HAVE_FREE_BGS      0
+#define BTRFS_TRANS_DIRTY_BG_RUN       1
+#define BTRFS_TRANS_CACHE_ENOSPC       2
+
 struct btrfs_transaction {
        u64 transid;
        /*
@@ -46,11 +50,9 @@ struct btrfs_transaction {
         */
        atomic_t num_writers;
        atomic_t use_count;
+       atomic_t pending_ordered;
 
-       /*
-        * true if there is free bgs operations in this transaction
-        */
-       int have_free_bgs;
+       unsigned long flags;
 
        /* Be protected by fs_info->trans_lock when we want to change it. */
        enum btrfs_trans_state state;
@@ -59,9 +61,9 @@ struct btrfs_transaction {
        unsigned long start_time;
        wait_queue_head_t writer_wait;
        wait_queue_head_t commit_wait;
+       wait_queue_head_t pending_wait;
        struct list_head pending_snapshots;
        struct list_head pending_chunks;
-       struct list_head pending_ordered;
        struct list_head switch_commits;
        struct list_head dirty_bgs;
        struct list_head io_bgs;
@@ -80,7 +82,6 @@ struct btrfs_transaction {
        spinlock_t dropped_roots_lock;
        struct btrfs_delayed_ref_root delayed_refs;
        int aborted;
-       int dirty_bg_run;
 };
 
 #define __TRANS_FREEZABLE      (1U << 0)
@@ -107,7 +108,6 @@ struct btrfs_trans_handle {
        u64 transid;
        u64 bytes_reserved;
        u64 chunk_bytes_reserved;
-       u64 qgroup_reserved;
        unsigned long use_count;
        unsigned long blocks_reserved;
        unsigned long blocks_used;
@@ -129,7 +129,6 @@ struct btrfs_trans_handle {
         */
        struct btrfs_root *root;
        struct seq_list delayed_ref_elem;
-       struct list_head ordered;
        struct list_head qgroup_ref_list;
        struct list_head new_bgs;
 };
@@ -185,9 +184,10 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-                                                  int num_items);
+                                                  unsigned int num_items);
 struct btrfs_trans_handle *btrfs_start_transaction_lflush(
-                                       struct btrfs_root *root, int num_items);
+                                       struct btrfs_root *root,
+                                       unsigned int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
index 1bbaace733838e6fc70dba1babafb6f9265c00f9..323e12cc9d2f522388fe929ed66d4dd946b5881d 100644 (file)
@@ -229,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
 void btrfs_end_log_trans(struct btrfs_root *root)
 {
        if (atomic_dec_and_test(&root->log_writers)) {
-               smp_mb();
+               /*
+                * Implicit memory barrier after atomic_dec_and_test
+                */
                if (waitqueue_active(&root->log_writer_wait))
                        wake_up(&root->log_writer_wait);
        }
@@ -691,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                ret = btrfs_inc_extent_ref(trans, root,
                                                ins.objectid, ins.offset,
                                                0, root->root_key.objectid,
-                                               key->objectid, offset, 0);
+                                               key->objectid, offset);
                                if (ret)
                                        goto out;
                        } else {
@@ -2820,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        mutex_lock(&log_root_tree->log_mutex);
        if (atomic_dec_and_test(&log_root_tree->log_writers)) {
-               smp_mb();
+               /*
+                * Implicit memory barrier after atomic_dec_and_test
+                */
                if (waitqueue_active(&log_root_tree->log_writer_wait))
                        wake_up(&log_root_tree->log_writer_wait);
        }
@@ -2950,6 +2954,9 @@ out_wake_log_root:
        atomic_set(&log_root_tree->log_commit[index2], 0);
        mutex_unlock(&log_root_tree->log_mutex);
 
+       /*
+        * The barrier before waitqueue_active is implied by mutex_unlock
+        */
        if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
                wake_up(&log_root_tree->log_commit_wait[index2]);
 out:
@@ -2961,6 +2968,9 @@ out:
        atomic_set(&root->log_commit[index1], 0);
        mutex_unlock(&root->log_mutex);
 
+       /*
+        * The barrier before waitqueue_active is implied by mutex_unlock
+        */
        if (waitqueue_active(&root->log_commit_wait[index1]))
                wake_up(&root->log_commit_wait[index1]);
        return ret;
@@ -5314,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 
        ret = walk_log_tree(trans, log_root_tree, &wc);
        if (ret) {
-               btrfs_error(fs_info, ret, "Failed to pin buffers while "
+               btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
                            "recovering log root tree.");
                goto error;
        }
@@ -5328,7 +5338,7 @@ again:
                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
 
                if (ret < 0) {
-                       btrfs_error(fs_info, ret,
+                       btrfs_std_error(fs_info, ret,
                                    "Couldn't find tree log root.");
                        goto error;
                }
@@ -5346,7 +5356,7 @@ again:
                log = btrfs_read_fs_root(log_root_tree, &found_key);
                if (IS_ERR(log)) {
                        ret = PTR_ERR(log);
-                       btrfs_error(fs_info, ret,
+                       btrfs_std_error(fs_info, ret,
                                    "Couldn't read tree log root.");
                        goto error;
                }
@@ -5361,7 +5371,7 @@ again:
                        free_extent_buffer(log->node);
                        free_extent_buffer(log->commit_root);
                        kfree(log);
-                       btrfs_error(fs_info, ret, "Couldn't read target root "
+                       btrfs_std_error(fs_info, ret, "Couldn't read target root "
                                    "for tree log recovery.");
                        goto error;
                }
index 6fc735869c186c35fb79fa66decc7d3519ed2e93..17ed76d18eb6e99a9b6c931a90371015f1eae6d4 100644 (file)
 #include "dev-replace.h"
 #include "sysfs.h"
 
+const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+       [BTRFS_RAID_RAID10] = {
+               .sub_stripes    = 2,
+               .dev_stripes    = 1,
+               .devs_max       = 0,    /* 0 == as many as possible */
+               .devs_min       = 4,
+               .tolerated_failures = 1,
+               .devs_increment = 2,
+               .ncopies        = 2,
+       },
+       [BTRFS_RAID_RAID1] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 1,
+               .devs_max       = 2,
+               .devs_min       = 2,
+               .tolerated_failures = 1,
+               .devs_increment = 2,
+               .ncopies        = 2,
+       },
+       [BTRFS_RAID_DUP] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 2,
+               .devs_max       = 1,
+               .devs_min       = 1,
+               .tolerated_failures = 0,
+               .devs_increment = 1,
+               .ncopies        = 2,
+       },
+       [BTRFS_RAID_RAID0] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 1,
+               .devs_max       = 0,
+               .devs_min       = 2,
+               .tolerated_failures = 0,
+               .devs_increment = 1,
+               .ncopies        = 1,
+       },
+       [BTRFS_RAID_SINGLE] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 1,
+               .devs_max       = 1,
+               .devs_min       = 1,
+               .tolerated_failures = 0,
+               .devs_increment = 1,
+               .ncopies        = 1,
+       },
+       [BTRFS_RAID_RAID5] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 1,
+               .devs_max       = 0,
+               .devs_min       = 2,
+               .tolerated_failures = 1,
+               .devs_increment = 1,
+               .ncopies        = 2,
+       },
+       [BTRFS_RAID_RAID6] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 1,
+               .devs_max       = 0,
+               .devs_min       = 3,
+               .tolerated_failures = 2,
+               .devs_increment = 1,
+               .ncopies        = 3,
+       },
+};
+
+const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
+       [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
+       [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
+       [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
+       [BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
+       [BTRFS_RAID_SINGLE] = 0,
+       [BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
+       [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
+};
+
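
The raid attribute table and the profile lookup array are now defined here and exported through volumes.h (see the extern declarations in the volumes.h hunk further down), so callers outside volumes.c can map a block group profile to its attributes, including the new tolerated_failures field. A hedged sketch of such a lookup; the helper name is hypothetical and only the two arrays above plus existing btrfs constants are assumed:

    /* hypothetical helper built on btrfs_raid_array/btrfs_raid_group */
    static int profile_tolerated_failures(u64 bg_flags)
    {
            u64 profile = bg_flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
            int i;

            for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
                    if (btrfs_raid_group[i] == profile)
                            return btrfs_raid_array[i].tolerated_failures;
            }
            /* defensive fallback; all valid profiles match above */
            return btrfs_raid_array[BTRFS_RAID_SINGLE].tolerated_failures;
    }
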
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
@@ -198,7 +274,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 
        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
-               printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
                goto error;
        }
 
@@ -211,8 +286,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
        }
        invalidate_bdev(*bdev);
        *bh = btrfs_read_dev_super(*bdev);
-       if (!*bh) {
-               ret = -EINVAL;
+       if (IS_ERR(*bh)) {
+               ret = PTR_ERR(*bh);
                blkdev_put(*bdev, flags);
                goto error;
        }
@@ -345,6 +420,9 @@ loop_lock:
                pending = pending->bi_next;
                cur->bi_next = NULL;
 
+               /*
+                * atomic_dec_return implies a barrier for waitqueue_active
+                */
                if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);
@@ -765,36 +843,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 
        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
-               struct btrfs_device *new_device;
-               struct rcu_string *name;
-
-               if (device->bdev)
-                       fs_devices->open_devices--;
-
-               if (device->writeable &&
-                   device->devid != BTRFS_DEV_REPLACE_DEVID) {
-                       list_del_init(&device->dev_alloc_list);
-                       fs_devices->rw_devices--;
-               }
-
-               if (device->missing)
-                       fs_devices->missing_devices--;
-
-               new_device = btrfs_alloc_device(NULL, &device->devid,
-                                               device->uuid);
-               BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
-               /* Safe because we are under uuid_mutex */
-               if (device->name) {
-                       name = rcu_string_strdup(device->name->str, GFP_NOFS);
-                       BUG_ON(!name); /* -ENOMEM */
-                       rcu_assign_pointer(new_device->name, name);
-               }
-
-               list_replace_rcu(&device->dev_list, &new_device->dev_list);
-               new_device->fs_devices = device->fs_devices;
-
-               call_rcu(&device->rcu, free_device);
+               btrfs_close_one_device(device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);
 
@@ -1402,7 +1451,7 @@ again:
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        } else {
-               btrfs_error(root->fs_info, ret, "Slot search failed");
+               btrfs_std_error(root->fs_info, ret, "Slot search failed");
                goto out;
        }
 
@@ -1410,10 +1459,10 @@ again:
 
        ret = btrfs_del_item(trans, root, path);
        if (ret) {
-               btrfs_error(root->fs_info, ret,
+               btrfs_std_error(root->fs_info, ret,
                            "Failed to remove dev extent item");
        } else {
-               trans->transaction->have_free_bgs = 1;
+               set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
        }
 out:
        btrfs_free_path(path);
@@ -1801,7 +1850,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        if (device->bdev) {
                device->fs_devices->open_devices--;
                /* remove sysfs entry */
-               btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+               btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
        }
 
        call_rcu(&device->rcu, free_device);
@@ -1924,7 +1973,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
        if (srcdev->writeable) {
                fs_devices->rw_devices--;
                /* zero out the old super if it is writable */
-               btrfs_scratch_superblock(srcdev);
+               btrfs_scratch_superblocks(srcdev->bdev,
+                                       rcu_str_deref(srcdev->name));
        }
 
        if (srcdev->bdev)
@@ -1971,10 +2021,11 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
        WARN_ON(!tgtdev);
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
 
-       btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+       btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
 
        if (tgtdev->bdev) {
-               btrfs_scratch_superblock(tgtdev);
+               btrfs_scratch_superblocks(tgtdev->bdev,
+                                       rcu_str_deref(tgtdev->name));
                fs_info->fs_devices->open_devices--;
        }
        fs_info->fs_devices->num_devices--;
@@ -2041,10 +2092,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
                        }
                }
 
-               if (!*device) {
-                       btrfs_err(root->fs_info, "no missing device found");
-                       return -ENOENT;
-               }
+               if (!*device)
+                       return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
 
                return 0;
        } else {
@@ -2309,7 +2358,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
                                    tmp + 1);
 
        /* add sysfs device entry */
-       btrfs_kobj_add_device(root->fs_info->fs_devices, device);
+       btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device);
 
        /*
         * we've got more storage, clear any full flags on the space
@@ -2350,9 +2399,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
                 */
                snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
                                                root->fs_info->fsid);
-               if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+               if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj,
                                                                fsid_buf))
-                       pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
+                       btrfs_warn(root->fs_info,
+                               "sysfs: failed to create fsid for sprout");
        }
 
        root->fs_info->num_tolerated_disk_barrier_failures =
@@ -2368,7 +2418,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
                ret = btrfs_relocate_sys_chunks(root);
                if (ret < 0)
-                       btrfs_error(root->fs_info, ret,
+                       btrfs_std_error(root->fs_info, ret,
                                    "Failed to relocate sys chunks after "
                                    "device initialization. This can be fixed "
                                    "using the \"btrfs balance\" command.");
@@ -2388,7 +2438,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 error_trans:
        btrfs_end_transaction(trans, root);
        rcu_string_free(device->name);
-       btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+       btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
        kfree(device);
 error:
        blkdev_put(bdev, FMODE_EXCL);
@@ -2613,7 +2663,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
        if (ret < 0)
                goto out;
        else if (ret > 0) { /* Logic error or corruption */
-               btrfs_error(root->fs_info, -ENOENT,
+               btrfs_std_error(root->fs_info, -ENOENT,
                            "Failed lookup while freeing chunk.");
                ret = -ENOENT;
                goto out;
@@ -2621,7 +2671,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 
        ret = btrfs_del_item(trans, root, path);
        if (ret < 0)
-               btrfs_error(root->fs_info, ret,
+               btrfs_std_error(root->fs_info, ret,
                            "Failed to delete chunk item.");
 out:
        btrfs_free_path(path);
@@ -2806,7 +2856,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
-               btrfs_std_error(root->fs_info, ret);
+               btrfs_std_error(root->fs_info, ret, NULL);
                return ret;
        }
 
@@ -3009,16 +3059,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
         * (albeit full) chunks.
         */
        if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
            !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
                bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
                bctl->data.usage = 90;
        }
        if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
                bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
                bctl->sys.usage = 90;
        }
        if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
                bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
                bctl->meta.usage = 90;
@@ -3072,6 +3125,39 @@ static int chunk_profiles_filter(u64 chunk_type,
 
 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
                              struct btrfs_balance_args *bargs)
+{
+       struct btrfs_block_group_cache *cache;
+       u64 chunk_used;
+       u64 user_thresh_min;
+       u64 user_thresh_max;
+       int ret = 1;
+
+       cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+       chunk_used = btrfs_block_group_used(&cache->item);
+
+       if (bargs->usage_min == 0)
+               user_thresh_min = 0;
+       else
+               user_thresh_min = div_factor_fine(cache->key.offset,
+                                       bargs->usage_min);
+
+       if (bargs->usage_max == 0)
+               user_thresh_max = 1;
+       else if (bargs->usage_max > 100)
+               user_thresh_max = cache->key.offset;
+       else
+               user_thresh_max = div_factor_fine(cache->key.offset,
+                                       bargs->usage_max);
+
+       if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
+               ret = 0;
+
+       btrfs_put_block_group(cache);
+       return ret;
+}
+
+static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info,
+               u64 chunk_offset, struct btrfs_balance_args *bargs)
 {
        struct btrfs_block_group_cache *cache;
        u64 chunk_used, user_thresh;
@@ -3080,7 +3166,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
        chunk_used = btrfs_block_group_used(&cache->item);
 
-       if (bargs->usage == 0)
+       if (bargs->usage_min == 0)
                user_thresh = 1;
        else if (bargs->usage > 100)
                user_thresh = cache->key.offset;
@@ -3170,6 +3256,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf,
        return 1;
 }
 
+static int chunk_stripes_range_filter(struct extent_buffer *leaf,
+                              struct btrfs_chunk *chunk,
+                              struct btrfs_balance_args *bargs)
+{
+       int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+       if (bargs->stripes_min <= num_stripes
+                       && num_stripes <= bargs->stripes_max)
+               return 0;
+
+       return 1;
+}
+
 static int chunk_soft_convert_filter(u64 chunk_type,
                                     struct btrfs_balance_args *bargs)
 {
@@ -3216,6 +3315,9 @@ static int should_balance_chunk(struct btrfs_root *root,
        if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
            chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
                return 0;
+       } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
+           chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) {
+               return 0;
        }
 
        /* devid filter */
@@ -3236,6 +3338,12 @@ static int should_balance_chunk(struct btrfs_root *root,
                return 0;
        }
 
+       /* stripes filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
+           chunk_stripes_range_filter(leaf, chunk, bargs)) {
+               return 0;
+       }
+
        /* soft profile changing mode */
        if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
            chunk_soft_convert_filter(chunk_type, bargs)) {
@@ -3250,6 +3358,16 @@ static int should_balance_chunk(struct btrfs_root *root,
                        return 0;
                else
                        bargs->limit--;
+       } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
+               /*
+                * Same logic as the 'limit' filter; the minimum cannot be
+                * determined here because we do not have the global information
+                * about the count of all chunks that satisfy the filters.
+                */
+               if (bargs->limit_max == 0)
+                       return 0;
+               else
+                       bargs->limit_max--;
        }
 
        return 1;
@@ -3264,6 +3382,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
        struct btrfs_device *device;
        u64 old_size;
        u64 size_to_free;
+       u64 chunk_type;
        struct btrfs_chunk *chunk;
        struct btrfs_path *path;
        struct btrfs_key key;
@@ -3274,9 +3393,13 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
        int ret;
        int enospc_errors = 0;
        bool counting = true;
+       /* The single value limit and min/max limits use the same bytes in the balance args */
        u64 limit_data = bctl->data.limit;
        u64 limit_meta = bctl->meta.limit;
        u64 limit_sys = bctl->sys.limit;
+       u32 count_data = 0;
+       u32 count_meta = 0;
+       u32 count_sys = 0;
 
        /* step one make some room on all the devices */
        devices = &fs_info->fs_devices->devices;
@@ -3317,6 +3440,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
        spin_unlock(&fs_info->balance_lock);
 again:
        if (!counting) {
+               /*
+                * The single value limit and min/max limits use the same bytes
+                * in the balance args, so restore the saved per-type limits here.
+                */
                bctl->data.limit = limit_data;
                bctl->meta.limit = limit_meta;
                bctl->sys.limit = limit_sys;
@@ -3364,6 +3491,7 @@ again:
                }
 
                chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+               chunk_type = btrfs_chunk_type(leaf, chunk);
 
                if (!counting) {
                        spin_lock(&fs_info->balance_lock);
@@ -3384,6 +3512,28 @@ again:
                        spin_lock(&fs_info->balance_lock);
                        bctl->stat.expected++;
                        spin_unlock(&fs_info->balance_lock);
+
+                       if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+                               count_data++;
+                       else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+                               count_sys++;
+                       else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+                               count_meta++;
+
+                       goto loop;
+               }
+
+               /*
+                * Apply the limit_min filter; there is no need to check whether
+                * the LIMIT_RANGE filter is in use because limit_min is 0 by default
+                */
+               if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+                                       count_data < bctl->data.limit_min)
+                               || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
+                                       count_meta < bctl->meta.limit_min)
+                               || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+                                       count_sys < bctl->sys.limit_min)) {
+                       mutex_unlock(&fs_info->delete_unused_bgs_mutex);
                        goto loop;
                }
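
The block above completes the two-pass limit handling: the counting pass tallies how many chunks of each type pass the other filters (count_data/count_meta/count_sys), and the second pass skips a whole type when that tally is below limit_min, while limit_max, like the old single-value limit, is counted down in should_balance_chunk() once per selected chunk. A compact, hypothetical model of the per-chunk decision (not kernel code; all names are invented):

    #include <linux/types.h>

    struct limit_state {
            u32 counted;    /* matching chunks of this type, from pass one */
            u32 remaining;  /* initialised to limit_max (or the old limit) */
    };

    static int may_relocate_chunk(struct limit_state *st, u32 limit_min)
    {
            if (st->counted < limit_min)
                    return 0;       /* too few candidates, skip this type */
            if (st->remaining == 0)
                    return 0;       /* per-type maximum already reached */
            st->remaining--;
            return 1;
    }
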
 
@@ -3461,11 +3611,20 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
        unset_balance_control(fs_info);
        ret = del_balance_item(fs_info->tree_root);
        if (ret)
-               btrfs_std_error(fs_info, ret);
+               btrfs_std_error(fs_info, ret, NULL);
 
        atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 }
 
+/* Non-zero return value signifies invalidity */
+static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
+               u64 allowed)
+{
+       return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+               (!alloc_profile_is_valid(bctl_arg->target, 1) ||
+                (bctl_arg->target & ~allowed)));
+}
+
 /*
  * Should be called with both balance and volume mutexes held
  */
@@ -3523,27 +3682,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        if (num_devices > 3)
                allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
                            BTRFS_BLOCK_GROUP_RAID6);
-       if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-           (!alloc_profile_is_valid(bctl->data.target, 1) ||
-            (bctl->data.target & ~allowed))) {
+       if (validate_convert_profile(&bctl->data, allowed)) {
                btrfs_err(fs_info, "unable to start balance with target "
                           "data profile %llu",
                       bctl->data.target);
                ret = -EINVAL;
                goto out;
        }
-       if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-           (!alloc_profile_is_valid(bctl->meta.target, 1) ||
-            (bctl->meta.target & ~allowed))) {
+       if (validate_convert_profile(&bctl->meta, allowed)) {
                btrfs_err(fs_info,
                           "unable to start balance with target metadata profile %llu",
                       bctl->meta.target);
                ret = -EINVAL;
                goto out;
        }
-       if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-           (!alloc_profile_is_valid(bctl->sys.target, 1) ||
-            (bctl->sys.target & ~allowed))) {
+       if (validate_convert_profile(&bctl->sys, allowed)) {
                btrfs_err(fs_info,
                           "unable to start balance with target system profile %llu",
                       bctl->sys.target);
@@ -4285,65 +4438,6 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
        return 0;
 }
 
-static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
-       [BTRFS_RAID_RAID10] = {
-               .sub_stripes    = 2,
-               .dev_stripes    = 1,
-               .devs_max       = 0,    /* 0 == as many as possible */
-               .devs_min       = 4,
-               .devs_increment = 2,
-               .ncopies        = 2,
-       },
-       [BTRFS_RAID_RAID1] = {
-               .sub_stripes    = 1,
-               .dev_stripes    = 1,
-               .devs_max       = 2,
-               .devs_min       = 2,
-               .devs_increment = 2,
-               .ncopies        = 2,
-       },
-       [BTRFS_RAID_DUP] = {
-               .sub_stripes    = 1,
-               .dev_stripes    = 2,
-               .devs_max       = 1,
-               .devs_min       = 1,
-               .devs_increment = 1,
-               .ncopies        = 2,
-       },
-       [BTRFS_RAID_RAID0] = {
-               .sub_stripes    = 1,
-               .dev_stripes    = 1,
-               .devs_max       = 0,
-               .devs_min       = 2,
-               .devs_increment = 1,
-               .ncopies        = 1,
-       },
-       [BTRFS_RAID_SINGLE] = {
-               .sub_stripes    = 1,
-               .dev_stripes    = 1,
-               .devs_max       = 1,
-               .devs_min       = 1,
-               .devs_increment = 1,
-               .ncopies        = 1,
-       },
-       [BTRFS_RAID_RAID5] = {
-               .sub_stripes    = 1,
-               .dev_stripes    = 1,
-               .devs_max       = 0,
-               .devs_min       = 2,
-               .devs_increment = 1,
-               .ncopies        = 2,
-       },
-       [BTRFS_RAID_RAID6] = {
-               .sub_stripes    = 1,
-               .dev_stripes    = 1,
-               .devs_max       = 0,
-               .devs_min       = 3,
-               .devs_increment = 1,
-               .ncopies        = 3,
-       },
-};
-
 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
 {
        /* TODO allow them to set a preferred stripe size */
@@ -6594,8 +6688,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
        BUG_ON(!path);
        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
        if (ret < 0) {
-               printk_in_rcu(KERN_WARNING "BTRFS: "
-                       "error %d while searching for dev_stats item for device %s!\n",
+               btrfs_warn_in_rcu(dev_root->fs_info,
+                       "error %d while searching for dev_stats item for device %s",
                              ret, rcu_str_deref(device->name));
                goto out;
        }
@@ -6605,8 +6699,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
                /* need to delete old one and insert a new one */
                ret = btrfs_del_item(trans, dev_root, path);
                if (ret != 0) {
-                       printk_in_rcu(KERN_WARNING "BTRFS: "
-                               "delete too small dev_stats item for device %s failed %d!\n",
+                       btrfs_warn_in_rcu(dev_root->fs_info,
+                               "delete too small dev_stats item for device %s failed %d",
                                      rcu_str_deref(device->name), ret);
                        goto out;
                }
@@ -6619,9 +6713,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
                ret = btrfs_insert_empty_item(trans, dev_root, path,
                                              &key, sizeof(*ptr));
                if (ret < 0) {
-                       printk_in_rcu(KERN_WARNING "BTRFS: "
-                                         "insert dev_stats item for device %s failed %d!\n",
-                                     rcu_str_deref(device->name), ret);
+                       btrfs_warn_in_rcu(dev_root->fs_info,
+                               "insert dev_stats item for device %s failed %d",
+                               rcu_str_deref(device->name), ret);
                        goto out;
                }
        }
@@ -6675,8 +6769,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
 {
        if (!dev->dev_stats_valid)
                return;
-       printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
-                          "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+       btrfs_err_rl_in_rcu(dev->dev_root->fs_info,
+               "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
                           rcu_str_deref(dev->name),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6695,8 +6789,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
        if (i == BTRFS_DEV_STAT_VALUES_MAX)
                return; /* all values == 0, suppress message */
 
-       printk_in_rcu(KERN_INFO "BTRFS: "
-                  "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+       btrfs_info_in_rcu(dev->dev_root->fs_info,
+               "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
               rcu_str_deref(dev->name),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6740,22 +6834,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
        return 0;
 }
 
-int btrfs_scratch_superblock(struct btrfs_device *device)
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path)
 {
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
+       int copy_num;
 
-       bh = btrfs_read_dev_super(device->bdev);
-       if (!bh)
-               return -EINVAL;
-       disk_super = (struct btrfs_super_block *)bh->b_data;
+       if (!bdev)
+               return;
 
-       memset(&disk_super->magic, 0, sizeof(disk_super->magic));
-       set_buffer_dirty(bh);
-       sync_dirty_buffer(bh);
-       brelse(bh);
+       for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
+               copy_num++) {
 
-       return 0;
+               if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
+                       continue;
+
+               disk_super = (struct btrfs_super_block *)bh->b_data;
+
+               memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+               set_buffer_dirty(bh);
+               sync_dirty_buffer(bh);
+               brelse(bh);
+       }
+
+       /* Notify udev that device has changed */
+       btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+       /* Update ctime/mtime for device path for libblkid */
+       update_dev_time(device_path);
 }
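
btrfs_scratch_superblocks() now clears the magic in every superblock copy that btrfs_read_dev_one_super() can read, not only the primary one, and then nudges udev and libblkid so the stale signature disappears from device scans. For reference, the copies live at fixed offsets on the device; the sketch below mirrors the usual btrfs mirror locations (64KiB, 64MiB, 256GiB) and is illustrative, not kernel API:

    /* hypothetical helper; same values as btrfs_sb_offset() computes */
    static unsigned long long sb_copy_offset(int mirror)
    {
            return mirror ? (16ULL * 1024) << (12 * mirror)
                          : 64ULL * 1024;
    }
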
 
 /*
@@ -6823,3 +6929,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
                fs_devices = fs_devices->seed;
        }
 }
+
+void btrfs_close_one_device(struct btrfs_device *device)
+{
+       struct btrfs_fs_devices *fs_devices = device->fs_devices;
+       struct btrfs_device *new_device;
+       struct rcu_string *name;
+
+       if (device->bdev)
+               fs_devices->open_devices--;
+
+       if (device->writeable &&
+           device->devid != BTRFS_DEV_REPLACE_DEVID) {
+               list_del_init(&device->dev_alloc_list);
+               fs_devices->rw_devices--;
+       }
+
+       if (device->missing)
+               fs_devices->missing_devices--;
+
+       new_device = btrfs_alloc_device(NULL, &device->devid,
+                                       device->uuid);
+       BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+       /* Safe because we are under uuid_mutex */
+       if (device->name) {
+               name = rcu_string_strdup(device->name->str, GFP_NOFS);
+               BUG_ON(!name); /* -ENOMEM */
+               rcu_assign_pointer(new_device->name, name);
+       }
+
+       list_replace_rcu(&device->dev_list, &new_device->dev_list);
+       new_device->fs_devices = device->fs_devices;
+
+       call_rcu(&device->rcu, free_device);
+}
index 595279a8b99fd461e24cb24df3805fa8401f3dd6..ec571237273208fcb87f7be1c473b0c6a1392b50 100644 (file)
@@ -256,7 +256,7 @@ struct btrfs_fs_devices {
 
        struct btrfs_fs_info *fs_info;
        /* sysfs kobjects */
-       struct kobject super_kobj;
+       struct kobject fsid_kobj;
        struct kobject *device_dir_kobj;
        struct completion kobj_unregister;
 };
@@ -334,10 +334,15 @@ struct btrfs_raid_attr {
        int dev_stripes;        /* stripes per dev */
        int devs_max;           /* max devs to use */
        int devs_min;           /* min devs needed */
+       int tolerated_failures; /* max tolerated fail devs */
        int devs_increment;     /* ndevs has to be a multiple of this */
        int ncopies;            /* how many copies of the data there are */
 };
 
+extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
+
+extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
+
 struct map_lookup {
        u64 type;
        int io_align;
@@ -375,6 +380,9 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_DRANGE      (1ULL << 3)
 #define BTRFS_BALANCE_ARGS_VRANGE      (1ULL << 4)
 #define BTRFS_BALANCE_ARGS_LIMIT       (1ULL << 5)
+#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
+#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
+#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 8)
 
 #define BTRFS_BALANCE_ARGS_MASK                        \
        (BTRFS_BALANCE_ARGS_PROFILES |          \
@@ -382,7 +390,10 @@ struct map_lookup {
         BTRFS_BALANCE_ARGS_DEVID |             \
         BTRFS_BALANCE_ARGS_DRANGE |            \
         BTRFS_BALANCE_ARGS_VRANGE |            \
-        BTRFS_BALANCE_ARGS_LIMIT)
+        BTRFS_BALANCE_ARGS_LIMIT |             \
+        BTRFS_BALANCE_ARGS_LIMIT_RANGE |       \
+        BTRFS_BALANCE_ARGS_STRIPES_RANGE |     \
+        BTRFS_BALANCE_ARGS_USAGE_RANGE)
 
 /*
  * Profile changing flags.  When SOFT is set we won't relocate chunk if
@@ -482,7 +493,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
                                      struct btrfs_device *tgtdev);
 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
                                              struct btrfs_device *tgtdev);
-int btrfs_scratch_superblock(struct btrfs_device *device);
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path);
 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
                           u64 logical, u64 len, int mirror_num);
 unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
@@ -555,5 +566,6 @@ static inline void unlock_chunks(struct btrfs_root *root)
 struct list_head *btrfs_get_fs_uuids(void);
 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
+void btrfs_close_one_device(struct btrfs_device *device);
 
 #endif
index 0b73af9be12f467d8b838c278297e30e273dbc49..b4473dab39d613e58e4d4e58e0049d72522047cd 100644 (file)
@@ -1117,6 +1117,119 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
        TP_ARGS(wq)
 );
 
+DECLARE_EVENT_CLASS(btrfs__qgroup_data_map,
+
+       TP_PROTO(struct inode *inode, u64 free_reserved),
+
+       TP_ARGS(inode, free_reserved),
+
+       TP_STRUCT__entry(
+               __field(        u64,            rootid          )
+               __field(        unsigned long,  ino             )
+               __field(        u64,            free_reserved   )
+       ),
+
+       TP_fast_assign(
+               __entry->rootid         =       BTRFS_I(inode)->root->objectid;
+               __entry->ino            =       inode->i_ino;
+               __entry->free_reserved  =       free_reserved;
+       ),
+
+       TP_printk("rootid=%llu, ino=%lu, free_reserved=%llu",
+                 __entry->rootid, __entry->ino, __entry->free_reserved)
+);
+
+DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_init_data_rsv_map,
+
+       TP_PROTO(struct inode *inode, u64 free_reserved),
+
+       TP_ARGS(inode, free_reserved)
+);
+
+DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_free_data_rsv_map,
+
+       TP_PROTO(struct inode *inode, u64 free_reserved),
+
+       TP_ARGS(inode, free_reserved)
+);
+
+#define BTRFS_QGROUP_OPERATIONS                                \
+       { QGROUP_RESERVE,       "reserve"       },      \
+       { QGROUP_RELEASE,       "release"       },      \
+       { QGROUP_FREE,          "free"          }
+
+DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
+
+       TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
+
+       TP_ARGS(inode, start, len, reserved, op),
+
+       TP_STRUCT__entry(
+               __field(        u64,            rootid          )
+               __field(        unsigned long,  ino             )
+               __field(        u64,            start           )
+               __field(        u64,            len             )
+               __field(        u64,            reserved        )
+               __field(        int,            op              )
+       ),
+
+       TP_fast_assign(
+               __entry->rootid         = BTRFS_I(inode)->root->objectid;
+               __entry->ino            = inode->i_ino;
+               __entry->start          = start;
+               __entry->len            = len;
+               __entry->reserved       = reserved;
+               __entry->op             = op;
+       ),
+
+       TP_printk("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s",
+                 __entry->rootid, __entry->ino, __entry->start, __entry->len,
+                 __entry->reserved,
+                 __print_flags((unsigned long)__entry->op, "",
+                               BTRFS_QGROUP_OPERATIONS)
+       )
+);
+
+DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_reserve_data,
+
+       TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
+
+       TP_ARGS(inode, start, len, reserved, op)
+);
+
+DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
+
+       TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
+
+       TP_ARGS(inode, start, len, reserved, op)
+);
+
+DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
+
+       TP_PROTO(u64 ref_root, u64 reserved),
+
+       TP_ARGS(ref_root, reserved),
+
+       TP_STRUCT__entry(
+               __field(        u64,            ref_root        )
+               __field(        u64,            reserved        )
+       ),
+
+       TP_fast_assign(
+               __entry->ref_root       = ref_root;
+               __entry->reserved       = reserved;
+       ),
+
+       TP_printk("root=%llu, reserved=%llu, op=free",
+                 __entry->ref_root, __entry->reserved)
+);
+
+DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
+
+       TP_PROTO(u64 ref_root, u64 reserved),
+
+       TP_ARGS(ref_root, reserved)
+);
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
index b6dec05c7196a22511e346242724406eef88265b..dea8931992571a6e87569024567af5aff6cef01b 100644 (file)
@@ -206,7 +206,13 @@ struct btrfs_ioctl_feature_flags {
  */
 struct btrfs_balance_args {
        __u64 profiles;
-       __u64 usage;
+       union {
+               __u64 usage;
+               struct {
+                       __u32 usage_min;
+                       __u32 usage_max;
+               };
+       };
        __u64 devid;
        __u64 pstart;
        __u64 pend;
@@ -217,8 +223,27 @@ struct btrfs_balance_args {
 
        __u64 flags;
 
-       __u64 limit;            /* limit number of processed chunks */
-       __u64 unused[7];
+       /*
+        * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
+        * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum
+        * and maximum
+        */
+       union {
+               __u64 limit;            /* limit number of processed chunks */
+               struct {
+                       __u32 limit_min;
+                       __u32 limit_max;
+               };
+       };
+
+       /*
+        * Process chunks whose stripe count falls in the range
+        * stripes_min..stripes_max, BTRFS_BALANCE_ARGS_STRIPES_RANGE
+        */
+       __u32 stripes_min;
+       __u32 stripes_max;
+
+       __u64 unused[6];
 } __attribute__ ((__packed__));
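
Since the new fields sit in unions over the old usage and limit members, the ioctl ABI keeps its size and an old kernel simply sees the packed pair through the original 64-bit field. A small userspace sketch of the overlay, assuming headers that carry this change are installed; the values are made up and no balance is actually issued:

    #include <stdio.h>
    #include <string.h>
    #include <linux/btrfs.h>

    int main(void)
    {
            struct btrfs_balance_args ba;

            memset(&ba, 0, sizeof(ba));
            ba.usage_min = 10;      /* low 32 bits of the old 'usage' */
            ba.usage_max = 50;      /* high 32 bits of the old 'usage' */
            ba.limit_max = 8;       /* high 32 bits of the old 'limit' */
            ba.stripes_min = 2;
            ba.stripes_max = 4;

            /* on a little-endian machine this prints 10 + (50 << 32) */
            printf("usage as seen through the old field: %llu\n",
                   (unsigned long long)ba.usage);
            return 0;
    }
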
 
 /* report balance progress to userspace */