Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
author    Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 27 May 2010 17:43:44 +0000 (10:43 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 27 May 2010 17:43:44 +0000 (10:43 -0700)
* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (27 commits)
  Btrfs: add more error checking to btrfs_dirty_inode
  Btrfs: allow unaligned DIO
  Btrfs: drop verbose enospc printk
  Btrfs: Fix block generation verification race
  Btrfs: fix preallocation and nodatacow checks in O_DIRECT
  Btrfs: avoid ENOSPC errors in btrfs_dirty_inode
  Btrfs: move O_DIRECT space reservation to btrfs_direct_IO
  Btrfs: rework O_DIRECT enospc handling
  Btrfs: use async helpers for DIO write checksumming
  Btrfs: don't walk around with task->state != TASK_RUNNING
  Btrfs: do aio_write instead of write
  Btrfs: add basic DIO read/write support
  direct-io: do not merge logically non-contiguous requests
  direct-io: add a hook for the fs to provide its own submit_bio function
  fs: allow short direct-io reads to be completed via buffered IO
  Btrfs: Metadata ENOSPC handling for balance
  Btrfs: Pre-allocate space for data relocation
  Btrfs: Metadata ENOSPC handling for tree log
  Btrfs: Metadata reservation for orphan inodes
  Btrfs: Introduce global metadata reservation
  ...

fs/btrfs/extent-tree.c
fs/btrfs/inode.c
fs/btrfs/super.c
fs/btrfs/xattr.c
include/linux/fs.h
mm/filemap.c

diff --combined fs/btrfs/extent-tree.c
index c6a4f459ad76d87cf5a7d5295ad518ad2010c38f,6c14101506e10d59a0bfbe9b19673ca1b5777f1c..b9080d71991a35cea63d2620d1534234cde6e0a9
  
  static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
-                             u64 bytenr, u64 num_bytes, int alloc,
-                             int mark_free);
- static int update_reserved_extents(struct btrfs_block_group_cache *cache,
-                                  u64 num_bytes, int reserve);
+                             u64 bytenr, u64 num_bytes, int alloc);
+ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+                                u64 num_bytes, int reserve, int sinfo);
  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
@@@ -61,12 -60,6 +60,6 @@@ static int alloc_reserved_tree_block(st
  static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force);
- static int pin_down_bytes(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root,
-                         struct btrfs_path *path,
-                         u64 bytenr, u64 num_bytes,
-                         int is_data, int reserved,
-                         struct extent_buffer **must_clean);
  static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
  static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@@ -91,8 -84,12 +84,12 @@@ void btrfs_get_block_group(struct btrfs
  
  void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
  {
-       if (atomic_dec_and_test(&cache->count))
+       if (atomic_dec_and_test(&cache->count)) {
+               WARN_ON(cache->pinned > 0);
+               WARN_ON(cache->reserved > 0);
+               WARN_ON(cache->reserved_pinned > 0);
                kfree(cache);
+       }
  }
  
  /*
@@@ -319,7 -316,7 +316,7 @@@ static int caching_kthread(void *data
  
        exclude_super_stripes(extent_root, block_group);
        spin_lock(&block_group->space_info->lock);
-       block_group->space_info->bytes_super += block_group->bytes_super;
+       block_group->space_info->bytes_readonly += block_group->bytes_super;
        spin_unlock(&block_group->space_info->lock);
  
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@@ -507,6 -504,9 +504,9 @@@ static struct btrfs_space_info *__find_
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;
  
+       flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+                BTRFS_BLOCK_GROUP_METADATA;
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags == flags) {
@@@ -609,6 -609,113 +609,113 @@@ int btrfs_lookup_extent(struct btrfs_ro
        return ret;
  }
  
+ /*
+  * helper function to look up the reference count and flags of an extent.
+  *
+  * the head node for a delayed ref is used to store the sum of all the
+  * reference count modifications queued up in the rbtree. the head
+  * node may also store the extent flags to set. This way you can check
+  * to see what the reference count and extent flags would be if all of
+  * the delayed refs were already processed.
+  */
+ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, u64 bytenr,
+                            u64 num_bytes, u64 *refs, u64 *flags)
+ {
+       struct btrfs_delayed_ref_head *head;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_path *path;
+       struct btrfs_extent_item *ei;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       u32 item_size;
+       u64 num_refs;
+       u64 extent_flags;
+       int ret;
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       key.objectid = bytenr;
+       key.type = BTRFS_EXTENT_ITEM_KEY;
+       key.offset = num_bytes;
+       if (!trans) {
+               path->skip_locking = 1;
+               path->search_commit_root = 1;
+       }
+ again:
+       ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+                               &key, path, 0, 0);
+       if (ret < 0)
+               goto out_free;
+       if (ret == 0) {
+               leaf = path->nodes[0];
+               item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+               if (item_size >= sizeof(*ei)) {
+                       ei = btrfs_item_ptr(leaf, path->slots[0],
+                                           struct btrfs_extent_item);
+                       num_refs = btrfs_extent_refs(leaf, ei);
+                       extent_flags = btrfs_extent_flags(leaf, ei);
+               } else {
+ #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+                       struct btrfs_extent_item_v0 *ei0;
+                       BUG_ON(item_size != sizeof(*ei0));
+                       ei0 = btrfs_item_ptr(leaf, path->slots[0],
+                                            struct btrfs_extent_item_v0);
+                       num_refs = btrfs_extent_refs_v0(leaf, ei0);
+                       /* FIXME: this isn't correct for data */
+                       extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ #else
+                       BUG();
+ #endif
+               }
+               BUG_ON(num_refs == 0);
+       } else {
+               num_refs = 0;
+               extent_flags = 0;
+               ret = 0;
+       }
+       if (!trans)
+               goto out;
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       head = btrfs_find_delayed_ref_head(trans, bytenr);
+       if (head) {
+               if (!mutex_trylock(&head->mutex)) {
+                       atomic_inc(&head->node.refs);
+                       spin_unlock(&delayed_refs->lock);
+                       btrfs_release_path(root->fs_info->extent_root, path);
+                       mutex_lock(&head->mutex);
+                       mutex_unlock(&head->mutex);
+                       btrfs_put_delayed_ref(&head->node);
+                       goto again;
+               }
+               if (head->extent_op && head->extent_op->update_flags)
+                       extent_flags |= head->extent_op->flags_to_set;
+               else
+                       BUG_ON(num_refs == 0);
+               num_refs += head->node.ref_mod;
+               mutex_unlock(&head->mutex);
+       }
+       spin_unlock(&delayed_refs->lock);
+ out:
+       WARN_ON(num_refs == 0);
+       if (refs)
+               *refs = num_refs;
+       if (flags)
+               *flags = extent_flags;
+ out_free:
+       btrfs_free_path(path);
+       return ret;
+ }
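
For reference, a minimal sketch of how the new helper might be called; the
surrounding function and its policy are hypothetical, only the
btrfs_lookup_extent_info() signature and BTRFS_BLOCK_FLAG_FULL_BACKREF come
from the hunk above.

	/* hypothetical caller, for illustration only */
	static int example_extent_is_shared(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    u64 bytenr, u64 num_bytes)
	{
		u64 refs = 0;
		u64 flags = 0;
		int ret;

		/* trans may be NULL: the helper then searches the commit root */
		ret = btrfs_lookup_extent_info(trans, root, bytenr, num_bytes,
					       &refs, &flags);
		if (ret < 0)
			return ret;

		/* refs/flags reflect the state after all delayed refs run */
		return refs > 1 || (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF);
	}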
  /*
   * Back reference rules.  Back refs have three main goals:
   *
@@@ -1589,7 -1696,7 +1696,7 @@@ static void btrfs_issue_discard(struct 
                                u64 start, u64 len)
  {
        blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
 -                           DISCARD_FL_BARRIER);
 +                      BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
  }
  
  static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@@ -1871,7 -1978,6 +1978,6 @@@ static int run_delayed_tree_ref(struct 
        return ret;
  }
  
  /* helper function to actually process a single delayed ref entry */
  static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                BUG_ON(extent_op);
                head = btrfs_delayed_node_to_head(node);
                if (insert_reserved) {
-                       int mark_free = 0;
-                       struct extent_buffer *must_clean = NULL;
-                       ret = pin_down_bytes(trans, root, NULL,
-                                            node->bytenr, node->num_bytes,
-                                            head->is_data, 1, &must_clean);
-                       if (ret > 0)
-                               mark_free = 1;
-                       if (must_clean) {
-                               clean_tree_block(NULL, root, must_clean);
-                               btrfs_tree_unlock(must_clean);
-                               free_extent_buffer(must_clean);
-                       }
+                       btrfs_pin_extent(root, node->bytenr,
+                                        node->num_bytes, 1);
                        if (head->is_data) {
                                ret = btrfs_del_csums(trans, root,
                                                      node->bytenr,
                                                      node->num_bytes);
                                BUG_ON(ret);
                        }
-                       if (mark_free) {
-                               ret = btrfs_free_reserved_extent(root,
-                                                       node->bytenr,
-                                                       node->num_bytes);
-                               BUG_ON(ret);
-                       }
                }
                mutex_unlock(&head->mutex);
                return 0;
@@@ -2347,6 -2435,8 +2435,8 @@@ int btrfs_cross_ref_exist(struct btrfs_
                ret = 0;
  out:
        btrfs_free_path(path);
+       if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+               WARN_ON(ret > 0);
        return ret;
  }
  
@@@ -2660,12 -2750,21 +2750,21 @@@ static int update_space_info(struct btr
                             struct btrfs_space_info **space_info)
  {
        struct btrfs_space_info *found;
+       int i;
+       int factor;
+       if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+                    BTRFS_BLOCK_GROUP_RAID10))
+               factor = 2;
+       else
+               factor = 1;
  
        found = __find_space_info(info, flags);
        if (found) {
                spin_lock(&found->lock);
                found->total_bytes += total_bytes;
                found->bytes_used += bytes_used;
+               found->disk_used += bytes_used * factor;
                found->full = 0;
                spin_unlock(&found->lock);
                *space_info = found;
        if (!found)
                return -ENOMEM;
  
-       INIT_LIST_HEAD(&found->block_groups);
+       for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+               INIT_LIST_HEAD(&found->block_groups[i]);
        init_rwsem(&found->groups_sem);
-       init_waitqueue_head(&found->flush_wait);
-       init_waitqueue_head(&found->allocate_wait);
        spin_lock_init(&found->lock);
-       found->flags = flags;
+       found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+                               BTRFS_BLOCK_GROUP_SYSTEM |
+                               BTRFS_BLOCK_GROUP_METADATA);
        found->total_bytes = total_bytes;
        found->bytes_used = bytes_used;
+       found->disk_used = bytes_used * factor;
        found->bytes_pinned = 0;
        found->bytes_reserved = 0;
        found->bytes_readonly = 0;
-       found->bytes_delalloc = 0;
+       found->bytes_may_use = 0;
        found->full = 0;
        found->force_alloc = 0;
        *space_info = found;
@@@ -2711,19 -2812,6 +2812,6 @@@ static void set_avail_alloc_bits(struc
        }
  }
  
- static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
- {
-       spin_lock(&cache->space_info->lock);
-       spin_lock(&cache->lock);
-       if (!cache->ro) {
-               cache->space_info->bytes_readonly += cache->key.offset -
-                                       btrfs_block_group_used(&cache->item);
-               cache->ro = 1;
-       }
-       spin_unlock(&cache->lock);
-       spin_unlock(&cache->space_info->lock);
- }
  u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
  {
        u64 num_devices = root->fs_info->fs_devices->rw_devices;
        return flags;
  }
  
- static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
- {
-       struct btrfs_fs_info *info = root->fs_info;
-       u64 alloc_profile;
-       if (data) {
-               alloc_profile = info->avail_data_alloc_bits &
-                       info->data_alloc_profile;
-               data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-       } else if (root == root->fs_info->chunk_root) {
-               alloc_profile = info->avail_system_alloc_bits &
-                       info->system_alloc_profile;
-               data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-       } else {
-               alloc_profile = info->avail_metadata_alloc_bits &
-                       info->metadata_alloc_profile;
-               data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-       }
-       return btrfs_reduce_alloc_profile(root, data);
- }
- void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
  {
-       u64 alloc_target;
-       alloc_target = btrfs_get_alloc_profile(root, 1);
-       BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-                                                      alloc_target);
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               flags |= root->fs_info->avail_data_alloc_bits &
+                        root->fs_info->data_alloc_profile;
+       else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               flags |= root->fs_info->avail_system_alloc_bits &
+                        root->fs_info->system_alloc_profile;
+       else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               flags |= root->fs_info->avail_metadata_alloc_bits &
+                        root->fs_info->metadata_alloc_profile;
+       return btrfs_reduce_alloc_profile(root, flags);
  }
  
- static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
+ static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
  {
-       u64 num_bytes;
-       int level;
-       level = BTRFS_MAX_LEVEL - 2;
-       /*
-        * NOTE: these calculations are absolutely the worst possible case.
-        * This assumes that _every_ item we insert will require a new leaf, and
-        * that the tree has grown to its maximum level size.
-        */
+       u64 flags;
  
-       /*
-        * for every item we insert we could insert both an extent item and an
-        * extent ref item.  Then for every item we insert, we will need to cow
-        * both the original leaf, plus the leaf to the left and right of it.
-        *
-        * Unless we are talking about the extent root, then we just want the
-        * number of items * 2, since we just need the extent item plus its ref.
-        */
-       if (root == root->fs_info->extent_root)
-               num_bytes = num_items * 2;
+       if (data)
+               flags = BTRFS_BLOCK_GROUP_DATA;
+       else if (root == root->fs_info->chunk_root)
+               flags = BTRFS_BLOCK_GROUP_SYSTEM;
        else
-               num_bytes = (num_items + (2 * num_items)) * 3;
+               flags = BTRFS_BLOCK_GROUP_METADATA;
  
-       /*
-        * num_bytes is total number of leaves we could need times the leaf
-        * size, and then for every leaf we could end up cow'ing 2 nodes per
-        * level, down to the leaf level.
-        */
-       num_bytes = (num_bytes * root->leafsize) +
-               (num_bytes * (level * 2)) * root->nodesize;
+       return get_alloc_profile(root, flags);
+ }
  
-       return num_bytes;
+ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+ {
+       BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+                                                      BTRFS_BLOCK_GROUP_DATA);
  }
  
  /*
-  * Unreserve metadata space for delalloc.  If we have less reserved credits than
-  * we have extents, this function does nothing.
+  * This will check the space that the inode allocates from, to make sure we
+  * have enough space for the requested bytes.
   */
- int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-                                         struct inode *inode, int num_items)
+ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
  {
-       struct btrfs_fs_info *info = root->fs_info;
-       struct btrfs_space_info *meta_sinfo;
-       u64 num_bytes;
-       u64 alloc_target;
-       bool bug = false;
+       struct btrfs_space_info *data_sinfo;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 used;
+       int ret = 0, committed = 0;
  
-       /* get the space info for where the metadata will live */
-       alloc_target = btrfs_get_alloc_profile(root, 0);
-       meta_sinfo = __find_space_info(info, alloc_target);
+       /* make sure bytes are sectorsize aligned */
+       bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
  
-       num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-                                          num_items);
+       data_sinfo = BTRFS_I(inode)->space_info;
+       if (!data_sinfo)
+               goto alloc;
  
-       spin_lock(&meta_sinfo->lock);
-       spin_lock(&BTRFS_I(inode)->accounting_lock);
-       if (BTRFS_I(inode)->reserved_extents <=
-           BTRFS_I(inode)->outstanding_extents) {
-               spin_unlock(&BTRFS_I(inode)->accounting_lock);
-               spin_unlock(&meta_sinfo->lock);
-               return 0;
-       }
-       spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ again:
+       /* make sure we have enough space to handle the data first */
+       spin_lock(&data_sinfo->lock);
+       used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
+               data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
+               data_sinfo->bytes_may_use;
+       if (used + bytes > data_sinfo->total_bytes) {
+               struct btrfs_trans_handle *trans;
  
-       BTRFS_I(inode)->reserved_extents -= num_items;
-       BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
+               /*
+                * if we don't have enough free bytes in this space then we need
+                * to alloc a new chunk.
+                */
+               if (!data_sinfo->full) {
+                       u64 alloc_target;
  
-       if (meta_sinfo->bytes_delalloc < num_bytes) {
-               bug = true;
-               meta_sinfo->bytes_delalloc = 0;
-       } else {
-               meta_sinfo->bytes_delalloc -= num_bytes;
-       }
-       spin_unlock(&meta_sinfo->lock);
+                       data_sinfo->force_alloc = 1;
+                       spin_unlock(&data_sinfo->lock);
+ alloc:
+                       alloc_target = btrfs_get_alloc_profile(root, 1);
+                       trans = btrfs_join_transaction(root, 1);
+                       if (IS_ERR(trans))
+                               return PTR_ERR(trans);
  
-       BUG_ON(bug);
+                       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                                            bytes + 2 * 1024 * 1024,
+                                            alloc_target, 0);
+                       btrfs_end_transaction(trans, root);
+                       if (ret < 0)
+                               return ret;
  
-       return 0;
- }
+                       if (!data_sinfo) {
+                               btrfs_set_inode_space_info(root, inode);
+                               data_sinfo = BTRFS_I(inode)->space_info;
+                       }
+                       goto again;
+               }
+               spin_unlock(&data_sinfo->lock);
  
- static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
- {
-       u64 thresh;
+               /* commit the current transaction and try again */
+               if (!committed && !root->fs_info->open_ioctl_trans) {
+                       committed = 1;
+                       trans = btrfs_join_transaction(root, 1);
+                       if (IS_ERR(trans))
+                               return PTR_ERR(trans);
+                       ret = btrfs_commit_transaction(trans, root);
+                       if (ret)
+                               return ret;
+                       goto again;
+               }
  
-       thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-               meta_sinfo->bytes_may_use;
+ #if 0 /* I hope we never need this code again, just in case */
+               printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
+                      "%llu bytes_reserved, " "%llu bytes_pinned, "
+                      "%llu bytes_readonly, %llu may use %llu total\n",
+                      (unsigned long long)bytes,
+                      (unsigned long long)data_sinfo->bytes_used,
+                      (unsigned long long)data_sinfo->bytes_reserved,
+                      (unsigned long long)data_sinfo->bytes_pinned,
+                      (unsigned long long)data_sinfo->bytes_readonly,
+                      (unsigned long long)data_sinfo->bytes_may_use,
+                      (unsigned long long)data_sinfo->total_bytes);
+ #endif
+               return -ENOSPC;
+       }
+       data_sinfo->bytes_may_use += bytes;
+       BTRFS_I(inode)->reserved_bytes += bytes;
+       spin_unlock(&data_sinfo->lock);
  
-       thresh = meta_sinfo->total_bytes - thresh;
-       thresh *= 80;
-       do_div(thresh, 100);
-       if (thresh <= meta_sinfo->bytes_delalloc)
-               meta_sinfo->force_delalloc = 1;
-       else
-               meta_sinfo->force_delalloc = 0;
+       return 0;
  }
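
The round-up mask used here, and again in btrfs_free_reserved_data_space()
below, is easier to see with concrete numbers. A standalone sketch of the
same arithmetic, assuming a hypothetical 4096-byte sectorsize (any power of
two works):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t sectorsize = 4096;	/* hypothetical sector size */
		uint64_t bytes = 5000;		/* hypothetical request */

		/* same expression as btrfs_check_data_free_space() above */
		bytes = (bytes + sectorsize - 1) & ~(sectorsize - 1);

		/* prints 8192: 5000 rounded up to the next 4096 multiple */
		printf("%llu\n", (unsigned long long)bytes);
		return 0;
	}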
  
- struct async_flush {
-       struct btrfs_root *root;
-       struct btrfs_space_info *info;
-       struct btrfs_work work;
- };
- static noinline void flush_delalloc_async(struct btrfs_work *work)
+ /*
+  * called when we are clearing a delalloc extent from the
+  * inode's io_tree, or when there was an error for whatever reason
+  * after calling btrfs_check_data_free_space
+  */
+ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
  {
-       struct async_flush *async;
-       struct btrfs_root *root;
-       struct btrfs_space_info *info;
-       async = container_of(work, struct async_flush, work);
-       root = async->root;
-       info = async->info;
-       btrfs_start_delalloc_inodes(root, 0);
-       wake_up(&info->flush_wait);
-       btrfs_wait_ordered_extents(root, 0, 0);
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_space_info *data_sinfo;
  
-       spin_lock(&info->lock);
-       info->flushing = 0;
-       spin_unlock(&info->lock);
-       wake_up(&info->flush_wait);
+       /* make sure bytes are sectorsize aligned */
+       bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
  
-       kfree(async);
+       data_sinfo = BTRFS_I(inode)->space_info;
+       spin_lock(&data_sinfo->lock);
+       data_sinfo->bytes_may_use -= bytes;
+       BTRFS_I(inode)->reserved_bytes -= bytes;
+       spin_unlock(&data_sinfo->lock);
  }
  
- static void wait_on_flush(struct btrfs_space_info *info)
+ static void force_metadata_allocation(struct btrfs_fs_info *info)
  {
-       DEFINE_WAIT(wait);
-       u64 used;
-       while (1) {
-               prepare_to_wait(&info->flush_wait, &wait,
-                               TASK_UNINTERRUPTIBLE);
-               spin_lock(&info->lock);
-               if (!info->flushing) {
-                       spin_unlock(&info->lock);
-                       break;
-               }
+       struct list_head *head = &info->space_info;
+       struct btrfs_space_info *found;
  
-               used = info->bytes_used + info->bytes_reserved +
-                       info->bytes_pinned + info->bytes_readonly +
-                       info->bytes_super + info->bytes_root +
-                       info->bytes_may_use + info->bytes_delalloc;
-               if (used < info->total_bytes) {
-                       spin_unlock(&info->lock);
-                       break;
-               }
-               spin_unlock(&info->lock);
-               schedule();
+       rcu_read_lock();
+       list_for_each_entry_rcu(found, head, list) {
+               if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+                       found->force_alloc = 1;
        }
-       finish_wait(&info->flush_wait, &wait);
+       rcu_read_unlock();
  }
  
- static void flush_delalloc(struct btrfs_root *root,
-                                struct btrfs_space_info *info)
+ static int should_alloc_chunk(struct btrfs_space_info *sinfo,
+                             u64 alloc_bytes)
  {
-       struct async_flush *async;
-       bool wait = false;
-       spin_lock(&info->lock);
-       if (!info->flushing)
-               info->flushing = 1;
-       else
-               wait = true;
-       spin_unlock(&info->lock);
-       if (wait) {
-               wait_on_flush(info);
-               return;
-       }
-       async = kzalloc(sizeof(*async), GFP_NOFS);
-       if (!async)
-               goto flush;
-       async->root = root;
-       async->info = info;
-       async->work.func = flush_delalloc_async;
+       u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
  
-       btrfs_queue_worker(&root->fs_info->enospc_workers,
-                          &async->work);
-       wait_on_flush(info);
-       return;
+       if (sinfo->bytes_used + sinfo->bytes_reserved +
+           alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+               return 0;
  
- flush:
-       btrfs_start_delalloc_inodes(root, 0);
-       btrfs_wait_ordered_extents(root, 0, 0);
+       if (sinfo->bytes_used + sinfo->bytes_reserved +
+           alloc_bytes < div_factor(num_bytes, 8))
+               return 0;
  
-       spin_lock(&info->lock);
-       info->flushing = 0;
-       spin_unlock(&info->lock);
-       wake_up(&info->flush_wait);
+       return 1;
  }
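
Taken together, the two early returns mean a chunk allocation is requested
only once the writable space is nearly exhausted: the pending allocation must
land within 256MB of capacity and past roughly 80% of it (btrfs's
div_factor(n, 8) helper computes n * 8 / 10). A standalone sketch with
hypothetical numbers:

	#include <stdint.h>
	#include <stdio.h>

	/* mirrors btrfs's div_factor(): num * factor / 10 */
	static uint64_t div_factor(uint64_t num, int factor)
	{
		return num * factor / 10;
	}

	/* "used" stands in for bytes_used + bytes_reserved above */
	static int should_alloc_chunk(uint64_t num_bytes, uint64_t used,
				      uint64_t alloc_bytes)
	{
		if (used + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
			return 0;	/* >256MB headroom would remain */
		if (used + alloc_bytes < div_factor(num_bytes, 8))
			return 0;	/* still under ~80% of capacity */
		return 1;
	}

	int main(void)
	{
		uint64_t mib = 1024 * 1024;

		/* 1GiB writable, 900MiB used, 10MiB request -> 1 (allocate) */
		printf("%d\n", should_alloc_chunk(1024 * mib, 900 * mib, 10 * mib));
		/* 1GiB writable, 100MiB used, 10MiB request -> 0 (plenty left) */
		printf("%d\n", should_alloc_chunk(1024 * mib, 100 * mib, 10 * mib));
		return 0;
	}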
  
- static int maybe_allocate_chunk(struct btrfs_root *root,
-                                struct btrfs_space_info *info)
+ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *extent_root, u64 alloc_bytes,
+                         u64 flags, int force)
  {
-       struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
-       struct btrfs_trans_handle *trans;
-       bool wait = false;
+       struct btrfs_space_info *space_info;
+       struct btrfs_fs_info *fs_info = extent_root->fs_info;
        int ret = 0;
-       u64 min_metadata;
-       u64 free_space;
  
-       free_space = btrfs_super_total_bytes(disk_super);
-       /*
-        * we allow the metadata to grow to a max of either 10gb or 5% of the
-        * space in the volume.
-        */
-       min_metadata = min((u64)10 * 1024 * 1024 * 1024,
-                            div64_u64(free_space * 5, 100));
-       if (info->total_bytes >= min_metadata) {
-               spin_unlock(&info->lock);
-               return 0;
-       }
+       mutex_lock(&fs_info->chunk_mutex);
  
-       if (info->full) {
-               spin_unlock(&info->lock);
-               return 0;
+       flags = btrfs_reduce_alloc_profile(extent_root, flags);
+       space_info = __find_space_info(extent_root->fs_info, flags);
+       if (!space_info) {
+               ret = update_space_info(extent_root->fs_info, flags,
+                                       0, 0, &space_info);
+               BUG_ON(ret);
        }
+       BUG_ON(!space_info);
  
-       if (!info->allocating_chunk) {
-               info->force_alloc = 1;
-               info->allocating_chunk = 1;
-       } else {
-               wait = true;
+       spin_lock(&space_info->lock);
+       if (space_info->force_alloc)
+               force = 1;
+       if (space_info->full) {
+               spin_unlock(&space_info->lock);
+               goto out;
        }
  
-       spin_unlock(&info->lock);
-       if (wait) {
-               wait_event(info->allocate_wait,
-                          !info->allocating_chunk);
-               return 1;
+       if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
+               spin_unlock(&space_info->lock);
+               goto out;
        }
+       spin_unlock(&space_info->lock);
  
-       trans = btrfs_start_transaction(root, 1);
-       if (!trans) {
-               ret = -ENOMEM;
-               goto out;
+       /*
+        * if we're doing a data chunk, go ahead and make sure that
+        * we keep a reasonable number of metadata chunks allocated in the
+        * FS as well.
+        */
+       if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
+               fs_info->data_chunk_allocations++;
+               if (!(fs_info->data_chunk_allocations %
+                     fs_info->metadata_ratio))
+                       force_metadata_allocation(fs_info);
        }
  
-       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                            4096 + 2 * 1024 * 1024,
-                            info->flags, 0);
-       btrfs_end_transaction(trans, root);
+       ret = btrfs_alloc_chunk(trans, extent_root, flags);
+       spin_lock(&space_info->lock);
        if (ret)
-               goto out;
+               space_info->full = 1;
+       else
+               ret = 1;
+       space_info->force_alloc = 0;
+       spin_unlock(&space_info->lock);
  out:
-       spin_lock(&info->lock);
-       info->allocating_chunk = 0;
-       spin_unlock(&info->lock);
-       wake_up(&info->allocate_wait);
+       mutex_unlock(&extent_root->fs_info->chunk_mutex);
+       return ret;
+ }
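
The data/metadata balancing above is just a modulo counter: with a
hypothetical metadata_ratio of 8, every 8th data chunk allocation also forces
metadata chunk allocation. A trivial sketch of the same test:

	#include <stdio.h>

	int main(void)
	{
		unsigned int data_chunk_allocations = 0;
		unsigned int metadata_ratio = 8;	/* hypothetical setting */
		int i;

		for (i = 0; i < 20; i++) {
			data_chunk_allocations++;
			/* same test as do_chunk_alloc() above */
			if (!(data_chunk_allocations % metadata_ratio))
				printf("alloc %u forces metadata allocation\n",
				       data_chunk_allocations);	/* 8 and 16 */
		}
		return 0;
	}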
  
-       if (ret)
+ static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct btrfs_space_info *sinfo, u64 num_bytes)
+ {
+       int ret;
+       int end_trans = 0;
+       if (sinfo->full)
                return 0;
-       return 1;
+       spin_lock(&sinfo->lock);
+       ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
+       spin_unlock(&sinfo->lock);
+       if (!ret)
+               return 0;
+       if (!trans) {
+               trans = btrfs_join_transaction(root, 1);
+               BUG_ON(IS_ERR(trans));
+               end_trans = 1;
+       }
+       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                            num_bytes + 2 * 1024 * 1024,
+                            get_alloc_profile(root, sinfo->flags), 0);
+       if (end_trans)
+               btrfs_end_transaction(trans, root);
+       return ret == 1 ? 1 : 0;
  }
  
  /*
-  * Reserve metadata space for delalloc.
+  * shrink metadata reservation for delalloc
   */
- int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-                                       struct inode *inode, int num_items)
+ static int shrink_delalloc(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, u64 to_reclaim)
+ {
+       struct btrfs_block_rsv *block_rsv;
+       u64 reserved;
+       u64 max_reclaim;
+       u64 reclaimed = 0;
+       int pause = 1;
+       int ret;
+       block_rsv = &root->fs_info->delalloc_block_rsv;
+       spin_lock(&block_rsv->lock);
+       reserved = block_rsv->reserved;
+       spin_unlock(&block_rsv->lock);
+       if (reserved == 0)
+               return 0;
+       max_reclaim = min(reserved, to_reclaim);
+       while (1) {
+               ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+               if (!ret) {
+                       __set_current_state(TASK_INTERRUPTIBLE);
+                       schedule_timeout(pause);
+                       pause <<= 1;
+                       if (pause > HZ / 10)
+                               pause = HZ / 10;
+               } else {
+                       pause = 1;
+               }
+               spin_lock(&block_rsv->lock);
+               if (reserved > block_rsv->reserved)
+                       reclaimed = reserved - block_rsv->reserved;
+               reserved = block_rsv->reserved;
+               spin_unlock(&block_rsv->lock);
+               if (reserved == 0 || reclaimed >= max_reclaim)
+                       break;
+               if (trans && trans->transaction->blocked)
+                       return -EAGAIN;
+       }
+       return reclaimed >= to_reclaim;
+ }
+ static int should_retry_reserve(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct btrfs_block_rsv *block_rsv,
+                               u64 num_bytes, int *retries)
  {
-       struct btrfs_fs_info *info = root->fs_info;
-       struct btrfs_space_info *meta_sinfo;
-       u64 num_bytes;
-       u64 used;
-       u64 alloc_target;
-       int flushed = 0;
-       int force_delalloc;
+       struct btrfs_space_info *space_info = block_rsv->space_info;
+       int ret;
  
-       /* get the space info for where the metadata will live */
-       alloc_target = btrfs_get_alloc_profile(root, 0);
-       meta_sinfo = __find_space_info(info, alloc_target);
+       if ((*retries) > 2)
+               return -ENOSPC;
  
-       num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-                                          num_items);
- again:
-       spin_lock(&meta_sinfo->lock);
+       ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
+       if (ret)
+               return 1;
  
-       force_delalloc = meta_sinfo->force_delalloc;
+       if (trans && trans->transaction->in_commit)
+               return -ENOSPC;
  
-       if (unlikely(!meta_sinfo->bytes_root))
-               meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+       ret = shrink_delalloc(trans, root, num_bytes);
+       if (ret)
+               return ret;
  
-       if (!flushed)
-               meta_sinfo->bytes_delalloc += num_bytes;
+       spin_lock(&space_info->lock);
+       if (space_info->bytes_pinned < num_bytes)
+               ret = 1;
+       spin_unlock(&space_info->lock);
+       if (ret)
+               return -ENOSPC;
  
-       used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-               meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+       (*retries)++;
  
-       if (used > meta_sinfo->total_bytes) {
-               flushed++;
+       if (trans)
+               return -EAGAIN;
  
-               if (flushed == 1) {
-                       if (maybe_allocate_chunk(root, meta_sinfo))
-                               goto again;
-                       flushed++;
+       trans = btrfs_join_transaction(root, 1);
+       BUG_ON(IS_ERR(trans));
+       ret = btrfs_commit_transaction(trans, root);
+       BUG_ON(ret);
+       return 1;
+ }
+ static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
+                                 u64 num_bytes)
+ {
+       struct btrfs_space_info *space_info = block_rsv->space_info;
+       u64 unused;
+       int ret = -ENOSPC;
+       spin_lock(&space_info->lock);
+       unused = space_info->bytes_used + space_info->bytes_reserved +
+                space_info->bytes_pinned + space_info->bytes_readonly;
+       if (unused < space_info->total_bytes)
+               unused = space_info->total_bytes - unused;
+       else
+               unused = 0;
+       if (unused >= num_bytes) {
+               if (block_rsv->priority >= 10) {
+                       space_info->bytes_reserved += num_bytes;
+                       ret = 0;
                } else {
-                       spin_unlock(&meta_sinfo->lock);
+                       if ((unused + block_rsv->reserved) *
+                           block_rsv->priority >=
+                           (num_bytes + block_rsv->reserved) * 10) {
+                               space_info->bytes_reserved += num_bytes;
+                               ret = 0;
+                       }
                }
+       }
+       spin_unlock(&space_info->lock);
  
-               if (flushed == 2) {
-                       filemap_flush(inode->i_mapping);
-                       goto again;
-               } else if (flushed == 3) {
-                       flush_delalloc(root, meta_sinfo);
-                       goto again;
+       return ret;
+ }
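
The low-priority branch above is a ratio test: rearranged, it grants the
request only while (num_bytes + reserved) stays at or below priority/10 of
(unused + reserved). With the default priority of 6 set by
btrfs_init_block_rsv() later in this diff, a reservation may therefore
consume at most 60% of the space still available to it. A standalone sketch
with hypothetical numbers:

	#include <stdint.h>
	#include <stdio.h>

	/* mirrors the decision in reserve_metadata_bytes() above */
	static int may_reserve(uint64_t unused, uint64_t reserved,
			       uint64_t num_bytes, int priority)
	{
		if (unused < num_bytes)
			return 0;
		if (priority >= 10)
			return 1;	/* high priority: take it outright */
		return (unused + reserved) * priority >=
		       (num_bytes + reserved) * 10;
	}

	int main(void)
	{
		uint64_t mib = 1024 * 1024;

		/* 100MiB unused, 0 reserved, ask 50MiB at priority 6:
		 * 100 * 6 = 600 >= 50 * 10 = 500 -> granted (prints 1) */
		printf("%d\n", may_reserve(100 * mib, 0, 50 * mib, 6));
		/* asking 70MiB instead: 600 >= 700 fails -> refused (prints 0) */
		printf("%d\n", may_reserve(100 * mib, 0, 70 * mib, 6));
		return 0;
	}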
+ static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
+                                            struct btrfs_root *root)
+ {
+       struct btrfs_block_rsv *block_rsv;
+       if (root->ref_cows)
+               block_rsv = trans->block_rsv;
+       else
+               block_rsv = root->block_rsv;
+       if (!block_rsv)
+               block_rsv = &root->fs_info->empty_block_rsv;
+       return block_rsv;
+ }
+ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+                              u64 num_bytes)
+ {
+       int ret = -ENOSPC;
+       spin_lock(&block_rsv->lock);
+       if (block_rsv->reserved >= num_bytes) {
+               block_rsv->reserved -= num_bytes;
+               if (block_rsv->reserved < block_rsv->size)
+                       block_rsv->full = 0;
+               ret = 0;
+       }
+       spin_unlock(&block_rsv->lock);
+       return ret;
+ }
+ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+                               u64 num_bytes, int update_size)
+ {
+       spin_lock(&block_rsv->lock);
+       block_rsv->reserved += num_bytes;
+       if (update_size)
+               block_rsv->size += num_bytes;
+       else if (block_rsv->reserved >= block_rsv->size)
+               block_rsv->full = 1;
+       spin_unlock(&block_rsv->lock);
+ }
+ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+                            struct btrfs_block_rsv *dest, u64 num_bytes)
+ {
+       struct btrfs_space_info *space_info = block_rsv->space_info;
+       spin_lock(&block_rsv->lock);
+       if (num_bytes == (u64)-1)
+               num_bytes = block_rsv->size;
+       block_rsv->size -= num_bytes;
+       if (block_rsv->reserved >= block_rsv->size) {
+               num_bytes = block_rsv->reserved - block_rsv->size;
+               block_rsv->reserved = block_rsv->size;
+               block_rsv->full = 1;
+       } else {
+               num_bytes = 0;
+       }
+       spin_unlock(&block_rsv->lock);
+       if (num_bytes > 0) {
+               if (dest) {
+                       block_rsv_add_bytes(dest, num_bytes, 0);
+               } else {
+                       spin_lock(&space_info->lock);
+                       space_info->bytes_reserved -= num_bytes;
+                       spin_unlock(&space_info->lock);
                }
-               spin_lock(&meta_sinfo->lock);
-               meta_sinfo->bytes_delalloc -= num_bytes;
-               spin_unlock(&meta_sinfo->lock);
-               printk(KERN_ERR "enospc, has %d, reserved %d\n",
-                      BTRFS_I(inode)->outstanding_extents,
-                      BTRFS_I(inode)->reserved_extents);
-               dump_space_info(meta_sinfo, 0, 0);
-               return -ENOSPC;
        }
+ }
  
-       BTRFS_I(inode)->reserved_extents += num_items;
-       check_force_delalloc(meta_sinfo);
-       spin_unlock(&meta_sinfo->lock);
+ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
+                                  struct btrfs_block_rsv *dst, u64 num_bytes)
+ {
+       int ret;
  
-       if (!flushed && force_delalloc)
-               filemap_flush(inode->i_mapping);
+       ret = block_rsv_use_bytes(src, num_bytes);
+       if (ret)
+               return ret;
  
+       block_rsv_add_bytes(dst, num_bytes, 1);
        return 0;
  }
  
- /*
-  * unreserve num_items number of items worth of metadata space.  This needs to
-  * be paired with btrfs_reserve_metadata_space.
-  *
-  * NOTE: if you have the option, run this _AFTER_ you do a
-  * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
-  * operations which will result in more used metadata, so we want to make sure we
-  * can do that without issue.
-  */
- int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
+ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
  {
-       struct btrfs_fs_info *info = root->fs_info;
-       struct btrfs_space_info *meta_sinfo;
-       u64 num_bytes;
+       memset(rsv, 0, sizeof(*rsv));
+       spin_lock_init(&rsv->lock);
+       atomic_set(&rsv->usage, 1);
+       rsv->priority = 6;
+       INIT_LIST_HEAD(&rsv->list);
+ }
+ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+ {
+       struct btrfs_block_rsv *block_rsv;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        u64 alloc_target;
-       bool bug = false;
  
-       /* get the space info for where the metadata will live */
-       alloc_target = btrfs_get_alloc_profile(root, 0);
-       meta_sinfo = __find_space_info(info, alloc_target);
+       block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+       if (!block_rsv)
+               return NULL;
  
-       num_bytes = calculate_bytes_needed(root, num_items);
+       btrfs_init_block_rsv(block_rsv);
  
-       spin_lock(&meta_sinfo->lock);
-       if (meta_sinfo->bytes_may_use < num_bytes) {
-               bug = true;
-               meta_sinfo->bytes_may_use = 0;
-       } else {
-               meta_sinfo->bytes_may_use -= num_bytes;
-       }
-       spin_unlock(&meta_sinfo->lock);
+       alloc_target = btrfs_get_alloc_profile(root, 0);
+       block_rsv->space_info = __find_space_info(fs_info,
+                                                 BTRFS_BLOCK_GROUP_METADATA);
  
-       BUG_ON(bug);
+       return block_rsv;
+ }
  
-       return 0;
+ void btrfs_free_block_rsv(struct btrfs_root *root,
+                         struct btrfs_block_rsv *rsv)
+ {
+       if (rsv && atomic_dec_and_test(&rsv->usage)) {
+               btrfs_block_rsv_release(root, rsv, (u64)-1);
+               if (!rsv->durable)
+                       kfree(rsv);
+       }
  }
  
  /*
-  * Reserve some metadata space for use.  We'll calculate the worst case number
-  * of bytes that would be needed to modify num_items number of items.  If we
-  * have space, fantastic, if not, you get -ENOSPC.  Please call
-  * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
-  * items you reserved, since whatever metadata you needed should have already
-  * been allocated.
-  *
-  * This will commit the transaction to make more space if we don't have enough
-  * metadata space.  The only time we don't do this is if we're reserving space
-  * inside of a transaction, then we will just return -ENOSPC and it is the
-  * caller's responsibility to handle it properly.
+  * make the block_rsv struct be able to capture freed space.
+  * the captured space will be re-added to the block_rsv struct
+  * after the transaction commits
   */
- int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+                                struct btrfs_block_rsv *block_rsv)
  {
-       struct btrfs_fs_info *info = root->fs_info;
-       struct btrfs_space_info *meta_sinfo;
-       u64 num_bytes;
-       u64 used;
-       u64 alloc_target;
-       int retries = 0;
+       block_rsv->durable = 1;
+       mutex_lock(&fs_info->durable_block_rsv_mutex);
+       list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
+       mutex_unlock(&fs_info->durable_block_rsv_mutex);
+ }
  
-       /* get the space info for where the metadata will live */
-       alloc_target = btrfs_get_alloc_profile(root, 0);
-       meta_sinfo = __find_space_info(info, alloc_target);
+ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       struct btrfs_block_rsv *block_rsv,
+                       u64 num_bytes, int *retries)
+ {
+       int ret;
  
-       num_bytes = calculate_bytes_needed(root, num_items);
+       if (num_bytes == 0)
+               return 0;
  again:
-       spin_lock(&meta_sinfo->lock);
+       ret = reserve_metadata_bytes(block_rsv, num_bytes);
+       if (!ret) {
+               block_rsv_add_bytes(block_rsv, num_bytes, 1);
+               return 0;
+       }
  
-       if (unlikely(!meta_sinfo->bytes_root))
-               meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+       ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
+       if (ret > 0)
+               goto again;
+       return ret;
+ }
  
-       if (!retries)
-               meta_sinfo->bytes_may_use += num_bytes;
+ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         struct btrfs_block_rsv *block_rsv,
+                         u64 min_reserved, int min_factor)
+ {
+       u64 num_bytes = 0;
+       int commit_trans = 0;
+       int ret = -ENOSPC;
  
-       used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-               meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+       if (!block_rsv)
+               return 0;
  
-       if (used > meta_sinfo->total_bytes) {
-               retries++;
-               if (retries == 1) {
-                       if (maybe_allocate_chunk(root, meta_sinfo))
-                               goto again;
-                       retries++;
-               } else {
-                       spin_unlock(&meta_sinfo->lock);
-               }
+       spin_lock(&block_rsv->lock);
+       if (min_factor > 0)
+               num_bytes = div_factor(block_rsv->size, min_factor);
+       if (min_reserved > num_bytes)
+               num_bytes = min_reserved;
  
-               if (retries == 2) {
-                       flush_delalloc(root, meta_sinfo);
-                       goto again;
+       if (block_rsv->reserved >= num_bytes) {
+               ret = 0;
+       } else {
+               num_bytes -= block_rsv->reserved;
+               if (block_rsv->durable &&
+                   block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
+                       commit_trans = 1;
+       }
+       spin_unlock(&block_rsv->lock);
+       if (!ret)
+               return 0;
+       if (block_rsv->refill_used) {
+               ret = reserve_metadata_bytes(block_rsv, num_bytes);
+               if (!ret) {
+                       block_rsv_add_bytes(block_rsv, num_bytes, 0);
+                       return 0;
                }
-               spin_lock(&meta_sinfo->lock);
-               meta_sinfo->bytes_may_use -= num_bytes;
-               spin_unlock(&meta_sinfo->lock);
+       }
  
-               dump_space_info(meta_sinfo, 0, 0);
-               return -ENOSPC;
+       if (commit_trans) {
+               if (trans)
+                       return -EAGAIN;
+               trans = btrfs_join_transaction(root, 1);
+               BUG_ON(IS_ERR(trans));
+               ret = btrfs_commit_transaction(trans, root);
+               return 0;
        }
  
-       check_force_delalloc(meta_sinfo);
-       spin_unlock(&meta_sinfo->lock);
+       WARN_ON(1);
+       printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
+               block_rsv->size, block_rsv->reserved,
+               block_rsv->freed[0], block_rsv->freed[1]);
  
-       return 0;
+       return -ENOSPC;
+ }
+ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+                           struct btrfs_block_rsv *dst_rsv,
+                           u64 num_bytes)
+ {
+       return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+ }
+ void btrfs_block_rsv_release(struct btrfs_root *root,
+                            struct btrfs_block_rsv *block_rsv,
+                            u64 num_bytes)
+ {
+       struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+       if (global_rsv->full || global_rsv == block_rsv ||
+           block_rsv->space_info != global_rsv->space_info)
+               global_rsv = NULL;
+       block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
  }
  
  /*
-  * This will check the space that the inode allocates from to make sure we have
-  * enough space for bytes.
+  * helper to calculate the size of the global block reservation.
+  * the desired value is the sum of the space used by the extent tree,
+  * the checksum tree and the root tree
   */
- int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
-                               u64 bytes)
+ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
  {
-       struct btrfs_space_info *data_sinfo;
-       u64 used;
-       int ret = 0, committed = 0, flushed = 0;
+       struct btrfs_space_info *sinfo;
+       u64 num_bytes;
+       u64 meta_used;
+       u64 data_used;
+       int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+ #if 0
+       /*
+        * per tree used space accounting can be inaccurate, so we
+        * can't rely on it.
+        */
+       spin_lock(&fs_info->extent_root->accounting_lock);
+       num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
+       spin_unlock(&fs_info->extent_root->accounting_lock);
  
-       /* make sure bytes are sectorsize aligned */
-       bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+       spin_lock(&fs_info->csum_root->accounting_lock);
+       num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
+       spin_unlock(&fs_info->csum_root->accounting_lock);
  
-       data_sinfo = BTRFS_I(inode)->space_info;
-       if (!data_sinfo)
-               goto alloc;
+       spin_lock(&fs_info->tree_root->accounting_lock);
+       num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
+       spin_unlock(&fs_info->tree_root->accounting_lock);
+ #endif
+       sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+       spin_lock(&sinfo->lock);
+       data_used = sinfo->bytes_used;
+       spin_unlock(&sinfo->lock);
  
- again:
-       /* make sure we have enough space to handle the data first */
-       spin_lock(&data_sinfo->lock);
-       used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
-               data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
-               data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
-               data_sinfo->bytes_super;
+       sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+       spin_lock(&sinfo->lock);
+       meta_used = sinfo->bytes_used;
+       spin_unlock(&sinfo->lock);
  
-       if (used + bytes > data_sinfo->total_bytes) {
-               struct btrfs_trans_handle *trans;
+       num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
+                   csum_size * 2;
+       num_bytes += div64_u64(data_used + meta_used, 50);
  
-               if (!flushed) {
-                       spin_unlock(&data_sinfo->lock);
-                       flush_delalloc(root, data_sinfo);
-                       flushed = 1;
-                       goto again;
-               }
+       if (num_bytes * 3 > meta_used)
+               num_bytes = div64_u64(meta_used, 3);
  
-               /*
-                * if we don't have enough free bytes in this space then we need
-                * to alloc a new chunk.
-                */
-               if (!data_sinfo->full) {
-                       u64 alloc_target;
+       return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+ }
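
With hypothetical round numbers (4KiB blocks, 4-byte crc32c checksums, 4KiB
leaves) the sizing above reserves two copies of every data block's checksum
plus 2% of all used space, capped at a third of the metadata in use. A
user-space sketch of the same arithmetic, not the kernel function:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t gib = 1024ULL * 1024 * 1024;
		uint64_t data_used = 100 * gib;	/* hypothetical */
		uint64_t meta_used = 2 * gib;	/* hypothetical */
		int blocksize_bits = 12;	/* 4KiB blocks */
		int csum_size = 4;		/* crc32c */
		uint64_t align = 4096 << 10;	/* leafsize << 10 */
		uint64_t num_bytes;

		/* two copies of every data block's checksum... */
		num_bytes = (data_used >> blocksize_bits) * csum_size * 2;
		/* ...plus 2% of all used space... */
		num_bytes += (data_used + meta_used) / 50;
		/* ...but never more than a third of the metadata in use */
		if (num_bytes * 3 > meta_used)
			num_bytes = meta_used / 3;
		/* round up to the alignment, as ALIGN() does */
		num_bytes = (num_bytes + align - 1) / align * align;

		/* prints 684 for this example */
		printf("%llu MiB\n",
		       (unsigned long long)(num_bytes / (1024 * 1024)));
		return 0;
	}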
  
-                       data_sinfo->force_alloc = 1;
-                       spin_unlock(&data_sinfo->lock);
- alloc:
-                       alloc_target = btrfs_get_alloc_profile(root, 1);
-                       trans = btrfs_start_transaction(root, 1);
-                       if (!trans)
-                               return -ENOMEM;
+ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
+ {
+       struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+       struct btrfs_space_info *sinfo = block_rsv->space_info;
+       u64 num_bytes;
  
-                       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                            bytes + 2 * 1024 * 1024,
-                                            alloc_target, 0);
-                       btrfs_end_transaction(trans, root);
-                       if (ret)
-                               return ret;
+       num_bytes = calc_global_metadata_size(fs_info);
  
-                       if (!data_sinfo) {
-                               btrfs_set_inode_space_info(root, inode);
-                               data_sinfo = BTRFS_I(inode)->space_info;
-                       }
-                       goto again;
-               }
-               spin_unlock(&data_sinfo->lock);
+       spin_lock(&block_rsv->lock);
+       spin_lock(&sinfo->lock);
  
-               /* commit the current transaction and try again */
-               if (!committed && !root->fs_info->open_ioctl_trans) {
-                       committed = 1;
-                       trans = btrfs_join_transaction(root, 1);
-                       if (!trans)
-                               return -ENOMEM;
-                       ret = btrfs_commit_transaction(trans, root);
-                       if (ret)
-                               return ret;
-                       goto again;
-               }
+       block_rsv->size = num_bytes;
  
-               printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
-                      ", %llu bytes_used, %llu bytes_reserved, "
-                      "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
-                      "%llu total\n", (unsigned long long)bytes,
-                      (unsigned long long)data_sinfo->bytes_delalloc,
-                      (unsigned long long)data_sinfo->bytes_used,
-                      (unsigned long long)data_sinfo->bytes_reserved,
-                      (unsigned long long)data_sinfo->bytes_pinned,
-                      (unsigned long long)data_sinfo->bytes_readonly,
-                      (unsigned long long)data_sinfo->bytes_may_use,
-                      (unsigned long long)data_sinfo->total_bytes);
-               return -ENOSPC;
+       num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
+                   sinfo->bytes_reserved + sinfo->bytes_readonly;
+       if (sinfo->total_bytes > num_bytes) {
+               num_bytes = sinfo->total_bytes - num_bytes;
+               block_rsv->reserved += num_bytes;
+               sinfo->bytes_reserved += num_bytes;
        }
-       data_sinfo->bytes_may_use += bytes;
-       BTRFS_I(inode)->reserved_bytes += bytes;
-       spin_unlock(&data_sinfo->lock);
  
-       return 0;
+       if (block_rsv->reserved >= block_rsv->size) {
+               num_bytes = block_rsv->reserved - block_rsv->size;
+               sinfo->bytes_reserved -= num_bytes;
+               block_rsv->reserved = block_rsv->size;
+               block_rsv->full = 1;
+       }
+ #if 0
+       printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
+               block_rsv->size, block_rsv->reserved);
+ #endif
+       spin_unlock(&sinfo->lock);
+       spin_unlock(&block_rsv->lock);
  }
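
  /*
   * Worked sketch of the sizing above, with assumed numbers: if
   * sinfo->total_bytes is 8GiB and used + pinned + reserved + readonly
   * add up to 6GiB, there is 2GiB of slack; the rsv is first topped up
   * from that slack and then trimmed back to the calculated size, with
   * the excess returned to sinfo->bytes_reserved and the rsv marked full.
   */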
  
- /*
-  * if there was an error for whatever reason after calling
-  * btrfs_check_data_free_space, call this so we can cleanup the counters.
-  */
- void btrfs_free_reserved_data_space(struct btrfs_root *root,
-                                   struct inode *inode, u64 bytes)
+ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
  {
-       struct btrfs_space_info *data_sinfo;
+       struct btrfs_space_info *space_info;
  
-       /* make sure bytes are sectorsize aligned */
-       bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+       space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+       fs_info->chunk_block_rsv.space_info = space_info;
+       fs_info->chunk_block_rsv.priority = 10;
  
-       data_sinfo = BTRFS_I(inode)->space_info;
-       spin_lock(&data_sinfo->lock);
-       data_sinfo->bytes_may_use -= bytes;
-       BTRFS_I(inode)->reserved_bytes -= bytes;
-       spin_unlock(&data_sinfo->lock);
+       space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+       fs_info->global_block_rsv.space_info = space_info;
+       fs_info->global_block_rsv.priority = 10;
+       fs_info->global_block_rsv.refill_used = 1;
+       fs_info->delalloc_block_rsv.space_info = space_info;
+       fs_info->trans_block_rsv.space_info = space_info;
+       fs_info->empty_block_rsv.space_info = space_info;
+       fs_info->empty_block_rsv.priority = 10;
+       fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
+       fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
+       fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
+       fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+       fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
+       btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
+       btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
+       update_global_block_rsv(fs_info);
  }
  
- /* called when we are adding a delalloc extent to the inode's io_tree */
- void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
-                                 u64 bytes)
+ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
  {
-       struct btrfs_space_info *data_sinfo;
+       block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+       WARN_ON(fs_info->delalloc_block_rsv.size > 0);
+       WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
+       WARN_ON(fs_info->trans_block_rsv.size > 0);
+       WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+       WARN_ON(fs_info->chunk_block_rsv.size > 0);
+       WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+ }
  
-       /* get the space info for where this inode will be storing its data */
-       data_sinfo = BTRFS_I(inode)->space_info;
+ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
+ {
+       return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+               3 * num_items;
+ }
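
  /*
   * Worked example for the helper above, assuming 4K leaf and node
   * sizes and BTRFS_MAX_LEVEL == 8: one item costs
   * (4096 + 4096 * 7) * 3 = 96KiB, so a reservation for 4 items (as
   * the orphan code below makes) asks for 384KiB of metadata space.
   */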
  
-       /* make sure we have enough space to handle the data first */
-       spin_lock(&data_sinfo->lock);
-       data_sinfo->bytes_delalloc += bytes;
+ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                int num_items, int *retries)
+ {
+       u64 num_bytes;
+       int ret;
  
-       /*
-        * we are adding a delalloc extent without calling
-        * btrfs_check_data_free_space first.  This happens on a weird
-        * writepage condition, but shouldn't hurt our accounting
-        */
-       if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
-               data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
-               BTRFS_I(inode)->reserved_bytes = 0;
-       } else {
-               data_sinfo->bytes_may_use -= bytes;
-               BTRFS_I(inode)->reserved_bytes -= bytes;
-       }
+       if (num_items == 0 || root->fs_info->chunk_root == root)
+               return 0;
  
-       spin_unlock(&data_sinfo->lock);
+       num_bytes = calc_trans_metadata_size(root, num_items);
+       ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
+                                 num_bytes, retries);
+       if (!ret) {
+               trans->bytes_reserved += num_bytes;
+               trans->block_rsv = &root->fs_info->trans_block_rsv;
+       }
+       return ret;
  }
  
- /* called when we are clearing an delalloc extent from the inode's io_tree */
- void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
-                             u64 bytes)
+ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root)
  {
-       struct btrfs_space_info *info;
+       if (!trans->bytes_reserved)
+               return;
  
-       info = BTRFS_I(inode)->space_info;
+       BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
+       btrfs_block_rsv_release(root, trans->block_rsv,
+                               trans->bytes_reserved);
+       trans->bytes_reserved = 0;
+ }
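
  /*
   * Illustrative pairing for the two helpers above (a sketch; the real
   * callers live in the transaction code, outside this hunk): a caller
   * that will touch one item does roughly
   *
   *      trans = btrfs_start_transaction(root, 1);   (reserves 1 item)
   *      ... modify the item ...
   *      btrfs_end_transaction(trans, root);   (releases the remainder)
   *
   * and passing num_items == 0, as the snapshot-drop path below does,
   * skips the reservation entirely.
   */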
  
-       spin_lock(&info->lock);
-       info->bytes_delalloc -= bytes;
-       spin_unlock(&info->lock);
+ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+                                 struct inode *inode)
+ {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+       struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
+       /*
+        * one for deleting the orphan item, one for updating the inode and
+        * two for calling btrfs_truncate_inode_items.
+        *
+        * btrfs_truncate_inode_items is a delete operation; in most cases
+        * it frees more space than it uses. So two units of metadata
+        * space should be enough to call it many times. If all of the
+        * metadata space is used, we can commit the transaction and use
+        * the space it freed.
+        */
+       u64 num_bytes = calc_trans_metadata_size(root, 4);
+       return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
  }
  
- static void force_metadata_allocation(struct btrfs_fs_info *info)
+ void btrfs_orphan_release_metadata(struct inode *inode)
  {
-       struct list_head *head = &info->space_info;
-       struct btrfs_space_info *found;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 num_bytes = calc_trans_metadata_size(root, 4);
+       btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
+ }
  
-       rcu_read_lock();
-       list_for_each_entry_rcu(found, head, list) {
-               if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
-                       found->force_alloc = 1;
-       }
-       rcu_read_unlock();
+ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+                               struct btrfs_pending_snapshot *pending)
+ {
+       struct btrfs_root *root = pending->root;
+       struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+       struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
+       /*
+        * two for the root's back/forward refs, two for directory entries
+        * and one for the root of the snapshot.
+        */
+       u64 num_bytes = calc_trans_metadata_size(root, 5);
+       dst_rsv->space_info = src_rsv->space_info;
+       return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
  }
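
  /*
   * With the illustrative 96KiB-per-item figure worked out above, a
   * pending snapshot therefore sets aside roughly 5 * 96KiB = 480KiB
   * until its creation commits.
   */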
  
- static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *extent_root, u64 alloc_bytes,
-                         u64 flags, int force)
+ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
  {
-       struct btrfs_space_info *space_info;
-       struct btrfs_fs_info *fs_info = extent_root->fs_info;
-       u64 thresh;
-       int ret = 0;
+       return num_bytes >> 3;
+ }
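
  /*
   * i.e. one byte of csum reservation per eight bytes of data: an
   * illustrative 1MiB delalloc range reserves an extra 128KiB for
   * checksum tree updates.
   */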
  
-       mutex_lock(&fs_info->chunk_mutex);
+ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
+ {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
+       u64 to_reserve;
+       int nr_extents;
+       int retries = 0;
+       int ret;
  
-       flags = btrfs_reduce_alloc_profile(extent_root, flags);
+       if (btrfs_transaction_in_commit(root->fs_info))
+               schedule_timeout(1);
  
-       space_info = __find_space_info(extent_root->fs_info, flags);
-       if (!space_info) {
-               ret = update_space_info(extent_root->fs_info, flags,
-                                       0, 0, &space_info);
-               BUG_ON(ret);
+       num_bytes = ALIGN(num_bytes, root->sectorsize);
+ again:
+       spin_lock(&BTRFS_I(inode)->accounting_lock);
+       nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
+       if (nr_extents > BTRFS_I(inode)->reserved_extents) {
+               nr_extents -= BTRFS_I(inode)->reserved_extents;
+               to_reserve = calc_trans_metadata_size(root, nr_extents);
+       } else {
+               nr_extents = 0;
+               to_reserve = 0;
        }
-       BUG_ON(!space_info);
  
-       spin_lock(&space_info->lock);
-       if (space_info->force_alloc)
-               force = 1;
-       if (space_info->full) {
-               spin_unlock(&space_info->lock);
-               goto out;
+       to_reserve += calc_csum_metadata_size(inode, num_bytes);
+       ret = reserve_metadata_bytes(block_rsv, to_reserve);
+       if (ret) {
+               spin_unlock(&BTRFS_I(inode)->accounting_lock);
+               ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
+                                          &retries);
+               if (ret > 0)
+                       goto again;
+               return ret;
        }
  
-       thresh = space_info->total_bytes - space_info->bytes_readonly;
-       thresh = div_factor(thresh, 8);
-       if (!force &&
-          (space_info->bytes_used + space_info->bytes_pinned +
-           space_info->bytes_reserved + alloc_bytes) < thresh) {
-               spin_unlock(&space_info->lock);
-               goto out;
-       }
-       spin_unlock(&space_info->lock);
+       BTRFS_I(inode)->reserved_extents += nr_extents;
+       atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+       spin_unlock(&BTRFS_I(inode)->accounting_lock);
  
-       /*
-        * if we're doing a data chunk, go ahead and make sure that
-        * we keep a reasonable number of metadata chunks allocated in the
-        * FS as well.
-        */
-       if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
-               fs_info->data_chunk_allocations++;
-               if (!(fs_info->data_chunk_allocations %
-                     fs_info->metadata_ratio))
-                       force_metadata_allocation(fs_info);
+       block_rsv_add_bytes(block_rsv, to_reserve, 1);
+       if (block_rsv->size > 512 * 1024 * 1024)
+               shrink_delalloc(NULL, root, to_reserve);
+       return 0;
+ }
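
  /*
   * Accounting sketch for the helper above, with assumed numbers: if an
   * inode already has 2 outstanding extents, 1 of them reserved, then
   * adding one more gives nr_extents = (2 + 1) - 1 = 2, so the call
   * reserves metadata for 2 extents plus num_bytes / 8 worth of csum
   * space (see calc_csum_metadata_size() above).
   */
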
+ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
+ {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 to_free;
+       int nr_extents;
+       num_bytes = ALIGN(num_bytes, root->sectorsize);
+       atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+       spin_lock(&BTRFS_I(inode)->accounting_lock);
+       nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
+       if (nr_extents < BTRFS_I(inode)->reserved_extents) {
+               nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
+               BTRFS_I(inode)->reserved_extents -= nr_extents;
+       } else {
+               nr_extents = 0;
        }
+       spin_unlock(&BTRFS_I(inode)->accounting_lock);
  
-       ret = btrfs_alloc_chunk(trans, extent_root, flags);
-       spin_lock(&space_info->lock);
+       to_free = calc_csum_metadata_size(inode, num_bytes);
+       if (nr_extents > 0)
+               to_free += calc_trans_metadata_size(root, nr_extents);
+       btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
+                               to_free);
+ }
+
+ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+ {
+       int ret;
+       ret = btrfs_check_data_free_space(inode, num_bytes);
        if (ret)
-               space_info->full = 1;
-       space_info->force_alloc = 0;
-       spin_unlock(&space_info->lock);
- out:
-       mutex_unlock(&extent_root->fs_info->chunk_mutex);
-       return ret;
+               return ret;
+       ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
+       if (ret) {
+               btrfs_free_reserved_data_space(inode, num_bytes);
+               return ret;
+       }
+       return 0;
+ }
+
+ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+ {
+       btrfs_delalloc_release_metadata(inode, num_bytes);
+       btrfs_free_reserved_data_space(inode, num_bytes);
  }
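
  /*
   * Hypothetical write-path sketch for the two wrappers above (the
   * copy step is a placeholder, not part of this change):
   *
   *      ret = btrfs_delalloc_reserve_space(inode, num_bytes);
   *      if (ret)
   *              return ret;
   *      ret = copy_the_data(inode, num_bytes);   (placeholder)
   *      if (ret)
   *              btrfs_delalloc_release_space(inode, num_bytes);
   */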
  
  static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
-                             u64 bytenr, u64 num_bytes, int alloc,
-                             int mark_free)
+                             u64 bytenr, u64 num_bytes, int alloc)
  {
        struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *info = root->fs_info;
+       int factor;
        u64 total = num_bytes;
        u64 old_val;
        u64 byte_in_group;
                cache = btrfs_lookup_block_group(info, bytenr);
                if (!cache)
                        return -1;
+               if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
+                                   BTRFS_BLOCK_GROUP_RAID1 |
+                                   BTRFS_BLOCK_GROUP_RAID10))
+                       factor = 2;
+               else
+                       factor = 1;
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
  
                        old_val += num_bytes;
                        btrfs_set_block_group_used(&cache->item, old_val);
                        cache->reserved -= num_bytes;
-                       cache->space_info->bytes_used += num_bytes;
                        cache->space_info->bytes_reserved -= num_bytes;
-                       if (cache->ro)
-                               cache->space_info->bytes_readonly -= num_bytes;
+                       cache->space_info->bytes_used += num_bytes;
+                       cache->space_info->disk_used += num_bytes * factor;
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);
                } else {
                        old_val -= num_bytes;
-                       cache->space_info->bytes_used -= num_bytes;
-                       if (cache->ro)
-                               cache->space_info->bytes_readonly += num_bytes;
                        btrfs_set_block_group_used(&cache->item, old_val);
+                       cache->pinned += num_bytes;
+                       cache->space_info->bytes_pinned += num_bytes;
+                       cache->space_info->bytes_used -= num_bytes;
+                       cache->space_info->disk_used -= num_bytes * factor;
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);
-                       if (mark_free) {
-                               int ret;
  
-                               ret = btrfs_discard_extent(root, bytenr,
-                                                          num_bytes);
-                               WARN_ON(ret);
-                               ret = btrfs_add_free_space(cache, bytenr,
-                                                          num_bytes);
-                               WARN_ON(ret);
-                       }
+                       set_extent_dirty(info->pinned_extents,
+                                        bytenr, bytenr + num_bytes - 1,
+                                        GFP_NOFS | __GFP_NOFAIL);
                }
                btrfs_put_block_group(cache);
                total -= num_bytes;
@@@ -3546,18 -3857,10 +3857,10 @@@ static u64 first_logical_byte(struct bt
        return bytenr;
  }
  
- /*
-  * this function must be called within transaction
-  */
- int btrfs_pin_extent(struct btrfs_root *root,
-                    u64 bytenr, u64 num_bytes, int reserved)
+ static int pin_down_extent(struct btrfs_root *root,
+                          struct btrfs_block_group_cache *cache,
+                          u64 bytenr, u64 num_bytes, int reserved)
  {
-       struct btrfs_fs_info *fs_info = root->fs_info;
-       struct btrfs_block_group_cache *cache;
-       cache = btrfs_lookup_block_group(fs_info, bytenr);
-       BUG_ON(!cache);
        spin_lock(&cache->space_info->lock);
        spin_lock(&cache->lock);
        cache->pinned += num_bytes;
        spin_unlock(&cache->lock);
        spin_unlock(&cache->space_info->lock);
  
-       btrfs_put_block_group(cache);
+       set_extent_dirty(root->fs_info->pinned_extents, bytenr,
+                        bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+       return 0;
+ }
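
  /*
   * Sketch of what "pinning" means in pin_down_extent() above: the range
   * is accounted as pinned in both the block group and its space_info,
   * and recorded in the pinned_extents tree; unpin_extent_range() later
   * walks that tree at commit time and returns the space to the
   * free-space cache.
   */
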
+ /*
+  * this function must be called within a transaction
+  */
+ int btrfs_pin_extent(struct btrfs_root *root,
+                    u64 bytenr, u64 num_bytes, int reserved)
+ {
+       struct btrfs_block_group_cache *cache;
  
-       set_extent_dirty(fs_info->pinned_extents,
-                        bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+       cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+       BUG_ON(!cache);
+       pin_down_extent(root, cache, bytenr, num_bytes, reserved);
+       btrfs_put_block_group(cache);
        return 0;
  }
  
- static int update_reserved_extents(struct btrfs_block_group_cache *cache,
-                                  u64 num_bytes, int reserve)
+ /*
+  * update the size of reserved extents. this function may return -EAGAIN
+  * if 'reserve' is true or 'sinfo' is false.
+  */
+ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+                                u64 num_bytes, int reserve, int sinfo)
  {
-       spin_lock(&cache->space_info->lock);
-       spin_lock(&cache->lock);
-       if (reserve) {
-               cache->reserved += num_bytes;
-               cache->space_info->bytes_reserved += num_bytes;
+       int ret = 0;
+       if (sinfo) {
+               struct btrfs_space_info *space_info = cache->space_info;
+               spin_lock(&space_info->lock);
+               spin_lock(&cache->lock);
+               if (reserve) {
+                       if (cache->ro) {
+                               ret = -EAGAIN;
+                       } else {
+                               cache->reserved += num_bytes;
+                               space_info->bytes_reserved += num_bytes;
+                       }
+               } else {
+                       if (cache->ro)
+                               space_info->bytes_readonly += num_bytes;
+                       cache->reserved -= num_bytes;
+                       space_info->bytes_reserved -= num_bytes;
+               }
+               spin_unlock(&cache->lock);
+               spin_unlock(&space_info->lock);
        } else {
-               cache->reserved -= num_bytes;
-               cache->space_info->bytes_reserved -= num_bytes;
+               spin_lock(&cache->lock);
+               if (cache->ro) {
+                       ret = -EAGAIN;
+               } else {
+                       if (reserve)
+                               cache->reserved += num_bytes;
+                       else
+                               cache->reserved -= num_bytes;
+               }
+               spin_unlock(&cache->lock);
        }
-       spin_unlock(&cache->lock);
-       spin_unlock(&cache->space_info->lock);
-       return 0;
+       return ret;
  }
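
  /*
   * Caller sketch for the -EAGAIN contract above, mirroring what
   * btrfs_free_tree_block() does later in this file: try the cheap
   * cache-only update first and fall back to the space_info-aware
   * variant if the group went read-only in the meantime:
   *
   *      ret = update_reserved_bytes(cache, len, 0, 0);
   *      if (ret == -EAGAIN)
   *              update_reserved_bytes(cache, len, 0, 1);
   */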
  
  int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
                fs_info->pinned_extents = &fs_info->freed_extents[0];
  
        up_write(&fs_info->extent_commit_sem);
+       update_global_block_rsv(fs_info);
        return 0;
  }
  
@@@ -3647,14 -3992,21 +3992,21 @@@ static int unpin_extent_range(struct bt
                        btrfs_add_free_space(cache, start, len);
                }
  
+               start += len;
                spin_lock(&cache->space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
                cache->space_info->bytes_pinned -= len;
+               if (cache->ro) {
+                       cache->space_info->bytes_readonly += len;
+               } else if (cache->reserved_pinned > 0) {
+                       len = min(len, cache->reserved_pinned);
+                       cache->reserved_pinned -= len;
+                       cache->space_info->bytes_reserved += len;
+               }
                spin_unlock(&cache->lock);
                spin_unlock(&cache->space_info->lock);
-               start += len;
        }
  
        if (cache)
@@@ -3667,8 -4019,11 +4019,11 @@@ int btrfs_finish_extent_commit(struct b
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_io_tree *unpin;
+       struct btrfs_block_rsv *block_rsv;
+       struct btrfs_block_rsv *next_rsv;
        u64 start;
        u64 end;
+       int idx;
        int ret;
  
        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
                cond_resched();
        }
  
-       return ret;
- }
- static int pin_down_bytes(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root,
-                         struct btrfs_path *path,
-                         u64 bytenr, u64 num_bytes,
-                         int is_data, int reserved,
-                         struct extent_buffer **must_clean)
- {
-       int err = 0;
-       struct extent_buffer *buf;
-       if (is_data)
-               goto pinit;
-       /*
-        * discard is sloooow, and so triggering discards on
-        * individual btree blocks isn't a good plan.  Just
-        * pin everything in discard mode.
-        */
-       if (btrfs_test_opt(root, DISCARD))
-               goto pinit;
+       mutex_lock(&fs_info->durable_block_rsv_mutex);
+       list_for_each_entry_safe(block_rsv, next_rsv,
+                                &fs_info->durable_block_rsv_list, list) {
  
-       buf = btrfs_find_tree_block(root, bytenr, num_bytes);
-       if (!buf)
-               goto pinit;
+               idx = trans->transid & 0x1;
+               if (block_rsv->freed[idx] > 0) {
+                       block_rsv_add_bytes(block_rsv,
+                                           block_rsv->freed[idx], 0);
+                       block_rsv->freed[idx] = 0;
+               }
+               if (atomic_read(&block_rsv->usage) == 0) {
+                       btrfs_block_rsv_release(root, block_rsv, (u64)-1);
  
-       /* we can reuse a block if it hasn't been written
-        * and it is from this transaction.  We can't
-        * reuse anything from the tree log root because
-        * it has tiny sub-transactions.
-        */
-       if (btrfs_buffer_uptodate(buf, 0) &&
-           btrfs_try_tree_lock(buf)) {
-               u64 header_owner = btrfs_header_owner(buf);
-               u64 header_transid = btrfs_header_generation(buf);
-               if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
-                   header_transid == trans->transid &&
-                   !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-                       *must_clean = buf;
-                       return 1;
+                       if (block_rsv->freed[0] == 0 &&
+                           block_rsv->freed[1] == 0) {
+                               list_del_init(&block_rsv->list);
+                               kfree(block_rsv);
+                       }
+               } else {
+                       btrfs_block_rsv_release(root, block_rsv, 0);
                }
-               btrfs_tree_unlock(buf);
        }
-       free_extent_buffer(buf);
- pinit:
-       if (path)
-               btrfs_set_path_blocking(path);
-       /* unlocks the pinned mutex */
-       btrfs_pin_extent(root, bytenr, num_bytes, reserved);
+       mutex_unlock(&fs_info->durable_block_rsv_mutex);
  
-       BUG_ON(err < 0);
        return 0;
  }
  
@@@ -3902,9 -4228,6 +4228,6 @@@ static int __btrfs_free_extent(struct b
                        BUG_ON(ret);
                }
        } else {
-               int mark_free = 0;
-               struct extent_buffer *must_clean = NULL;
                if (found_extent) {
                        BUG_ON(is_data && refs_to_drop !=
                               extent_data_ref_count(root, path, iref));
                        }
                }
  
-               ret = pin_down_bytes(trans, root, path, bytenr,
-                                    num_bytes, is_data, 0, &must_clean);
-               if (ret > 0)
-                       mark_free = 1;
-               BUG_ON(ret < 0);
-               /*
-                * it is going to be very rare for someone to be waiting
-                * on the block we're freeing.  del_items might need to
-                * schedule, so rather than get fancy, just force it
-                * to blocking here
-                */
-               if (must_clean)
-                       btrfs_set_lock_blocking(must_clean);
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
                BUG_ON(ret);
                btrfs_release_path(extent_root, path);
  
-               if (must_clean) {
-                       clean_tree_block(NULL, root, must_clean);
-                       btrfs_tree_unlock(must_clean);
-                       free_extent_buffer(must_clean);
-               }
                if (is_data) {
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
                        BUG_ON(ret);
                             (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
                }
  
-               ret = update_block_group(trans, root, bytenr, num_bytes, 0,
-                                        mark_free);
+               ret = update_block_group(trans, root, bytenr, num_bytes, 0);
                BUG_ON(ret);
        }
        btrfs_free_path(path);
  }
  
  /*
-  * when we free an extent, it is possible (and likely) that we free the last
+  * when we free a block, it is possible (and likely) that we free the last
   * delayed ref for that extent as well.  This searches the delayed ref tree for
   * a given extent, and if there are no other delayed refs to be processed, it
   * removes it from the tree.
@@@ -3972,7 -4274,7 +4274,7 @@@ static noinline int check_ref_cleanup(s
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct rb_node *node;
-       int ret;
+       int ret = 0;
  
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
        list_del_init(&head->cluster);
        spin_unlock(&delayed_refs->lock);
  
-       ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
-                                 &head->node, head->extent_op,
-                                 head->must_insert_reserved);
-       BUG_ON(ret);
+       BUG_ON(head->extent_op);
+       if (head->must_insert_reserved)
+               ret = 1;
+       mutex_unlock(&head->mutex);
        btrfs_put_delayed_ref(&head->node);
-       return 0;
+       return ret;
  out:
        spin_unlock(&delayed_refs->lock);
        return 0;
  }
  
+ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root,
+                          struct extent_buffer *buf,
+                          u64 parent, int last_ref)
+ {
+       struct btrfs_block_rsv *block_rsv;
+       struct btrfs_block_group_cache *cache = NULL;
+       int ret;
+       if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+               ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
+                                               parent, root->root_key.objectid,
+                                               btrfs_header_level(buf),
+                                               BTRFS_DROP_DELAYED_REF, NULL);
+               BUG_ON(ret);
+       }
+       if (!last_ref)
+               return;
+       block_rsv = get_block_rsv(trans, root);
+       cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+       BUG_ON(block_rsv->space_info != cache->space_info);
+       if (btrfs_header_generation(buf) == trans->transid) {
+               if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+                       ret = check_ref_cleanup(trans, root, buf->start);
+                       if (!ret)
+                               goto pin;
+               }
+               if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+                       pin_down_extent(root, cache, buf->start, buf->len, 1);
+                       goto pin;
+               }
+               WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+               btrfs_add_free_space(cache, buf->start, buf->len);
+               ret = update_reserved_bytes(cache, buf->len, 0, 0);
+               if (ret == -EAGAIN) {
+                       /* block group became read-only */
+                       update_reserved_bytes(cache, buf->len, 0, 1);
+                       goto out;
+               }
+               ret = 1;
+               spin_lock(&block_rsv->lock);
+               if (block_rsv->reserved < block_rsv->size) {
+                       block_rsv->reserved += buf->len;
+                       ret = 0;
+               }
+               spin_unlock(&block_rsv->lock);
+               if (ret) {
+                       spin_lock(&cache->space_info->lock);
+                       cache->space_info->bytes_reserved -= buf->len;
+                       spin_unlock(&cache->space_info->lock);
+               }
+               goto out;
+       }
+ pin:
+       if (block_rsv->durable && !cache->ro) {
+               ret = 0;
+               spin_lock(&cache->lock);
+               if (!cache->ro) {
+                       cache->reserved_pinned += buf->len;
+                       ret = 1;
+               }
+               spin_unlock(&cache->lock);
+               if (ret) {
+                       spin_lock(&block_rsv->lock);
+                       block_rsv->freed[trans->transid & 0x1] += buf->len;
+                       spin_unlock(&block_rsv->lock);
+               }
+       }
+ out:
+       btrfs_put_block_group(cache);
+ }
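
  /*
   * Summary sketch of btrfs_free_tree_block() above: a block born and
   * freed within the same transaction (and never written) goes straight
   * back to the free-space cache, with its reservation refilled into the
   * block rsv when there is room; anything that may still be referenced
   * on disk is pinned instead, and a durable rsv remembers those bytes
   * in freed[transid & 0x1] so btrfs_finish_extent_commit() can reclaim
   * them after the commit.
   */
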
  int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent,
                                        parent, root_objectid, (int)owner,
                                        BTRFS_DROP_DELAYED_REF, NULL);
                BUG_ON(ret);
-               ret = check_ref_cleanup(trans, root, bytenr);
-               BUG_ON(ret);
        } else {
                ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
                                        parent, root_objectid, owner,
        return ret;
  }
  
- int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root,
-                         u64 bytenr, u32 blocksize,
-                         u64 parent, u64 root_objectid, int level)
- {
-       u64 used;
-       spin_lock(&root->node_lock);
-       used = btrfs_root_used(&root->root_item) - blocksize;
-       btrfs_set_root_used(&root->root_item, used);
-       spin_unlock(&root->node_lock);
-       return btrfs_free_extent(trans, root, bytenr, blocksize,
-                                parent, root_objectid, level, 0);
- }
  static u64 stripe_align(struct btrfs_root *root, u64 val)
  {
        u64 mask = ((u64)root->stripesize - 1);
@@@ -4134,6 -4501,22 +4501,22 @@@ wait_block_group_cache_done(struct btrf
        return 0;
  }
  
+ static int get_block_group_index(struct btrfs_block_group_cache *cache)
+ {
+       int index;
+       if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+               index = 0;
+       else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+               index = 1;
+       else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+               index = 2;
+       else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+               index = 3;
+       else
+               index = 4;
+       return index;
+ }
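
  /*
   * The index above picks one of the per-RAID-type lists in
   * space_info->block_groups[]: 0 = RAID10, 1 = RAID1, 2 = DUP,
   * 3 = RAID0, 4 = single, BTRFS_NR_RAID_TYPES in total.
   * find_free_extent() below walks the lists in that order, so the
   * more redundant profiles are tried first.
   */
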
  enum btrfs_loop_type {
        LOOP_FIND_IDEAL = 0,
        LOOP_CACHING_NOWAIT = 1,
@@@ -4155,7 -4538,6 +4538,6 @@@ static noinline int find_free_extent(st
                                     u64 num_bytes, u64 empty_size,
                                     u64 search_start, u64 search_end,
                                     u64 hint_byte, struct btrfs_key *ins,
-                                    u64 exclude_start, u64 exclude_nr,
                                     int data)
  {
        int ret = 0;
        struct btrfs_space_info *space_info;
        int last_ptr_loop = 0;
        int loop = 0;
+       int index = 0;
        bool found_uncached_bg = false;
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
@@@ -4237,6 -4620,7 +4620,7 @@@ ideal_cache
                                btrfs_put_block_group(block_group);
                                up_read(&space_info->groups_sem);
                        } else {
+                               index = get_block_group_index(block_group);
                                goto have_block_group;
                        }
                } else if (block_group) {
        }
  search:
        down_read(&space_info->groups_sem);
-       list_for_each_entry(block_group, &space_info->block_groups, list) {
+       list_for_each_entry(block_group, &space_info->block_groups[index],
+                           list) {
                u64 offset;
                int cached;
  
@@@ -4436,23 -4821,22 +4821,22 @@@ checks
                        goto loop;
                }
  
-               if (exclude_nr > 0 &&
-                   (search_start + num_bytes > exclude_start &&
-                    search_start < exclude_start + exclude_nr)) {
-                       search_start = exclude_start + exclude_nr;
+               ins->objectid = search_start;
+               ins->offset = num_bytes;
+               if (offset < search_start)
+                       btrfs_add_free_space(block_group, offset,
+                                            search_start - offset);
+               BUG_ON(offset > search_start);
  
+               ret = update_reserved_bytes(block_group, num_bytes, 1,
+                                           (data & BTRFS_BLOCK_GROUP_DATA));
+               if (ret == -EAGAIN) {
                        btrfs_add_free_space(block_group, offset, num_bytes);
-                       /*
-                        * if search_start is still in this block group
-                        * then we just re-search this block group
-                        */
-                       if (search_start >= block_group->key.objectid &&
-                           search_start < (block_group->key.objectid +
-                                           block_group->key.offset))
-                               goto have_block_group;
                        goto loop;
                }
  
+               /* we are all good, let's return */
                ins->objectid = search_start;
                ins->offset = num_bytes;
  
                        btrfs_add_free_space(block_group, offset,
                                             search_start - offset);
                BUG_ON(offset > search_start);
-               update_reserved_extents(block_group, num_bytes, 1);
-               /* we are all good, lets return */
                break;
  loop:
                failed_cluster_refill = false;
                failed_alloc = false;
+               BUG_ON(index != get_block_group_index(block_group));
                btrfs_put_block_group(block_group);
        }
        up_read(&space_info->groups_sem);
  
+       if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+               goto search;
        /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
         *                      for them to make caching progress.  Also
         *                      determine the best possible bg to cache
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
            (found_uncached_bg || empty_size || empty_cluster ||
             allowed_chunk_alloc)) {
+               index = 0;
                if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
                        found_uncached_bg = false;
                        loop++;
@@@ -4567,31 -4952,30 +4952,30 @@@ static void dump_space_info(struct btrf
                            int dump_block_groups)
  {
        struct btrfs_block_group_cache *cache;
+       int index = 0;
  
        spin_lock(&info->lock);
        printk(KERN_INFO "space_info has %llu free, is %sfull\n",
               (unsigned long long)(info->total_bytes - info->bytes_used -
                                    info->bytes_pinned - info->bytes_reserved -
-                                   info->bytes_super),
+                                   info->bytes_readonly),
               (info->full) ? "" : "not ");
-       printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
-              " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
-              "\n",
+       printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
+              "reserved=%llu, may_use=%llu, readonly=%llu\n",
               (unsigned long long)info->total_bytes,
+              (unsigned long long)info->bytes_used,
               (unsigned long long)info->bytes_pinned,
-              (unsigned long long)info->bytes_delalloc,
+              (unsigned long long)info->bytes_reserved,
               (unsigned long long)info->bytes_may_use,
-              (unsigned long long)info->bytes_used,
-              (unsigned long long)info->bytes_root,
-              (unsigned long long)info->bytes_super,
-              (unsigned long long)info->bytes_reserved);
+              (unsigned long long)info->bytes_readonly);
        spin_unlock(&info->lock);
  
        if (!dump_block_groups)
                return;
  
        down_read(&info->groups_sem);
-       list_for_each_entry(cache, &info->block_groups, list) {
+ again:
+       list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
                printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
                       "%llu pinned %llu reserved\n",
                btrfs_dump_free_space(cache, bytes);
                spin_unlock(&cache->lock);
        }
+       if (++index < BTRFS_NR_RAID_TYPES)
+               goto again;
        up_read(&info->groups_sem);
  }
  
@@@ -4628,9 -5014,8 +5014,8 @@@ again
  
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(trans, root, num_bytes, empty_size,
-                              search_start, search_end, hint_byte, ins,
-                              trans->alloc_exclude_start,
-                              trans->alloc_exclude_nr, data);
+                              search_start, search_end, hint_byte,
+                              ins, data);
  
        if (ret == -ENOSPC && num_bytes > min_alloc_size) {
                num_bytes = num_bytes >> 1;
@@@ -4668,7 -5053,7 +5053,7 @@@ int btrfs_free_reserved_extent(struct b
        ret = btrfs_discard_extent(root, start, len);
  
        btrfs_add_free_space(cache, start, len);
-       update_reserved_extents(cache, len, 0);
+       update_reserved_bytes(cache, len, 0, 1);
        btrfs_put_block_group(cache);
  
        return ret;
@@@ -4731,8 -5116,7 +5116,7 @@@ static int alloc_reserved_file_extent(s
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
  
-       ret = update_block_group(trans, root, ins->objectid, ins->offset,
-                                1, 0);
+       ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
        if (ret) {
                printk(KERN_ERR "btrfs update block group failed for %llu "
                       "%llu\n", (unsigned long long)ins->objectid,
@@@ -4792,8 -5176,7 +5176,7 @@@ static int alloc_reserved_tree_block(st
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
  
-       ret = update_block_group(trans, root, ins->objectid, ins->offset,
-                                1, 0);
+       ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
        if (ret) {
                printk(KERN_ERR "btrfs update block group failed for %llu "
                       "%llu\n", (unsigned long long)ins->objectid,
@@@ -4869,73 -5252,14 +5252,14 @@@ int btrfs_alloc_logged_file_extent(stru
                put_caching_control(caching_ctl);
        }
  
-       update_reserved_extents(block_group, ins->offset, 1);
+       ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
+       BUG_ON(ret);
        btrfs_put_block_group(block_group);
        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
                                         0, owner, offset, ins, 1);
        return ret;
  }
  
- /*
-  * finds a free extent and does all the dirty work required for allocation
-  * returns the key for the extent through ins, and a tree buffer for
-  * the first block of the extent through buf.
-  *
-  * returns 0 if everything worked, non-zero otherwise.
-  */
- static int alloc_tree_block(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root,
-                           u64 num_bytes, u64 parent, u64 root_objectid,
-                           struct btrfs_disk_key *key, int level,
-                           u64 empty_size, u64 hint_byte, u64 search_end,
-                           struct btrfs_key *ins)
- {
-       int ret;
-       u64 flags = 0;
-       ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
-                                  empty_size, hint_byte, search_end,
-                                  ins, 0);
-       if (ret)
-               return ret;
-       if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
-               if (parent == 0)
-                       parent = ins->objectid;
-               flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
-       } else
-               BUG_ON(parent > 0);
-       if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-               struct btrfs_delayed_extent_op *extent_op;
-               extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-               BUG_ON(!extent_op);
-               if (key)
-                       memcpy(&extent_op->key, key, sizeof(extent_op->key));
-               else
-                       memset(&extent_op->key, 0, sizeof(extent_op->key));
-               extent_op->flags_to_set = flags;
-               extent_op->update_key = 1;
-               extent_op->update_flags = 1;
-               extent_op->is_data = 0;
-               ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
-                                       ins->offset, parent, root_objectid,
-                                       level, BTRFS_ADD_DELAYED_EXTENT,
-                                       extent_op);
-               BUG_ON(ret);
-       }
-       if (root_objectid == root->root_key.objectid) {
-               u64 used;
-               spin_lock(&root->node_lock);
-               used = btrfs_root_used(&root->root_item) + num_bytes;
-               btrfs_set_root_used(&root->root_item, used);
-               spin_unlock(&root->node_lock);
-       }
-       return ret;
- }
  struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize,
        return buf;
  }
  
+ static struct btrfs_block_rsv *
+ use_block_rsv(struct btrfs_trans_handle *trans,
+             struct btrfs_root *root, u32 blocksize)
+ {
+       struct btrfs_block_rsv *block_rsv;
+       int ret;
+       block_rsv = get_block_rsv(trans, root);
+       if (block_rsv->size == 0) {
+               ret = reserve_metadata_bytes(block_rsv, blocksize);
+               if (ret)
+                       return ERR_PTR(ret);
+               return block_rsv;
+       }
+       ret = block_rsv_use_bytes(block_rsv, blocksize);
+       if (!ret)
+               return block_rsv;
+       WARN_ON(1);
+       printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
+               block_rsv->size, block_rsv->reserved,
+               block_rsv->freed[0], block_rsv->freed[1]);
+       return ERR_PTR(-ENOSPC);
+ }
+
+ static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+ {
+       block_rsv_add_bytes(block_rsv, blocksize, 0);
+       block_rsv_release_bytes(block_rsv, NULL, 0);
+ }
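
  /*
   * Intended pairing for use_block_rsv()/unuse_block_rsv(), as
   * btrfs_alloc_free_block() below shows: charge the rsv before
   * allocating and hand the bytes back if the allocation fails:
   *
   *      block_rsv = use_block_rsv(trans, root, blocksize);
   *      if (IS_ERR(block_rsv))
   *              return ERR_CAST(block_rsv);
   *      ret = btrfs_reserve_extent(...);
   *      if (ret)
   *              unuse_block_rsv(block_rsv, blocksize);
   */
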
  /*
-  * helper function to allocate a block for a given tree
+  * finds a free extent and does all the dirty work required for allocation.
+  * returns the key for the extent through ins, and a tree buffer for
+  * the first block of the extent through buf.
+  *
   * returns the tree buffer or NULL.
   */
  struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        u64 hint, u64 empty_size)
  {
        struct btrfs_key ins;
-       int ret;
+       struct btrfs_block_rsv *block_rsv;
        struct extent_buffer *buf;
+       u64 flags = 0;
+       int ret;
  
-       ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
-                              key, level, empty_size, hint, (u64)-1, &ins);
+       block_rsv = use_block_rsv(trans, root, blocksize);
+       if (IS_ERR(block_rsv))
+               return ERR_CAST(block_rsv);
+       ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
+                                  empty_size, hint, (u64)-1, &ins, 0);
        if (ret) {
-               BUG_ON(ret > 0);
+               unuse_block_rsv(block_rsv, blocksize);
                return ERR_PTR(ret);
        }
  
        buf = btrfs_init_new_buffer(trans, root, ins.objectid,
                                    blocksize, level);
+       BUG_ON(IS_ERR(buf));
+       if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+               if (parent == 0)
+                       parent = ins.objectid;
+               flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+       } else
+               BUG_ON(parent > 0);
+       if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+               struct btrfs_delayed_extent_op *extent_op;
+               extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+               BUG_ON(!extent_op);
+               if (key)
+                       memcpy(&extent_op->key, key, sizeof(extent_op->key));
+               else
+                       memset(&extent_op->key, 0, sizeof(extent_op->key));
+               extent_op->flags_to_set = flags;
+               extent_op->update_key = 1;
+               extent_op->update_flags = 1;
+               extent_op->is_data = 0;
+               ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+                                       ins.offset, parent, root_objectid,
+                                       level, BTRFS_ADD_DELAYED_EXTENT,
+                                       extent_op);
+               BUG_ON(ret);
+       }
        return buf;
  }
  
@@@ -5321,7 -5717,7 +5717,7 @@@ static noinline int walk_up_proc(struc
                                 struct btrfs_path *path,
                                 struct walk_control *wc)
  {
-       int ret = 0;
+       int ret;
        int level = wc->level;
        struct extent_buffer *eb = path->nodes[level];
        u64 parent = 0;
                               btrfs_header_owner(path->nodes[level + 1]));
        }
  
-       ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
-                               root->root_key.objectid, level, 0);
-       BUG_ON(ret);
+       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
  out:
        wc->refs[level] = 0;
        wc->flags[level] = 0;
-       return ret;
+       return 0;
  }
  
  static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@@ -5483,7 -5877,8 +5877,8 @@@ static noinline int walk_up_tree(struc
   * also make sure backrefs for the shared block and all lower level
   * blocks are properly updated.
   */
- int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
+ int btrfs_drop_snapshot(struct btrfs_root *root,
+                       struct btrfs_block_rsv *block_rsv, int update_ref)
  {
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans;
        wc = kzalloc(sizeof(*wc), GFP_NOFS);
        BUG_ON(!wc);
  
-       trans = btrfs_start_transaction(tree_root, 1);
+       trans = btrfs_start_transaction(tree_root, 0);
+       if (block_rsv)
+               trans->block_rsv = block_rsv;
  
        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
                level = btrfs_header_level(root->node);
                }
  
                BUG_ON(wc->level == 0);
-               if (trans->transaction->in_commit ||
-                   trans->transaction->delayed_refs.flushing) {
+               if (btrfs_should_end_transaction(trans, tree_root)) {
                        ret = btrfs_update_root(trans, tree_root,
                                                &root->root_key,
                                                root_item);
                        BUG_ON(ret);
  
-                       btrfs_end_transaction(trans, tree_root);
-                       trans = btrfs_start_transaction(tree_root, 1);
-               } else {
-                       unsigned long update;
-                       update = trans->delayed_ref_updates;
-                       trans->delayed_ref_updates = 0;
-                       if (update)
-                               btrfs_run_delayed_refs(trans, tree_root,
-                                                      update);
+                       btrfs_end_transaction_throttle(trans, tree_root);
+                       trans = btrfs_start_transaction(tree_root, 0);
+                       if (block_rsv)
+                               trans->block_rsv = block_rsv;
                }
        }
        btrfs_release_path(root, path);
                kfree(root);
        }
  out:
-       btrfs_end_transaction(trans, tree_root);
+       btrfs_end_transaction_throttle(trans, tree_root);
        kfree(wc);
        btrfs_free_path(path);
        return err;
@@@ -7228,48 -7619,80 +7619,80 @@@ static u64 update_block_group_flags(str
        return flags;
  }
  
- static int __alloc_chunk_for_shrink(struct btrfs_root *root,
-                    struct btrfs_block_group_cache *shrink_block_group,
-                    int force)
+ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
  {
-       struct btrfs_trans_handle *trans;
-       u64 new_alloc_flags;
-       u64 calc;
+       struct btrfs_space_info *sinfo = cache->space_info;
+       u64 num_bytes;
+       int ret = -ENOSPC;
  
-       spin_lock(&shrink_block_group->lock);
-       if (btrfs_block_group_used(&shrink_block_group->item) +
-           shrink_block_group->reserved > 0) {
-               spin_unlock(&shrink_block_group->lock);
+       if (cache->ro)
+               return 0;
  
-               trans = btrfs_start_transaction(root, 1);
-               spin_lock(&shrink_block_group->lock);
+       spin_lock(&sinfo->lock);
+       spin_lock(&cache->lock);
+       num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+                   cache->bytes_super - btrfs_block_group_used(&cache->item);
+       if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
+           sinfo->bytes_may_use + sinfo->bytes_readonly +
+           cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+               sinfo->bytes_readonly += num_bytes;
+               sinfo->bytes_reserved += cache->reserved_pinned;
+               cache->reserved_pinned = 0;
+               cache->ro = 1;
+               ret = 0;
+       }
+       spin_unlock(&cache->lock);
+       spin_unlock(&sinfo->lock);
+       return ret;
+ }
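
  /*
   * Worked sketch of the check above, with assumed numbers: a 1GiB
   * block group with 600MiB used and nothing pinned or reserved has
   * num_bytes = ~424MiB of free space that would become unusable; the
   * group may go read-only only if the rest of the space_info can
   * absorb that loss without exceeding total_bytes.
   */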
  
-               new_alloc_flags = update_block_group_flags(root,
-                                                  shrink_block_group->flags);
-               if (new_alloc_flags != shrink_block_group->flags) {
-                       calc =
-                            btrfs_block_group_used(&shrink_block_group->item);
-               } else {
-                       calc = shrink_block_group->key.offset;
-               }
-               spin_unlock(&shrink_block_group->lock);
+ int btrfs_set_block_group_ro(struct btrfs_root *root,
+                            struct btrfs_block_group_cache *cache)
  
-               do_chunk_alloc(trans, root->fs_info->extent_root,
-                              calc + 2 * 1024 * 1024, new_alloc_flags, force);
+ {
+       struct btrfs_trans_handle *trans;
+       u64 alloc_flags;
+       int ret;
  
-               btrfs_end_transaction(trans, root);
-       } else
-               spin_unlock(&shrink_block_group->lock);
-       return 0;
- }
+       BUG_ON(cache->ro);
  
+       trans = btrfs_join_transaction(root, 1);
+       BUG_ON(IS_ERR(trans));
  
- int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
-                                        struct btrfs_block_group_cache *group)
+       alloc_flags = update_block_group_flags(root, cache->flags);
+       if (alloc_flags != cache->flags)
+               do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+       ret = set_block_group_ro(cache);
+       if (!ret)
+               goto out;
+       alloc_flags = get_alloc_profile(root, cache->space_info->flags);
+       ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+       if (ret < 0)
+               goto out;
+       ret = set_block_group_ro(cache);
+ out:
+       btrfs_end_transaction(trans, root);
+       return ret;
+ }
  
+ int btrfs_set_block_group_rw(struct btrfs_root *root,
+                             struct btrfs_block_group_cache *cache)
  {
-       __alloc_chunk_for_shrink(root, group, 1);
-       set_block_group_readonly(group);
+       struct btrfs_space_info *sinfo = cache->space_info;
+       u64 num_bytes;
+       BUG_ON(!cache->ro);
+       spin_lock(&sinfo->lock);
+       spin_lock(&cache->lock);
+       num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+                   cache->bytes_super - btrfs_block_group_used(&cache->item);
+       sinfo->bytes_readonly -= num_bytes;
+       cache->ro = 0;
+       spin_unlock(&cache->lock);
+       spin_unlock(&sinfo->lock);
        return 0;
  }
  
@@@ -7436,17 -7859,33 +7859,33 @@@ int btrfs_free_block_groups(struct btrf
         */
        synchronize_rcu();
  
+       release_global_block_rsv(info);
        while(!list_empty(&info->space_info)) {
                space_info = list_entry(info->space_info.next,
                                        struct btrfs_space_info,
                                        list);
+               if (space_info->bytes_pinned > 0 ||
+                   space_info->bytes_reserved > 0) {
+                       WARN_ON(1);
+                       dump_space_info(space_info, 0, 0);
+               }
                list_del(&space_info->list);
                kfree(space_info);
        }
        return 0;
  }
  
+ static void __link_block_group(struct btrfs_space_info *space_info,
+                              struct btrfs_block_group_cache *cache)
+ {
+       int index = get_block_group_index(cache);
+       down_write(&space_info->groups_sem);
+       list_add_tail(&cache->list, &space_info->block_groups[index]);
+       up_write(&space_info->groups_sem);
+ }
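
  /*
   * Note: the group lands on the per-RAID-type list chosen by
   * get_block_group_index(), under the groups_sem write lock, which is
   * the same list find_free_extent() and dump_space_info() iterate.
   */
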
  int btrfs_read_block_groups(struct btrfs_root *root)
  {
        struct btrfs_path *path;
  
        while (1) {
                ret = find_first_block_group(root, path, &key);
-               if (ret > 0) {
-                       ret = 0;
-                       goto error;
-               }
+               if (ret > 0)
+                       break;
                if (ret != 0)
                        goto error;
  
                cache = kzalloc(sizeof(*cache), GFP_NOFS);
                if (!cache) {
                        ret = -ENOMEM;
-                       break;
+                       goto error;
                }
  
                atomic_set(&cache->count, 1);
                BUG_ON(ret);
                cache->space_info = space_info;
                spin_lock(&cache->space_info->lock);
-               cache->space_info->bytes_super += cache->bytes_super;
+               cache->space_info->bytes_readonly += cache->bytes_super;
                spin_unlock(&cache->space_info->lock);
  
-               down_write(&space_info->groups_sem);
-               list_add_tail(&cache->list, &space_info->block_groups);
-               up_write(&space_info->groups_sem);
+               __link_block_group(space_info, cache);
  
                ret = btrfs_add_block_group_cache(root->fs_info, cache);
                BUG_ON(ret);
  
                set_avail_alloc_bits(root->fs_info, cache->flags);
                if (btrfs_chunk_readonly(root, cache->key.objectid))
-                       set_block_group_readonly(cache);
+                       set_block_group_ro(cache);
+       }
+       list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
+               if (!(get_alloc_profile(root, space_info->flags) &
+                     (BTRFS_BLOCK_GROUP_RAID10 |
+                      BTRFS_BLOCK_GROUP_RAID1 |
+                      BTRFS_BLOCK_GROUP_DUP)))
+                       continue;
+               /*
+                * avoid allocating from un-mirrored block groups if there are
+                * mirrored block groups.
+                */
+               list_for_each_entry(cache, &space_info->block_groups[3], list)
+                       set_block_group_ro(cache);
+               list_for_each_entry(cache, &space_info->block_groups[4], list)
+                       set_block_group_ro(cache);
        }
+       init_global_block_rsv(info);
        ret = 0;
  error:
        btrfs_free_path(path);
@@@ -7611,12 -8064,10 +8064,10 @@@ int btrfs_make_block_group(struct btrfs
        BUG_ON(ret);
  
        spin_lock(&cache->space_info->lock);
-       cache->space_info->bytes_super += cache->bytes_super;
+       cache->space_info->bytes_readonly += cache->bytes_super;
        spin_unlock(&cache->space_info->lock);
  
-       down_write(&cache->space_info->groups_sem);
-       list_add_tail(&cache->list, &cache->space_info->block_groups);
-       up_write(&cache->space_info->groups_sem);
+       __link_block_group(cache->space_info, cache);
  
        ret = btrfs_add_block_group_cache(root->fs_info, cache);
        BUG_ON(ret);
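
The read-only loop above hard-codes list indices 3 and 4 for the un-mirrored profiles. A sketch of the mapping that get_block_group_index(), used by __link_block_group(), presumably applies; the order (RAID10, RAID1, DUP, RAID0, single) is inferred from those indices, and the flag values follow the on-disk constants of this era, so treat both as assumptions:

    #include <stdio.h>

    #define BTRFS_BLOCK_GROUP_RAID0   (1ULL << 3)  /* values as assumed */
    #define BTRFS_BLOCK_GROUP_RAID1   (1ULL << 4)
    #define BTRFS_BLOCK_GROUP_DUP     (1ULL << 5)
    #define BTRFS_BLOCK_GROUP_RAID10  (1ULL << 6)

    static int block_group_index(unsigned long long flags)
    {
            if (flags & BTRFS_BLOCK_GROUP_RAID10)
                    return 0;
            if (flags & BTRFS_BLOCK_GROUP_RAID1)
                    return 1;
            if (flags & BTRFS_BLOCK_GROUP_DUP)
                    return 2;
            if (flags & BTRFS_BLOCK_GROUP_RAID0)
                    return 3;       /* un-mirrored: forced read-only above */
            return 4;               /* single: likewise un-mirrored */
    }

    int main(void)
    {
            printf("RAID0 -> %d, single -> %d\n",
                   block_group_index(BTRFS_BLOCK_GROUP_RAID0),
                   block_group_index(0));
            return 0;
    }
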
diff --combined fs/btrfs/inode.c
index d601629b85d171943ee7fa1421b630d7efc7fe35,2551b80183998aeebaffab5a72b477b6212921be..fa6ccc1bfe2a9a73e2de97ed04cac8708efa99f9
@@@ -252,6 -252,7 +252,7 @@@ static noinline int cow_file_range_inli
                                   inline_len, compressed_size,
                                   compressed_pages);
        BUG_ON(ret);
+       btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
        return 0;
  }
@@@ -414,6 -415,7 +415,7 @@@ again
                trans = btrfs_join_transaction(root, 1);
                BUG_ON(!trans);
                btrfs_set_trans_block_group(trans, inode);
+               trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
                /* let's try to make an inline extent */
                if (ret || total_in < (actual_end - start)) {
                             start, end, NULL,
                             EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
                             EXTENT_CLEAR_DELALLOC |
-                            EXTENT_CLEAR_ACCOUNTING |
                             EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
  
                        btrfs_end_transaction(trans, root);
@@@ -697,6 -698,38 +698,38 @@@ retry
        return 0;
  }
  
+ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+                                     u64 num_bytes)
+ {
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct extent_map *em;
+       u64 alloc_hint = 0;
+       read_lock(&em_tree->lock);
+       em = search_extent_mapping(em_tree, start, num_bytes);
+       if (em) {
+               /*
+                * if block start isn't an actual block number then find the
+                * first block in this inode and use that as a hint.  If that
+                * block is also bogus then just don't worry about it.
+                */
+               if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+                       free_extent_map(em);
+                       em = search_extent_mapping(em_tree, 0, 0);
+                       if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+                               alloc_hint = em->block_start;
+                       if (em)
+                               free_extent_map(em);
+               } else {
+                       alloc_hint = em->block_start;
+                       free_extent_map(em);
+               }
+       }
+       read_unlock(&em_tree->lock);
+       return alloc_hint;
+ }
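
In other words, get_extent_allocation_hint() trusts em->block_start only when it is a real block number, falling back to the inode's first mapping. A compact userspace model of that decision; types are illustrative stand-ins, and the sentinel is assumed to be the kernel's EXTENT_MAP_LAST_BYTE ((u64)-4):

    #include <stdint.h>
    #include <stdio.h>

    #define LAST_BYTE ((uint64_t)-4)        /* models EXTENT_MAP_LAST_BYTE */

    struct mapping { uint64_t block_start; };

    static uint64_t alloc_hint(const struct mapping *covering,
                               const struct mapping *first)
    {
            if (covering && covering->block_start < LAST_BYTE)
                    return covering->block_start;   /* real block number */
            if (first && first->block_start < LAST_BYTE)
                    return first->block_start;      /* first real mapping */
            return 0;                               /* no usable hint */
    }

    int main(void)
    {
            struct mapping hole = { LAST_BYTE }, data = { 4096 };
            printf("hint: %llu\n",
                   (unsigned long long)alloc_hint(&hole, &data));
            return 0;
    }
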
  /*
   * when extent_io.c finds a delayed allocation range in the file,
   * the callbacks end up in this code.  The basic idea is to
@@@ -734,6 -767,7 +767,7 @@@ static noinline int cow_file_range(stru
        trans = btrfs_join_transaction(root, 1);
        BUG_ON(!trans);
        btrfs_set_trans_block_group(trans, inode);
+       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
        actual_end = min_t(u64, isize, end + 1);
  
                                     EXTENT_CLEAR_UNLOCK_PAGE |
                                     EXTENT_CLEAR_UNLOCK |
                                     EXTENT_CLEAR_DELALLOC |
-                                    EXTENT_CLEAR_ACCOUNTING |
                                     EXTENT_CLEAR_DIRTY |
                                     EXTENT_SET_WRITEBACK |
                                     EXTENT_END_WRITEBACK);
        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(&root->fs_info->super_copy));
  
-       read_lock(&BTRFS_I(inode)->extent_tree.lock);
-       em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
-                                  start, num_bytes);
-       if (em) {
-               /*
-                * if block start isn't an actual block number then find the
-                * first block in this inode and use that as a hint.  If that
-                * block is also bogus then just don't worry about it.
-                */
-               if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-                       free_extent_map(em);
-                       em = search_extent_mapping(em_tree, 0, 0);
-                       if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
-                               alloc_hint = em->block_start;
-                       if (em)
-                               free_extent_map(em);
-               } else {
-                       alloc_hint = em->block_start;
-                       free_extent_map(em);
-               }
-       }
-       read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+       alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
  
        while (disk_num_bytes > 0) {
@@@ -1174,6 -1185,13 +1185,13 @@@ out_check
                                               num_bytes, num_bytes, type);
                BUG_ON(ret);
  
+               if (root->root_key.objectid ==
+                   BTRFS_DATA_RELOC_TREE_OBJECTID) {
+                       ret = btrfs_reloc_clone_csums(inode, cur_offset,
+                                                     num_bytes);
+                       BUG_ON(ret);
+               }
                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                cur_offset, cur_offset + num_bytes - 1,
                                locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@@ -1226,15 -1244,13 +1244,13 @@@ static int run_delalloc_range(struct in
  }
  
  static int btrfs_split_extent_hook(struct inode *inode,
-                                   struct extent_state *orig, u64 split)
+                                  struct extent_state *orig, u64 split)
  {
+       /* not delalloc, ignore it */
        if (!(orig->state & EXTENT_DELALLOC))
                return 0;
  
-       spin_lock(&BTRFS_I(inode)->accounting_lock);
-       BTRFS_I(inode)->outstanding_extents++;
-       spin_unlock(&BTRFS_I(inode)->accounting_lock);
+       atomic_inc(&BTRFS_I(inode)->outstanding_extents);
        return 0;
  }
  
@@@ -1252,10 -1268,7 +1268,7 @@@ static int btrfs_merge_extent_hook(stru
        if (!(other->state & EXTENT_DELALLOC))
                return 0;
  
-       spin_lock(&BTRFS_I(inode)->accounting_lock);
-       BTRFS_I(inode)->outstanding_extents--;
-       spin_unlock(&BTRFS_I(inode)->accounting_lock);
+       atomic_dec(&BTRFS_I(inode)->outstanding_extents);
        return 0;
  }
  
   * bytes in this file, and to maintain the list of inodes that
   * have pending delalloc work to be done.
   */
- static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
-                      unsigned long old, unsigned long bits)
+ static int btrfs_set_bit_hook(struct inode *inode,
+                             struct extent_state *state, int *bits)
  {
  
        /*
         * but in this case, we are only testing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
-       if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+       if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
+               u64 len = state->end + 1 - state->start;
  
-               spin_lock(&BTRFS_I(inode)->accounting_lock);
-               BTRFS_I(inode)->outstanding_extents++;
-               spin_unlock(&BTRFS_I(inode)->accounting_lock);
-               btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+               if (*bits & EXTENT_FIRST_DELALLOC)
+                       *bits &= ~EXTENT_FIRST_DELALLOC;
+               else
+                       atomic_inc(&BTRFS_I(inode)->outstanding_extents);
  
                spin_lock(&root->fs_info->delalloc_lock);
-               BTRFS_I(inode)->delalloc_bytes += end - start + 1;
-               root->fs_info->delalloc_bytes += end - start + 1;
+               BTRFS_I(inode)->delalloc_bytes += len;
+               root->fs_info->delalloc_bytes += len;
                if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
                        list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
                                      &root->fs_info->delalloc_inodes);
   * extent_io.c clear_bit_hook, see set_bit_hook for why
   */
  static int btrfs_clear_bit_hook(struct inode *inode,
-                               struct extent_state *state, unsigned long bits)
+                               struct extent_state *state, int *bits)
  {
        /*
         * set_bit and clear_bit hooks normally require _irqsave/restore
         * but in this case, we are only testing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
-       if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+       if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
+               u64 len = state->end + 1 - state->start;
  
-               if (bits & EXTENT_DO_ACCOUNTING) {
-                       spin_lock(&BTRFS_I(inode)->accounting_lock);
-                       WARN_ON(!BTRFS_I(inode)->outstanding_extents);
-                       BTRFS_I(inode)->outstanding_extents--;
-                       spin_unlock(&BTRFS_I(inode)->accounting_lock);
-                       btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
-               }
+               if (*bits & EXTENT_FIRST_DELALLOC)
+                       *bits &= ~EXTENT_FIRST_DELALLOC;
+               else if (!(*bits & EXTENT_DO_ACCOUNTING))
+                       atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+               if (*bits & EXTENT_DO_ACCOUNTING)
+                       btrfs_delalloc_release_metadata(inode, len);
+               if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+                       btrfs_free_reserved_data_space(inode, len);
  
                spin_lock(&root->fs_info->delalloc_lock);
-               if (state->end - state->start + 1 >
-                   root->fs_info->delalloc_bytes) {
-                       printk(KERN_INFO "btrfs warning: delalloc account "
-                              "%llu %llu\n",
-                              (unsigned long long)
-                              state->end - state->start + 1,
-                              (unsigned long long)
-                              root->fs_info->delalloc_bytes);
-                       btrfs_delalloc_free_space(root, inode, (u64)-1);
-                       root->fs_info->delalloc_bytes = 0;
-                       BTRFS_I(inode)->delalloc_bytes = 0;
-               } else {
-                       btrfs_delalloc_free_space(root, inode,
-                                                 state->end -
-                                                 state->start + 1);
-                       root->fs_info->delalloc_bytes -= state->end -
-                               state->start + 1;
-                       BTRFS_I(inode)->delalloc_bytes -= state->end -
-                               state->start + 1;
-               }
+               root->fs_info->delalloc_bytes -= len;
+               BTRFS_I(inode)->delalloc_bytes -= len;
                if (BTRFS_I(inode)->delalloc_bytes == 0 &&
                    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
                        list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@@ -1384,7 -1385,8 +1385,8 @@@ int btrfs_merge_bio_hook(struct page *p
   */
  static int __btrfs_submit_bio_start(struct inode *inode, int rw,
                                    struct bio *bio, int mirror_num,
-                                   unsigned long bio_flags)
+                                   unsigned long bio_flags,
+                                   u64 bio_offset)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
   * are inserted into the btree
   */
  static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num, unsigned long bio_flags)
+                         int mirror_num, unsigned long bio_flags,
+                         u64 bio_offset)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        return btrfs_map_bio(root, rw, bio, mirror_num, 1);
   * on write, or reading the csums from the tree before a read
   */
  static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num, unsigned long bio_flags)
+                         int mirror_num, unsigned long bio_flags,
+                         u64 bio_offset)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
                /* we're doing a write, do the async checksumming */
                return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                   inode, rw, bio, mirror_num,
-                                  bio_flags, __btrfs_submit_bio_start,
+                                  bio_flags, bio_offset,
+                                  __btrfs_submit_bio_start,
                                   __btrfs_submit_bio_done);
        }
  
@@@ -1520,6 -1525,7 +1525,7 @@@ again
                goto again;
        }
  
+       BUG();
        btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
        ClearPageChecked(page);
  out:
@@@ -1650,7 -1656,7 +1656,7 @@@ static int insert_reserved_file_extent(
  static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_trans_handle *trans;
+       struct btrfs_trans_handle *trans = NULL;
        struct btrfs_ordered_extent *ordered_extent = NULL;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_state *cached_state = NULL;
                ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (!ret) {
                        trans = btrfs_join_transaction(root, 1);
+                       btrfs_set_trans_block_group(trans, inode);
+                       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                        ret = btrfs_update_inode(trans, root, inode);
                        BUG_ON(ret);
-                       btrfs_end_transaction(trans, root);
                }
                goto out;
        }
                         0, &cached_state, GFP_NOFS);
  
        trans = btrfs_join_transaction(root, 1);
+       btrfs_set_trans_block_group(trans, inode);
+       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
                compressed = 1;
        add_pending_csums(trans, inode, ordered_extent->file_offset,
                          &ordered_extent->list);
  
-       /* this also removes the ordered extent from the tree */
        btrfs_ordered_update_i_size(inode, 0, ordered_extent);
        ret = btrfs_update_inode(trans, root, inode);
        BUG_ON(ret);
-       btrfs_end_transaction(trans, root);
  out:
+       btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+       if (trans)
+               btrfs_end_transaction(trans, root);
        /* once for us */
        btrfs_put_ordered_extent(ordered_extent);
        /* once for the tree */
@@@ -1838,7 -1848,7 +1848,7 @@@ static int btrfs_io_failed_hook(struct 
  
        BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
                                                      failrec->last_mirror,
-                                                     failrec->bio_flags);
+                                                     failrec->bio_flags, 0);
        return 0;
  }
  
@@@ -1992,33 -2002,197 +2002,197 @@@ void btrfs_run_delayed_iputs(struct btr
        up_read(&root->fs_info->cleanup_work_sem);
  }
  
+ /*
+  * calculate extra metadata reservation when snapshotting a subvolume
+  * that contains orphan files.
+  */
+ void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+                               struct btrfs_pending_snapshot *pending,
+                               u64 *bytes_to_reserve)
+ {
+       struct btrfs_root *root;
+       struct btrfs_block_rsv *block_rsv;
+       u64 num_bytes;
+       int index;
+       root = pending->root;
+       if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+               return;
+       block_rsv = root->orphan_block_rsv;
+       /* orphan block reservation for the snapshot */
+       num_bytes = block_rsv->size;
+       /*
+        * after the snapshot is created, COWing tree blocks may use more
+        * space than they free. So we should make sure there is enough
+        * reserved space.
+        */
+       index = trans->transid & 0x1;
+       if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+               num_bytes += block_rsv->size -
+                            (block_rsv->reserved + block_rsv->freed[index]);
+       }
+       *bytes_to_reserve += num_bytes;
+ }
+
+ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+                               struct btrfs_pending_snapshot *pending)
+ {
+       struct btrfs_root *root = pending->root;
+       struct btrfs_root *snap = pending->snap;
+       struct btrfs_block_rsv *block_rsv;
+       u64 num_bytes;
+       int index;
+       int ret;
+       if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+               return;
+       /* refill source subvolume's orphan block reservation */
+       block_rsv = root->orphan_block_rsv;
+       index = trans->transid & 0x1;
+       if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+               num_bytes = block_rsv->size -
+                           (block_rsv->reserved + block_rsv->freed[index]);
+               ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+                                             root->orphan_block_rsv,
+                                             num_bytes);
+               BUG_ON(ret);
+       }
+       /* setup orphan block reservation for the snapshot */
+       block_rsv = btrfs_alloc_block_rsv(snap);
+       BUG_ON(!block_rsv);
+       btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
+       snap->orphan_block_rsv = block_rsv;
+       num_bytes = root->orphan_block_rsv->size;
+       ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+                                     block_rsv, num_bytes);
+       BUG_ON(ret);
+ #if 0
+       /* insert orphan item for the snapshot */
+       WARN_ON(!root->orphan_item_inserted);
+       ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+                                      snap->root_key.objectid);
+       BUG_ON(ret);
+       snap->orphan_item_inserted = 1;
+ #endif
+ }
+
+ enum btrfs_orphan_cleanup_state {
+       ORPHAN_CLEANUP_STARTED  = 1,
+       ORPHAN_CLEANUP_DONE     = 2,
+ };
+
+ /*
+  * This is called at transaction commit time. If there are no orphan
+  * files in the subvolume, it removes the orphan item and frees the
+  * block_rsv structure.
+  */
+ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root)
+ {
+       int ret;
+       if (!list_empty(&root->orphan_list) ||
+           root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+               return;
+       if (root->orphan_item_inserted &&
+           btrfs_root_refs(&root->root_item) > 0) {
+               ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+                                           root->root_key.objectid);
+               BUG_ON(ret);
+               root->orphan_item_inserted = 0;
+       }
+       if (root->orphan_block_rsv) {
+               WARN_ON(root->orphan_block_rsv->size > 0);
+               btrfs_free_block_rsv(root, root->orphan_block_rsv);
+               root->orphan_block_rsv = NULL;
+       }
+ }
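
The pre/post-snapshot helpers above both compute the same shortfall: how far reserved plus freed[transid & 1] falls below the reservation's size. A standalone sketch of that computation, with struct rsv as an illustrative stand-in for btrfs_block_rsv:

    #include <stdint.h>
    #include <stdio.h>

    struct rsv {                    /* models btrfs_block_rsv */
            uint64_t size;          /* target reservation */
            uint64_t reserved;      /* currently reserved bytes */
            uint64_t freed[2];      /* bytes freed, per transaction parity */
    };

    /* bytes that must be topped up for the current transaction */
    static uint64_t rsv_shortfall(const struct rsv *r, uint64_t transid)
    {
            uint64_t have = r->reserved + r->freed[transid & 0x1];

            return have < r->size ? r->size - have : 0;
    }

    int main(void)
    {
            struct rsv r = { .size = 1 << 20, .reserved = 512 << 10,
                             .freed = { 128 << 10, 0 } };
            printf("shortfall: %llu\n",
                   (unsigned long long)rsv_shortfall(&r, 42));
            return 0;
    }
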
  /*
   * This creates an orphan entry for the given inode in case something goes
   * wrong in the middle of an unlink/truncate.
+  *
+  * NOTE: the caller of this function should reserve 5 units of metadata
+  *     for it.
   */
  int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       int ret = 0;
+       struct btrfs_block_rsv *block_rsv = NULL;
+       int reserve = 0;
+       int insert = 0;
+       int ret;
+       if (!root->orphan_block_rsv) {
+               block_rsv = btrfs_alloc_block_rsv(root);
+               BUG_ON(!block_rsv);
+       }
  
-       spin_lock(&root->list_lock);
+       spin_lock(&root->orphan_lock);
+       if (!root->orphan_block_rsv) {
+               root->orphan_block_rsv = block_rsv;
+       } else if (block_rsv) {
+               btrfs_free_block_rsv(root, block_rsv);
+               block_rsv = NULL;
+       }
  
-       /* already on the orphan list, we're good */
-       if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-               spin_unlock(&root->list_lock);
-               return 0;
+       if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+               list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ #if 0
+               /*
+                * For proper ENOSPC handling, we should do orphan
+                * cleanup when mounting. But this introduces a backward
+                * compatibility issue.
+                */
+               if (!xchg(&root->orphan_item_inserted, 1))
+                       insert = 2;
+               else
+                       insert = 1;
+ #endif
+               insert = 1;
+       } else {
+               WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
        }
  
-       list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+       if (!BTRFS_I(inode)->orphan_meta_reserved) {
+               BTRFS_I(inode)->orphan_meta_reserved = 1;
+               reserve = 1;
+       }
+       spin_unlock(&root->orphan_lock);
  
-       spin_unlock(&root->list_lock);
+       if (block_rsv)
+               btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
  
-       /*
-        * insert an orphan item to track this unlinked/truncated file
-        */
-       ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+       /* grab metadata reservation from transaction handle */
+       if (reserve) {
+               ret = btrfs_orphan_reserve_metadata(trans, inode);
+               BUG_ON(ret);
+       }
  
-       return ret;
+       /* insert an orphan item to track this unlinked/truncated file */
+       if (insert >= 1) {
+               ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+               BUG_ON(ret);
+       }
+       /* insert an orphan item to track that this subvolume contains orphans */
+       if (insert >= 2) {
+               ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+                                              root->root_key.objectid);
+               BUG_ON(ret);
+       }
+       return 0;
  }
  
  /*
  int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       int delete_item = 0;
+       int release_rsv = 0;
        int ret = 0;
  
-       spin_lock(&root->list_lock);
-       if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-               spin_unlock(&root->list_lock);
-               return 0;
+       spin_lock(&root->orphan_lock);
+       if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+               list_del_init(&BTRFS_I(inode)->i_orphan);
+               delete_item = 1;
        }
  
-       list_del_init(&BTRFS_I(inode)->i_orphan);
-       if (!trans) {
-               spin_unlock(&root->list_lock);
-               return 0;
+       if (BTRFS_I(inode)->orphan_meta_reserved) {
+               BTRFS_I(inode)->orphan_meta_reserved = 0;
+               release_rsv = 1;
        }
+       spin_unlock(&root->orphan_lock);
  
-       spin_unlock(&root->list_lock);
+       if (trans && delete_item) {
+               ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+               BUG_ON(ret);
+       }
  
-       ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+       if (release_rsv)
+               btrfs_orphan_release_metadata(inode);
  
-       return ret;
+       return 0;
  }
  
  /*
@@@ -2064,7 -2243,7 +2243,7 @@@ void btrfs_orphan_cleanup(struct btrfs_
        struct inode *inode;
        int ret = 0, nr_unlink = 0, nr_truncate = 0;
  
-       if (!xchg(&root->clean_orphans, 0))
+       if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
                return;
  
        path = btrfs_alloc_path();
                found_key.type = BTRFS_INODE_ITEM_KEY;
                found_key.offset = 0;
                inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
-               if (IS_ERR(inode))
-                       break;
+               BUG_ON(IS_ERR(inode));
  
                /*
                 * add this inode to the orphan list so btrfs_orphan_del does
                 * the proper thing when we hit it
                 */
-               spin_lock(&root->list_lock);
+               spin_lock(&root->orphan_lock);
                list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-               spin_unlock(&root->list_lock);
+               spin_unlock(&root->orphan_lock);
  
                /*
                 * if this is a bad inode, means we actually succeeded in
                 * do a destroy_inode
                 */
                if (is_bad_inode(inode)) {
-                       trans = btrfs_start_transaction(root, 1);
+                       trans = btrfs_start_transaction(root, 0);
                        btrfs_orphan_del(trans, inode);
                        btrfs_end_transaction(trans, root);
                        iput(inode);
                /* this will do delete_inode and everything for us */
                iput(inode);
        }
+       btrfs_free_path(path);
+       root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+       if (root->orphan_block_rsv)
+               btrfs_block_rsv_release(root, root->orphan_block_rsv,
+                                       (u64)-1);
+       if (root->orphan_block_rsv || root->orphan_item_inserted) {
+               trans = btrfs_join_transaction(root, 1);
+               btrfs_end_transaction(trans, root);
+       }
  
        if (nr_unlink)
                printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
        if (nr_truncate)
                printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
-       btrfs_free_path(path);
  }
  
  /*
        return ret;
  }
  
- static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+ /* helper to check if there is any shared block in the path */
+ static int check_path_shared(struct btrfs_root *root,
+                            struct btrfs_path *path)
  {
-       struct btrfs_root *root;
-       struct btrfs_trans_handle *trans;
-       struct inode *inode = dentry->d_inode;
+       struct extent_buffer *eb;
+       int level;
        int ret;
-       unsigned long nr = 0;
-       root = BTRFS_I(dir)->root;
-       /*
-        * 5 items for unlink inode
-        * 1 for orphan
-        */
-       ret = btrfs_reserve_metadata_space(root, 6);
-       if (ret)
-               return ret;
+       u64 refs;
  
-       trans = btrfs_start_transaction(root, 1);
-       if (IS_ERR(trans)) {
-               btrfs_unreserve_metadata_space(root, 6);
-               return PTR_ERR(trans);
+       for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+               if (!path->nodes[level])
+                       break;
+               eb = path->nodes[level];
+               if (!btrfs_block_can_be_shared(root, eb))
+                       continue;
+               ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
+                                              &refs, NULL);
+               if (refs > 1)
+                       return 1;
        }
-       btrfs_set_trans_block_group(trans, dir);
-       btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
-       ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
-                                dentry->d_name.name, dentry->d_name.len);
-       if (inode->i_nlink == 0)
-               ret = btrfs_orphan_add(trans, inode);
-       nr = trans->blocks_used;
-       btrfs_end_transaction_throttle(trans, root);
-       btrfs_unreserve_metadata_space(root, 6);
-       btrfs_btree_balance_dirty(root, nr);
-       return ret;
+       return 0;
  }
  
- int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
-                       struct btrfs_root *root,
-                       struct inode *dir, u64 objectid,
-                       const char *name, int name_len)
+ /*
+  * helper to start transaction for unlink and rmdir.
+  *
+  * unlink and rmdir are special in btrfs: they do not always free space,
+  * so in the ENOSPC case we should make sure they will free space before
+  * allowing them to use the global metadata reservation.
+  */
+ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
+                                                      struct dentry *dentry)
  {
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_path *path;
-       struct extent_buffer *leaf;
+       struct btrfs_inode_ref *ref;
        struct btrfs_dir_item *di;
-       struct btrfs_key key;
+       struct inode *inode = dentry->d_inode;
        u64 index;
+       int check_link = 1;
+       int err = -ENOSPC;
        int ret;
  
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
+       trans = btrfs_start_transaction(root, 10);
+       if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+               return trans;
  
-       di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
-                                  name, name_len, -1);
-       BUG_ON(!di || IS_ERR(di));
+       if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+               return ERR_PTR(-ENOSPC);
  
-       leaf = path->nodes[0];
-       btrfs_dir_item_key_to_cpu(leaf, di, &key);
-       WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
-       ret = btrfs_delete_one_dir_name(trans, root, path, di);
-       BUG_ON(ret);
-       btrfs_release_path(root, path);
+       /* check if someone else holds a reference */
+       if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
+               return ERR_PTR(-ENOSPC);
  
-       ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
-                                objectid, root->root_key.objectid,
-                                dir->i_ino, &index, name, name_len);
-       if (ret < 0) {
-               BUG_ON(ret != -ENOENT);
-               di = btrfs_search_dir_index_item(root, path, dir->i_ino,
-                                                name, name_len);
-               BUG_ON(!di || IS_ERR(di));
+       if (atomic_read(&inode->i_count) > 2)
+               return ERR_PTR(-ENOSPC);
  
-               leaf = path->nodes[0];
-               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-               btrfs_release_path(root, path);
-               index = key.offset;
+       if (xchg(&root->fs_info->enospc_unlink, 1))
+               return ERR_PTR(-ENOSPC);
+       path = btrfs_alloc_path();
+       if (!path) {
+               root->fs_info->enospc_unlink = 0;
+               return ERR_PTR(-ENOMEM);
        }
  
-       di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
-                                        index, name, name_len, -1);
-       BUG_ON(!di || IS_ERR(di));
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               root->fs_info->enospc_unlink = 0;
+               return trans;
+       }
  
-       leaf = path->nodes[0];
-       btrfs_dir_item_key_to_cpu(leaf, di, &key);
-       WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
-       ret = btrfs_delete_one_dir_name(trans, root, path, di);
-       BUG_ON(ret);
-       btrfs_release_path(root, path);
+       path->skip_locking = 1;
+       path->search_commit_root = 1;
  
-       btrfs_i_size_write(dir, dir->i_size - name_len * 2);
-       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+       ret = btrfs_lookup_inode(trans, root, path,
+                               &BTRFS_I(dir)->location, 0);
+       if (ret < 0) {
+               err = ret;
+               goto out;
+       }
+       if (ret == 0) {
+               if (check_path_shared(root, path))
+                       goto out;
+       } else {
+               check_link = 0;
+       }
+       btrfs_release_path(root, path);
+       ret = btrfs_lookup_inode(trans, root, path,
+                               &BTRFS_I(inode)->location, 0);
+       if (ret < 0) {
+               err = ret;
+               goto out;
+       }
+       if (ret == 0) {
+               if (check_path_shared(root, path))
+                       goto out;
+       } else {
+               check_link = 0;
+       }
+       btrfs_release_path(root, path);
+       if (ret == 0 && S_ISREG(inode->i_mode)) {
+               ret = btrfs_lookup_file_extent(trans, root, path,
+                                              inode->i_ino, (u64)-1, 0);
+               if (ret < 0) {
+                       err = ret;
+                       goto out;
+               }
+               BUG_ON(ret == 0);
+               if (check_path_shared(root, path))
+                       goto out;
+               btrfs_release_path(root, path);
+       }
+       if (!check_link) {
+               err = 0;
+               goto out;
+       }
+       di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+                               dentry->d_name.name, dentry->d_name.len, 0);
+       if (IS_ERR(di)) {
+               err = PTR_ERR(di);
+               goto out;
+       }
+       if (di) {
+               if (check_path_shared(root, path))
+                       goto out;
+       } else {
+               err = 0;
+               goto out;
+       }
+       btrfs_release_path(root, path);
+       ref = btrfs_lookup_inode_ref(trans, root, path,
+                               dentry->d_name.name, dentry->d_name.len,
+                               inode->i_ino, dir->i_ino, 0);
+       if (IS_ERR(ref)) {
+               err = PTR_ERR(ref);
+               goto out;
+       }
+       BUG_ON(!ref);
+       if (check_path_shared(root, path))
+               goto out;
+       index = btrfs_inode_ref_index(path->nodes[0], ref);
+       btrfs_release_path(root, path);
+       di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
+                               dentry->d_name.name, dentry->d_name.len, 0);
+       if (IS_ERR(di)) {
+               err = PTR_ERR(di);
+               goto out;
+       }
+       BUG_ON(ret == -ENOENT);
+       if (check_path_shared(root, path))
+               goto out;
+       err = 0;
+ out:
+       btrfs_free_path(path);
+       if (err) {
+               btrfs_end_transaction(trans, root);
+               root->fs_info->enospc_unlink = 0;
+               return ERR_PTR(err);
+       }
+       trans->block_rsv = &root->fs_info->global_block_rsv;
+       return trans;
+ }
+
+ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root)
+ {
+       if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+               BUG_ON(!root->fs_info->enospc_unlink);
+               root->fs_info->enospc_unlink = 0;
+       }
+       btrfs_end_transaction_throttle(trans, root);
+ }
+
+ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+ {
+       struct btrfs_root *root = BTRFS_I(dir)->root;
+       struct btrfs_trans_handle *trans;
+       struct inode *inode = dentry->d_inode;
+       int ret;
+       unsigned long nr = 0;
+       trans = __unlink_start_trans(dir, dentry);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+       btrfs_set_trans_block_group(trans, dir);
+       btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+       ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+                                dentry->d_name.name, dentry->d_name.len);
+       BUG_ON(ret);
+       if (inode->i_nlink == 0) {
+               ret = btrfs_orphan_add(trans, inode);
+               BUG_ON(ret);
+       }
+       nr = trans->blocks_used;
+       __unlink_end_trans(trans, root);
+       btrfs_btree_balance_dirty(root, nr);
+       return ret;
+ }
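
__unlink_start_trans() above boils down to: try a normal 10-unit reservation first, and only on ENOSPC, after check_path_shared() has proven that no affected block is shared (so the unlink really frees space), fall back to the global reservation. A toy userspace model of that control flow; every helper here is a stand-in, not kernel API:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* stand-in for btrfs_start_transaction(): pretend reservations
     * larger than 5 units fail for lack of space */
    static int start_trans(int units)
    {
            return units > 5 ? -ENOSPC : 0;
    }

    /* stand-in for check_path_shared(): nothing shared in this toy run */
    static bool paths_shared(void)
    {
            return false;
    }

    static int unlink_start_trans(void)
    {
            int ret = start_trans(10);      /* the normal, reserved path */

            if (ret != -ENOSPC)
                    return ret;
            /*
             * The cheap reservation failed.  Falling back to the global
             * reservation is only safe when no block on the affected
             * paths is shared, i.e. the unlink is guaranteed to free
             * space rather than consume it.
             */
            if (paths_shared())
                    return -ENOSPC;
            return start_trans(0);          /* zero-unit start, global rsv */
    }

    int main(void)
    {
            printf("unlink_start_trans() = %d\n", unlink_start_trans());
            return 0;
    }
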
+ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       struct inode *dir, u64 objectid,
+                       const char *name, int name_len)
+ {
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct btrfs_dir_item *di;
+       struct btrfs_key key;
+       u64 index;
+       int ret;
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+                                  name, name_len, -1);
+       BUG_ON(!di || IS_ERR(di));
+       leaf = path->nodes[0];
+       btrfs_dir_item_key_to_cpu(leaf, di, &key);
+       WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+       ret = btrfs_delete_one_dir_name(trans, root, path, di);
+       BUG_ON(ret);
+       btrfs_release_path(root, path);
+       ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+                                objectid, root->root_key.objectid,
+                                dir->i_ino, &index, name, name_len);
+       if (ret < 0) {
+               BUG_ON(ret != -ENOENT);
+               di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+                                                name, name_len);
+               BUG_ON(!di || IS_ERR(di));
+               leaf = path->nodes[0];
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+               btrfs_release_path(root, path);
+               index = key.offset;
+       }
+       di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+                                        index, name, name_len, -1);
+       BUG_ON(!di || IS_ERR(di));
+       leaf = path->nodes[0];
+       btrfs_dir_item_key_to_cpu(leaf, di, &key);
+       WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+       ret = btrfs_delete_one_dir_name(trans, root, path, di);
+       BUG_ON(ret);
+       btrfs_release_path(root, path);
+       btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, root, dir);
        BUG_ON(ret);
        dir->i_sb->s_dirt = 1;
@@@ -2587,7 -2948,6 +2948,6 @@@ static int btrfs_rmdir(struct inode *di
  {
        struct inode *inode = dentry->d_inode;
        int err = 0;
-       int ret;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_trans_handle *trans;
        unsigned long nr = 0;
            inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                return -ENOTEMPTY;
  
-       ret = btrfs_reserve_metadata_space(root, 5);
-       if (ret)
-               return ret;
-       trans = btrfs_start_transaction(root, 1);
-       if (IS_ERR(trans)) {
-               btrfs_unreserve_metadata_space(root, 5);
+       trans = __unlink_start_trans(dir, dentry);
+       if (IS_ERR(trans))
                return PTR_ERR(trans);
-       }
  
        btrfs_set_trans_block_group(trans, dir);
  
                btrfs_i_size_write(inode, 0);
  out:
        nr = trans->blocks_used;
-       ret = btrfs_end_transaction_throttle(trans, root);
-       btrfs_unreserve_metadata_space(root, 5);
+       __unlink_end_trans(trans, root);
        btrfs_btree_balance_dirty(root, nr);
  
-       if (ret && !err)
-               err = ret;
        return err;
  }
  
@@@ -3029,6 -3380,7 +3380,7 @@@ out
        if (pending_del_nr) {
                ret = btrfs_del_items(trans, root, path, pending_del_slot,
                                      pending_del_nr);
+               BUG_ON(ret);
        }
        btrfs_free_path(path);
        return err;
@@@ -3056,11 -3408,7 +3408,7 @@@ static int btrfs_truncate_page(struct a
  
        if ((offset & (blocksize - 1)) == 0)
                goto out;
-       ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
-       if (ret)
-               goto out;
-       ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+       ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
        if (ret)
                goto out;
  
  again:
        page = grab_cache_page(mapping, index);
        if (!page) {
-               btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-               btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
                goto out;
        }
  
  
  out_unlock:
        if (ret)
-               btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-       btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
        unlock_page(page);
        page_cache_release(page);
  out:
@@@ -3145,7 -3491,7 +3491,7 @@@ int btrfs_cont_expand(struct inode *ino
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       struct extent_map *em;
+       struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
        u64 mask = root->sectorsize - 1;
        u64 hole_start = (inode->i_size + mask) & ~mask;
                        u64 hint_byte = 0;
                        hole_size = last_byte - cur_offset;
  
-                       err = btrfs_reserve_metadata_space(root, 2);
-                       if (err)
+                       trans = btrfs_start_transaction(root, 2);
+                       if (IS_ERR(trans)) {
+                               err = PTR_ERR(trans);
                                break;
-                       trans = btrfs_start_transaction(root, 1);
+                       }
                        btrfs_set_trans_block_group(trans, inode);
  
                        err = btrfs_drop_extents(trans, inode, cur_offset,
                                        last_byte - 1, 0);
  
                        btrfs_end_transaction(trans, root);
-                       btrfs_unreserve_metadata_space(root, 2);
                }
                free_extent_map(em);
+               em = NULL;
                cur_offset = last_byte;
                if (cur_offset >= block_end)
                        break;
        }
  
+       free_extent_map(em);
        unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
                             GFP_NOFS);
        return err;
@@@ -3239,11 -3586,10 +3586,10 @@@ static int btrfs_setattr_size(struct in
                }
        }
  
-       ret = btrfs_reserve_metadata_space(root, 1);
-       if (ret)
-               return ret;
+       trans = btrfs_start_transaction(root, 5);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
  
-       trans = btrfs_start_transaction(root, 1);
        btrfs_set_trans_block_group(trans, inode);
  
        ret = btrfs_orphan_add(trans, inode);
  
        nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
-       btrfs_unreserve_metadata_space(root, 1);
        btrfs_btree_balance_dirty(root, nr);
  
        if (attr->ia_size > inode->i_size) {
                i_size_write(inode, attr->ia_size);
                btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
  
-               trans = btrfs_start_transaction(root, 1);
+               trans = btrfs_start_transaction(root, 0);
+               BUG_ON(IS_ERR(trans));
                btrfs_set_trans_block_group(trans, inode);
+               trans->block_rsv = root->orphan_block_rsv;
+               BUG_ON(!trans->block_rsv);
  
                ret = btrfs_update_inode(trans, root, inode);
                BUG_ON(ret);
@@@ -3345,10 -3693,21 +3693,21 @@@ void btrfs_delete_inode(struct inode *i
        btrfs_i_size_write(inode, 0);
  
        while (1) {
-               trans = btrfs_start_transaction(root, 1);
+               trans = btrfs_start_transaction(root, 0);
+               BUG_ON(IS_ERR(trans));
                btrfs_set_trans_block_group(trans, inode);
-               ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+               trans->block_rsv = root->orphan_block_rsv;
+               ret = btrfs_block_rsv_check(trans, root,
+                                           root->orphan_block_rsv, 0, 5);
+               if (ret) {
+                       BUG_ON(ret != -EAGAIN);
+                       ret = btrfs_commit_transaction(trans, root);
+                       BUG_ON(ret);
+                       continue;
+               }
  
+               ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
                if (ret != -EAGAIN)
                        break;
  
                btrfs_end_transaction(trans, root);
                trans = NULL;
                btrfs_btree_balance_dirty(root, nr);
        }
  
        if (ret == 0) {
@@@ -3596,40 -3956,10 +3956,10 @@@ again
        return 0;
  }
  
- static noinline void init_btrfs_i(struct inode *inode)
- {
-       struct btrfs_inode *bi = BTRFS_I(inode);
-       bi->generation = 0;
-       bi->sequence = 0;
-       bi->last_trans = 0;
-       bi->last_sub_trans = 0;
-       bi->logged_trans = 0;
-       bi->delalloc_bytes = 0;
-       bi->reserved_bytes = 0;
-       bi->disk_i_size = 0;
-       bi->flags = 0;
-       bi->index_cnt = (u64)-1;
-       bi->last_unlink_trans = 0;
-       bi->ordered_data_close = 0;
-       bi->force_compress = 0;
-       extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
-       extent_io_tree_init(&BTRFS_I(inode)->io_tree,
-                            inode->i_mapping, GFP_NOFS);
-       extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
-                            inode->i_mapping, GFP_NOFS);
-       INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
-       INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
-       RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
-       btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-       mutex_init(&BTRFS_I(inode)->log_mutex);
- }
  static int btrfs_init_locked_inode(struct inode *inode, void *p)
  {
        struct btrfs_iget_args *args = p;
        inode->i_ino = args->ino;
-       init_btrfs_i(inode);
        BTRFS_I(inode)->root = args->root;
        btrfs_set_inode_space_info(args->root, inode);
        return 0;
@@@ -3692,8 -4022,6 +4022,6 @@@ static struct inode *new_simple_dir(str
        if (!inode)
                return ERR_PTR(-ENOMEM);
  
-       init_btrfs_i(inode);
        BTRFS_I(inode)->root = root;
        memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
        BTRFS_I(inode)->dummy_inode = 1;
@@@ -3950,7 -4278,7 +4278,7 @@@ int btrfs_write_inode(struct inode *ino
        struct btrfs_trans_handle *trans;
        int ret = 0;
  
-       if (root->fs_info->btree_inode == inode)
+       if (BTRFS_I(inode)->dummy_inode)
                return 0;
  
        if (wbc->sync_mode == WB_SYNC_ALL) {
@@@ -3971,10 -4299,38 +4299,38 @@@ void btrfs_dirty_inode(struct inode *in
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
+       int ret;
+       if (BTRFS_I(inode)->dummy_inode)
+               return;
  
        trans = btrfs_join_transaction(root, 1);
        btrfs_set_trans_block_group(trans, inode);
-       btrfs_update_inode(trans, root, inode);
+       ret = btrfs_update_inode(trans, root, inode);
+       if (ret && ret == -ENOSPC) {
+               /* whoops, let's try again with the full transaction */
+               btrfs_end_transaction(trans, root);
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       if (printk_ratelimit()) {
+                               printk(KERN_ERR "btrfs: fail to "
+                                      "dirty  inode %lu error %ld\n",
+                                      inode->i_ino, PTR_ERR(trans));
+                       }
+                       return;
+               }
+               btrfs_set_trans_block_group(trans, inode);
+               ret = btrfs_update_inode(trans, root, inode);
+               if (ret) {
+                       if (printk_ratelimit()) {
+                               printk(KERN_ERR "btrfs: fail to "
+                                      "dirty  inode %lu error %d\n",
+                                      inode->i_ino, ret);
+                       }
+               }
+       }
        btrfs_end_transaction(trans, root);
  }
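
btrfs_dirty_inode() above retries exactly once: a cheap join (no new reservation) first, then, only on ENOSPC, a fully reserved transaction. A toy model of that fallback shape, with stand-in helpers rather than the kernel API:

    #include <errno.h>
    #include <stdio.h>

    /* stand-in for the cheap btrfs_join_transaction() update path */
    static int update_via_join(void)
    {
            return -ENOSPC;         /* pretend the join path ran dry */
    }

    /* stand-in for a reserved btrfs_start_transaction(root, 1) path */
    static int update_via_reserved(void)
    {
            return 0;
    }

    static int dirty_inode(void)
    {
            int ret = update_via_join();

            if (ret == -ENOSPC)     /* retry with a real reservation */
                    ret = update_via_reserved();
            return ret;
    }

    int main(void)
    {
            printf("dirty_inode() = %d\n", dirty_inode());
            return 0;
    }
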
  
@@@ -4092,7 -4448,6 +4448,6 @@@ static struct inode *btrfs_new_inode(st
         * btrfs_get_inode_index_count has an explanation for the magic
         * number
         */
-       init_btrfs_i(inode);
        BTRFS_I(inode)->index_cnt = 2;
        BTRFS_I(inode)->root = root;
        BTRFS_I(inode)->generation = trans->transid;
        if (ret != 0)
                goto fail;
  
 -      inode->i_uid = current_fsuid();
 -
 -      if (dir && (dir->i_mode & S_ISGID)) {
 -              inode->i_gid = dir->i_gid;
 -              if (S_ISDIR(mode))
 -                      mode |= S_ISGID;
 -      } else
 -              inode->i_gid = current_fsgid();
 -
 -      inode->i_mode = mode;
 +      inode_init_owner(inode, dir, mode);
        inode->i_ino = objectid;
        inode_set_bytes(inode, 0);
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@@ -4247,26 -4611,21 +4602,21 @@@ static int btrfs_mknod(struct inode *di
        if (!new_valid_dev(rdev))
                return -EINVAL;
  
+       err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+       if (err)
+               return err;
        /*
         * 2 for inode item and ref
         * 2 for dir items
         * 1 for xattr if selinux is on
         */
-       err = btrfs_reserve_metadata_space(root, 5);
-       if (err)
-               return err;
+       trans = btrfs_start_transaction(root, 5);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
  
-       trans = btrfs_start_transaction(root, 1);
-       if (!trans)
-               goto fail;
        btrfs_set_trans_block_group(trans, dir);
  
-       err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-       if (err) {
-               err = -ENOSPC;
-               goto out_unlock;
-       }
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
                                dentry->d_parent->d_inode->i_ino, objectid,
  out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
- fail:
-       btrfs_unreserve_metadata_space(root, 5);
+       btrfs_btree_balance_dirty(root, nr);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
        }
-       btrfs_btree_balance_dirty(root, nr);
        return err;
  }
  
@@@ -4311,32 -4668,26 +4659,26 @@@ static int btrfs_create(struct inode *d
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct inode *inode = NULL;
-       int err;
        int drop_inode = 0;
+       int err;
        unsigned long nr = 0;
        u64 objectid;
        u64 index = 0;
  
+       err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+       if (err)
+               return err;
        /*
         * 2 for inode item and ref
         * 2 for dir items
         * 1 for xattr if selinux is on
         */
-       err = btrfs_reserve_metadata_space(root, 5);
-       if (err)
-               return err;
+       trans = btrfs_start_transaction(root, 5);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
  
-       trans = btrfs_start_transaction(root, 1);
-       if (!trans)
-               goto fail;
        btrfs_set_trans_block_group(trans, dir);
  
-       err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-       if (err) {
-               err = -ENOSPC;
-               goto out_unlock;
-       }
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
                                dentry->d_parent->d_inode->i_ino,
  out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
- fail:
-       btrfs_unreserve_metadata_space(root, 5);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@@ -4396,21 -4745,21 +4736,21 @@@ static int btrfs_link(struct dentry *ol
        if (root->objectid != BTRFS_I(inode)->root->objectid)
                return -EPERM;
  
-       /*
-        * 1 item for inode ref
-        * 2 items for dir items
-        */
-       err = btrfs_reserve_metadata_space(root, 3);
-       if (err)
-               return err;
        btrfs_inc_nlink(inode);
  
        err = btrfs_set_inode_index(dir, &index);
        if (err)
                goto fail;
  
-       trans = btrfs_start_transaction(root, 1);
+       /*
+        * 1 item for inode ref
+        * 2 items for dir items
+        */
+       trans = btrfs_start_transaction(root, 3);
+       if (IS_ERR(trans)) {
+               err = PTR_ERR(trans);
+               goto fail;
+       }
  
        btrfs_set_trans_block_group(trans, dir);
        atomic_inc(&inode->i_count);
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
  fail:
-       btrfs_unreserve_metadata_space(root, 3);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@@ -4449,28 -4797,20 +4788,20 @@@ static int btrfs_mkdir(struct inode *di
        u64 index = 0;
        unsigned long nr = 1;
  
+       err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+       if (err)
+               return err;
        /*
         * 2 items for inode and ref
         * 2 items for dir items
         * 1 for xattr if selinux is on
         */
-       err = btrfs_reserve_metadata_space(root, 5);
-       if (err)
-               return err;
-       trans = btrfs_start_transaction(root, 1);
-       if (!trans) {
-               err = -ENOMEM;
-               goto out_unlock;
-       }
+       trans = btrfs_start_transaction(root, 5);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
        btrfs_set_trans_block_group(trans, dir);
  
-       err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-       if (err) {
-               err = -ENOSPC;
-               goto out_fail;
-       }
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
                                dentry->d_parent->d_inode->i_ino, objectid,
  out_fail:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
- out_unlock:
-       btrfs_unreserve_metadata_space(root, 5);
        if (drop_on_err)
                iput(inode);
        btrfs_btree_balance_dirty(root, nr);
@@@ -4770,6 -5107,7 +5098,7 @@@ again
                        }
                        flush_dcache_page(page);
                } else if (create && PageUptodate(page)) {
+                       WARN_ON(1);
                        if (!trans) {
                                kunmap(page);
                                free_extent_map(em);
        return em;
  }
  
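+ /*
+  * Allocate an extent for a direct IO write that has to COW: reserve a
+  * new data extent against the delalloc block reservation, insert a
+  * pinned extent map for it and queue an ordered extent so the file
+  * extent item is created when the write completes.
+  */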
+ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+                                                 u64 start, u64 len)
+ {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_trans_handle *trans;
+       struct extent_map *em;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct btrfs_key ins;
+       u64 alloc_hint;
+       int ret;
+       btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+       trans = btrfs_join_transaction(root, 0);
+       if (!trans)
+               return ERR_PTR(-ENOMEM);
+       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+       alloc_hint = get_extent_allocation_hint(inode, start, len);
+       ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+                                  alloc_hint, (u64)-1, &ins, 1);
+       if (ret) {
+               em = ERR_PTR(ret);
+               goto out;
+       }
+       em = alloc_extent_map(GFP_NOFS);
+       if (!em) {
+               em = ERR_PTR(-ENOMEM);
+               goto out;
+       }
+       em->start = start;
+       em->orig_start = em->start;
+       em->len = ins.offset;
+       em->block_start = ins.objectid;
+       em->block_len = ins.offset;
+       em->bdev = root->fs_info->fs_devices->latest_bdev;
+       set_bit(EXTENT_FLAG_PINNED, &em->flags);
+       while (1) {
+               write_lock(&em_tree->lock);
+               ret = add_extent_mapping(em_tree, em);
+               write_unlock(&em_tree->lock);
+               if (ret != -EEXIST)
+                       break;
+               btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+       }
+       ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+                                          ins.offset, ins.offset, 0);
+       if (ret) {
+               btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+               em = ERR_PTR(ret);
+       }
+ out:
+       btrfs_end_transaction(trans, root);
+       return em;
+ }
+
+ /*
+  * returns 1 when the nocow is safe, < 0 on error, 0 if the
+  * block must be cow'd
+  */
+ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+                                     struct inode *inode, u64 offset, u64 len)
+ {
+       struct btrfs_path *path;
+       int ret;
+       struct extent_buffer *leaf;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+       u64 disk_bytenr;
+       u64 backref_offset;
+       u64 extent_end;
+       u64 num_bytes;
+       int slot;
+       int found_type;
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+                                      offset, 0);
+       if (ret < 0)
+               goto out;
+       slot = path->slots[0];
+       if (ret == 1) {
+               if (slot == 0) {
+                       /* can't find the item, must cow */
+                       ret = 0;
+                       goto out;
+               }
+               slot--;
+       }
+       ret = 0;
+       leaf = path->nodes[0];
+       btrfs_item_key_to_cpu(leaf, &key, slot);
+       if (key.objectid != inode->i_ino ||
+           key.type != BTRFS_EXTENT_DATA_KEY) {
+               /* not our file or wrong item type, must cow */
+               goto out;
+       }
+       if (key.offset > offset) {
+               /* Wrong offset, must cow */
+               goto out;
+       }
+       fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+       found_type = btrfs_file_extent_type(leaf, fi);
+       if (found_type != BTRFS_FILE_EXTENT_REG &&
+           found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+               /* not a regular extent, must cow */
+               goto out;
+       }
+       disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+       backref_offset = btrfs_file_extent_offset(leaf, fi);
+       extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+       if (extent_end < offset + len) {
+               /* extent doesn't include our full range, must cow */
+               goto out;
+       }
+       if (btrfs_extent_readonly(root, disk_bytenr))
+               goto out;
+       /*
+        * look for other files referencing this extent, if we
+        * find any we must cow
+        */
+       if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+                                 key.offset - backref_offset, disk_bytenr))
+               goto out;
+       /*
+        * adjust disk_bytenr and num_bytes to cover just the bytes
+        * in this extent we are about to write.  If there
+        * are any csums in that range we have to cow in order
+        * to keep the csums correct
+        */
+       disk_bytenr += backref_offset;
+       disk_bytenr += offset - key.offset;
+       num_bytes = min(offset + len, extent_end) - offset;
+       if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+               goto out;
+       /*
+        * all of the above have passed, it is safe to overwrite this extent
+        * without cow
+        */
+       ret = 1;
+ out:
+       btrfs_free_path(path);
+       return ret;
+ }
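+
+ /*
+  * get_block callback for the btrfs direct IO code: map the requested
+  * file range to an existing extent when nocow is safe, allocate a new
+  * extent for writes that must COW, and return -ENOTBLK to force a
+  * buffered fallback for inline or compressed extents.
+  */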
+ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
+ {
+       struct extent_map *em;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 start = iblock << inode->i_blkbits;
+       u64 len = bh_result->b_size;
+       struct btrfs_trans_handle *trans;
+       em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
+       /*
+        * Ok, for INLINE and COMPRESSED extents we need to fall back on
+        * buffered IO.  INLINE is special, and we could probably kludge it in
+        * here, but it's still buffered so for safety let's just fall back to
+        * the generic buffered path.
+        *
+        * For COMPRESSED we _have_ to read the entire extent in so we can
+        * decompress it, so there will be buffering required no matter what we
+        * do, so go ahead and fall back to buffered.
+        *
+        * We return -ENOTBLK because that's what makes DIO go ahead and go
+        * back to buffered IO.  Don't blame me, this is the price we pay for
+        * using the generic code.
+        */
+       if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+           em->block_start == EXTENT_MAP_INLINE) {
+               free_extent_map(em);
+               return -ENOTBLK;
+       }
+       /* Just a good old fashioned hole, return */
+       if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+                       test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+               free_extent_map(em);
+               /* DIO will do one hole at a time, so just unlock a sector */
+               unlock_extent(&BTRFS_I(inode)->io_tree, start,
+                             start + root->sectorsize - 1, GFP_NOFS);
+               return 0;
+       }
+       /*
+        * We don't allocate a new extent in the following cases:
+        *
+        * 1) The inode is marked as NODATACOW.  In this case we'll just use
+        * the existing extent.
+        * 2) The extent is marked as PREALLOC.  We're good to go here and can
+        * just use the extent.
+        */
+       if (!create) {
+               len = em->len - (start - em->start);
+               goto map;
+       }
+       if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+           ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+            em->block_start != EXTENT_MAP_HOLE)) {
+               int type;
+               int ret;
+               u64 block_start;
+               if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+                       type = BTRFS_ORDERED_PREALLOC;
+               else
+                       type = BTRFS_ORDERED_NOCOW;
+               len = min(len, em->len - (start - em->start));
+               block_start = em->block_start + (start - em->start);
+               /*
+                * we're not going to log anything, but we do need
+                * to make sure the current transaction stays open
+                * while we look for nocow cross refs
+                */
+               trans = btrfs_join_transaction(root, 0);
+               if (!trans)
+                       goto must_cow;
+               if (can_nocow_odirect(trans, inode, start, len) == 1) {
+                       ret = btrfs_add_ordered_extent_dio(inode, start,
+                                          block_start, len, len, type);
+                       btrfs_end_transaction(trans, root);
+                       if (ret) {
+                               free_extent_map(em);
+                               return ret;
+                       }
+                       goto unlock;
+               }
+               btrfs_end_transaction(trans, root);
+       }
+ must_cow:
+       /*
+        * this will cow the extent, reset the len in case we changed
+        * it above
+        */
+       len = bh_result->b_size;
+       free_extent_map(em);
+       em = btrfs_new_extent_direct(inode, start, len);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
+       len = min(len, em->len - (start - em->start));
+ unlock:
+       clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+                         EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+                         0, NULL, GFP_NOFS);
+ map:
+       bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+               inode->i_blkbits;
+       bh_result->b_size = len;
+       bh_result->b_bdev = em->bdev;
+       set_buffer_mapped(bh_result);
+       if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+               set_buffer_new(bh_result);
+       free_extent_map(em);
+       return 0;
+ }
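+
+ /*
+  * State carried by a direct IO bio: the inode and file range it covers,
+  * its starting disk byte, the expected csums for reads, and the
+  * caller's original bi_private, restored before dio_end_io().
+  */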
+ struct btrfs_dio_private {
+       struct inode *inode;
+       u64 logical_offset;
+       u64 disk_bytenr;
+       u64 bytes;
+       u32 *csums;
+       void *private;
+ };
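+
+ /*
+  * Read completion handler for direct IO: verify the csum of each page
+  * in the bio unless the inode is NODATASUM, unlock the file range and
+  * complete the bio via dio_end_io().
+  */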
+ static void btrfs_endio_direct_read(struct bio *bio, int err)
+ {
+       struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+       struct bio_vec *bvec = bio->bi_io_vec;
+       struct btrfs_dio_private *dip = bio->bi_private;
+       struct inode *inode = dip->inode;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 start;
+       u32 *private = dip->csums;
+       start = dip->logical_offset;
+       do {
+               if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+                       struct page *page = bvec->bv_page;
+                       char *kaddr;
+                       u32 csum = ~(u32)0;
+                       unsigned long flags;
+                       local_irq_save(flags);
+                       kaddr = kmap_atomic(page, KM_IRQ0);
+                       csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+                                              csum, bvec->bv_len);
+                       btrfs_csum_final(csum, (char *)&csum);
+                       kunmap_atomic(kaddr, KM_IRQ0);
+                       local_irq_restore(flags);
+                       flush_dcache_page(bvec->bv_page);
+                       if (csum != *private) {
+                               printk(KERN_ERR "btrfs csum failed ino %lu off"
+                                     " %llu csum %u private %u\n",
+                                     inode->i_ino, (unsigned long long)start,
+                                     csum, *private);
+                               err = -EIO;
+                       }
+               }
+               start += bvec->bv_len;
+               private++;
+               bvec++;
+       } while (bvec <= bvec_end);
+       unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+                     dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+       bio->bi_private = dip->private;
+       kfree(dip->csums);
+       kfree(dip);
+       dio_end_io(bio, err);
+ }
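+
+ /*
+  * Write completion handler for direct IO: once the last pending ordered
+  * extent for the range is done, record the extent on disk (marking
+  * preallocated extents as written, or inserting a new file extent
+  * item), add the pending csums, update the on-disk i_size, release the
+  * delalloc reservation and complete the bio via dio_end_io().
+  */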
+ static void btrfs_endio_direct_write(struct bio *bio, int err)
+ {
+       struct btrfs_dio_private *dip = bio->bi_private;
+       struct inode *inode = dip->inode;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_trans_handle *trans;
+       struct btrfs_ordered_extent *ordered = NULL;
+       struct extent_state *cached_state = NULL;
+       int ret;
+       if (err)
+               goto out_done;
+       ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+                                            dip->logical_offset, dip->bytes);
+       if (!ret)
+               goto out_done;
+       BUG_ON(!ordered);
+       trans = btrfs_join_transaction(root, 1);
+       if (!trans) {
+               err = -ENOMEM;
+               goto out;
+       }
+       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+       if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+               ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, root, inode);
+               err = ret;
+               goto out;
+       }
+       lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+                        ordered->file_offset + ordered->len - 1, 0,
+                        &cached_state, GFP_NOFS);
+       if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+               ret = btrfs_mark_extent_written(trans, inode,
+                                               ordered->file_offset,
+                                               ordered->file_offset +
+                                               ordered->len);
+               if (ret) {
+                       err = ret;
+                       goto out_unlock;
+               }
+       } else {
+               ret = insert_reserved_file_extent(trans, inode,
+                                                 ordered->file_offset,
+                                                 ordered->start,
+                                                 ordered->disk_len,
+                                                 ordered->len,
+                                                 ordered->len,
+                                                 0, 0, 0,
+                                                 BTRFS_FILE_EXTENT_REG);
+               unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+                                  ordered->file_offset, ordered->len);
+               if (ret) {
+                       err = ret;
+                       WARN_ON(1);
+                       goto out_unlock;
+               }
+       }
+       add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+       btrfs_ordered_update_i_size(inode, 0, ordered);
+       btrfs_update_inode(trans, root, inode);
+ out_unlock:
+       unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+                            ordered->file_offset + ordered->len - 1,
+                            &cached_state, GFP_NOFS);
+ out:
+       btrfs_delalloc_release_metadata(inode, ordered->len);
+       btrfs_end_transaction(trans, root);
+       /* drop our lookup ref and the ref held by the ordered tree */
+       btrfs_put_ordered_extent(ordered);
+       btrfs_put_ordered_extent(ordered);
+ out_done:
+       bio->bi_private = dip->private;
+       kfree(dip->csums);
+       kfree(dip);
+       dio_end_io(bio, err);
+ }
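+
+ /*
+  * Async helper run before a direct IO write bio is submitted: compute
+  * the data checksums for the bio so they can be inserted once the
+  * write completes.
+  */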
+ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+                                   struct bio *bio, int mirror_num,
+                                   unsigned long bio_flags, u64 offset)
+ {
+       int ret;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+       BUG_ON(ret);
+       return 0;
+ }
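+
+ /*
+  * bio submission hook handed to __blockdev_direct_IO: attach a
+  * btrfs_dio_private to the bio, set the read or write completion
+  * handler, checksum writes via the async helper threads (or look up
+  * the expected csums for reads), and map the bio down to the devices.
+  */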
+ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+                               loff_t file_offset)
+ {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_dio_private *dip;
+       struct bio_vec *bvec = bio->bi_io_vec;
+       int skip_sum;
+       int write = rw & (1 << BIO_RW);
+       int ret = 0;
+       skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+       dip = kmalloc(sizeof(*dip), GFP_NOFS);
+       if (!dip) {
+               ret = -ENOMEM;
+               goto free_ordered;
+       }
+       dip->csums = NULL;
+       if (!skip_sum) {
+               dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+               if (!dip->csums) {
+                       ret = -ENOMEM;
+                       goto free_ordered;
+               }
+       }
+       dip->private = bio->bi_private;
+       dip->inode = inode;
+       dip->logical_offset = file_offset;
+       dip->bytes = 0;
+       do {
+               dip->bytes += bvec->bv_len;
+               bvec++;
+       } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+       dip->disk_bytenr = (u64)bio->bi_sector << 9;
+       bio->bi_private = dip;
+       if (write)
+               bio->bi_end_io = btrfs_endio_direct_write;
+       else
+               bio->bi_end_io = btrfs_endio_direct_read;
+       ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+       if (ret)
+               goto out_err;
+       if (write && !skip_sum) {
+               ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+                                  inode, rw, bio, 0, 0,
+                                  dip->logical_offset,
+                                  __btrfs_submit_bio_start_direct_io,
+                                  __btrfs_submit_bio_done);
+               if (ret)
+                       goto out_err;
+               return;
+       } else if (!skip_sum)
+               btrfs_lookup_bio_sums_dio(root, inode, bio,
+                                         dip->logical_offset, dip->csums);
+       ret = btrfs_map_bio(root, rw, bio, 0, 1);
+       if (ret)
+               goto out_err;
+       return;
+ out_err:
+       kfree(dip->csums);
+       kfree(dip);
+ free_ordered:
+       /*
+        * If this is a write, we need to clean up the reserved space and kill
+        * the ordered extent.
+        */
+       if (write) {
+               struct btrfs_ordered_extent *ordered;
+               ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+               if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+                   !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+                       btrfs_free_reserved_extent(root, ordered->start,
+                                                  ordered->disk_len);
+               /* drop our lookup ref and the ref held by the ordered tree */
+               btrfs_put_ordered_extent(ordered);
+               btrfs_put_ordered_extent(ordered);
+       }
+       bio_endio(bio, ret);
+ }
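+
+ /*
+  * DIO alignment check: the file offset and every iovec base and length
+  * must be sector aligned, since blocks cannot straddle pages.  Returns
+  * 0 when direct IO can proceed and -EINVAL when the caller has to fall
+  * back to buffered IO.
+  */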
+ static ssize_t check_direct_IO(struct btrfs_root *root, int rw,
+                               struct kiocb *iocb, const struct iovec *iov,
+                               loff_t offset, unsigned long nr_segs)
+ {
+       int seg;
+       size_t size;
+       unsigned long addr;
+       unsigned blocksize_mask = root->sectorsize - 1;
+       ssize_t retval = -EINVAL;
+       loff_t end = offset;
+       if (offset & blocksize_mask)
+               goto out;
+       /* Check the memory alignment.  Blocks cannot straddle pages */
+       for (seg = 0; seg < nr_segs; seg++) {
+               addr = (unsigned long)iov[seg].iov_base;
+               size = iov[seg].iov_len;
+               end += size;
+               if ((addr & blocksize_mask) || (size & blocksize_mask))
+                       goto out;
+       }
+       retval = 0;
+ out:
+       return retval;
+ }
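+
+ /*
+  * Entry point for direct IO: lock the range and wait out any ordered
+  * extents inside it, reserve data space and tag the range delalloc for
+  * writes, then hand the actual IO off to __blockdev_direct_IO.  On
+  * error or a short transfer the untouched part of the range is
+  * unlocked again so the buffered fallback can proceed.
+  */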
  static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                        const struct iovec *iov, loff_t offset,
                        unsigned long nr_segs)
  {
-       return -EINVAL;
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+       struct btrfs_ordered_extent *ordered;
+       struct extent_state *cached_state = NULL;
+       u64 lockstart, lockend;
+       ssize_t ret;
+       int writing = rw & WRITE;
+       int write_bits = 0;
+       size_t count = iov_length(iov, nr_segs);
+       if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+                           offset, nr_segs)) {
+               return 0;
+       }
+       lockstart = offset;
+       lockend = offset + count - 1;
+       if (writing) {
+               ret = btrfs_delalloc_reserve_space(inode, count);
+               if (ret)
+                       goto out;
+       }
+       while (1) {
+               lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                0, &cached_state, GFP_NOFS);
+               /*
+                * We're concerned with the entire range that we're going to be
+                * doing DIO to, so we need to make sure there are no ordered
+                * extents in this range.
+                */
+               ordered = btrfs_lookup_ordered_range(inode, lockstart,
+                                                    lockend - lockstart + 1);
+               if (!ordered)
+                       break;
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                    &cached_state, GFP_NOFS);
+               btrfs_start_ordered_extent(inode, ordered, 1);
+               btrfs_put_ordered_extent(ordered);
+               cond_resched();
+       }
+       /*
+        * we don't use btrfs_set_extent_delalloc because we don't want
+        * the dirty or uptodate bits
+        */
+       if (writing) {
+               write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                    EXTENT_DELALLOC, 0, NULL, &cached_state,
+                                    GFP_NOFS);
+               if (ret) {
+                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                        lockend, EXTENT_LOCKED | write_bits,
+                                        1, 0, &cached_state, GFP_NOFS);
+                       goto out;
+               }
+       }
+       free_extent_state(cached_state);
+       cached_state = NULL;
+       ret = __blockdev_direct_IO(rw, iocb, inode,
+                  BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+                  iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+                  btrfs_submit_direct, 0);
+       if (ret < 0 && ret != -EIOCBQUEUED) {
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+                             offset + iov_length(iov, nr_segs) - 1,
+                             EXTENT_LOCKED | write_bits, 1, 0,
+                             &cached_state, GFP_NOFS);
+       } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+               /*
+                * We're falling back to buffered, unlock the section we didn't
+                * do IO on.
+                */
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+                             offset + iov_length(iov, nr_segs) - 1,
+                             EXTENT_LOCKED | write_bits, 1, 0,
+                             &cached_state, GFP_NOFS);
+       }
+ out:
+       free_extent_state(cached_state);
+       return ret;
  }
  
  static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@@ -5034,7 -6012,7 +6003,7 @@@ int btrfs_page_mkwrite(struct vm_area_s
        u64 page_start;
        u64 page_end;
  
-       ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+       ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
        if (ret) {
                if (ret == -ENOMEM)
                        ret = VM_FAULT_OOM;
                goto out;
        }
  
-       ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-       if (ret) {
-               btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-               ret = VM_FAULT_SIGBUS;
-               goto out;
-       }
        ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
  again:
        lock_page(page);
  
        if ((page->mapping != inode->i_mapping) ||
            (page_start >= size)) {
-               btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
                /* page got truncated out from underneath us */
                goto out_unlock;
        }
                unlock_extent_cached(io_tree, page_start, page_end,
                                     &cached_state, GFP_NOFS);
                ret = VM_FAULT_SIGBUS;
-               btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
                goto out_unlock;
        }
        ret = 0;
        unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
  
  out_unlock:
-       btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
        if (!ret)
                return VM_FAULT_LOCKED;
        unlock_page(page);
+       btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
  out:
        return ret;
  }
@@@ -5155,8 -6124,10 +6115,10 @@@ static void btrfs_truncate(struct inod
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
  
-       trans = btrfs_start_transaction(root, 1);
+       trans = btrfs_start_transaction(root, 0);
+       BUG_ON(IS_ERR(trans));
        btrfs_set_trans_block_group(trans, inode);
+       trans->block_rsv = root->orphan_block_rsv;
  
        /*
         * setattr is responsible for setting the ordered_data_close flag,
                btrfs_add_ordered_operation(trans, root, inode);
  
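+       /*
+        * Truncate in stages: before each pass make sure the orphan block
+        * reservation still has enough space, committing the transaction
+        * and starting a fresh one when it runs low.
+        */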
        while (1) {
+               if (!trans) {
+                       trans = btrfs_start_transaction(root, 0);
+                       BUG_ON(IS_ERR(trans));
+                       btrfs_set_trans_block_group(trans, inode);
+                       trans->block_rsv = root->orphan_block_rsv;
+               }
+               ret = btrfs_block_rsv_check(trans, root,
+                                           root->orphan_block_rsv, 0, 5);
+               if (ret) {
+                       BUG_ON(ret != -EAGAIN);
+                       ret = btrfs_commit_transaction(trans, root);
+                       BUG_ON(ret);
+                       trans = NULL;
+                       continue;
+               }
                ret = btrfs_truncate_inode_items(trans, root, inode,
                                                 inode->i_size,
                                                 BTRFS_EXTENT_DATA_KEY);
  
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
+               trans = NULL;
                btrfs_btree_balance_dirty(root, nr);
-               trans = btrfs_start_transaction(root, 1);
-               btrfs_set_trans_block_group(trans, inode);
        }
  
        if (ret == 0 && inode->i_nlink > 0) {
@@@ -5254,21 -6240,47 +6231,47 @@@ unsigned long btrfs_force_ra(struct add
  struct inode *btrfs_alloc_inode(struct super_block *sb)
  {
        struct btrfs_inode *ei;
+       struct inode *inode;
  
        ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
+       ei->root = NULL;
+       ei->space_info = NULL;
+       ei->generation = 0;
+       ei->sequence = 0;
        ei->last_trans = 0;
        ei->last_sub_trans = 0;
        ei->logged_trans = 0;
-       ei->outstanding_extents = 0;
-       ei->reserved_extents = 0;
-       ei->root = NULL;
+       ei->delalloc_bytes = 0;
+       ei->reserved_bytes = 0;
+       ei->disk_i_size = 0;
+       ei->flags = 0;
+       ei->index_cnt = (u64)-1;
+       ei->last_unlink_trans = 0;
        spin_lock_init(&ei->accounting_lock);
+       atomic_set(&ei->outstanding_extents, 0);
+       ei->reserved_extents = 0;
+       ei->ordered_data_close = 0;
+       ei->orphan_meta_reserved = 0;
+       ei->dummy_inode = 0;
+       ei->force_compress = 0;
+       inode = &ei->vfs_inode;
+       extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+       extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+       extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+       mutex_init(&ei->log_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->i_orphan);
+       INIT_LIST_HEAD(&ei->delalloc_inodes);
        INIT_LIST_HEAD(&ei->ordered_operations);
-       return &ei->vfs_inode;
+       RB_CLEAR_NODE(&ei->rb_node);
+       return inode;
  }
  
  void btrfs_destroy_inode(struct inode *inode)
  
        WARN_ON(!list_empty(&inode->i_dentry));
        WARN_ON(inode->i_data.nrpages);
+       WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+       WARN_ON(BTRFS_I(inode)->reserved_extents);
  
        /*
         * This can happen where we create an inode, but somebody else also
                spin_unlock(&root->fs_info->ordered_extent_lock);
        }
  
-       spin_lock(&root->list_lock);
+       spin_lock(&root->orphan_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
                       inode->i_ino);
                list_del_init(&BTRFS_I(inode)->i_orphan);
        }
-       spin_unlock(&root->list_lock);
+       spin_unlock(&root->orphan_lock);
  
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@@ -5425,19 -6439,6 +6430,6 @@@ static int btrfs_rename(struct inode *o
        if (S_ISDIR(old_inode->i_mode) && new_inode &&
            new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
-       /*
-        * We want to reserve the absolute worst case amount of items.  So if
-        * both inodes are subvols and we need to unlink them then that would
-        * require 4 item modifications, but if they are both normal inodes it
-        * would require 5 item modifications, so we'll assume their normal
-        * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
-        * should cover the worst case number of items we'll modify.
-        */
-       ret = btrfs_reserve_metadata_space(root, 11);
-       if (ret)
-               return ret;
        /*
         * we're using rename to replace one file with another.
         * and the replacement file is large.  Start IO on it now so
        /* close the racy window with snapshot create/destroy ioctl */
        if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                down_read(&root->fs_info->subvol_sem);
+       /*
+        * We want to reserve the absolute worst case amount of items.  So if
+        * both inodes are subvols and we need to unlink them then that would
+        * require 4 item modifications, but if they are both normal inodes it
+        * would require 5 item modifications, so we'll assume their normal
+        * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+        * should cover the worst case number of items we'll modify.
+        */
+       trans = btrfs_start_transaction(root, 20);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
  
-       trans = btrfs_start_transaction(root, 1);
        btrfs_set_trans_block_group(trans, new_dir);
  
        if (dest != root)
@@@ -5550,7 -6561,6 +6552,6 @@@ out_fail
        if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                up_read(&root->fs_info->subvol_sem);
  
-       btrfs_unreserve_metadata_space(root, 11);
        return ret;
  }
  
@@@ -5602,6 -6612,38 +6603,38 @@@ int btrfs_start_delalloc_inodes(struct 
        return 0;
  }
  
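+ /*
+  * Flush delalloc data for a single inode: grab the first inode on the
+  * fs-wide delalloc list, write it out now and drop the reference
+  * (optionally via a delayed iput).  Returns 1 if an inode was flushed
+  * and 0 if the list was empty.
+  */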
+ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+ {
+       struct btrfs_inode *binode;
+       struct inode *inode = NULL;
+       spin_lock(&root->fs_info->delalloc_lock);
+       while (!list_empty(&root->fs_info->delalloc_inodes)) {
+               binode = list_entry(root->fs_info->delalloc_inodes.next,
+                                   struct btrfs_inode, delalloc_inodes);
+               inode = igrab(&binode->vfs_inode);
+               if (inode) {
+                       list_move_tail(&binode->delalloc_inodes,
+                                      &root->fs_info->delalloc_inodes);
+                       break;
+               }
+               list_del_init(&binode->delalloc_inodes);
+               cond_resched_lock(&root->fs_info->delalloc_lock);
+       }
+       spin_unlock(&root->fs_info->delalloc_lock);
+       if (inode) {
+               write_inode_now(inode, 0);
+               if (delay_iput)
+                       btrfs_add_delayed_iput(inode);
+               else
+                       iput(inode);
+               return 1;
+       }
+       return 0;
+ }
+
  static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                         const char *symname)
  {
        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
                return -ENAMETOOLONG;
  
+       err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+       if (err)
+               return err;
        /*
         * 2 items for inode item and ref
         * 2 items for dir items
         * 1 item for xattr if selinux is on
         */
-       err = btrfs_reserve_metadata_space(root, 5);
-       if (err)
-               return err;
+       trans = btrfs_start_transaction(root, 5);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
  
-       trans = btrfs_start_transaction(root, 1);
-       if (!trans)
-               goto out_fail;
        btrfs_set_trans_block_group(trans, dir);
  
-       err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-       if (err) {
-               err = -ENOSPC;
-               goto out_unlock;
-       }
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
                                dentry->d_parent->d_inode->i_ino, objectid,
  out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
- out_fail:
-       btrfs_unreserve_metadata_space(root, 5);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
        return err;
  }
  
- static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
-                       u64 alloc_hint, int mode, loff_t actual_len)
+ int btrfs_prealloc_file_range(struct inode *inode, int mode,
+                             u64 start, u64 num_bytes, u64 min_size,
+                             loff_t actual_len, u64 *alloc_hint)
  {
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 cur_offset = start;
-       u64 num_bytes = end - start;
        int ret = 0;
-       u64 i_size;
  
        while (num_bytes > 0) {
-               trans = btrfs_start_transaction(root, 1);
-               ret = btrfs_reserve_extent(trans, root, num_bytes,
-                                          root->sectorsize, 0, alloc_hint,
-                                          (u64)-1, &ins, 1);
-               if (ret) {
-                       WARN_ON(1);
-                       goto stop_trans;
+               trans = btrfs_start_transaction(root, 3);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
                }
  
-               ret = btrfs_reserve_metadata_space(root, 3);
+               ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+                                          0, *alloc_hint, (u64)-1, &ins, 1);
                if (ret) {
-                       btrfs_free_reserved_extent(root, ins.objectid,
-                                                  ins.offset);
-                       goto stop_trans;
+                       btrfs_end_transaction(trans, root);
+                       break;
                }
  
                ret = insert_reserved_file_extent(trans, inode,
  
                num_bytes -= ins.offset;
                cur_offset += ins.offset;
-               alloc_hint = ins.objectid + ins.offset;
+               *alloc_hint = ins.objectid + ins.offset;
  
                inode->i_ctime = CURRENT_TIME;
                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-                       (actual_len > inode->i_size) &&
-                       (cur_offset > inode->i_size)) {
+                   (actual_len > inode->i_size) &&
+                   (cur_offset > inode->i_size)) {
                        if (cur_offset > actual_len)
-                               i_size  = actual_len;
+                               i_size_write(inode, actual_len);
                        else
-                               i_size = cur_offset;
-                       i_size_write(inode, i_size);
-                       btrfs_ordered_update_i_size(inode, i_size, NULL);
+                               i_size_write(inode, cur_offset);
+                       btrfs_ordered_update_i_size(inode, cur_offset, NULL);
                }
  
                ret = btrfs_update_inode(trans, root, inode);
                BUG_ON(ret);
  
                btrfs_end_transaction(trans, root);
-               btrfs_unreserve_metadata_space(root, 3);
        }
        return ret;
- stop_trans:
-       btrfs_end_transaction(trans, root);
-       return ret;
  }
  
  static long btrfs_fallocate(struct inode *inode, int mode,
                        goto out;
        }
  
-       ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
-                                         alloc_end - alloc_start);
+       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
        if (ret)
                goto out;
  
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-                       ret = prealloc_file_range(inode,
-                                                 cur_offset, last_byte,
-                                               alloc_hint, mode, offset+len);
+                       ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+                                                       last_byte - cur_offset,
+                                                       1 << inode->i_blkbits,
+                                                       offset + len,
+                                                       &alloc_hint);
                        if (ret < 0) {
                                free_extent_map(em);
                                break;
                        }
                }
-               if (em->block_start <= EXTENT_MAP_LAST_BYTE)
-                       alloc_hint = em->block_start;
                free_extent_map(em);
  
                cur_offset = last_byte;
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                             &cached_state, GFP_NOFS);
  
-       btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
-                                      alloc_end - alloc_start);
+       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
  out:
        mutex_unlock(&inode->i_mutex);
        return ret;
diff --combined fs/btrfs/super.c
index 2909a03e5230d7bf1d0653e4b40bd392a669703a,574285c8cbd4623a22fbb70dc7b98537e0a223ec..d34b2dfc9628cde65f6181b5ed4043cf8603541c
@@@ -498,7 -498,7 +498,7 @@@ int btrfs_sync_fs(struct super_block *s
        btrfs_start_delalloc_inodes(root, 0);
        btrfs_wait_ordered_extents(root, 0, 0);
  
-       trans = btrfs_start_transaction(root, 1);
+       trans = btrfs_start_transaction(root, 0);
        ret = btrfs_commit_transaction(trans, root);
        return ret;
  }
@@@ -694,11 -694,11 +694,11 @@@ static int btrfs_remount(struct super_b
                if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
                        return -EINVAL;
  
-               /* recover relocation */
-               ret = btrfs_recover_relocation(root);
+               ret = btrfs_cleanup_fs_roots(root->fs_info);
                WARN_ON(ret);
  
-               ret = btrfs_cleanup_fs_roots(root->fs_info);
+               /* recover relocation */
+               ret = btrfs_recover_relocation(root);
                WARN_ON(ret);
  
                sb->s_flags &= ~MS_RDONLY;
@@@ -714,34 -714,18 +714,18 @@@ static int btrfs_statfs(struct dentry *
        struct list_head *head = &root->fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
-       u64 data_used = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
        __be32 *fsid = (__be32 *)root->fs_info->fsid;
  
        rcu_read_lock();
-       list_for_each_entry_rcu(found, head, list) {
-               if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
-                                   BTRFS_BLOCK_GROUP_RAID10|
-                                   BTRFS_BLOCK_GROUP_RAID1)) {
-                       total_used += found->bytes_used;
-                       if (found->flags & BTRFS_BLOCK_GROUP_DATA)
-                               data_used += found->bytes_used;
-                       else
-                               data_used += found->total_bytes;
-               }
-               total_used += found->bytes_used;
-               if (found->flags & BTRFS_BLOCK_GROUP_DATA)
-                       data_used += found->bytes_used;
-               else
-                       data_used += found->total_bytes;
-       }
+       list_for_each_entry_rcu(found, head, list)
+               total_used += found->disk_used;
        rcu_read_unlock();
  
        buf->f_namelen = BTRFS_NAME_LEN;
        buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
        buf->f_bfree = buf->f_blocks - (total_used >> bits);
-       buf->f_bavail = buf->f_blocks - (data_used >> bits);
+       buf->f_bavail = buf->f_bfree;
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;
  
@@@ -832,14 -816,11 +816,14 @@@ static const struct file_operations btr
  };
  
  static struct miscdevice btrfs_misc = {
 -      .minor          = MISC_DYNAMIC_MINOR,
 +      .minor          = BTRFS_MINOR,
        .name           = "btrfs-control",
        .fops           = &btrfs_ctl_fops
  };
  
 +MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
 +MODULE_ALIAS("devname:btrfs-control");
 +
  static int btrfs_interface_init(void)
  {
        return misc_register(&btrfs_misc);
diff --combined fs/btrfs/xattr.c
index 59acd3eb288adedeedf5a063545c7863bcb49777,007fae581a04ab624ca471e6d58e3226f24194a6..88ecbb215878ae3573a5bbedfc4afadc6ef0a73d
@@@ -154,15 -154,10 +154,10 @@@ int __btrfs_setxattr(struct btrfs_trans
        if (trans)
                return do_setxattr(trans, inode, name, value, size, flags);
  
-       ret = btrfs_reserve_metadata_space(root, 2);
-       if (ret)
-               return ret;
+       trans = btrfs_start_transaction(root, 2);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
  
-       trans = btrfs_start_transaction(root, 1);
-       if (!trans) {
-               ret = -ENOMEM;
-               goto out;
-       }
        btrfs_set_trans_block_group(trans, inode);
  
        ret = do_setxattr(trans, inode, name, value, size, flags);
        BUG_ON(ret);
  out:
        btrfs_end_transaction_throttle(trans, root);
-       btrfs_unreserve_metadata_space(root, 2);
        return ret;
  }
  
@@@ -282,7 -276,7 +276,7 @@@ err
   * List of handlers for synthetic system.* attributes.  All real ondisk
   * attributes are handled directly.
   */
 -struct xattr_handler *btrfs_xattr_handlers[] = {
 +const struct xattr_handler *btrfs_xattr_handlers[] = {
  #ifdef CONFIG_BTRFS_FS_POSIX_ACL
        &btrfs_xattr_acl_access_handler,
        &btrfs_xattr_acl_default_handler,
diff --combined include/linux/fs.h
index 9682d52d1507a55752965ee0c26ba97fdf8adc5c,10704f0086c826b6604fba17730cc5a4ace2a603..85e823adcd4a7d82bc1a1dedf684e02cde5039ac
@@@ -651,7 -651,6 +651,7 @@@ struct block_device 
        int                     bd_openers;
        struct mutex            bd_mutex;       /* open/close mutex */
        struct list_head        bd_inodes;
 +      void *                  bd_claiming;
        void *                  bd_holder;
        int                     bd_holders;
  #ifdef CONFIG_SYSFS
@@@ -1281,12 -1280,10 +1281,12 @@@ static inline int lock_may_write(struc
  
  
  struct fasync_struct {
 -      int     magic;
 -      int     fa_fd;
 -      struct  fasync_struct   *fa_next; /* singly linked list */
 -      struct  file            *fa_file;
 +      spinlock_t              fa_lock;
 +      int                     magic;
 +      int                     fa_fd;
 +      struct fasync_struct    *fa_next; /* singly linked list */
 +      struct file             *fa_file;
 +      struct rcu_head         fa_rcu;
  };
  
  #define FASYNC_MAGIC 0x4601
  extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
  /* can be called from interrupts */
  extern void kill_fasync(struct fasync_struct **, int, int);
 -/* only for net: no internal synchronization */
 -extern void __kill_fasync(struct fasync_struct *, int, int);
  
  extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
  extern int f_setown(struct file *filp, unsigned long arg, int force);
@@@ -1315,6 -1314,8 +1315,6 @@@ extern int send_sigurg(struct fown_stru
  extern struct list_head super_blocks;
  extern spinlock_t sb_lock;
  
 -#define sb_entry(list)  list_entry((list), struct super_block, s_list)
 -#define S_BIAS (1<<30)
  struct super_block {
        struct list_head        s_list;         /* Keep this first */
        dev_t                   s_dev;          /* search index; _not_ kdev_t */
        struct rw_semaphore     s_umount;
        struct mutex            s_lock;
        int                     s_count;
 -      int                     s_need_sync;
        atomic_t                s_active;
  #ifdef CONFIG_SECURITY
        void                    *s_security;
  #endif
 -      struct xattr_handler    **s_xattr;
 +      const struct xattr_handler **s_xattr;
  
        struct list_head        s_inodes;       /* all inodes */
        struct hlist_head       s_anon;         /* anonymous dentries for (nfs) exporting */
@@@ -1429,8 -1431,7 +1429,8 @@@ extern void dentry_unhash(struct dentr
   * VFS file helper functions.
   */
  extern int file_permission(struct file *, int);
 -
 +extern void inode_init_owner(struct inode *inode, const struct inode *dir,
 +                      mode_t mode);
  /*
   * VFS FS_IOC_FIEMAP helper definitions.
   */
@@@ -1743,7 -1744,6 +1743,7 @@@ struct file_system_type 
  
        struct lock_class_key s_lock_key;
        struct lock_class_key s_umount_key;
 +      struct lock_class_key s_vfs_rename_key;
  
        struct lock_class_key i_lock_key;
        struct lock_class_key i_mutex_key;
@@@ -1781,6 -1781,8 +1781,6 @@@ extern int get_sb_pseudo(struct file_sy
        const struct super_operations *ops, unsigned long,
        struct vfsmount *mnt);
  extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
 -int __put_super_and_need_restart(struct super_block *sb);
 -void put_super(struct super_block *sb);
  
  /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
  #define fops_get(fops) \
@@@ -1800,8 -1802,6 +1800,8 @@@ extern void drop_collected_mounts(struc
  extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
                          struct vfsmount *);
  extern int vfs_statfs(struct dentry *, struct kstatfs *);
 +extern int freeze_super(struct super_block *super);
 +extern int thaw_super(struct super_block *super);
  
  extern int current_umask(void);
  
@@@ -2087,9 -2087,9 +2087,9 @@@ extern int __filemap_fdatawrite_range(s
  extern int filemap_fdatawrite_range(struct address_space *mapping,
                                loff_t start, loff_t end);
  
 -extern int vfs_fsync_range(struct file *file, struct dentry *dentry,
 -                         loff_t start, loff_t end, int datasync);
 -extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
 +extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
 +                         int datasync);
 +extern int vfs_fsync(struct file *file, int datasync);
  extern int generic_write_sync(struct file *file, loff_t pos, loff_t count);
  extern void sync_supers(void);
  extern void emergency_sync(void);
@@@ -2228,7 -2228,6 +2228,7 @@@ extern long do_splice_direct(struct fil
  
  extern void
  file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 +extern loff_t noop_llseek(struct file *file, loff_t offset, int origin);
  extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
  extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
  extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset,
@@@ -2251,10 -2250,15 +2251,15 @@@ static inline int xip_truncate_page(str
  #endif
  
  #ifdef CONFIG_BLOCK
+ struct bio;
+ typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
+                           loff_t file_offset);
+ void dio_end_io(struct bio *bio, int error);
  ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        struct block_device *bdev, const struct iovec *iov, loff_t offset,
        unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-       int lock_type);
+       dio_submit_t submit_io, int lock_type);
  
  enum {
        /* need locking between buffered and direct access */
@@@ -2270,7 -2274,7 +2275,7 @@@ static inline ssize_t blockdev_direct_I
        dio_iodone_t end_io)
  {
        return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-                                   nr_segs, get_block, end_io,
+                                   nr_segs, get_block, end_io, NULL,
                                    DIO_LOCKING | DIO_SKIP_HOLES);
  }
  
@@@ -2280,7 -2284,7 +2285,7 @@@ static inline ssize_t blockdev_direct_I
        dio_iodone_t end_io)
  {
        return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-                               nr_segs, get_block, end_io, 0);
+                                   nr_segs, get_block, end_io, NULL, 0);
  }
  #endif
  
@@@ -2330,7 -2334,6 +2335,7 @@@ extern struct super_block *get_super(st
  extern struct super_block *get_active_super(struct block_device *bdev);
  extern struct super_block *user_get_super(dev_t);
  extern void drop_super(struct super_block *sb);
 +extern void iterate_supers(void (*)(struct super_block *, void *), void *);
  
  extern int dcache_dir_open(struct inode *, struct file *);
  extern int dcache_dir_close(struct inode *, struct file *);
@@@ -2364,8 -2367,6 +2369,8 @@@ extern void simple_release_fs(struct vf
  
  extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
                        loff_t *ppos, const void *from, size_t available);
 +extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
 +              const void __user *from, size_t count);
  
  extern int simple_fsync(struct file *, struct dentry *, int);
  
diff --combined mm/filemap.c
index 35e12d1865666717f09094090bd5032995b7e63b,829ac9cdbd709da52ee33dfeee771f9abc6d27bf..45a2d18df849b984421f5e756678964b1ae4e74a
@@@ -441,7 -441,7 +441,7 @@@ int add_to_page_cache_lru(struct page *
        /*
         * Splice_read and readahead add shmem/tmpfs pages into the page cache
         * before shmem_readpage has a chance to mark them as SwapBacked: they
 -       * need to go on the active_anon lru below, and mem_cgroup_cache_charge
 +       * need to go on the anon lru below, and mem_cgroup_cache_charge
         * (called in add_to_page_cache) needs to know where they're going too.
         */
        if (mapping_cap_swap_backed(mapping))
                if (page_is_file_cache(page))
                        lru_cache_add_file(page);
                else
 -                      lru_cache_add_active_anon(page);
 +                      lru_cache_add_anon(page);
        }
        return ret;
  }
@@@ -461,15 -461,9 +461,15 @@@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru
  #ifdef CONFIG_NUMA
  struct page *__page_cache_alloc(gfp_t gfp)
  {
 +      int n;
 +      struct page *page;
 +
        if (cpuset_do_page_mem_spread()) {
 -              int n = cpuset_mem_spread_node();
 -              return alloc_pages_exact_node(n, gfp, 0);
 +              get_mems_allowed();
 +              n = cpuset_mem_spread_node();
 +              page = alloc_pages_exact_node(n, gfp, 0);
 +              put_mems_allowed();
 +              return page;
        }
        return alloc_pages(gfp, 0);
  }
@@@ -1105,12 -1099,6 +1105,12 @@@ page_not_up_to_date_locked
                }
  
  readpage:
 +              /*
 +               * A previous I/O error may have been due to temporary
 +               * failures, eg. multipath errors.
 +               * PG_error will be set again if readpage fails.
 +               */
 +              ClearPageError(page);
                /* Start the actual read. The read will unlock the page. */
                error = mapping->a_ops->readpage(filp, page);
  
@@@ -1275,7 -1263,7 +1275,7 @@@ generic_file_aio_read(struct kiocb *ioc
  {
        struct file *filp = iocb->ki_filp;
        ssize_t retval;
-       unsigned long seg;
+       unsigned long seg = 0;
        size_t count;
        loff_t *ppos = &iocb->ki_pos;
  
                                retval = mapping->a_ops->direct_IO(READ, iocb,
                                                        iov, pos, nr_segs);
                        }
-                       if (retval > 0)
+                       if (retval > 0) {
                                *ppos = pos + retval;
-                       if (retval) {
+                               count -= retval;
+                       }
+                       /*
+                        * Btrfs can have a short DIO read if we encounter
+                        * compressed extents, so if there was an error, or if
+                        * we've already read everything we wanted to, or if
+                        * there was a short read because we hit EOF, go ahead
+                        * and return.  Otherwise fall through to buffered IO for
+                        * the rest of the read.
+                        */
+                       if (retval < 0 || !count || *ppos >= size) {
                                file_accessed(filp);
                                goto out;
                        }
                }
        }
  
+       count = retval;
        for (seg = 0; seg < nr_segs; seg++) {
                read_descriptor_t desc;
+               loff_t offset = 0;
+               /*
+                * If we did a short DIO read we need to skip the section of the
+                * iov that we've already read data into.
+                */
+               if (count) {
+                       if (count > iov[seg].iov_len) {
+                               count -= iov[seg].iov_len;
+                               continue;
+                       }
+                       offset = count;
+                       count = 0;
+               }
  
                desc.written = 0;
-               desc.arg.buf = iov[seg].iov_base;
-               desc.count = iov[seg].iov_len;
+               desc.arg.buf = iov[seg].iov_base + offset;
+               desc.count = iov[seg].iov_len - offset;
                if (desc.count == 0)
                        continue;
                desc.error = 0;