Merge master.kernel.org:/pub/scm/linux/kernel/git/lethal/sh-2.6
[firefly-linux-kernel-4.4.55.git] / fs / ocfs2 / alloc.c
index 0db6a1f724e16d930c564bd4ddabf2163a6e88fa..f5e11f4fa952a424a5259d766a03c1d449c8ec31 100644 (file)
@@ -3726,6 +3726,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
                        u32 cpos,
                        u64 start_blk,
                        u32 new_clusters,
+                       u8 flags,
                        struct ocfs2_alloc_context *meta_ac)
 {
        int status;
@@ -3749,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
        rec.e_cpos = cpu_to_le32(cpos);
        rec.e_blkno = cpu_to_le64(start_blk);
        rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+       rec.e_flags = flags;
 
        status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
                                          &insert);
@@ -4137,7 +4139,374 @@ out:
        return ret;
 }
 
-static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
+                           handle_t *handle, struct ocfs2_path *path,
+                           int index, u32 new_range,
+                           struct ocfs2_alloc_context *meta_ac)
+{
+       int ret, depth, credits = handle->h_buffer_credits;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       struct buffer_head *last_eb_bh = NULL;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list *rightmost_el, *el;
+       struct ocfs2_extent_rec split_rec;
+       struct ocfs2_extent_rec *rec;
+       struct ocfs2_insert_type insert;
+
+       /*
+        * Set up split_rec, the record to insert, before we grow the tree.
+        */
+       el = path_leaf_el(path);
+       rec = &el->l_recs[index];
+       ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
+
+       depth = path->p_tree_depth;
+       if (depth > 0) {        /* rightmost list comes from di->i_last_eb_blk */
+               ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+                                      le64_to_cpu(di->i_last_eb_blk),
+                                      &last_eb_bh, OCFS2_BH_CACHED, inode);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+               rightmost_el = &eb->h_list;
+       } else
+               rightmost_el = path_leaf_el(path); /* depth 0 */
+
+       credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
+       ret = ocfs2_extend_trans(handle, credits);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
+           le16_to_cpu(rightmost_el->l_count)) { /* no free slots */
+               int old_depth = depth;
+
+               ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
+                                     meta_ac);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               if (old_depth != depth) { /* depth changed: reload the list */
+                       eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
+                       rightmost_el = &eb->h_list;
+               }
+       }
+
+       memset(&insert, 0, sizeof(struct ocfs2_insert_type));
+       insert.ins_appending = APPEND_NONE;
+       insert.ins_contig = CONTIG_NONE;
+       insert.ins_split = SPLIT_RIGHT;
+       insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
+               - le16_to_cpu(rightmost_el->l_next_free_rec); /* unused slots */
+       insert.ins_tree_depth = depth;
+
+       ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
+       if (ret)
+               mlog_errno(ret);
+
+out:
+       brelse(last_eb_bh);     /* runs on success and error alike */
+       return ret;
+}
+
+static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
+                             struct ocfs2_path *path, int index,
+                             struct ocfs2_cached_dealloc_ctxt *dealloc,
+                             u32 cpos, u32 len)
+{
+       int ret;
+       u32 left_cpos, rec_range, trunc_range;
+       int wants_rotate = 0, is_rightmost_tree_rec = 0;
+       struct super_block *sb = inode->i_sb;
+       struct ocfs2_path *left_path = NULL;
+       struct ocfs2_extent_list *el = path_leaf_el(path);
+       struct ocfs2_extent_rec *rec;
+       struct ocfs2_extent_block *eb;
+
+       if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+               ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               index--;        /* NOTE(review): assumes rotate dropped slot 0 */
+       }
+
+       if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
+           path->p_tree_depth) {
+               /*
+                * Check whether this is the rightmost tree record. If
+                * we remove all of this record or part of its right
+                * edge then an update of the record lengths above it
+                * will be required.
+                */
+               eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+               if (eb->h_next_leaf_blk == 0)
+                       is_rightmost_tree_rec = 1;
+       }
+
+       rec = &el->l_recs[index];
+       if (index == 0 && path->p_tree_depth &&
+           le32_to_cpu(rec->e_cpos) == cpos) {
+               /*
+                * Changing the leftmost offset (via partial or whole
+                * record truncate) of an interior (or rightmost) path
+                * means we have to update the subtree that is formed
+                * by this leaf and the one to its left.
+                *
+                * There are two cases we can skip:
+                *   1) Path is the leftmost one in our inode tree.
+                *   2) The leaf is rightmost and will be empty after
+                *      we remove the extent record - the rotate code
+                *      knows how to update the newly formed edge.
+                */
+
+               ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
+                                                   &left_cpos);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
+                       left_path = ocfs2_new_path(path_root_bh(path),
+                                                  path_root_el(path));
+                       if (!left_path) {
+                               ret = -ENOMEM;
+                               mlog_errno(ret);
+                               goto out;
+                       }
+
+                       ret = ocfs2_find_path(inode, left_path, left_cpos);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+               }
+       }
+
+       ret = ocfs2_extend_rotate_transaction(handle, 0,
+                                             handle->h_buffer_credits,
+                                             path);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_journal_access_path(inode, handle, path);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+       /* NOTE(review): left_path may be NULL here; confirm callee allows it. */
+       ret = ocfs2_journal_access_path(inode, handle, left_path);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+       trunc_range = cpos + len;
+
+       if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
+               int next_free;
+
+               memset(rec, 0, sizeof(*rec)); /* whole record is removed */
+               ocfs2_cleanup_merge(el, index);
+               wants_rotate = 1; /* NOTE(review): never read in this function */
+
+               next_free = le16_to_cpu(el->l_next_free_rec);
+               if (is_rightmost_tree_rec && next_free > 1) {
+                       /*
+                        * We skip the edge update if this path will
+                        * be deleted by the rotate code.
+                        */
+                       rec = &el->l_recs[next_free - 1];
+                       ocfs2_adjust_rightmost_records(inode, handle, path,
+                                                      rec);
+               }
+       } else if (le32_to_cpu(rec->e_cpos) == cpos) {
+               /* Remove leftmost portion of the record. */
+               le32_add_cpu(&rec->e_cpos, len);
+               le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
+               le16_add_cpu(&rec->e_leaf_clusters, -len);
+       } else if (rec_range == trunc_range) {
+               /* Remove rightmost portion of the record. */
+               le16_add_cpu(&rec->e_leaf_clusters, -len);
+               if (is_rightmost_tree_rec)
+                       ocfs2_adjust_rightmost_records(inode, handle, path, rec);
+       } else {
+               /* Caller should have trapped this. */
+               mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
+                    "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                    le32_to_cpu(rec->e_cpos),
+                    le16_to_cpu(rec->e_leaf_clusters), cpos, len);
+               BUG();
+       }
+
+       if (left_path) {
+               int subtree_index;
+
+               subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
+               ocfs2_complete_edge_insert(inode, handle, left_path, path,
+                                          subtree_index);
+       }
+
+       ocfs2_journal_dirty(handle, path_leaf_bh(path));
+
+       ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+out:
+       ocfs2_free_path(left_path);
+       return ret;
+}
+
+/*
+ * Remove clusters [cpos, cpos + len) from the extent tree rooted at di_bh.
+ * The range must lie within one extent rec (BUG otherwise). Returns 0 on
+ * success or a negative errno.
+ *
+ * We have 3 cases of extent removal:
+ *   1) Range covers the entire extent rec
+ *   2) Range begins or ends on one edge of the extent rec
+ *   3) Range is in the middle of the extent rec (no shared edges)
+ *
+ * For case 1 we remove the extent rec and left rotate to fill the hole.
+ * For case 2 we just shrink the existing extent rec, with a tree update if
+ * the shrinking edge is also the edge of an extent block.
+ * For case 3 we do a right split to turn the extent rec into something
+ * case 2 can handle.
+ */
+int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+                       u32 cpos, u32 len, handle_t *handle,
+                       struct ocfs2_alloc_context *meta_ac,
+                       struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+       int ret, index;
+       u32 rec_range, trunc_range;
+       struct ocfs2_extent_rec *rec;
+       struct ocfs2_extent_list *el;
+       struct ocfs2_path *path;
+
+       ocfs2_extent_map_trunc(inode, 0);
+
+       path = ocfs2_new_inode_path(di_bh);
+       if (!path) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_find_path(inode, path, cpos);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       el = path_leaf_el(path);
+       index = ocfs2_search_extent_list(el, cpos);
+       if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+               ocfs2_error(inode->i_sb,
+                           "Inode %llu has an extent at cpos %u which can no "
+                           "longer be found.\n",
+                           (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+               ret = -EROFS;
+               goto out;
+       }
+
+       rec = &el->l_recs[index];
+       rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+       trunc_range = cpos + len;
+
+       BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
+
+       mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
+            "(cpos %u, len %u)\n",
+            (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
+            le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
+       /* Cases 1 and 2: the range shares at least one edge with the rec. */
+       if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
+               ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
+                                        cpos, len);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       } else {        /* case 3: split first, then truncate the left part */
+               ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
+                                      trunc_range, meta_ac);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               /*
+                * The split could have manipulated the tree enough to
+                * move the record location, so we have to look for it again.
+                */
+               ocfs2_reinit_path(path, 1);
+
+               ret = ocfs2_find_path(inode, path, cpos);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               el = path_leaf_el(path);
+               index = ocfs2_search_extent_list(el, cpos);
+               if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+                       ocfs2_error(inode->i_sb,
+                                   "Inode %llu: split at cpos %u lost record.",
+                                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                                   cpos);
+                       ret = -EROFS;
+                       goto out;
+               }
+
+               /*
+                * Double check our values here. If anything is fishy,
+                * it's easier to catch it at the top level.
+                */
+               rec = &el->l_recs[index];
+               rec_range = le32_to_cpu(rec->e_cpos) +
+                       ocfs2_rec_clusters(el, rec);
+               if (rec_range != trunc_range) {
+                       ocfs2_error(inode->i_sb,
+                                   "Inode %llu: error after split at cpos %u "
+                                   "trunc len %u, existing record is (%u,%u)",
+                                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                                   cpos, len, le32_to_cpu(rec->e_cpos),
+                                   ocfs2_rec_clusters(el, rec));
+                       ret = -EROFS;
+                       goto out;
+               }
+
+               ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
+                                        cpos, len);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+out:
+       ocfs2_free_path(path);
+       return ret;
+}
+
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
 {
        struct buffer_head *tl_bh = osb->osb_tl_bh;
        struct ocfs2_dinode *di;
@@ -4170,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
        return current_tail == new_start;
 }
 
-static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
-                                    handle_t *handle,
-                                    u64 start_blk,
-                                    unsigned int num_clusters)
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+                             handle_t *handle,
+                             u64 start_blk,
+                             unsigned int num_clusters)
 {
        int status, index;
        unsigned int start_cluster, tl_count;
@@ -4329,7 +4698,7 @@ bail:
 }
 
 /* Expects you to already be holding tl_inode->i_mutex */
-static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 {
        int status;
        unsigned int num_to_flush;
@@ -5299,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
        return ocfs2_journal_dirty_data(handle, bh);
 }
 
-static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
-                                    struct page **pages, int numpages,
-                                    u64 phys, handle_t *handle)
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
+                                    loff_t end, struct page **pages,
+                                    int numpages, u64 phys, handle_t *handle)
 {
        int i, ret, partial = 0;
        void *kaddr;
@@ -5314,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
        if (numpages == 0)
                goto out;
 
-       from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
-       if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
-               /*
-                * Since 'from' has been capped to a value below page
-                * size, this calculation won't be able to overflow
-                * 'to'
-                */
-               to = ocfs2_align_bytes_to_clusters(sb, from);
-
-               /*
-                * The truncate tail in this case should never contain
-                * more than one page at maximum. The loop below also
-                * assumes this.
-                */
-               BUG_ON(numpages != 1);
-       }
-
+       to = PAGE_CACHE_SIZE;
        for(i = 0; i < numpages; i++) {
                page = pages[i];
 
+               from = start & (PAGE_CACHE_SIZE - 1);
+               if ((end >> PAGE_CACHE_SHIFT) == page->index)
+                       to = end & (PAGE_CACHE_SIZE - 1);
+
                BUG_ON(from > PAGE_CACHE_SIZE);
                BUG_ON(to > PAGE_CACHE_SIZE);
 
@@ -5370,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
 
                flush_dcache_page(page);
 
-               /*
-                * Every page after the 1st one should be completely zero'd.
-                */
-               from = 0;
+               start = (page->index + 1) << PAGE_CACHE_SHIFT;
        }
 out:
        if (pages) {
@@ -5386,24 +5740,26 @@ out:
        }
 }
 
-static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
-                               int *num, u64 *phys)
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+                               struct page **pages, int *num, u64 *phys)
 {
        int i, numpages = 0, ret = 0;
-       unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
        unsigned int ext_flags;
        struct super_block *sb = inode->i_sb;
        struct address_space *mapping = inode->i_mapping;
        unsigned long index;
-       u64 next_cluster_bytes;
+       loff_t last_page_bytes;
 
        BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+       BUG_ON(start > end);
 
-       /* Cluster boundary, so we don't need to grab any pages. */
-       if ((isize & (csize - 1)) == 0)
+       if (start == end)
                goto out;
 
-       ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+       BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
+              (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
+
+       ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
                                          phys, NULL, &ext_flags);
        if (ret) {
                mlog_errno(ret);
@@ -5419,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
        if (ext_flags & OCFS2_EXT_UNWRITTEN)
                goto out;
 
-       next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
-       index = isize >> PAGE_CACHE_SHIFT;
+       last_page_bytes = PAGE_ALIGN(end);
+       index = start >> PAGE_CACHE_SHIFT;
        do {
                pages[numpages] = grab_cache_page(mapping, index);
                if (!pages[numpages]) {
@@ -5431,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
 
                numpages++;
                index++;
-       } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
+       } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
 
 out:
        if (ret != 0) {
@@ -5460,11 +5816,10 @@ out:
  * otherwise block_write_full_page() will skip writeout of pages past
  * i_size. The new_i_size parameter is passed for this reason.
  */
-int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
-                                u64 new_i_size)
+int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
+                                 u64 range_start, u64 range_end)
 {
        int ret, numpages;
-       loff_t endbyte;
        struct page **pages = NULL;
        u64 phys;
 
@@ -5483,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
                goto out;
        }
 
-       ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+       ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
+                                  &numpages, &phys);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -5492,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
        if (numpages == 0)
                goto out;
 
-       ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
-                                handle);
+       ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
+                                numpages, phys, handle);
 
        /*
         * Initiate writeout of the pages we zero'd here. We don't
         * wait on them - the truncate_inode_pages() call later will
         * do that for us.
         */
-       endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
-       ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
-                                   endbyte - 1, SYNC_FILE_RANGE_WRITE);
+       ret = do_sync_mapping_range(inode->i_mapping, range_start,
+                                   range_end - 1, SYNC_FILE_RANGE_WRITE);
        if (ret)
                mlog_errno(ret);