ext4: optimize ext4_ext_convert_to_initialized()
author Eric Gouriou <egouriou@google.com>
Thu, 27 Oct 2011 15:43:23 +0000 (11:43 -0400)
committer Theodore Ts'o <tytso@mit.edu>
Thu, 27 Oct 2011 15:43:23 +0000 (11:43 -0400)
This patch introduces a fast path in ext4_ext_convert_to_initialized()
for the case when the conversion can be performed by transferring
the newly initialized blocks from the uninitialized extent into
an adjacent initialized extent. Doing so removes the expensive
invocations of memmove() which occur during extent insertion and
the subsequent merge.

In practice this should be the common case for clients performing
append writes into files pre-allocated via
fallocate(FALLOC_FL_KEEP_SIZE). In such a workload performed via
direct IO and when using a suboptimal implementation of memmove()
(x86_64 prior to the 2.6.39 rewrite), this patch reduces kernel CPU
consumption by 32%.

Two new trace points are added to ext4_ext_convert_to_initialized()
to offer visibility into its operations. No exit trace point has
been added due to the multiplicity of return points. This can be
revisited once the upstream cleanup is backported.

Signed-off-by: Eric Gouriou <egouriou@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
fs/ext4/extents.c
fs/ext4/super.c
include/trace/events/ext4.h

index c2ac06cb2d4641bb007ea2013900b3c229023637..8b6a17b60970e740d6d538029b39c6acfb574e24 100644 (file)
@@ -2919,12 +2919,23 @@ out:
  *   a> There is no split required: Entire extent should be initialized
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * Pre-conditions:
+ *  - The extent pointed to by 'path' is uninitialized.
+ *  - The extent pointed to by 'path' contains a superset
+ *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ *  - the returned value is the number of blocks beyond map->m_lblk
+ *    that are allocated and initialized.
+ *    It is guaranteed to be >= map->m_len.
  */
 static int ext4_ext_convert_to_initialized(handle_t *handle,
                                           struct inode *inode,
                                           struct ext4_map_blocks *map,
                                           struct ext4_ext_path *path)
 {
+       struct ext4_extent_header *eh;
        struct ext4_map_blocks split_map;
        struct ext4_extent zero_ex;
        struct ext4_extent *ex;
@@ -2944,11 +2955,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                eof_block = map->m_lblk + map->m_len;
 
        depth = ext_depth(inode);
+       eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
 
+       trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
+
+       /* Pre-conditions */
+       BUG_ON(!ext4_ext_is_uninitialized(ex));
+       BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
+       BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
+
+       /*
+        * Attempt to transfer newly initialized blocks from the currently
+        * uninitialized extent to its left neighbor. This is much cheaper
+        * than an insertion followed by a merge as those involve costly
+        * memmove() calls. This is the common case in steady state for
+        * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
+        * writes.
+        *
+        * Limitations of the current logic:
+        *  - L1: we only deal with writes at the start of the extent.
+        *    The approach could be extended to writes at the end
+        *    of the extent but this scenario was deemed less common.
+        *  - L2: we do not deal with writes covering the whole extent.
+        *    This would require removing the extent if the transfer
+        *    is possible.
+        *  - L3: we only attempt to merge with an extent stored in the
+        *    same extent tree node.
+        */
+       if ((map->m_lblk == ee_block) &&        /*L1*/
+               (map->m_len < ee_len) &&        /*L2*/
+               (ex > EXT_FIRST_EXTENT(eh))) {  /*L3*/
+               struct ext4_extent *prev_ex;
+               ext4_lblk_t prev_lblk;
+               ext4_fsblk_t prev_pblk, ee_pblk;
+               unsigned int prev_len, write_len;
+
+               prev_ex = ex - 1;
+               prev_lblk = le32_to_cpu(prev_ex->ee_block);
+               prev_len = ext4_ext_get_actual_len(prev_ex);
+               prev_pblk = ext4_ext_pblock(prev_ex);
+               ee_pblk = ext4_ext_pblock(ex);
+               write_len = map->m_len;
+
+               /*
+                * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+                * under the following conditions:
+                * - C1: prev_ex is initialized,
+                * - C2: prev_ex is logically abutting ex,
+                * - C3: prev_ex is physically abutting ex,
+                * - C4: prev_ex can receive the additional blocks without
+                *   overflowing the (initialized) length limit.
+                */
+               if ((!ext4_ext_is_uninitialized(prev_ex)) &&            /*C1*/
+                       ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
+                       ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
+                       (prev_len < (EXT_INIT_MAX_LEN - write_len))) {  /*C4*/
+                       err = ext4_ext_get_access(handle, inode, path + depth);
+                       if (err)
+                               goto out;
+
+                       trace_ext4_ext_convert_to_initialized_fastpath(inode,
+                               map, ex, prev_ex);
+
+                       /* Shift the start of ex by 'write_len' blocks */
+                       ex->ee_block = cpu_to_le32(ee_block + write_len);
+                       ext4_ext_store_pblock(ex, ee_pblk + write_len);
+                       ex->ee_len = cpu_to_le16(ee_len - write_len);
+                       ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+                       /* Extend prev_ex by 'write_len' blocks */
+                       prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+
+                       /* Mark the block containing both extents as dirty */
+                       ext4_ext_dirty(handle, inode, path + depth);
+
+                       /* Update path to point to the right extent */
+                       path[depth].p_ext = prev_ex;
+
+                       /* Result: number of initialized blocks past m_lblk */
+                       allocated = write_len;
+                       goto out;
+               }
+       }
+
        WARN_ON(map->m_lblk < ee_block);
        /*
         * It is safe to convert extent to initialized via explicit
index dcc460537bc72e0fb85c47deb7c1330f4da8ca80..9953d80145ad0f6331086053a7d80301c418f3a7 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/freezer.h>
 
 #include "ext4.h"
+#include "ext4_extents.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
index c9a341e385a38779b3621e36e37bd1f19ec1ea71..748ff7cbe5557989e6fd40fcacb70e5b8760956f 100644 (file)
@@ -9,6 +9,7 @@
 
 struct ext4_allocation_context;
 struct ext4_allocation_request;
+struct ext4_extent;
 struct ext4_prealloc_space;
 struct ext4_inode_info;
 struct mpage_da_data;
@@ -1394,6 +1395,87 @@ DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,
        TP_ARGS(inode)
 );
 
+/* 'ux' is the uninitialized extent. */
+TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
+       TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+                struct ext4_extent *ux),
+
+       TP_ARGS(inode, map, ux),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    m_lblk  )
+               __field(        unsigned,       m_len   )
+               __field(        ext4_lblk_t,    u_lblk  )
+               __field(        unsigned,       u_len   )
+               __field(        ext4_fsblk_t,   u_pblk  )
+       ),
+
+       TP_fast_assign(
+               __entry->ino            = inode->i_ino;
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->m_lblk         = map->m_lblk;
+               __entry->m_len          = map->m_len;
+               __entry->u_lblk         = le32_to_cpu(ux->ee_block);
+               __entry->u_len          = ext4_ext_get_actual_len(ux);
+               __entry->u_pblk         = ext4_ext_pblock(ux);
+       ),
+
+       TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u "
+                 "u_pblk %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 __entry->m_lblk, __entry->m_len,
+                 __entry->u_lblk, __entry->u_len, __entry->u_pblk)
+);
+
+/*
+ * 'ux' is the uninitialized extent.
+ * 'ix' is the initialized extent to which blocks are transferred.
+ */
+TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
+       TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+                struct ext4_extent *ux, struct ext4_extent *ix),
+
+       TP_ARGS(inode, map, ux, ix),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    m_lblk  )
+               __field(        unsigned,       m_len   )
+               __field(        ext4_lblk_t,    u_lblk  )
+               __field(        unsigned,       u_len   )
+               __field(        ext4_fsblk_t,   u_pblk  )
+               __field(        ext4_lblk_t,    i_lblk  )
+               __field(        unsigned,       i_len   )
+               __field(        ext4_fsblk_t,   i_pblk  )
+       ),
+
+       TP_fast_assign(
+               __entry->ino            = inode->i_ino;
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->m_lblk         = map->m_lblk;
+               __entry->m_len          = map->m_len;
+               __entry->u_lblk         = le32_to_cpu(ux->ee_block);
+               __entry->u_len          = ext4_ext_get_actual_len(ux);
+               __entry->u_pblk         = ext4_ext_pblock(ux);
+               __entry->i_lblk         = le32_to_cpu(ix->ee_block);
+               __entry->i_len          = ext4_ext_get_actual_len(ix);
+               __entry->i_pblk         = ext4_ext_pblock(ix);
+       ),
+
+       TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u "
+                 "u_lblk %u u_len %u u_pblk %llu "
+                 "i_lblk %u i_len %u i_pblk %llu ",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 __entry->m_lblk, __entry->m_len,
+                 __entry->u_lblk, __entry->u_len, __entry->u_pblk,
+                 __entry->i_lblk, __entry->i_len, __entry->i_pblk)
+);
+
 DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned int len, unsigned int flags),