direct-io: only inc/dec inode->i_dio_count for file systems
author Jens Axboe <axboe@fb.com>
Fri, 10 Jul 2015 05:17:35 +0000 (13:17 +0800)
committer Shawn Lin <shawn.lin@rock-chips.com>
Fri, 10 Jul 2015 05:17:35 +0000 (13:17 +0800)
do_blockdev_direct_IO() increments and decrements the inode
->i_dio_count for each IO operation. It does this to protect against
truncate of a file. Block devices don't need this sort of protection.

For a capable multiqueue setup, this atomic int is the only shared
state between applications accessing the device for O_DIRECT, and it
presents a scaling wall for that. In my testing, as much as 30% of
system time is spent incrementing and decrementing this value. A mixed
read/write workload improved from ~2.5M IOPS to ~9.6M IOPS, with
better latencies too. Before:

clat percentiles (usec):
|  1.00th=[   33],  5.00th=[   34], 10.00th=[   34], 20.00th=[   34],
| 30.00th=[   34], 40.00th=[   34], 50.00th=[   35], 60.00th=[   35],
| 70.00th=[   35], 80.00th=[   35], 90.00th=[   37], 95.00th=[   80],
| 99.00th=[   98], 99.50th=[  151], 99.90th=[  155], 99.95th=[  155],
| 99.99th=[  165]

After:

clat percentiles (usec):
|  1.00th=[   95],  5.00th=[  108], 10.00th=[  129], 20.00th=[  149],
| 30.00th=[  155], 40.00th=[  161], 50.00th=[  167], 60.00th=[  171],
| 70.00th=[  177], 80.00th=[  185], 90.00th=[  201], 95.00th=[  270],
| 99.00th=[  390], 99.50th=[  398], 99.90th=[  418], 99.95th=[  422],
| 99.99th=[  438]

In other setups, Robert Elliott reported seeing good performance
improvements:

https://lkml.org/lkml/2015/4/3/557

The more applications accessing the device, the worse it gets.

Add a new direct-io flag, DIO_SKIP_DIO_COUNT, which tells
do_blockdev_direct_IO() that it need not worry about incrementing
or decrementing the inode i_dio_count for this caller.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Elliott, Robert (Server Storage) <elliott@hp.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
Tested-and-Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
fs/block_dev.c
fs/direct-io.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/ext4/page-io.c
fs/inode.c
include/linux/fs.h

index 85f5c85ec91c050818495c49c1e4a2a669c14322..66738e403f970977db5d56fa256612529da14101 100644 (file)
@@ -172,7 +172,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
        struct inode *inode = file->f_mapping->host;
 
        return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
-                                   nr_segs, blkdev_get_block, NULL, NULL, 0);
+                                   nr_segs, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
index 6e5dd6f5df1a096b9dd55ab486b080314e9114da..08a8c037396252df1ef279c984b144a27c75ab3f 100644 (file)
@@ -262,7 +262,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
                dio->end_io(dio->iocb, offset, transferred,
                            dio->private, ret, is_async);
        } else {
-               inode_dio_done(dio->inode);
+               if (!(dio->flags & DIO_SKIP_DIO_COUNT))
+                       inode_dio_end(dio->inode);
+
                if (is_async)
                        aio_complete(dio->iocb, ret, 0);
        }
@@ -1135,7 +1137,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        /*
         * Will be decremented at I/O completion time.
         */
-       atomic_inc(&inode->i_dio_count);
+       if (!(dio->flags & DIO_SKIP_DIO_COUNT))
+               inode_dio_begin(inode);
 
        /*
         * For file extending writes updating i_size before data
index 5890614696870bdecef2ea18eb900741ebbeb339..167b64641409b6d5177902fd3d7e12cc756c12a4 100644 (file)
@@ -691,18 +691,18 @@ retry:
                 * via ext4_inode_block_unlocked_dio(). Check inode's state
                 * while holding extra i_dio_count ref.
                 */
-               atomic_inc(&inode->i_dio_count);
+               inode_dio_begin(inode);
                smp_mb();
                if (unlikely(ext4_test_inode_state(inode,
                                                    EXT4_STATE_DIOREAD_LOCK))) {
-                       inode_dio_done(inode);
+                       inode_dio_end(inode);
                        goto locked;
                }
                ret = __blockdev_direct_IO(rw, iocb, inode,
                                 inode->i_sb->s_bdev, iov,
                                 offset, nr_segs,
                                 ext4_get_block, NULL, NULL, 0);
-               inode_dio_done(inode);
+               inode_dio_end(inode);
        } else {
 locked:
                ret = blockdev_direct_IO(rw, iocb, inode, iov,
index 8a277505a3bebb0933a07637feb605ccf9e40e37..001f88f35e333a901f3205f481aeeca686e4fa4c 100644 (file)
@@ -3092,7 +3092,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                ext4_free_io_end(io_end);
 out:
-               inode_dio_done(inode);
+               inode_dio_end(inode);
                if (is_async)
                        aio_complete(iocb, ret, 0);
                return;
@@ -3150,7 +3150,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
        overwrite = *((int *)iocb->private);
 
        if (overwrite) {
-               atomic_inc(&inode->i_dio_count);
+               inode_dio_begin(inode);
                down_read(&EXT4_I(inode)->i_data_sem);
                mutex_unlock(&inode->i_mutex);
        }
@@ -3243,7 +3243,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 retake_lock:
        /* take i_mutex locking again if we do a ovewrite dio */
        if (overwrite) {
-               inode_dio_done(inode);
+               inode_dio_end(inode);
                up_read(&EXT4_I(inode)->i_data_sem);
                mutex_lock(&inode->i_mutex);
        }
index b12a4427aedc41f71bbbf697cbc61e17e30963cf..11c236686868cd49610e90a843499510954b4dfa 100644 (file)
@@ -97,7 +97,7 @@ static int ext4_end_io(ext4_io_end_t *io)
        if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
                wake_up_all(ext4_ioend_wq(inode));
        if (io->flag & EXT4_IO_END_DIRECT)
-               inode_dio_done(inode);
+               inode_dio_end(inode);
        if (io->iocb)
                aio_complete(io->iocb, io->result, 0);
        return ret;
index 1b300a06b8be34d822eb6b75d9ff0a8535aa0d2c..8b85f856ca4b60bb42b2873db221205ce471971d 100644 (file)
@@ -1887,16 +1887,3 @@ void inode_dio_wait(struct inode *inode)
 }
 EXPORT_SYMBOL(inode_dio_wait);
 
-/*
- * inode_dio_done - signal finish of a direct I/O requests
- * @inode: inode the direct I/O happens on
- *
- * This is called once we've finished processing a direct I/O request,
- * and is used to wake up callers waiting for direct I/O to be quiesced.
- */
-void inode_dio_done(struct inode *inode)
-{
-       if (atomic_dec_and_test(&inode->i_dio_count))
-               wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
-}
-EXPORT_SYMBOL(inode_dio_done);
index 9a3437377f718c2e24308a18513fe15d7c6e626d..405b18e31621d031deb616733b4f1c4a94abbfec 100644 (file)
@@ -2461,6 +2461,12 @@ enum {
 
        /* filesystem does not support filling holes */
        DIO_SKIP_HOLES  = 0x02,
+
+       /* filesystem can handle aio writes beyond i_size */
+       DIO_ASYNC_EXTEND = 0x04,
+
+       /* inode/fs/bdev does not need truncate protection */
+       DIO_SKIP_DIO_COUNT = 0x08,
 };
 
 void dio_end_io(struct bio *bio, int error);
@@ -2481,8 +2487,32 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
 #endif
 
 void inode_dio_wait(struct inode *inode);
-void inode_dio_done(struct inode *inode);
 
+
+/*
+ * inode_dio_begin - signal start of a direct I/O request
+ * @inode: inode the direct I/O happens on
+ *
+ * This is called before a direct I/O request is processed. It takes a
+ * reference on i_dio_count, which must be dropped again with
+ * inode_dio_end() once the request has completed.
+ */
+static inline void inode_dio_begin(struct inode *inode)
+{
+       atomic_inc(&inode->i_dio_count);
+}
+
+/*
+ * inode_dio_end - signal finish of a direct I/O request
+ * @inode: inode the direct I/O happens on
+ *
+ * This is called once we've finished processing a direct I/O request,
+ * and is used to wake up callers waiting for direct I/O to be quiesced.
+ */
+static inline void inode_dio_end(struct inode *inode)
+{
+       if (atomic_dec_and_test(&inode->i_dio_count))
+               wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
+}
 extern const struct file_operations generic_ro_fops;
 
 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))