diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 674f800a3b5760ad6374c98fa11e88097e30d160..423f4ca7d712dda6f012c32954f19c9ce3af9d9c 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -164,6 +164,62 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file)
        return get_size(lo->lo_offset, lo->lo_sizelimit, file);
 }
 
+static void __loop_update_dio(struct loop_device *lo, bool dio)
+{
+       struct file *file = lo->lo_backing_file;
+       struct address_space *mapping = file->f_mapping;
+       struct inode *inode = mapping->host;
+       unsigned short sb_bsize = 0;
+       unsigned dio_align = 0;
+       bool use_dio;
+
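+       /*
+        * If the backing file lives on a block device, that device's
+        * logical block size sets the direct I/O alignment; dio_align
+        * is the matching mask tested against lo_offset below.
+        */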
+       if (inode->i_sb->s_bdev) {
+               sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
+               dio_align = sb_bsize - 1;
+       }
+
+       /*
+        * We support direct I/O only if lo_offset is aligned with the
+        * logical I/O size of the backing device, the logical block
+        * size of the loop device is not smaller than that of the
+        * backing device, and the loop device doesn't use a transfer
+        * function.
+        *
+        * TODO: the above conditions may be relaxed in the future, and
+        * direct I/O may then be switched at runtime, since most
+        * requests in sane applications should be PAGE_SIZE aligned.
+        */
+       if (dio) {
+               if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
+                               !(lo->lo_offset & dio_align) &&
+                               mapping->a_ops->direct_IO &&
+                               !lo->transfer)
+                       use_dio = true;
+               else
+                       use_dio = false;
+       } else {
+               use_dio = false;
+       }
+
+       if (lo->use_dio == use_dio)
+               return;
+
+       /* flush dirty pages before changing direct IO */
+       vfs_fsync(file, 0);
+
+       /*
+        * The LO_FLAGS_DIRECT_IO flag is handled like LO_FLAGS_READ_ONLY:
+        * both are set from the kernel, and losetup sees the updated
+        * value via ioctl(LOOP_GET_STATUS).
+        */
+       blk_mq_freeze_queue(lo->lo_queue);
+       lo->use_dio = use_dio;
+       if (use_dio)
+               lo->lo_flags |= LO_FLAGS_DIRECT_IO;
+       else
+               lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
+       blk_mq_unfreeze_queue(lo->lo_queue);
+}
+
 static int
 figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
 {
@@ -389,6 +445,89 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
        return ret;
 }
 
+static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
+{
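+       /*
+        * A short direct read leaves the tail of the request unfilled,
+        * so zero it to avoid exposing stale data; writes and errors
+        * need no handling here.
+        */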
+       if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE))
+               return;
+
+       if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
+               struct bio *bio = cmd->rq->bio;
+
+               bio_advance(bio, bytes);
+               zero_fill_bio(bio);
+       }
+}
+
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+{
+       struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
+       struct request *rq = cmd->rq;
+
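+       /*
+        * ret is the number of bytes transferred or a negative errno;
+        * map it to 0/-EIO for blk_mq_complete_request().
+        */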
+       handle_partial_read(cmd, ret);
+
+       if (ret > 0)
+               ret = 0;
+       else if (ret < 0)
+               ret = -EIO;
+
+       blk_mq_complete_request(rq, ret);
+}
+
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+                    loff_t pos, bool rw)
+{
+       struct iov_iter iter;
+       struct bio_vec *bvec;
+       struct bio *bio = cmd->rq->bio;
+       struct file *file = lo->lo_backing_file;
+       int ret;
+
+       /* nomerge for loop request queue */
+       WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+
+       bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+       iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
+                     bio_segments(bio), blk_rq_bytes(cmd->rq));
+
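+       /*
+        * cmd->iocb is embedded in struct loop_cmd, so the completion
+        * handler can recover the command via container_of().
+        */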
+       cmd->iocb.ki_pos = pos;
+       cmd->iocb.ki_filp = file;
+       cmd->iocb.ki_complete = lo_rw_aio_complete;
+       cmd->iocb.ki_flags = IOCB_DIRECT;
+
+       if (rw == WRITE)
+               ret = file->f_op->write_iter(&cmd->iocb, &iter);
+       else
+               ret = file->f_op->read_iter(&cmd->iocb, &iter);
+
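+       /*
+        * -EIOCBQUEUED means the aio is in flight and ki_complete will
+        * be called later; any other return value completes it here.
+        */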
+       if (ret != -EIOCBQUEUED)
+               cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
+       return 0;
+}
+
+
+static inline int lo_rw_simple(struct loop_device *lo,
+               struct request *rq, loff_t pos, bool rw)
+{
+       struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+       if (cmd->use_aio)
+               return lo_rw_aio(lo, cmd, pos, rw);
+
+       /*
+        * lo_write_simple and lo_read_simple should have been covered
+        * by an io-submit-style function like lo_rw_aio(). One blocker
+        * is that lo_read_simple() needs to call flush_dcache_page()
+        * after each page is written from the kernel, which isn't easy
+        * to handle in an io-submit-style function that submits all
+        * segments of the request at once. Direct read I/O doesn't
+        * need flush_dcache_page() at all.
+        */
+       if (rw == WRITE)
+               return lo_write_simple(lo, rq, pos);
+       else
+               return lo_read_simple(lo, rq, pos);
+}
+
 static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 {
        loff_t pos;
@@ -404,13 +543,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
                else if (lo->transfer)
                        ret = lo_write_transfer(lo, rq, pos);
                else
-                       ret = lo_write_simple(lo, rq, pos);
+                       ret = lo_rw_simple(lo, rq, pos, WRITE);
 
        } else {
                if (lo->transfer)
                        ret = lo_read_transfer(lo, rq, pos);
                else
-                       ret = lo_read_simple(lo, rq, pos);
+                       ret = lo_rw_simple(lo, rq, pos, READ);
        }
 
        return ret;
@@ -421,6 +560,12 @@ struct switch_request {
        struct completion wait;
 };
 
+static inline void loop_update_dio(struct loop_device *lo)
+{
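+       /*
+        * Keep direct I/O on if either userspace requested it
+        * (lo->use_dio) or the backing file itself demands it
+        * (io_is_direct()).
+        */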
+       __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) |
+                       lo->use_dio);
+}
+
 /*
  * Do the actual switch; called from the BIO completion routine
  */
@@ -441,6 +586,7 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
                mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
        lo->old_gfp_mask = mapping_gfp_mask(mapping);
        mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+       loop_update_dio(lo);
 }
 
 /*
@@ -627,11 +773,19 @@ static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf)
        return sprintf(buf, "%s\n", partscan ? "1" : "0");
 }
 
+static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
+{
+       int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO);
+
+       return sprintf(buf, "%s\n", dio ? "1" : "0");
+}
+
 LOOP_ATTR_RO(backing_file);
 LOOP_ATTR_RO(offset);
 LOOP_ATTR_RO(sizelimit);
 LOOP_ATTR_RO(autoclear);
 LOOP_ATTR_RO(partscan);
+LOOP_ATTR_RO(dio);
 
 static struct attribute *loop_attrs[] = {
        &loop_attr_backing_file.attr,
@@ -639,6 +793,7 @@ static struct attribute *loop_attrs[] = {
        &loop_attr_sizelimit.attr,
        &loop_attr_autoclear.attr,
        &loop_attr_partscan.attr,
+       &loop_attr_dio.attr,
        NULL,
 };
 
@@ -688,6 +843,23 @@ static void loop_config_discard(struct loop_device *lo)
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 }
 
+static void loop_unprepare_queue(struct loop_device *lo)
+{
+       flush_kthread_worker(&lo->worker);
+       kthread_stop(lo->worker_task);
+}
+
+static int loop_prepare_queue(struct loop_device *lo)
+{
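+       /*
+        * A dedicated per-device kthread worker replaces the old
+        * workqueue; queued commands are handled one at a time by
+        * the "loop%d" thread.
+        */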
+       init_kthread_worker(&lo->worker);
+       lo->worker_task = kthread_run(kthread_worker_fn,
+                       &lo->worker, "loop%d", lo->lo_number);
+       if (IS_ERR(lo->worker_task))
+               return -ENOMEM;
+       set_user_nice(lo->worker_task, MIN_NICE);
+       return 0;
+}
+
 static int loop_set_fd(struct loop_device *lo, fmode_t mode,
                       struct block_device *bdev, unsigned int arg)
 {
@@ -745,17 +917,15 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
        size = get_loop_size(lo, file);
        if ((loff_t)(sector_t)size != size)
                goto out_putf;
-       error = -ENOMEM;
-       lo->wq = alloc_workqueue("kloopd%d",
-                       WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,
-                       lo->lo_number);
-       if (!lo->wq)
+       error = loop_prepare_queue(lo);
+       if (error)
                goto out_putf;
 
        error = 0;
 
        set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
+       lo->use_dio = false;
        lo->lo_blocksize = lo_blocksize;
        lo->lo_device = bdev;
        lo->lo_flags = lo_flags;
@@ -769,6 +939,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
        if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
                blk_queue_flush(lo->lo_queue, REQ_FLUSH);
 
+       loop_update_dio(lo);
        set_capacity(lo->lo_disk, size);
        bd_set_size(bdev, size << 9);
        loop_sysfs_init(lo);
@@ -903,8 +1074,7 @@ static int loop_clr_fd(struct loop_device *lo)
        lo->lo_flags = 0;
        if (!part_shift)
                lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
-       destroy_workqueue(lo->wq);
-       lo->wq = NULL;
+       loop_unprepare_queue(lo);
        mutex_unlock(&lo->lo_ctl_mutex);
        /*
         * Need not hold lo_ctl_mutex to fput backing file.
@@ -988,6 +1158,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
                lo->lo_key_owner = uid;
        }
 
+       /* update dio if lo_offset or transfer has changed */
+       __loop_update_dio(lo, lo->use_dio);
+
        return 0;
 }
 
@@ -1138,6 +1311,20 @@ static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev)
        return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
 }
 
+static int loop_set_dio(struct loop_device *lo, unsigned long arg)
+{
+       int error = -ENXIO;
+       if (lo->lo_state != Lo_bound)
+               goto out;
+
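+       /*
+        * __loop_update_dio() silently refuses the switch when the
+        * alignment or transfer constraints aren't met, so verify
+        * that it actually took effect.
+        */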
+       __loop_update_dio(lo, !!arg);
+       if (lo->use_dio == !!arg)
+               return 0;
+       error = -EINVAL;
+ out:
+       return error;
+}
+
 static int lo_ioctl(struct block_device *bdev, fmode_t mode,
        unsigned int cmd, unsigned long arg)
 {
@@ -1181,6 +1368,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
                if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
                        err = loop_set_capacity(lo, bdev);
                break;
+       case LOOP_SET_DIRECT_IO:
+               err = -EPERM;
+               if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+                       err = loop_set_dio(lo, arg);
+               break;
        default:
                err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
        }
@@ -1461,23 +1653,13 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
        if (lo->lo_state != Lo_bound)
                return -EIO;
 
-       if (cmd->rq->cmd_flags & REQ_WRITE) {
-               struct loop_device *lo = cmd->rq->q->queuedata;
-               bool need_sched = true;
-
-               spin_lock_irq(&lo->lo_lock);
-               if (lo->write_started)
-                       need_sched = false;
-               else
-                       lo->write_started = true;
-               list_add_tail(&cmd->list, &lo->write_cmd_head);
-               spin_unlock_irq(&lo->lo_lock);
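+       /*
+        * FLUSH and DISCARD carry no data pages to map into an
+        * iov_iter, so they take the synchronous path even when
+        * direct I/O is enabled.
+        */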
+       if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH |
+                                       REQ_DISCARD)))
+               cmd->use_aio = true;
+       else
+               cmd->use_aio = false;
 
-               if (need_sched)
-                       queue_work(lo->wq, &lo->write_work);
-       } else {
-               queue_work(lo->wq, &cmd->read_work);
-       }
+       queue_kthread_work(&lo->worker, &cmd->work);
 
        return BLK_MQ_RQ_QUEUE_OK;
 }
@@ -1495,38 +1677,15 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 
        ret = do_req_filebacked(lo, cmd->rq);
  failed:
-       blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
+       /* complete non-aio requests; aio completes via lo_rw_aio_complete() */
+       if (!cmd->use_aio || ret)
+               blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
 }
 
-static void loop_queue_write_work(struct work_struct *work)
-{
-       struct loop_device *lo =
-               container_of(work, struct loop_device, write_work);
-       LIST_HEAD(cmd_list);
-
-       spin_lock_irq(&lo->lo_lock);
- repeat:
-       list_splice_init(&lo->write_cmd_head, &cmd_list);
-       spin_unlock_irq(&lo->lo_lock);
-
-       while (!list_empty(&cmd_list)) {
-               struct loop_cmd *cmd = list_first_entry(&cmd_list,
-                               struct loop_cmd, list);
-               list_del_init(&cmd->list);
-               loop_handle_cmd(cmd);
-       }
-
-       spin_lock_irq(&lo->lo_lock);
-       if (!list_empty(&lo->write_cmd_head))
-               goto repeat;
-       lo->write_started = false;
-       spin_unlock_irq(&lo->lo_lock);
-}
-
-static void loop_queue_read_work(struct work_struct *work)
+static void loop_queue_work(struct kthread_work *work)
 {
        struct loop_cmd *cmd =
-               container_of(work, struct loop_cmd, read_work);
+               container_of(work, struct loop_cmd, work);
 
        loop_handle_cmd(cmd);
 }
@@ -1538,7 +1697,7 @@ static int loop_init_request(void *data, struct request *rq,
        struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
        cmd->rq = rq;
-       INIT_WORK(&cmd->read_work, loop_queue_read_work);
+       init_kthread_work(&cmd->work, loop_queue_work);
 
        return 0;
 }
@@ -1594,8 +1753,11 @@ static int loop_add(struct loop_device **l, int i)
        }
        lo->lo_queue->queuedata = lo;
 
-       INIT_LIST_HEAD(&lo->write_cmd_head);
-       INIT_WORK(&lo->write_work, loop_queue_write_work);
+       /*
+        * It doesn't make sense to enable merging because the I/O
+        * submitted to the backing file is handled page by page.
+        */
+       queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
 
        disk = lo->lo_disk = alloc_disk(1 << part_shift);
        if (!disk)