Merge remote-tracking branch 'aosp/android-3.0' into develop-3.0
[firefly-linux-kernel-4.4.55.git] / drivers / block / xen-blkfront.c
index d773397575dbe5176c623424f2aa2ea423c7edff..9ea8c2576c70e768f22ad8ea2cc423337611fb4f 100644 (file)
@@ -41,7 +41,7 @@
 #include <linux/cdrom.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/scatterlist.h>
 
 #include <xen/xen.h>
@@ -65,10 +65,11 @@ enum blkif_state {
 
 struct blk_shadow {
        struct blkif_request req;
-       unsigned long request;
+       struct request *request;
        unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 };
 
+static DEFINE_MUTEX(blkfront_mutex);
 static const struct block_device_operations xlvbd_block_fops;
 
 #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
@@ -95,7 +96,8 @@ struct blkfront_info
        struct gnttab_free_callback callback;
        struct blk_shadow shadow[BLK_RING_SIZE];
        unsigned long shadow_free;
-       int feature_barrier;
+       unsigned int feature_flush;
+       unsigned int flush_op;
        int is_ready;
 };
 
@@ -119,6 +121,10 @@ static DEFINE_SPINLOCK(minor_lock);
 #define EXTENDED (1<<EXT_SHIFT)
 #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
 #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
+#define EMULATED_HD_DISK_MINOR_OFFSET (0)
+#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
+#define EMULATED_SD_DISK_MINOR_OFFSET (0)
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)
 
 #define DEV_NAME       "xvd"   /* name in /dev */
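
The name offsets just above are derived from the minor offsets by dividing by 256: in the extended xvd scheme each disk name spans one 256-minor block (PARTS_PER_EXT_DISK elsewhere in this driver), so a minor-space offset divided by 256 yields a name-space offset. A worked example, purely illustrative since the patch keeps both minor offsets at 0:

    /* Illustration only: were the SD range relocated to start at
     * minor 2560, its first disk would land at name slot
     * 2560 / 256 = 10, i.e. "xvdk" ('a' + 10).  With both
     * EMULATED_*_MINOR_OFFSETs defined as 0, both name offsets
     * evaluate to 0 as well. */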
 
@@ -135,7 +141,7 @@ static void add_id_to_freelist(struct blkfront_info *info,
                               unsigned long id)
 {
        info->shadow[id].req.id  = info->shadow_free;
-       info->shadow[id].request = 0;
+       info->shadow[id].request = NULL;
        info->shadow_free = id;
 }
 
@@ -244,14 +250,10 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
 }
 
 /*
- * blkif_queue_request
+ * Generate a Xen blkfront IO request from a blk layer request.  Reads
+ * and writes are handled as expected.
  *
- * request block io
- *
- * id: for guest use only.
- * operation: BLKIF_OP_{READ,WRITE,PROBE}
- * buffer: buffer to read/write into. this should be a
- *   virtual address in the guest os.
+ * @req: the block layer request to translate
  */
 static int blkif_queue_request(struct request *req)
 {
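
For context, blkif_queue_request() is driven from the driver's request-queue callback. A minimal sketch of such a caller follows, assuming the 2.6.37-era block API (blk_peek_request(), blk_start_request(), blk_requeue_request()) and assuming queuedata was set at queue-init time; the in-tree do_blkif_request() is the real consumer:

    static void do_blkif_request(struct request_queue *rq)
    {
            struct blkfront_info *info = rq->queuedata;  /* assumed set at init */
            struct request *req;
            int queued = 0;

            while ((req = blk_peek_request(rq)) != NULL) {
                    if (RING_FULL(&info->ring))
                            break;                   /* retry on next kick */
                    blk_start_request(req);          /* hand off from blk layer */
                    if (blkif_queue_request(req)) {
                            blk_requeue_request(rq, req);
                            break;                   /* out of ring slots/grants */
                    }
                    queued++;
            }
            if (queued)
                    flush_requests(info);            /* bump req_prod, notify backend */
    }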
@@ -280,16 +282,25 @@ static int blkif_queue_request(struct request *req)
        /* Fill out a communications ring structure. */
        ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
        id = get_id_from_freelist(info);
-       info->shadow[id].request = (unsigned long)req;
+       info->shadow[id].request = req;
 
        ring_req->id = id;
-       ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
+       ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
        ring_req->handle = info->handle;
 
        ring_req->operation = rq_data_dir(req) ?
                BLKIF_OP_WRITE : BLKIF_OP_READ;
-       if (req->cmd_flags & REQ_HARDBARRIER)
-               ring_req->operation = BLKIF_OP_WRITE_BARRIER;
+
+       if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+               /*
+                * Ideally we would do an unordered flush-to-disk. If the
+                * backend only supports barriers, use that. A barrier
+                * request is a superset of FUA, so we can implement it
+                * the same way.  (It's also a FLUSH+FUA, since it is
+                * guaranteed ordered with respect to previous writes.)
+                */
+               ring_req->operation = info->flush_op;
+       }
 
        ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
        BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
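
Spelling out what the new branch buys: once blk_queue_flush() advertises the negotiated capabilities, the single flush_op covers every flagged request the block layer can send down (illustrative summary, not patch content):

    /*
     *   request from blk layer      ring operation        payload?
     *   ----------------------      --------------        --------
     *   plain read                  BLKIF_OP_READ         yes
     *   plain write                 BLKIF_OP_WRITE        yes
     *   write + REQ_FUA             info->flush_op        yes
     *   pure REQ_FLUSH              info->flush_op        no (0 segments)
     */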
@@ -309,7 +320,7 @@ static int blkif_queue_request(struct request *req)
                                rq_data_dir(req) );
 
                info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
-               ring_req->seg[i] =
+               ring_req->u.rw.seg[i] =
                                (struct blkif_request_segment) {
                                        .gref       = ref,
                                        .first_sect = fsect,
@@ -418,36 +429,84 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 }
 
 
-static int xlvbd_barrier(struct blkfront_info *info)
+static void xlvbd_flush(struct blkfront_info *info)
 {
-       int err;
-       const char *barrier;
+       blk_queue_flush(info->rq, info->feature_flush);
+       printk(KERN_INFO "blkfront: %s: %s: %s\n",
+              info->gd->disk_name,
+              info->flush_op == BLKIF_OP_WRITE_BARRIER ?
+               "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
+               "flush diskcache" : "barrier or flush"),
+              info->feature_flush ? "enabled" : "disabled");
+}
 
-       switch (info->feature_barrier) {
-       case QUEUE_ORDERED_DRAIN:       barrier = "enabled (drain)"; break;
-       case QUEUE_ORDERED_TAG:         barrier = "enabled (tag)"; break;
-       case QUEUE_ORDERED_NONE:        barrier = "disabled"; break;
-       default:                        return -EINVAL;
+static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
+{
+       int major;
+       major = BLKIF_MAJOR(vdevice);
+       *minor = BLKIF_MINOR(vdevice);
+       switch (major) {
+               case XEN_IDE0_MAJOR:
+                       *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
+                       *minor = ((*minor / 64) * PARTS_PER_DISK) +
+                               EMULATED_HD_DISK_MINOR_OFFSET;
+                       break;
+               case XEN_IDE1_MAJOR:
+                       *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
+                       *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
+                               EMULATED_HD_DISK_MINOR_OFFSET;
+                       break;
+               case XEN_SCSI_DISK0_MAJOR:
+                       *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
+                       *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
+                       break;
+               case XEN_SCSI_DISK1_MAJOR:
+               case XEN_SCSI_DISK2_MAJOR:
+               case XEN_SCSI_DISK3_MAJOR:
+               case XEN_SCSI_DISK4_MAJOR:
+               case XEN_SCSI_DISK5_MAJOR:
+               case XEN_SCSI_DISK6_MAJOR:
+               case XEN_SCSI_DISK7_MAJOR:
+                       *offset = (*minor / PARTS_PER_DISK) +
+                               ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
+                               EMULATED_SD_DISK_NAME_OFFSET;
+                       *minor = *minor +
+                               ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
+                               EMULATED_SD_DISK_MINOR_OFFSET;
+                       break;
+               case XEN_SCSI_DISK8_MAJOR:
+               case XEN_SCSI_DISK9_MAJOR:
+               case XEN_SCSI_DISK10_MAJOR:
+               case XEN_SCSI_DISK11_MAJOR:
+               case XEN_SCSI_DISK12_MAJOR:
+               case XEN_SCSI_DISK13_MAJOR:
+               case XEN_SCSI_DISK14_MAJOR:
+               case XEN_SCSI_DISK15_MAJOR:
+                       *offset = (*minor / PARTS_PER_DISK) +
+                               ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
+                               EMULATED_SD_DISK_NAME_OFFSET;
+                       *minor = *minor +
+                               ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
+                               EMULATED_SD_DISK_MINOR_OFFSET;
+                       break;
+               case XENVBD_MAJOR:
+                       *offset = *minor / PARTS_PER_DISK;
+                       break;
+               default:
+                       printk(KERN_WARNING "blkfront: your disk configuration is "
+                                       "incorrect; please use an xvd device instead\n");
+                       return -ENODEV;
        }
-
-       err = blk_queue_ordered(info->rq, info->feature_barrier);
-
-       if (err)
-               return err;
-
-       printk(KERN_INFO "blkfront: %s: barriers %s\n",
-              info->gd->disk_name, barrier);
        return 0;
 }
 
-
 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
                               struct blkfront_info *info,
                               u16 vdisk_info, u16 sector_size)
 {
        struct gendisk *gd;
        int nr_minors = 1;
-       int err = -ENODEV;
+       int err;
        unsigned int offset;
        int minor;
        int nr_parts;
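
A few concrete translations through xen_translate_vdev(), assuming PARTS_PER_DISK is 16 and the conventional emulated layouts (64 minors per IDE disk, 16 per SCSI disk); with both EMULATED_*_OFFSET macros at 0 these land directly in the xvd namespace:

    /*
     *   emulated    major                 minor in   offset   minor out   node
     *   hda         XEN_IDE0_MAJOR            0         0          0      xvda
     *   hdb         XEN_IDE0_MAJOR           64         1         16      xvdb
     *   hdc         XEN_IDE1_MAJOR            0         2         32      xvdc
     *   sda         XEN_SCSI_DISK0_MAJOR      0         0          0      xvda
     *   sdb1        XEN_SCSI_DISK0_MAJOR     17         1         17      xvdb1
     */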
@@ -462,12 +521,20 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
        }
 
        if (!VDEV_IS_EXTENDED(info->vdevice)) {
-               minor = BLKIF_MINOR(info->vdevice);
-               nr_parts = PARTS_PER_DISK;
+               err = xen_translate_vdev(info->vdevice, &minor, &offset);
+               if (err)
+                       return err;
+               nr_parts = PARTS_PER_DISK;
        } else {
                minor = BLKIF_MINOR_EXT(info->vdevice);
                nr_parts = PARTS_PER_EXT_DISK;
+               offset = minor / nr_parts;
+               if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
+                       printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
+                                       "emulated IDE disks,\n\t choose an xvd device name "
+                                       "from xvde on\n", info->vdevice);
        }
+       err = -ENODEV;
 
        if ((minor % nr_parts) == 0)
                nr_minors = nr_parts;
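
The `EMULATED_HD_DISK_NAME_OFFSET + 4` bound in the check above covers the four possible emulated IDE disks (hda through hdd, name slots 0 through 3), which is why the message steers users toward xvde and later. For example (illustrative, with PARTS_PER_EXT_DISK == 256):

    /* extended vdevice, minor 512:  offset = 512 / 256 = 2 ("xvdc"),
     * which an HVM guest's emulated hdc could also claim -> warning
     * fires; minor 1024: offset = 4 ("xvde") -> safely past the IDE
     * range. */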
@@ -481,8 +548,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
        if (gd == NULL)
                goto release;
 
-       offset = minor / nr_parts;
-
        if (nr_minors > 1) {
                if (offset < 26)
                        sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
@@ -516,7 +581,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
        info->rq = gd->queue;
        info->gd = gd;
 
-       xlvbd_barrier(info);
+       xlvbd_flush(info);
 
        if (vdisk_info & VDISK_READONLY)
                set_disk_ro(gd, 1);
@@ -553,7 +618,7 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
        spin_unlock_irqrestore(&blkif_io_lock, flags);
 
        /* Flush gnttab callback work. Must be done with no locks held. */
-       flush_scheduled_work();
+       flush_work_sync(&info->work);
 
        del_gendisk(info->gd);
 
@@ -602,7 +667,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
        spin_unlock_irq(&blkif_io_lock);
 
        /* Flush gnttab callback work. Must be done with no locks held. */
-       flush_scheduled_work();
+       flush_work_sync(&info->work);
 
        /* Free resources associated with old device channel. */
        if (info->ring_ref != GRANT_INVALID_REF) {
@@ -621,7 +686,7 @@ static void blkif_completion(struct blk_shadow *s)
 {
        int i;
        for (i = 0; i < s->req.nr_segments; i++)
-               gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
+               gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -649,7 +714,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 
                bret = RING_GET_RESPONSE(&info->ring, i);
                id   = bret->id;
-               req  = (struct request *)info->shadow[id].request;
+               req  = info->shadow[id].request;
 
                blkif_completion(&info->shadow[id]);
 
@@ -657,13 +722,29 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 
                error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
                switch (bret->operation) {
+               case BLKIF_OP_FLUSH_DISKCACHE:
                case BLKIF_OP_WRITE_BARRIER:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
-                               printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
+                               printk(KERN_WARNING "blkfront: write %s op failed on %s\n",
+                                      info->flush_op == BLKIF_OP_WRITE_BARRIER ?
+                                      "barrier" :  "flush disk cache",
                                       info->gd->disk_name);
                                error = -EOPNOTSUPP;
-                               info->feature_barrier = QUEUE_ORDERED_NONE;
-                               xlvbd_barrier(info);
+                       }
+                       if (unlikely(bret->status == BLKIF_RSP_ERROR &&
+                                    info->shadow[id].req.nr_segments == 0)) {
+                               printk(KERN_WARNING "blkfront: empty write %s op failed on %s\n",
+                                      info->flush_op == BLKIF_OP_WRITE_BARRIER ?
+                                      "barrier" :  "flush disk cache",
+                                      info->gd->disk_name);
+                               error = -EOPNOTSUPP;
+                       }
+                       if (unlikely(error)) {
+                               if (error == -EOPNOTSUPP)
+                                       error = 0;
+                               info->feature_flush = 0;
+                               info->flush_op = 0;
+                               xlvbd_flush(info);
                        }
                        /* fall through */
                case BLKIF_OP_READ:
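
Worth noting on the error path above: the failure is masked (error reset to 0) so the filesystem sees its flush succeed, while re-running xlvbd_flush() with the features cleared tells the block layer to stop generating such requests at all. The end state amounts to (sketch, not patch content):

    info->feature_flush = 0;            /* capability withdrawn            */
    info->flush_op = 0;
    blk_queue_flush(info->rq, 0);       /* done inside xlvbd_flush(); no   */
                                        /* further REQ_FLUSH/REQ_FUA       */
                                        /* requests reach this queue       */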
@@ -914,7 +995,7 @@ static int blkif_recover(struct blkfront_info *info)
        /* Stage 3: Find pending requests and requeue them. */
        for (i = 0; i < BLK_RING_SIZE; i++) {
                /* Not in use? */
-               if (copy[i].request == 0)
+               if (!copy[i].request)
                        continue;
 
                /* Grab a request slot and copy shadow state into it. */
@@ -928,12 +1009,10 @@ static int blkif_recover(struct blkfront_info *info)
                /* Rewrite any grant references invalidated by susp/resume. */
                for (j = 0; j < req->nr_segments; j++)
                        gnttab_grant_foreign_access_ref(
-                               req->seg[j].gref,
+                               req->u.rw.seg[j].gref,
                                info->xbdev->otherend_id,
                                pfn_to_mfn(info->shadow[req->id].frame[j]),
-                               rq_data_dir(
-                                       (struct request *)
-                                       info->shadow[req->id].request));
+                               rq_data_dir(info->shadow[req->id].request));
                info->shadow[req->id].req = *req;
 
                info->ring.req_prod_pvt++;
@@ -1029,7 +1108,7 @@ static void blkfront_connect(struct blkfront_info *info)
        unsigned long sector_size;
        unsigned int binfo;
        int err;
-       int barrier;
+       int barrier, flush;
 
        switch (info->connected) {
        case BLKIF_STATE_CONNECTED:
@@ -1069,28 +1148,37 @@ static void blkfront_connect(struct blkfront_info *info)
                return;
        }
 
+       info->feature_flush = 0;
+       info->flush_op = 0;
+
        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-                           "feature-barrier", "%lu", &barrier,
+                           "feature-barrier", "%d", &barrier,
                            NULL);
 
        /*
         * If there's no "feature-barrier" defined, then it means
         * we're dealing with a very old backend which writes
-        * synchronously; draining will do what needs to get done.
-        *
-        * If there are barriers, then we can do full queued writes
-        * with tagged barriers.
+        * synchronously; nothing to do.
         *
-        * If barriers are not supported, then there's no much we can
-        * do, so just set ordering to NONE.
+        * If the backend advertises barriers, implement flushes as
+        * barrier writes.
         */
-       if (err)
-               info->feature_barrier = QUEUE_ORDERED_DRAIN;
-       else if (barrier)
-               info->feature_barrier = QUEUE_ORDERED_TAG;
-       else
-               info->feature_barrier = QUEUE_ORDERED_NONE;
+       if (!err && barrier) {
+               info->feature_flush = REQ_FLUSH | REQ_FUA;
+               info->flush_op = BLKIF_OP_WRITE_BARRIER;
+       }
+       /*
+        * If the backend also advertises "feature-flush-cache", prefer
+        * that over barriers.
+        */
+       err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+                           "feature-flush-cache", "%d", &flush,
+                           NULL);
 
+       if (!err && flush) {
+               info->feature_flush = REQ_FLUSH;
+               info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
+       }
+
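
The two probes combine as follows; "feature-flush-cache", being checked second, takes precedence when both keys are present (a summary of the code above, not patch content):

    /*
     *   backend advertises           feature_flush         flush_op
     *   neither key                  0                     0
     *   feature-barrier only         REQ_FLUSH | REQ_FUA   BLKIF_OP_WRITE_BARRIER
     *   feature-flush-cache          REQ_FLUSH             BLKIF_OP_FLUSH_DISKCACHE
     */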
        err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
        if (err) {
                xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
@@ -1125,6 +1213,8 @@ static void blkback_changed(struct xenbus_device *dev,
        case XenbusStateInitialising:
        case XenbusStateInitWait:
        case XenbusStateInitialised:
+       case XenbusStateReconfiguring:
+       case XenbusStateReconfigured:
        case XenbusStateUnknown:
        case XenbusStateClosed:
                break;
@@ -1201,7 +1291,7 @@ static int blkif_open(struct block_device *bdev, fmode_t mode)
        struct blkfront_info *info;
        int err = 0;
 
-       lock_kernel();
+       mutex_lock(&blkfront_mutex);
 
        info = disk->private_data;
        if (!info) {
@@ -1219,7 +1309,7 @@ static int blkif_open(struct block_device *bdev, fmode_t mode)
        mutex_unlock(&info->mutex);
 
 out:
-       unlock_kernel();
+       mutex_unlock(&blkfront_mutex);
        return err;
 }
 
@@ -1229,7 +1319,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
        struct block_device *bdev;
        struct xenbus_device *xbdev;
 
-       lock_kernel();
+       mutex_lock(&blkfront_mutex);
 
        bdev = bdget_disk(disk, 0);
        bdput(bdev);
@@ -1263,7 +1353,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
        }
 
 out:
-       unlock_kernel();
+       mutex_unlock(&blkfront_mutex);
        return 0;
 }