#include <linux/cdrom.h>
#include <linux/module.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <xen/xen.h>
struct blk_shadow {
struct blkif_request req;
- unsigned long request;
+ struct request *request;
unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};
+static DEFINE_MUTEX(blkfront_mutex);
static const struct block_device_operations xlvbd_block_fops;
#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
struct gnttab_free_callback callback;
struct blk_shadow shadow[BLK_RING_SIZE];
unsigned long shadow_free;
- int feature_barrier;
+ unsigned int feature_flush;
+ unsigned int flush_op;
int is_ready;
};
#define EXTENDED (1<<EXT_SHIFT)
#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
+#define EMULATED_HD_DISK_MINOR_OFFSET (0)
+#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
+#define EMULATED_SD_DISK_MINOR_OFFSET (0)
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)
#define DEV_NAME "xvd" /* name in /dev */
unsigned long id)
{
info->shadow[id].req.id = info->shadow_free;
- info->shadow[id].request = 0;
+ info->shadow[id].request = NULL;
info->shadow_free = id;
}
}
/*
- * blkif_queue_request
+ * Generate a Xen blkfront IO request from a blk layer request. Reads
+ * and writes are handled as expected.
*
- * request block io
- *
- * id: for guest use only.
- * operation: BLKIF_OP_{READ,WRITE,PROBE}
- * buffer: buffer to read/write into. this should be a
- * virtual address in the guest os.
+ * @req: a request struct
*/
static int blkif_queue_request(struct request *req)
{
/* Fill out a communications ring structure. */
ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
id = get_id_from_freelist(info);
- info->shadow[id].request = (unsigned long)req;
+ info->shadow[id].request = req;
ring_req->id = id;
- ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
+ ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
ring_req->handle = info->handle;
ring_req->operation = rq_data_dir(req) ?
BLKIF_OP_WRITE : BLKIF_OP_READ;
- if (req->cmd_flags & REQ_HARDBARRIER)
- ring_req->operation = BLKIF_OP_WRITE_BARRIER;
+
+ if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+ /*
+ * Ideally we can do an unordered flush-to-disk. In case the
+ * backend onlysupports barriers, use that. A barrier request
+ * a superset of FUA, so we can implement it the same
+ * way. (It's also a FLUSH+FUA, since it is
+ * guaranteed ordered WRT previous writes.)
+ */
+ ring_req->operation = info->flush_op;
+ }
ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
rq_data_dir(req) );
info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
- ring_req->seg[i] =
+ ring_req->u.rw.seg[i] =
(struct blkif_request_segment) {
.gref = ref,
.first_sect = fsect,
}
-static int xlvbd_barrier(struct blkfront_info *info)
+static void xlvbd_flush(struct blkfront_info *info)
{
- int err;
- const char *barrier;
+ blk_queue_flush(info->rq, info->feature_flush);
+ printk(KERN_INFO "blkfront: %s: %s: %s\n",
+ info->gd->disk_name,
+ info->flush_op == BLKIF_OP_WRITE_BARRIER ?
+ "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
+ "flush diskcache" : "barrier or flush"),
+ info->feature_flush ? "enabled" : "disabled");
+}
- switch (info->feature_barrier) {
- case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break;
- case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break;
- case QUEUE_ORDERED_NONE: barrier = "disabled"; break;
- default: return -EINVAL;
+static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
+{
+ int major;
+ major = BLKIF_MAJOR(vdevice);
+ *minor = BLKIF_MINOR(vdevice);
+ switch (major) {
+ case XEN_IDE0_MAJOR:
+ *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
+ *minor = ((*minor / 64) * PARTS_PER_DISK) +
+ EMULATED_HD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_IDE1_MAJOR:
+ *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
+ *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
+ EMULATED_HD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK0_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK1_MAJOR:
+ case XEN_SCSI_DISK2_MAJOR:
+ case XEN_SCSI_DISK3_MAJOR:
+ case XEN_SCSI_DISK4_MAJOR:
+ case XEN_SCSI_DISK5_MAJOR:
+ case XEN_SCSI_DISK6_MAJOR:
+ case XEN_SCSI_DISK7_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) +
+ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
+ EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor +
+ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
+ EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK8_MAJOR:
+ case XEN_SCSI_DISK9_MAJOR:
+ case XEN_SCSI_DISK10_MAJOR:
+ case XEN_SCSI_DISK11_MAJOR:
+ case XEN_SCSI_DISK12_MAJOR:
+ case XEN_SCSI_DISK13_MAJOR:
+ case XEN_SCSI_DISK14_MAJOR:
+ case XEN_SCSI_DISK15_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) +
+ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
+ EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor +
+ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
+ EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XENVBD_MAJOR:
+ *offset = *minor / PARTS_PER_DISK;
+ break;
+ default:
+ printk(KERN_WARNING "blkfront: your disk configuration is "
+ "incorrect, please use an xvd device instead\n");
+ return -ENODEV;
}
-
- err = blk_queue_ordered(info->rq, info->feature_barrier);
-
- if (err)
- return err;
-
- printk(KERN_INFO "blkfront: %s: barriers %s\n",
- info->gd->disk_name, barrier);
return 0;
}
-
static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
struct blkfront_info *info,
u16 vdisk_info, u16 sector_size)
{
struct gendisk *gd;
int nr_minors = 1;
- int err = -ENODEV;
+ int err;
unsigned int offset;
int minor;
int nr_parts;
}
if (!VDEV_IS_EXTENDED(info->vdevice)) {
- minor = BLKIF_MINOR(info->vdevice);
- nr_parts = PARTS_PER_DISK;
+ err = xen_translate_vdev(info->vdevice, &minor, &offset);
+ if (err)
+ return err;
+ nr_parts = PARTS_PER_DISK;
} else {
minor = BLKIF_MINOR_EXT(info->vdevice);
nr_parts = PARTS_PER_EXT_DISK;
+ offset = minor / nr_parts;
+ if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
+ printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
+ "emulated IDE disks,\n\t choose an xvd device name"
+ "from xvde on\n", info->vdevice);
}
+ err = -ENODEV;
if ((minor % nr_parts) == 0)
nr_minors = nr_parts;
if (gd == NULL)
goto release;
- offset = minor / nr_parts;
-
if (nr_minors > 1) {
if (offset < 26)
sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
info->rq = gd->queue;
info->gd = gd;
- xlvbd_barrier(info);
+ xlvbd_flush(info);
if (vdisk_info & VDISK_READONLY)
set_disk_ro(gd, 1);
spin_unlock_irqrestore(&blkif_io_lock, flags);
/* Flush gnttab callback work. Must be done with no locks held. */
- flush_scheduled_work();
+ flush_work_sync(&info->work);
del_gendisk(info->gd);
spin_unlock_irq(&blkif_io_lock);
/* Flush gnttab callback work. Must be done with no locks held. */
- flush_scheduled_work();
+ flush_work_sync(&info->work);
/* Free resources associated with old device channel. */
if (info->ring_ref != GRANT_INVALID_REF) {
{
int i;
for (i = 0; i < s->req.nr_segments; i++)
- gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
+ gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
}
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
bret = RING_GET_RESPONSE(&info->ring, i);
id = bret->id;
- req = (struct request *)info->shadow[id].request;
+ req = info->shadow[id].request;
blkif_completion(&info->shadow[id]);
error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
switch (bret->operation) {
+ case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_WRITE_BARRIER:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
- printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
+ printk(KERN_WARNING "blkfront: %s: write %s op failed\n",
+ info->flush_op == BLKIF_OP_WRITE_BARRIER ?
+ "barrier" : "flush disk cache",
info->gd->disk_name);
error = -EOPNOTSUPP;
- info->feature_barrier = QUEUE_ORDERED_NONE;
- xlvbd_barrier(info);
+ }
+ if (unlikely(bret->status == BLKIF_RSP_ERROR &&
+ info->shadow[id].req.nr_segments == 0)) {
+ printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n",
+ info->flush_op == BLKIF_OP_WRITE_BARRIER ?
+ "barrier" : "flush disk cache",
+ info->gd->disk_name);
+ error = -EOPNOTSUPP;
+ }
+ if (unlikely(error)) {
+ if (error == -EOPNOTSUPP)
+ error = 0;
+ info->feature_flush = 0;
+ info->flush_op = 0;
+ xlvbd_flush(info);
}
/* fall through */
case BLKIF_OP_READ:
/* Stage 3: Find pending requests and requeue them. */
for (i = 0; i < BLK_RING_SIZE; i++) {
/* Not in use? */
- if (copy[i].request == 0)
+ if (!copy[i].request)
continue;
/* Grab a request slot and copy shadow state into it. */
/* Rewrite any grant references invalidated by susp/resume. */
for (j = 0; j < req->nr_segments; j++)
gnttab_grant_foreign_access_ref(
- req->seg[j].gref,
+ req->u.rw.seg[j].gref,
info->xbdev->otherend_id,
pfn_to_mfn(info->shadow[req->id].frame[j]),
- rq_data_dir(
- (struct request *)
- info->shadow[req->id].request));
+ rq_data_dir(info->shadow[req->id].request));
info->shadow[req->id].req = *req;
info->ring.req_prod_pvt++;
unsigned long sector_size;
unsigned int binfo;
int err;
- int barrier;
+ int barrier, flush;
switch (info->connected) {
case BLKIF_STATE_CONNECTED:
return;
}
+ info->feature_flush = 0;
+ info->flush_op = 0;
+
err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
- "feature-barrier", "%lu", &barrier,
+ "feature-barrier", "%d", &barrier,
NULL);
/*
* If there's no "feature-barrier" defined, then it means
* we're dealing with a very old backend which writes
- * synchronously; draining will do what needs to get done.
- *
- * If there are barriers, then we can do full queued writes
- * with tagged barriers.
+ * synchronously; nothing to do.
*
- * If barriers are not supported, then there's no much we can
- * do, so just set ordering to NONE.
+ * If there are barriers, then we use flush.
*/
- if (err)
- info->feature_barrier = QUEUE_ORDERED_DRAIN;
- else if (barrier)
- info->feature_barrier = QUEUE_ORDERED_TAG;
- else
- info->feature_barrier = QUEUE_ORDERED_NONE;
+ if (!err && barrier) {
+ info->feature_flush = REQ_FLUSH | REQ_FUA;
+ info->flush_op = BLKIF_OP_WRITE_BARRIER;
+ }
+ /*
+ * And if there is "feature-flush-cache" use that above
+ * barriers.
+ */
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "feature-flush-cache", "%d", &flush,
+ NULL);
+ if (!err && flush) {
+ info->feature_flush = REQ_FLUSH;
+ info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
+ }
+
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
if (err) {
xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
case XenbusStateInitialising:
case XenbusStateInitWait:
case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
case XenbusStateUnknown:
case XenbusStateClosed:
break;
struct blkfront_info *info;
int err = 0;
- lock_kernel();
+ mutex_lock(&blkfront_mutex);
info = disk->private_data;
if (!info) {
mutex_unlock(&info->mutex);
out:
- unlock_kernel();
+ mutex_unlock(&blkfront_mutex);
return err;
}
struct block_device *bdev;
struct xenbus_device *xbdev;
- lock_kernel();
+ mutex_lock(&blkfront_mutex);
bdev = bdget_disk(disk, 0);
bdput(bdev);
}
out:
- unlock_kernel();
+ mutex_unlock(&blkfront_mutex);
return 0;
}