/* drivers/block/xen-blkback/blkback.c */
/******************************************************************************
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/block/xen-blkfront.c
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/bitmap.h>

#include <xen/events.h>
#include <xen/page.h>
#include <xen/xen.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/balloon.h>
#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
 *
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
static int xen_blkif_reqs = 64;
module_param_named(reqs, xen_blkif_reqs, int, 0);
MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");

/* Run-time switchable: /sys/module/blkback/parameters/ */
static unsigned int log_stats;
module_param(log_stats, int, 0644);

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
struct pending_req {
	struct xen_blkif	*blkif;
	u64			id;
	int			nr_pages;
	atomic_t		pendcnt;
	unsigned short		operation;
	int			status;
	struct list_head	free_list;
	DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
};

#define BLKBACK_INVALID_HANDLE (~0)

struct xen_blkbk {
	struct pending_req	*pending_reqs;
	/* List of all 'pending_req' available */
	struct list_head	pending_free;
	/* And its spinlock. */
	spinlock_t		pending_free_lock;
	wait_queue_head_t	pending_free_wq;
	/* The list of all pages that are available. */
	struct page		**pending_pages;
	/* And the grant handles that are available. */
	grant_handle_t		*pending_grant_handles;
};

static struct xen_blkbk *blkbk;

/*
 * Maximum number of grant pages that can be mapped in blkback.
 * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of
 * pages that blkback will persistently map.
 * Currently, this is:
 * RING_SIZE = 32 (for all known ring types)
 * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
 * sizeof(struct persistent_gnt) = 48
 * So the maximum memory used to store the grants is:
 * 32 * 11 * 48 = 16896 bytes
 */
static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol)
{
	switch (protocol) {
	case BLKIF_PROTOCOL_NATIVE:
		return __CONST_RING_SIZE(blkif, PAGE_SIZE) *
			   BLKIF_MAX_SEGMENTS_PER_REQUEST;
	case BLKIF_PROTOCOL_X86_32:
		return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) *
			   BLKIF_MAX_SEGMENTS_PER_REQUEST;
	case BLKIF_PROTOCOL_X86_64:
		return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
			   BLKIF_MAX_SEGMENTS_PER_REQUEST;
	default:
		BUG();
	}
	return 0;
}


/*
 * Little helpful macro to figure out the index and virtual address of the
 * pending_pages[..]. For each 'pending_req' we have up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
 * 10 and would index into the pending_pages[..].
 */
static inline int vaddr_pagenr(struct pending_req *req, int seg)
{
	return (req - blkbk->pending_reqs) *
		BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
}

#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]

static inline unsigned long vaddr(struct pending_req *req, int seg)
{
	unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
	return (unsigned long)pfn_to_kaddr(pfn);
}

#define pending_handle(_req, _seg) \
	(blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])


static int do_block_io_op(struct xen_blkif *blkif);
static int dispatch_rw_block_io(struct xen_blkif *blkif,
				struct blkif_request *req,
				struct pending_req *pending_req);
static void make_response(struct xen_blkif *blkif, u64 id,
			  unsigned short op, int st);

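/*
 * Walk the rb-tree of persistent grants starting from rb_first(), in a way
 * that is safe against removal of the node currently pointed to by 'pos'
 * (the next node is fetched into 'n' before the loop body runs).
 */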
#define foreach_grant_safe(pos, n, rbtree, node) \
	for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
	     &(pos)->node != NULL; \
	     (pos) = container_of(n, typeof(*(pos)), node), \
	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)


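/*
 * Insert a persistent grant into the per-blkif rb-tree, keyed by grant
 * reference. Adding a gref that is already present is a bug.
 */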
static void add_persistent_gnt(struct rb_root *root,
			       struct persistent_gnt *persistent_gnt)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct persistent_gnt *this;

	/* Figure out where to put new node */
	while (*new) {
		this = container_of(*new, struct persistent_gnt, node);

		parent = *new;
		if (persistent_gnt->gnt < this->gnt)
			new = &((*new)->rb_left);
		else if (persistent_gnt->gnt > this->gnt)
			new = &((*new)->rb_right);
		else {
			pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n");
			BUG();
		}
	}

	/* Add new node and rebalance tree. */
	rb_link_node(&(persistent_gnt->node), parent, new);
	rb_insert_color(&(persistent_gnt->node), root);
}

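/* Look up a persistent grant in the rb-tree by grant reference. */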
static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
						 grant_ref_t gref)
{
	struct persistent_gnt *data;
	struct rb_node *node = root->rb_node;

	while (node) {
		data = container_of(node, struct persistent_gnt, node);

		if (gref < data->gnt)
			node = node->rb_left;
		else if (gref > data->gnt)
			node = node->rb_right;
		else
			return data;
	}
	return NULL;
}

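/*
 * Unmap and free every persistent grant in the tree. Unmap operations are
 * batched in groups of up to BLKIF_MAX_SEGMENTS_PER_REQUEST, and the
 * ballooned pages backing the grants are handed back to the balloon driver.
 */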
static void free_persistent_gnts(struct rb_root *root, unsigned int num)
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct persistent_gnt *persistent_gnt;
	struct rb_node *n;
	int ret = 0;
	int segs_to_unmap = 0;

	foreach_grant_safe(persistent_gnt, n, root, node) {
		BUG_ON(persistent_gnt->handle ==
			BLKBACK_INVALID_HANDLE);
		gnttab_set_unmap_op(&unmap[segs_to_unmap],
			(unsigned long) pfn_to_kaddr(page_to_pfn(
				persistent_gnt->page)),
			GNTMAP_host_map,
			persistent_gnt->handle);

		pages[segs_to_unmap] = persistent_gnt->page;

		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
			!rb_next(&persistent_gnt->node)) {
			ret = gnttab_unmap_refs(unmap, NULL, pages,
				segs_to_unmap);
			BUG_ON(ret);
			free_xenballooned_pages(segs_to_unmap, pages);
			segs_to_unmap = 0;
		}

		rb_erase(&persistent_gnt->node, root);
		kfree(persistent_gnt);
		num--;
	}
	BUG_ON(num != 0);
}

/*
 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
 */
static struct pending_req *alloc_req(void)
{
	struct pending_req *req = NULL;
	unsigned long flags;

	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
	if (!list_empty(&blkbk->pending_free)) {
		req = list_entry(blkbk->pending_free.next, struct pending_req,
				 free_list);
		list_del(&req->free_list);
	}
	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
	return req;
}

/*
 * Return the 'pending_req' structure back to the freepool. We also
 * wake up the thread if it was waiting for a free page.
 */
static void free_req(struct pending_req *req)
{
	unsigned long flags;
	int was_empty;

	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
	was_empty = list_empty(&blkbk->pending_free);
	list_add(&req->free_list, &blkbk->pending_free);
	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
	if (was_empty)
		wake_up(&blkbk->pending_free_wq);
}

/*
 * Routines for managing virtual block devices (vbds).
 */
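/*
 * Check that a request lies within the virtual block device, and that the
 * device is writable when the operation requires it; on success fill in the
 * physical device number and block_device for the request.
 */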
static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
			     int operation)
{
	struct xen_vbd *vbd = &blkif->vbd;
	int rc = -EACCES;

	if ((operation != READ) && vbd->readonly)
		goto out;

	if (likely(req->nr_sects)) {
		blkif_sector_t end = req->sector_number + req->nr_sects;

		if (unlikely(end < req->sector_number))
			goto out;
		if (unlikely(end > vbd_sz(vbd)))
			goto out;
	}

	req->dev  = vbd->pdevice;
	req->bdev = vbd->bdev;
	rc = 0;

 out:
	return rc;
}

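/*
 * The backing device has changed size: record the new size and publish the
 * new "sectors" value (together with the current state) to xenstore so that
 * the front-end can pick it up.
 */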
static void xen_vbd_resize(struct xen_blkif *blkif)
{
	struct xen_vbd *vbd = &blkif->vbd;
	struct xenbus_transaction xbt;
	int err;
	struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
	unsigned long long new_size = vbd_sz(vbd);

	pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n",
		blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
	pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size);
	vbd->size = new_size;
again:
	err = xenbus_transaction_start(&xbt);
	if (err) {
		pr_warn(DRV_PFX "Error starting transaction");
		return;
	}
	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
			    (unsigned long long)vbd_sz(vbd));
	if (err) {
		pr_warn(DRV_PFX "Error writing new size");
		goto abort;
	}
	/*
	 * Write the current state; we will use this to synchronize
	 * the front-end. If the current state is "connected" the
	 * front-end will get the new size information online.
	 */
	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
	if (err) {
		pr_warn(DRV_PFX "Error writing the state");
		goto abort;
	}

	err = xenbus_transaction_end(xbt, 0);
	if (err == -EAGAIN)
		goto again;
	if (err)
		pr_warn(DRV_PFX "Error ending transaction");
	return;
abort:
	xenbus_transaction_end(xbt, 1);
}

/*
 * Notification from the guest OS.
 */
static void blkif_notify_work(struct xen_blkif *blkif)
{
	blkif->waiting_reqs = 1;
	wake_up(&blkif->wq);
}

irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
{
	blkif_notify_work(dev_id);
	return IRQ_HANDLED;
}

/*
 * SCHEDULER FUNCTIONS
 */

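/* Dump the per-interface I/O counters and reset them for the next interval. */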
static void print_stats(struct xen_blkif *blkif)
{
	pr_info("xen-blkback (%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
		 "  |  ds %4llu\n",
		 current->comm, blkif->st_oo_req,
		 blkif->st_rd_req, blkif->st_wr_req,
		 blkif->st_f_req, blkif->st_ds_req);
	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
	blkif->st_rd_req = 0;
	blkif->st_wr_req = 0;
	blkif->st_oo_req = 0;
	blkif->st_ds_req = 0;
}

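/*
 * Main loop of the per-device kernel thread: wait until the front-end has
 * queued requests and a free pending_req is available, then drain the ring.
 * On exit, tear down any persistent grants still mapped for this interface.
 */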
int xen_blkif_schedule(void *arg)
{
	struct xen_blkif *blkif = arg;
	struct xen_vbd *vbd = &blkif->vbd;

	xen_blkif_get(blkif);

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;
		if (unlikely(vbd->size != vbd_sz(vbd)))
			xen_vbd_resize(blkif);

		wait_event_interruptible(
			blkif->wq,
			blkif->waiting_reqs || kthread_should_stop());
		wait_event_interruptible(
			blkbk->pending_free_wq,
			!list_empty(&blkbk->pending_free) ||
			kthread_should_stop());

		blkif->waiting_reqs = 0;
		smp_mb(); /* clear flag *before* checking for work */

		if (do_block_io_op(blkif))
			blkif->waiting_reqs = 1;

		if (log_stats && time_after(jiffies, blkif->st_print))
			print_stats(blkif);
	}

	/* Free all persistent grant pages */
	if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
		free_persistent_gnts(&blkif->persistent_gnts,
			blkif->persistent_gnt_c);

	BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
	blkif->persistent_gnt_c = 0;

	if (log_stats)
		print_stats(blkif);

	blkif->xenblkd = NULL;
	xen_blkif_put(blkif);

	return 0;
}

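/*
 * Per-segment buffer description: byte offset within the page and length
 * in sectors.
 */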
struct seg_buf {
	unsigned int offset;
	unsigned int nsec;
};
/*
 * Unmap the grant references, and also remove the M2P over-rides
 * used in the 'pending_req'.
 */
static void xen_blkbk_unmap(struct pending_req *req)
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int i, invcount = 0;
	grant_handle_t handle;
	int ret;

	for (i = 0; i < req->nr_pages; i++) {
		if (!test_bit(i, req->unmap_seg))
			continue;
		handle = pending_handle(req, i);
		if (handle == BLKBACK_INVALID_HANDLE)
			continue;
		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
				    GNTMAP_host_map, handle);
		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
		pages[invcount] = virt_to_page(vaddr(req, i));
		invcount++;
	}

	ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
	BUG_ON(ret);
}

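/*
 * Map the grant references of a request into this domain. When persistent
 * grants are in use, already-mapped grants are reused and new ones are added
 * to the rb-tree until max_mapped_grant_pages() is reached; any remaining
 * segments fall back to regular per-request grant mappings.
 */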
static int xen_blkbk_map(struct blkif_request *req,
			 struct pending_req *pending_req,
			 struct seg_buf seg[],
			 struct page *pages[])
{
	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct persistent_gnt *persistent_gnt = NULL;
	struct xen_blkif *blkif = pending_req->blkif;
	phys_addr_t addr = 0;
	int i, j;
	bool new_map;
	int nseg = req->u.rw.nr_segments;
	int segs_to_map = 0;
	int ret = 0;
	int use_persistent_gnts;

	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);

	BUG_ON(blkif->persistent_gnt_c >
		   max_mapped_grant_pages(pending_req->blkif->blk_protocol));

	/*
	 * Fill out preq.nr_sects with the proper number of sectors, and set
	 * up map[..] with the PFN of the page in our domain together with the
	 * corresponding grant reference for each page.
	 */
	for (i = 0; i < nseg; i++) {
		uint32_t flags;

		if (use_persistent_gnts)
			persistent_gnt = get_persistent_gnt(
				&blkif->persistent_gnts,
				req->u.rw.seg[i].gref);

		if (persistent_gnt) {
			/*
			 * We are using persistent grants and
			 * the grant is already mapped
			 */
			new_map = false;
		} else if (use_persistent_gnts &&
			   blkif->persistent_gnt_c <
			   max_mapped_grant_pages(blkif->blk_protocol)) {
			/*
			 * We are using persistent grants, the grant is
			 * not mapped but we have room for it
			 */
			new_map = true;
			persistent_gnt = kmalloc(
				sizeof(struct persistent_gnt),
				GFP_KERNEL);
			if (!persistent_gnt)
				return -ENOMEM;
			if (alloc_xenballooned_pages(1, &persistent_gnt->page,
			    false)) {
				kfree(persistent_gnt);
				return -ENOMEM;
			}
			persistent_gnt->gnt = req->u.rw.seg[i].gref;
			persistent_gnt->handle = BLKBACK_INVALID_HANDLE;

			pages_to_gnt[segs_to_map] =
				persistent_gnt->page;
			addr = (unsigned long) pfn_to_kaddr(
				page_to_pfn(persistent_gnt->page));

			add_persistent_gnt(&blkif->persistent_gnts,
				persistent_gnt);
			blkif->persistent_gnt_c++;
			pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
				 persistent_gnt->gnt, blkif->persistent_gnt_c,
				 max_mapped_grant_pages(blkif->blk_protocol));
		} else {
			/*
			 * We are either using persistent grants and
			 * hit the maximum limit of grants mapped,
			 * or we are not using persistent grants.
			 */
			if (use_persistent_gnts &&
				!blkif->vbd.overflow_max_grants) {
				blkif->vbd.overflow_max_grants = 1;
				pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
					 blkif->domid, blkif->vbd.handle);
			}
			new_map = true;
			pages[i] = blkbk->pending_page(pending_req, i);
			addr = vaddr(pending_req, i);
			pages_to_gnt[segs_to_map] =
				blkbk->pending_page(pending_req, i);
		}

		if (persistent_gnt) {
			pages[i] = persistent_gnt->page;
			persistent_gnts[i] = persistent_gnt;
		} else {
			persistent_gnts[i] = NULL;
		}

		if (new_map) {
			flags = GNTMAP_host_map;
			if (!persistent_gnt &&
			    (pending_req->operation != BLKIF_OP_READ))
				flags |= GNTMAP_readonly;
			gnttab_set_map_op(&map[segs_to_map++], addr,
					  flags, req->u.rw.seg[i].gref,
					  blkif->domid);
		}
	}

	if (segs_to_map) {
		ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
		BUG_ON(ret);
	}

	/*
	 * Now swizzle the MFN in our domain with the MFN from the other domain
	 * so that when we access vaddr(pending_req,i) it has the contents of
	 * the page from the other domain.
	 */
	bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
	for (i = 0, j = 0; i < nseg; i++) {
		if (!persistent_gnts[i] ||
		    persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
			/* This is a newly mapped grant */
			BUG_ON(j >= segs_to_map);
			if (unlikely(map[j].status != 0)) {
				pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
				map[j].handle = BLKBACK_INVALID_HANDLE;
				ret |= 1;
				if (persistent_gnts[i]) {
					rb_erase(&persistent_gnts[i]->node,
						 &blkif->persistent_gnts);
					blkif->persistent_gnt_c--;
					kfree(persistent_gnts[i]);
					persistent_gnts[i] = NULL;
				}
			}
		}
		if (persistent_gnts[i]) {
			if (persistent_gnts[i]->handle ==
			    BLKBACK_INVALID_HANDLE) {
				/*
				 * If this is a new persistent grant
				 * save the handle
				 */
				persistent_gnts[i]->handle = map[j++].handle;
			}
			pending_handle(pending_req, i) =
				persistent_gnts[i]->handle;

			if (ret)
				continue;
		} else {
			pending_handle(pending_req, i) = map[j++].handle;
			bitmap_set(pending_req->unmap_seg, i, 1);

			if (ret)
				continue;
		}
		seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
	}
	return ret;
}

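/*
 * Handle a BLKIF_OP_DISCARD request by forwarding it to the block layer as
 * a (possibly secure) discard, then queue the response on the ring.
 */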
static int dispatch_discard_io(struct xen_blkif *blkif,
				struct blkif_request *req)
{
	int err = 0;
	int status = BLKIF_RSP_OKAY;
	struct block_device *bdev = blkif->vbd.bdev;
	unsigned long secure;

	blkif->st_ds_req++;

	xen_blkif_get(blkif);
	secure = (blkif->vbd.discard_secure &&
		 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
		 BLKDEV_DISCARD_SECURE : 0;

	err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
				   req->u.discard.nr_sectors,
				   GFP_KERNEL, secure);

	if (err == -EOPNOTSUPP) {
		pr_debug(DRV_PFX "discard op failed, not supported\n");
		status = BLKIF_RSP_EOPNOTSUPP;
	} else if (err)
		status = BLKIF_RSP_ERROR;

	make_response(blkif, req->u.discard.id, req->operation, status);
	xen_blkif_put(blkif);
	return err;
}

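/* Reject any operation this backend does not understand. */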
static int dispatch_other_io(struct xen_blkif *blkif,
			     struct blkif_request *req,
			     struct pending_req *pending_req)
{
	free_req(pending_req);
	make_response(blkif, req->u.other.id, req->operation,
		      BLKIF_RSP_EOPNOTSUPP);
	return -EIO;
}

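/*
 * Wait until all in-flight I/O for this interface has completed. Used to
 * emulate a write barrier by draining the queue before issuing the flush.
 */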
static void xen_blk_drain_io(struct xen_blkif *blkif)
{
	atomic_set(&blkif->drain, 1);
	do {
		/* The initial value is one, and one refcnt taken at the
		 * start of the xen_blkif_schedule thread. */
		if (atomic_read(&blkif->refcnt) <= 2)
			break;
		wait_for_completion_interruptible_timeout(
				&blkif->drain_complete, HZ);

		if (!atomic_read(&blkif->drain))
			break;
	} while (!kthread_should_stop());
	atomic_set(&blkif->drain, 0);
}

/*
 * Completion callback on the bio's. Called as bh->b_end_io()
 */

static void __end_block_io_op(struct pending_req *pending_req, int error)
{
	/* An error fails the entire request. */
	if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
	    (error == -EOPNOTSUPP)) {
		pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
	} else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
		    (error == -EOPNOTSUPP)) {
		pr_debug(DRV_PFX "write barrier op failed, not supported\n");
		xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
	} else if (error) {
		pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
			 " error=%d\n", error);
		pending_req->status = BLKIF_RSP_ERROR;
	}

	/*
	 * If all of the bio's have completed it is time to unmap
	 * the grant references associated with 'request' and provide
	 * the proper response on the ring.
	 */
	if (atomic_dec_and_test(&pending_req->pendcnt)) {
		xen_blkbk_unmap(pending_req);
		make_response(pending_req->blkif, pending_req->id,
			      pending_req->operation, pending_req->status);
		xen_blkif_put(pending_req->blkif);
		if (atomic_read(&pending_req->blkif->refcnt) <= 2) {
			if (atomic_read(&pending_req->blkif->drain))
				complete(&pending_req->blkif->drain_complete);
		}
		free_req(pending_req);
	}
}

/*
 * bio callback.
 */
static void end_block_io_op(struct bio *bio, int error)
{
	__end_block_io_op(bio->bi_private, error);
	bio_put(bio);
}



/*
 * Copy a 'struct blkif_request' from the ring buffer (which has the sectors
 * we want, the number of them, grant references, etc.) and hand it over to
 * the block API for the proper block disk.
 */
static int
__do_block_io_op(struct xen_blkif *blkif)
{
	union blkif_back_rings *blk_rings = &blkif->blk_rings;
	struct blkif_request req;
	struct pending_req *pending_req;
	RING_IDX rc, rp;
	int more_to_do = 0;

	rc = blk_rings->common.req_cons;
	rp = blk_rings->common.sring->req_prod;
	rmb(); /* Ensure we see queued requests up to 'rp'. */

	while (rc != rp) {

		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
			break;

		if (kthread_should_stop()) {
			more_to_do = 1;
			break;
		}

		pending_req = alloc_req();
		if (NULL == pending_req) {
			blkif->st_oo_req++;
			more_to_do = 1;
			break;
		}

		switch (blkif->blk_protocol) {
		case BLKIF_PROTOCOL_NATIVE:
			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
			break;
		case BLKIF_PROTOCOL_X86_32:
			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
			break;
		case BLKIF_PROTOCOL_X86_64:
			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
			break;
		default:
			BUG();
		}
		blk_rings->common.req_cons = ++rc; /* before make_response() */

		/* Apply all sanity checks to /private copy/ of request. */
		barrier();

		switch (req.operation) {
		case BLKIF_OP_READ:
		case BLKIF_OP_WRITE:
		case BLKIF_OP_WRITE_BARRIER:
		case BLKIF_OP_FLUSH_DISKCACHE:
			if (dispatch_rw_block_io(blkif, &req, pending_req))
				goto done;
			break;
		case BLKIF_OP_DISCARD:
			free_req(pending_req);
			if (dispatch_discard_io(blkif, &req))
				goto done;
			break;
		default:
			if (dispatch_other_io(blkif, &req, pending_req))
				goto done;
			break;
		}

		/* Yield point for this unbounded loop. */
		cond_resched();
	}
done:
	return more_to_do;
}

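/*
 * Process requests from the ring, re-checking for newly queued work with
 * RING_FINAL_CHECK_FOR_REQUESTS before the caller goes back to sleep.
 */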
static int
do_block_io_op(struct xen_blkif *blkif)
{
	union blkif_back_rings *blk_rings = &blkif->blk_rings;
	int more_to_do;

	do {
		more_to_do = __do_block_io_op(blkif);
		if (more_to_do)
			break;

		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
	} while (more_to_do);

	return more_to_do;
}
/*
 * Transmute the 'struct blkif_request' into a proper 'struct bio' (or
 * several) and call 'submit_bio' to pass them to the underlying storage.
 */
static int dispatch_rw_block_io(struct xen_blkif *blkif,
				struct blkif_request *req,
				struct pending_req *pending_req)
{
	struct phys_req preq;
	struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int nseg;
	struct bio *bio = NULL;
	struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	int i, nbio = 0;
	int operation;
	struct blk_plug plug;
	bool drain = false;
	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];

	switch (req->operation) {
	case BLKIF_OP_READ:
		blkif->st_rd_req++;
		operation = READ;
		break;
	case BLKIF_OP_WRITE:
		blkif->st_wr_req++;
		operation = WRITE_ODIRECT;
		break;
	case BLKIF_OP_WRITE_BARRIER:
		drain = true;
	case BLKIF_OP_FLUSH_DISKCACHE:
		blkif->st_f_req++;
		operation = WRITE_FLUSH;
		break;
	default:
		operation = 0; /* make gcc happy */
		goto fail_response;
		break;
	}

	/* Check that the number of segments is sane. */
	nseg = req->u.rw.nr_segments;

	if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
		pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
			 nseg);
		/* Haven't submitted any bio's yet. */
		goto fail_response;
	}

	preq.sector_number = req->u.rw.sector_number;
	preq.nr_sects      = 0;

	pending_req->blkif     = blkif;
	pending_req->id        = req->u.rw.id;
	pending_req->operation = req->operation;
	pending_req->status    = BLKIF_RSP_OKAY;
	pending_req->nr_pages  = nseg;

	for (i = 0; i < nseg; i++) {
		seg[i].nsec = req->u.rw.seg[i].last_sect -
			req->u.rw.seg[i].first_sect + 1;
		if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
		    (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
			goto fail_response;
		preq.nr_sects += seg[i].nsec;

	}

	if (xen_vbd_translate(&preq, blkif, operation) != 0) {
		pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n",
			 operation == READ ? "read" : "write",
			 preq.sector_number,
			 preq.sector_number + preq.nr_sects,
			 blkif->vbd.pdevice);
		goto fail_response;
	}

	/*
	 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
	 * is set there.
	 */
	for (i = 0; i < nseg; i++) {
		if (((int)preq.sector_number|(int)seg[i].nsec) &
		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
			pr_debug(DRV_PFX "Misaligned I/O request from domain %d",
				 blkif->domid);
			goto fail_response;
		}
	}

	/* Wait on all outstanding I/O's and once that has been completed
	 * issue the WRITE_FLUSH.
	 */
	if (drain)
		xen_blk_drain_io(pending_req->blkif);

	/*
	 * If we have failed at this point, we need to undo the M2P override,
	 * set gnttab_set_unmap_op on all of the grant references and perform
	 * the hypercall to unmap the grants - that is all done in
	 * xen_blkbk_unmap.
	 */
	if (xen_blkbk_map(req, pending_req, seg, pages))
		goto fail_flush;

	/*
	 * The corresponding xen_blkif_put is done in __end_block_io_op, once
	 * all bio's for this request have completed.
	 */
	xen_blkif_get(blkif);

	for (i = 0; i < nseg; i++) {
		while ((bio == NULL) ||
		       (bio_add_page(bio,
				     pages[i],
				     seg[i].nsec << 9,
				     seg[i].offset) == 0)) {

			bio = bio_alloc(GFP_KERNEL, nseg-i);
			if (unlikely(bio == NULL))
				goto fail_put_bio;

			biolist[nbio++] = bio;
			bio->bi_bdev    = preq.bdev;
			bio->bi_private = pending_req;
			bio->bi_end_io  = end_block_io_op;
			bio->bi_sector  = preq.sector_number;
		}

		preq.sector_number += seg[i].nsec;
	}

	/* This will be hit if the operation was a flush with no data segments. */
	if (!bio) {
		BUG_ON(operation != WRITE_FLUSH);

		bio = bio_alloc(GFP_KERNEL, 0);
		if (unlikely(bio == NULL))
			goto fail_put_bio;

		biolist[nbio++] = bio;
		bio->bi_bdev    = preq.bdev;
		bio->bi_private = pending_req;
		bio->bi_end_io  = end_block_io_op;
	}

	atomic_set(&pending_req->pendcnt, nbio);
	blk_start_plug(&plug);

	for (i = 0; i < nbio; i++)
		submit_bio(operation, biolist[i]);

	/* Let the I/Os go.. */
	blk_finish_plug(&plug);

	if (operation == READ)
		blkif->st_rd_sect += preq.nr_sects;
	else if (operation & WRITE)
		blkif->st_wr_sect += preq.nr_sects;

	return 0;

 fail_flush:
	xen_blkbk_unmap(pending_req);
 fail_response:
	/* Haven't submitted any bio's yet. */
	make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
	free_req(pending_req);
	msleep(1); /* back off a bit */
	return -EIO;

 fail_put_bio:
	for (i = 0; i < nbio; i++)
		bio_put(biolist[i]);
	atomic_set(&pending_req->pendcnt, 1);
	__end_block_io_op(pending_req, -EINVAL);
	msleep(1); /* back off a bit */
	return -EIO;
}



/*
 * Put a response on the ring on how the operation fared.
 */
static void make_response(struct xen_blkif *blkif, u64 id,
			  unsigned short op, int st)
{
	struct blkif_response  resp;
	unsigned long     flags;
	union blkif_back_rings *blk_rings = &blkif->blk_rings;
	int notify;

	resp.id        = id;
	resp.operation = op;
	resp.status    = st;

	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
	/* Place on the response ring for the relevant domain. */
	switch (blkif->blk_protocol) {
	case BLKIF_PROTOCOL_NATIVE:
		memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	case BLKIF_PROTOCOL_X86_32:
		memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	case BLKIF_PROTOCOL_X86_64:
		memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	default:
		BUG();
	}
	blk_rings->common.rsp_prod_pvt++;
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
	if (notify)
		notify_remote_via_irq(blkif->irq);
}

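/*
 * Module initialisation: allocate the pool of pending_reqs, the pages and
 * grant handles that back them, then register the interface and the xenbus
 * backend.
 */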
static int __init xen_blkif_init(void)
{
	int i, mmap_pages;
	int rc = 0;

	if (!xen_domain())
		return -ENODEV;

	blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
	if (!blkbk) {
		pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
		return -ENOMEM;
	}

	mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;

	blkbk->pending_reqs          = kzalloc(sizeof(blkbk->pending_reqs[0]) *
					xen_blkif_reqs, GFP_KERNEL);
	blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
					mmap_pages, GFP_KERNEL);
	blkbk->pending_pages         = kzalloc(sizeof(blkbk->pending_pages[0]) *
					mmap_pages, GFP_KERNEL);

	if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
	    !blkbk->pending_pages) {
		rc = -ENOMEM;
		goto out_of_memory;
	}

	for (i = 0; i < mmap_pages; i++) {
		blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
		blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
		if (blkbk->pending_pages[i] == NULL) {
			rc = -ENOMEM;
			goto out_of_memory;
		}
	}
	rc = xen_blkif_interface_init();
	if (rc)
		goto failed_init;

	INIT_LIST_HEAD(&blkbk->pending_free);
	spin_lock_init(&blkbk->pending_free_lock);
	init_waitqueue_head(&blkbk->pending_free_wq);

	for (i = 0; i < xen_blkif_reqs; i++)
		list_add_tail(&blkbk->pending_reqs[i].free_list,
			      &blkbk->pending_free);

	rc = xen_blkif_xenbus_init();
	if (rc)
		goto failed_init;

	return 0;

 out_of_memory:
	pr_alert(DRV_PFX "%s: out of memory\n", __func__);
 failed_init:
	kfree(blkbk->pending_reqs);
	kfree(blkbk->pending_grant_handles);
	if (blkbk->pending_pages) {
		for (i = 0; i < mmap_pages; i++) {
			if (blkbk->pending_pages[i])
				__free_page(blkbk->pending_pages[i]);
		}
		kfree(blkbk->pending_pages);
	}
	kfree(blkbk);
	blkbk = NULL;
	return rc;
}

module_init(xen_blkif_init);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("xen-backend:vbd");