Merge branch 'for-4.4/reservations' of git://git.kernel.dk/linux-block
author: Linus Torvalds <torvalds@linux-foundation.org>
Thu, 5 Nov 2015 05:01:27 +0000 (21:01 -0800)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Thu, 5 Nov 2015 05:01:27 +0000 (21:01 -0800)
Pull block reservation support from Jens Axboe:
 "This adds support for persistent reservations, both at the core level,
  as well as for sd and NVMe"

[ Background from the docs: "Persistent Reservations allow restricting
  access to block devices to specific initiators in a shared storage
  setup.  All implementations are expected to ensure the reservations
  survive a power loss and cover all connections in a multi path
  environment" ]

* 'for-4.4/reservations' of git://git.kernel.dk/linux-block:
  NVMe: Precedence error in nvme_pr_clear()
  nvme: add missing endianess annotations in nvme_pr_command
  NVMe: Add persistent reservation ops
  sd: implement the Persistent Reservation API
  block: add an API for Persistent Reservations
  block: cleanup blkdev_ioctl

1  2 
drivers/nvme/host/pci.c
drivers/scsi/sd.c
include/linux/blkdev.h

diff --combined drivers/nvme/host/pci.c
index 381d2a0aa4615222a6b00c6f5eca6361a5fb0ed6,9a12d5a325551b6e1217bf788fc640e3bd2b8918..e878590e71b68ad3863ddc83006e76420996a4f4
  #include <linux/slab.h>
  #include <linux/t10-pi.h>
  #include <linux/types.h>
+ #include <linux/pr.h>
  #include <scsi/sg.h>
  #include <asm-generic/io-64-nonatomic-lo-hi.h>
+ #include <asm/unaligned.h>
  
  #include <uapi/linux/nvme_ioctl.h>
  #include "nvme.h"
@@@ -538,7 -540,7 +540,7 @@@ static void nvme_dif_remap(struct reque
        virt = bip_get_seed(bip);
        phys = nvme_block_nr(ns, blk_rq_pos(req));
        nlb = (blk_rq_bytes(req) >> ns->lba_shift);
 -      ts = ns->disk->integrity->tuple_size;
 +      ts = ns->disk->queue->integrity.tuple_size;
  
        for (i = 0; i < nlb; i++, virt++, phys++) {
                pi = (struct t10_pi_tuple *)p;
        kunmap_atomic(pmap);
  }
  
 -static int nvme_noop_verify(struct blk_integrity_iter *iter)
 -{
 -      return 0;
 -}
 -
 -static int nvme_noop_generate(struct blk_integrity_iter *iter)
 -{
 -      return 0;
 -}
 -
 -struct blk_integrity nvme_meta_noop = {
 -      .name                   = "NVME_META_NOOP",
 -      .generate_fn            = nvme_noop_generate,
 -      .verify_fn              = nvme_noop_verify,
 -};
 -
  static void nvme_init_integrity(struct nvme_ns *ns)
  {
        struct blk_integrity integrity;
  
        switch (ns->pi_type) {
        case NVME_NS_DPS_PI_TYPE3:
 -              integrity = t10_pi_type3_crc;
 +              integrity.profile = &t10_pi_type3_crc;
                break;
        case NVME_NS_DPS_PI_TYPE1:
        case NVME_NS_DPS_PI_TYPE2:
 -              integrity = t10_pi_type1_crc;
 +              integrity.profile = &t10_pi_type1_crc;
                break;
        default:
 -              integrity = nvme_meta_noop;
 +              integrity.profile = NULL;
                break;
        }
        integrity.tuple_size = ns->ms;
@@@ -591,7 -609,6 +593,7 @@@ static void req_completion(struct nvme_
        struct request *req = iod_get_private(iod);
        struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
        u16 status = le16_to_cpup(&cqe->status) >> 1;
 +      bool requeue = false;
        int error = 0;
  
        if (unlikely(status)) {
                    && (jiffies - req->start_time) < req->timeout) {
                        unsigned long flags;
  
 +                      requeue = true;
                        blk_mq_requeue_request(req);
                        spin_lock_irqsave(req->q->queue_lock, flags);
                        if (!blk_queue_stopped(req->q))
                                blk_mq_kick_requeue_list(req->q);
                        spin_unlock_irqrestore(req->q->queue_lock, flags);
 -                      return;
 +                      goto release_iod;
                }
  
                if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
                        "completing aborted command with status:%04x\n",
                        error);
  
 +release_iod:
        if (iod->nents) {
                dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
                        rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
        }
        nvme_free_iod(nvmeq->dev, iod);
  
 -      blk_mq_complete_request(req, error);
 +      if (likely(!requeue))
 +              blk_mq_complete_request(req, error);
  }
  
  /* length is in bytes.  gfp flags indicates whether we may sleep. */
@@@ -1936,9 -1950,6 +1938,9 @@@ static void nvme_free_ns(struct kref *k
  {
        struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
  
 +      if (ns->type == NVME_NS_LIGHTNVM)
 +              nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
 +
        spin_lock(&dev_list_lock);
        ns->disk->private_data = NULL;
        spin_unlock(&dev_list_lock);
@@@ -2008,16 -2019,6 +2010,16 @@@ static int nvme_revalidate_disk(struct 
                return -ENODEV;
        }
  
 +      if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
 +              if (nvme_nvm_register(ns->queue, disk->disk_name)) {
 +                      dev_warn(dev->dev,
 +                              "%s: LightNVM init failure\n", __func__);
 +                      kfree(id);
 +                      return -ENODEV;
 +              }
 +              ns->type = NVME_NS_LIGHTNVM;
 +      }
 +
        old_ms = ns->ms;
        lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
        ns->lba_shift = id->lbaf[lbaf].ds;
        pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
                                        id->dps & NVME_NS_DPS_PI_MASK : 0;
  
 +      blk_mq_freeze_queue(disk->queue);
        if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
                                ns->ms != old_ms ||
                                bs != queue_logical_block_size(disk->queue) ||
        ns->pi_type = pi_type;
        blk_queue_logical_block_size(ns->queue, bs);
  
 -      if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) &&
 -                                                              !ns->ext)
 +      if (ns->ms && !ns->ext)
                nvme_init_integrity(ns);
  
 -      if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
 +      if ((ns->ms && !(ns->ms == 8 && ns->pi_type) &&
 +                                              !blk_get_integrity(disk)) ||
 +                                              ns->type == NVME_NS_LIGHTNVM)
                set_capacity(disk, 0);
        else
                set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
  
        if (dev->oncs & NVME_CTRL_ONCS_DSM)
                nvme_config_discard(ns);
 +      blk_mq_unfreeze_queue(disk->queue);
  
        kfree(id);
        return 0;
  }
  
+ /*
+  * Translate a block-layer persistent reservation type into the numeric
+  * type code used by this driver.  Callers shift the result left by 8
+  * before OR-ing it into cdw10.  Unrecognised types map to 0.
+  */
+ static char nvme_pr_type(enum pr_type type)
+ {
+       switch (type) {
+       case PR_WRITE_EXCLUSIVE:
+               return 1;
+       case PR_EXCLUSIVE_ACCESS:
+               return 2;
+       case PR_WRITE_EXCLUSIVE_REG_ONLY:
+               return 3;
+       case PR_EXCLUSIVE_ACCESS_REG_ONLY:
+               return 4;
+       case PR_WRITE_EXCLUSIVE_ALL_REGS:
+               return 5;
+       case PR_EXCLUSIVE_ACCESS_ALL_REGS:
+               return 6;
+       default:
+               return 0;
+       }
+ };
+ /*
+  * Build and synchronously submit one reservation command for the namespace
+  * behind @bdev.
+  *
+  * @cdw10:  value for command dword 10 (converted to little endian here)
+  * @key:    stored little-endian in bytes 0-7 of the 16-byte payload
+  * @sa_key: stored little-endian in bytes 8-15 of the 16-byte payload
+  * @op:     opcode placed in the command
+  *
+  * Returns whatever nvme_submit_sync_cmd() returns.
+  */
+ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
+                               u64 key, u64 sa_key, u8 op)
+ {
+       struct nvme_ns *ns = bdev->bd_disk->private_data;
+       struct nvme_command c;
+       u8 data[16] = { 0, };
+       /* byte buffer, so use the unaligned little-endian stores */
+       put_unaligned_le64(key, &data[0]);
+       put_unaligned_le64(sa_key, &data[8]);
+       /* start from an all-zero command so unused fields stay clear */
+       memset(&c, 0, sizeof(c));
+       c.common.opcode = op;
+       c.common.nsid = cpu_to_le32(ns->ns_id);
+       c.common.cdw10[0] = cpu_to_le32(cdw10);
+       return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+ }
+ /*
+  * Register a reservation key, or replace the existing one when @old is
+  * non-zero.
+  *
+  * Only PR_FL_IGNORE_KEY is accepted in @flags; any other bit yields
+  * -EOPNOTSUPP.  Otherwise returns the result of nvme_pr_command().
+  */
+ static int nvme_pr_register(struct block_device *bdev, u64 old,
+               u64 new, unsigned flags)
+ {
+       u32 cdw10;
+       if (flags & ~PR_FL_IGNORE_KEY)
+               return -EOPNOTSUPP;
+       /* action: 2 when an old key is given, 0 otherwise */
+       cdw10 = old ? 2 : 0;
+       cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
+       /*
+        * Use unsigned constants: "1 << 31" shifts into the sign bit of a
+        * signed int, which ISO C leaves undefined.
+        */
+       cdw10 |= (1U << 30) | (1U << 31); /* PTPL=1 */
+       return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
+ }
+ /*
+  * Acquire a reservation of the given @type using @key.
+  *
+  * Only PR_FL_IGNORE_KEY is accepted in @flags; any other bit yields
+  * -EOPNOTSUPP.  Otherwise returns the result of nvme_pr_command().
+  */
+ static int nvme_pr_reserve(struct block_device *bdev, u64 key,
+               enum pr_type type, unsigned flags)
+ {
+       u32 cdw10;
+       if (flags & ~PR_FL_IGNORE_KEY)
+               return -EOPNOTSUPP;
+       /* reservation type goes in bits 8 and up; action bits stay 0 */
+       cdw10 = nvme_pr_type(type) << 8;
+       cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
+       return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
+ }
+ /*
+  * Preempt the reservation held under key @new, authenticating with @old.
+  *
+  * The acquire action is 2 when @abort is set and 1 otherwise; the
+  * reservation @type is placed in bits 8 and up of cdw10.  Returns the
+  * result of nvme_pr_command().
+  */
+ static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
+               enum pr_type type, bool abort)
+ {
+       /*
+        * The conditional must be parenthesised: "?:" binds more loosely
+        * than "|", so without the parentheses the whole
+        * "type << 8 | abort" expression becomes the condition and the
+        * reservation type never reaches cdw10.
+        */
+       u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
+       return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
+ }
+ /*
+  * Clear the reservation and all registrations, authenticating with @key.
+  *
+  * Returns the result of nvme_pr_command().
+  */
+ static int nvme_pr_clear(struct block_device *bdev, u64 key)
+ {
+       /*
+        * Action 1 selects "clear".  Bit 3 (ignore existing key) is set
+        * only when the caller supplied no key; setting it for a non-zero
+        * key would tell the controller to skip the very key it was given.
+        */
+       u32 cdw10 = 1 | (key ? 0 : 1 << 3);
+       /*
+        * Clear is an action of the Reservation Release command; under the
+        * Reservation Register opcode, action 1 means "unregister".
+        */
+       return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
+ }
+ /*
+  * Release the reservation of the given @type held under @key.
+  *
+  * Returns the result of nvme_pr_command().
+  */
+ static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+ {
+       /*
+        * The conditional must be parenthesised: "?:" binds more loosely
+        * than "|", so the unparenthesised form evaluates to just 8 or 0
+        * and drops the reservation type.  Bit 3 (ignore existing key) is
+        * set only when no key was supplied.
+        */
+       u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 0 : 1 << 3);
+       return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
+ }
+ /* Persistent reservation callbacks, hooked up through nvme_fops.pr_ops. */
+ static const struct pr_ops nvme_pr_ops = {
+       .pr_register    = nvme_pr_register,
+       .pr_reserve     = nvme_pr_reserve,
+       .pr_release     = nvme_pr_release,
+       .pr_preempt     = nvme_pr_preempt,
+       .pr_clear       = nvme_pr_clear,
+ };
  static const struct block_device_operations nvme_fops = {
        .owner          = THIS_MODULE,
        .ioctl          = nvme_ioctl,
        .release        = nvme_release,
        .getgeo         = nvme_getgeo,
        .revalidate_disk= nvme_revalidate_disk,
+       .pr_ops         = &nvme_pr_ops,
  };
  
  static int nvme_kthread(void *data)
@@@ -2175,19 -2266,17 +2270,19 @@@ static void nvme_alloc_ns(struct nvme_d
                goto out_free_disk;
  
        kref_get(&dev->kref);
 -      add_disk(ns->disk);
 -      if (ns->ms) {
 -              struct block_device *bd = bdget_disk(ns->disk, 0);
 -              if (!bd)
 -                      return;
 -              if (blkdev_get(bd, FMODE_READ, NULL)) {
 -                      bdput(bd);
 -                      return;
 +      if (ns->type != NVME_NS_LIGHTNVM) {
 +              add_disk(ns->disk);
 +              if (ns->ms) {
 +                      struct block_device *bd = bdget_disk(ns->disk, 0);
 +                      if (!bd)
 +                              return;
 +                      if (blkdev_get(bd, FMODE_READ, NULL)) {
 +                              bdput(bd);
 +                              return;
 +                      }
 +                      blkdev_reread_part(bd);
 +                      blkdev_put(bd, FMODE_READ);
                }
 -              blkdev_reread_part(bd);
 -              blkdev_put(bd, FMODE_READ);
        }
        return;
   out_free_disk:
@@@ -2414,8 -2503,11 +2509,8 @@@ static void nvme_ns_remove(struct nvme_
  
        if (kill)
                blk_set_queue_dying(ns->queue);
 -      if (ns->disk->flags & GENHD_FL_UP) {
 -              if (blk_get_integrity(ns->disk))
 -                      blk_integrity_unregister(ns->disk);
 +      if (ns->disk->flags & GENHD_FL_UP)
                del_gendisk(ns->disk);
 -      }
        if (kill || !blk_queue_dying(ns->queue)) {
                blk_mq_abort_requeue_list(ns->queue);
                blk_cleanup_queue(ns->queue);
diff --combined drivers/scsi/sd.c
index 9e85211ea1d1560a6709d689a014e47bcb29e5d8,a1eeb202160ffa55545780e26cf65516315452fd..5e170a6809fde2fe3c8c6ff51d382c2a570485ea
@@@ -51,6 -51,7 +51,7 @@@
  #include <linux/async.h>
  #include <linux/slab.h>
  #include <linux/pm_runtime.h>
+ #include <linux/pr.h>
  #include <asm/uaccess.h>
  #include <asm/unaligned.h>
  
@@@ -1535,6 -1536,100 +1536,100 @@@ static int sd_compat_ioctl(struct block
  }
  #endif
  
+ /*
+  * Translate a block-layer persistent reservation type into the type code
+  * that sd_pr_command() stores in byte 2 of the CDB.  Unrecognised types
+  * map to 0.
+  */
+ static char sd_pr_type(enum pr_type type)
+ {
+       switch (type) {
+       case PR_WRITE_EXCLUSIVE:
+               return 0x01;
+       case PR_EXCLUSIVE_ACCESS:
+               return 0x03;
+       case PR_WRITE_EXCLUSIVE_REG_ONLY:
+               return 0x05;
+       case PR_EXCLUSIVE_ACCESS_REG_ONLY:
+               return 0x06;
+       case PR_WRITE_EXCLUSIVE_ALL_REGS:
+               return 0x07;
+       case PR_EXCLUSIVE_ACCESS_ALL_REGS:
+               return 0x08;
+       default:
+               return 0;
+       }
+ };
+ /*
+  * Build and execute one PERSISTENT RESERVE OUT command on the SCSI device
+  * behind @bdev.
+  *
+  * @sa:     service action, stored in CDB byte 1
+  * @key:    stored big-endian in bytes 0-7 of the 24-byte parameter data
+  * @sa_key: stored big-endian in bytes 8-15 of the parameter data
+  * @type:   reservation type code, stored in CDB byte 2
+  * @flags:  stored in byte 20 of the parameter data
+  *
+  * Returns the raw result of scsi_execute_req().  When the result carries
+  * valid sense data, the failure and the sense header are logged first.
+  */
+ static int sd_pr_command(struct block_device *bdev, u8 sa,
+               u64 key, u64 sa_key, u8 type, u8 flags)
+ {
+       struct scsi_device *sdev = scsi_disk(bdev->bd_disk)->device;
+       struct scsi_sense_hdr sshdr;
+       int result;
+       u8 cmd[16] = { 0, };
+       u8 data[24] = { 0, };
+       cmd[0] = PERSISTENT_RESERVE_OUT;
+       cmd[1] = sa;
+       cmd[2] = type;
+       /* length of the parameter data, big-endian, at CDB bytes 5-8 */
+       put_unaligned_be32(sizeof(data), &cmd[5]);
+       put_unaligned_be64(key, &data[0]);
+       put_unaligned_be64(sa_key, &data[8]);
+       data[20] = flags;
+       result = scsi_execute_req(sdev, cmd, DMA_TO_DEVICE, &data, sizeof(data),
+                       &sshdr, SD_TIMEOUT, SD_MAX_RETRIES, NULL);
+       if ((driver_byte(result) & DRIVER_SENSE) &&
+           (scsi_sense_valid(&sshdr))) {
+               sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result);
+               scsi_print_sense_hdr(sdev, NULL, &sshdr);
+       }
+       return result;
+ }
+ /*
+  * Register @new_key, authenticating with @old_key.
+  *
+  * Only PR_FL_IGNORE_KEY is accepted in @flags; any other bit yields
+  * -EOPNOTSUPP.  With PR_FL_IGNORE_KEY the service action is 0x06,
+  * otherwise 0x00.  The APTPL and ALL_TG_PT bits are always set in the
+  * parameter data flags byte.  Returns the result of sd_pr_command().
+  */
+ static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
+               u32 flags)
+ {
+       if (flags & ~PR_FL_IGNORE_KEY)
+               return -EOPNOTSUPP;
+       return sd_pr_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00,
+                       old_key, new_key, 0,
+                       (1 << 0) /* APTPL */ |
+                       (1 << 2) /* ALL_TG_PT */);
+ }
+ /*
+  * Reserve the device with the given @type using @key (service action
+  * 0x01).
+  *
+  * No flags are supported here: any non-zero @flags yields -EOPNOTSUPP.
+  * Otherwise returns the result of sd_pr_command().
+  */
+ static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
+               u32 flags)
+ {
+       if (flags)
+               return -EOPNOTSUPP;
+       return sd_pr_command(bdev, 0x01, key, 0, sd_pr_type(type), 0);
+ }
+ /*
+  * Release the reservation of the given @type held under @key (service
+  * action 0x02).  Returns the result of sd_pr_command().
+  */
+ static int sd_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+ {
+       return sd_pr_command(bdev, 0x02, key, 0, sd_pr_type(type), 0);
+ }
+ /*
+  * Preempt the reservation held under @new_key, authenticating with
+  * @old_key.  The service action is 0x05 when @abort is set and 0x04
+  * otherwise.  Returns the result of sd_pr_command().
+  */
+ static int sd_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
+               enum pr_type type, bool abort)
+ {
+       return sd_pr_command(bdev, abort ? 0x05 : 0x04, old_key, new_key,
+                            sd_pr_type(type), 0);
+ }
+ /*
+  * Clear the reservation, authenticating with @key (service action 0x03,
+  * no type and no flags).  Returns the result of sd_pr_command().
+  */
+ static int sd_pr_clear(struct block_device *bdev, u64 key)
+ {
+       return sd_pr_command(bdev, 0x03, key, 0, 0, 0);
+ }
+ /* Persistent reservation callbacks, hooked up through sd_fops.pr_ops. */
+ static const struct pr_ops sd_pr_ops = {
+       .pr_register    = sd_pr_register,
+       .pr_reserve     = sd_pr_reserve,
+       .pr_release     = sd_pr_release,
+       .pr_preempt     = sd_pr_preempt,
+       .pr_clear       = sd_pr_clear,
+ };
  static const struct block_device_operations sd_fops = {
        .owner                  = THIS_MODULE,
        .open                   = sd_open,
        .check_events           = sd_check_events,
        .revalidate_disk        = sd_revalidate_disk,
        .unlock_native_capacity = sd_unlock_native_capacity,
+       .pr_ops                 = &sd_pr_ops,
  };
  
  /**
@@@ -3068,6 -3164,7 +3164,6 @@@ static void scsi_disk_release(struct de
        ida_remove(&sd_index_ida, sdkp->index);
        spin_unlock(&sd_index_lock);
  
 -      blk_integrity_unregister(disk);
        disk->private_data = NULL;
        put_disk(disk);
        put_device(&sdkp->device->sdev_gendev);
diff --combined include/linux/blkdev.h
index cf57884db4b7c9f7e32d59b39122a3c0d82b5180,fe25da05e8233c120cda5a1cbffe38f224658c4f..d045ca8487af17eb2aee07a8aed267f5b74e1b83
@@@ -35,6 -35,7 +35,7 @@@ struct sg_io_hdr
  struct bsg_job;
  struct blkcg_gq;
  struct blk_flush_queue;
+ struct pr_ops;
  
  #define BLKDEV_MIN_RQ 4
  #define BLKDEV_MAX_RQ 128     /* Default maximum */
@@@ -369,10 -370,6 +370,10 @@@ struct request_queue 
         */
        struct kobject mq_kobj;
  
 +#ifdef  CONFIG_BLK_DEV_INTEGRITY
 +      struct blk_integrity integrity;
 +#endif        /* CONFIG_BLK_DEV_INTEGRITY */
 +
  #ifdef CONFIG_PM
        struct device           *dev;
        int                     rpm_status;
  #endif
        struct rcu_head         rcu_head;
        wait_queue_head_t       mq_freeze_wq;
 -      struct percpu_ref       mq_usage_counter;
 +      struct percpu_ref       q_usage_counter;
        struct list_head        all_q_node;
  
        struct blk_mq_tag_set   *tag_set;
@@@ -1466,13 -1463,22 +1467,13 @@@ struct blk_integrity_iter 
  
  typedef int (integrity_processing_fn) (struct blk_integrity_iter *);
  
 -struct blk_integrity {
 -      integrity_processing_fn *generate_fn;
 -      integrity_processing_fn *verify_fn;
 -
 -      unsigned short          flags;
 -      unsigned short          tuple_size;
 -      unsigned short          interval;
 -      unsigned short          tag_size;
 -
 -      const char              *name;
 -
 -      struct kobject          kobj;
 +struct blk_integrity_profile {
 +      integrity_processing_fn         *generate_fn;
 +      integrity_processing_fn         *verify_fn;
 +      const char                      *name;
  };
  
 -extern bool blk_integrity_is_initialized(struct gendisk *);
 -extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
 +extern void blk_integrity_register(struct gendisk *, struct blk_integrity *);
  extern void blk_integrity_unregister(struct gendisk *);
  extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
  extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
@@@ -1483,20 -1489,15 +1484,20 @@@ extern bool blk_integrity_merge_rq(stru
  extern bool blk_integrity_merge_bio(struct request_queue *, struct request *,
                                    struct bio *);
  
 -static inline
 -struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
 +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
  {
 -      return bdev->bd_disk->integrity;
 +      struct blk_integrity *bi = &disk->queue->integrity;
 +
 +      if (!bi->profile)
 +              return NULL;
 +
 +      return bi;
  }
  
 -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
 +static inline
 +struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
  {
 -      return disk->integrity;
 +      return blk_get_integrity(bdev->bd_disk);
  }
  
  static inline bool blk_integrity_rq(struct request *rq)
@@@ -1570,9 -1571,10 +1571,9 @@@ static inline int blk_integrity_compare
  {
        return 0;
  }
 -static inline int blk_integrity_register(struct gendisk *d,
 +static inline void blk_integrity_register(struct gendisk *d,
                                         struct blk_integrity *b)
  {
 -      return 0;
  }
  static inline void blk_integrity_unregister(struct gendisk *d)
  {
@@@ -1597,7 -1599,10 +1598,7 @@@ static inline bool blk_integrity_merge_
  {
        return true;
  }
 -static inline bool blk_integrity_is_initialized(struct gendisk *g)
 -{
 -      return 0;
 -}
 +
  static inline bool integrity_req_gap_back_merge(struct request *req,
                                                struct bio *next)
  {
@@@ -1629,6 -1634,7 +1630,7 @@@ struct block_device_operations 
        /* this callback is with swap_lock and sometimes page table lock held */
        void (*swap_slot_free_notify) (struct block_device *, unsigned long);
        struct module *owner;
+       const struct pr_ops *pr_ops;
  };
  
  extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,