dm thin: support discards
[firefly-linux-kernel-4.4.55.git] drivers/md/dm-thin.c
index c3087575fef0ffe5330c261a721abddef3f6771e..703bbbc4f16f52923932cb7e90451df882402cd3 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -23,6 +23,7 @@
 #define DEFERRED_SET_SIZE 64
 #define MAPPING_POOL_SIZE 1024
 #define PRISON_CELLS 1024
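+/*
+ * Commit outstanding metadata roughly once per COMMIT_PERIOD (see
+ * do_waker() below).
+ */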
+#define COMMIT_PERIOD HZ
 
 /*
  * The block size of the device holding pool data must be
 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 
-/*
- * The metadata device is currently limited in size.  The limitation is
- * checked lower down in dm-space-map-metadata, but we also check it here
- * so we can fail early.
- *
- * We have one block of index, which can hold 255 index entries.  Each
- * index entry contains allocation info about 16k metadata blocks.
- */
-#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
-
 /*
  * Device id is restricted to 24 bits.
  */
@@ -72,7 +63,7 @@
  * missed out if the io covers the block. (schedule_copy).
  *
  * iv) insert the new mapping into the origin's btree
- * (process_prepared_mappings).  This act of inserting breaks some
+ * (process_prepared_mapping).  This act of inserting breaks some
  * sharing of btree nodes between the two devices.  Breaking sharing only
  * affects the btree of that specific device.  Btrees for the other
  * devices that share the block never change.  The btree for the origin
@@ -124,7 +115,7 @@ struct cell {
        struct hlist_node list;
        struct bio_prison *prison;
        struct cell_key key;
-       unsigned count;
+       struct bio *holder;
        struct bio_list bios;
 };
 
@@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
  * This may block if a new cell needs allocating.  You must ensure that
  * cells will be unlocked even if the calling thread is blocked.
  *
- * Returns the number of entries in the cell prior to the new addition
- * or < 0 on failure.
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
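+ *
+ * A typical caller pattern (as in process_discard() below):
+ *
+ *	build_virtual_key(tc->td, block, &key);
+ *	if (bio_detain(tc->pool->prison, &key, bio, &cell))
+ *		return;		(someone else holds the cell; bio is queued in it)
+ *	... we are now the holder, and must eventually release the cell ...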
  */
 static int bio_detain(struct bio_prison *prison, struct cell_key *key,
                      struct bio *inmate, struct cell **ref)
 {
-       int r;
+       int r = 1;
        unsigned long flags;
        uint32_t hash = hash_key(prison, key);
-       struct cell *uninitialized_var(cell), *cell2 = NULL;
+       struct cell *cell, *cell2;
 
        BUG_ON(hash > prison->nr_buckets);
 
        spin_lock_irqsave(&prison->lock, flags);
+
        cell = __search_bucket(prison->cells + hash, key);
+       if (cell) {
+               bio_list_add(&cell->bios, inmate);
+               goto out;
+       }
 
-       if (!cell) {
-               /*
-                * Allocate a new cell
-                */
-               spin_unlock_irqrestore(&prison->lock, flags);
-               cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
-               spin_lock_irqsave(&prison->lock, flags);
+       /*
+        * Allocate a new cell
+        */
+       spin_unlock_irqrestore(&prison->lock, flags);
+       cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
+       spin_lock_irqsave(&prison->lock, flags);
 
-               /*
-                * We've been unlocked, so we have to double check that
-                * nobody else has inserted this cell in the meantime.
-                */
-               cell = __search_bucket(prison->cells + hash, key);
+       /*
+        * We've been unlocked, so we have to double check that
+        * nobody else has inserted this cell in the meantime.
+        */
+       cell = __search_bucket(prison->cells + hash, key);
+       if (cell) {
+               mempool_free(cell2, prison->cell_pool);
+               bio_list_add(&cell->bios, inmate);
+               goto out;
+       }
 
-               if (!cell) {
-                       cell = cell2;
-                       cell2 = NULL;
+       /*
+        * Use new cell.
+        */
+       cell = cell2;
 
-                       cell->prison = prison;
-                       memcpy(&cell->key, key, sizeof(cell->key));
-                       cell->count = 0;
-                       bio_list_init(&cell->bios);
-                       hlist_add_head(&cell->list, prison->cells + hash);
-               }
-       }
+       cell->prison = prison;
+       memcpy(&cell->key, key, sizeof(cell->key));
+       cell->holder = inmate;
+       bio_list_init(&cell->bios);
+       hlist_add_head(&cell->list, prison->cells + hash);
 
-       r = cell->count++;
-       bio_list_add(&cell->bios, inmate);
-       spin_unlock_irqrestore(&prison->lock, flags);
+       r = 0;
 
-       if (cell2)
-               mempool_free(cell2, prison->cell_pool);
+out:
+       spin_unlock_irqrestore(&prison->lock, flags);
 
        *ref = cell;
 
@@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
 
        hlist_del(&cell->list);
 
-       if (inmates)
-               bio_list_merge(inmates, &cell->bios);
+       bio_list_add(inmates, cell->holder);
+       bio_list_merge(inmates, &cell->bios);
 
        mempool_free(cell, prison->cell_pool);
 }
@@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
  * bio may be in the cell.  This function releases the cell, and also does
  * a sanity check.
  */
+static void __cell_release_singleton(struct cell *cell, struct bio *bio)
+{
+       hlist_del(&cell->list);
+       BUG_ON(cell->holder != bio);
+       BUG_ON(!bio_list_empty(&cell->bios));
+       mempool_free(cell, cell->prison->cell_pool);
+}
+
 static void cell_release_singleton(struct cell *cell, struct bio *bio)
 {
-       struct bio_prison *prison = cell->prison;
-       struct bio_list bios;
-       struct bio *b;
        unsigned long flags;
-
-       bio_list_init(&bios);
+       struct bio_prison *prison = cell->prison;
 
        spin_lock_irqsave(&prison->lock, flags);
-       __cell_release(cell, &bios);
+       __cell_release_singleton(cell, bio);
        spin_unlock_irqrestore(&prison->lock, flags);
+}
 
-       b = bio_list_pop(&bios);
-       BUG_ON(b != bio);
-       BUG_ON(!bio_list_empty(&bios));
+/*
+ * Sometimes we don't want the holder, just the additional bios.
+ */
+static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+       struct bio_prison *prison = cell->prison;
+
+       hlist_del(&cell->list);
+       bio_list_merge(inmates, &cell->bios);
+
+       mempool_free(cell, prison->cell_pool);
+}
+
+static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+       unsigned long flags;
+       struct bio_prison *prison = cell->prison;
+
+       spin_lock_irqsave(&prison->lock, flags);
+       __cell_release_no_holder(cell, inmates);
+       spin_unlock_irqrestore(&prison->lock, flags);
 }
 
 static void cell_error(struct cell *cell)
@@ -493,17 +511,21 @@ struct pool {
 
        struct workqueue_struct *wq;
        struct work_struct worker;
+       struct delayed_work waker;
 
        unsigned ref_count;
+       unsigned long last_commit_jiffies;
 
        spinlock_t lock;
        struct bio_list deferred_bios;
        struct bio_list deferred_flush_bios;
        struct list_head prepared_mappings;
+       struct list_head prepared_discards;
 
        struct bio_list retry_on_resume_list;
 
-       struct deferred_set ds; /* FIXME: move to thin_c */
+       struct deferred_set shared_read_ds;
+       struct deferred_set all_io_ds;
 
        struct new_mapping *next_mapping;
        mempool_t *mapping_pool;
@@ -529,6 +551,7 @@ struct pool_c {
  */
 struct thin_c {
        struct dm_dev *pool_dev;
+       struct dm_dev *origin_dev;
        dm_thin_id dev_id;
 
        struct pool *pool;
@@ -597,6 +620,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
 
 /*----------------------------------------------------------------*/
 
+struct endio_hook {
+       struct thin_c *tc;
+       struct deferred_entry *shared_read_entry;
+       struct deferred_entry *all_io_entry;
+       struct new_mapping *overwrite_mapping;
+};
+
 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 {
        struct bio *bio;
@@ -607,7 +637,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
        bio_list_init(master);
 
        while ((bio = bio_list_pop(&bios))) {
-               if (dm_get_mapinfo(bio)->ptr == tc)
+               struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+               if (h->tc == tc)
                        bio_endio(bio, DM_ENDIO_REQUEUE);
                else
                        bio_list_add(master, bio);
@@ -646,14 +677,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
                (bio->bi_sector & pool->offset_mask);
 }
 
-static void remap_and_issue(struct thin_c *tc, struct bio *bio,
-                           dm_block_t block)
+static void remap_to_origin(struct thin_c *tc, struct bio *bio)
+{
+       bio->bi_bdev = tc->origin_dev->bdev;
+}
+
+static void issue(struct thin_c *tc, struct bio *bio)
 {
        struct pool *pool = tc->pool;
        unsigned long flags;
 
-       remap(tc, bio, block);
-
        /*
         * Batch together any FUA/FLUSH bios we find and then issue
         * a single commit for them in process_deferred_bios().
@@ -666,6 +699,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
                generic_make_request(bio);
 }
 
+static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
+{
+       remap_to_origin(tc, bio);
+       issue(tc, bio);
+}
+
+static void remap_and_issue(struct thin_c *tc, struct bio *bio,
+                           dm_block_t block)
+{
+       remap(tc, bio, block);
+       issue(tc, bio);
+}
+
 /*
  * wake_worker() is used when new work is queued and when pool_resume is
  * ready to continue deferred IO processing.
@@ -680,21 +726,17 @@ static void wake_worker(struct pool *pool)
 /*
  * Bio endio functions.
  */
-struct endio_hook {
-       struct thin_c *tc;
-       bio_end_io_t *saved_bi_end_io;
-       struct deferred_entry *entry;
-};
-
 struct new_mapping {
        struct list_head list;
 
-       int prepared;
+       unsigned quiesced:1;
+       unsigned prepared:1;
+       unsigned pass_discard:1;
 
        struct thin_c *tc;
        dm_block_t virt_block;
        dm_block_t data_block;
-       struct cell *cell;
+       struct cell *cell, *cell2;
        int err;
 
        /*
@@ -711,7 +753,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
 {
        struct pool *pool = m->tc->pool;
 
-       if (list_empty(&m->list) && m->prepared) {
+       if (m->quiesced && m->prepared) {
                list_add(&m->list, &pool->prepared_mappings);
                wake_worker(pool);
        }
@@ -734,7 +776,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
 static void overwrite_endio(struct bio *bio, int err)
 {
        unsigned long flags;
-       struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
+       struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+       struct new_mapping *m = h->overwrite_mapping;
        struct pool *pool = m->tc->pool;
 
        m->err = err;
@@ -745,31 +788,6 @@ static void overwrite_endio(struct bio *bio, int err)
        spin_unlock_irqrestore(&pool->lock, flags);
 }
 
-static void shared_read_endio(struct bio *bio, int err)
-{
-       struct list_head mappings;
-       struct new_mapping *m, *tmp;
-       struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
-       unsigned long flags;
-       struct pool *pool = h->tc->pool;
-
-       bio->bi_end_io = h->saved_bi_end_io;
-       bio_endio(bio, err);
-
-       INIT_LIST_HEAD(&mappings);
-       ds_dec(h->entry, &mappings);
-
-       spin_lock_irqsave(&pool->lock, flags);
-       list_for_each_entry_safe(m, tmp, &mappings, list) {
-               list_del(&m->list);
-               INIT_LIST_HEAD(&m->list);
-               __maybe_add_mapping(m);
-       }
-       spin_unlock_irqrestore(&pool->lock, flags);
-
-       mempool_free(h, pool->endio_hook_pool);
-}
-
 /*----------------------------------------------------------------*/
 
 /*
@@ -800,21 +818,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell,
  * Same as cell_defer above, except it omits the holder of the cell, a
  * write bio that covers the block and has already been processed.
  */
-static void cell_defer_except(struct thin_c *tc, struct cell *cell,
-                             struct bio *exception)
+static void cell_defer_except(struct thin_c *tc, struct cell *cell)
 {
        struct bio_list bios;
-       struct bio *bio;
        struct pool *pool = tc->pool;
        unsigned long flags;
 
        bio_list_init(&bios);
-       cell_release(cell, &bios);
 
        spin_lock_irqsave(&pool->lock, flags);
-       while ((bio = bio_list_pop(&bios)))
-               if (bio != exception)
-                       bio_list_add(&pool->deferred_bios, bio);
+       cell_release_no_holder(cell, &pool->deferred_bios);
        spin_unlock_irqrestore(&pool->lock, flags);
 
        wake_worker(pool);
@@ -854,7 +867,7 @@ static void process_prepared_mapping(struct new_mapping *m)
         * the bios in the cell.
         */
        if (bio) {
-               cell_defer_except(tc, m->cell, bio);
+               cell_defer_except(tc, m->cell);
                bio_endio(bio, 0);
        } else
                cell_defer(tc, m->cell, m->data_block);
@@ -863,7 +876,30 @@ static void process_prepared_mapping(struct new_mapping *m)
        mempool_free(m, tc->pool->mapping_pool);
 }
 
-static void process_prepared_mappings(struct pool *pool)
+static void process_prepared_discard(struct new_mapping *m)
+{
+       int r;
+       struct thin_c *tc = m->tc;
+
+       r = dm_thin_remove_block(tc->td, m->virt_block);
+       if (r)
+               DMERR("dm_thin_remove_block() failed");
+
+       /*
+        * Pass the discard down to the underlying device?
+        */
+       if (m->pass_discard)
+               remap_and_issue(tc, m->bio, m->data_block);
+       else
+               bio_endio(m->bio, 0);
+
+       cell_defer_except(tc, m->cell);
+       cell_defer_except(tc, m->cell2);
+       mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared(struct pool *pool, struct list_head *head,
+                            void (*fn)(struct new_mapping *))
 {
        unsigned long flags;
        struct list_head maps;
@@ -871,21 +907,27 @@ static void process_prepared_mappings(struct pool *pool)
 
        INIT_LIST_HEAD(&maps);
        spin_lock_irqsave(&pool->lock, flags);
-       list_splice_init(&pool->prepared_mappings, &maps);
+       list_splice_init(head, &maps);
        spin_unlock_irqrestore(&pool->lock, flags);
 
        list_for_each_entry_safe(m, tmp, &maps, list)
-               process_prepared_mapping(m);
+               fn(m);
 }
 
 /*
  * Deferred bio jobs.
  */
-static int io_overwrites_block(struct pool *pool, struct bio *bio)
+static int io_overlaps_block(struct pool *pool, struct bio *bio)
 {
-       return ((bio_data_dir(bio) == WRITE) &&
-               !(bio->bi_sector & pool->offset_mask)) &&
+       return !(bio->bi_sector & pool->offset_mask) &&
+               (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
+}
+
+static int io_overwrites_block(struct pool *pool, struct bio *bio)
+{
+       return (bio_data_dir(bio) == WRITE) &&
+               io_overlaps_block(pool, bio);
 }
 
 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -917,7 +959,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
 }
 
 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
-                         dm_block_t data_origin, dm_block_t data_dest,
+                         struct dm_dev *origin, dm_block_t data_origin,
+                         dm_block_t data_dest,
                          struct cell *cell, struct bio *bio)
 {
        int r;
@@ -925,6 +968,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
        struct new_mapping *m = get_next_mapping(pool);
 
        INIT_LIST_HEAD(&m->list);
+       m->quiesced = 0;
        m->prepared = 0;
        m->tc = tc;
        m->virt_block = virt_block;
@@ -933,7 +977,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
        m->err = 0;
        m->bio = NULL;
 
-       ds_add_work(&pool->ds, &m->list);
+       if (!ds_add_work(&pool->shared_read_ds, &m->list))
+               m->quiesced = 1;
 
        /*
         * IO to pool_dev remaps to the pool target's data_dev.
@@ -942,14 +987,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
         * bio immediately. Otherwise we use kcopyd to clone the data first.
         */
        if (io_overwrites_block(pool, bio)) {
+               struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+               h->overwrite_mapping = m;
                m->bio = bio;
                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
-               dm_get_mapinfo(bio)->ptr = m;
                remap_and_issue(tc, bio, data_dest);
        } else {
                struct dm_io_region from, to;
 
-               from.bdev = tc->pool_dev->bdev;
+               from.bdev = origin->bdev;
                from.sector = data_origin * pool->sectors_per_block;
                from.count = pool->sectors_per_block;
 
@@ -967,6 +1013,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
        }
 }
 
+static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
+                                  dm_block_t data_origin, dm_block_t data_dest,
+                                  struct cell *cell, struct bio *bio)
+{
+       schedule_copy(tc, virt_block, tc->pool_dev,
+                     data_origin, data_dest, cell, bio);
+}
+
+static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
+                                  dm_block_t data_dest,
+                                  struct cell *cell, struct bio *bio)
+{
+       schedule_copy(tc, virt_block, tc->origin_dev,
+                     virt_block, data_dest, cell, bio);
+}
+
 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
                          dm_block_t data_block, struct cell *cell,
                          struct bio *bio)
@@ -975,6 +1037,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
        struct new_mapping *m = get_next_mapping(pool);
 
        INIT_LIST_HEAD(&m->list);
+       m->quiesced = 1;
        m->prepared = 0;
        m->tc = tc;
        m->virt_block = virt_block;
@@ -992,9 +1055,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
                process_prepared_mapping(m);
 
        else if (io_overwrites_block(pool, bio)) {
+               struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+               h->overwrite_mapping = m;
                m->bio = bio;
                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
-               dm_get_mapinfo(bio)->ptr = m;
                remap_and_issue(tc, bio, data_block);
 
        } else {
@@ -1081,7 +1145,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
  */
 static void retry_on_resume(struct bio *bio)
 {
-       struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+       struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+       struct thin_c *tc = h->tc;
        struct pool *pool = tc->pool;
        unsigned long flags;
 
@@ -1102,6 +1167,86 @@ static void no_space(struct cell *cell)
                retry_on_resume(bio);
 }
 
+static void process_discard(struct thin_c *tc, struct bio *bio)
+{
+       int r;
+       struct pool *pool = tc->pool;
+       struct cell *cell, *cell2;
+       struct cell_key key, key2;
+       dm_block_t block = get_bio_block(tc, bio);
+       struct dm_thin_lookup_result lookup_result;
+       struct new_mapping *m;
+
+       build_virtual_key(tc->td, block, &key);
+       if (bio_detain(tc->pool->prison, &key, bio, &cell))
+               return;
+
+       r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+       switch (r) {
+       case 0:
+               /*
+                * Check nobody is fiddling with this pool block.  This can
+                * happen if someone's in the process of breaking sharing
+                * on this block.
+                */
+               build_data_key(tc->td, lookup_result.block, &key2);
+               if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
+                       cell_release_singleton(cell, bio);
+                       break;
+               }
+
+               if (io_overlaps_block(pool, bio)) {
+                       /*
+                        * IO may still be going to the destination block.  We must
+                        * quiesce before we can do the removal.
+                        */
+                       m = get_next_mapping(pool);
+                       m->tc = tc;
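+                       /*
+                        * Only pass the discard down to the data device
+                        * if no other thin device shares this data block.
+                        */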
+                       m->pass_discard = !lookup_result.shared;
+                       m->virt_block = block;
+                       m->data_block = lookup_result.block;
+                       m->cell = cell;
+                       m->cell2 = cell2;
+                       m->err = 0;
+                       m->bio = bio;
+
+                       if (!ds_add_work(&pool->all_io_ds, &m->list)) {
+                               list_add(&m->list, &pool->prepared_discards);
+                               wake_worker(pool);
+                       }
+               } else {
+                       /*
+                        * This path is hit if people are ignoring
+                        * limits->discard_granularity.  It ignores any
+                        * part of the discard that is in a subsequent
+                        * block.
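+                        *
+                        * For example, with 128-sector blocks a discard
+                        * that begins at sector 130 is at offset 2 within
+                        * block 1, so at most 126 sectors are discarded.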
+                        */
+                       sector_t offset = bio->bi_sector - (block << pool->block_shift);
+                       unsigned remaining = (pool->sectors_per_block - offset) << SECTOR_SHIFT;
+                       bio->bi_size = min(bio->bi_size, remaining);
+
+                       cell_release_singleton(cell, bio);
+                       cell_release_singleton(cell2, bio);
+                       remap_and_issue(tc, bio, lookup_result.block);
+               }
+               break;
+
+       case -ENODATA:
+               /*
+                * It isn't provisioned, just forget it.
+                */
+               cell_release_singleton(cell, bio);
+               bio_endio(bio, 0);
+               break;
+
+       default:
+               DMERR("discard: find block unexpectedly returned %d", r);
+               cell_release_singleton(cell, bio);
+               bio_io_error(bio);
+               break;
+       }
+}
+
 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
                          struct cell_key *key,
                          struct dm_thin_lookup_result *lookup_result,
@@ -1113,8 +1258,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
        r = alloc_data_block(tc, &data_block);
        switch (r) {
        case 0:
-               schedule_copy(tc, block, lookup_result->block,
-                             data_block, cell, bio);
+               schedule_internal_copy(tc, block, lookup_result->block,
+                                      data_block, cell, bio);
                break;
 
        case -ENOSPC:
@@ -1147,13 +1292,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
        if (bio_data_dir(bio) == WRITE)
                break_sharing(tc, bio, block, &key, lookup_result, cell);
        else {
-               struct endio_hook *h;
-               h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+               struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
 
-               h->tc = tc;
-               h->entry = ds_inc(&pool->ds);
-               save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
-               dm_get_mapinfo(bio)->ptr = h;
+               h->shared_read_entry = ds_inc(&pool->shared_read_ds);
 
                cell_release_singleton(cell, bio);
                remap_and_issue(tc, bio, lookup_result->block);
@@ -1188,7 +1329,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
        r = alloc_data_block(tc, &data_block);
        switch (r) {
        case 0:
-               schedule_zero(tc, block, data_block, cell, bio);
+               if (tc->origin_dev)
+                       schedule_external_copy(tc, block, data_block, cell, bio);
+               else
+                       schedule_zero(tc, block, data_block, cell, bio);
                break;
 
        case -ENOSPC:
@@ -1239,16 +1383,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
                break;
 
        case -ENODATA:
-               provision_block(tc, bio, block, cell);
+               if (bio_data_dir(bio) == READ && tc->origin_dev) {
+                       cell_release_singleton(cell, bio);
+                       remap_to_origin_and_issue(tc, bio);
+               } else
+                       provision_block(tc, bio, block, cell);
                break;
 
        default:
                DMERR("dm_thin_find_block() failed, error = %d", r);
+               cell_release_singleton(cell, bio);
                bio_io_error(bio);
                break;
        }
 }
 
+static int need_commit_due_to_time(struct pool *pool)
+{
+       return jiffies < pool->last_commit_jiffies ||
+              jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
+}
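+
+/*
+ * The open-coded test above is safe across jiffies wraparound.  A
+ * near-equivalent sketch using the <linux/jiffies.h> helpers:
+ *
+ *	return !time_in_range(jiffies, pool->last_commit_jiffies,
+ *			      pool->last_commit_jiffies + COMMIT_PERIOD);
+ */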
+
 static void process_deferred_bios(struct pool *pool)
 {
        unsigned long flags;
@@ -1264,7 +1419,9 @@ static void process_deferred_bios(struct pool *pool)
        spin_unlock_irqrestore(&pool->lock, flags);
 
        while ((bio = bio_list_pop(&bios))) {
-               struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+               struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+               struct thin_c *tc = h->tc;
+
                /*
                 * If we've got no free new_mapping structs, and processing
                 * this bio might require one, we pause until there are some
@@ -1277,7 +1434,11 @@ static void process_deferred_bios(struct pool *pool)
 
                        break;
                }
-               process_bio(tc, bio);
+
+               if (bio->bi_rw & REQ_DISCARD)
+                       process_discard(tc, bio);
+               else
+                       process_bio(tc, bio);
        }
 
        /*
@@ -1290,7 +1451,7 @@ static void process_deferred_bios(struct pool *pool)
        bio_list_init(&pool->deferred_flush_bios);
        spin_unlock_irqrestore(&pool->lock, flags);
 
-       if (bio_list_empty(&bios))
+       if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
                return;
 
        r = dm_pool_commit_metadata(pool->pmd);
@@ -1301,6 +1462,7 @@ static void process_deferred_bios(struct pool *pool)
                        bio_io_error(bio);
                return;
        }
+       pool->last_commit_jiffies = jiffies;
 
        while ((bio = bio_list_pop(&bios)))
                generic_make_request(bio);
@@ -1310,10 +1472,22 @@ static void do_worker(struct work_struct *ws)
 {
        struct pool *pool = container_of(ws, struct pool, worker);
 
-       process_prepared_mappings(pool);
+       process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
+       process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
        process_deferred_bios(pool);
 }
 
+/*
+ * We want to commit periodically so that not too much
+ * unwritten data builds up.
+ */
+static void do_waker(struct work_struct *ws)
+{
+       struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
+       wake_worker(pool);
+       queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
+}
+
 /*----------------------------------------------------------------*/
 
 /*
@@ -1335,6 +1509,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
        wake_worker(pool);
 }
 
+static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
+{
+       struct pool *pool = tc->pool;
+       struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+
+       h->tc = tc;
+       h->shared_read_entry = NULL;
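+       /*
+        * A discard takes no all_io_ds entry: process_discard() later
+        * quiesces the discard against all_io_ds, so an entry taken now
+        * would leave the bio waiting on its own completion.
+        */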
+       h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
+       h->overwrite_mapping = NULL;
+
+       return h;
+}
+
 /*
  * Non-blocking function called from the thin target's map function.
  */
@@ -1347,12 +1534,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
        struct dm_thin_device *td = tc->td;
        struct dm_thin_lookup_result result;
 
-       /*
-        * Save the thin context for easy access from the deferred bio later.
-        */
-       map_context->ptr = tc;
-
-       if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+       map_context->ptr = thin_hook_bio(tc, bio);
+       if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
                thin_defer_bio(tc, bio);
                return DM_MAPIO_SUBMITTED;
        }
@@ -1523,14 +1706,17 @@ static struct pool *pool_create(struct mapped_device *pool_md,
        }
 
        INIT_WORK(&pool->worker, do_worker);
+       INIT_DELAYED_WORK(&pool->waker, do_waker);
        spin_lock_init(&pool->lock);
        bio_list_init(&pool->deferred_bios);
        bio_list_init(&pool->deferred_flush_bios);
        INIT_LIST_HEAD(&pool->prepared_mappings);
+       INIT_LIST_HEAD(&pool->prepared_discards);
        pool->low_water_triggered = 0;
        pool->no_free_space = 0;
        bio_list_init(&pool->retry_on_resume_list);
-       ds_init(&pool->ds);
+       ds_init(&pool->shared_read_ds);
+       ds_init(&pool->all_io_ds);
 
        pool->next_mapping = NULL;
        pool->mapping_pool =
@@ -1549,6 +1735,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
                goto bad_endio_hook_pool;
        }
        pool->ref_count = 1;
+       pool->last_commit_jiffies = jiffies;
        pool->pool_md = pool_md;
        pool->md_dev = metadata_dev;
        __pool_table_insert(pool);
@@ -1691,6 +1878,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
        dm_block_t low_water_blocks;
        struct dm_dev *metadata_dev;
        sector_t metadata_dev_size;
+       char b[BDEVNAME_SIZE];
 
        /*
         * FIXME Remove validation from scope of lock.
@@ -1712,11 +1900,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
        }
 
        metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
-       if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
-               ti->error = "Metadata device is too large";
-               r = -EINVAL;
-               goto out_metadata;
-       }
+       if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
+               DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
+                      bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
 
        r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
        if (r) {
@@ -1770,7 +1956,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
        pt->low_water_blocks = low_water_blocks;
        pt->zero_new_blocks = pf.zero_new_blocks;
        ti->num_flush_requests = 1;
-       ti->num_discard_requests = 0;
+       ti->num_discard_requests = 1;
+       ti->discards_supported = 1;
        ti->private = pt;
 
        pt->callbacks.congested_fn = pool_is_congested;
@@ -1878,7 +2065,7 @@ static void pool_resume(struct dm_target *ti)
        __requeue_bios(pool);
        spin_unlock_irqrestore(&pool->lock, flags);
 
-       wake_worker(pool);
+       do_waker(&pool->waker.work);
 }
 
 static void pool_postsuspend(struct dm_target *ti)
@@ -1887,6 +2074,7 @@ static void pool_postsuspend(struct dm_target *ti)
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
 
+       cancel_delayed_work(&pool->waker);
        flush_workqueue(pool->wq);
 
        r = dm_pool_commit_metadata(pool->pmd);
@@ -2162,6 +2350,17 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
        return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
+static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
+{
+       limits->max_discard_sectors = pool->sectors_per_block;
+
+       /*
+        * This is just a hint, and not enforced.  We have to cope with
+        * bios that overlap 2 blocks.
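+        *
+        * For example, with 64KiB blocks sectors_per_block is 128, so
+        * max_discard_sectors is 128 and discard_granularity is
+        * 128 << SECTOR_SHIFT == 65536 bytes.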
+        */
+       limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+}
+
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
        struct pool_c *pt = ti->private;
@@ -2169,6 +2368,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
        blk_limits_io_min(limits, 0);
        blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+       set_discard_limits(pool, limits);
 }
 
 static struct target_type pool_target = {
@@ -2202,6 +2402,8 @@ static void thin_dtr(struct dm_target *ti)
        __pool_dec(tc->pool);
        dm_pool_close_thin_device(tc->td);
        dm_put_device(ti, tc->pool_dev);
+       if (tc->origin_dev)
+               dm_put_device(ti, tc->origin_dev);
        kfree(tc);
 
        mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2210,21 +2412,22 @@ static void thin_dtr(struct dm_target *ti)
 /*
  * Thin target parameters:
  *
- * <pool_dev> <dev_id>
+ * <pool_dev> <dev_id> [origin_dev]
  *
  * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
  * dev_id: the internal device identifier
+ * origin_dev: a device external to the pool that should act as the origin
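+ *
+ * Example table lines (sizes illustrative):
+ *
+ *	dmsetup create thin --table "0 2097152 thin /dev/mapper/my_pool 0"
+ *
+ * and, with an external origin device:
+ *
+ *	dmsetup create snap --table "0 2097152 thin /dev/mapper/my_pool 1 /dev/sdb"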
  */
 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
        int r;
        struct thin_c *tc;
-       struct dm_dev *pool_dev;
+       struct dm_dev *pool_dev, *origin_dev;
        struct mapped_device *pool_md;
 
        mutex_lock(&dm_thin_pool_table.mutex);
 
-       if (argc != 2) {
+       if (argc != 2 && argc != 3) {
                ti->error = "Invalid argument count";
                r = -EINVAL;
                goto out_unlock;
@@ -2237,6 +2440,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
                goto out_unlock;
        }
 
+       if (argc == 3) {
+               r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
+               if (r) {
+                       ti->error = "Error opening origin device";
+                       goto bad_origin_dev;
+               }
+               tc->origin_dev = origin_dev;
+       }
+
        r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
        if (r) {
                ti->error = "Error opening pool device";
@@ -2273,8 +2485,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
        ti->split_io = tc->pool->sectors_per_block;
        ti->num_flush_requests = 1;
-       ti->num_discard_requests = 0;
-       ti->discards_supported = 0;
+       ti->num_discard_requests = 1;
+       ti->discards_supported = 1;
 
        dm_put(pool_md);
 
@@ -2289,6 +2501,9 @@ bad_pool_lookup:
 bad_common:
        dm_put_device(ti, tc->pool_dev);
 bad_pool_dev:
+       if (tc->origin_dev)
+               dm_put_device(ti, tc->origin_dev);
+bad_origin_dev:
        kfree(tc);
 out_unlock:
        mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2299,11 +2514,46 @@ out_unlock:
 static int thin_map(struct dm_target *ti, struct bio *bio,
                    union map_info *map_context)
 {
-       bio->bi_sector -= ti->begin;
+       bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
 
        return thin_bio_map(ti, bio, map_context);
 }
 
+static int thin_endio(struct dm_target *ti,
+                     struct bio *bio, int err,
+                     union map_info *map_context)
+{
+       unsigned long flags;
+       struct endio_hook *h = map_context->ptr;
+       struct list_head work;
+       struct new_mapping *m, *tmp;
+       struct pool *pool = h->tc->pool;
+
+       if (h->shared_read_entry) {
+               INIT_LIST_HEAD(&work);
+               ds_dec(h->shared_read_entry, &work);
+
+               spin_lock_irqsave(&pool->lock, flags);
+               list_for_each_entry_safe(m, tmp, &work, list) {
+                       list_del(&m->list);
+                       m->quiesced = 1;
+                       __maybe_add_mapping(m);
+               }
+               spin_unlock_irqrestore(&pool->lock, flags);
+       }
+
+       if (h->all_io_entry) {
+               INIT_LIST_HEAD(&work);
+               ds_dec(h->all_io_entry, &work);
+               /* the worker also manipulates prepared_discards: lock it */
+               spin_lock_irqsave(&pool->lock, flags);
+               list_for_each_entry_safe(m, tmp, &work, list)
+                       list_add(&m->list, &pool->prepared_discards);
+               spin_unlock_irqrestore(&pool->lock, flags);
+               wake_worker(pool);
+       }
+
+       mempool_free(h, pool->endio_hook_pool);
+
+       return 0;
+}
+
 static void thin_postsuspend(struct dm_target *ti)
 {
        if (dm_noflush_suspending(ti))
@@ -2347,6 +2597,8 @@ static int thin_status(struct dm_target *ti, status_type_t type,
                        DMEMIT("%s %lu",
                               format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
                               (unsigned long) tc->dev_id);
+                       if (tc->origin_dev)
+                               DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
                        break;
                }
        }
@@ -2377,18 +2629,21 @@ static int thin_iterate_devices(struct dm_target *ti,
 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
        struct thin_c *tc = ti->private;
+       struct pool *pool = tc->pool;
 
        blk_limits_io_min(limits, 0);
-       blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
+       blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+       set_discard_limits(pool, limits);
 }
 
 static struct target_type thin_target = {
        .name = "thin",
-       .version = {1, 0, 0},
+       .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,
        .map = thin_map,
+       .end_io = thin_endio,
        .postsuspend = thin_postsuspend,
        .status = thin_status,
        .iterate_devices = thin_iterate_devices,