Merge branch 'writeback-workqueue' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Jens Axboe <axboe@kernel.dk>
Tue, 2 Apr 2013 08:04:39 +0000 (10:04 +0200)
committer Jens Axboe <axboe@kernel.dk>
Tue, 2 Apr 2013 08:04:39 +0000 (10:04 +0200)
Tejun writes:

-----

This is the pull request for the earlier patchset[1] with the same
name.  It's only three patches (the first one was already committed to
the workqueue tree), but the merge strategy is a bit involved due to
the dependencies.

* Because the conversion needs features from wq/for-3.10,
  block/for-3.10/core is based on rc3, and wq/for-3.10 has conflicts
  with rc3, I pulled mainline (rc5) into wq/for-3.10 to prevent those
  workqueue conflicts from flaring up in the block tree (see the
  sketch after this list).

* Resolving the issue that Jan and Dave raised about debugging
  requires arch-wide changes.  The patchset is being worked on[2], but
  it will have to go through -mm after these changes show up in -next,
  so it is not included in this pull request.

The three commits are located in the following git branch.

  git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git writeback-workqueue

Pulling it into block/for-3.10/core produces a conflict in
drivers/md/raid5.c between the following two commits.

  e3620a3ad5 ("MD RAID5: Avoid accessing gendisk or queue structs when not available")
  2f6db2a707 ("raid5: use bio_reset()")

The conflict is trivial - one removes an "if ()" conditional while the
other removes "rbi->bi_next = NULL" right above it.  We just need to
remove both.  The merged branch is available at

  git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git block-test-merge

so that you can use it for verification.  The test merge commit has a
proper merge description.
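
For illustration only (the exact commands and the local branch name
are assumptions, not a record of what was actually run), reproducing
and verifying the merge could look something like:

  git checkout for-3.10/core
  git pull git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git writeback-workqueue
  # resolve drivers/md/raid5.c as described above and commit the merge
  git fetch git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git block-test-merge
  git diff FETCH_HEAD     # ideally shows no differences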

While these changes are a bit of a pain to route, they make the code
simpler and even show a minute but measurable performance gain[3] on a
workload which isn't particularly favorable to showing the benefits of
this conversion.

-----

Fixed up the conflict.

Conflicts:
	drivers/md/raid5.c

Signed-off-by: Jens Axboe <axboe@kernel.dk>
drivers/block/rbd.c
drivers/md/dm-verity.c
drivers/md/md.c
drivers/md/raid5.c
fs/btrfs/extent_io.c
fs/btrfs/volumes.c

diff --combined drivers/block/rbd.c
index 11e179826b60d4df3125c56ff2eec0a8ceff0274,f556f8a8b3f9b476949c6133f39778c49502e380..6b2b039c191fc5da53418371f290babdc5cd02c1
@@@ -952,7 -952,7 +952,7 @@@ static struct bio *bio_clone_range(stru
        /* Find first affected segment... */
  
        resid = offset;
 -      __bio_for_each_segment(bv, bio_src, idx, 0) {
 +      bio_for_each_segment(bv, bio_src, idx) {
                if (resid < bv->bv_len)
                        break;
                resid -= bv->bv_len;
@@@ -1264,6 -1264,32 +1264,32 @@@ static bool obj_request_done_test(struc
        return atomic_read(&obj_request->done) != 0;
  }
  
+ static void
+ rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
+ {
+       dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
+               obj_request, obj_request->img_request, obj_request->result,
+               obj_request->xferred, obj_request->length);
+       /*
+        * ENOENT means a hole in the image.  We zero-fill the
+        * entire length of the request.  A short read also implies
+        * zero-fill to the end of the request.  Either way we
+        * update the xferred count to indicate the whole request
+        * was satisfied.
+        */
+       BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
+       if (obj_request->result == -ENOENT) {
+               zero_bio_chain(obj_request->bio_list, 0);
+               obj_request->result = 0;
+               obj_request->xferred = obj_request->length;
+       } else if (obj_request->xferred < obj_request->length &&
+                       !obj_request->result) {
+               zero_bio_chain(obj_request->bio_list, obj_request->xferred);
+               obj_request->xferred = obj_request->length;
+       }
+       obj_request_done_set(obj_request);
+ }
+
  static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
  {
        dout("%s: obj %p cb %p\n", __func__, obj_request,
@@@ -1284,23 -1310,10 +1310,10 @@@ static void rbd_osd_read_callback(struc
  {
        dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
                obj_request->result, obj_request->xferred, obj_request->length);
-       /*
-        * ENOENT means a hole in the object.  We zero-fill the
-        * entire length of the request.  A short read also implies
-        * zero-fill to the end of the request.  Either way we
-        * update the xferred count to indicate the whole request
-        * was satisfied.
-        */
-       if (obj_request->result == -ENOENT) {
-               zero_bio_chain(obj_request->bio_list, 0);
-               obj_request->result = 0;
-               obj_request->xferred = obj_request->length;
-       } else if (obj_request->xferred < obj_request->length &&
-                       !obj_request->result) {
-               zero_bio_chain(obj_request->bio_list, obj_request->xferred);
-               obj_request->xferred = obj_request->length;
-       }
-       obj_request_done_set(obj_request);
+       if (obj_request->img_request)
+               rbd_img_obj_request_read_callback(obj_request);
+       else
+               obj_request_done_set(obj_request);
  }
  
  static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
diff --combined drivers/md/dm-verity.c
index 4f06d9adf1edb06da296ec9ba695e467063cc74d,a746f1d21c661bec7b809c9a688fddf44284581f..b948fd864d457e9ce857d1d04fd74bf8997e0aee
@@@ -93,6 -93,13 +93,13 @@@ struct dm_verity_io 
         */
  };
  
+ struct dm_verity_prefetch_work {
+       struct work_struct work;
+       struct dm_verity *v;
+       sector_t block;
+       unsigned n_blocks;
+ };
+
  static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
  {
        return (struct shash_desc *)(io + 1);
@@@ -424,15 -431,18 +431,18 @@@ static void verity_end_io(struct bio *b
   * The root buffer is not prefetched, it is assumed that it will be cached
   * all the time.
   */
- static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
+ static void verity_prefetch_io(struct work_struct *work)
  {
+       struct dm_verity_prefetch_work *pw =
+               container_of(work, struct dm_verity_prefetch_work, work);
+       struct dm_verity *v = pw->v;
        int i;
  
        for (i = v->levels - 2; i >= 0; i--) {
                sector_t hash_block_start;
                sector_t hash_block_end;
-               verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
-               verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
+               verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL);
+               verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL);
                if (!i) {
                        unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster);
  
@@@ -452,6 -462,25 +462,25 @@@ no_prefetch_cluster
                dm_bufio_prefetch(v->bufio, hash_block_start,
                                  hash_block_end - hash_block_start + 1);
        }
+
+       kfree(pw);
+ }
+
+ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
+ {
+       struct dm_verity_prefetch_work *pw;
+
+       pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
+               GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+       if (!pw)
+               return;
+
+       INIT_WORK(&pw->work, verity_prefetch_io);
+       pw->v = v;
+       pw->block = io->block;
+       pw->n_blocks = io->n_blocks;
+       queue_work(v->verify_wq, &pw->work);
  }
  
  /*
@@@ -472,7 -501,7 +501,7 @@@ static int verity_map(struct dm_target 
                return -EIO;
        }
  
 -      if ((bio->bi_sector + bio_sectors(bio)) >>
 +      if (bio_end_sector(bio) >>
            (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
                DMERR_LIMIT("io out of range");
                return -EIO;
  
        bio->bi_end_io = verity_end_io;
        bio->bi_private = io;
 -      io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
 +      io->io_vec_size = bio_segments(bio);
        if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
                io->io_vec = io->io_vec_inline;
        else
        memcpy(io->io_vec, bio_iovec(bio),
               io->io_vec_size * sizeof(struct bio_vec));
  
-       verity_prefetch_io(v, io);
+       verity_submit_prefetch(v, io);
  
        generic_make_request(bio);
  
@@@ -858,7 -887,7 +887,7 @@@ bad
  
  static struct target_type verity_target = {
        .name           = "verity",
-       .version        = {1, 1, 1},
+       .version        = {1, 2, 0},
        .module         = THIS_MODULE,
        .ctr            = verity_ctr,
        .dtr            = verity_dtr,
diff --combined drivers/md/md.c
index d323676580a97d600f6c57a0644bc06522e6a062,aeceedfc530b90b5da5b45e12ffdc9e4d665e16d..1d03ebde40b51885cd63950d695b19fe5b9214de
@@@ -194,12 -194,21 +194,12 @@@ void md_trim_bio(struct bio *bio, int o
        if (offset == 0 && size == bio->bi_size)
                return;
  
 -      bio->bi_sector += offset;
 -      bio->bi_size = size;
 -      offset <<= 9;
        clear_bit(BIO_SEG_VALID, &bio->bi_flags);
  
 -      while (bio->bi_idx < bio->bi_vcnt &&
 -             bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
 -              /* remove this whole bio_vec */
 -              offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
 -              bio->bi_idx++;
 -      }
 -      if (bio->bi_idx < bio->bi_vcnt) {
 -              bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
 -              bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
 -      }
 +      bio_advance(bio, offset << 9);
 +
 +      bio->bi_size = size;
 +
        /* avoid any complications with bi_idx being non-zero*/
        if (bio->bi_idx) {
                memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
@@@ -7654,10 -7663,8 +7654,8 @@@ static int remove_and_add_spares(struc
                                removed++;
                        }
                }
-       if (removed)
-               sysfs_notify(&mddev->kobj, NULL,
-                            "degraded");
+       if (removed && mddev->kobj.sd)
+               sysfs_notify(&mddev->kobj, NULL, "degraded");
  
        rdev_for_each(rdev, mddev) {
                if (rdev->raid_disk >= 0 &&
diff --combined drivers/md/raid5.c
index 7bbd28546214c4a9be5a4e08bc98df4ee4d5616d,24909eb13fec1b0bf22e40caa4ff7a76967f80e4..2fefb9f2198e2269bafa26d62e60eb96dbb63aa0
@@@ -90,7 -90,7 +90,7 @@@ static inline struct hlist_head *stripe
   */
  static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
  {
 -      int sectors = bio->bi_size >> 9;
 +      int sectors = bio_sectors(bio);
        if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
                return bio->bi_next;
        else
@@@ -567,6 -567,14 +567,6 @@@ static void ops_run_io(struct stripe_he
                bi = &sh->dev[i].req;
                rbi = &sh->dev[i].rreq; /* For writing to replacement */
  
 -              bi->bi_rw = rw;
 -              rbi->bi_rw = rw;
 -              if (rw & WRITE) {
 -                      bi->bi_end_io = raid5_end_write_request;
 -                      rbi->bi_end_io = raid5_end_write_request;
 -              } else
 -                      bi->bi_end_io = raid5_end_read_request;
 -
                rcu_read_lock();
                rrdev = rcu_dereference(conf->disks[i].replacement);
                smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
  
                        set_bit(STRIPE_IO_STARTED, &sh->state);
  
 +                      bio_reset(bi);
                        bi->bi_bdev = rdev->bdev;
 +                      bi->bi_rw = rw;
 +                      bi->bi_end_io = (rw & WRITE)
 +                              ? raid5_end_write_request
 +                              : raid5_end_read_request;
 +                      bi->bi_private = sh;
 +
                        pr_debug("%s: for %llu schedule op %ld on disc %d\n",
                                __func__, (unsigned long long)sh->sector,
                                bi->bi_rw, i);
                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
                                bi->bi_rw |= REQ_FLUSH;
  
 -                      bi->bi_flags = 1 << BIO_UPTODATE;
 -                      bi->bi_idx = 0;
                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        bi->bi_io_vec[0].bv_offset = 0;
                        bi->bi_size = STRIPE_SIZE;
 -                      bi->bi_next = NULL;
                        if (rrdev)
                                set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
-                       trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
-                                             bi, disk_devt(conf->mddev->gendisk),
-                                             sh->dev[i].sector);
+                       if (conf->mddev->gendisk)
+                               trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
+                                                     bi, disk_devt(conf->mddev->gendisk),
+                                                     sh->dev[i].sector);
                        generic_make_request(bi);
                }
                if (rrdev) {
  
                        set_bit(STRIPE_IO_STARTED, &sh->state);
  
 +                      bio_reset(rbi);
                        rbi->bi_bdev = rrdev->bdev;
 +                      rbi->bi_rw = rw;
 +                      BUG_ON(!(rw & WRITE));
 +                      rbi->bi_end_io = raid5_end_write_request;
 +                      rbi->bi_private = sh;
 +
                        pr_debug("%s: for %llu schedule op %ld on "
                                 "replacement disc %d\n",
                                __func__, (unsigned long long)sh->sector,
                        else
                                rbi->bi_sector = (sh->sector
                                                  + rrdev->data_offset);
 -                      rbi->bi_flags = 1 << BIO_UPTODATE;
 -                      rbi->bi_idx = 0;
                        rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        rbi->bi_io_vec[0].bv_offset = 0;
                        rbi->bi_size = STRIPE_SIZE;
-                       trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
-                                             rbi, disk_devt(conf->mddev->gendisk),
-                                             sh->dev[i].sector);
 -                      rbi->bi_next = NULL;
+                       if (conf->mddev->gendisk)
+                               trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
+                                                     rbi, disk_devt(conf->mddev->gendisk),
+                                                     sh->dev[i].sector);
                        generic_make_request(rbi);
                }
                if (!rdev && !rrdev) {
@@@ -2279,17 -2283,6 +2282,6 @@@ schedule_reconstruction(struct stripe_h
        int level = conf->level;
  
        if (rcw) {
-               /* if we are not expanding this is a proper write request, and
-                * there will be bios with new data to be drained into the
-                * stripe cache
-                */
-               if (!expand) {
-                       sh->reconstruct_state = reconstruct_state_drain_run;
-                       set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
-               } else
-                       sh->reconstruct_state = reconstruct_state_run;
-               set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
  
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
                                s->locked++;
                        }
                }
+               /* if we are not expanding this is a proper write request, and
+                * there will be bios with new data to be drained into the
+                * stripe cache
+                */
+               if (!expand) {
+                       if (!s->locked)
+                               /* False alarm, nothing to do */
+                               return;
+                       sh->reconstruct_state = reconstruct_state_drain_run;
+                       set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+               } else
+                       sh->reconstruct_state = reconstruct_state_run;
+               set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
                if (s->locked + conf->max_degraded == disks)
                        if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
                                atomic_inc(&conf->pending_full_writes);
                BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
                        test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
  
-               sh->reconstruct_state = reconstruct_state_prexor_drain_run;
-               set_bit(STRIPE_OP_PREXOR, &s->ops_request);
-               set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
-               set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
                        if (i == pd_idx)
                                s->locked++;
                        }
                }
+               if (!s->locked)
+                       /* False alarm - nothing to do */
+                       return;
+               sh->reconstruct_state = reconstruct_state_prexor_drain_run;
+               set_bit(STRIPE_OP_PREXOR, &s->ops_request);
+               set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+               set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
        }
  
        /* keep the parity disk(s) locked while asynchronous operations
@@@ -2383,11 -2393,11 +2392,11 @@@ static int add_stripe_bio(struct stripe
        } else
                bip = &sh->dev[dd_idx].toread;
        while (*bip && (*bip)->bi_sector < bi->bi_sector) {
 -              if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
 +              if (bio_end_sector(*bip) > bi->bi_sector)
                        goto overlap;
                bip = & (*bip)->bi_next;
        }
 -      if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
 +      if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
                goto overlap;
  
        BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
                     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
                             bi && bi->bi_sector <= sector;
                     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
 -                      if (bi->bi_sector + (bi->bi_size>>9) >= sector)
 -                              sector = bi->bi_sector + (bi->bi_size>>9);
 +                      if (bio_end_sector(bi) >= sector)
 +                              sector = bio_end_sector(bi);
                }
                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
                        set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
@@@ -2563,6 -2573,8 +2572,8 @@@ handle_failed_sync(struct r5conf *conf
        int i;
  
        clear_bit(STRIPE_SYNCING, &sh->state);
+       if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+               wake_up(&conf->wait_for_overlap);
        s->syncing = 0;
        s->replacing = 0;
        /* There is nothing more to do for sync/check/repair.
@@@ -2736,6 -2748,7 +2747,7 @@@ static void handle_stripe_clean_event(s
  {
        int i;
        struct r5dev *dev;
+       int discard_pending = 0;
  
        for (i = disks; i--; )
                if (sh->dev[i].written) {
                                                STRIPE_SECTORS,
                                         !test_bit(STRIPE_DEGRADED, &sh->state),
                                                0);
-                       }
-               } else if (test_bit(R5_Discard, &sh->dev[i].flags))
-                       clear_bit(R5_Discard, &sh->dev[i].flags);
+                       } else if (test_bit(R5_Discard, &dev->flags))
+                               discard_pending = 1;
+               }
+       if (!discard_pending &&
+           test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
+               clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+               clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+               if (sh->qd_idx >= 0) {
+                       clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+                       clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
+               }
+               /* now that discard is done we can proceed with any sync */
+               clear_bit(STRIPE_DISCARD, &sh->state);
+               if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+                       set_bit(STRIPE_HANDLE, &sh->state);
+       }
  
        if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
                if (atomic_dec_and_test(&conf->pending_full_writes))
@@@ -2825,8 -2852,10 +2851,10 @@@ static void handle_stripe_dirtying(stru
        set_bit(STRIPE_HANDLE, &sh->state);
        if (rmw < rcw && rmw > 0) {
                /* prefer read-modify-write, but need to get some data */
-               blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d",
-                                 (unsigned long long)sh->sector, rmw);
+               if (conf->mddev->queue)
+                       blk_add_trace_msg(conf->mddev->queue,
+                                         "raid5 rmw %llu %d",
+                                         (unsigned long long)sh->sector, rmw);
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
                        if ((dev->towrite || i == sh->pd_idx) &&
                                }
                        }
                }
-               if (rcw)
+               if (rcw && conf->mddev->queue)
                        blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
                                          (unsigned long long)sh->sector,
                                          rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
@@@ -3416,9 -3445,15 +3444,15 @@@ static void handle_stripe(struct stripe
                return;
        }
  
-       if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
-               set_bit(STRIPE_SYNCING, &sh->state);
-               clear_bit(STRIPE_INSYNC, &sh->state);
+       if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+               spin_lock(&sh->stripe_lock);
+               /* Cannot process 'sync' concurrently with 'discard' */
+               if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+                   test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+                       set_bit(STRIPE_SYNCING, &sh->state);
+                       clear_bit(STRIPE_INSYNC, &sh->state);
+               }
+               spin_unlock(&sh->stripe_lock);
        }
        clear_bit(STRIPE_DELAYED, &sh->state);
  
            test_bit(STRIPE_INSYNC, &sh->state)) {
                md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
                clear_bit(STRIPE_SYNCING, &sh->state);
+               if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+                       wake_up(&conf->wait_for_overlap);
        }
  
        /* If the failed drives are just a ReadError, then we might need
@@@ -3803,7 -3840,7 +3839,7 @@@ static int in_chunk_boundary(struct mdd
  {
        sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
        unsigned int chunk_sectors = mddev->chunk_sectors;
 -      unsigned int bio_sectors = bio->bi_size >> 9;
 +      unsigned int bio_sectors = bio_sectors(bio);
  
        if (mddev->new_chunk_sectors < mddev->chunk_sectors)
                chunk_sectors = mddev->new_chunk_sectors;
@@@ -3893,7 -3930,7 +3929,7 @@@ static int bio_fits_rdev(struct bio *bi
  {
        struct request_queue *q = bdev_get_queue(bi->bi_bdev);
  
 -      if ((bi->bi_size>>9) > queue_max_sectors(q))
 +      if (bio_sectors(bi) > queue_max_sectors(q))
                return 0;
        blk_recount_segments(q, bi);
        if (bi->bi_phys_segments > queue_max_segments(q))
@@@ -3940,7 -3977,7 +3976,7 @@@ static int chunk_aligned_read(struct md
                                                    0,
                                                    &dd_idx, NULL);
  
 -      end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
 +      end_sector = bio_end_sector(align_bi);
        rcu_read_lock();
        rdev = rcu_dereference(conf->disks[dd_idx].replacement);
        if (!rdev || test_bit(Faulty, &rdev->flags) ||
                align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
  
                if (!bio_fits_rdev(align_bi) ||
 -                  is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
 +                  is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
                                &first_bad, &bad_sectors)) {
                        /* too big in some way, or has a known bad block */
                        bio_put(align_bi);
                atomic_inc(&conf->active_aligned_reads);
                spin_unlock_irq(&conf->device_lock);
  
-               trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
-                                     align_bi, disk_devt(mddev->gendisk),
-                                     raid_bio->bi_sector);
+               if (mddev->gendisk)
+                       trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
+                                             align_bi, disk_devt(mddev->gendisk),
+                                             raid_bio->bi_sector);
                generic_make_request(align_bi);
                return 1;
        } else {
@@@ -4077,7 -4115,8 +4114,8 @@@ static void raid5_unplug(struct blk_plu
                }
                spin_unlock_irq(&conf->device_lock);
        }
-       trace_block_unplug(mddev->queue, cnt, !from_schedule);
+       if (mddev->queue)
+               trace_block_unplug(mddev->queue, cnt, !from_schedule);
        kfree(cb);
  }
  
@@@ -4140,6 -4179,13 +4178,13 @@@ static void make_discard_request(struc
                sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
                prepare_to_wait(&conf->wait_for_overlap, &w,
                                TASK_UNINTERRUPTIBLE);
+               set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
+               if (test_bit(STRIPE_SYNCING, &sh->state)) {
+                       release_stripe(sh);
+                       schedule();
+                       goto again;
+               }
+               clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
                spin_lock_irq(&sh->stripe_lock);
                for (d = 0; d < conf->raid_disks; d++) {
                        if (d == sh->pd_idx || d == sh->qd_idx)
                                goto again;
                        }
                }
+               set_bit(STRIPE_DISCARD, &sh->state);
                finish_wait(&conf->wait_for_overlap, &w);
                for (d = 0; d < conf->raid_disks; d++) {
                        if (d == sh->pd_idx || d == sh->qd_idx)
@@@ -4215,7 -4262,7 +4261,7 @@@ static void make_request(struct mddev *
        }
  
        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 -      last_sector = bi->bi_sector + (bi->bi_size>>9);
 +      last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
  
@@@ -4678,7 -4725,7 +4724,7 @@@ static int  retry_aligned_read(struct r
        logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        sector = raid5_compute_sector(conf, logical_sector,
                                      0, &dd_idx, NULL);
 -      last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
 +      last_sector = bio_end_sector(raid_bio);
  
        for (; logical_sector < last_sector;
             logical_sector += STRIPE_SECTORS,
diff --combined fs/btrfs/extent_io.c
index bed072aa461f3089903987382de616bb143f43ef,cdee391fc7bfd57c596204a8474142e7afe60335..73f2bfe3ac9302091608beae85b4aecf28622240
@@@ -1257,6 -1257,39 +1257,39 @@@ int unlock_extent(struct extent_io_tre
                                GFP_NOFS);
  }
  
+ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+ {
+       unsigned long index = start >> PAGE_CACHE_SHIFT;
+       unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+       struct page *page;
+
+       while (index <= end_index) {
+               page = find_get_page(inode->i_mapping, index);
+               BUG_ON(!page); /* Pages should be in the extent_io_tree */
+               clear_page_dirty_for_io(page);
+               page_cache_release(page);
+               index++;
+       }
+       return 0;
+ }
+
+ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+ {
+       unsigned long index = start >> PAGE_CACHE_SHIFT;
+       unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+       struct page *page;
+
+       while (index <= end_index) {
+               page = find_get_page(inode->i_mapping, index);
+               BUG_ON(!page); /* Pages should be in the extent_io_tree */
+               account_page_redirty(page);
+               __set_page_dirty_nobuffers(page);
+               page_cache_release(page);
+               index++;
+       }
+       return 0;
+ }
+
  /*
   * helper function to set both pages and extents in the tree writeback
   */
@@@ -2527,7 -2560,8 +2560,7 @@@ static int submit_extent_page(int rw, s
                if (old_compressed)
                        contig = bio->bi_sector == sector;
                else
 -                      contig = bio->bi_sector + (bio->bi_size >> 9) ==
 -                              sector;
 +                      contig = bio_end_sector(bio) == sector;
  
                if (prev_bio_flags != bio_flags || !contig ||
                    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
diff --combined fs/btrfs/volumes.c
index d90e0485e01b0442d523f86c3f527f79f00393e3,2854c824ab6443d04eac19cf0b540f79c257cfa0..6789772265707bdd02d97f91dc92e8b34a2fd6bb
@@@ -4935,7 -4935,18 +4935,18 @@@ int btrfs_rmap_block(struct btrfs_mappi
        em = lookup_extent_mapping(em_tree, chunk_start, 1);
        read_unlock(&em_tree->lock);
  
-       BUG_ON(!em || em->start != chunk_start);
+       if (!em) {
+               printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n",
+                      chunk_start);
+               return -EIO;
+       }
+
+       if (em->start != chunk_start) {
+               printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n",
+                      em->start, chunk_start);
+               free_extent_map(em);
+               return -EIO;
+       }
        map = (struct map_lookup *)em->bdev;
  
        length = em->len;
@@@ -5166,7 -5177,7 +5177,7 @@@ static int bio_size_ok(struct block_dev
        }
  
        prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
 -      if ((bio->bi_size >> 9) > max_sectors)
 +      if (bio_sectors(bio) > max_sectors)
                return 0;
  
        if (!q->merge_bvec_fn)