Merge tag 'md-3.8' of git://neil.brown.name/md
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 18 Dec 2012 17:32:44 +0000 (09:32 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 18 Dec 2012 17:32:44 +0000 (09:32 -0800)
Pull md update from Neil Brown:
 "Mostly just little fixes.  Probably biggest part is AVX accelerated
  RAID6 calculations."

* tag 'md-3.8' of git://neil.brown.name/md:
  md/raid5: add blktrace calls
  md/raid5: use async_tx_quiesce() instead of open-coding it.
  md: Use ->curr_resync as last completed request when cleanly aborting resync.
  lib/raid6: build proper files on corresponding arch
  lib/raid6: Add AVX2 optimized gen_syndrome functions
  lib/raid6: Add AVX2 optimized recovery functions
  md: Update checkpoint of resync/recovery based on time.
  md:Add place to update ->recovery_cp.
  md.c: re-indent various 'switch' statements.
  md: close race between removing and adding a device.
  md: removed unused variable in calc_sb_1_csm.

arch/x86/Makefile
drivers/md/md.c
drivers/md/md.h
drivers/md/raid5.c

diff --combined arch/x86/Makefile
index 05afcca66de68cf65c7c354b543881ea96a2cf74,95477aae9ff7fe3ce4d854bcfc458ea0ccc795f9..e71fc4279aab62825524bba4357bd3cb3b6e847e
@@@ -123,9 -123,10 +123,10 @@@ cfi-sections := $(call as-instr,.cfi_se
  # does binutils support specific instructions?
  asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
  avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
+ avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
  
- KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
- KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+ KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
+ KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
  
  LDFLAGS := -m elf_$(UTS_MACHINE)
  
@@@ -142,7 -143,7 +143,7 @@@ KBUILD_CFLAGS += $(call cc-option,-mno-
  KBUILD_CFLAGS += $(mflags-y)
  KBUILD_AFLAGS += $(mflags-y)
  
 -archscripts:
 +archscripts: scripts_basic
        $(Q)$(MAKE) $(build)=arch/x86/tools relocs
  
  ###
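
The avx2_instr probe added above follows the existing as-instr pattern: Kbuild asks the assembler to accept one representative AVX2 instruction (vpbroadcastb) and, if that succeeds, adds -DCONFIG_AS_AVX2=1 to both CFLAGS and AFLAGS so AVX2 code can be compiled conditionally. Whether that code actually runs is still a separate runtime CPU-feature decision. A user-space analogue of the two-level gate, not the kernel's mechanism (HAVE_AVX2 here stands in for CONFIG_AS_AVX2):

/* Build-time gate (HAVE_AVX2 ~ CONFIG_AS_AVX2) plus a runtime CPU check. */
#include <stdio.h>

static void gen_syndrome_generic(void)
{
	puts("using generic gen_syndrome path");
}

#ifdef HAVE_AVX2
static void gen_syndrome_avx2(void)
{
	puts("using AVX2 gen_syndrome path");
}
#endif

int main(void)
{
#ifdef HAVE_AVX2
	/* Compiled only when the toolchain can emit AVX2 ... */
	if (__builtin_cpu_supports("avx2")) {
		/* ... and taken only when the running CPU supports it too. */
		gen_syndrome_avx2();
		return 0;
	}
#endif
	gen_syndrome_generic();
	return 0;
}

Building once with and once without -DHAVE_AVX2 shows both halves of the gate; the kernel's lib/raid6 code makes the equivalent runtime selection with its own CPU-feature checks.
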
diff --combined drivers/md/md.c
index 4843b004c55864233be8f2cc7d1eefe32d9e2c0b,30ba223e5ed0726048801d1af65371e237cd9c60..3db3d1b271f7ef18650a227a8dd07dbc2b8fefbc
@@@ -452,7 -452,7 +452,7 @@@ void md_flush_request(struct mddev *mdd
        spin_lock_irq(&mddev->write_lock);
        wait_event_lock_irq(mddev->sb_wait,
                            !mddev->flush_bio,
 -                          mddev->write_lock, /*nothing*/);
 +                          mddev->write_lock);
        mddev->flush_bio = bio;
        spin_unlock_irq(&mddev->write_lock);
  
@@@ -1414,12 -1414,11 +1414,11 @@@ static __le32 calc_sb_1_csum(struct mdp
        unsigned long long newcsum;
        int size = 256 + le32_to_cpu(sb->max_dev)*2;
        __le32 *isuper = (__le32*)sb;
-       int i;
  
        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;
        newcsum = 0;
-       for (i=0; size>=4; size -= 4 )
+       for (; size >= 4; size -= 4)
                newcsum += le32_to_cpu(*isuper++);
  
        if (size == 2)
@@@ -4124,7 -4123,7 +4123,7 @@@ static struct md_sysfs_entry md_size 
  __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
  
  
 -/* Metdata version.
 +/* Metadata version.
   * This is one of
   *   'none' for arrays with no metadata (good luck...)
   *   'external' for arrays with externally managed metadata,
@@@ -4753,6 -4752,8 +4752,8 @@@ md_attr_store(struct kobject *kobj, str
        }
        mddev_get(mddev);
        spin_unlock(&all_mddevs_lock);
+       if (entry->store == new_dev_store)
+               flush_workqueue(md_misc_wq);
        rv = mddev_lock(mddev);
        if (!rv) {
                rv = entry->store(mddev, page, length);
@@@ -6346,24 -6347,23 +6347,23 @@@ static int md_ioctl(struct block_devic
         * Commands dealing with the RAID driver but not any
         * particular array:
         */
-       switch (cmd)
-       {
-               case RAID_VERSION:
-                       err = get_version(argp);
-                       goto done;
+       switch (cmd) {
+       case RAID_VERSION:
+               err = get_version(argp);
+               goto done;
  
-               case PRINT_RAID_DEBUG:
-                       err = 0;
-                       md_print_devices();
-                       goto done;
+       case PRINT_RAID_DEBUG:
+               err = 0;
+               md_print_devices();
+               goto done;
  
  #ifndef MODULE
-               case RAID_AUTORUN:
-                       err = 0;
-                       autostart_arrays(arg);
-                       goto done;
+       case RAID_AUTORUN:
+               err = 0;
+               autostart_arrays(arg);
+               goto done;
  #endif
-               default:;
+       default:;
        }
  
        /*
                goto abort;
        }
  
+       if (cmd == ADD_NEW_DISK)
+               /* need to ensure md_delayed_delete() has completed */
+               flush_workqueue(md_misc_wq);
        err = mddev_lock(mddev);
        if (err) {
                printk(KERN_INFO 
                goto abort;
        }
  
-       switch (cmd)
-       {
-               case SET_ARRAY_INFO:
-                       {
-                               mdu_array_info_t info;
-                               if (!arg)
-                                       memset(&info, 0, sizeof(info));
-                               else if (copy_from_user(&info, argp, sizeof(info))) {
-                                       err = -EFAULT;
-                                       goto abort_unlock;
-                               }
-                               if (mddev->pers) {
-                                       err = update_array_info(mddev, &info);
-                                       if (err) {
-                                               printk(KERN_WARNING "md: couldn't update"
-                                                      " array info. %d\n", err);
-                                               goto abort_unlock;
-                                       }
-                                       goto done_unlock;
-                               }
-                               if (!list_empty(&mddev->disks)) {
-                                       printk(KERN_WARNING
-                                              "md: array %s already has disks!\n",
-                                              mdname(mddev));
-                                       err = -EBUSY;
-                                       goto abort_unlock;
-                               }
-                               if (mddev->raid_disks) {
-                                       printk(KERN_WARNING
-                                              "md: array %s already initialised!\n",
-                                              mdname(mddev));
-                                       err = -EBUSY;
-                                       goto abort_unlock;
-                               }
-                               err = set_array_info(mddev, &info);
-                               if (err) {
-                                       printk(KERN_WARNING "md: couldn't set"
-                                              " array info. %d\n", err);
-                                       goto abort_unlock;
-                               }
+       if (cmd == SET_ARRAY_INFO) {
+               mdu_array_info_t info;
+               if (!arg)
+                       memset(&info, 0, sizeof(info));
+               else if (copy_from_user(&info, argp, sizeof(info))) {
+                       err = -EFAULT;
+                       goto abort_unlock;
+               }
+               if (mddev->pers) {
+                       err = update_array_info(mddev, &info);
+                       if (err) {
+                               printk(KERN_WARNING "md: couldn't update"
+                                      " array info. %d\n", err);
+                               goto abort_unlock;
                        }
                        goto done_unlock;
-               default:;
+               }
+               if (!list_empty(&mddev->disks)) {
+                       printk(KERN_WARNING
+                              "md: array %s already has disks!\n",
+                              mdname(mddev));
+                       err = -EBUSY;
+                       goto abort_unlock;
+               }
+               if (mddev->raid_disks) {
+                       printk(KERN_WARNING
+                              "md: array %s already initialised!\n",
+                              mdname(mddev));
+                       err = -EBUSY;
+                       goto abort_unlock;
+               }
+               err = set_array_info(mddev, &info);
+               if (err) {
+                       printk(KERN_WARNING "md: couldn't set"
+                              " array info. %d\n", err);
+                       goto abort_unlock;
+               }
+               goto done_unlock;
        }
  
        /*
        /*
         * Commands even a read-only array can execute:
         */
-       switch (cmd)
-       {
-               case GET_BITMAP_FILE:
-                       err = get_bitmap_file(mddev, argp);
-                       goto done_unlock;
+       switch (cmd) {
+       case GET_BITMAP_FILE:
+               err = get_bitmap_file(mddev, argp);
+               goto done_unlock;
  
-               case RESTART_ARRAY_RW:
-                       err = restart_array(mddev);
-                       goto done_unlock;
+       case RESTART_ARRAY_RW:
+               err = restart_array(mddev);
+               goto done_unlock;
  
-               case STOP_ARRAY:
-                       err = do_md_stop(mddev, 0, bdev);
-                       goto done_unlock;
+       case STOP_ARRAY:
+               err = do_md_stop(mddev, 0, bdev);
+               goto done_unlock;
  
-               case STOP_ARRAY_RO:
-                       err = md_set_readonly(mddev, bdev);
-                       goto done_unlock;
+       case STOP_ARRAY_RO:
+               err = md_set_readonly(mddev, bdev);
+               goto done_unlock;
  
-               case BLKROSET:
-                       if (get_user(ro, (int __user *)(arg))) {
-                               err = -EFAULT;
-                               goto done_unlock;
-                       }
-                       err = -EINVAL;
+       case BLKROSET:
+               if (get_user(ro, (int __user *)(arg))) {
+                       err = -EFAULT;
+                       goto done_unlock;
+               }
+               err = -EINVAL;
  
-                       /* if the bdev is going readonly the value of mddev->ro
-                        * does not matter, no writes are coming
-                        */
-                       if (ro)
-                               goto done_unlock;
+               /* if the bdev is going readonly the value of mddev->ro
+                * does not matter, no writes are coming
+                */
+               if (ro)
+                       goto done_unlock;
  
-                       /* are we are already prepared for writes? */
-                       if (mddev->ro != 1)
-                               goto done_unlock;
+               /* are we are already prepared for writes? */
+               if (mddev->ro != 1)
+                       goto done_unlock;
  
-                       /* transitioning to readauto need only happen for
-                        * arrays that call md_write_start
-                        */
-                       if (mddev->pers) {
-                               err = restart_array(mddev);
-                               if (err == 0) {
-                                       mddev->ro = 2;
-                                       set_disk_ro(mddev->gendisk, 0);
-                               }
+               /* transitioning to readauto need only happen for
+                * arrays that call md_write_start
+                */
+               if (mddev->pers) {
+                       err = restart_array(mddev);
+                       if (err == 0) {
+                               mddev->ro = 2;
+                               set_disk_ro(mddev->gendisk, 0);
                        }
-                       goto done_unlock;
+               }
+               goto done_unlock;
        }
  
        /*
                }
        }
  
-       switch (cmd)
+       switch (cmd) {
+       case ADD_NEW_DISK:
        {
-               case ADD_NEW_DISK:
-               {
-                       mdu_disk_info_t info;
-                       if (copy_from_user(&info, argp, sizeof(info)))
-                               err = -EFAULT;
-                       else
-                               err = add_new_disk(mddev, &info);
-                       goto done_unlock;
-               }
+               mdu_disk_info_t info;
+               if (copy_from_user(&info, argp, sizeof(info)))
+                       err = -EFAULT;
+               else
+                       err = add_new_disk(mddev, &info);
+               goto done_unlock;
+       }
  
-               case HOT_REMOVE_DISK:
-                       err = hot_remove_disk(mddev, new_decode_dev(arg));
-                       goto done_unlock;
+       case HOT_REMOVE_DISK:
+               err = hot_remove_disk(mddev, new_decode_dev(arg));
+               goto done_unlock;
  
-               case HOT_ADD_DISK:
-                       err = hot_add_disk(mddev, new_decode_dev(arg));
-                       goto done_unlock;
+       case HOT_ADD_DISK:
+               err = hot_add_disk(mddev, new_decode_dev(arg));
+               goto done_unlock;
  
-               case RUN_ARRAY:
-                       err = do_md_run(mddev);
-                       goto done_unlock;
+       case RUN_ARRAY:
+               err = do_md_run(mddev);
+               goto done_unlock;
  
-               case SET_BITMAP_FILE:
-                       err = set_bitmap_file(mddev, (int)arg);
-                       goto done_unlock;
+       case SET_BITMAP_FILE:
+               err = set_bitmap_file(mddev, (int)arg);
+               goto done_unlock;
  
-               default:
-                       err = -EINVAL;
-                       goto abort_unlock;
+       default:
+               err = -EINVAL;
+               goto abort_unlock;
        }
  
  done_unlock:
@@@ -7184,6 -7180,7 +7180,7 @@@ void md_done_sync(struct mddev *mddev, 
        wake_up(&mddev->recovery_wait);
        if (!ok) {
                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+               set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
                // stop recovery, signal do_sync ....
        }
@@@ -7281,6 -7278,7 +7278,7 @@@ EXPORT_SYMBOL_GPL(md_allow_write)
  
  #define SYNC_MARKS    10
  #define       SYNC_MARK_STEP  (3*HZ)
+ #define UPDATE_FREQUENCY (5*60*HZ)
  void md_do_sync(struct md_thread *thread)
  {
        struct mddev *mddev = thread->mddev;
                 window;
        sector_t max_sectors,j, io_sectors;
        unsigned long mark[SYNC_MARKS];
+       unsigned long update_time;
        sector_t mark_cnt[SYNC_MARKS];
        int last_mark,m;
        struct list_head *tmp;
        mddev->curr_resync_completed = j;
        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
        md_new_event(mddev);
+       update_time = jiffies;
  
        blk_start_plug(&plug);
        while (j < max_sectors) {
                    ((mddev->curr_resync > mddev->curr_resync_completed &&
                      (mddev->curr_resync - mddev->curr_resync_completed)
                      > (max_sectors >> 4)) ||
+                    time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
                     (j - mddev->curr_resync_completed)*2
                     >= mddev->resync_max - mddev->curr_resync_completed
                            )) {
                        wait_event(mddev->recovery_wait,
                                   atomic_read(&mddev->recovery_active) == 0);
                        mddev->curr_resync_completed = j;
+                       if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+                           j > mddev->recovery_cp)
+                               mddev->recovery_cp = j;
+                       update_time = jiffies;
                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
                }
                                        printk(KERN_INFO
                                               "md: checkpointing %s of %s.\n",
                                               desc, mdname(mddev));
-                                       mddev->recovery_cp =
-                                               mddev->curr_resync_completed;
+                                       if (test_bit(MD_RECOVERY_ERROR,
+                                               &mddev->recovery))
+                                               mddev->recovery_cp =
+                                                       mddev->curr_resync_completed;
+                                       else
+                                               mddev->recovery_cp =
+                                                       mddev->curr_resync;
                                }
                        } else
                                mddev->recovery_cp = MaxSector;
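
The md_do_sync() changes above add a second trigger for refreshing the resync checkpoint: besides the existing progress-based thresholds, the checkpoint is now also refreshed once UPDATE_FREQUENCY (five minutes in jiffies) has elapsed, and during a plain resync (MD_RECOVERY_SYNC) ->recovery_cp is advanced to the completed position so an interrupted resync restarts close to where it stopped. A simplified user-space sketch of just that trigger logic, not the kernel code (struct and field names here are illustrative):

/* Decide whether the resync checkpoint should be refreshed now. */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define UPDATE_FREQUENCY 300.0   /* seconds; mirrors the kernel's 5*60*HZ jiffies */

struct resync_progress {
	uint64_t completed_sectors;   /* position recorded at the last checkpoint */
	uint64_t current_sectors;     /* position actually reached so far */
	time_t   last_checkpoint;     /* when the checkpoint was last written */
};

static bool checkpoint_due(const struct resync_progress *p, uint64_t max_sectors)
{
	/* Either enough new progress has accumulated (the existing rule) ... */
	if (p->current_sectors - p->completed_sectors > (max_sectors >> 4))
		return true;
	/* ... or too much wall-clock time has passed (the new rule). */
	return difftime(time(NULL), p->last_checkpoint) >= UPDATE_FREQUENCY;
}

The kernel condition also folds in resync_max handling and only bumps ->recovery_cp when MD_RECOVERY_SYNC is set; the sketch keeps only the timing idea.
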
diff --combined drivers/md/md.h
index 1e2fc3d9c74c5665b7910f76f4e3c86780c0768e,c29e62ebc488ba4570798803f404803ad9c72de5..eca59c3074ef65bee75301f92411d9968adbaf9d
@@@ -307,6 -307,7 +307,7 @@@ struct mddev 
         * REQUEST:  user-space has requested a sync (used with SYNC)
         * CHECK:    user-space request for check-only, no repair
         * RESHAPE:  A reshape is happening
+        * ERROR:    sync-action interrupted because io-error
         *
         * If neither SYNC or RESHAPE are set, then it is a recovery.
         */
  #define       MD_RECOVERY_CHECK       7
  #define MD_RECOVERY_RESHAPE   8
  #define       MD_RECOVERY_FROZEN      9
+ #define       MD_RECOVERY_ERROR       10
  
        unsigned long                   recovery;
        /* If a RAID personality determines that recovery (of a particular
@@@ -551,6 -553,32 +553,6 @@@ struct md_thread 
  
  #define THREAD_WAKEUP  0
  
 -#define __wait_event_lock_irq(wq, condition, lock, cmd)               \
 -do {                                                                  \
 -      wait_queue_t __wait;                                            \
 -      init_waitqueue_entry(&__wait, current);                         \
 -                                                                      \
 -      add_wait_queue(&wq, &__wait);                                   \
 -      for (;;) {                                                      \
 -              set_current_state(TASK_UNINTERRUPTIBLE);                \
 -              if (condition)                                          \
 -                      break;                                          \
 -              spin_unlock_irq(&lock);                                 \
 -              cmd;                                                    \
 -              schedule();                                             \
 -              spin_lock_irq(&lock);                                   \
 -      }                                                               \
 -      current->state = TASK_RUNNING;                                  \
 -      remove_wait_queue(&wq, &__wait);                                \
 -} while (0)
 -
 -#define wait_event_lock_irq(wq, condition, lock, cmd)                         \
 -do {                                                                  \
 -      if (condition)                                                  \
 -              break;                                                  \
 -      __wait_event_lock_irq(wq, condition, lock, cmd);                \
 -} while (0)
 -
  static inline void safe_put_page(struct page *p)
  {
        if (p) put_page(p);
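
The md.h hunk above drops md's private __wait_event_lock_irq()/wait_event_lock_irq() macros in favour of the generic implementation in the core wait headers, which is why every caller in md.c and raid5.c loses the old fourth "cmd" argument (previously passed as /* nothing */). The underlying pattern is unchanged: release the lock, sleep until the condition becomes true, and re-check it with the lock held. A user-space pthread analogue of that pattern, not the kernel macro:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool quiesce_done;

/* Analogue of wait_event_lock_irq(wq, condition, lock): sleep with the
 * lock released, re-evaluate the condition with the lock held. */
static void wait_for_quiesce(void)
{
	pthread_mutex_lock(&lock);
	while (!quiesce_done)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

/* The waker changes the state under the lock and signals, mirroring
 * a wake_up() issued after the condition becomes true. */
static void finish_quiesce(void)
{
	pthread_mutex_lock(&lock);
	quiesce_done = true;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}
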
diff --combined drivers/md/raid5.c
index 8d8555bf3e1d9bb697922334638e487ee9c0f65d,ffebc1e8f483179a809bf6a87132898d8238a5c3..19d77a02663972c3daa9d74295c820d2bb85ba37
@@@ -53,6 -53,8 +53,8 @@@
  #include <linux/cpu.h>
  #include <linux/slab.h>
  #include <linux/ratelimit.h>
+ #include <trace/events/block.h>
  #include "md.h"
  #include "raid5.h"
  #include "raid0.h"
@@@ -182,6 -184,8 +184,8 @@@ static void return_io(struct bio *retur
                return_bi = bi->bi_next;
                bi->bi_next = NULL;
                bi->bi_size = 0;
+               trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+                                        bi, 0);
                bio_endio(bi, 0);
                bi = return_bi;
        }
@@@ -466,7 -470,7 +470,7 @@@ get_active_stripe(struct r5conf *conf, 
        do {
                wait_event_lock_irq(conf->wait_for_stripe,
                                    conf->quiesce == 0 || noquiesce,
 -                                  conf->device_lock, /* nothing */);
 +                                  conf->device_lock);
                sh = __find_stripe(conf, sector, conf->generation - previous);
                if (!sh) {
                        if (!conf->inactive_blocked)
                                                    (atomic_read(&conf->active_stripes)
                                                     < (conf->max_nr_stripes *3/4)
                                                     || !conf->inactive_blocked),
 -                                                  conf->device_lock,
 -                                                  );
 +                                                  conf->device_lock);
                                conf->inactive_blocked = 0;
                        } else
                                init_stripe(sh, sector, previous);
@@@ -670,6 -675,9 +674,9 @@@ static void ops_run_io(struct stripe_he
                        bi->bi_next = NULL;
                        if (rrdev)
                                set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
+                       trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
+                                             bi, disk_devt(conf->mddev->gendisk),
+                                             sh->dev[i].sector);
                        generic_make_request(bi);
                }
                if (rrdev) {
                        rbi->bi_io_vec[0].bv_offset = 0;
                        rbi->bi_size = STRIPE_SIZE;
                        rbi->bi_next = NULL;
+                       trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
+                                             rbi, disk_devt(conf->mddev->gendisk),
+                                             sh->dev[i].sector);
                        generic_make_request(rbi);
                }
                if (!rdev && !rrdev) {
@@@ -1575,7 -1586,7 +1585,7 @@@ static int resize_stripes(struct r5con
         * This happens in stages:
         * 1/ create a new kmem_cache and allocate the required number of
         *    stripe_heads.
 -       * 2/ gather all the old stripe_heads and tranfer the pages across
 +       * 2/ gather all the old stripe_heads and transfer the pages across
         *    to the new stripe_heads.  This will have the side effect of
         *    freezing the array as once all stripe_heads have been collected,
         *    no IO will be possible.  Old stripe heads are freed once their
                spin_lock_irq(&conf->device_lock);
                wait_event_lock_irq(conf->wait_for_stripe,
                                    !list_empty(&conf->inactive_list),
 -                                  conf->device_lock,
 -                                  );
 +                                  conf->device_lock);
                osh = get_free_stripe(conf);
                spin_unlock_irq(&conf->device_lock);
                atomic_set(&nsh->count, 1);
@@@ -2853,8 -2865,10 +2863,10 @@@ static void handle_stripe_dirtying(stru
        pr_debug("for sector %llu, rmw=%d rcw=%d\n",
                (unsigned long long)sh->sector, rmw, rcw);
        set_bit(STRIPE_HANDLE, &sh->state);
-       if (rmw < rcw && rmw > 0)
+       if (rmw < rcw && rmw > 0) {
                /* prefer read-modify-write, but need to get some data */
+               blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d",
+                                 (unsigned long long)sh->sector, rmw);
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
                        if ((dev->towrite || i == sh->pd_idx) &&
                                if (
                                  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
                                        pr_debug("Read_old block "
-                                               "%d for r-m-w\n", i);
+                                                "%d for r-m-w\n", i);
                                        set_bit(R5_LOCKED, &dev->flags);
                                        set_bit(R5_Wantread, &dev->flags);
                                        s->locked++;
                                }
                        }
                }
+       }
        if (rcw <= rmw && rcw > 0) {
                /* want reconstruct write, but need to get some data */
+               int qread =0;
                rcw = 0;
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
                                        set_bit(R5_LOCKED, &dev->flags);
                                        set_bit(R5_Wantread, &dev->flags);
                                        s->locked++;
+                                       qread++;
                                } else {
                                        set_bit(STRIPE_DELAYED, &sh->state);
                                        set_bit(STRIPE_HANDLE, &sh->state);
                                }
                        }
                }
+               if (rcw)
+                       blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
+                                         (unsigned long long)sh->sector,
+                                         rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
        }
        /* now if nothing is locked, and if we have enough data,
         * we can start a write request
@@@ -3222,10 -3243,7 +3241,7 @@@ static void handle_stripe_expansion(str
  
                }
        /* done submitting copies, wait for them to complete */
-       if (tx) {
-               async_tx_ack(tx);
-               dma_wait_for_async_tx(tx);
-       }
+       async_tx_quiesce(&tx);
  }
  
  /*
@@@ -3901,6 -3919,8 +3917,8 @@@ static void raid5_align_endio(struct bi
        rdev_dec_pending(rdev, conf->mddev);
  
        if (!error && uptodate) {
+               trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
+                                        raid_bi, 0);
                bio_endio(raid_bi, 0);
                if (atomic_dec_and_test(&conf->active_aligned_reads))
                        wake_up(&conf->wait_for_stripe);
@@@ -4001,10 -4021,13 +4019,13 @@@ static int chunk_aligned_read(struct md
                spin_lock_irq(&conf->device_lock);
                wait_event_lock_irq(conf->wait_for_stripe,
                                    conf->quiesce == 0,
 -                                  conf->device_lock, /* nothing */);
 +                                  conf->device_lock);
                atomic_inc(&conf->active_aligned_reads);
                spin_unlock_irq(&conf->device_lock);
  
+               trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
+                                     align_bi, disk_devt(mddev->gendisk),
+                                     raid_bio->bi_sector);
                generic_make_request(align_bi);
                return 1;
        } else {
@@@ -4079,6 -4102,7 +4100,7 @@@ static void raid5_unplug(struct blk_plu
        struct stripe_head *sh;
        struct mddev *mddev = cb->cb.data;
        struct r5conf *conf = mddev->private;
+       int cnt = 0;
  
        if (cb->list.next && !list_empty(&cb->list)) {
                spin_lock_irq(&conf->device_lock);
                        smp_mb__before_clear_bit();
                        clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
                        __release_stripe(conf, sh);
+                       cnt++;
                }
                spin_unlock_irq(&conf->device_lock);
        }
+       trace_block_unplug(mddev->queue, cnt, !from_schedule);
        kfree(cb);
  }
  
@@@ -4353,6 -4379,8 +4377,8 @@@ static void make_request(struct mddev *
                if ( rw == WRITE )
                        md_write_end(mddev);
  
+               trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+                                        bi, 0);
                bio_endio(bi, 0);
        }
  }
@@@ -4729,8 -4757,11 +4755,11 @@@ static int  retry_aligned_read(struct r
                handled++;
        }
        remaining = raid5_dec_bi_active_stripes(raid_bio);
-       if (remaining == 0)
+       if (remaining == 0) {
+               trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
+                                        raid_bio, 0);
                bio_endio(raid_bio, 0);
+       }
        if (atomic_dec_and_test(&conf->active_aligned_reads))
                wake_up(&conf->wait_for_stripe);
        return handled;
@@@ -6093,7 -6124,7 +6122,7 @@@ static void raid5_quiesce(struct mddev 
                wait_event_lock_irq(conf->wait_for_stripe,
                                    atomic_read(&conf->active_stripes) == 0 &&
                                    atomic_read(&conf->active_aligned_reads) == 0,
 -                                  conf->device_lock, /* nothing */);
 +                                  conf->device_lock);
                conf->quiesce = 1;
                spin_unlock_irq(&conf->device_lock);
                /* allow reshape to continue */
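
Most of the raid5.c additions are blktrace hooks: trace_block_bio_remap() and trace_block_bio_complete() around bios the driver issues and completes, blk_add_trace_msg() notes recording whether handle_stripe_dirtying() chose read-modify-write or reconstruct-write, and trace_block_unplug() counting stripes released per unplug. The rmw/rcw choice those messages report comes down to which strategy needs fewer reads for a partial-stripe write; a deliberately simplified model of that trade-off (the driver's per-device accounting is more detailed):

/* Compare the read cost of the two RAID5 partial-write strategies. */
#include <stdio.h>

/* read-modify-write: read the dirty data blocks plus the old parity */
static unsigned rmw_reads(unsigned data_disks, unsigned dirty)
{
	(void)data_disks;
	return dirty + 1;
}

/* reconstruct-write: read the data blocks that are NOT being overwritten */
static unsigned rcw_reads(unsigned data_disks, unsigned dirty)
{
	return data_disks - dirty;
}

int main(void)
{
	unsigned data_disks = 7, dirty = 2;   /* e.g. 8-disk RAID5, 2 blocks dirty */
	unsigned rmw = rmw_reads(data_disks, dirty);
	unsigned rcw = rcw_reads(data_disks, dirty);

	printf("rmw needs %u reads, rcw needs %u reads -> prefer %s\n",
	       rmw, rcw, rmw < rcw ? "rmw" : "rcw");
	return 0;
}
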