raid5: add a per-stripe lock
author Shaohua Li <shli@kernel.org>
Thu, 19 Jul 2012 06:01:31 +0000 (16:01 +1000)
committer NeilBrown <neilb@suse.de>
Thu, 19 Jul 2012 06:01:31 +0000 (16:01 +1000)
Add a per-stripe lock to protect stripe-specific data. The purpose is to reduce
lock contention on conf->device_lock.

The stripe ->toread and ->towrite lists are protected by the per-stripe lock.
Access to a stripe's bio lists is always serialized by this lock, so adding a
bio to the lists (add_stripe_bio()) and removing a bio from them (as in
ops_run_biofill()) do not race.
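
A minimal sketch of the two critical sections this serializes (simplified; the
real add_stripe_bio() keeps the list sorted by sector and checks for overlap):

    /* producer side, as in add_stripe_bio(): queue a read bio on the stripe */
    spin_lock_irq(&sh->stripe_lock);
    bi->bi_next = sh->dev[dd_idx].toread;
    sh->dev[dd_idx].toread = bi;
    spin_unlock_irq(&sh->stripe_lock);

    /* consumer side, as in ops_run_biofill(): detach the whole list at once */
    spin_lock_irq(&sh->stripe_lock);
    rbi = dev->toread;
    dev->toread = NULL;
    spin_unlock_irq(&sh->stripe_lock);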

If the bios on the ->read, ->written ... lists are not shared by multiple
stripes, we don't need any lock to protect ->read and ->written, because
STRIPE_ACTIVE protects them. If the bios are shared, there are two protections:
1. bi_phys_segments acts as a reference count
2. traversal of the lists uses r5_next_bio, so a traversal never touches a bio
   that does not belong to the stripe

Let's have an example:
|  stripe1 |  stripe2    |  stripe3  |
...bio1......|bio2|bio3|....bio4.....

stripe2 overlaps 4 bios. When it finishes, it decrements bi_phys_segments for
all of them, but calls end_bio only for bio2 and bio3. bio1->bi_next still
points to bio2, but this doesn't matter: when stripe1 finishes, the r5_next_bio
check keeps it from touching bio2. Later, stripe1 will end_bio bio1 and stripe3
will end_bio bio4.
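
For reference, r5_next_bio() (as defined in raid5.h around this time; quoted
from memory, so treat it as a sketch) stops the traversal as soon as the next
bio no longer starts inside this stripe:

    static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
    {
            int sectors = bio->bi_size >> 9;

            if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
                    return bio->bi_next;
            else
                    return NULL;
    }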

Before add_stripe_bio() adds a bio to a stripe, we have already incremented the
bio's bi_phys_segments, so we don't need to worry about another stripe releasing
the bio.
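
Conceptually the reference counting works like the sketch below (the helper
names are made up for illustration; the driver wraps bi_phys_segments in its
own raid5_*_bi_phys_segments() accessors rather than touching the field
directly):

    /* illustration only: not the driver's actual helpers */
    static inline void stripe_bio_get(struct bio *bio)
    {
            bio->bi_phys_segments++;        /* taken before add_stripe_bio() */
    }

    /* each stripe calls this once it is completely done with the bio */
    static inline void stripe_bio_put(struct bio *bio, struct bio **return_bi)
    {
            if (--bio->bi_phys_segments == 0) {
                    /* last stripe finished: hand the bio back for bi_end_io() */
                    bio->bi_next = *return_bi;
                    *return_bi = bio;
            }
    }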

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
drivers/md/raid5.c
drivers/md/raid5.h

index 9ad452c6d7e3e074cc0987a86b9197043f7fc9ed..c2192a2907e4b4315f6501d5c5c20c3ce1181997 100644
@@ -762,14 +762,12 @@ static void ops_complete_biofill(void *stripe_head_ref)
 {
        struct stripe_head *sh = stripe_head_ref;
        struct bio *return_bi = NULL;
-       struct r5conf *conf = sh->raid_conf;
        int i;
 
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
        /* clear completed biofills */
-       spin_lock_irq(&conf->device_lock);
        for (i = sh->disks; i--; ) {
                struct r5dev *dev = &sh->dev[i];
 
@@ -795,7 +793,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
                        }
                }
        }
-       spin_unlock_irq(&conf->device_lock);
        clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
        return_io(return_bi);
@@ -807,7 +804,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
 static void ops_run_biofill(struct stripe_head *sh)
 {
        struct dma_async_tx_descriptor *tx = NULL;
-       struct r5conf *conf = sh->raid_conf;
        struct async_submit_ctl submit;
        int i;
 
@@ -818,10 +814,10 @@ static void ops_run_biofill(struct stripe_head *sh)
                struct r5dev *dev = &sh->dev[i];
                if (test_bit(R5_Wantfill, &dev->flags)) {
                        struct bio *rbi;
-                       spin_lock_irq(&conf->device_lock);
+                       spin_lock_irq(&sh->stripe_lock);
                        dev->read = rbi = dev->toread;
                        dev->toread = NULL;
-                       spin_unlock_irq(&conf->device_lock);
+                       spin_unlock_irq(&sh->stripe_lock);
                        while (rbi && rbi->bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                tx = async_copy_data(0, rbi, dev->page,
@@ -1157,12 +1153,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
                if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
                        struct bio *wbi;
 
-                       spin_lock_irq(&sh->raid_conf->device_lock);
+                       spin_lock_irq(&sh->stripe_lock);
                        chosen = dev->towrite;
                        dev->towrite = NULL;
                        BUG_ON(dev->written);
                        wbi = dev->written = chosen;
-                       spin_unlock_irq(&sh->raid_conf->device_lock);
+                       spin_unlock_irq(&sh->stripe_lock);
 
                        while (wbi && wbi->bi_sector <
                                dev->sector + STRIPE_SECTORS) {
@@ -1467,6 +1463,8 @@ static int grow_one_stripe(struct r5conf *conf)
        init_waitqueue_head(&sh->ops.wait_for_ops);
        #endif
 
+       spin_lock_init(&sh->stripe_lock);
+
        if (grow_buffers(sh)) {
                shrink_buffers(sh);
                kmem_cache_free(conf->slab_cache, sh);
@@ -2353,8 +2351,15 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                (unsigned long long)bi->bi_sector,
                (unsigned long long)sh->sector);
 
-
-       spin_lock_irq(&conf->device_lock);
+       /*
+        * If several bios share a stripe, the bio bi_phys_segments acts as a
+        * reference count to avoid races. The reference count should already be
+        * increased before this function is called (for example, in
+        * make_request()), so other bios sharing this stripe will not free the
+        * stripe. If a bio is owned by only one stripe, the stripe lock will
+        * protect it.
+        */
+       spin_lock_irq(&sh->stripe_lock);
        if (forwrite) {
                bip = &sh->dev[dd_idx].towrite;
                if (*bip == NULL)
@@ -2388,7 +2393,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
                        set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
        }
-       spin_unlock_irq(&conf->device_lock);
+       spin_unlock_irq(&sh->stripe_lock);
 
        pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
                (unsigned long long)(*bip)->bi_sector,
@@ -2404,7 +2409,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 
  overlap:
        set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
-       spin_unlock_irq(&conf->device_lock);
+       spin_unlock_irq(&sh->stripe_lock);
        return 0;
 }
 
@@ -2454,11 +2459,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                                rdev_dec_pending(rdev, conf->mddev);
                        }
                }
-               spin_lock_irq(&conf->device_lock);
+               spin_lock_irq(&sh->stripe_lock);
                /* fail all writes first */
                bi = sh->dev[i].towrite;
                sh->dev[i].towrite = NULL;
-               spin_unlock_irq(&conf->device_lock);
+               spin_unlock_irq(&sh->stripe_lock);
                if (bi) {
                        s->to_write--;
                        bitmap_end = 1;
@@ -3192,7 +3197,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
        /* Now to look around and see what can be done */
        rcu_read_lock();
-       spin_lock_irq(&conf->device_lock);
        for (i=disks; i--; ) {
                struct md_rdev *rdev;
                sector_t first_bad;
@@ -3338,7 +3342,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
                                do_recovery = 1;
                }
        }
-       spin_unlock_irq(&conf->device_lock);
        if (test_bit(STRIPE_SYNCING, &sh->state)) {
                /* If there is a failed device being replaced,
                 *     we must be recovering.
index 2164021f3b5f6548b96ac067aac67bd7874dcd10..f03fb3395183bdb580923e9154d0b2405cf3b4f3 100644
@@ -210,6 +210,7 @@ struct stripe_head {
        int                     disks;          /* disks in stripe */
        enum check_states       check_state;
        enum reconstruct_states reconstruct_state;
+       spinlock_t              stripe_lock;
        /**
         * struct stripe_operations
         * @target - STRIPE_OP_COMPUTE_BLK target