From: NeilBrown Date: Sun, 1 Apr 2012 13:48:38 +0000 (+1000) Subject: md/raid5: fix handling of bad blocks during recovery. X-Git-Tag: firefly_0821_release~3680^2~3014^2~10 X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=18b9837ea0dc3cf844c6c4196871ce91d047bddb;p=firefly-linux-kernel-4.4.55.git md/raid5: fix handling of bad blocks during recovery. 1/ We can only treat a known-bad-block like a read-error if we have the data that belongs in that block. So fix that test. 2/ If we cannot recovery a stripe due to insufficient data, don't tell "md_done_sync" that the sync failed unless we really did fail something. If we successfully record bad blocks, that is success. Reported-by: "majianpeng" Signed-off-by: NeilBrown --- diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 23ac880bba9a..9799be80bf31 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2471,39 +2471,41 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, int abort = 0; int i; - md_done_sync(conf->mddev, STRIPE_SECTORS, 0); clear_bit(STRIPE_SYNCING, &sh->state); s->syncing = 0; s->replacing = 0; /* There is nothing more to do for sync/check/repair. + * Don't even need to abort as that is handled elsewhere + * if needed, and not always wanted e.g. if there is a known + * bad block here. * For recover/replace we need to record a bad block on all * non-sync devices, or abort the recovery */ - if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) - return; - /* During recovery devices cannot be removed, so locking and - * refcounting of rdevs is not needed - */ - for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = conf->disks[i].rdev; - if (rdev - && !test_bit(Faulty, &rdev->flags) - && !test_bit(In_sync, &rdev->flags) - && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) - abort = 1; - rdev = conf->disks[i].replacement; - if (rdev - && !test_bit(Faulty, &rdev->flags) - && !test_bit(In_sync, &rdev->flags) - && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) - abort = 1; - } - if (abort) { - conf->recovery_disabled = conf->mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery); + if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { + /* During recovery devices cannot be removed, so + * locking and refcounting of rdevs is not needed + */ + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = conf->disks[i].rdev; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + abort = 1; + rdev = conf->disks[i].replacement; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + abort = 1; + } + if (abort) + conf->recovery_disabled = + conf->mddev->recovery_disabled; } + md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); } static int want_replace(struct stripe_head *sh, int disk_idx) @@ -3203,7 +3205,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) /* Not in-sync */; else if (is_bad) { /* also not in-sync */ - if (!test_bit(WriteErrorSeen, &rdev->flags)) { + if (!test_bit(WriteErrorSeen, &rdev->flags) && + test_bit(R5_UPTODATE, &dev->flags)) { /* treat as in-sync, but with a read error * which we can now try to correct */