[v3,06/15] md/raid5: Factor out helper from raid5_make_request() loop

Message ID	20220616191945.23935-7-logang@deltatee.com (mailing list archive)
State	Accepted, archived
Headers	show
Return-Path: <linux-raid-owner@kernel.org>
From: Logan Gunthorpe <logang@deltatee.com>
To: linux-kernel@vger.kernel.org, linux-raid@vger.kernel.org, Song Liu <song@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>, Guoqing Jiang <guoqing.jiang@linux.dev>, Stephen Bates <sbates@raithlin.com>, Martin Oliveira <Martin.Oliveira@eideticom.com>, David Sloan <David.Sloan@eideticom.com>, Logan Gunthorpe <logang@deltatee.com>
Date: Thu, 16 Jun 2022 13:19:36 -0600
Message-Id: <20220616191945.23935-7-logang@deltatee.com>
In-Reply-To: <20220616191945.23935-1-logang@deltatee.com>
References: <20220616191945.23935-1-logang@deltatee.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Subject: [PATCH v3 06/15] md/raid5: Factor out helper from raid5_make_request() loop
Precedence: bulk
Series	Improve Raid5 Lock Contention | expand
[v3,00/15] Improve Raid5 Lock Contention
[v3,01/15] md/raid5: Make logic blocking check consistent with logic that blocks
[v3,02/15] md/raid5: Factor out ahead_of_reshape() function
[v3,03/15] md/raid5: Refactor raid5_make_request loop
[v3,04/15] md/raid5: Move stripe_add_to_batch_list() call out of add_stripe_bio()
[v3,05/15] md/raid5: Move common stripe get code into new find_get_stripe() helper
[v3,06/15] md/raid5: Factor out helper from raid5_make_request() loop
[v3,07/15] md/raid5: Drop the do_prepare flag in raid5_make_request()
[v3,08/15] md/raid5: Move read_seqcount_begin() into make_stripe_request()
[v3,09/15] md/raid5: Refactor for loop in raid5_make_request() into while loop
[v3,10/15] md/raid5: Keep a reference to last stripe_head for batch
[v3,11/15] md/raid5: Refactor add_stripe_bio()
[v3,12/15] md/raid5: Check all disks in a stripe_head for reshape progress
[v3,13/15] md/raid5: Pivot raid5_make_request()
[v3,14/15] md/raid5: Improve debug prints
[v3,15/15] md/raid5: Increase restriction on max segments per request

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1bbf87d15bc8..26ef292842de 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5786,17 +5786,139 @@ static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
 					  sector >= reshape_sector;
 }
 
+enum stripe_result {
+	STRIPE_SUCCESS = 0,
+	STRIPE_RETRY,
+	STRIPE_SCHEDULE_AND_RETRY,
+	STRIPE_FAIL,
+};
+
+struct stripe_request_ctx {
+	/* the request had REQ_PREFLUSH, cleared after the first stripe_head */
+	bool do_flush;
+};
+
+static enum stripe_result make_stripe_request(struct mddev *mddev,
+		struct r5conf *conf, struct stripe_request_ctx *ctx,
+		sector_t logical_sector, struct bio *bi, int seq)
+{
+	const int rw = bio_data_dir(bi);
+	enum stripe_result ret;
+	struct stripe_head *sh;
+	sector_t new_sector;
+	int previous = 0;
+	int dd_idx;
+
+	if (unlikely(conf->reshape_progress != MaxSector)) {
+		/*
+		 * Spinlock is needed as reshape_progress may be
+		 * 64bit on a 32bit platform, and so it might be
+		 * possible to see a half-updated value
+		 * Of course reshape_progress could change after
+		 * the lock is dropped, so once we get a reference
+		 * to the stripe that we think it is, we will have
+		 * to check again.
+		 */
+		spin_lock_irq(&conf->device_lock);
+		if (ahead_of_reshape(mddev, logical_sector,
+				     conf->reshape_progress)) {
+			previous = 1;
+		} else {
+			if (ahead_of_reshape(mddev, logical_sector,
+					     conf->reshape_safe)) {
+				spin_unlock_irq(&conf->device_lock);
+				return STRIPE_SCHEDULE_AND_RETRY;
+			}
+		}
+		spin_unlock_irq(&conf->device_lock);
+	}
+
+	new_sector = raid5_compute_sector(conf, logical_sector, previous,
+					  &dd_idx, NULL);
+	pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
+		 new_sector, logical_sector);
+
+	sh = raid5_get_active_stripe(conf, new_sector, previous,
+				     (bi->bi_opf & REQ_RAHEAD), 0);
+	if (unlikely(!sh)) {
+		/* cannot get stripe, just give-up */
+		bi->bi_status = BLK_STS_IOERR;
+		return STRIPE_FAIL;
+	}
+
+	if (unlikely(previous)) {
+		/*
+		 * Expansion might have moved on while waiting for a
+		 * stripe, so we must do the range check again.
+		 * Expansion could still move past after this
+		 * test, but as we are holding a reference to
+		 * 'sh', we know that if that happens,
+		 * STRIPE_EXPANDING will get set and the expansion
+		 * won't proceed until we finish with the stripe.
+		 */
+		int must_retry = 0;
+		spin_lock_irq(&conf->device_lock);
+		if (!ahead_of_reshape(mddev, logical_sector,
+				      conf->reshape_progress))
+			/* mismatch, need to try again */
+			must_retry = 1;
+		spin_unlock_irq(&conf->device_lock);
+		if (must_retry) {
+			ret = STRIPE_SCHEDULE_AND_RETRY;
+			goto out_release;
+		}
+	}
+
+	if (read_seqcount_retry(&conf->gen_lock, seq)) {
+		/* Might have got the wrong stripe_head by accident */
+		ret = STRIPE_RETRY;
+		goto out_release;
+	}
+
+	if (test_bit(STRIPE_EXPANDING, &sh->state) ||
+	    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
+		/*
+		 * Stripe is busy expanding or add failed due to
+		 * overlap. Flush everything and wait a while.
+		 */
+		md_wakeup_thread(mddev->thread);
+		ret = STRIPE_SCHEDULE_AND_RETRY;
+		goto out_release;
+	}
+
+	if (stripe_can_batch(sh))
+		stripe_add_to_batch_list(conf, sh);
+
+	if (ctx->do_flush) {
+		set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+		/* we only need flush for one stripe */
+		ctx->do_flush = false;
+	}
+
+	set_bit(STRIPE_HANDLE, &sh->state);
+	clear_bit(STRIPE_DELAYED, &sh->state);
+	if ((!sh->batch_head || sh == sh->batch_head) &&
+	    (bi->bi_opf & REQ_SYNC) &&
+	    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+		atomic_inc(&conf->preread_active_stripes);
+
+	release_stripe_plug(mddev, sh);
+	return STRIPE_SUCCESS;
+
+out_release:
+	raid5_release_stripe(sh);
+	return ret;
+}
+
 static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 {
 	struct r5conf *conf = mddev->private;
-	int dd_idx;
-	sector_t new_sector;
 	sector_t logical_sector, last_sector;
-	struct stripe_head *sh;
+	struct stripe_request_ctx ctx = {};
 	const int rw = bio_data_dir(bi);
+	enum stripe_result res;
 	DEFINE_WAIT(w);
 	bool do_prepare;
-	bool do_flush = false;
 
 	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
 		int ret = log_handle_flush_request(conf, bi);
@@ -5812,7 +5934,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 		 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
 		 * we need to flush journal device
 		 */
-		do_flush = bi->bi_opf & REQ_PREFLUSH;
+		ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}
 
 	if (!md_write_start(mddev, bi))
@@ -5852,117 +5974,30 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	md_account_bio(mddev, &bi);
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
-		int previous;
 		int seq;
 
 		do_prepare = false;
 	retry:
 		seq = read_seqcount_begin(&conf->gen_lock);
-		previous = 0;
 		if (do_prepare)
 			prepare_to_wait(&conf->wait_for_overlap, &w,
 				TASK_UNINTERRUPTIBLE);
-		if (unlikely(conf->reshape_progress != MaxSector)) {
-			/* spinlock is needed as reshape_progress may be
-			 * 64bit on a 32bit platform, and so it might be
-			 * possible to see a half-updated value
-			 * Of course reshape_progress could change after
-			 * the lock is dropped, so once we get a reference
-			 * to the stripe that we think it is, we will have
-			 * to check again.
-			 */
-			spin_lock_irq(&conf->device_lock);
-			if (ahead_of_reshape(mddev, logical_sector,
-					     conf->reshape_progress)) {
-				previous = 1;
-			} else {
-				if (ahead_of_reshape(mddev, logical_sector,
-						     conf->reshape_safe)) {
-					spin_unlock_irq(&conf->device_lock);
-					schedule();
-					do_prepare = true;
-					goto retry;
-				}
-			}
-			spin_unlock_irq(&conf->device_lock);
-		}
-
-		new_sector = raid5_compute_sector(conf, logical_sector,
-						  previous,
-						  &dd_idx, NULL);
-		pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
-			(unsigned long long)new_sector,
-			(unsigned long long)logical_sector);
 
-		sh = raid5_get_active_stripe(conf, new_sector, previous,
-					     (bi->bi_opf & REQ_RAHEAD), 0);
-		if (unlikely(!sh)) {
-			/* cannot get stripe, just give-up */
-			bi->bi_status = BLK_STS_IOERR;
+		res = make_stripe_request(mddev, conf, &ctx, logical_sector,
+					  bi, seq);
+		if (res == STRIPE_FAIL)
 			break;
-		}
-
-		if (unlikely(previous)) {
-			/* expansion might have moved on while waiting for a
-			 * stripe, so we must do the range check again.
-			 * Expansion could still move past after this
-			 * test, but as we are holding a reference to
-			 * 'sh', we know that if that happens,
-			 * STRIPE_EXPANDING will get set and the expansion
-			 * won't proceed until we finish with the stripe.
-			 */
-			int must_retry = 0;
-			spin_lock_irq(&conf->device_lock);
-			if (!ahead_of_reshape(mddev, logical_sector,
-					      conf->reshape_progress))
-				/* mismatch, need to try again */
-				must_retry = 1;
-			spin_unlock_irq(&conf->device_lock);
-			if (must_retry) {
-				raid5_release_stripe(sh);
-				schedule();
-				do_prepare = true;
-				goto retry;
-			}
-		}
 
-		if (read_seqcount_retry(&conf->gen_lock, seq)) {
-			/* Might have got the wrong stripe_head by accident */
-			raid5_release_stripe(sh);
+		if (res == STRIPE_RETRY)
 			goto retry;
-		}
 
-		if (test_bit(STRIPE_EXPANDING, &sh->state) ||
-		    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
-			/*
-			 * Stripe is busy expanding or add failed due to
-			 * overlap. Flush everything and wait a while.
-			 */
-			md_wakeup_thread(mddev->thread);
-			raid5_release_stripe(sh);
+		if (res == STRIPE_SCHEDULE_AND_RETRY) {
 			schedule();
 			do_prepare = true;
 			goto retry;
 		}
-
-		if (stripe_can_batch(sh))
-			stripe_add_to_batch_list(conf, sh);
-
-		if (do_flush) {
-			set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
-			/* we only need flush for one stripe */
-			do_flush = false;
-		}
-
-		set_bit(STRIPE_HANDLE, &sh->state);
-		clear_bit(STRIPE_DELAYED, &sh->state);
-		if ((!sh->batch_head || sh == sh->batch_head) &&
-		    (bi->bi_opf & REQ_SYNC) &&
-		    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-			atomic_inc(&conf->preread_active_stripes);
-
-		release_stripe_plug(mddev, sh);
 	}
+
 	finish_wait(&conf->wait_for_overlap, &w);
 
 	if (rw == WRITE)

[v3,06/15] md/raid5: Factor out helper from raid5_make_request() loop

Commit Message

Patch