[08/14] btrfs: scrub: extract the common preparation before scrubbing a block group

Message ID f0cd2611725d035f479a01161dc4afc4d8c07cea.1688368617.git.wqu@suse.com (mailing list archive)
State New, archived
Series btrfs: scrub: introduce SCRUB_LOGICAL flag

Commit Message

Qu Wenruo July 3, 2023, 7:32 a.m. UTC
Introduce a new helper, prepare_scrub_block_group(), to handle the
checks before scrubbing a block group.

For now the helper is only called by scrub_enumerate_chunks(), but it
will be reused by the scrub_logical feature later in this series.
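
The calling convention is easiest to see in isolation. Below is a
minimal stand-alone sketch (userspace C, not kernel code) of how a
caller is expected to consume the helper's return values;
prepare_bg_stub() and its trivial body are hypothetical stand-ins for
the real helper and btrfs state:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for prepare_scrub_block_group(). */
static int prepare_bg_stub(bool *ro_set)
{
	*ro_set = true;	/* pretend the bg got marked read-only */
	return 0;	/* 0: prepared, >0: skip the bg, <0: error */
}

int main(void)
{
	bool ro_set = false;
	int ret = prepare_bg_stub(&ro_set);

	if (ret > 0)
		printf("skip this block group\n");
	else if (ret < 0)
		printf("abort enumeration: %d\n", ret);
	else
		printf("scrub the block group, ro_set=%d\n", ro_set);
	return 0;
}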

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/scrub.c | 262 ++++++++++++++++++++++++++---------------------
 1 file changed, 145 insertions(+), 117 deletions(-)
Patch

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ff637f83aa0e..806c4683a7ef 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2343,6 +2343,141 @@  static int finish_extent_writes_for_zoned(struct btrfs_root *root,
 	return btrfs_commit_transaction(trans);
 }
 
+/*
+ * Do the preparation before we scrub the block group.
+ *
+ * Return >0 if we don't need to scrub the block group.
+ * Return 0 if the preparation is done, with @ro_set_ret properly set.
+ * Return <0 for errors, in which case the block group can not be scrubbed.
+ */
+static int prepare_scrub_block_group(struct scrub_ctx *sctx,
+				     struct btrfs_block_group *bg,
+				     bool *ro_set_ret)
+{
+	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	struct btrfs_root *root = fs_info->dev_root;
+	bool ro_set = false;
+	int ret;
+
+	if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
+		if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &bg->runtime_flags))
+			return 1;
+	}
+
+	/*
+	 * Make sure that while we are scrubbing the corresponding block
+	 * group doesn't get its logical address and its device extents
+	 * reused for another block group, which can possibly be of a
+	 * different type and different profile. We do this to prevent
+	 * false error detections and crashes due to bogus attempts to
+	 * repair extents.
+	 */
+	spin_lock(&bg->lock);
+	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
+		spin_unlock(&bg->lock);
+		return 1;
+	}
+	btrfs_freeze_block_group(bg);
+	spin_unlock(&bg->lock);
+
+	/*
+	 * We need to call btrfs_inc_block_group_ro() with scrub paused,
+	 * to avoid a deadlock caused by:
+	 * btrfs_inc_block_group_ro()
+	 * -> btrfs_wait_for_commit()
+	 * -> btrfs_commit_transaction()
+	 * -> btrfs_scrub_pause()
+	 */
+	scrub_pause_on(fs_info);
+
+	/*
+	 * Don't do chunk preallocation for scrub.
+	 *
+	 * This is especially important for SYSTEM bgs, or we can hit
+	 * -EFBIG from btrfs_finish_chunk_alloc() like:
+	 * 1. The only SYSTEM bg is marked RO.
+	 *    Since the SYSTEM bg is small, that's pretty common.
+	 * 2. A new SYSTEM bg will be allocated,
+	 *    since the regular path would allocate a new chunk.
+	 * 3. The new SYSTEM bg is empty and will get cleaned up.
+	 *    Before the cleanup really happens, it's marked RO again.
+	 * 4. The empty SYSTEM bg gets scrubbed.
+	 *    We go back to 2.
+	 *
+	 * This can easily boost the number of SYSTEM chunks if the
+	 * cleaner thread can't be triggered fast enough, and use up
+	 * all the space of btrfs_super_block::sys_chunk_array.
+	 *
+	 * While for dev-replace, we need to try our best to mark the
+	 * block group RO, to prevent the race between:
+	 * - Write duplication
+	 *   Contains the latest data.
+	 * - Scrub copy
+	 *   Contains data from the commit tree.
+	 *
+	 * If the target block group is not marked RO, nocow writes can
+	 * be overwritten by the scrub copy, causing data corruption.
+	 * So for dev-replace, it's not allowed to continue if a block
+	 * group is not RO.
+	 */
+	ret = btrfs_inc_block_group_ro(bg, sctx->is_dev_replace);
+	if (!ret && sctx->is_dev_replace) {
+		ret = finish_extent_writes_for_zoned(root, bg);
+		if (ret) {
+			btrfs_dec_block_group_ro(bg);
+			scrub_pause_off(fs_info);
+			return ret;
+		}
+	}
+
+	if (ret == 0) {
+		ro_set = true;
+	} else if (ret == -ENOSPC && !sctx->is_dev_replace &&
+		   !(bg->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
+		/*
+		 * btrfs_inc_block_group_ro() returns -ENOSPC when it
+		 * fails to create a new chunk for metadata.
+		 * That is not a problem for scrub, because metadata
+		 * is always COWed, and our scrub pauses
+		 * commit_transactions.
+		 *
+		 * For RAID56 chunks, we have to mark them read-only
+		 * for scrub, as later we would use our own cache
+		 * out of the RAID56 realm.
+		 * Thus we want the RAID56 bg to be marked RO to
+		 * prevent RMW from screwing up our cache.
+		 */
+		ro_set = false;
+	} else if (ret == -ETXTBSY) {
+		btrfs_warn(fs_info,
+	   "skipping scrub of block group %llu due to active swapfile",
+			   bg->start);
+		/* The caller will unfreeze the block group on -ETXTBSY. */
+		scrub_pause_off(fs_info);
+		return ret;
+	} else if (ret < 0) {
+		btrfs_warn(fs_info,
+			   "failed setting block group ro: %d", ret);
+		btrfs_unfreeze_block_group(bg);
+		scrub_pause_off(fs_info);
+		return ret;
+	}
+
+	/*
+	 * Now the target block group is marked RO, wait for nocow writes
+	 * to finish before dev-replace.
+	 * COW is fine, as COW never overwrites extents in the commit tree.
+	 */
+	if (sctx->is_dev_replace) {
+		btrfs_wait_nocow_writers(bg);
+		btrfs_wait_ordered_roots(fs_info, U64_MAX, bg->start,
+					 bg->length);
+	}
+	scrub_pause_off(fs_info);
+	*ro_set_ret = ro_set;
+	return 0;
+}
+
 static noinline_for_stack
 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			   struct btrfs_device *scrub_dev, u64 start, u64 end)
@@ -2353,7 +2488,6 @@  int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 	struct btrfs_root *root = fs_info->dev_root;
 	u64 chunk_offset;
 	int ret = 0;
-	int ro_set;
 	int slot;
 	struct extent_buffer *l;
 	struct btrfs_key key;
@@ -2374,6 +2508,7 @@  int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
 	while (1) {
+		bool ro_set = false;
 		u64 dev_extent_len;
 
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2455,127 +2590,20 @@  int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			goto skip;
 		}
 
-		if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
-			if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
-				btrfs_put_block_group(cache);
-				goto skip;
-			}
+		ret = prepare_scrub_block_group(sctx, cache, &ro_set);
+		if (ret == -ETXTBSY) {
+			ret = 0;
+			goto skip_unfreeze;
 		}
-
-		/*
-		 * Make sure that while we are scrubbing the corresponding block
-		 * group doesn't get its logical address and its device extents
-		 * reused for another block group, which can possibly be of a
-		 * different type and different profile. We do this to prevent
-		 * false error detections and crashes due to bogus attempts to
-		 * repair extents.
-		 */
-		spin_lock(&cache->lock);
-		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
-			spin_unlock(&cache->lock);
+		if (ret < 0) {
+			btrfs_put_block_group(cache);
+			break;
+		}
+		if (ret > 0) {
 			btrfs_put_block_group(cache);
 			goto skip;
 		}
-		btrfs_freeze_block_group(cache);
-		spin_unlock(&cache->lock);
 
-		/*
-		 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
-		 * to avoid deadlock caused by:
-		 * btrfs_inc_block_group_ro()
-		 * -> btrfs_wait_for_commit()
-		 * -> btrfs_commit_transaction()
-		 * -> btrfs_scrub_pause()
-		 */
-		scrub_pause_on(fs_info);
-
-		/*
-		 * Don't do chunk preallocation for scrub.
-		 *
-		 * This is especially important for SYSTEM bgs, or we can hit
-		 * -EFBIG from btrfs_finish_chunk_alloc() like:
-		 * 1. The only SYSTEM bg is marked RO.
-		 *    Since SYSTEM bg is small, that's pretty common.
-		 * 2. New SYSTEM bg will be allocated
-		 *    Due to regular version will allocate new chunk.
-		 * 3. New SYSTEM bg is empty and will get cleaned up
-		 *    Before cleanup really happens, it's marked RO again.
-		 * 4. Empty SYSTEM bg get scrubbed
-		 *    We go back to 2.
-		 *
-		 * This can easily boost the amount of SYSTEM chunks if cleaner
-		 * thread can't be triggered fast enough, and use up all space
-		 * of btrfs_super_block::sys_chunk_array
-		 *
-		 * While for dev replace, we need to try our best to mark block
-		 * group RO, to prevent race between:
-		 * - Write duplication
-		 *   Contains latest data
-		 * - Scrub copy
-		 *   Contains data from commit tree
-		 *
-		 * If target block group is not marked RO, nocow writes can
-		 * be overwritten by scrub copy, causing data corruption.
-		 * So for dev-replace, it's not allowed to continue if a block
-		 * group is not RO.
-		 */
-		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
-		if (!ret && sctx->is_dev_replace) {
-			ret = finish_extent_writes_for_zoned(root, cache);
-			if (ret) {
-				btrfs_dec_block_group_ro(cache);
-				scrub_pause_off(fs_info);
-				btrfs_put_block_group(cache);
-				break;
-			}
-		}
-
-		if (ret == 0) {
-			ro_set = 1;
-		} else if (ret == -ENOSPC && !sctx->is_dev_replace &&
-			   !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
-			/*
-			 * btrfs_inc_block_group_ro return -ENOSPC when it
-			 * failed in creating new chunk for metadata.
-			 * It is not a problem for scrub, because
-			 * metadata are always cowed, and our scrub paused
-			 * commit_transactions.
-			 *
-			 * For RAID56 chunks, we have to mark them read-only
-			 * for scrub, as later we would use our own cache
-			 * out of RAID56 realm.
-			 * Thus we want the RAID56 bg to be marked RO to
-			 * prevent RMW from screwing up out cache.
-			 */
-			ro_set = 0;
-		} else if (ret == -ETXTBSY) {
-			btrfs_warn(fs_info,
-		   "skipping scrub of block group %llu due to active swapfile",
-				   cache->start);
-			scrub_pause_off(fs_info);
-			ret = 0;
-			goto skip_unfreeze;
-		} else {
-			btrfs_warn(fs_info,
-				   "failed setting block group ro: %d", ret);
-			btrfs_unfreeze_block_group(cache);
-			btrfs_put_block_group(cache);
-			scrub_pause_off(fs_info);
-			break;
-		}
-
-		/*
-		 * Now the target block is marked RO, wait for nocow writes to
-		 * finish before dev-replace.
-		 * COW is fine, as COW never overwrites extents in commit tree.
-		 */
-		if (sctx->is_dev_replace) {
-			btrfs_wait_nocow_writers(cache);
-			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
-					cache->length);
-		}
-
-		scrub_pause_off(fs_info);
 		down_write(&dev_replace->rwsem);
 		dev_replace->cursor_right = found_key.offset + dev_extent_len;
 		dev_replace->cursor_left = found_key.offset;