Message ID | 2454cd4eb1694d37056e492af32b23743c63202b.1714663442.git.jth@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [RFC] btrfs: don't hold dev_replace rwsem over whole of btrfs_map_block | expand |
On Fri, May 3, 2024 at 6:35 AM Johannes Thumshirn <jth@kernel.org> wrote: > > From: Johannes Thumshirn <johannes.thumshirn@wdc.com> > > Don't hold the dev_replace rwsem for the entirety of btrfs_map_block(). > > It is only needed to protect > a) calls to find_live_mirror() and > b) calling into handle_ops_on_dev_replace(). > > But there is no need to hold the rwsem for any kind of set_io_stripe() > calls. > > So relax taking the dev_replace rwsem to only protect both cases and check > if the device replace status has changed in the meantime, for which we have > to re-do the find_live_mirror() calls. > > This fixes a deadlock on raid-stripe-tree where device replace performs a > scrub operation, which in turn calls into btrfs_map_block() to find the > physical location of the block. Do you have a stack trace you can paste into the changelog? That helps make the deadlock clearer and the commit greppable. > > Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> > --- > fs/btrfs/volumes.c | 30 +++++++++++++++++++----------- > 1 file changed, 19 insertions(+), 11 deletions(-) > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index a3dc88e420d1..3a842b9960b2 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -6649,14 +6649,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, > max_len = btrfs_max_io_len(map, map_offset, &io_geom); > *length = min_t(u64, map->chunk_len - map_offset, max_len); > > +again: > down_read(&dev_replace->rwsem); > dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); > - /* > - * Hold the semaphore for read during the whole operation, write is > - * requested at commit time but must wait. 
> - */ > - if (!dev_replace_is_ongoing) > - up_read(&dev_replace->rwsem); > > switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { > case BTRFS_BLOCK_GROUP_RAID0: > @@ -6689,6 +6684,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, > map_blocks_single(map, &io_geom); > break; > } > + > + up_read(&dev_replace->rwsem); > + > if (io_geom.stripe_index >= map->num_stripes) { > btrfs_crit(fs_info, > "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", > @@ -6784,10 +6782,25 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, > if (op != BTRFS_MAP_READ) > io_geom.max_errors = btrfs_chunk_max_errors(map); > > + /* > + * Check if something changed the dev_replace state since > + * we've checked it for the last time and if redo the whole > + * mapping operation. > + */ > + down_read(&dev_replace->rwsem); > + if (!dev_replace_is_ongoing && > + btrfs_dev_replace_is_ongoing(dev_replace)) { > + up_read(&dev_replace->rwsem); > + goto again; What about the case where we found device replace was running but it's not running anymore? I would change the if condition to: if (dev_replace_is_ongoing != btrfs_dev_replace_is_ongoing(dev_replace)) > + } > + up_read(&dev_replace->rwsem); > + > if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && So here we also need to be under the protection of the rwsem before checking ->tgtdev. The device replace might finish just after the check, and then we call handle_ops_on_dev_replace() and use a NULL tgtdev, which is not expected. Why not remove the up_read() right above, and... > op != BTRFS_MAP_READ) { > + down_read(&dev_replace->rwsem); > handle_ops_on_dev_replace(op, bioc, dev_replace, logical, > &io_geom.num_stripes, &io_geom.max_errors); > + up_read(&dev_replace->rwsem); Stop doing the down_read() + up_read() here. > } And then do the up_read() here. Thanks. 
> > *bioc_ret = bioc; > @@ -6796,11 +6809,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, > bioc->mirror_num = io_geom.mirror_num; > > out: > - if (dev_replace_is_ongoing) { > - lockdep_assert_held(&dev_replace->rwsem); > - /* Unlock and let waiting writers proceed */ > - up_read(&dev_replace->rwsem); > - } > btrfs_free_chunk_map(map); > return ret; > } > -- > 2.35.3 > >
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a3dc88e420d1..3a842b9960b2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6649,14 +6649,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); +again: down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); - /* - * Hold the semaphore for read during the whole operation, write is - * requested at commit time but must wait. - */ - if (!dev_replace_is_ongoing) - up_read(&dev_replace->rwsem); switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case BTRFS_BLOCK_GROUP_RAID0: @@ -6689,6 +6684,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, map_blocks_single(map, &io_geom); break; } + + up_read(&dev_replace->rwsem); + if (io_geom.stripe_index >= map->num_stripes) { btrfs_crit(fs_info, "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", @@ -6784,10 +6782,25 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (op != BTRFS_MAP_READ) io_geom.max_errors = btrfs_chunk_max_errors(map); + /* + * Check if something changed the dev_replace state since + * we've checked it for the last time and if redo the whole + * mapping operation. 
+ */ + down_read(&dev_replace->rwsem); + if (!dev_replace_is_ongoing && + btrfs_dev_replace_is_ongoing(dev_replace)) { + up_read(&dev_replace->rwsem); + goto again; + } + up_read(&dev_replace->rwsem); + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) { + down_read(&dev_replace->rwsem); handle_ops_on_dev_replace(op, bioc, dev_replace, logical, &io_geom.num_stripes, &io_geom.max_errors); + up_read(&dev_replace->rwsem); } *bioc_ret = bioc; @@ -6796,11 +6809,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, bioc->mirror_num = io_geom.mirror_num; out: - if (dev_replace_is_ongoing) { - lockdep_assert_held(&dev_replace->rwsem); - /* Unlock and let waiting writers proceed */ - up_read(&dev_replace->rwsem); - } btrfs_free_chunk_map(map); return ret; }