Message ID | 20240701-b4-rst-updates-v3-2-e0437e1e04a6@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | btrfs: rst: updates for RAID stripe tree | expand |
On Mon, Jul 01, 2024 at 12:25:16PM +0200, Johannes Thumshirn wrote: > From: Johannes Thumshirn <johannes.thumshirn@wdc.com> > > The current RAID stripe code assumes, that we will always remove a > whole stripe entry. > > But if we're only removing a part of a RAID stripe we're hitting the > ASSERT()ion checking for this condition. > > Instead of assuming the complete deletion of a RAID stripe, split the > stripe if we need to. > > Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> > --- > fs/btrfs/ctree.c | 1 + > fs/btrfs/raid-stripe-tree.c | 100 +++++++++++++++++++++++++++++++++----------- > 2 files changed, 77 insertions(+), 24 deletions(-) > > diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c > index e33f9f5a228d..16f9cf6360a4 100644 > --- a/fs/btrfs/ctree.c > +++ b/fs/btrfs/ctree.c > @@ -3863,6 +3863,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, > btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); > > BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && > + key.type != BTRFS_RAID_STRIPE_KEY && > key.type != BTRFS_EXTENT_CSUM_KEY); > > if (btrfs_leaf_free_space(leaf) >= ins_len) > diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c > index 3020820dd6e2..64e36b46cbab 100644 > --- a/fs/btrfs/raid-stripe-tree.c > +++ b/fs/btrfs/raid-stripe-tree.c > @@ -33,42 +33,94 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le > if (!path) > return -ENOMEM; > > - while (1) { > - key.objectid = start; > - key.type = BTRFS_RAID_STRIPE_KEY; > - key.offset = length; > +again: > + key.objectid = start; > + key.type = BTRFS_RAID_STRIPE_KEY; > + key.offset = length; > > - ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); > - if (ret < 0) > - break; > - if (ret > 0) { > - ret = 0; > - if (path->slots[0] == 0) > - break; > - path->slots[0]--; > - } > + ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); > + if (ret < 0) > + goto out; > + if (ret > 0) { > + ret = 0; > + if (path->slots[0] == 0) > + goto out; > + path->slots[0]--; > + } > + > + leaf = path->nodes[0]; > + slot = path->slots[0]; > + btrfs_item_key_to_cpu(leaf, &key, slot); > + found_start = key.objectid; > + found_end = found_start + key.offset; > + > + /* That stripe ends before we start, we're done. */ > + if (found_end <= start) > + goto out; > + > + trace_btrfs_raid_extent_delete(fs_info, start, end, > + found_start, found_end); > + > + if (found_start < start) { > + u64 diff = start - found_start; > + struct btrfs_key new_key; > + int num_stripes; > + struct btrfs_stripe_extent *stripe_extent; > + > + new_key.objectid = start; > + new_key.type = BTRFS_RAID_STRIPE_KEY; > + new_key.offset = length - diff; > + > + ret = btrfs_duplicate_item(trans, stripe_root, path, > + &new_key); > + if (ret) > + goto out; > > leaf = path->nodes[0]; > slot = path->slots[0]; > - btrfs_item_key_to_cpu(leaf, &key, slot); > - found_start = key.objectid; > - found_end = found_start + key.offset; > > - /* That stripe ends before we start, we're done. */ > - if (found_end <= start) > - break; > + num_stripes = > + btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); > + stripe_extent = > + btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); > + > + for (int i = 0; i < num_stripes; i++) { > + struct btrfs_raid_stride *raid_stride = > + &stripe_extent->strides[i]; > + u64 physical = > + btrfs_raid_stride_physical(leaf, raid_stride); > + > + btrfs_set_raid_stride_physical(leaf, raid_stride, > + physical + diff); > + } > + > + btrfs_mark_buffer_dirty(trans, leaf); > + btrfs_release_path(path); > + goto again; > + } > + > + if (found_end > end) { > + u64 diff = found_end - end; > + struct btrfs_key new_key; > > - trace_btrfs_raid_extent_delete(fs_info, start, end, > - found_start, found_end); > + new_key.objectid = found_start; > + new_key.type = BTRFS_RAID_STRIPE_KEY; > + new_key.offset = length - diff; > > - ASSERT(found_start >= start && found_end <= end); > - ret = btrfs_del_item(trans, stripe_root, path); > + ret = btrfs_duplicate_item(trans, stripe_root, path, > + &new_key); This seems incorrect to me. If we have [0, 1MiB) and we're deleting [0,512KiB) then the tree at this point is [0, BTRFS_RAID_STRIPE_KEY, 512KiB] [0, BTRFS_RAID_STRIPE_KEY, 1MiB] which is valid as far as key ordering goes, but is a violation of the raid stripe tree design correct? And then you do goto again, and then you'll delete [0, BTRFS_RAID_STRIPE_KEY, 512KiB] but leave the old one in place, correct? Thanks, Josef
On 01.07.24 16:07, Josef Bacik wrote: > On Mon, Jul 01, 2024 at 12:25:16PM +0200, Johannes Thumshirn wrote: >> From: Johannes Thumshirn <johannes.thumshirn@wdc.com> >> >> The current RAID stripe code assumes, that we will always remove a >> whole stripe entry. >> >> But if we're only removing a part of a RAID stripe we're hitting the >> ASSERT()ion checking for this condition. >> >> Instead of assuming the complete deletion of a RAID stripe, split the >> stripe if we need to. >> >> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> >> --- >> fs/btrfs/ctree.c | 1 + >> fs/btrfs/raid-stripe-tree.c | 100 +++++++++++++++++++++++++++++++++----------- >> 2 files changed, 77 insertions(+), 24 deletions(-) >> >> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c >> index e33f9f5a228d..16f9cf6360a4 100644 >> --- a/fs/btrfs/ctree.c >> +++ b/fs/btrfs/ctree.c >> @@ -3863,6 +3863,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, >> btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); >> >> BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && >> + key.type != BTRFS_RAID_STRIPE_KEY && >> key.type != BTRFS_EXTENT_CSUM_KEY); >> >> if (btrfs_leaf_free_space(leaf) >= ins_len) >> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c >> index 3020820dd6e2..64e36b46cbab 100644 >> --- a/fs/btrfs/raid-stripe-tree.c >> +++ b/fs/btrfs/raid-stripe-tree.c >> @@ -33,42 +33,94 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le >> if (!path) >> return -ENOMEM; >> >> - while (1) { >> - key.objectid = start; >> - key.type = BTRFS_RAID_STRIPE_KEY; >> - key.offset = length; >> +again: >> + key.objectid = start; >> + key.type = BTRFS_RAID_STRIPE_KEY; >> + key.offset = length; >> >> - ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); >> - if (ret < 0) >> - break; >> - if (ret > 0) { >> - ret = 0; >> - if (path->slots[0] == 0) >> - break; >> - path->slots[0]--; >> - } >> + ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); >> + if (ret < 0) >> + goto out; >> + if (ret > 0) { >> + ret = 0; >> + if (path->slots[0] == 0) >> + goto out; >> + path->slots[0]--; >> + } >> + >> + leaf = path->nodes[0]; >> + slot = path->slots[0]; >> + btrfs_item_key_to_cpu(leaf, &key, slot); >> + found_start = key.objectid; >> + found_end = found_start + key.offset; >> + >> + /* That stripe ends before we start, we're done. */ >> + if (found_end <= start) >> + goto out; >> + >> + trace_btrfs_raid_extent_delete(fs_info, start, end, >> + found_start, found_end); >> + >> + if (found_start < start) { >> + u64 diff = start - found_start; >> + struct btrfs_key new_key; >> + int num_stripes; >> + struct btrfs_stripe_extent *stripe_extent; >> + >> + new_key.objectid = start; >> + new_key.type = BTRFS_RAID_STRIPE_KEY; >> + new_key.offset = length - diff; >> + >> + ret = btrfs_duplicate_item(trans, stripe_root, path, >> + &new_key); >> + if (ret) >> + goto out; >> >> leaf = path->nodes[0]; >> slot = path->slots[0]; >> - btrfs_item_key_to_cpu(leaf, &key, slot); >> - found_start = key.objectid; >> - found_end = found_start + key.offset; >> >> - /* That stripe ends before we start, we're done. */ >> - if (found_end <= start) >> - break; >> + num_stripes = >> + btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); >> + stripe_extent = >> + btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); >> + >> + for (int i = 0; i < num_stripes; i++) { >> + struct btrfs_raid_stride *raid_stride = >> + &stripe_extent->strides[i]; >> + u64 physical = >> + btrfs_raid_stride_physical(leaf, raid_stride); >> + >> + btrfs_set_raid_stride_physical(leaf, raid_stride, >> + physical + diff); >> + } >> + >> + btrfs_mark_buffer_dirty(trans, leaf); >> + btrfs_release_path(path); >> + goto again; >> + } >> + >> + if (found_end > end) { >> + u64 diff = found_end - end; >> + struct btrfs_key new_key; >> >> - trace_btrfs_raid_extent_delete(fs_info, start, end, >> - found_start, found_end); >> + new_key.objectid = found_start; >> + new_key.type = BTRFS_RAID_STRIPE_KEY; >> + new_key.offset = length - diff; >> >> - ASSERT(found_start >= start && found_end <= end); >> - ret = btrfs_del_item(trans, stripe_root, path); >> + ret = btrfs_duplicate_item(trans, stripe_root, path, >> + &new_key); > > This seems incorrect to me. If we have [0, 1MiB) and we're deleting [0,512KiB) > then the tree at this point is > > [0, BTRFS_RAID_STRIPE_KEY, 512KiB] > [0, BTRFS_RAID_STRIPE_KEY, 1MiB] > > which is valid as far as key ordering goes, but is a violation of the raid > stripe tree design correct? And then you do goto again, and then you'll delete > > [0, BTRFS_RAID_STRIPE_KEY, 512KiB] > > but leave the old one in place, correct? Thanks, Oh right, jumping back to again removes the wrong one :/
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e33f9f5a228d..16f9cf6360a4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3863,6 +3863,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); if (btrfs_leaf_free_space(leaf) >= ins_len) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 3020820dd6e2..64e36b46cbab 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -33,42 +33,94 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le if (!path) return -ENOMEM; - while (1) { - key.objectid = start; - key.type = BTRFS_RAID_STRIPE_KEY; - key.offset = length; +again: + key.objectid = start; + key.type = BTRFS_RAID_STRIPE_KEY; + key.offset = length; - ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); - if (ret < 0) - break; - if (ret > 0) { - ret = 0; - if (path->slots[0] == 0) - break; - path->slots[0]--; - } + ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + + /* That stripe ends before we start, we're done. */ + if (found_end <= start) + goto out; + + trace_btrfs_raid_extent_delete(fs_info, start, end, + found_start, found_end); + + if (found_start < start) { + u64 diff = start - found_start; + struct btrfs_key new_key; + int num_stripes; + struct btrfs_stripe_extent *stripe_extent; + + new_key.objectid = start; + new_key.type = BTRFS_RAID_STRIPE_KEY; + new_key.offset = length - diff; + + ret = btrfs_duplicate_item(trans, stripe_root, path, + &new_key); + if (ret) + goto out; leaf = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); - found_start = key.objectid; - found_end = found_start + key.offset; - /* That stripe ends before we start, we're done. */ - if (found_end <= start) - break; + num_stripes = + btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); + stripe_extent = + btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + for (int i = 0; i < num_stripes; i++) { + struct btrfs_raid_stride *raid_stride = + &stripe_extent->strides[i]; + u64 physical = + btrfs_raid_stride_physical(leaf, raid_stride); + + btrfs_set_raid_stride_physical(leaf, raid_stride, + physical + diff); + } + + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + goto again; + } + + if (found_end > end) { + u64 diff = found_end - end; + struct btrfs_key new_key; - trace_btrfs_raid_extent_delete(fs_info, start, end, - found_start, found_end); + new_key.objectid = found_start; + new_key.type = BTRFS_RAID_STRIPE_KEY; + new_key.offset = length - diff; - ASSERT(found_start >= start && found_end <= end); - ret = btrfs_del_item(trans, stripe_root, path); + ret = btrfs_duplicate_item(trans, stripe_root, path, + &new_key); if (ret) - break; + goto out; + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); + goto again; + } + ret = btrfs_del_item(trans, stripe_root, path); + + out: btrfs_free_path(path); return ret; }