diff mbox series

[v3,2/5] btrfs: split RAID stripes on deletion

Message ID 20240701-b4-rst-updates-v3-2-e0437e1e04a6@kernel.org (mailing list archive)
State New, archived
Headers show
Series btrfs: rst: updates for RAID stripe tree | expand

Commit Message

Johannes Thumshirn July 1, 2024, 10:25 a.m. UTC
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>

The current RAID stripe code assumes, that we will always remove a
whole stripe entry.

But if we're only removing a part of a RAID stripe we're hitting the
ASSERT()ion checking for this condition.

Instead of assuming the complete deletion of a RAID stripe, split the
stripe if we need to.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/ctree.c            |   1 +
 fs/btrfs/raid-stripe-tree.c | 100 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 77 insertions(+), 24 deletions(-)

Comments

Josef Bacik July 1, 2024, 2:07 p.m. UTC | #1
On Mon, Jul 01, 2024 at 12:25:16PM +0200, Johannes Thumshirn wrote:
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> 
> The current RAID stripe code assumes, that we will always remove a
> whole stripe entry.
> 
> But if we're only removing a part of a RAID stripe we're hitting the
> ASSERT()ion checking for this condition.
> 
> Instead of assuming the complete deletion of a RAID stripe, split the
> stripe if we need to.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  fs/btrfs/ctree.c            |   1 +
>  fs/btrfs/raid-stripe-tree.c | 100 +++++++++++++++++++++++++++++++++-----------
>  2 files changed, 77 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index e33f9f5a228d..16f9cf6360a4 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -3863,6 +3863,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
>  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
>  
>  	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
> +	       key.type != BTRFS_RAID_STRIPE_KEY &&
>  	       key.type != BTRFS_EXTENT_CSUM_KEY);
>  
>  	if (btrfs_leaf_free_space(leaf) >= ins_len)
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index 3020820dd6e2..64e36b46cbab 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -33,42 +33,94 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
>  	if (!path)
>  		return -ENOMEM;
>  
> -	while (1) {
> -		key.objectid = start;
> -		key.type = BTRFS_RAID_STRIPE_KEY;
> -		key.offset = length;
> +again:
> +	key.objectid = start;
> +	key.type = BTRFS_RAID_STRIPE_KEY;
> +	key.offset = length;
>  
> -		ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
> -		if (ret < 0)
> -			break;
> -		if (ret > 0) {
> -			ret = 0;
> -			if (path->slots[0] == 0)
> -				break;
> -			path->slots[0]--;
> -		}
> +	ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
> +	if (ret < 0)
> +		goto out;
> +	if (ret > 0) {
> +		ret = 0;
> +		if (path->slots[0] == 0)
> +			goto out;
> +		path->slots[0]--;
> +	}
> +
> +	leaf = path->nodes[0];
> +	slot = path->slots[0];
> +	btrfs_item_key_to_cpu(leaf, &key, slot);
> +	found_start = key.objectid;
> +	found_end = found_start + key.offset;
> +
> +	/* That stripe ends before we start, we're done. */
> +	if (found_end <= start)
> +		goto out;
> +
> +	trace_btrfs_raid_extent_delete(fs_info, start, end,
> +				       found_start, found_end);
> +
> +	if (found_start < start) {
> +		u64 diff = start - found_start;
> +		struct btrfs_key new_key;
> +		int num_stripes;
> +		struct btrfs_stripe_extent *stripe_extent;
> +
> +		new_key.objectid = start;
> +		new_key.type = BTRFS_RAID_STRIPE_KEY;
> +		new_key.offset = length - diff;
> +
> +		ret = btrfs_duplicate_item(trans, stripe_root, path,
> +					   &new_key);
> +		if (ret)
> +			goto out;
>  
>  		leaf = path->nodes[0];
>  		slot = path->slots[0];
> -		btrfs_item_key_to_cpu(leaf, &key, slot);
> -		found_start = key.objectid;
> -		found_end = found_start + key.offset;
>  
> -		/* That stripe ends before we start, we're done. */
> -		if (found_end <= start)
> -			break;
> +		num_stripes =
> +			btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
> +		stripe_extent =
> +			btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
> +
> +		for (int i = 0; i < num_stripes; i++) {
> +			struct btrfs_raid_stride *raid_stride =
> +				&stripe_extent->strides[i];
> +			u64 physical =
> +				btrfs_raid_stride_physical(leaf, raid_stride);
> +
> +			btrfs_set_raid_stride_physical(leaf, raid_stride,
> +							     physical + diff);
> +		}
> +
> +		btrfs_mark_buffer_dirty(trans, leaf);
> +		btrfs_release_path(path);
> +		goto again;
> +	}
> +
> +	if (found_end > end) {
> +		u64 diff = found_end - end;
> +		struct btrfs_key new_key;
>  
> -		trace_btrfs_raid_extent_delete(fs_info, start, end,
> -					       found_start, found_end);
> +		new_key.objectid = found_start;
> +		new_key.type = BTRFS_RAID_STRIPE_KEY;
> +		new_key.offset = length - diff;
>  
> -		ASSERT(found_start >= start && found_end <= end);
> -		ret = btrfs_del_item(trans, stripe_root, path);
> +		ret = btrfs_duplicate_item(trans, stripe_root, path,
> +					   &new_key);

This seems incorrect to me.  If we have [0, 1MiB) and we're deleting [0,512KiB)
then the tree at this point is

[0, BTRFS_RAID_STRIPE_KEY, 512KiB]
[0, BTRFS_RAID_STRIPE_KEY, 1MiB]

which is valid as far as key ordering goes, but is a violation of the raid
stripe tree design correct?  And then you do goto again, and then you'll delete

[0, BTRFS_RAID_STRIPE_KEY, 512KiB]

but leave the old one in place, correct?  Thanks,

Josef
Johannes Thumshirn July 3, 2024, 3:47 p.m. UTC | #2
On 01.07.24 16:07, Josef Bacik wrote:
> On Mon, Jul 01, 2024 at 12:25:16PM +0200, Johannes Thumshirn wrote:
>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>
>> The current RAID stripe code assumes, that we will always remove a
>> whole stripe entry.
>>
>> But if we're only removing a part of a RAID stripe we're hitting the
>> ASSERT()ion checking for this condition.
>>
>> Instead of assuming the complete deletion of a RAID stripe, split the
>> stripe if we need to.
>>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>> ---
>>   fs/btrfs/ctree.c            |   1 +
>>   fs/btrfs/raid-stripe-tree.c | 100 +++++++++++++++++++++++++++++++++-----------
>>   2 files changed, 77 insertions(+), 24 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
>> index e33f9f5a228d..16f9cf6360a4 100644
>> --- a/fs/btrfs/ctree.c
>> +++ b/fs/btrfs/ctree.c
>> @@ -3863,6 +3863,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
>>   	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
>>   
>>   	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
>> +	       key.type != BTRFS_RAID_STRIPE_KEY &&
>>   	       key.type != BTRFS_EXTENT_CSUM_KEY);
>>   
>>   	if (btrfs_leaf_free_space(leaf) >= ins_len)
>> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
>> index 3020820dd6e2..64e36b46cbab 100644
>> --- a/fs/btrfs/raid-stripe-tree.c
>> +++ b/fs/btrfs/raid-stripe-tree.c
>> @@ -33,42 +33,94 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
>>   	if (!path)
>>   		return -ENOMEM;
>>   
>> -	while (1) {
>> -		key.objectid = start;
>> -		key.type = BTRFS_RAID_STRIPE_KEY;
>> -		key.offset = length;
>> +again:
>> +	key.objectid = start;
>> +	key.type = BTRFS_RAID_STRIPE_KEY;
>> +	key.offset = length;
>>   
>> -		ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
>> -		if (ret < 0)
>> -			break;
>> -		if (ret > 0) {
>> -			ret = 0;
>> -			if (path->slots[0] == 0)
>> -				break;
>> -			path->slots[0]--;
>> -		}
>> +	ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
>> +	if (ret < 0)
>> +		goto out;
>> +	if (ret > 0) {
>> +		ret = 0;
>> +		if (path->slots[0] == 0)
>> +			goto out;
>> +		path->slots[0]--;
>> +	}
>> +
>> +	leaf = path->nodes[0];
>> +	slot = path->slots[0];
>> +	btrfs_item_key_to_cpu(leaf, &key, slot);
>> +	found_start = key.objectid;
>> +	found_end = found_start + key.offset;
>> +
>> +	/* That stripe ends before we start, we're done. */
>> +	if (found_end <= start)
>> +		goto out;
>> +
>> +	trace_btrfs_raid_extent_delete(fs_info, start, end,
>> +				       found_start, found_end);
>> +
>> +	if (found_start < start) {
>> +		u64 diff = start - found_start;
>> +		struct btrfs_key new_key;
>> +		int num_stripes;
>> +		struct btrfs_stripe_extent *stripe_extent;
>> +
>> +		new_key.objectid = start;
>> +		new_key.type = BTRFS_RAID_STRIPE_KEY;
>> +		new_key.offset = length - diff;
>> +
>> +		ret = btrfs_duplicate_item(trans, stripe_root, path,
>> +					   &new_key);
>> +		if (ret)
>> +			goto out;
>>   
>>   		leaf = path->nodes[0];
>>   		slot = path->slots[0];
>> -		btrfs_item_key_to_cpu(leaf, &key, slot);
>> -		found_start = key.objectid;
>> -		found_end = found_start + key.offset;
>>   
>> -		/* That stripe ends before we start, we're done. */
>> -		if (found_end <= start)
>> -			break;
>> +		num_stripes =
>> +			btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
>> +		stripe_extent =
>> +			btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
>> +
>> +		for (int i = 0; i < num_stripes; i++) {
>> +			struct btrfs_raid_stride *raid_stride =
>> +				&stripe_extent->strides[i];
>> +			u64 physical =
>> +				btrfs_raid_stride_physical(leaf, raid_stride);
>> +
>> +			btrfs_set_raid_stride_physical(leaf, raid_stride,
>> +							     physical + diff);
>> +		}
>> +
>> +		btrfs_mark_buffer_dirty(trans, leaf);
>> +		btrfs_release_path(path);
>> +		goto again;
>> +	}
>> +
>> +	if (found_end > end) {
>> +		u64 diff = found_end - end;
>> +		struct btrfs_key new_key;
>>   
>> -		trace_btrfs_raid_extent_delete(fs_info, start, end,
>> -					       found_start, found_end);
>> +		new_key.objectid = found_start;
>> +		new_key.type = BTRFS_RAID_STRIPE_KEY;
>> +		new_key.offset = length - diff;
>>   
>> -		ASSERT(found_start >= start && found_end <= end);
>> -		ret = btrfs_del_item(trans, stripe_root, path);
>> +		ret = btrfs_duplicate_item(trans, stripe_root, path,
>> +					   &new_key);
> 
> This seems incorrect to me.  If we have [0, 1MiB) and we're deleting [0,512KiB)
> then the tree at this point is
> 
> [0, BTRFS_RAID_STRIPE_KEY, 512KiB]
> [0, BTRFS_RAID_STRIPE_KEY, 1MiB]
> 
> which is valid as far as key ordering goes, but is a violation of the raid
> stripe tree design correct?  And then you do goto again, and then you'll delete
> 
> [0, BTRFS_RAID_STRIPE_KEY, 512KiB]
> 
> but leave the old one in place, correct?  Thanks,

Oh right, jumping back to again removes the wrong one :/
diff mbox series

Patch

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e33f9f5a228d..16f9cf6360a4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3863,6 +3863,7 @@  static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 
 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+	       key.type != BTRFS_RAID_STRIPE_KEY &&
 	       key.type != BTRFS_EXTENT_CSUM_KEY);
 
 	if (btrfs_leaf_free_space(leaf) >= ins_len)
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 3020820dd6e2..64e36b46cbab 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -33,42 +33,94 @@  int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
 	if (!path)
 		return -ENOMEM;
 
-	while (1) {
-		key.objectid = start;
-		key.type = BTRFS_RAID_STRIPE_KEY;
-		key.offset = length;
+again:
+	key.objectid = start;
+	key.type = BTRFS_RAID_STRIPE_KEY;
+	key.offset = length;
 
-		ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
-		if (ret < 0)
-			break;
-		if (ret > 0) {
-			ret = 0;
-			if (path->slots[0] == 0)
-				break;
-			path->slots[0]--;
-		}
+	ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = 0;
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+	}
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	found_start = key.objectid;
+	found_end = found_start + key.offset;
+
+	/* That stripe ends before we start, we're done. */
+	if (found_end <= start)
+		goto out;
+
+	trace_btrfs_raid_extent_delete(fs_info, start, end,
+				       found_start, found_end);
+
+	if (found_start < start) {
+		u64 diff = start - found_start;
+		struct btrfs_key new_key;
+		int num_stripes;
+		struct btrfs_stripe_extent *stripe_extent;
+
+		new_key.objectid = start;
+		new_key.type = BTRFS_RAID_STRIPE_KEY;
+		new_key.offset = length - diff;
+
+		ret = btrfs_duplicate_item(trans, stripe_root, path,
+					   &new_key);
+		if (ret)
+			goto out;
 
 		leaf = path->nodes[0];
 		slot = path->slots[0];
-		btrfs_item_key_to_cpu(leaf, &key, slot);
-		found_start = key.objectid;
-		found_end = found_start + key.offset;
 
-		/* That stripe ends before we start, we're done. */
-		if (found_end <= start)
-			break;
+		num_stripes =
+			btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+		stripe_extent =
+			btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+		for (int i = 0; i < num_stripes; i++) {
+			struct btrfs_raid_stride *raid_stride =
+				&stripe_extent->strides[i];
+			u64 physical =
+				btrfs_raid_stride_physical(leaf, raid_stride);
+
+			btrfs_set_raid_stride_physical(leaf, raid_stride,
+							     physical + diff);
+		}
+
+		btrfs_mark_buffer_dirty(trans, leaf);
+		btrfs_release_path(path);
+		goto again;
+	}
+
+	if (found_end > end) {
+		u64 diff = found_end - end;
+		struct btrfs_key new_key;
 
-		trace_btrfs_raid_extent_delete(fs_info, start, end,
-					       found_start, found_end);
+		new_key.objectid = found_start;
+		new_key.type = BTRFS_RAID_STRIPE_KEY;
+		new_key.offset = length - diff;
 
-		ASSERT(found_start >= start && found_end <= end);
-		ret = btrfs_del_item(trans, stripe_root, path);
+		ret = btrfs_duplicate_item(trans, stripe_root, path,
+					   &new_key);
 		if (ret)
-			break;
+			goto out;
 
+		btrfs_mark_buffer_dirty(trans, leaf);
 		btrfs_release_path(path);
+		goto again;
+
 	}
 
+	ret = btrfs_del_item(trans, stripe_root, path);
+
+ out:
 	btrfs_free_path(path);
 	return ret;
 }