diff mbox series

[RFC,ONLY,3/8] btrfs: read raid-stripe-tree from disk

Message ID 2ccf8b77759a80a09d083446d5adb3d03947394b.1652711187.git.johannes.thumshirn@wdc.com (mailing list archive)
State New, archived
Headers show
Series btrfs: introduce raid-stripe-tree | expand

Commit Message

Johannes Thumshirn May 16, 2022, 2:31 p.m. UTC
If we're discovering a raid-stripe-tree on mount, read it from disk.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/ctree.h           |  1 +
 fs/btrfs/disk-io.c         | 12 ++++++++++++
 include/uapi/linux/btrfs.h |  1 +
 3 files changed, 14 insertions(+)

Comments

Qu Wenruo May 17, 2022, 8:09 a.m. UTC | #1
On 2022/5/16 22:31, Johannes Thumshirn wrote:
> If we're discovering a raid-stripe-tree on mount, read it from disk.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/ctree.h           |  1 +
>   fs/btrfs/disk-io.c         | 12 ++++++++++++
>   include/uapi/linux/btrfs.h |  1 +
>   3 files changed, 14 insertions(+)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 20aa2ebac7cd..1db669662f61 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -667,6 +667,7 @@ struct btrfs_fs_info {
>   	struct btrfs_root *uuid_root;
>   	struct btrfs_root *data_reloc_root;
>   	struct btrfs_root *block_group_root;
> +	struct btrfs_root *stripe_root;
>
>   	/* the log root tree is a directory of all the other log roots */
>   	struct btrfs_root *log_root_tree;
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index d456f426924c..c0f08917465a 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1706,6 +1706,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
>
>   		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
>   	}
> +	if (objectid == BTRFS_RAID_STRIPE_TREE_OBJECTID)
> +		return btrfs_grab_root(fs_info->stripe_root) ?
> +			fs_info->stripe_root : ERR_PTR(-ENOENT);
>   	return NULL;
>   }
>
> @@ -1784,6 +1787,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
>   	btrfs_put_root(fs_info->fs_root);
>   	btrfs_put_root(fs_info->data_reloc_root);
>   	btrfs_put_root(fs_info->block_group_root);
> +	btrfs_put_root(fs_info->stripe_root);
>   	btrfs_check_leaked_roots(fs_info);
>   	btrfs_extent_buffer_leak_debug_check(fs_info);
>   	kfree(fs_info->super_copy);
> @@ -2337,6 +2341,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
>   	free_root_extent_buffers(info->fs_root);
>   	free_root_extent_buffers(info->data_reloc_root);
>   	free_root_extent_buffers(info->block_group_root);
> +	free_root_extent_buffers(info->stripe_root);
>   	if (free_chunk_root)
>   		free_root_extent_buffers(info->chunk_root);
>   }
> @@ -2773,6 +2778,13 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
>   		fs_info->uuid_root = root;
>   	}
>

I guess in the real patch, we need to check the incompatible feature first.

Another problem is, how do we do bootstrap?

If our metadata (especially chunk tree) is also in some chunks which is
stripe-tree mapped, without stripe tree we're even unable to read the
chunk tree.

Or do you plan to not support metadata on stripe-tree mapped chunks?

Thanks,
Qu
> +	location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
> +	root = btrfs_read_tree_root(tree_root, &location);
> +	if (!IS_ERR(root)) {
> +		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
> +		fs_info->stripe_root = root;
> +	}
> +
>   	return 0;
>   out:
>   	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> index d956b2993970..4e0429fc4e87 100644
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -310,6 +310,7 @@ struct btrfs_ioctl_fs_info_args {
>   #define BTRFS_FEATURE_INCOMPAT_RAID1C34		(1ULL << 11)
>   #define BTRFS_FEATURE_INCOMPAT_ZONED		(1ULL << 12)
>   #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2	(1ULL << 13)
> +#define BTRFS_FEATURE_INCOMPAT_STRIPE_TREE	(1ULL << 14)
>
>   struct btrfs_ioctl_feature_flags {
>   	__u64 compat_flags;
Johannes Thumshirn May 17, 2022, 8:13 a.m. UTC | #2
On 17/05/2022 10:10, Qu Wenruo wrote:
> 
> 
> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>> If we're discovering a raid-stripe-tree on mount, read it from disk.
>>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>> ---
>>   fs/btrfs/ctree.h           |  1 +
>>   fs/btrfs/disk-io.c         | 12 ++++++++++++
>>   include/uapi/linux/btrfs.h |  1 +
>>   3 files changed, 14 insertions(+)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 20aa2ebac7cd..1db669662f61 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -667,6 +667,7 @@ struct btrfs_fs_info {
>>   	struct btrfs_root *uuid_root;
>>   	struct btrfs_root *data_reloc_root;
>>   	struct btrfs_root *block_group_root;
>> +	struct btrfs_root *stripe_root;
>>
>>   	/* the log root tree is a directory of all the other log roots */
>>   	struct btrfs_root *log_root_tree;
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index d456f426924c..c0f08917465a 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -1706,6 +1706,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
>>
>>   		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
>>   	}
>> +	if (objectid == BTRFS_RAID_STRIPE_TREE_OBJECTID)
>> +		return btrfs_grab_root(fs_info->stripe_root) ?
>> +			fs_info->stripe_root : ERR_PTR(-ENOENT);
>>   	return NULL;
>>   }
>>
>> @@ -1784,6 +1787,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
>>   	btrfs_put_root(fs_info->fs_root);
>>   	btrfs_put_root(fs_info->data_reloc_root);
>>   	btrfs_put_root(fs_info->block_group_root);
>> +	btrfs_put_root(fs_info->stripe_root);
>>   	btrfs_check_leaked_roots(fs_info);
>>   	btrfs_extent_buffer_leak_debug_check(fs_info);
>>   	kfree(fs_info->super_copy);
>> @@ -2337,6 +2341,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
>>   	free_root_extent_buffers(info->fs_root);
>>   	free_root_extent_buffers(info->data_reloc_root);
>>   	free_root_extent_buffers(info->block_group_root);
>> +	free_root_extent_buffers(info->stripe_root);
>>   	if (free_chunk_root)
>>   		free_root_extent_buffers(info->chunk_root);
>>   }
>> @@ -2773,6 +2778,13 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
>>   		fs_info->uuid_root = root;
>>   	}
>>
> 
> I guess in the real patch, we need to check the incompatible feature first.

Or at least a compatible_ro. For regular drives it should be sufficient, for
zoned drives mounting with raid without a stripe tree will fail.

> 
> Another problem is, how do we do bootstrap?
> 
> If our metadata (especially chunk tree) is also in some chunks which is
> stripe-tree mapped, without stripe tree we're even unable to read the
> chunk tree.
> 
> Or do you plan to not support metadata on stripe-tree mapped chunks?

I do, but I have no clue yet how to attack this problem. I was hoping to get some
insights from Josef's extent-tree v2 series.

Metadata on the stripe tree really is the main blocker right now.
Qu Wenruo May 17, 2022, 8:28 a.m. UTC | #3
On 2022/5/17 16:13, Johannes Thumshirn wrote:
> On 17/05/2022 10:10, Qu Wenruo wrote:
>>
>>
>> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>>> If we're discovering a raid-stripe-tree on mount, read it from disk.
>>>
>>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>> ---
>>>    fs/btrfs/ctree.h           |  1 +
>>>    fs/btrfs/disk-io.c         | 12 ++++++++++++
>>>    include/uapi/linux/btrfs.h |  1 +
>>>    3 files changed, 14 insertions(+)
>>>
>>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>>> index 20aa2ebac7cd..1db669662f61 100644
>>> --- a/fs/btrfs/ctree.h
>>> +++ b/fs/btrfs/ctree.h
>>> @@ -667,6 +667,7 @@ struct btrfs_fs_info {
>>>    	struct btrfs_root *uuid_root;
>>>    	struct btrfs_root *data_reloc_root;
>>>    	struct btrfs_root *block_group_root;
>>> +	struct btrfs_root *stripe_root;
>>>
>>>    	/* the log root tree is a directory of all the other log roots */
>>>    	struct btrfs_root *log_root_tree;
>>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>>> index d456f426924c..c0f08917465a 100644
>>> --- a/fs/btrfs/disk-io.c
>>> +++ b/fs/btrfs/disk-io.c
>>> @@ -1706,6 +1706,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
>>>
>>>    		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
>>>    	}
>>> +	if (objectid == BTRFS_RAID_STRIPE_TREE_OBJECTID)
>>> +		return btrfs_grab_root(fs_info->stripe_root) ?
>>> +			fs_info->stripe_root : ERR_PTR(-ENOENT);
>>>    	return NULL;
>>>    }
>>>
>>> @@ -1784,6 +1787,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
>>>    	btrfs_put_root(fs_info->fs_root);
>>>    	btrfs_put_root(fs_info->data_reloc_root);
>>>    	btrfs_put_root(fs_info->block_group_root);
>>> +	btrfs_put_root(fs_info->stripe_root);
>>>    	btrfs_check_leaked_roots(fs_info);
>>>    	btrfs_extent_buffer_leak_debug_check(fs_info);
>>>    	kfree(fs_info->super_copy);
>>> @@ -2337,6 +2341,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
>>>    	free_root_extent_buffers(info->fs_root);
>>>    	free_root_extent_buffers(info->data_reloc_root);
>>>    	free_root_extent_buffers(info->block_group_root);
>>> +	free_root_extent_buffers(info->stripe_root);
>>>    	if (free_chunk_root)
>>>    		free_root_extent_buffers(info->chunk_root);
>>>    }
>>> @@ -2773,6 +2778,13 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
>>>    		fs_info->uuid_root = root;
>>>    	}
>>>
>>
>> I guess in the real patch, we need to check the incompatible feature first.
>
> Or at least a compatible_ro. For regular drives it should be sufficient, for
> zoned drives mounting with raid without a stripe tree will fail.
>
>>
>> Another problem is, how do we do bootstrap?
>>
>> If our metadata (especially chunk tree) is also in some chunks which is
>> stripe-tree mapped, without stripe tree we're even unable to read the
>> chunk tree.
>>
>> Or do you plan to not support metadata on stripe-tree mapped chunks?
>
> I do, but I have no clue yet how to attack this problem. I was hoping to get some
> insights from Josef's extent-tree v2 series.

Personally speaking, a per-chunk flag/type allowing us to know if a
chunk has stripe mapped is much better for testing, and can bring you
much needed time for further improvement.

>
> Metadata on the stripe tree really is the main blocker right now.

That's no doubt.

Thanks,
Qu
Johannes Thumshirn May 18, 2022, 11:29 a.m. UTC | #4
On 17/05/2022 10:28, Qu Wenruo wrote:
>>
>> Metadata on the stripe tree really is the main blocker right now.
> 
> That's no doubt.

What could be done and I think this is the only way forward, is to have
the stripe tree in the system block group and force system to be RAID1
on a fs with stripe tree.

Thoughts?
Qu Wenruo May 19, 2022, 8:36 a.m. UTC | #5
On 2022/5/18 19:29, Johannes Thumshirn wrote:
> On 17/05/2022 10:28, Qu Wenruo wrote:
>>>
>>> Metadata on the stripe tree really is the main blocker right now.
>>
>> That's no doubt.
>
> What could be done and I think this is the only way forward, is to have
> the stripe tree in the system block group

This behavior itself has its problems, unfortunately.

Currently the system chunks are pretty small, in fact system chunks have
the minimal stripe size for the current code base.
(Data: 1G, Meta: 1G/256M, sys: 32M)

This means, if we put stripe tree (which can be as large as extent tree
afaik), we need way much larger system chunks.

And this can further increase the possibility on ENOSPC due to
unbalanced data/metadata/sys usage.

Although this is really the last problem we need to bother.

> and force system to be RAID1 on a fs with stripe tree.

Then the system RAID1 chunks also need stripe tree for zoned devices.

This means we're unable to bootstrap at all.

Or did you mean, make system chunks RAID1 but not using stripe tree?
That can solve the boot strap problem, but it doesn't really look
elegant to me...

Thanks,
Qu

>
> Thoughts?
Johannes Thumshirn May 19, 2022, 8:39 a.m. UTC | #6
On 19/05/2022 10:36, Qu Wenruo wrote:
> 
> 
> On 2022/5/18 19:29, Johannes Thumshirn wrote:
>> On 17/05/2022 10:28, Qu Wenruo wrote:
>>>>
>>>> Metadata on the stripe tree really is the main blocker right now.
>>>
>>> That's no doubt.
>>
>> What could be done and I think this is the only way forward, is to have
>> the stripe tree in the system block group
> 
> This behavior itself has its problems, unfortunately.
> 
> Currently the system chunks are pretty small, in fact system chunks has
> the minimal stripe size for current code base.
> (Data: 1G, Meta: 1G/256M, sys: 32M)
> 
> This means, if we put stripe tree (which can be as large as extent tree
> afaik), we need way much larger system chunks.

I know, but IIRC (need to look this up again) Josef increased the max size
of sys chunks to 2G

> 
> And this can further increase the possibility on ENOSPC due to
> unbalanced data/metadata/sys usage.
> 
> Although this is really the last problem we need to bother.
> 
>> and force system to be RAID1 on a fs with stripe tree.
> 
> Then the system RAID1 chunks also need stripe tree for zoned devices.
> 
> This means we're unable to bootstrap at all.
> 
> Or did you mean, make system chunks RAID1 but not using stripe tree?
> That can solve the boot strap problem, but it doesn't really look
> elegant to me...

RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
so it will work and we can bootstrap from it.
Qu Wenruo May 19, 2022, 10:37 a.m. UTC | #7
On 2022/5/19 16:39, Johannes Thumshirn wrote:
> On 19/05/2022 10:36, Qu Wenruo wrote:
>>
>>
>> On 2022/5/18 19:29, Johannes Thumshirn wrote:
>>> On 17/05/2022 10:28, Qu Wenruo wrote:
>>>>>
>>>>> Metadata on the stripe tree really is the main blocker right now.
>>>>
>>>> That's no doubt.
>>>
>>> What could be done and I think this is the only way forward, is to have
>>> the stripe tree in the system block group
>>
>> This behavior itself has its problems, unfortunately.
>>
>> Currently the system chunks are pretty small, in fact system chunks has
>> the minimal stripe size for current code base.
>> (Data: 1G, Meta: 1G/256M, sys: 32M)
>>
>> This means, if we put stripe tree (which can be as large as extent tree
>> afaik), we need way much larger system chunks.
> 
> I know, but IIRC (need to look this up again) Josef increased the max size
> of sys chunks to 2G
> 
>>
>> And this can further increase the possibility on ENOSPC due to
>> unbalanced data/metadata/sys usage.
>>
>> Although this is really the last problem we need to bother.
>>
>>> and force system to be RAID1 on a fs with stripe tree.
>>
>> Then the system RAID1 chunks also need stripe tree for zoned devices.
>>
>> This means we're unable to bootstrap at all.
>>
>> Or did you mean, make system chunks RAID1 but not using stripe tree?
>> That can solve the boot strap problem, but it doesn't really look
>> elegant to me...
> 
> RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
> so it will work and we can bootstrap from it.
> 
That sounds good.

And in that case, we don't need to put stripe tree into system chunks at 
all.

So this method means, stripe tree is only useful for data.
Although it's less elegant, it's much saner.

Thanks,
Qu
Johannes Thumshirn May 19, 2022, 11:44 a.m. UTC | #8
On 19/05/2022 12:37, Qu Wenruo wrote:
>> RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
>> so it will work and we can bootstrap from it.
>>
> That sounds good.
> 
> And in that case, we don't need to put stripe tree into system chunks at 
> all.
> 
> So this method means, stripe tree is only useful for data.
> Although it's less elegant, it's much saner.

Yes and no. People still might want to use different metadata profiles than
RAID1. I'd prefer to have system on RAID1 (forced) with stripe trees and 
data/meta-data can be whatever. Of course only RAID5/6 or higher level encodings
which might need a stripe-tree should be accepted with a stripe tree.
Qu Wenruo May 19, 2022, 11:48 a.m. UTC | #9
On 2022/5/19 19:44, Johannes Thumshirn wrote:
> On 19/05/2022 12:37, Qu Wenruo wrote:
>>> RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
>>> so it will work and we can bootstrap from it.
>>>
>> That sounds good.
>>
>> And in that case, we don't need to put stripe tree into system chunks at
>> all.
>>
>> So this method means, stripe tree is only useful for data.
>> Although it's less elegant, it's much saner.
>
> Yes and no. People still might want to use different metadata profiles than
> RAID1.

For RAID1 variants like RAID1C3/4, I guess we don't need stripe tree either?

What about DUP? If RAID1*/DUP/SINGLE all doesn't need stripe tree, I
believe that's already a pretty good profile set for most zoned device
users.

Personally speaking, it would be much simpler to avoid bothering the
stripe tree for metadata.

Thanks,
Qu

> I'd prefer to have system on RAID1 (forced) with stripe trees and
> data/meta-data can be whatever. Of course only RAID5/6 or higher level encodings
> which might need a stripe-tree should be accepted with a stripe tree.
Johannes Thumshirn May 19, 2022, 11:53 a.m. UTC | #10
On 19/05/2022 13:48, Qu Wenruo wrote:
> 
> On 2022/5/19 19:44, Johannes Thumshirn wrote:
>> On 19/05/2022 12:37, Qu Wenruo wrote:
>>>> RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
>>>> so it will work and we can bootstrap from it.
>>>>
>>> That sounds good.
>>>
>>> And in that case, we don't need to put stripe tree into system chunks at
>>> all.
>>>
>>> So this method means, stripe tree is only useful for data.
>>> Although it's less elegant, it's much saner.
>> Yes and no. People still might want to use different metadata profiles than
>> RAID1.
> For RAID1 variants like RAID1C3/4, I guess we don't need stripe tree either?
> 
> What about DUP? If RAID1*/DUP/SINGLE all doesn't need stripe tree, I
> believe that's already a pretty good profile set for most zoned device
> users.
> 
> Personally speaking, it would be much simpler to avoid bothering the
> stripe tree for metadata.

I totally agree, but once you get past say 10 drives you might want to have
different encoding schemes and also have a higher level of redundancy for your 
metadata than just 4 copies.

The stripe tree will also hold any l2p information for erasure coded RAID 
arrays once that's done.

So this definitively should be considered.
Qu Wenruo May 19, 2022, 1:26 p.m. UTC | #11
On 2022/5/19 19:53, Johannes Thumshirn wrote:
> On 19/05/2022 13:48, Qu Wenruo wrote:
>>
>> On 2022/5/19 19:44, Johannes Thumshirn wrote:
>>> On 19/05/2022 12:37, Qu Wenruo wrote:
>>>>> RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
>>>>> so it will work and we can bootstrap from it.
>>>>>
>>>> That sounds good.
>>>>
>>>> And in that case, we don't need to put stripe tree into system chunks at
>>>> all.
>>>>
>>>> So this method means, stripe tree is only useful for data.
>>>> Although it's less elegant, it's much saner.
>>> Yes and no. People still might want to use different metadata profiles than
>>> RAID1.
>> For RAID1 variants like RAID1C3/4, I guess we don't need stripe tree either?
>>
>> What about DUP? If RAID1*/DUP/SINGLE all doesn't need stripe tree, I
>> believe that's already a pretty good profile set for most zoned device
>> users.
>>
>> Personally speaking, it would be much simpler to avoid bothering the
>> stripe tree for metadata.
>
> I totally agree, but once you get past say 10 drives you might want to have
> different encoding schemes and also have a higher level of redundancy for your
> metadata than just 4 copies.
>
> The stripe tree will also hold any l2p information for erasure coded RAID
> arrays once that's done.
>
> So this definitively should be considered.


Then let us consider the extra chunk type flag, like
BTRFS_BLOCK_GROUP_HAS_STRIPE_TREE, and then expand the combination from
the initial RAID1*|HAS_STRIPE_TREE to other profiles.

But for over 10 devices, I doubt we really need to bother metadata that
much. Consider we go RAID1C4, we have the ability to lose 3 devices
already, that's way stronger than RAID6. For metadata I believe it's
completely fine already.

Normally it's data requiring more balance between cost and redundancy as
they are the main part of a fs.

Thus even for 10 disk, metadata RAID1C4, data RAID6 (with stripe tree
for zoned), it still looks very reasonable to me at least.

Thanks,
Qu
Johannes Thumshirn May 19, 2022, 1:49 p.m. UTC | #12
On 19/05/2022 15:27, Qu Wenruo wrote:
> 
> 
> Then let us consider the extra chunk type flag, like
> BTRFS_BLOCK_GROUP_HAS_STRIPE_TREE, and then expand the combination from
> the initial RAID1*|HAS_STRIPE_TREE to other profiles.


That would definitively work for me.
Qu Wenruo May 19, 2022, 10:56 p.m. UTC | #13
On 2022/5/19 21:49, Johannes Thumshirn wrote:
> On 19/05/2022 15:27, Qu Wenruo wrote:
>>
>>
>> Then let us consider the extra chunk type flag, like
>> BTRFS_BLOCK_GROUP_HAS_STRIPE_TREE, and then expand the combination from
>> the initial RAID1*|HAS_STRIPE_TREE to other profiles.
>
>
> That would definitively work for me.

Just one thing to mention, does RAID10 also need stripe tree for
metadata? Or since we're doing depth = 1 IO for metadata anyway, RAID10
is also safe for metadata without using a stripe tree?

If so, I really believe the metadata has already a super good profile
set already.

Thanks,
Qu
Johannes Thumshirn May 20, 2022, 8:27 a.m. UTC | #14
On 20/05/2022 00:56, Qu Wenruo wrote:
> 
> 
> On 2022/5/19 21:49, Johannes Thumshirn wrote:
>> On 19/05/2022 15:27, Qu Wenruo wrote:
>>>
>>>
>>> Then let us consider the extra chunk type flag, like
>>> BTRFS_BLOCK_GROUP_HAS_STRIPE_TREE, and then expand the combination from
>>> the initial RAID1*|HAS_STRIPE_TREE to other profiles.
>>
>>
>> That would definitively work for me.
> 
> Just one thing to mention, does RAID10 also need stripe tree for
> metadata? Or since we're doing depth = 1 IO for metadata anyway, RAID10
> is also safe for metadata without using a stripe tree?
> 
> If so, I really believe the metadata has already a super good profile
> set already.

Yep I think so, as no meta-data is written with zone-append.

I just think for meta-data on raid56 we need something.
diff mbox series

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 20aa2ebac7cd..1db669662f61 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -667,6 +667,7 @@  struct btrfs_fs_info {
 	struct btrfs_root *uuid_root;
 	struct btrfs_root *data_reloc_root;
 	struct btrfs_root *block_group_root;
+	struct btrfs_root *stripe_root;
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d456f426924c..c0f08917465a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1706,6 +1706,9 @@  static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
 
 		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
 	}
+	if (objectid == BTRFS_RAID_STRIPE_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->stripe_root) ?
+			fs_info->stripe_root : ERR_PTR(-ENOENT);
 	return NULL;
 }
 
@@ -1784,6 +1787,7 @@  void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_put_root(fs_info->fs_root);
 	btrfs_put_root(fs_info->data_reloc_root);
 	btrfs_put_root(fs_info->block_group_root);
+	btrfs_put_root(fs_info->stripe_root);
 	btrfs_check_leaked_roots(fs_info);
 	btrfs_extent_buffer_leak_debug_check(fs_info);
 	kfree(fs_info->super_copy);
@@ -2337,6 +2341,7 @@  static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
 	free_root_extent_buffers(info->fs_root);
 	free_root_extent_buffers(info->data_reloc_root);
 	free_root_extent_buffers(info->block_group_root);
+	free_root_extent_buffers(info->stripe_root);
 	if (free_chunk_root)
 		free_root_extent_buffers(info->chunk_root);
 }
@@ -2773,6 +2778,13 @@  static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 		fs_info->uuid_root = root;
 	}
 
+	location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (!IS_ERR(root)) {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		fs_info->stripe_root = root;
+	}
+
 	return 0;
 out:
 	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index d956b2993970..4e0429fc4e87 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -310,6 +310,7 @@  struct btrfs_ioctl_fs_info_args {
 #define BTRFS_FEATURE_INCOMPAT_RAID1C34		(1ULL << 11)
 #define BTRFS_FEATURE_INCOMPAT_ZONED		(1ULL << 12)
 #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2	(1ULL << 13)
+#define BTRFS_FEATURE_INCOMPAT_STRIPE_TREE	(1ULL << 14)
 
 struct btrfs_ioctl_feature_flags {
 	__u64 compat_flags;