diff mbox series

btrfs: pseudo device-scan for single-device filesystems

Message ID b0e0240254557461c137cd9b943f00b0d5048083.1693959204.git.anand.jain@oracle.com (mailing list archive)
State New, archived
Headers show
Series btrfs: pseudo device-scan for single-device filesystems | expand

Commit Message

Anand Jain Sept. 6, 2023, 4:27 p.m. UTC
After the commit [1], we unregister the device from the kernel memory upon
unmounting for a single device.

  [1] 5f58d783fd78 ("btrfs: free device in btrfs_close_devices for a single device filesystem")

So, device registration that was performed before mounting if any is no
longer in the kernel memory.

However, in fact, note that device registration is unnecessary for a
single-device Btrfs filesystem unless it's a seed device.

So for commands like 'btrfs device scan' or 'btrfs device ready' with a
non-seed single-device Btrfs filesystem, they can return success without
the actual device scan.

The seed device must remain in the kernel memory to allow the sprout
device to mount without the need to specify the seed device explicitly.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---

Passes fstests with a fstests fix as below.

  [PATCH] fstests: btrfs/185 update for single device pseudo device-scan

Testing as a boot device is going on.

 fs/btrfs/super.c   | 21 +++++++++++++++------
 fs/btrfs/volumes.c | 12 +++++++++++-
 fs/btrfs/volumes.h |  3 ++-
 3 files changed, 28 insertions(+), 8 deletions(-)

Comments

Anand Jain Sept. 7, 2023, 2:33 p.m. UTC | #1
On 7/9/23 00:27, Anand Jain wrote:
> After the commit [1], we unregister the device from the kernel memory upon
> unmounting for a single device.
> 
>    [1] 5f58d783fd78 ("btrfs: free device in btrfs_close_devices for a single device filesystem")
> 
> So, device registration that was performed before mounting if any is no
> longer in the kernel memory.
> 
> However, in fact, note that device registration is unnecessary for a
> single-device Btrfs filesystem unless it's a seed device.
> 
> So for commands like 'btrfs device scan' or 'btrfs device ready' with a
> non-seed single-device Btrfs filesystem, they can return success without
> the actual device scan.
> 
> The seed device must remain in the kernel memory to allow the sprout
> device to mount without the need to specify the seed device explicitly.
> 
> Signed-off-by: Anand Jain <anand.jain@oracle.com>
> ---
> 
> Passes fstests with a fstests fix as below.
> 
>    [PATCH] fstests: btrfs/185 update for single device pseudo device-scan
> 
> Testing as a boot device is going on.

Confirmed working on Fedora as a boot device.

Thanks.

> 
>   fs/btrfs/super.c   | 21 +++++++++++++++------
>   fs/btrfs/volumes.c | 12 +++++++++++-
>   fs/btrfs/volumes.h |  3 ++-
>   3 files changed, 28 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 32ff441d2c13..22910e0d39a2 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -891,7 +891,7 @@ static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
>   				error = -ENOMEM;
>   				goto out;
>   			}
> -			device = btrfs_scan_one_device(device_name, flags);
> +			device = btrfs_scan_one_device(device_name, flags, false);
>   			kfree(device_name);
>   			if (IS_ERR(device)) {
>   				error = PTR_ERR(device);
> @@ -1486,10 +1486,17 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
>   		goto error_fs_info;
>   	}
>   
> -	device = btrfs_scan_one_device(device_name, mode);
> -	if (IS_ERR(device)) {
> +	device = btrfs_scan_one_device(device_name, mode, true);
> +	if (IS_ERR_OR_NULL(device)) {
>   		mutex_unlock(&uuid_mutex);
>   		error = PTR_ERR(device);
> +		/*
> +		 * As 3rd argument in the funtion
> +		 * btrfs_scan_one_device( , ,mount_arg_dev) above is true, the
> +		 * 'device' or the 'error' won't be NULL or 0 respectively
> +		 * unless for a bug.
> +		 */
> +		ASSERT(error);
>   		goto error_fs_info;
>   	}
>   
> @@ -2199,7 +2206,8 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
>   	switch (cmd) {
>   	case BTRFS_IOC_SCAN_DEV:
>   		mutex_lock(&uuid_mutex);
> -		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
> +		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
> +		/* Return success i.e. 0 for device == NULL */
>   		ret = PTR_ERR_OR_ZERO(device);
>   		mutex_unlock(&uuid_mutex);
>   		break;
> @@ -2213,9 +2221,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
>   		break;
>   	case BTRFS_IOC_DEVICES_READY:
>   		mutex_lock(&uuid_mutex);
> -		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
> -		if (IS_ERR(device)) {
> +		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
> +		if (IS_ERR_OR_NULL(device)) {
>   			mutex_unlock(&uuid_mutex);
> +			/* Return success i.e. 0 for device == NULL */
>   			ret = PTR_ERR(device);
>   			break;
>   		}
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index fa18ea7ef8e3..38e714661286 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1218,7 +1218,8 @@ int btrfs_forget_devices(dev_t devt)
>    * and we are not allowed to call set_blocksize during the scan. The superblock
>    * is read via pagecache
>    */
> -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
> +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
> +					   bool mount_arg_dev)
>   {
>   	struct btrfs_super_block *disk_super;
>   	bool new_device_added = false;
> @@ -1263,10 +1264,19 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
>   		goto error_bdev_put;
>   	}
>   
> +	if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
> +	    !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
> +		pr_info("skip registering single non seed device path %s\n",
> +			path);
> +		device = NULL;
> +		goto free_disk_super;
> +	}
> +
>   	device = device_list_add(path, disk_super, &new_device_added);
>   	if (!IS_ERR(device) && new_device_added)
>   		btrfs_free_stale_devices(device->devt, device);
>   
> +free_disk_super:
>   	btrfs_release_disk_super(disk_super);
>   
>   error_bdev_put:
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index d2a04ede41dd..e4a3470814c5 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -619,7 +619,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
>   void btrfs_mapping_tree_free(struct extent_map_tree *tree);
>   int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
>   		       blk_mode_t flags, void *holder);
> -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags);
> +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
> +					   bool mount_arg_dev);
>   int btrfs_forget_devices(dev_t devt);
>   void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
>   void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
David Sterba Sept. 7, 2023, 2:36 p.m. UTC | #2
On Thu, Sep 07, 2023 at 12:27:41AM +0800, Anand Jain wrote:
> After the commit [1], we unregister the device from the kernel memory upon
> unmounting for a single device.
> 
>   [1] 5f58d783fd78 ("btrfs: free device in btrfs_close_devices for a single device filesystem")

You can write patch references into the text, the [1] references are
more suitable for links.

> So, device registration that was performed before mounting if any is no
> longer in the kernel memory.
> 
> However, in fact, note that device registration is unnecessary for a
> single-device Btrfs filesystem unless it's a seed device.
> 
> So for commands like 'btrfs device scan' or 'btrfs device ready' with a
> non-seed single-device Btrfs filesystem, they can return success without
> the actual device scan.

Just to clarify, the device will be scanned as read by kernel, signature
verified but not added to the fs_devices lists, right?

> The seed device must remain in the kernel memory to allow the sprout
> device to mount without the need to specify the seed device explicitly.

And in case the seeding status of the already scanned and registered
device is changed another scan will happen by udev due to openning for
write. So I guess it's safe.

> 
> Signed-off-by: Anand Jain <anand.jain@oracle.com>
> ---
> 
> Passes fstests with a fstests fix as below.
> 
>   [PATCH] fstests: btrfs/185 update for single device pseudo device-scan
> 
> Testing as a boot device is going on.
> 
>  fs/btrfs/super.c   | 21 +++++++++++++++------
>  fs/btrfs/volumes.c | 12 +++++++++++-
>  fs/btrfs/volumes.h |  3 ++-
>  3 files changed, 28 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 32ff441d2c13..22910e0d39a2 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -891,7 +891,7 @@ static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
>  				error = -ENOMEM;
>  				goto out;
>  			}
> -			device = btrfs_scan_one_device(device_name, flags);
> +			device = btrfs_scan_one_device(device_name, flags, false);
>  			kfree(device_name);
>  			if (IS_ERR(device)) {
>  				error = PTR_ERR(device);
> @@ -1486,10 +1486,17 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
>  		goto error_fs_info;
>  	}
>  
> -	device = btrfs_scan_one_device(device_name, mode);
> -	if (IS_ERR(device)) {
> +	device = btrfs_scan_one_device(device_name, mode, true);
> +	if (IS_ERR_OR_NULL(device)) {
>  		mutex_unlock(&uuid_mutex);
>  		error = PTR_ERR(device);
> +		/*
> +		 * As 3rd argument in the funtion
> +		 * btrfs_scan_one_device( , ,mount_arg_dev) above is true, the
> +		 * 'device' or the 'error' won't be NULL or 0 respectively
> +		 * unless for a bug.
> +		 */
> +		ASSERT(error);

Could the case when device is NULL be handled separately? That way it's
not so obvious that we expect an error and also a NULL pointer.

>  		goto error_fs_info;
>  	}
>  
> @@ -2199,7 +2206,8 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
>  	switch (cmd) {
>  	case BTRFS_IOC_SCAN_DEV:
>  		mutex_lock(&uuid_mutex);
> -		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
> +		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
> +		/* Return success i.e. 0 for device == NULL */
>  		ret = PTR_ERR_OR_ZERO(device);
>  		mutex_unlock(&uuid_mutex);
>  		break;
> @@ -2213,9 +2221,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
>  		break;
>  	case BTRFS_IOC_DEVICES_READY:
>  		mutex_lock(&uuid_mutex);
> -		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
> -		if (IS_ERR(device)) {
> +		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
> +		if (IS_ERR_OR_NULL(device)) {
>  			mutex_unlock(&uuid_mutex);
> +			/* Return success i.e. 0 for device == NULL */
>  			ret = PTR_ERR(device);
>  			break;
>  		}
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index fa18ea7ef8e3..38e714661286 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1218,7 +1218,8 @@ int btrfs_forget_devices(dev_t devt)
>   * and we are not allowed to call set_blocksize during the scan. The superblock
>   * is read via pagecache
>   */
> -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
> +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
> +					   bool mount_arg_dev)
>  {
>  	struct btrfs_super_block *disk_super;
>  	bool new_device_added = false;
> @@ -1263,10 +1264,19 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
>  		goto error_bdev_put;
>  	}
>  
> +	if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
> +	    !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
> +		pr_info("skip registering single non seed device path %s\n",
> +			path);

Wouldn't this be too noisy in the logs? With the scanning and
registration repeated scans of a device there will be only the first
message, but on a system with potentially many single-dev devices each
time udev would want to scan it and it'd get logged.

> +		device = NULL;
> +		goto free_disk_super;
> +	}
> +
>  	device = device_list_add(path, disk_super, &new_device_added);
>  	if (!IS_ERR(device) && new_device_added)
>  		btrfs_free_stale_devices(device->devt, device);
>  
> +free_disk_super:
>  	btrfs_release_disk_super(disk_super);
>  
>  error_bdev_put:
Anand Jain Sept. 7, 2023, 4:48 p.m. UTC | #3
On 07/09/2023 22:36, David Sterba wrote:
> On Thu, Sep 07, 2023 at 12:27:41AM +0800, Anand Jain wrote:
>> After the commit [1], we unregister the device from the kernel memory upon
>> unmounting for a single device.
>>
>>    [1] 5f58d783fd78 ("btrfs: free device in btrfs_close_devices for a single device filesystem")
> 
> You can write patch references into the text, the [1] references are
> more suitable for links.
> 

Got it. Thx.

>> So, device registration that was performed before mounting if any is no
>> longer in the kernel memory.
>>
>> However, in fact, note that device registration is unnecessary for a
>> single-device Btrfs filesystem unless it's a seed device.
>>
>> So for commands like 'btrfs device scan' or 'btrfs device ready' with a
>> non-seed single-device Btrfs filesystem, they can return success without
>> the actual device scan.
> 
> Just to clarify, the device will be scanned as read by kernel, signature
> verified but not added to the fs_devices lists, right?
> 

Right. btrfs_read_disk_super() called during scan, performs sanity
checks on the superblock.

>> The seed device must remain in the kernel memory to allow the sprout
>> device to mount without the need to specify the seed device explicitly.
> 
> And in case the seeding status of the already scanned and registered
> device is changed another scan will happen by udev due to openning for
> write. So I guess it's safe.

Changed? I think you meant converting the seed device back to a normal
device.

With the current code, the stale fs_devices will remain until the
changed device is mounted, as its udev scan will return success without
calling the device_list_add() function.

However, there are two things we can do to fix it:

In the kernel, call btrfs_free_stale_devices() before the pseudo scan's
return.

In the btrfs-progs, 'btrfstune -S 0 <dev-seed>' thread to call 'scan
--forget <dev-seed> ioctl'.

> 
>>
>> Signed-off-by: Anand Jain <anand.jain@oracle.com>
>> ---
>>
>> Passes fstests with a fstests fix as below.
>>
>>    [PATCH] fstests: btrfs/185 update for single device pseudo device-scan
>>
>> Testing as a boot device is going on.
>>
>>   fs/btrfs/super.c   | 21 +++++++++++++++------
>>   fs/btrfs/volumes.c | 12 +++++++++++-
>>   fs/btrfs/volumes.h |  3 ++-
>>   3 files changed, 28 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
>> index 32ff441d2c13..22910e0d39a2 100644
>> --- a/fs/btrfs/super.c
>> +++ b/fs/btrfs/super.c
>> @@ -891,7 +891,7 @@ static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
>>   				error = -ENOMEM;
>>   				goto out;
>>   			}
>> -			device = btrfs_scan_one_device(device_name, flags);
>> +			device = btrfs_scan_one_device(device_name, flags, false);
>>   			kfree(device_name);
>>   			if (IS_ERR(device)) {
>>   				error = PTR_ERR(device);
>> @@ -1486,10 +1486,17 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
>>   		goto error_fs_info;
>>   	}
>>   
>> -	device = btrfs_scan_one_device(device_name, mode);
>> -	if (IS_ERR(device)) {
>> +	device = btrfs_scan_one_device(device_name, mode, true);
>> +	if (IS_ERR_OR_NULL(device)) {
>>   		mutex_unlock(&uuid_mutex);
>>   		error = PTR_ERR(device);
>> +		/*
>> +		 * As 3rd argument in the funtion
>> +		 * btrfs_scan_one_device( , ,mount_arg_dev) above is true, the
>> +		 * 'device' or the 'error' won't be NULL or 0 respectively
>> +		 * unless for a bug.
>> +		 */
>> +		ASSERT(error);
> 
> Could the case when device is NULL be handled separately? That way it's
> not so obvious that we expect an error and also a NULL pointer.

That's neat. Will do.

> 
>>   		goto error_fs_info;
>>   	}
>>   
>> @@ -2199,7 +2206,8 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
>>   	switch (cmd) {
>>   	case BTRFS_IOC_SCAN_DEV:
>>   		mutex_lock(&uuid_mutex);
>> -		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
>> +		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
>> +		/* Return success i.e. 0 for device == NULL */
>>   		ret = PTR_ERR_OR_ZERO(device);
>>   		mutex_unlock(&uuid_mutex);
>>   		break;
>> @@ -2213,9 +2221,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
>>   		break;
>>   	case BTRFS_IOC_DEVICES_READY:
>>   		mutex_lock(&uuid_mutex);
>> -		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
>> -		if (IS_ERR(device)) {
>> +		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
>> +		if (IS_ERR_OR_NULL(device)) {
>>   			mutex_unlock(&uuid_mutex);
>> +			/* Return success i.e. 0 for device == NULL */
>>   			ret = PTR_ERR(device);
>>   			break;
>>   		}
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index fa18ea7ef8e3..38e714661286 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -1218,7 +1218,8 @@ int btrfs_forget_devices(dev_t devt)
>>    * and we are not allowed to call set_blocksize during the scan. The superblock
>>    * is read via pagecache
>>    */
>> -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
>> +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
>> +					   bool mount_arg_dev)
>>   {
>>   	struct btrfs_super_block *disk_super;
>>   	bool new_device_added = false;
>> @@ -1263,10 +1264,19 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
>>   		goto error_bdev_put;
>>   	}
>>   
>> +	if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
>> +	    !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
>> +		pr_info("skip registering single non seed device path %s\n",
>> +			path);
> 
> Wouldn't this be too noisy in the logs? With the scanning and
> registration repeated scans of a device there will be only the first
> message, but on a system with potentially many single-dev devices each
> time udev would want to scan it and it'd get logged.

Ah. I used it for debugging; it should be removed.

Will send v2.

Thanks, Anand

> 
>> +		device = NULL;
>> +		goto free_disk_super;
>> +	}
>> +
>>   	device = device_list_add(path, disk_super, &new_device_added);
>>   	if (!IS_ERR(device) && new_device_added)
>>   		btrfs_free_stale_devices(device->devt, device);
>>   
>> +free_disk_super:
>>   	btrfs_release_disk_super(disk_super);
>>   
>>   error_bdev_put:
David Sterba Sept. 7, 2023, 9:10 p.m. UTC | #4
On Fri, Sep 08, 2023 at 12:48:04AM +0800, Anand Jain wrote:
> Right. btrfs_read_disk_super() called during scan, performs sanity
> checks on the superblock.
> 
> >> The seed device must remain in the kernel memory to allow the sprout
> >> device to mount without the need to specify the seed device explicitly.
> > 
> > And in case the seeding status of the already scanned and registered
> > device is changed another scan will happen by udev due to openning for
> > write. So I guess it's safe.
> 
> Changed? I think you meant converting the seed device back to a normal
> device.
> 
> With the current code, the stale fs_devices will remain until the
> changed device is mounted, as its udev scan will return success without
> calling the device_list_add() function.
> 
> However, there are two things we can do to fix it:
> 
> In the kernel, call btrfs_free_stale_devices() before the pseudo scan's
> return.
> 
> In the btrfs-progs, 'btrfstune -S 0 <dev-seed>' thread to call 'scan
> --forget <dev-seed> ioctl'.

This should work without involvement of userspace regarding the single
devices, while adding an explicit 'device scan --forget' to the scan
command would make sure that our tools do the right thing in case
somebody copies that.

The device scanning is quite specific but systemd/udevd has own utility
that scans devices so it could be updated (depends when) and if kernel
does it right regardless of the systemd scan then we can cover more
cases.

> >> --- a/fs/btrfs/volumes.c
> >> +++ b/fs/btrfs/volumes.c
> >> @@ -1218,7 +1218,8 @@ int btrfs_forget_devices(dev_t devt)
> >>    * and we are not allowed to call set_blocksize during the scan. The superblock
> >>    * is read via pagecache
> >>    */
> >> -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
> >> +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
> >> +					   bool mount_arg_dev)
> >>   {
> >>   	struct btrfs_super_block *disk_super;
> >>   	bool new_device_added = false;
> >> @@ -1263,10 +1264,19 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
> >>   		goto error_bdev_put;
> >>   	}
> >>   
> >> +	if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
> >> +	    !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
> >> +		pr_info("skip registering single non seed device path %s\n",
> >> +			path);
> > 
> > Wouldn't this be too noisy in the logs? With the scanning and
> > registration repeated scans of a device there will be only the first
> > message, but on a system with potentially many single-dev devices each
> > time udev would want to scan it and it'd get logged.
> 
> Ah. I used it for debugging; it should be removed.

You can turn it to pr_debug if you found it useful, for testing setups
we don't mind more messages.
Anand Jain Sept. 8, 2023, 4:07 p.m. UTC | #5
On 08/09/2023 05:10, David Sterba wrote:
> On Fri, Sep 08, 2023 at 12:48:04AM +0800, Anand Jain wrote:
>> Right. btrfs_read_disk_super() called during scan, performs sanity
>> checks on the superblock.
>>
>>>> The seed device must remain in the kernel memory to allow the sprout
>>>> device to mount without the need to specify the seed device explicitly.
>>>
>>> And in case the seeding status of the already scanned and registered
>>> device is changed another scan will happen by udev due to openning for
>>> write. So I guess it's safe.
>>
>> Changed? I think you meant converting the seed device back to a normal
>> device.
>>
>> With the current code, the stale fs_devices will remain until the
>> changed device is mounted, as its udev scan will return success without
>> calling the device_list_add() function.
>>
>> However, there are two things we can do to fix it:
>>
>> In the kernel, call btrfs_free_stale_devices() before the pseudo scan's
>> return.
>>
>> In the btrfs-progs, 'btrfstune -S 0 <dev-seed>' thread to call 'scan
>> --forget <dev-seed> ioctl'.
> 
> This should work without involvement of userspace regarding the single
> devices, while adding an explicit 'device scan --forget' to the scan
> command would make sure that our tools do the right thing in case
> somebody copies that.
> 
> The device scanning is quite specific but systemd/udevd has own utility
> that scans devices so it could be updated (depends when) and if kernel
> does it right regardless of the systemd scan then we can cover more
> cases.

Kernel fix added in v2.

> 
>>>> --- a/fs/btrfs/volumes.c
>>>> +++ b/fs/btrfs/volumes.c
>>>> @@ -1218,7 +1218,8 @@ int btrfs_forget_devices(dev_t devt)
>>>>     * and we are not allowed to call set_blocksize during the scan. The superblock
>>>>     * is read via pagecache
>>>>     */
>>>> -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
>>>> +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
>>>> +					   bool mount_arg_dev)
>>>>    {
>>>>    	struct btrfs_super_block *disk_super;
>>>>    	bool new_device_added = false;
>>>> @@ -1263,10 +1264,19 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
>>>>    		goto error_bdev_put;
>>>>    	}
>>>>    
>>>> +	if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
>>>> +	    !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
>>>> +		pr_info("skip registering single non seed device path %s\n",
>>>> +			path);
>>>
>>> Wouldn't this be too noisy in the logs? With the scanning and
>>> registration repeated scans of a device there will be only the first
>>> message, but on a system with potentially many single-dev devices each
>>> time udev would want to scan it and it'd get logged.
>>
>> Ah. I used it for debugging; it should be removed.
> 
> You can turn it to pr_debug if you found it useful, for testing setups
> we don't mind more messages.

Converted it to pr_debug() in v2.

Thanks, Anand
diff mbox series

Patch

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 32ff441d2c13..22910e0d39a2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -891,7 +891,7 @@  static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
 				error = -ENOMEM;
 				goto out;
 			}
-			device = btrfs_scan_one_device(device_name, flags);
+			device = btrfs_scan_one_device(device_name, flags, false);
 			kfree(device_name);
 			if (IS_ERR(device)) {
 				error = PTR_ERR(device);
@@ -1486,10 +1486,17 @@  static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
 		goto error_fs_info;
 	}
 
-	device = btrfs_scan_one_device(device_name, mode);
-	if (IS_ERR(device)) {
+	device = btrfs_scan_one_device(device_name, mode, true);
+	if (IS_ERR_OR_NULL(device)) {
 		mutex_unlock(&uuid_mutex);
 		error = PTR_ERR(device);
+		/*
+		 * As 3rd argument in the funtion
+		 * btrfs_scan_one_device( , ,mount_arg_dev) above is true, the
+		 * 'device' or the 'error' won't be NULL or 0 respectively
+		 * unless for a bug.
+		 */
+		ASSERT(error);
 		goto error_fs_info;
 	}
 
@@ -2199,7 +2206,8 @@  static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
 		mutex_lock(&uuid_mutex);
-		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
+		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+		/* Return success i.e. 0 for device == NULL */
 		ret = PTR_ERR_OR_ZERO(device);
 		mutex_unlock(&uuid_mutex);
 		break;
@@ -2213,9 +2221,10 @@  static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 		break;
 	case BTRFS_IOC_DEVICES_READY:
 		mutex_lock(&uuid_mutex);
-		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
-		if (IS_ERR(device)) {
+		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+		if (IS_ERR_OR_NULL(device)) {
 			mutex_unlock(&uuid_mutex);
+			/* Return success i.e. 0 for device == NULL */
 			ret = PTR_ERR(device);
 			break;
 		}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fa18ea7ef8e3..38e714661286 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1218,7 +1218,8 @@  int btrfs_forget_devices(dev_t devt)
  * and we are not allowed to call set_blocksize during the scan. The superblock
  * is read via pagecache
  */
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+					   bool mount_arg_dev)
 {
 	struct btrfs_super_block *disk_super;
 	bool new_device_added = false;
@@ -1263,10 +1264,19 @@  struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
 		goto error_bdev_put;
 	}
 
+	if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+	    !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
+		pr_info("skip registering single non seed device path %s\n",
+			path);
+		device = NULL;
+		goto free_disk_super;
+	}
+
 	device = device_list_add(path, disk_super, &new_device_added);
 	if (!IS_ERR(device) && new_device_added)
 		btrfs_free_stale_devices(device->devt, device);
 
+free_disk_super:
 	btrfs_release_disk_super(disk_super);
 
 error_bdev_put:
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d2a04ede41dd..e4a3470814c5 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -619,7 +619,8 @@  struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
 void btrfs_mapping_tree_free(struct extent_map_tree *tree);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       blk_mode_t flags, void *holder);
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags);
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+					   bool mount_arg_dev);
 int btrfs_forget_devices(dev_t devt);
 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);