diff mbox series

btrfs: Create sysfs entries only for non-stale devices

Message ID 20211215144639.876776-1-nborisov@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: Create sysfs entries only for non-stale devices | expand

Commit Message

Nikolay Borisov Dec. 15, 2021, 2:46 p.m. UTC
Currently /sys/fs/btrfs/<uuid>/devinfo/<devid> entry is always created
for a device present in btrfs_fs_devices on the other hand
/sys/fs/btrfs/<uuid>/devices/<devname> sysfs link is created only when
the given btrfs_device::bdisk member is populated. This can lead to
cases where a filesystem consisting of 2 device, one of which is stale
ends up having 2 entries under /sys/fs/btrfs/<uuid>/devinfo/<devid>
but only one under /sys/fs/btrfs/<uuid>/devices/<devname>.

Another case that occurs is if a filesystem initially occupied 2
devices, then got unmounted, and a new filesystem is created, which
occupies a single device but with the same UUID as the one occupying 2
devices. In this case /sys/fs/btrfs/<uuid>/devices/<devname> will
correctly have 1 entry but /sys/fs/btrfs/<uuid>/devices/<devname> will
incorrectly has 2. This behavior is demonstrated by the following
script:

    UUID=292afefb-6e8c-4fb3-9d12-8c4ecb1f2374
    rm /tmp/d1
    rm /tmp/d2
    truncate -s 1G /tmp/d1
    truncate -s 1G /tmp/d2
    sudo losetup /dev/loop1 /tmp/d1
    sudo losetup /dev/loop2 /tmp/d2
    sudo mkfs.btrfs -U $UUID /dev/loop1 /dev/loop2
    sudo mount /dev/loop1 /mnt/btrfs1
    sudo umount /dev/loop1
    sudo losetup -d /dev/loop2
    sudo losetup -d /dev/loop1

    # create a new filesystem with only ONE loop-device; mount it
    rm /tmp/d1
    truncate -s 1G /tmp/d1
    sudo losetup /dev/loop1 /tmp/d1
    sudo mkfs.btrfs -U $UUID /dev/loop1
    sudo mount /dev/loop1 /mnt/btrfs1

Fix this by ensuring that device sysfs attributes are only added for
devices which are actually present at the time of mount.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
---
 fs/btrfs/sysfs.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

Comments

Nikolay Borisov Dec. 15, 2021, 2:47 p.m. UTC | #1
On 15.12.21 г. 16:46, Nikolay Borisov wrote:
> Currently /sys/fs/btrfs/<uuid>/devinfo/<devid> entry is always created
> for a device present in btrfs_fs_devices on the other hand
> /sys/fs/btrfs/<uuid>/devices/<devname> sysfs link is created only when
> the given btrfs_device::bdisk member is populated. This can lead to
> cases where a filesystem consisting of 2 device, one of which is stale
> ends up having 2 entries under /sys/fs/btrfs/<uuid>/devinfo/<devid>
> but only one under /sys/fs/btrfs/<uuid>/devices/<devname>.
> 
> Another case that occurs is if a filesystem initially occupied 2
> devices, then got unmounted, and a new filesystem is created, which
> occupies a single device but with the same UUID as the one occupying 2
> devices. In this case /sys/fs/btrfs/<uuid>/devices/<devname> will
> correctly have 1 entry but /sys/fs/btrfs/<uuid>/devices/<devname> will
> incorrectly has 2. This behavior is demonstrated by the following
> script:
> 
>     UUID=292afefb-6e8c-4fb3-9d12-8c4ecb1f2374
>     rm /tmp/d1
>     rm /tmp/d2
>     truncate -s 1G /tmp/d1
>     truncate -s 1G /tmp/d2
>     sudo losetup /dev/loop1 /tmp/d1
>     sudo losetup /dev/loop2 /tmp/d2
>     sudo mkfs.btrfs -U $UUID /dev/loop1 /dev/loop2
>     sudo mount /dev/loop1 /mnt/btrfs1
>     sudo umount /dev/loop1
>     sudo losetup -d /dev/loop2
>     sudo losetup -d /dev/loop1
> 
>     # create a new filesystem with only ONE loop-device; mount it
>     rm /tmp/d1
>     truncate -s 1G /tmp/d1
>     sudo losetup /dev/loop1 /tmp/d1
>     sudo mkfs.btrfs -U $UUID /dev/loop1
>     sudo mount /dev/loop1 /mnt/btrfs1
> 
> Fix this by ensuring that device sysfs attributes are only added for
> devices which are actually present at the time of mount.
> 
> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
> ---
>  fs/btrfs/sysfs.c | 21 ++++++++++++---------
>  1 file changed, 12 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> index beb7f72d50b8..e2e110d7798a 100644
> --- a/fs/btrfs/sysfs.c
> +++ b/fs/btrfs/sysfs.c
> @@ -12,6 +12,7 @@
>  #include <crypto/hash.h>
>  
>  #include "ctree.h"
> +#include "rcu-string.h"

Doh, that's unrelated and is a remain of some debug aid. If there are no
major feedback on the patch David could you remove it when merging.

>  #include "discard.h"
>  #include "disk-io.h"
>  #include "send.h"
> @@ -1611,9 +1612,13 @@ int btrfs_sysfs_add_device(struct btrfs_device *device)
>  {
>  	int ret;
>  	unsigned int nofs_flag;
> +	struct kobject *disk_kobj;
>  	struct kobject *devices_kobj;
>  	struct kobject *devinfo_kobj;
>  
> +	if (!device->bdev)
> +		return 0;
> +
>  	/*
>  	 * Make sure we use the fs_info::fs_devices to fetch the kobjects even
>  	 * for the seed fs_devices
> @@ -1625,16 +1630,14 @@ int btrfs_sysfs_add_device(struct btrfs_device *device)
>  
>  	nofs_flag = memalloc_nofs_save();
>  
> -	if (device->bdev) {
> -		struct kobject *disk_kobj = bdev_kobj(device->bdev);
> +	disk_kobj = bdev_kobj(device->bdev);
>  
> -		ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
> -		if (ret) {
> -			btrfs_warn(device->fs_info,
> -				"creating sysfs device link for devid %llu failed: %d",
> -				device->devid, ret);
> -			goto out;
> -		}
> +	ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
> +	if (ret) {
> +		btrfs_warn(device->fs_info,
> +			   "creating sysfs device link for devid %llu failed: %d",
> +			   device->devid, ret);
> +		goto out;
>  	}
>  
>  	init_completion(&device->kobj_unregister);
>
Goffredo Baroncelli Dec. 15, 2021, 6:55 p.m. UTC | #2
Hi Nikolay,

On 12/15/21 15:46, Nikolay Borisov wrote:
> Currently /sys/fs/btrfs/<uuid>/devinfo/<devid> entry is always created
> for a device present in btrfs_fs_devices on the other hand
> /sys/fs/btrfs/<uuid>/devices/<devname> sysfs link is created only when
> the given btrfs_device::bdisk member is populated. This can lead to
> cases where a filesystem consisting of 2 device, one of which is stale
> ends up having 2 entries under /sys/fs/btrfs/<uuid>/devinfo/<devid>
> but only one under /sys/fs/btrfs/<uuid>/devices/<devname>.

What happened in case of a degraded mode ? Is correct to not show a missing devices ?


> Another case that occurs is if a filesystem initially occupied 2
> devices, then got unmounted, and a new filesystem is created, which
> occupies a single device but with the same UUID as the one occupying 2
> devices. In this case /sys/fs/btrfs/<uuid>/devices/<devname> will
> correctly have 1 entry but /sys/fs/btrfs/<uuid>/devices/<devname> will
> incorrectly has 2. This behavior is demonstrated by the following
> script:
> 
>      UUID=292afefb-6e8c-4fb3-9d12-8c4ecb1f2374
>      rm /tmp/d1
>      rm /tmp/d2
>      truncate -s 1G /tmp/d1
>      truncate -s 1G /tmp/d2
>      sudo losetup /dev/loop1 /tmp/d1
>      sudo losetup /dev/loop2 /tmp/d2
>      sudo mkfs.btrfs -U $UUID /dev/loop1 /dev/loop2
>      sudo mount /dev/loop1 /mnt/btrfs1
>      sudo umount /dev/loop1
>      sudo losetup -d /dev/loop2
>      sudo losetup -d /dev/loop1
> 
>      # create a new filesystem with only ONE loop-device; mount it
>      rm /tmp/d1
>      truncate -s 1G /tmp/d1
>      sudo losetup /dev/loop1 /tmp/d1
>      sudo mkfs.btrfs -U $UUID /dev/loop1
>      sudo mount /dev/loop1 /mnt/btrfs1
> 
> Fix this by ensuring that device sysfs attributes are only added for
> devices which are actually present at the time of mount.
> 
> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
> ---
>   fs/btrfs/sysfs.c | 21 ++++++++++++---------
>   1 file changed, 12 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> index beb7f72d50b8..e2e110d7798a 100644
> --- a/fs/btrfs/sysfs.c
> +++ b/fs/btrfs/sysfs.c
> @@ -12,6 +12,7 @@
>   #include <crypto/hash.h>
>   
>   #include "ctree.h"
> +#include "rcu-string.h"
>   #include "discard.h"
>   #include "disk-io.h"
>   #include "send.h"
> @@ -1611,9 +1612,13 @@ int btrfs_sysfs_add_device(struct btrfs_device *device)
>   {
>   	int ret;
>   	unsigned int nofs_flag;
> +	struct kobject *disk_kobj;
>   	struct kobject *devices_kobj;
>   	struct kobject *devinfo_kobj;
>   
> +	if (!device->bdev)
> +		return 0;
> +
>   	/*
>   	 * Make sure we use the fs_info::fs_devices to fetch the kobjects even
>   	 * for the seed fs_devices
> @@ -1625,16 +1630,14 @@ int btrfs_sysfs_add_device(struct btrfs_device *device)
>   
>   	nofs_flag = memalloc_nofs_save();
>   
> -	if (device->bdev) {
> -		struct kobject *disk_kobj = bdev_kobj(device->bdev);
> +	disk_kobj = bdev_kobj(device->bdev);
>   
> -		ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
> -		if (ret) {
> -			btrfs_warn(device->fs_info,
> -				"creating sysfs device link for devid %llu failed: %d",
> -				device->devid, ret);
> -			goto out;
> -		}
> +	ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
> +	if (ret) {
> +		btrfs_warn(device->fs_info,
> +			   "creating sysfs device link for devid %llu failed: %d",
> +			   device->devid, ret);
> +		goto out;
>   	}
>   
>   	init_completion(&device->kobj_unregister);
Nikolay Borisov Dec. 15, 2021, 10:02 p.m. UTC | #3
On 15.12.21 г. 20:55, Goffredo Baroncelli wrote:
> Hi Nikolay,
> 
> On 12/15/21 15:46, Nikolay Borisov wrote:
>> Currently /sys/fs/btrfs/<uuid>/devinfo/<devid> entry is always created
>> for a device present in btrfs_fs_devices on the other hand
>> /sys/fs/btrfs/<uuid>/devices/<devname> sysfs link is created only when
>> the given btrfs_device::bdisk member is populated. This can lead to
>> cases where a filesystem consisting of 2 device, one of which is stale
>> ends up having 2 entries under /sys/fs/btrfs/<uuid>/devinfo/<devid>
>> but only one under /sys/fs/btrfs/<uuid>/devices/<devname>.
> 
> What happened in case of a degraded mode ? Is correct to not show a
> missing devices ?

Good question, now I'm thinking if 'devices' show the currently
available devices to the filesystem, whilst 'devinfo' is supposed to
show what devices the fs knows about. So in the case of degraded mount
it should really have 2 devices under devinfo and only 1 under device.
But this also means the case you reported shouldn't be handled by the
devinfo sysfs code but rather the admin should do 'btrfs device scan -u'
to remove the stale device.


I'd say this is all pretty open to interpretation so I'd like to also
see David's and Josef's opinions on this.


> 
> 
<snip>
Goffredo Baroncelli Dec. 16, 2021, 6:52 p.m. UTC | #4
On 12/15/21 23:02, Nikolay Borisov wrote:
> 
> 
> On 15.12.21 г. 20:55, Goffredo Baroncelli wrote:
>> Hi Nikolay,
>>
>> On 12/15/21 15:46, Nikolay Borisov wrote:
>>> Currently /sys/fs/btrfs/<uuid>/devinfo/<devid> entry is always created
>>> for a device present in btrfs_fs_devices on the other hand
>>> /sys/fs/btrfs/<uuid>/devices/<devname> sysfs link is created only when
>>> the given btrfs_device::bdisk member is populated. This can lead to
>>> cases where a filesystem consisting of 2 device, one of which is stale
>>> ends up having 2 entries under /sys/fs/btrfs/<uuid>/devinfo/<devid>
>>> but only one under /sys/fs/btrfs/<uuid>/devices/<devname>.
>>
>> What happened in case of a degraded mode ? Is correct to not show a
>> missing devices ?
> 
> Good question, now I'm thinking if 'devices' show the currently
> available devices to the filesystem, whilst 'devinfo' is supposed to
> show what devices the fs knows about. So in the case of degraded mount
> it should really have 2 devices under devinfo and only 1 under device.
> But this also means the case you reported shouldn't be handled by the
> devinfo sysfs code but rather the admin should do 'btrfs device scan -u'
> to remove the stale device.
> 
> 
> I'd say this is all pretty open to interpretation so I'd like to also
> see David's and Josef's opinions on this.

After some thinking, I reach the conclusion that devices/ shows the correct
values by mistake :-)

My understanding of a btrfs filesystem bootstrap is the following:

- each time a new block device appears, if this is a valid BTRFS device,
it is registered in a list
- at mount time, BTRFS groups all the devices with the same FS-UUID
- during the mount, it is assumed that if a device has a valid FS-UUID it
is a valid block devices for the filesystem

BTRFS has the following information to detect "foreign" block devices:
1) the UUID of each block devices should match the ones stored in DEV_ITEM (in the metadata)
2) the generation number should match
3) the "num_devices" field of superblock should match the total number of devices

In this case I see two problems:

a) the device was registered but it was unavailable at the time of mounting.
I don't think that it should be considered valid at all and it should be
removed from the list of the available devices when the first access for
check 1) and 2) is tried.
I know that a device can disappear after, but this case is different: if we
can perform 1) and 2) we can classify the device as valid/invalid
If we cannot do (as this case) we can consider it an artifact and discard enterly

b) in any case the check 3) ( 1) and 1) cannot be performed without the device)
should prevent the filesystem to be mounted if num_devices < number of the
listed devices (which is different from the available device). Looking at the
code it seems that only the case of a missing devices is handled










> 
>>
>>
> <snip>
diff mbox series

Patch

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index beb7f72d50b8..e2e110d7798a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -12,6 +12,7 @@ 
 #include <crypto/hash.h>
 
 #include "ctree.h"
+#include "rcu-string.h"
 #include "discard.h"
 #include "disk-io.h"
 #include "send.h"
@@ -1611,9 +1612,13 @@  int btrfs_sysfs_add_device(struct btrfs_device *device)
 {
 	int ret;
 	unsigned int nofs_flag;
+	struct kobject *disk_kobj;
 	struct kobject *devices_kobj;
 	struct kobject *devinfo_kobj;
 
+	if (!device->bdev)
+		return 0;
+
 	/*
 	 * Make sure we use the fs_info::fs_devices to fetch the kobjects even
 	 * for the seed fs_devices
@@ -1625,16 +1630,14 @@  int btrfs_sysfs_add_device(struct btrfs_device *device)
 
 	nofs_flag = memalloc_nofs_save();
 
-	if (device->bdev) {
-		struct kobject *disk_kobj = bdev_kobj(device->bdev);
+	disk_kobj = bdev_kobj(device->bdev);
 
-		ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
-		if (ret) {
-			btrfs_warn(device->fs_info,
-				"creating sysfs device link for devid %llu failed: %d",
-				device->devid, ret);
-			goto out;
-		}
+	ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
+	if (ret) {
+		btrfs_warn(device->fs_info,
+			   "creating sysfs device link for devid %llu failed: %d",
+			   device->devid, ret);
+		goto out;
 	}
 
 	init_completion(&device->kobj_unregister);