diff mbox series

[v4,2/2] btrfs: reset device back to allocation state when removing

Message ID 20191126084006.23262-3-jthumshirn@suse.de (mailing list archive)
State New, archived
Headers show
Series remove BUG_ON()s in btrfs_close_one_device() | expand

Commit Message

Johannes Thumshirn Nov. 26, 2019, 8:40 a.m. UTC
When closing a device, btrfs_close_one_device() first allocates a new
device, copies the device to close's name, replaces it in the dev_list
with the copy and then finally frees it.

This involves two memory allocation, which can potentially fail. As this
code path is tricky to unwind, the allocation failures where handled by
BUG_ON()s.

But this copying isn't strictly needed, all that is needed is resetting
the device in question to it's state it had after the allocation.

Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>

---
Changes to v3:
- Clear DEV_STATE_WRITABLE _after_ btrfs_close_bdev() (Nik)
---
 fs/btrfs/volumes.c | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

Comments

Nikolay Borisov Nov. 26, 2019, 10:21 a.m. UTC | #1
On 26.11.19 г. 10:40 ч., Johannes Thumshirn wrote:
> When closing a device, btrfs_close_one_device() first allocates a new
> device, copies the device to close's name, replaces it in the dev_list
> with the copy and then finally frees it.
> 
> This involves two memory allocation, which can potentially fail. As this
> code path is tricky to unwind, the allocation failures where handled by
> BUG_ON()s.
> 
> But this copying isn't strictly needed, all that is needed is resetting
> the device in question to it's state it had after the allocation.
> 
> Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>

Overall looks good one minor nit which can be fixed at commit time (or
ignored).

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
> 
> ---
> Changes to v3:
> - Clear DEV_STATE_WRITABLE _after_ btrfs_close_bdev() (Nik)
> ---
>  fs/btrfs/volumes.c | 38 +++++++++++++++++---------------------
>  1 file changed, 17 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 2398b071bcf6..efabffa54a45 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1064,8 +1064,6 @@ static void btrfs_close_bdev(struct btrfs_device *device)
>  static void btrfs_close_one_device(struct btrfs_device *device)
>  {
>  	struct btrfs_fs_devices *fs_devices = device->fs_devices;
> -	struct btrfs_device *new_device;
> -	struct rcu_string *name;
>  
>  	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
>  	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
> @@ -1073,29 +1071,27 @@ static void btrfs_close_one_device(struct btrfs_device *device)
>  		fs_devices->rw_devices--;
>  	}
>  
> -	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
> +	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
>  		fs_devices->missing_devices--;
> +		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
> +	}
nit: you can use test_and_clear_bit

>  
> +	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
>  	btrfs_close_bdev(device);
> -	if (device->bdev)
> +	if (device->bdev) {
>  		fs_devices->open_devices--;
> -
> -	new_device = btrfs_alloc_device(NULL, &device->devid,
> -					device->uuid);
> -	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
> -
> -	/* Safe because we are under uuid_mutex */
> -	if (device->name) {
> -		name = rcu_string_strdup(device->name->str, GFP_NOFS);
> -		BUG_ON(!name); /* -ENOMEM */
> -		rcu_assign_pointer(new_device->name, name);
> -	}
> -
> -	list_replace_rcu(&device->dev_list, &new_device->dev_list);
> -	new_device->fs_devices = device->fs_devices;
> -
> -	synchronize_rcu();
> -	btrfs_free_device(device);
> +		device->bdev = NULL;
> +	}
> +	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
> +
> +	/* Verify the device is back in a pristine state  */
> +	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
> +	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
> +	ASSERT(list_empty(&device->dev_alloc_list));
> +	ASSERT(list_empty(&device->post_commit_list));
> +	ASSERT(atomic_read(&device->reada_in_flight) == 0);
> +	ASSERT(atomic_read(&device->dev_stats_ccnt) == 0);
> +	ASSERT(RB_EMPTY_ROOT(&device->alloc_state.state));
>  }
>  
>  static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
>
David Sterba Nov. 26, 2019, 5:17 p.m. UTC | #2
On Tue, Nov 26, 2019 at 09:40:06AM +0100, Johannes Thumshirn wrote:
> When closing a device, btrfs_close_one_device() first allocates a new
> device, copies the device to close's name, replaces it in the dev_list
> with the copy and then finally frees it.
> 
> This involves two memory allocation, which can potentially fail. As this
> code path is tricky to unwind, the allocation failures where handled by
> BUG_ON()s.
> 
> But this copying isn't strictly needed, all that is needed is resetting
> the device in question to it's state it had after the allocation.
> 
> Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
> 
> ---
> Changes to v3:
> - Clear DEV_STATE_WRITABLE _after_ btrfs_close_bdev() (Nik)
> ---
>  fs/btrfs/volumes.c | 38 +++++++++++++++++---------------------
>  1 file changed, 17 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 2398b071bcf6..efabffa54a45 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1064,8 +1064,6 @@ static void btrfs_close_bdev(struct btrfs_device *device)
>  static void btrfs_close_one_device(struct btrfs_device *device)
>  {
>  	struct btrfs_fs_devices *fs_devices = device->fs_devices;
> -	struct btrfs_device *new_device;
> -	struct rcu_string *name;
>  
>  	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
>  	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
> @@ -1073,29 +1071,27 @@ static void btrfs_close_one_device(struct btrfs_device *device)
>  		fs_devices->rw_devices--;
>  	}
>  
> -	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
> +	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
>  		fs_devices->missing_devices--;
> +		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
> +	}
>  
> +	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
>  	btrfs_close_bdev(device);
> -	if (device->bdev)
> +	if (device->bdev) {
>  		fs_devices->open_devices--;
> -
> -	new_device = btrfs_alloc_device(NULL, &device->devid,
> -					device->uuid);
> -	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
> -
> -	/* Safe because we are under uuid_mutex */
> -	if (device->name) {
> -		name = rcu_string_strdup(device->name->str, GFP_NOFS);
> -		BUG_ON(!name); /* -ENOMEM */
> -		rcu_assign_pointer(new_device->name, name);
> -	}
> -
> -	list_replace_rcu(&device->dev_list, &new_device->dev_list);
> -	new_device->fs_devices = device->fs_devices;
> -
> -	synchronize_rcu();

The changelong should say something about RCU as this is being changed
here.

RCU was for the dev_list and and device name, with some read-only list
traversal that can run in parallel (like FS_INFO or DEV_INFO ioctls).
This is safe because the device list hook is not removed and the name is
not changed.

> -	btrfs_free_device(device);
> +		device->bdev = NULL;
> +	}
> +	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
> +
> +	/* Verify the device is back in a pristine state  */
> +	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
> +	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
> +	ASSERT(list_empty(&device->dev_alloc_list));
> +	ASSERT(list_empty(&device->post_commit_list));
> +	ASSERT(atomic_read(&device->reada_in_flight) == 0);
> +	ASSERT(atomic_read(&device->dev_stats_ccnt) == 0);
> +	ASSERT(RB_EMPTY_ROOT(&device->alloc_state.state));

I went through members of the device struct, lots of them are set once
so don't change. last_flush_error is set and read during commit,

Besides the dev_state bits handled above, I think tre rest should be
here too, ie.  BTRFS_DEV_STATE_IN_FS_METADATA and
BTRFS_DEV_STATE_MISSING (though this might be ok to keep as-is).
David Sterba Nov. 27, 2019, 3:26 p.m. UTC | #3
On Tue, Nov 26, 2019 at 06:17:03PM +0100, David Sterba wrote:
> > +	/* Verify the device is back in a pristine state  */
> > +	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
> > +	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
> > +	ASSERT(list_empty(&device->dev_alloc_list));
> > +	ASSERT(list_empty(&device->post_commit_list));
> > +	ASSERT(atomic_read(&device->reada_in_flight) == 0);
> > +	ASSERT(atomic_read(&device->dev_stats_ccnt) == 0);
> > +	ASSERT(RB_EMPTY_ROOT(&device->alloc_state.state));
> 
> I went through members of the device struct, lots of them are set once
> so don't change. last_flush_error is set and read during commit,
> 
> Besides the dev_state bits handled above, I think tre rest should be
> here too, ie.  BTRFS_DEV_STATE_IN_FS_METADATA and
> BTRFS_DEV_STATE_MISSING (though this might be ok to keep as-is).

So BTRFS_DEV_STATE_MISSING should stay, the state is changed through the
scanning.

BTRFS_DEV_STATE_IN_FS_METADATA should be asserted for 'not-set', this is
normally set_bit at mount time so the last use of devices with the bit
set should set it back to zero.
David Sterba Nov. 27, 2019, 5:07 p.m. UTC | #4
On Tue, Nov 26, 2019 at 09:40:06AM +0100, Johannes Thumshirn wrote:
> +	ASSERT(atomic_read(&device->dev_stats_ccnt) == 0);

btrfs/020               [16:59:58][ 3477.975072] run fstests btrfs/020 at 2019-11-27 16:59:58
[ 3478.974314] kworker/dying (5607) used greatest stack depth: 10792 bytes left
[ 3479.206988] BTRFS: device fsid 818a4909-467d-4599-979c-3b258fb4fc41 devid 1 transid 5 /dev/loop0 scanned by mkfs.btrfs (27347)
[ 3479.234212] BTRFS: device fsid 818a4909-467d-4599-979c-3b258fb4fc41 devid 2 transid 5 /dev/loop1 scanned by mkfs.btrfs (27347)
[ 3479.343996] BTRFS info (device loop0): disk space caching is enabled
[ 3479.349590] BTRFS info (device loop0): has skinny extents
[ 3479.352721] BTRFS info (device loop0): flagging fs with big metadata feature
[ 3479.360793] BTRFS info (device loop0): enabling ssd optimizations
[ 3479.614065] assertion failed: atomic_read(&device->dev_stats_ccnt) == 0, in fs/btrfs/volumes.c:1093
[ 3479.622041] ------------[ cut here ]------------
[ 3479.625272] kernel BUG at fs/btrfs/ctree.h:3118!
[ 3479.628555] invalid opcode: 0000 [#1] SMP
[ 3479.631350] CPU: 1 PID: 27375 Comm: umount Not tainted 5.4.0-rc8-default+ #882
[ 3479.633838] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-rebuilt.opensuse.org 04/01/2014
[ 3479.637681] RIP: 0010:assfail.constprop.0+0x18/0x26 [btrfs]
[ 3479.646004] RSP: 0018:ffff9f4444a6fd98 EFLAGS: 00010246
[ 3479.647947] RAX: 0000000000000057 RBX: ffff9e0e6bb64c00 RCX: 0000000000000006
[ 3479.650343] RDX: 0000000000000000 RSI: ffff9e0e71352408 RDI: ffff9e0e71351bc0
[ 3479.652838] RBP: ffff9e0e6bb64c60 R08: 0000032a29a25ac1 R09: 0000000000000000
[ 3479.654766] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9e0e746e0200
[ 3479.656866] R13: ffff9e0e746e0200 R14: ffff9e0e746e0318 R15: ffff9e0e6bb61000
[ 3479.658852] FS:  00007fabe5d8a800(0000) GS:ffff9e0e7d800000(0000) knlGS:0000000000000000
[ 3479.661542] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3479.663642] CR2: 00007ffeb8101f68 CR3: 000000004e979002 CR4: 0000000000160ee0
[ 3479.666132] Call Trace:
[ 3479.667333]  close_fs_devices.cold+0x66/0x77 [btrfs]
[ 3479.669143]  btrfs_close_devices+0x22/0x90 [btrfs]
[ 3479.670733]  close_ctree+0x2b9/0x32c [btrfs]
[ 3479.672296]  generic_shutdown_super+0x69/0x100
[ 3479.673662]  kill_anon_super+0x14/0x30
[ 3479.675045]  btrfs_kill_super+0x12/0xa0 [btrfs]
[ 3479.676550]  deactivate_locked_super+0x2c/0x70
[ 3479.678056]  cleanup_mnt+0x100/0x160
[ 3479.679245]  task_work_run+0x90/0xc0
[ 3479.680582]  exit_to_usermode_loop+0x96/0xa0
[ 3479.681945]  do_syscall_64+0x19d/0x1f0
[ 3479.683243]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 3479.684908] RIP: 0033:0x7fabe5fce4e7
[ 3479.691687] RSP: 002b:00007ffeb81037c8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
[ 3479.694285] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 00007fabe5fce4e7
[ 3479.696603] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000056236d27cb80
[ 3479.698989] RBP: 000056236d27c970 R08: 0000000000000000 R09: 00007ffeb8102540
[ 3479.701220] R10: 000056236d27cba0 R11: 0000000000000246 R12: 000056236d27cb80
[ 3479.703416] R13: 0000000000000000 R14: 000056236d27ca68 R15: 0000000000000000
[ 3479.705272] Modules linked in: xxhash_generic btrfs libcrc32c crc32c_intel xor zstd_decompress zstd_compress xxhash lzo_compress lzo_decompress raid6_pq loop
[ 3479.709315] ---[ end trace 6a6ab278b1509a6f ]---
[ 3479.710971] RIP: 0010:assfail.constprop.0+0x18/0x26 [btrfs]
[ 3479.718120] RSP: 0018:ffff9f4444a6fd98 EFLAGS: 00010246
[ 3479.719761] RAX: 0000000000000057 RBX: ffff9e0e6bb64c00 RCX: 0000000000000006
[ 3479.721828] RDX: 0000000000000000 RSI: ffff9e0e71352408 RDI: ffff9e0e71351bc0
[ 3479.723586] RBP: ffff9e0e6bb64c60 R08: 0000032a29a25ac1 R09: 0000000000000000
[ 3479.725770] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9e0e746e0200
[ 3479.727906] R13: ffff9e0e746e0200 R14: ffff9e0e746e0318 R15: ffff9e0e6bb61000
[ 3479.739121] FS:  00007fabe5d8a800(0000) GS:ffff9e0e7d800000(0000) knlGS:0000000000000000
[ 3479.741653] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3479.743411] CR2: 00007ffeb8101f68 CR3: 000000004e979002 CR4: 0000000000160ee0
Johannes Thumshirn Nov. 28, 2019, 9:29 a.m. UTC | #5
On 27/11/2019 18:07, David Sterba wrote:
> On Tue, Nov 26, 2019 at 09:40:06AM +0100, Johannes Thumshirn wrote:
>> +	ASSERT(atomic_read(&device->dev_stats_ccnt) == 0);
> 
> btrfs/020               [16:59:58][ 3477.975072] run fstests btrfs/020 at 2019-11-27 16:59:58
> [ 3478.974314] kworker/dying (5607) used greatest stack depth: 10792 bytes left
> [ 3479.206988] BTRFS: device fsid 818a4909-467d-4599-979c-3b258fb4fc41 devid 1 transid 5 /dev/loop0 scanned by mkfs.btrfs (27347)
> [ 3479.234212] BTRFS: device fsid 818a4909-467d-4599-979c-3b258fb4fc41 devid 2 transid 5 /dev/loop1 scanned by mkfs.btrfs (27347)
> [ 3479.343996] BTRFS info (device loop0): disk space caching is enabled
> [ 3479.349590] BTRFS info (device loop0): has skinny extents
> [ 3479.352721] BTRFS info (device loop0): flagging fs with big metadata feature
> [ 3479.360793] BTRFS info (device loop0): enabling ssd optimizations
> [ 3479.614065] assertion failed: atomic_read(&device->dev_stats_ccnt) == 0, in fs/btrfs/volumes.c:1093
> [ 3479.622041] ------------[ cut here ]------------
> [ 3479.625272] kernel BUG at fs/btrfs/ctree.h:3118!

My test setup was missing loopback device support, fixed that.
diff mbox series

Patch

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2398b071bcf6..efabffa54a45 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1064,8 +1064,6 @@  static void btrfs_close_bdev(struct btrfs_device *device)
 static void btrfs_close_one_device(struct btrfs_device *device)
 {
 	struct btrfs_fs_devices *fs_devices = device->fs_devices;
-	struct btrfs_device *new_device;
-	struct rcu_string *name;
 
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
 	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -1073,29 +1071,27 @@  static void btrfs_close_one_device(struct btrfs_device *device)
 		fs_devices->rw_devices--;
 	}
 
-	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 		fs_devices->missing_devices--;
+		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+	}
 
+	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	btrfs_close_bdev(device);
-	if (device->bdev)
+	if (device->bdev) {
 		fs_devices->open_devices--;
-
-	new_device = btrfs_alloc_device(NULL, &device->devid,
-					device->uuid);
-	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
-	/* Safe because we are under uuid_mutex */
-	if (device->name) {
-		name = rcu_string_strdup(device->name->str, GFP_NOFS);
-		BUG_ON(!name); /* -ENOMEM */
-		rcu_assign_pointer(new_device->name, name);
-	}
-
-	list_replace_rcu(&device->dev_list, &new_device->dev_list);
-	new_device->fs_devices = device->fs_devices;
-
-	synchronize_rcu();
-	btrfs_free_device(device);
+		device->bdev = NULL;
+	}
+	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+
+	/* Verify the device is back in a pristine state  */
+	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
+	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+	ASSERT(list_empty(&device->dev_alloc_list));
+	ASSERT(list_empty(&device->post_commit_list));
+	ASSERT(atomic_read(&device->reada_in_flight) == 0);
+	ASSERT(atomic_read(&device->dev_stats_ccnt) == 0);
+	ASSERT(RB_EMPTY_ROOT(&device->alloc_state.state));
 }
 
 static int close_fs_devices(struct btrfs_fs_devices *fs_devices)