Message ID | 20240705-b4-rst-updates-v4-3-f3eed3f2cfad@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | btrfs: rst: updates for RAID stripe tree | expand |
在 2024/7/6 00:43, Johannes Thumshirn 写道: > From: Johannes Thumshirn <johannes.thumshirn@wdc.com> > > The current RAID stripe code assumes, that we will always remove a > whole stripe entry. > > But if we're only removing a part of a RAID stripe we're hitting the > ASSERT()ion checking for this condition. > > Instead of assuming the complete deletion of a RAID stripe, split the > stripe if we need to. Sorry to be so critical, but if I understand correctly, btrfs_insert_one_raid_extent() does not do any merge of stripe extent. Thus one stripe extent always means part of a data extent. In that case a removal of a data extent should always remove all its stripe extents. Furthermore due to the COW nature on zoned/rst devices, the space of a deleted data extent should not be re-allocated until a transaction commitment. Thus I'm wonder if this split is masking some bigger problems. Thanks, Qu > > Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> > --- > fs/btrfs/ctree.c | 1 + > fs/btrfs/raid-stripe-tree.c | 98 ++++++++++++++++++++++++++++++++++----------- > 2 files changed, 75 insertions(+), 24 deletions(-) > > diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c > index 451203055bbf..82278e54969e 100644 > --- a/fs/btrfs/ctree.c > +++ b/fs/btrfs/ctree.c > @@ -3863,6 +3863,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, > btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); > > BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && > + key.type != BTRFS_RAID_STRIPE_KEY && > key.type != BTRFS_EXTENT_CSUM_KEY); > > if (btrfs_leaf_free_space(leaf) >= ins_len) > diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c > index 84746c8886be..d2a6e409b3e8 100644 > --- a/fs/btrfs/raid-stripe-tree.c > +++ b/fs/btrfs/raid-stripe-tree.c > @@ -33,42 +33,92 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le > if (!path) > return -ENOMEM; > > - while (1) { > - key.objectid = start; > - key.type = 
BTRFS_RAID_STRIPE_KEY; > - key.offset = length; > +again: > + key.objectid = start; > + key.type = BTRFS_RAID_STRIPE_KEY; > + key.offset = length; > > - ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); > - if (ret < 0) > - break; > - if (ret > 0) { > - ret = 0; > - if (path->slots[0] == 0) > - break; > - path->slots[0]--; > - } > + ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); > + if (ret < 0) > + goto out; > + if (ret > 0) { > + ret = 0; > + if (path->slots[0] == 0) > + goto out; > + path->slots[0]--; > + } > + > + leaf = path->nodes[0]; > + slot = path->slots[0]; > + btrfs_item_key_to_cpu(leaf, &key, slot); > + found_start = key.objectid; > + found_end = found_start + key.offset; > + > + /* That stripe ends before we start, we're done. */ > + if (found_end <= start) > + goto out; > + > + trace_btrfs_raid_extent_delete(fs_info, start, end, > + found_start, found_end); > + > + if (found_start < start) { > + u64 diff = start - found_start; > + struct btrfs_key new_key; > + int num_stripes; > + struct btrfs_stripe_extent *stripe_extent; > + > + new_key.objectid = start; > + new_key.type = BTRFS_RAID_STRIPE_KEY; > + new_key.offset = length - diff; > + > + ret = btrfs_duplicate_item(trans, stripe_root, path, > + &new_key); > + if (ret) > + goto out; > > leaf = path->nodes[0]; > slot = path->slots[0]; > - btrfs_item_key_to_cpu(leaf, &key, slot); > - found_start = key.objectid; > - found_end = found_start + key.offset; > > - /* That stripe ends before we start, we're done. 
*/ > - if (found_end <= start) > - break; > + num_stripes = > + btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); > + stripe_extent = > + btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); > + > + for (int i = 0; i < num_stripes; i++) { > + struct btrfs_raid_stride *raid_stride = > + &stripe_extent->strides[i]; > + u64 physical = > + btrfs_raid_stride_physical(leaf, raid_stride); > + > + btrfs_set_raid_stride_physical(leaf, raid_stride, > + physical + diff); > + } > + > + btrfs_mark_buffer_dirty(trans, leaf); > + btrfs_release_path(path); > + goto again; > + } > > - trace_btrfs_raid_extent_delete(fs_info, start, end, > - found_start, found_end); > + if (found_end > end) { > + u64 diff = found_end - end; > + struct btrfs_key new_key; > > - ASSERT(found_start >= start && found_end <= end); > - ret = btrfs_del_item(trans, stripe_root, path); > + new_key.objectid = found_start; > + new_key.type = BTRFS_RAID_STRIPE_KEY; > + new_key.offset = length - diff; > + > + ret = btrfs_duplicate_item(trans, stripe_root, path, > + &new_key); > if (ret) > - break; > + goto out; > > + btrfs_mark_buffer_dirty(trans, leaf); > btrfs_release_path(path); > } > > + ret = btrfs_del_item(trans, stripe_root, path); > + > + out: > btrfs_free_path(path); > return ret; > } >
On 06.07.24 01:26, Qu Wenruo wrote: > > > 在 2024/7/6 00:43, Johannes Thumshirn 写道: >> From: Johannes Thumshirn <johannes.thumshirn@wdc.com> >> >> The current RAID stripe code assumes, that we will always remove a >> whole stripe entry. >> >> But if we're only removing a part of a RAID stripe we're hitting the >> ASSERT()ion checking for this condition. >> >> Instead of assuming the complete deletion of a RAID stripe, split the >> stripe if we need to. > > Sorry to be so critical, but if I understand correctly, > btrfs_insert_one_raid_extent() does not do any merge of stripe extent. No problem at all. I want to solve bugs, not increase my patch count ;). > > Thus one stripe extent always means part of a data extent. > > In that case a removal of a data extent should always remove all its > stripe extents. > > Furthermore due to the COW nature on zoned/rst devices, the space of a > deleted data extent should not be re-allocated until a transaction > commitment. > > Thus I'm wonder if this split is masking some bigger problems. Hmm now that you're saying it. The reason I wrote this path is, that I did hit the following ASSERT() in my testing: >> - ASSERT(found_start >= start && found_end <= end); This indicates a partial delete of a stripe extent. But I agree as stripe extents are tied to extent items, this shouldn't really happen. So maybe most of this series (apart from the deadlock fix) masks problems? I'm back to the drawing board :(.
在 2024/7/8 14:26, Johannes Thumshirn 写道: > On 06.07.24 01:26, Qu Wenruo wrote: >> >> >> 在 2024/7/6 00:43, Johannes Thumshirn 写道: >>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com> >>> >>> The current RAID stripe code assumes, that we will always remove a >>> whole stripe entry. >>> >>> But if we're only removing a part of a RAID stripe we're hitting the >>> ASSERT()ion checking for this condition. >>> >>> Instead of assuming the complete deletion of a RAID stripe, split the >>> stripe if we need to. >> >> Sorry to be so critical, but if I understand correctly, >> btrfs_insert_one_raid_extent() does not do any merge of stripe extent. > > No problem at all. I want to solve bugs, not increase my patch count ;). > >> >> Thus one stripe extent always means part of a data extent. >> >> In that case a removal of a data extent should always remove all its >> stripe extents. >> >> Furthermore due to the COW nature on zoned/rst devices, the space of a >> deleted data extent should not be re-allocated until a transaction >> commitment. >> >> Thus I'm wonder if this split is masking some bigger problems. > > Hmm now that you're saying it. The reason I wrote this path is, that I > did hit the following ASSERT() in my testing: > >>> - ASSERT(found_start >= start && found_end <= end); > > This indicates a partial delete of a stripe extent. But I agree as > stripe extents are tied to extent items, this shouldn't really happen. > > So maybe most of this series (apart from the deadlock fix) masks problems? > > I'm back to the drawing board :(. Can the ASSERT() be reproduced without a zoned device? (I'm really not a fan of the existing tcmu emulated solution, meanwhile libvirt still doesn't support ZNS devices) If it can be reproduced just with RST feature, I may provide some help digging into the ASSERT(). Thanks, Qu
On 08.07.24 07:20, Qu Wenruo wrote: > > Can the ASSERT() be reproduced without a zoned device? (I'm really not a > fan of the existing tcmu emulated solution, meanwhile libvirt still > doesn't support ZNS devices) > > If it can be reproduced just with RST feature, I may provide some help > digging into the ASSERT(). Let me check. It's very sporadic as well, unfortunately.
On 08.07.24 07:26, Johannes Thumshirn wrote: > On 08.07.24 07:20, Qu Wenruo wrote: >> >> Can the ASSERT() be reproduced without a zoned device? (I'm really not a >> fan of the existing tcmu emulated solution, meanwhile libvirt still >> doesn't support ZNS devices) >> >> If it can be reproduced just with RST feature, I may provide some help >> digging into the ASSERT(). > > Let me check. It's very sporadic as well unfortunately. > > OK, I've managed to trigger the failure with btrfs/070 on a SCRATCH_DEV_POOL with 5 non-zoned devices.
在 2024/7/8 20:22, Johannes Thumshirn 写道: > On 08.07.24 07:26, Johannes Thumshirn wrote: >> On 08.07.24 07:20, Qu Wenruo wrote: >>> >>> Can the ASSERT() be reproduced without a zoned device? (I'm really not a >>> fan of the existing tcmu emulated solution, meanwhile libvirt still >>> doesn't support ZNS devices) >>> >>> If it can be reproduced just with RST feature, I may provide some help >>> digging into the ASSERT(). >> >> Let me check. It's very sporadic as well unfortunately. >> >> > > OK, I've managed to trigger the failure with btrfs/070 on a > SCRATCH_DEV_POOL with 5 non-zoned devices. I'm hitting errors like this: [ 227.898320] ------------[ cut here ]------------ [ 227.898817] BTRFS: Transaction aborted (error -17) [ 227.899250] WARNING: CPU: 7 PID: 65 at fs/btrfs/raid-stripe-tree.c:116 btrfs_insert_raid_extent+0x337/0x3d0 [btrfs] [ 227.900122] Modules linked in: btrfs blake2b_generic xor zstd_compress vfat fat intel_rapl_msr intel_rapl_common crct10dif_pclmul crc32_pclmul ghash_clmulni_intel iTCO_wdt iTCO_vendor_support aesni_intel crypto_simd cryptd psmouse i2c_i801 pcspkr i2c_smbus lpc_ich intel_agp intel_gtt joydev agpgart mousedev raid6_pq libcrc32c loop drm fuse qemu_fw_cfg ext4 crc32c_generic crc16 mbcache jbd2 dm_mod virtio_rng virtio_net virtio_blk virtio_balloon net_failover virtio_console failover virtio_scsi rng_core dimlib usbhid virtio_pci virtio_pci_legacy_dev crc32c_intel virtio_pci_modern_dev serio_raw [ 227.904452] CPU: 7 PID: 65 Comm: kworker/u40:0 Not tainted 6.10.0-rc6-custom+ #167 [ 227.905220] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 2/2/2022 [ 227.905827] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs] [ 227.906558] RIP: 0010:btrfs_insert_raid_extent+0x337/0x3d0 [btrfs] [ 227.907246] Code: 89 6b 08 e8 4b 18 f7 ff 49 8b 84 24 50 02 00 00 4c 39 f0 75 be 31 db e9 7d fe ff ff 89 de 48 c7 c7 f0 8d 9d a0 e8 29 a1 79 e0 <0f> 0b e9 69 ff ff ff e8 bd 95 3e e1 49 8b 46 60 48 05 48 1a 00 00 [ 227.908277] BTRFS: 
error (device dm-3 state A) in btrfs_insert_one_raid_extent:116: errno=-17 Object already exists [ 227.909356] RSP: 0018:ffffc9000026fca0 EFLAGS: 00010282 [ 227.909361] RAX: 0000000000000000 RBX: 00000000ffffffef RCX: 0000000000000027 [ 227.911934] RDX: ffff88817bda1948 RSI: 0000000000000001 RDI: ffff88817bda1940 [ 227.912722] RBP: ffff8881029dcbe0 R08: 0000000000000000 R09: 0000000000000003 [ 227.913095] BTRFS info (device dm-3 state EA): forced readonly [ 227.913569] R10: ffffc9000026fb38 R11: ffffffff826d0508 R12: 0000000000000010 [ 227.915182] R13: ffff8881029dcbe0 R14: ffff88812a5ff790 R15: ffff8881488f2780 [ 227.916130] FS: 0000000000000000(0000) GS:ffff88817bd80000(0000) knlGS:0000000000000000 [ 227.916912] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 227.917500] CR2: 0000561364dec000 CR3: 00000001583ca000 CR4: 0000000000750ef0 [ 227.918210] PKRU: 55555554 [ 227.918484] Call Trace: [ 227.918727] <TASK> [ 227.918940] ? __warn+0x8c/0x180 [ 227.919299] ? btrfs_insert_raid_extent+0x337/0x3d0 [btrfs] [ 227.919891] ? report_bug+0x164/0x190 [ 227.920272] ? prb_read_valid+0x1b/0x30 [ 227.920666] ? handle_bug+0x3c/0x80 [ 227.921013] ? exc_invalid_op+0x17/0x70 [ 227.921397] ? asm_exc_invalid_op+0x1a/0x20 [ 227.921835] ? btrfs_insert_raid_extent+0x337/0x3d0 [btrfs] [ 227.922440] btrfs_finish_one_ordered+0x3c3/0xaa0 [btrfs] [ 227.923055] ? srso_alias_return_thunk+0x5/0xfbef5 [ 227.923549] ? srso_alias_return_thunk+0x5/0xfbef5 [ 227.924062] btrfs_work_helper+0x107/0x4c0 [btrfs] [ 227.924612] ? lock_is_held_type+0x9a/0x110 [ 227.925040] process_one_work+0x212/0x720 [ 227.925454] ? srso_alias_return_thunk+0x5/0xfbef5 [ 227.926010] worker_thread+0x1dc/0x3b0 [ 227.926411] ? __pfx_worker_thread+0x10/0x10 [ 227.926918] kthread+0xe0/0x110 [ 227.927377] ? __pfx_kthread+0x10/0x10 [ 227.927776] ret_from_fork+0x31/0x50 [ 227.928151] ? 
__pfx_kthread+0x10/0x10 [ 227.928564] ret_from_fork_asm+0x1a/0x30 [ 227.929035] </TASK> [ 227.929305] irq event stamp: 11077 [ 227.929710] hardirqs last enabled at (11085): [<ffffffff8115daf5>] console_unlock+0x135/0x160 [ 227.930725] hardirqs last disabled at (11094): [<ffffffff8115dada>] console_unlock+0x11a/0x160 [ 227.931730] softirqs last enabled at (10728): [<ffffffff810b4684>] __irq_exit_rcu+0x84/0xa0 [ 227.932568] softirqs last disabled at (10723): [<ffffffff810b4684>] __irq_exit_rcu+0x84/0xa0 [ 227.933494] ---[ end trace 0000000000000000 ]--- [ 227.933992] BTRFS: error (device dm-3 state EA) in btrfs_insert_one_raid_extent:116: errno=-17 Object already exists [ 227.935193] BTRFS warning (device dm-3 state EA): Skipping commit of aborted transaction. [ 227.936383] BTRFS: error (device dm-3 state EA) in cleanup_transaction:2018: errno=-17 Object already exists But not that ASSERT() yet. I guess I need the first patch to pass this first? Thanks, Qu
On 09.07.24 01:02, Qu Wenruo wrote: > But not that ASSERT() yet. > > I guess I need the first patch to pass this first? Yes, otherwise the EEXIST path will be triggered more frequently.
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 451203055bbf..82278e54969e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3863,6 +3863,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); if (btrfs_leaf_free_space(leaf) >= ins_len) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 84746c8886be..d2a6e409b3e8 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -33,42 +33,92 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le if (!path) return -ENOMEM; - while (1) { - key.objectid = start; - key.type = BTRFS_RAID_STRIPE_KEY; - key.offset = length; +again: + key.objectid = start; + key.type = BTRFS_RAID_STRIPE_KEY; + key.offset = length; - ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); - if (ret < 0) - break; - if (ret > 0) { - ret = 0; - if (path->slots[0] == 0) - break; - path->slots[0]--; - } + ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + + /* That stripe ends before we start, we're done. 
*/ + if (found_end <= start) + goto out; + + trace_btrfs_raid_extent_delete(fs_info, start, end, + found_start, found_end); + + if (found_start < start) { + u64 diff = start - found_start; + struct btrfs_key new_key; + int num_stripes; + struct btrfs_stripe_extent *stripe_extent; + + new_key.objectid = start; + new_key.type = BTRFS_RAID_STRIPE_KEY; + new_key.offset = length - diff; + + ret = btrfs_duplicate_item(trans, stripe_root, path, + &new_key); + if (ret) + goto out; leaf = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); - found_start = key.objectid; - found_end = found_start + key.offset; - /* That stripe ends before we start, we're done. */ - if (found_end <= start) - break; + num_stripes = + btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); + stripe_extent = + btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + for (int i = 0; i < num_stripes; i++) { + struct btrfs_raid_stride *raid_stride = + &stripe_extent->strides[i]; + u64 physical = + btrfs_raid_stride_physical(leaf, raid_stride); + + btrfs_set_raid_stride_physical(leaf, raid_stride, + physical + diff); + } + + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_release_path(path); + goto again; + } - trace_btrfs_raid_extent_delete(fs_info, start, end, - found_start, found_end); + if (found_end > end) { + u64 diff = found_end - end; + struct btrfs_key new_key; - ASSERT(found_start >= start && found_end <= end); - ret = btrfs_del_item(trans, stripe_root, path); + new_key.objectid = found_start; + new_key.type = BTRFS_RAID_STRIPE_KEY; + new_key.offset = length - diff; + + ret = btrfs_duplicate_item(trans, stripe_root, path, + &new_key); if (ret) - break; + goto out; + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); } + ret = btrfs_del_item(trans, stripe_root, path); + + out: btrfs_free_path(path); return ret; }