Message ID | 1f3982ec56c49f642ef35698c39e16f56f4bc4f5.1744840038.git.josef@toxicpanda.com (mailing list archive) |
---|---|
State | New |
Series | btrfs: simplify extent buffer writeback |
On Wed, 16 Apr 2025 at 23:50, Josef Bacik <josef@toxicpanda.com> wrote: > > In order to fully utilize xarray tagging to improve writeback we need to > convert the buffer_radix to a proper xarray. This conversion is > relatively straightforward as the radix code uses the xarray underneath. > Using xarray directly allows for quite a lot less code. > > Signed-off-by: Josef Bacik <josef@toxicpanda.com> > --- > fs/btrfs/disk-io.c | 15 ++- > fs/btrfs/extent_io.c | 196 +++++++++++++++-------------------- > fs/btrfs/fs.h | 4 +- > fs/btrfs/tests/btrfs-tests.c | 27 ++--- > fs/btrfs/zoned.c | 16 +-- > 5 files changed, 113 insertions(+), 145 deletions(-) > > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index 59da809b7d57..5593873f5c0f 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -2762,10 +2762,22 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) > return ret; > } > > +/* > + * lockdep gets confused between our buffer_xarray which requires IRQ locking > + * because we modify marks in the IRQ context, and our delayed inode xarray > + * which doesn't have these requirements. Use a class key so lockdep doesn't get > + * them mixed up. > + */ > +static struct lock_class_key buffer_xa_class; > + > void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) > { > INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); > - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); > + > + /* Use the same flags as mapping->i_pages. */ > + xa_init_flags(&fs_info->buffer_xarray, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); > + lockdep_set_class(&fs_info->buffer_xarray.xa_lock, &buffer_xa_class); > + > INIT_LIST_HEAD(&fs_info->trans_list); > INIT_LIST_HEAD(&fs_info->dead_roots); > INIT_LIST_HEAD(&fs_info->delayed_iputs); > @@ -2777,7 +2789,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) > spin_lock_init(&fs_info->delayed_iput_lock); > spin_lock_init(&fs_info->defrag_inodes_lock); > spin_lock_init(&fs_info->super_lock); > - spin_lock_init(&fs_info->buffer_lock); > spin_lock_init(&fs_info->unused_bgs_lock); > spin_lock_init(&fs_info->treelog_bg_lock); > spin_lock_init(&fs_info->zone_active_bgs_lock); > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > index 6cfd286b8bbc..309c86d1a696 100644 > --- a/fs/btrfs/extent_io.c > +++ b/fs/btrfs/extent_io.c > @@ -1893,19 +1893,20 @@ static void set_btree_ioerr(struct extent_buffer *eb) > * context. > */ > static struct extent_buffer *find_extent_buffer_nolock( > - const struct btrfs_fs_info *fs_info, u64 start) > + struct btrfs_fs_info *fs_info, u64 start) > { > + XA_STATE(xas, &fs_info->buffer_xarray, start >> fs_info->sectorsize_bits); > struct extent_buffer *eb; > > rcu_read_lock(); > - eb = radix_tree_lookup(&fs_info->buffer_radix, > - start >> fs_info->sectorsize_bits); > - if (eb && atomic_inc_not_zero(&eb->refs)) { > - rcu_read_unlock(); > - return eb; > - } > + do { > + eb = xas_load(&xas); > + } while (xas_retry(&xas, eb)); > + > + if (eb && !atomic_inc_not_zero(&eb->refs)) > + eb = NULL; > rcu_read_unlock(); > - return NULL; > + return eb; > } > > static void end_bbio_meta_write(struct btrfs_bio *bbio) > @@ -2769,11 +2770,10 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo > > if (!btrfs_meta_is_subpage(fs_info)) { > /* > - * We do this since we'll remove the pages after we've > - * removed the eb from the radix tree, so we could race > - * and have this page now attached to the new eb. So > - * only clear folio if it's still connected to > - * this eb. 
> + * We do this since we'll remove the pages after we've removed > + * the eb from the xarray, so we could race and have this page > + * now attached to the new eb. So only clear folio if it's > + * still connected to this eb. > */ > if (folio_test_private(folio) && folio_get_private(folio) == eb) { > BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); > @@ -2938,9 +2938,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) > { > int refs; > /* > - * The TREE_REF bit is first set when the extent_buffer is added > - * to the radix tree. It is also reset, if unset, when a new reference > - * is created by find_extent_buffer. > + * The TREE_REF bit is first set when the extent_buffer is added to the > + * xarray. It is also reset, if unset, when a new reference is created > + * by find_extent_buffer. > * > * It is only cleared in two cases: freeing the last non-tree > * reference to the extent_buffer when its STALE bit is set or > @@ -2952,13 +2952,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) > * conditions between the calls to check_buffer_tree_ref in those > * codepaths and clearing TREE_REF in try_release_extent_buffer. > * > - * The actual lifetime of the extent_buffer in the radix tree is > - * adequately protected by the refcount, but the TREE_REF bit and > - * its corresponding reference are not. To protect against this > - * class of races, we call check_buffer_tree_ref from the codepaths > - * which trigger io. Note that once io is initiated, TREE_REF can no > - * longer be cleared, so that is the moment at which any such race is > - * best fixed. > + * The actual lifetime of the extent_buffer in the xarray is adequately > + * protected by the refcount, but the TREE_REF bit and its corresponding > + * reference are not. To protect against this class of races, we call > + * check_buffer_tree_ref from the codepaths which trigger io. Note that > + * once io is initiated, TREE_REF can no longer be cleared, so that is > + * the moment at which any such race is best fixed. > */ > refs = atomic_read(&eb->refs); > if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) > @@ -3022,23 +3021,26 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, > return ERR_PTR(-ENOMEM); > eb->fs_info = fs_info; > again: > - ret = radix_tree_preload(GFP_NOFS); > - if (ret) { > - exists = ERR_PTR(ret); > + xa_lock_irq(&fs_info->buffer_xarray); > + exists = __xa_cmpxchg(&fs_info->buffer_xarray, > + start >> fs_info->sectorsize_bits, NULL, eb, > + GFP_NOFS); I think the xa_unlock_irq() can go straight here. Or rather you could use xa_cmpxchg_irq() instead? Or do you protect exists->refs with it as well? It seems so, IIUC. Not sure it is strictly needed, though. > + if (xa_is_err(exists)) { > + ret = xa_err(exists); > + xa_unlock_irq(&fs_info->buffer_xarray); > + btrfs_release_extent_buffer(eb); > + return ERR_PTR(ret); > + } > + if (exists) { > + if (!atomic_inc_not_zero(&exists->refs)) { > + /* The extent buffer is being freed, retry. 
*/ > + xa_unlock_irq(&fs_info->buffer_xarray); > + goto again; > + } > + xa_unlock_irq(&fs_info->buffer_xarray); > goto free_eb; > } > - spin_lock(&fs_info->buffer_lock); > - ret = radix_tree_insert(&fs_info->buffer_radix, > - start >> fs_info->sectorsize_bits, eb); > - spin_unlock(&fs_info->buffer_lock); > - radix_tree_preload_end(); > - if (ret == -EEXIST) { > - exists = find_extent_buffer(fs_info, start); > - if (exists) > - goto free_eb; > - else > - goto again; > - } > + xa_unlock_irq(&fs_info->buffer_xarray); > check_buffer_tree_ref(eb); > > return eb; > @@ -3059,9 +3061,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, > lockdep_assert_held(&folio->mapping->i_private_lock); > > /* > - * For subpage case, we completely rely on radix tree to ensure we > - * don't try to insert two ebs for the same bytenr. So here we always > - * return NULL and just continue. > + * For subpage case, we completely rely on xarray to ensure we don't try > + * to insert two ebs for the same bytenr. So here we always return NULL > + * and just continue. > */ > if (btrfs_meta_is_subpage(fs_info)) > return NULL; > @@ -3194,7 +3196,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, > /* > * To inform we have an extra eb under allocation, so that > * detach_extent_buffer_page() won't release the folio private when the > - * eb hasn't been inserted into radix tree yet. > + * eb hasn't been inserted into the xarray yet. > * > * The ref will be decreased when the eb releases the page, in > * detach_extent_buffer_page(). Thus needs no special handling in the > @@ -3328,10 +3330,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, > > /* > * We can't unlock the pages just yet since the extent buffer > - * hasn't been properly inserted in the radix tree, this > - * opens a race with btree_release_folio which can free a page > - * while we are still filling in all pages for the buffer and > - * we could crash. > + * hasn't been properly inserted in the xarray, this opens a > + * race with btree_release_folio which can free a page while we > + * are still filling in all pages for the buffer and we could > + * crash. > */ > } > if (uptodate) > @@ -3340,23 +3342,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, > if (page_contig) > eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); > again: > - ret = radix_tree_preload(GFP_NOFS); > - if (ret) > + xa_lock_irq(&fs_info->buffer_xarray); > + existing_eb = __xa_cmpxchg(&fs_info->buffer_xarray, > + start >> fs_info->sectorsize_bits, NULL, eb, > + GFP_NOFS); Ditto. 
> + if (xa_is_err(existing_eb)) { > + ret = xa_err(existing_eb); > + xa_unlock_irq(&fs_info->buffer_xarray); > goto out; > - > - spin_lock(&fs_info->buffer_lock); > - ret = radix_tree_insert(&fs_info->buffer_radix, > - start >> fs_info->sectorsize_bits, eb); > - spin_unlock(&fs_info->buffer_lock); > - radix_tree_preload_end(); > - if (ret == -EEXIST) { > - ret = 0; > - existing_eb = find_extent_buffer(fs_info, start); > - if (existing_eb) > - goto out; > - else > - goto again; > } > + if (existing_eb) { > + if (!atomic_inc_not_zero(&existing_eb->refs)) { > + xa_unlock_irq(&fs_info->buffer_xarray); > + goto again; > + } > + xa_unlock_irq(&fs_info->buffer_xarray); > + goto out; > + } > + xa_unlock_irq(&fs_info->buffer_xarray); > + > /* add one reference for the tree */ > check_buffer_tree_ref(eb); > > @@ -3426,10 +3430,13 @@ static int release_extent_buffer(struct extent_buffer *eb) > > spin_unlock(&eb->refs_lock); > > - spin_lock(&fs_info->buffer_lock); > - radix_tree_delete_item(&fs_info->buffer_radix, > - eb->start >> fs_info->sectorsize_bits, eb); > - spin_unlock(&fs_info->buffer_lock); > + /* > + * We're erasing, theoretically there will be no allocations, so > + * just use GFP_ATOMIC. > + */ > + xa_cmpxchg_irq(&fs_info->buffer_xarray, > + eb->start >> fs_info->sectorsize_bits, eb, NULL, > + GFP_ATOMIC); > > btrfs_leak_debug_del_eb(eb); > /* Should be safe to release folios at this point. */ > @@ -4260,44 +4267,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst, > } > } > > -#define GANG_LOOKUP_SIZE 16 > -static struct extent_buffer *get_next_extent_buffer( > - const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) > -{ > - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; > - struct extent_buffer *found = NULL; > - u64 folio_start = folio_pos(folio); > - u64 cur = folio_start; > - > - ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); > - lockdep_assert_held(&fs_info->buffer_lock); > - > - while (cur < folio_start + PAGE_SIZE) { > - int ret; > - int i; > - > - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, > - (void **)gang, cur >> fs_info->sectorsize_bits, > - min_t(unsigned int, GANG_LOOKUP_SIZE, > - PAGE_SIZE / fs_info->nodesize)); > - if (ret == 0) > - goto out; > - for (i = 0; i < ret; i++) { > - /* Already beyond page end */ > - if (gang[i]->start >= folio_start + PAGE_SIZE) > - goto out; > - /* Found one */ > - if (gang[i]->start >= bytenr) { > - found = gang[i]; > - goto out; > - } > - } > - cur = gang[ret - 1]->start + gang[ret - 1]->len; > - } > -out: > - return found; > -} > - > static int try_release_subpage_extent_buffer(struct folio *folio) > { > struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); > @@ -4306,21 +4275,26 @@ static int try_release_subpage_extent_buffer(struct folio *folio) > int ret; > > while (cur < end) { > + XA_STATE(xas, &fs_info->buffer_xarray, > + cur >> fs_info->sectorsize_bits); > struct extent_buffer *eb = NULL; > > /* > * Unlike try_release_extent_buffer() which uses folio private > - * to grab buffer, for subpage case we rely on radix tree, thus > - * we need to ensure radix tree consistency. > + * to grab buffer, for subpage case we rely on xarray, thus we > + * need to ensure xarray tree consistency. > * > - * We also want an atomic snapshot of the radix tree, thus go > + * We also want an atomic snapshot of the xarray tree, thus go > * with spinlock rather than RCU. 
> */ > - spin_lock(&fs_info->buffer_lock); > - eb = get_next_extent_buffer(fs_info, folio, cur); > + xa_lock_irq(&fs_info->buffer_xarray); > + do { > + eb = xas_find(&xas, end >> fs_info->sectorsize_bits); > + } while (xas_retry(&xas, eb)); > + > if (!eb) { > /* No more eb in the page range after or at cur */ > - spin_unlock(&fs_info->buffer_lock); > + xa_unlock(&fs_info->buffer_xarray); I may be missing something but is this a typo? Should this be xa_unlock_irq()? > break; > } > cur = eb->start + eb->len; > @@ -4332,10 +4306,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio) > spin_lock(&eb->refs_lock); > if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { > spin_unlock(&eb->refs_lock); > - spin_unlock(&fs_info->buffer_lock); > + xa_unlock(&fs_info->buffer_xarray); xa_unlock_irq() again? I don't even see any reason for XA_FLAGS_LOCK_IRQ in the first place to be honest. (Ah, OK. It's because of clearing the writeback tag in the end_bbio_meta_write() in the second patch. On my machine it runs from a threaded interrupt but that's not universal I guess.) > break; > } > - spin_unlock(&fs_info->buffer_lock); > + xa_unlock_irq(&fs_info->buffer_xarray); > > /* > * If tree ref isn't set then we know the ref on this eb is a > diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h > index bcca43046064..d5d94977860c 100644 > --- a/fs/btrfs/fs.h > +++ b/fs/btrfs/fs.h > @@ -776,10 +776,8 @@ struct btrfs_fs_info { > > struct btrfs_delayed_root *delayed_root; > > - /* Extent buffer radix tree */ > - spinlock_t buffer_lock; > /* Entries are eb->start / sectorsize */ > - struct radix_tree_root buffer_radix; > + struct xarray buffer_xarray; I'm not exactly a fan of encoding the type in the name. I'd simply call this 'buffers' or 'extent_buffers'. 
> /* Next backup root to be overwritten */ > int backup_root_index; > diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c > index 02a915eb51fb..741117ce7d3f 100644 > --- a/fs/btrfs/tests/btrfs-tests.c > +++ b/fs/btrfs/tests/btrfs-tests.c > @@ -157,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) > > void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) > { > - struct radix_tree_iter iter; > - void **slot; > + XA_STATE(xas, &fs_info->buffer_xarray, 0); > struct btrfs_device *dev, *tmp; > + struct extent_buffer *eb; > > if (!fs_info) > return; > @@ -169,25 +169,16 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) > > test_mnt->mnt_sb->s_fs_info = NULL; > > - spin_lock(&fs_info->buffer_lock); > - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { > - struct extent_buffer *eb; > - > - eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); > - if (!eb) > + xa_lock_irq(&fs_info->buffer_xarray); > + xas_for_each(&xas, eb, ULONG_MAX) { > + if (xas_retry(&xas, eb)) > continue; > - /* Shouldn't happen but that kind of thinking creates CVE's */ > - if (radix_tree_exception(eb)) { > - if (radix_tree_deref_retry(eb)) > - slot = radix_tree_iter_retry(&iter); > - continue; > - } > - slot = radix_tree_iter_resume(slot, &iter); > - spin_unlock(&fs_info->buffer_lock); > + xas_pause(&xas); > + xa_unlock_irq(&fs_info->buffer_xarray); > free_extent_buffer_stale(eb); > - spin_lock(&fs_info->buffer_lock); > + xa_lock_irq(&fs_info->buffer_xarray); > } > - spin_unlock(&fs_info->buffer_lock); > + xa_unlock_irq(&fs_info->buffer_xarray); > > btrfs_mapping_tree_free(fs_info); > list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, > diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c > index 7b30700ec930..7ed19ca399f3 100644 > --- a/fs/btrfs/zoned.c > +++ b/fs/btrfs/zoned.c > @@ -2170,28 +2170,22 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) > static void wait_eb_writebacks(struct btrfs_block_group *block_group) > { > struct btrfs_fs_info *fs_info = block_group->fs_info; > + XA_STATE(xas, &fs_info->buffer_xarray, > + block_group->start >> fs_info->sectorsize_bits); > const u64 end = block_group->start + block_group->length; > - struct radix_tree_iter iter; > struct extent_buffer *eb; > - void __rcu **slot; > > rcu_read_lock(); > - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, > - block_group->start >> fs_info->sectorsize_bits) { > - eb = radix_tree_deref_slot(slot); > - if (!eb) > + xas_for_each(&xas, eb, end >> fs_info->sectorsize_bits) { > + if (xas_retry(&xas, eb)) > continue; > - if (radix_tree_deref_retry(eb)) { > - slot = radix_tree_iter_retry(&iter); > - continue; > - } > > if (eb->start < block_group->start) > continue; > if (eb->start >= end) > break; > > - slot = radix_tree_iter_resume(slot, &iter); > + xas_pause(&xas); > rcu_read_unlock(); > wait_on_extent_buffer_writeback(eb); > rcu_read_lock(); > -- > 2.48.1 > >
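For readers following the conversion: the lookup side that the patch rewrites (find_extent_buffer_nolock) is the standard RCU xarray pattern — load under rcu_read_lock(), skip internal retry entries, and only take a reference while the refcount is still non-zero. A minimal sketch of that pattern, with a hypothetical `obj` type and `objects` xarray standing in for the btrfs structures (not the actual btrfs code):

```c
#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>

struct obj {
	atomic_t refs;
	/* payload ... */
};

/*
 * Lockless lookup: RCU keeps the entry's memory valid while we look,
 * and atomic_inc_not_zero() refuses objects that are already being freed.
 */
static struct obj *obj_lookup(struct xarray *objects, unsigned long index)
{
	XA_STATE(xas, objects, index);
	struct obj *obj;

	rcu_read_lock();
	do {
		obj = xas_load(&xas);
	} while (xas_retry(&xas, obj));

	if (obj && !atomic_inc_not_zero(&obj->refs))
		obj = NULL;
	rcu_read_unlock();
	return obj;
}
```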
On Thu, Apr 17, 2025 at 09:32:28PM +0200, Daniel Vacek wrote: > On Wed, 16 Apr 2025 at 23:50, Josef Bacik <josef@toxicpanda.com> wrote: > > > > In order to fully utilize xarray tagging to improve writeback we need to > > convert the buffer_radix to a proper xarray. This conversion is > > relatively straightforward as the radix code uses the xarray underneath. > > Using xarray directly allows for quite a lot less code. > > > > Signed-off-by: Josef Bacik <josef@toxicpanda.com> > > --- > > fs/btrfs/disk-io.c | 15 ++- > > fs/btrfs/extent_io.c | 196 +++++++++++++++-------------------- > > fs/btrfs/fs.h | 4 +- > > fs/btrfs/tests/btrfs-tests.c | 27 ++--- > > fs/btrfs/zoned.c | 16 +-- > > 5 files changed, 113 insertions(+), 145 deletions(-) > > > > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > > index 59da809b7d57..5593873f5c0f 100644 > > --- a/fs/btrfs/disk-io.c > > +++ b/fs/btrfs/disk-io.c > > @@ -2762,10 +2762,22 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) > > return ret; > > } > > > > +/* > > + * lockdep gets confused between our buffer_xarray which requires IRQ locking > > + * because we modify marks in the IRQ context, and our delayed inode xarray > > + * which doesn't have these requirements. Use a class key so lockdep doesn't get > > + * them mixed up. > > + */ > > +static struct lock_class_key buffer_xa_class; > > + > > void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) > > { > > INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); > > - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); > > + > > + /* Use the same flags as mapping->i_pages. */ > > + xa_init_flags(&fs_info->buffer_xarray, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); > > + lockdep_set_class(&fs_info->buffer_xarray.xa_lock, &buffer_xa_class); > > + > > INIT_LIST_HEAD(&fs_info->trans_list); > > INIT_LIST_HEAD(&fs_info->dead_roots); > > INIT_LIST_HEAD(&fs_info->delayed_iputs); > > @@ -2777,7 +2789,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) > > spin_lock_init(&fs_info->delayed_iput_lock); > > spin_lock_init(&fs_info->defrag_inodes_lock); > > spin_lock_init(&fs_info->super_lock); > > - spin_lock_init(&fs_info->buffer_lock); > > spin_lock_init(&fs_info->unused_bgs_lock); > > spin_lock_init(&fs_info->treelog_bg_lock); > > spin_lock_init(&fs_info->zone_active_bgs_lock); > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > > index 6cfd286b8bbc..309c86d1a696 100644 > > --- a/fs/btrfs/extent_io.c > > +++ b/fs/btrfs/extent_io.c > > @@ -1893,19 +1893,20 @@ static void set_btree_ioerr(struct extent_buffer *eb) > > * context. 
> > */ > > static struct extent_buffer *find_extent_buffer_nolock( > > - const struct btrfs_fs_info *fs_info, u64 start) > > + struct btrfs_fs_info *fs_info, u64 start) > > { > > + XA_STATE(xas, &fs_info->buffer_xarray, start >> fs_info->sectorsize_bits); > > struct extent_buffer *eb; > > > > rcu_read_lock(); > > - eb = radix_tree_lookup(&fs_info->buffer_radix, > > - start >> fs_info->sectorsize_bits); > > - if (eb && atomic_inc_not_zero(&eb->refs)) { > > - rcu_read_unlock(); > > - return eb; > > - } > > + do { > > + eb = xas_load(&xas); > > + } while (xas_retry(&xas, eb)); > > + > > + if (eb && !atomic_inc_not_zero(&eb->refs)) > > + eb = NULL; > > rcu_read_unlock(); > > - return NULL; > > + return eb; > > } > > > > static void end_bbio_meta_write(struct btrfs_bio *bbio) > > @@ -2769,11 +2770,10 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo > > > > if (!btrfs_meta_is_subpage(fs_info)) { > > /* > > - * We do this since we'll remove the pages after we've > > - * removed the eb from the radix tree, so we could race > > - * and have this page now attached to the new eb. So > > - * only clear folio if it's still connected to > > - * this eb. > > + * We do this since we'll remove the pages after we've removed > > + * the eb from the xarray, so we could race and have this page > > + * now attached to the new eb. So only clear folio if it's > > + * still connected to this eb. > > */ > > if (folio_test_private(folio) && folio_get_private(folio) == eb) { > > BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); > > @@ -2938,9 +2938,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) > > { > > int refs; > > /* > > - * The TREE_REF bit is first set when the extent_buffer is added > > - * to the radix tree. It is also reset, if unset, when a new reference > > - * is created by find_extent_buffer. > > + * The TREE_REF bit is first set when the extent_buffer is added to the > > + * xarray. It is also reset, if unset, when a new reference is created > > + * by find_extent_buffer. > > * > > * It is only cleared in two cases: freeing the last non-tree > > * reference to the extent_buffer when its STALE bit is set or > > @@ -2952,13 +2952,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) > > * conditions between the calls to check_buffer_tree_ref in those > > * codepaths and clearing TREE_REF in try_release_extent_buffer. > > * > > - * The actual lifetime of the extent_buffer in the radix tree is > > - * adequately protected by the refcount, but the TREE_REF bit and > > - * its corresponding reference are not. To protect against this > > - * class of races, we call check_buffer_tree_ref from the codepaths > > - * which trigger io. Note that once io is initiated, TREE_REF can no > > - * longer be cleared, so that is the moment at which any such race is > > - * best fixed. > > + * The actual lifetime of the extent_buffer in the xarray is adequately > > + * protected by the refcount, but the TREE_REF bit and its corresponding > > + * reference are not. To protect against this class of races, we call > > + * check_buffer_tree_ref from the codepaths which trigger io. Note that > > + * once io is initiated, TREE_REF can no longer be cleared, so that is > > + * the moment at which any such race is best fixed. 
> > */ > > refs = atomic_read(&eb->refs); > > if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) > > @@ -3022,23 +3021,26 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, > > return ERR_PTR(-ENOMEM); > > eb->fs_info = fs_info; > > again: > > - ret = radix_tree_preload(GFP_NOFS); > > - if (ret) { > > - exists = ERR_PTR(ret); > > + xa_lock_irq(&fs_info->buffer_xarray); > > + exists = __xa_cmpxchg(&fs_info->buffer_xarray, > > + start >> fs_info->sectorsize_bits, NULL, eb, > > + GFP_NOFS); > > I think the xa_unlock_irq() can go straight here. Or rather you could > use xa_cmpxchg_irq() instead? > Or do you protect exists->refs with it as well? It seems so, IIUC. Not > sure it is strictly needed, though. > We can't trust the value of exists after we've unlocked the xarray. > > + if (xa_is_err(exists)) { > > + ret = xa_err(exists); > > + xa_unlock_irq(&fs_info->buffer_xarray); > > + btrfs_release_extent_buffer(eb); > > + return ERR_PTR(ret); > > + } > > + if (exists) { > > + if (!atomic_inc_not_zero(&exists->refs)) { > > + /* The extent buffer is being freed, retry. */ > > + xa_unlock_irq(&fs_info->buffer_xarray); > > + goto again; > > + } > > + xa_unlock_irq(&fs_info->buffer_xarray); > > goto free_eb; > > } > > - spin_lock(&fs_info->buffer_lock); > > - ret = radix_tree_insert(&fs_info->buffer_radix, > > - start >> fs_info->sectorsize_bits, eb); > > - spin_unlock(&fs_info->buffer_lock); > > - radix_tree_preload_end(); > > - if (ret == -EEXIST) { > > - exists = find_extent_buffer(fs_info, start); > > - if (exists) > > - goto free_eb; > > - else > > - goto again; > > - } > > + xa_unlock_irq(&fs_info->buffer_xarray); > > check_buffer_tree_ref(eb); > > > > return eb; > > @@ -3059,9 +3061,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, > > lockdep_assert_held(&folio->mapping->i_private_lock); > > > > /* > > - * For subpage case, we completely rely on radix tree to ensure we > > - * don't try to insert two ebs for the same bytenr. So here we always > > - * return NULL and just continue. > > + * For subpage case, we completely rely on xarray to ensure we don't try > > + * to insert two ebs for the same bytenr. So here we always return NULL > > + * and just continue. > > */ > > if (btrfs_meta_is_subpage(fs_info)) > > return NULL; > > @@ -3194,7 +3196,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, > > /* > > * To inform we have an extra eb under allocation, so that > > * detach_extent_buffer_page() won't release the folio private when the > > - * eb hasn't been inserted into radix tree yet. > > + * eb hasn't been inserted into the xarray yet. > > * > > * The ref will be decreased when the eb releases the page, in > > * detach_extent_buffer_page(). Thus needs no special handling in the > > @@ -3328,10 +3330,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, > > > > /* > > * We can't unlock the pages just yet since the extent buffer > > - * hasn't been properly inserted in the radix tree, this > > - * opens a race with btree_release_folio which can free a page > > - * while we are still filling in all pages for the buffer and > > - * we could crash. > > + * hasn't been properly inserted in the xarray, this opens a > > + * race with btree_release_folio which can free a page while we > > + * are still filling in all pages for the buffer and we could > > + * crash. 
> > */ > > } > > if (uptodate) > > @@ -3340,23 +3342,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, > > if (page_contig) > > eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); > > again: > > - ret = radix_tree_preload(GFP_NOFS); > > - if (ret) > > + xa_lock_irq(&fs_info->buffer_xarray); > > + existing_eb = __xa_cmpxchg(&fs_info->buffer_xarray, > > + start >> fs_info->sectorsize_bits, NULL, eb, > > + GFP_NOFS); > > Ditto. Same comment as above. > > > + if (xa_is_err(existing_eb)) { > > + ret = xa_err(existing_eb); > > + xa_unlock_irq(&fs_info->buffer_xarray); > > goto out; > > - > > - spin_lock(&fs_info->buffer_lock); > > - ret = radix_tree_insert(&fs_info->buffer_radix, > > - start >> fs_info->sectorsize_bits, eb); > > - spin_unlock(&fs_info->buffer_lock); > > - radix_tree_preload_end(); > > - if (ret == -EEXIST) { > > - ret = 0; > > - existing_eb = find_extent_buffer(fs_info, start); > > - if (existing_eb) > > - goto out; > > - else > > - goto again; > > } > > + if (existing_eb) { > > + if (!atomic_inc_not_zero(&existing_eb->refs)) { > > + xa_unlock_irq(&fs_info->buffer_xarray); > > + goto again; > > + } > > + xa_unlock_irq(&fs_info->buffer_xarray); > > + goto out; > > + } > > + xa_unlock_irq(&fs_info->buffer_xarray); > > + > > /* add one reference for the tree */ > > check_buffer_tree_ref(eb); > > > > @@ -3426,10 +3430,13 @@ static int release_extent_buffer(struct extent_buffer *eb) > > > > spin_unlock(&eb->refs_lock); > > > > - spin_lock(&fs_info->buffer_lock); > > - radix_tree_delete_item(&fs_info->buffer_radix, > > - eb->start >> fs_info->sectorsize_bits, eb); > > - spin_unlock(&fs_info->buffer_lock); > > + /* > > + * We're erasing, theoretically there will be no allocations, so > > + * just use GFP_ATOMIC. > > + */ > > + xa_cmpxchg_irq(&fs_info->buffer_xarray, > > + eb->start >> fs_info->sectorsize_bits, eb, NULL, > > + GFP_ATOMIC); > > > > btrfs_leak_debug_del_eb(eb); > > /* Should be safe to release folios at this point. 
*/ > > @@ -4260,44 +4267,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst, > > } > > } > > > > -#define GANG_LOOKUP_SIZE 16 > > -static struct extent_buffer *get_next_extent_buffer( > > - const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) > > -{ > > - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; > > - struct extent_buffer *found = NULL; > > - u64 folio_start = folio_pos(folio); > > - u64 cur = folio_start; > > - > > - ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); > > - lockdep_assert_held(&fs_info->buffer_lock); > > - > > - while (cur < folio_start + PAGE_SIZE) { > > - int ret; > > - int i; > > - > > - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, > > - (void **)gang, cur >> fs_info->sectorsize_bits, > > - min_t(unsigned int, GANG_LOOKUP_SIZE, > > - PAGE_SIZE / fs_info->nodesize)); > > - if (ret == 0) > > - goto out; > > - for (i = 0; i < ret; i++) { > > - /* Already beyond page end */ > > - if (gang[i]->start >= folio_start + PAGE_SIZE) > > - goto out; > > - /* Found one */ > > - if (gang[i]->start >= bytenr) { > > - found = gang[i]; > > - goto out; > > - } > > - } > > - cur = gang[ret - 1]->start + gang[ret - 1]->len; > > - } > > -out: > > - return found; > > -} > > - > > static int try_release_subpage_extent_buffer(struct folio *folio) > > { > > struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); > > @@ -4306,21 +4275,26 @@ static int try_release_subpage_extent_buffer(struct folio *folio) > > int ret; > > > > while (cur < end) { > > + XA_STATE(xas, &fs_info->buffer_xarray, > > + cur >> fs_info->sectorsize_bits); > > struct extent_buffer *eb = NULL; > > > > /* > > * Unlike try_release_extent_buffer() which uses folio private > > - * to grab buffer, for subpage case we rely on radix tree, thus > > - * we need to ensure radix tree consistency. > > + * to grab buffer, for subpage case we rely on xarray, thus we > > + * need to ensure xarray tree consistency. > > * > > - * We also want an atomic snapshot of the radix tree, thus go > > + * We also want an atomic snapshot of the xarray tree, thus go > > * with spinlock rather than RCU. > > */ > > - spin_lock(&fs_info->buffer_lock); > > - eb = get_next_extent_buffer(fs_info, folio, cur); > > + xa_lock_irq(&fs_info->buffer_xarray); > > + do { > > + eb = xas_find(&xas, end >> fs_info->sectorsize_bits); > > + } while (xas_retry(&xas, eb)); > > + > > if (!eb) { > > /* No more eb in the page range after or at cur */ > > - spin_unlock(&fs_info->buffer_lock); > > + xa_unlock(&fs_info->buffer_xarray); > > I may be missing something but is this a typo? Should this be xa_unlock_irq()? > > > break; > > } > > cur = eb->start + eb->len; > > @@ -4332,10 +4306,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio) > > spin_lock(&eb->refs_lock); > > if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { > > spin_unlock(&eb->refs_lock); > > - spin_unlock(&fs_info->buffer_lock); > > + xa_unlock(&fs_info->buffer_xarray); > > xa_unlock_irq() again? > Ah yeah, I'll fix this, I must not have hit reclaim while running fstests otherwise lockdep would have complained about this. > I don't even see any reason for XA_FLAGS_LOCK_IRQ in the first place > to be honest. > (Ah, OK. It's because of clearing the writeback tag in the > end_bbio_meta_write() in the second patch. On my machine it runs from > a threaded interrupt but that's not universal I guess.) 
> > > break; > > } > > - spin_unlock(&fs_info->buffer_lock); > > + xa_unlock_irq(&fs_info->buffer_xarray); > > > > /* > > * If tree ref isn't set then we know the ref on this eb is a > > diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h > > index bcca43046064..d5d94977860c 100644 > > --- a/fs/btrfs/fs.h > > +++ b/fs/btrfs/fs.h > > @@ -776,10 +776,8 @@ struct btrfs_fs_info { > > > > struct btrfs_delayed_root *delayed_root; > > > > - /* Extent buffer radix tree */ > > - spinlock_t buffer_lock; > > /* Entries are eb->start / sectorsize */ > > - struct radix_tree_root buffer_radix; > > + struct xarray buffer_xarray; > > I'm not exactly a fan of encoding the type in the name. I'd simply > call this 'buffers' or 'extent_buffers'. Those are too generic, I prefer the more specific name. Multiple times throughout this project I did `git grep buffer_radix` or `git grep buffer_xarra` to check stuff, a more generic name would make it harder to spot the relevant thing I care about. Thanks, Josef
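Josef's point about `exists` is the crux of keeping the xa_lock held across the refcount bump: the allocation paths do not hold rcu_read_lock(), so once the lock is dropped nothing pins the entry that __xa_cmpxchg() returned. A rough sketch of that insert-or-reuse pattern, again with a hypothetical `obj` type rather than the btrfs code itself:

```c
#include <linux/xarray.h>
#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/gfp.h>

struct obj {
	atomic_t refs;
};

/*
 * Insert @new at @index, or return a referenced existing entry.
 * The xa_lock is held until the existing entry's refcount is raised,
 * because the lock (not RCU) is what keeps that entry alive here.
 */
static struct obj *obj_insert_or_get(struct xarray *objects,
				     unsigned long index, struct obj *new)
{
	struct obj *exists;

again:
	xa_lock_irq(objects);
	exists = __xa_cmpxchg(objects, index, NULL, new, GFP_NOFS);
	if (xa_is_err(exists)) {
		int ret = xa_err(exists);

		xa_unlock_irq(objects);
		return ERR_PTR(ret);
	}
	if (exists && !atomic_inc_not_zero(&exists->refs)) {
		/* Existing entry is being freed; retry until it is gone. */
		xa_unlock_irq(objects);
		goto again;
	}
	xa_unlock_irq(objects);
	return exists ?: new;
}
```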
On Thu, 17 Apr 2025 at 23:12, Josef Bacik <josef@toxicpanda.com> wrote: > > On Thu, Apr 17, 2025 at 09:32:28PM +0200, Daniel Vacek wrote: > > On Wed, 16 Apr 2025 at 23:50, Josef Bacik <josef@toxicpanda.com> wrote: > > > > > > In order to fully utilize xarray tagging to improve writeback we need to > > > convert the buffer_radix to a proper xarray. This conversion is > > > relatively straightforward as the radix code uses the xarray underneath. > > > Using xarray directly allows for quite a lot less code. > > > > > > Signed-off-by: Josef Bacik <josef@toxicpanda.com> > > > --- > > > fs/btrfs/disk-io.c | 15 ++- > > > fs/btrfs/extent_io.c | 196 +++++++++++++++-------------------- > > > fs/btrfs/fs.h | 4 +- > > > fs/btrfs/tests/btrfs-tests.c | 27 ++--- > > > fs/btrfs/zoned.c | 16 +-- > > > 5 files changed, 113 insertions(+), 145 deletions(-) > > > > > > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > > > index 59da809b7d57..5593873f5c0f 100644 > > > --- a/fs/btrfs/disk-io.c > > > +++ b/fs/btrfs/disk-io.c > > > @@ -2762,10 +2762,22 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) > > > return ret; > > > } > > > > > > +/* > > > + * lockdep gets confused between our buffer_xarray which requires IRQ locking > > > + * because we modify marks in the IRQ context, and our delayed inode xarray > > > + * which doesn't have these requirements. Use a class key so lockdep doesn't get > > > + * them mixed up. > > > + */ > > > +static struct lock_class_key buffer_xa_class; > > > + > > > void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) > > > { > > > INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); > > > - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); > > > + > > > + /* Use the same flags as mapping->i_pages. */ > > > + xa_init_flags(&fs_info->buffer_xarray, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); > > > + lockdep_set_class(&fs_info->buffer_xarray.xa_lock, &buffer_xa_class); > > > + > > > INIT_LIST_HEAD(&fs_info->trans_list); > > > INIT_LIST_HEAD(&fs_info->dead_roots); > > > INIT_LIST_HEAD(&fs_info->delayed_iputs); > > > @@ -2777,7 +2789,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) > > > spin_lock_init(&fs_info->delayed_iput_lock); > > > spin_lock_init(&fs_info->defrag_inodes_lock); > > > spin_lock_init(&fs_info->super_lock); > > > - spin_lock_init(&fs_info->buffer_lock); > > > spin_lock_init(&fs_info->unused_bgs_lock); > > > spin_lock_init(&fs_info->treelog_bg_lock); > > > spin_lock_init(&fs_info->zone_active_bgs_lock); > > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > > > index 6cfd286b8bbc..309c86d1a696 100644 > > > --- a/fs/btrfs/extent_io.c > > > +++ b/fs/btrfs/extent_io.c > > > @@ -1893,19 +1893,20 @@ static void set_btree_ioerr(struct extent_buffer *eb) > > > * context. 
> > > */ > > > static struct extent_buffer *find_extent_buffer_nolock( > > > - const struct btrfs_fs_info *fs_info, u64 start) > > > + struct btrfs_fs_info *fs_info, u64 start) > > > { > > > + XA_STATE(xas, &fs_info->buffer_xarray, start >> fs_info->sectorsize_bits); > > > struct extent_buffer *eb; > > > > > > rcu_read_lock(); > > > - eb = radix_tree_lookup(&fs_info->buffer_radix, > > > - start >> fs_info->sectorsize_bits); > > > - if (eb && atomic_inc_not_zero(&eb->refs)) { > > > - rcu_read_unlock(); > > > - return eb; > > > - } > > > + do { > > > + eb = xas_load(&xas); > > > + } while (xas_retry(&xas, eb)); > > > + > > > + if (eb && !atomic_inc_not_zero(&eb->refs)) > > > + eb = NULL; > > > rcu_read_unlock(); > > > - return NULL; > > > + return eb; > > > } > > > > > > static void end_bbio_meta_write(struct btrfs_bio *bbio) > > > @@ -2769,11 +2770,10 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo > > > > > > if (!btrfs_meta_is_subpage(fs_info)) { > > > /* > > > - * We do this since we'll remove the pages after we've > > > - * removed the eb from the radix tree, so we could race > > > - * and have this page now attached to the new eb. So > > > - * only clear folio if it's still connected to > > > - * this eb. > > > + * We do this since we'll remove the pages after we've removed > > > + * the eb from the xarray, so we could race and have this page > > > + * now attached to the new eb. So only clear folio if it's > > > + * still connected to this eb. > > > */ > > > if (folio_test_private(folio) && folio_get_private(folio) == eb) { > > > BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); > > > @@ -2938,9 +2938,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) > > > { > > > int refs; > > > /* > > > - * The TREE_REF bit is first set when the extent_buffer is added > > > - * to the radix tree. It is also reset, if unset, when a new reference > > > - * is created by find_extent_buffer. > > > + * The TREE_REF bit is first set when the extent_buffer is added to the > > > + * xarray. It is also reset, if unset, when a new reference is created > > > + * by find_extent_buffer. > > > * > > > * It is only cleared in two cases: freeing the last non-tree > > > * reference to the extent_buffer when its STALE bit is set or > > > @@ -2952,13 +2952,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) > > > * conditions between the calls to check_buffer_tree_ref in those > > > * codepaths and clearing TREE_REF in try_release_extent_buffer. > > > * > > > - * The actual lifetime of the extent_buffer in the radix tree is > > > - * adequately protected by the refcount, but the TREE_REF bit and > > > - * its corresponding reference are not. To protect against this > > > - * class of races, we call check_buffer_tree_ref from the codepaths > > > - * which trigger io. Note that once io is initiated, TREE_REF can no > > > - * longer be cleared, so that is the moment at which any such race is > > > - * best fixed. > > > + * The actual lifetime of the extent_buffer in the xarray is adequately > > > + * protected by the refcount, but the TREE_REF bit and its corresponding > > > + * reference are not. To protect against this class of races, we call > > > + * check_buffer_tree_ref from the codepaths which trigger io. Note that > > > + * once io is initiated, TREE_REF can no longer be cleared, so that is > > > + * the moment at which any such race is best fixed. 
> > > */ > > > refs = atomic_read(&eb->refs); > > > if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) > > > @@ -3022,23 +3021,26 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, > > > return ERR_PTR(-ENOMEM); > > > eb->fs_info = fs_info; > > > again: > > > - ret = radix_tree_preload(GFP_NOFS); > > > - if (ret) { > > > - exists = ERR_PTR(ret); > > > + xa_lock_irq(&fs_info->buffer_xarray); > > > + exists = __xa_cmpxchg(&fs_info->buffer_xarray, > > > + start >> fs_info->sectorsize_bits, NULL, eb, > > > + GFP_NOFS); > > > > I think the xa_unlock_irq() can go straight here. Or rather you could > > use xa_cmpxchg_irq() instead? > > Or do you protect exists->refs with it as well? It seems so, IIUC. Not > > sure it is strictly needed, though. > > > > We can't trust the value of exists after we've unlocked the xarray. Of course. I see. Because we don't rcu_lock() here. I could have noticed before, sorry. > > > + if (xa_is_err(exists)) { > > > + ret = xa_err(exists); > > > + xa_unlock_irq(&fs_info->buffer_xarray); > > > + btrfs_release_extent_buffer(eb); > > > + return ERR_PTR(ret); > > > + } > > > + if (exists) { > > > + if (!atomic_inc_not_zero(&exists->refs)) { > > > + /* The extent buffer is being freed, retry. */ > > > + xa_unlock_irq(&fs_info->buffer_xarray); > > > + goto again; > > > + } > > > + xa_unlock_irq(&fs_info->buffer_xarray); > > > goto free_eb; > > > } > > > - spin_lock(&fs_info->buffer_lock); > > > - ret = radix_tree_insert(&fs_info->buffer_radix, > > > - start >> fs_info->sectorsize_bits, eb); > > > - spin_unlock(&fs_info->buffer_lock); > > > - radix_tree_preload_end(); > > > - if (ret == -EEXIST) { > > > - exists = find_extent_buffer(fs_info, start); > > > - if (exists) > > > - goto free_eb; > > > - else > > > - goto again; > > > - } > > > + xa_unlock_irq(&fs_info->buffer_xarray); > > > check_buffer_tree_ref(eb); > > > > > > return eb; > > > @@ -3059,9 +3061,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, > > > lockdep_assert_held(&folio->mapping->i_private_lock); > > > > > > /* > > > - * For subpage case, we completely rely on radix tree to ensure we > > > - * don't try to insert two ebs for the same bytenr. So here we always > > > - * return NULL and just continue. > > > + * For subpage case, we completely rely on xarray to ensure we don't try > > > + * to insert two ebs for the same bytenr. So here we always return NULL > > > + * and just continue. > > > */ > > > if (btrfs_meta_is_subpage(fs_info)) > > > return NULL; > > > @@ -3194,7 +3196,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, > > > /* > > > * To inform we have an extra eb under allocation, so that > > > * detach_extent_buffer_page() won't release the folio private when the > > > - * eb hasn't been inserted into radix tree yet. > > > + * eb hasn't been inserted into the xarray yet. > > > * > > > * The ref will be decreased when the eb releases the page, in > > > * detach_extent_buffer_page(). Thus needs no special handling in the > > > @@ -3328,10 +3330,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, > > > > > > /* > > > * We can't unlock the pages just yet since the extent buffer > > > - * hasn't been properly inserted in the radix tree, this > > > - * opens a race with btree_release_folio which can free a page > > > - * while we are still filling in all pages for the buffer and > > > - * we could crash. 
> > > + * hasn't been properly inserted in the xarray, this opens a > > > + * race with btree_release_folio which can free a page while we > > > + * are still filling in all pages for the buffer and we could > > > + * crash. > > > */ > > > } > > > if (uptodate) > > > @@ -3340,23 +3342,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, > > > if (page_contig) > > > eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); > > > again: > > > - ret = radix_tree_preload(GFP_NOFS); > > > - if (ret) > > > + xa_lock_irq(&fs_info->buffer_xarray); > > > + existing_eb = __xa_cmpxchg(&fs_info->buffer_xarray, > > > + start >> fs_info->sectorsize_bits, NULL, eb, > > > + GFP_NOFS); > > > > Ditto. > > Same comment as above. > > > > > > + if (xa_is_err(existing_eb)) { > > > + ret = xa_err(existing_eb); > > > + xa_unlock_irq(&fs_info->buffer_xarray); > > > goto out; > > > - > > > - spin_lock(&fs_info->buffer_lock); > > > - ret = radix_tree_insert(&fs_info->buffer_radix, > > > - start >> fs_info->sectorsize_bits, eb); > > > - spin_unlock(&fs_info->buffer_lock); > > > - radix_tree_preload_end(); > > > - if (ret == -EEXIST) { > > > - ret = 0; > > > - existing_eb = find_extent_buffer(fs_info, start); > > > - if (existing_eb) > > > - goto out; > > > - else > > > - goto again; > > > } > > > + if (existing_eb) { > > > + if (!atomic_inc_not_zero(&existing_eb->refs)) { > > > + xa_unlock_irq(&fs_info->buffer_xarray); > > > + goto again; > > > + } > > > + xa_unlock_irq(&fs_info->buffer_xarray); > > > + goto out; > > > + } > > > + xa_unlock_irq(&fs_info->buffer_xarray); > > > + > > > /* add one reference for the tree */ > > > check_buffer_tree_ref(eb); > > > > > > @@ -3426,10 +3430,13 @@ static int release_extent_buffer(struct extent_buffer *eb) > > > > > > spin_unlock(&eb->refs_lock); > > > > > > - spin_lock(&fs_info->buffer_lock); > > > - radix_tree_delete_item(&fs_info->buffer_radix, > > > - eb->start >> fs_info->sectorsize_bits, eb); > > > - spin_unlock(&fs_info->buffer_lock); > > > + /* > > > + * We're erasing, theoretically there will be no allocations, so > > > + * just use GFP_ATOMIC. > > > + */ > > > + xa_cmpxchg_irq(&fs_info->buffer_xarray, > > > + eb->start >> fs_info->sectorsize_bits, eb, NULL, > > > + GFP_ATOMIC); > > > > > > btrfs_leak_debug_del_eb(eb); > > > /* Should be safe to release folios at this point. 
*/ > > > @@ -4260,44 +4267,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst, > > > } > > > } > > > > > > -#define GANG_LOOKUP_SIZE 16 > > > -static struct extent_buffer *get_next_extent_buffer( > > > - const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) > > > -{ > > > - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; > > > - struct extent_buffer *found = NULL; > > > - u64 folio_start = folio_pos(folio); > > > - u64 cur = folio_start; > > > - > > > - ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); > > > - lockdep_assert_held(&fs_info->buffer_lock); > > > - > > > - while (cur < folio_start + PAGE_SIZE) { > > > - int ret; > > > - int i; > > > - > > > - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, > > > - (void **)gang, cur >> fs_info->sectorsize_bits, > > > - min_t(unsigned int, GANG_LOOKUP_SIZE, > > > - PAGE_SIZE / fs_info->nodesize)); > > > - if (ret == 0) > > > - goto out; > > > - for (i = 0; i < ret; i++) { > > > - /* Already beyond page end */ > > > - if (gang[i]->start >= folio_start + PAGE_SIZE) > > > - goto out; > > > - /* Found one */ > > > - if (gang[i]->start >= bytenr) { > > > - found = gang[i]; > > > - goto out; > > > - } > > > - } > > > - cur = gang[ret - 1]->start + gang[ret - 1]->len; > > > - } > > > -out: > > > - return found; > > > -} > > > - > > > static int try_release_subpage_extent_buffer(struct folio *folio) > > > { > > > struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); > > > @@ -4306,21 +4275,26 @@ static int try_release_subpage_extent_buffer(struct folio *folio) > > > int ret; > > > > > > while (cur < end) { > > > + XA_STATE(xas, &fs_info->buffer_xarray, > > > + cur >> fs_info->sectorsize_bits); > > > struct extent_buffer *eb = NULL; > > > > > > /* > > > * Unlike try_release_extent_buffer() which uses folio private > > > - * to grab buffer, for subpage case we rely on radix tree, thus > > > - * we need to ensure radix tree consistency. > > > + * to grab buffer, for subpage case we rely on xarray, thus we > > > + * need to ensure xarray tree consistency. > > > * > > > - * We also want an atomic snapshot of the radix tree, thus go > > > + * We also want an atomic snapshot of the xarray tree, thus go > > > * with spinlock rather than RCU. > > > */ > > > - spin_lock(&fs_info->buffer_lock); > > > - eb = get_next_extent_buffer(fs_info, folio, cur); > > > + xa_lock_irq(&fs_info->buffer_xarray); > > > + do { > > > + eb = xas_find(&xas, end >> fs_info->sectorsize_bits); > > > + } while (xas_retry(&xas, eb)); > > > + > > > if (!eb) { > > > /* No more eb in the page range after or at cur */ > > > - spin_unlock(&fs_info->buffer_lock); > > > + xa_unlock(&fs_info->buffer_xarray); > > > > I may be missing something but is this a typo? Should this be xa_unlock_irq()? > > > > > break; > > > } > > > cur = eb->start + eb->len; > > > @@ -4332,10 +4306,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio) > > > spin_lock(&eb->refs_lock); > > > if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { > > > spin_unlock(&eb->refs_lock); > > > - spin_unlock(&fs_info->buffer_lock); > > > + xa_unlock(&fs_info->buffer_xarray); > > > > xa_unlock_irq() again? > > > > Ah yeah, I'll fix this, I must not have hit reclaim while running fstests > otherwise lockdep would have complained about this. > > > I don't even see any reason for XA_FLAGS_LOCK_IRQ in the first place > > to be honest. > > (Ah, OK. It's because of clearing the writeback tag in the > > end_bbio_meta_write() in the second patch. 
On my machine it runs from > > a threaded interrupt but that's not universal I guess.) > > > > > break; > > > } > > > - spin_unlock(&fs_info->buffer_lock); > > > + xa_unlock_irq(&fs_info->buffer_xarray); > > > > > > /* > > > * If tree ref isn't set then we know the ref on this eb is a > > > diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h > > > index bcca43046064..d5d94977860c 100644 > > > --- a/fs/btrfs/fs.h > > > +++ b/fs/btrfs/fs.h > > > @@ -776,10 +776,8 @@ struct btrfs_fs_info { > > > > > > struct btrfs_delayed_root *delayed_root; > > > > > > - /* Extent buffer radix tree */ > > > - spinlock_t buffer_lock; > > > /* Entries are eb->start / sectorsize */ > > > - struct radix_tree_root buffer_radix; > > > + struct xarray buffer_xarray; > > > > I'm not exactly a fan of encoding the type in the name. I'd simply > > call this 'buffers' or 'extent_buffers'. > > Those are too generic, I prefer the more specific name. Multiple times > throughout this project I did `git grep buffer_radix` or `git grep buffer_xarra` > to check stuff, a more generic name would make it harder to spot the relevant > thing I care about. Thanks, Makes sense. I'd still go with `buffer_array` or `eb_tree` or `buffer_tree` but that's just nitpicking. > Josef
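On the XA_FLAGS_LOCK_IRQ point: once writeback tags get cleared from (soft)irq context, every other xa_lock user has to disable interrupts, including iteration paths that drop the lock around per-entry work — which is why the mixed xa_unlock()/xa_unlock_irq() calls above stood out. A sketch of that drop-and-resume walk (the xas_pause() dance used by the test-cleanup and zoned code), once more with hypothetical names:

```c
#include <linux/xarray.h>
#include <linux/limits.h>

/*
 * Walk all entries, dropping the IRQ-safe lock around per-entry work.
 * xas_pause() lets the walk resume safely after the lock was released
 * and retaken. process() is a placeholder for the per-entry work.
 */
static void drain_objects(struct xarray *objects,
			  void (*process)(void *entry))
{
	XA_STATE(xas, objects, 0);
	void *entry;

	xa_lock_irq(objects);
	xas_for_each(&xas, entry, ULONG_MAX) {
		if (xas_retry(&xas, entry))
			continue;
		xas_pause(&xas);
		xa_unlock_irq(objects);
		process(entry);
		xa_lock_irq(objects);
	}
	xa_unlock_irq(objects);
}
```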
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 59da809b7d57..5593873f5c0f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2762,10 +2762,22 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
 	return ret;
 }
 
+/*
+ * lockdep gets confused between our buffer_xarray which requires IRQ locking
+ * because we modify marks in the IRQ context, and our delayed inode xarray
+ * which doesn't have these requirements. Use a class key so lockdep doesn't get
+ * them mixed up.
+ */
+static struct lock_class_key buffer_xa_class;
+
 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 {
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
-	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+
+	/* Use the same flags as mapping->i_pages. */
+	xa_init_flags(&fs_info->buffer_xarray, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+	lockdep_set_class(&fs_info->buffer_xarray.xa_lock, &buffer_xa_class);
+
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2777,7 +2789,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
 	spin_lock_init(&fs_info->super_lock);
-	spin_lock_init(&fs_info->buffer_lock);
 	spin_lock_init(&fs_info->unused_bgs_lock);
 	spin_lock_init(&fs_info->treelog_bg_lock);
 	spin_lock_init(&fs_info->zone_active_bgs_lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6cfd286b8bbc..309c86d1a696 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1893,19 +1893,20 @@ static void set_btree_ioerr(struct extent_buffer *eb)
  * context.
  */
 static struct extent_buffer *find_extent_buffer_nolock(
-		const struct btrfs_fs_info *fs_info, u64 start)
+		struct btrfs_fs_info *fs_info, u64 start)
 {
+	XA_STATE(xas, &fs_info->buffer_xarray, start >> fs_info->sectorsize_bits);
 	struct extent_buffer *eb;
 
 	rcu_read_lock();
-	eb = radix_tree_lookup(&fs_info->buffer_radix,
-			       start >> fs_info->sectorsize_bits);
-	if (eb && atomic_inc_not_zero(&eb->refs)) {
-		rcu_read_unlock();
-		return eb;
-	}
+	do {
+		eb = xas_load(&xas);
+	} while (xas_retry(&xas, eb));
+
+	if (eb && !atomic_inc_not_zero(&eb->refs))
+		eb = NULL;
 	rcu_read_unlock();
-	return NULL;
+	return eb;
 }
 
 static void end_bbio_meta_write(struct btrfs_bio *bbio)
@@ -2769,11 +2770,10 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
 
 	if (!btrfs_meta_is_subpage(fs_info)) {
 		/*
-		 * We do this since we'll remove the pages after we've
-		 * removed the eb from the radix tree, so we could race
-		 * and have this page now attached to the new eb. So
-		 * only clear folio if it's still connected to
-		 * this eb.
+		 * We do this since we'll remove the pages after we've removed
+		 * the eb from the xarray, so we could race and have this page
+		 * now attached to the new eb. So only clear folio if it's
+		 * still connected to this eb.
 		 */
 		if (folio_test_private(folio) && folio_get_private(folio) == eb) {
 			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
@@ -2938,9 +2938,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
 	int refs;
 	/*
-	 * The TREE_REF bit is first set when the extent_buffer is added
-	 * to the radix tree. It is also reset, if unset, when a new reference
-	 * is created by find_extent_buffer.
+	 * The TREE_REF bit is first set when the extent_buffer is added to the
+	 * xarray. It is also reset, if unset, when a new reference is created
+	 * by find_extent_buffer.
 	 *
 	 * It is only cleared in two cases: freeing the last non-tree
 	 * reference to the extent_buffer when its STALE bit is set or
@@ -2952,13 +2952,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 	 * conditions between the calls to check_buffer_tree_ref in those
 	 * codepaths and clearing TREE_REF in try_release_extent_buffer.
 	 *
-	 * The actual lifetime of the extent_buffer in the radix tree is
-	 * adequately protected by the refcount, but the TREE_REF bit and
-	 * its corresponding reference are not. To protect against this
-	 * class of races, we call check_buffer_tree_ref from the codepaths
-	 * which trigger io. Note that once io is initiated, TREE_REF can no
-	 * longer be cleared, so that is the moment at which any such race is
-	 * best fixed.
+	 * The actual lifetime of the extent_buffer in the xarray is adequately
+	 * protected by the refcount, but the TREE_REF bit and its corresponding
+	 * reference are not. To protect against this class of races, we call
+	 * check_buffer_tree_ref from the codepaths which trigger io. Note that
+	 * once io is initiated, TREE_REF can no longer be cleared, so that is
+	 * the moment at which any such race is best fixed.
 	 */
 	refs = atomic_read(&eb->refs);
 	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -3022,23 +3021,26 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
 		return ERR_PTR(-ENOMEM);
 	eb->fs_info = fs_info;
 again:
-	ret = radix_tree_preload(GFP_NOFS);
-	if (ret) {
-		exists = ERR_PTR(ret);
+	xa_lock_irq(&fs_info->buffer_xarray);
+	exists = __xa_cmpxchg(&fs_info->buffer_xarray,
+			      start >> fs_info->sectorsize_bits, NULL, eb,
+			      GFP_NOFS);
+	if (xa_is_err(exists)) {
+		ret = xa_err(exists);
+		xa_unlock_irq(&fs_info->buffer_xarray);
+		btrfs_release_extent_buffer(eb);
+		return ERR_PTR(ret);
+	}
+	if (exists) {
+		if (!atomic_inc_not_zero(&exists->refs)) {
+			/* The extent buffer is being freed, retry. */
+			xa_unlock_irq(&fs_info->buffer_xarray);
+			goto again;
+		}
+		xa_unlock_irq(&fs_info->buffer_xarray);
 		goto free_eb;
 	}
-	spin_lock(&fs_info->buffer_lock);
-	ret = radix_tree_insert(&fs_info->buffer_radix,
-				start >> fs_info->sectorsize_bits, eb);
-	spin_unlock(&fs_info->buffer_lock);
-	radix_tree_preload_end();
-	if (ret == -EEXIST) {
-		exists = find_extent_buffer(fs_info, start);
-		if (exists)
-			goto free_eb;
-		else
-			goto again;
-	}
+	xa_unlock_irq(&fs_info->buffer_xarray);
 	check_buffer_tree_ref(eb);
 
 	return eb;
@@ -3059,9 +3061,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
 	lockdep_assert_held(&folio->mapping->i_private_lock);
 
 	/*
-	 * For subpage case, we completely rely on radix tree to ensure we
-	 * don't try to insert two ebs for the same bytenr. So here we always
-	 * return NULL and just continue.
+	 * For subpage case, we completely rely on xarray to ensure we don't try
+	 * to insert two ebs for the same bytenr. So here we always return NULL
+	 * and just continue.
 	 */
 	if (btrfs_meta_is_subpage(fs_info))
 		return NULL;
@@ -3194,7 +3196,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
 	/*
 	 * To inform we have an extra eb under allocation, so that
 	 * detach_extent_buffer_page() won't release the folio private when the
-	 * eb hasn't been inserted into radix tree yet.
+	 * eb hasn't been inserted into the xarray yet.
 	 *
 	 * The ref will be decreased when the eb releases the page, in
 	 * detach_extent_buffer_page(). Thus needs no special handling in the
@@ -3328,10 +3330,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 
 		/*
 		 * We can't unlock the pages just yet since the extent buffer
-		 * hasn't been properly inserted in the radix tree, this
-		 * opens a race with btree_release_folio which can free a page
-		 * while we are still filling in all pages for the buffer and
-		 * we could crash.
+		 * hasn't been properly inserted in the xarray, this opens a
+		 * race with btree_release_folio which can free a page while we
+		 * are still filling in all pages for the buffer and we could
+		 * crash.
 		 */
 	}
 	if (uptodate)
@@ -3340,23 +3342,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	if (page_contig)
 		eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
 again:
-	ret = radix_tree_preload(GFP_NOFS);
-	if (ret)
+	xa_lock_irq(&fs_info->buffer_xarray);
+	existing_eb = __xa_cmpxchg(&fs_info->buffer_xarray,
+				   start >> fs_info->sectorsize_bits, NULL, eb,
+				   GFP_NOFS);
+	if (xa_is_err(existing_eb)) {
+		ret = xa_err(existing_eb);
+		xa_unlock_irq(&fs_info->buffer_xarray);
 		goto out;
-
-	spin_lock(&fs_info->buffer_lock);
-	ret = radix_tree_insert(&fs_info->buffer_radix,
-				start >> fs_info->sectorsize_bits, eb);
-	spin_unlock(&fs_info->buffer_lock);
-	radix_tree_preload_end();
-	if (ret == -EEXIST) {
-		ret = 0;
-		existing_eb = find_extent_buffer(fs_info, start);
-		if (existing_eb)
-			goto out;
-		else
-			goto again;
 	}
+	if (existing_eb) {
+		if (!atomic_inc_not_zero(&existing_eb->refs)) {
+			xa_unlock_irq(&fs_info->buffer_xarray);
+			goto again;
+		}
+		xa_unlock_irq(&fs_info->buffer_xarray);
+		goto out;
+	}
+	xa_unlock_irq(&fs_info->buffer_xarray);
+
 	/* add one reference for the tree */
 	check_buffer_tree_ref(eb);
 
@@ -3426,10 +3430,13 @@ static int release_extent_buffer(struct extent_buffer *eb)
 
 		spin_unlock(&eb->refs_lock);
 
-		spin_lock(&fs_info->buffer_lock);
-		radix_tree_delete_item(&fs_info->buffer_radix,
-				       eb->start >> fs_info->sectorsize_bits, eb);
-		spin_unlock(&fs_info->buffer_lock);
+		/*
+		 * We're erasing, theoretically there will be no allocations, so
+		 * just use GFP_ATOMIC.
+		 */
+		xa_cmpxchg_irq(&fs_info->buffer_xarray,
+			       eb->start >> fs_info->sectorsize_bits, eb, NULL,
+			       GFP_ATOMIC);
 
 		btrfs_leak_debug_del_eb(eb);
 		/* Should be safe to release folios at this point. */
@@ -4260,44 +4267,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
 	}
 }
 
-#define GANG_LOOKUP_SIZE 16
-static struct extent_buffer *get_next_extent_buffer(
-		const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr)
-{
-	struct extent_buffer *gang[GANG_LOOKUP_SIZE];
-	struct extent_buffer *found = NULL;
-	u64 folio_start = folio_pos(folio);
-	u64 cur = folio_start;
-
-	ASSERT(in_range(bytenr, folio_start, PAGE_SIZE));
-	lockdep_assert_held(&fs_info->buffer_lock);
-
-	while (cur < folio_start + PAGE_SIZE) {
-		int ret;
-		int i;
-
-		ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
-				(void **)gang, cur >> fs_info->sectorsize_bits,
-				min_t(unsigned int, GANG_LOOKUP_SIZE,
-				      PAGE_SIZE / fs_info->nodesize));
-		if (ret == 0)
-			goto out;
-		for (i = 0; i < ret; i++) {
-			/* Already beyond page end */
-			if (gang[i]->start >= folio_start + PAGE_SIZE)
-				goto out;
-			/* Found one */
-			if (gang[i]->start >= bytenr) {
-				found = gang[i];
-				goto out;
-			}
-		}
-		cur = gang[ret - 1]->start + gang[ret - 1]->len;
-	}
-out:
-	return found;
-}
-
 static int try_release_subpage_extent_buffer(struct folio *folio)
 {
 	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
@@ -4306,21 +4275,26 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 	int ret;
 
 	while (cur < end) {
+		XA_STATE(xas, &fs_info->buffer_xarray,
+			 cur >> fs_info->sectorsize_bits);
 		struct extent_buffer *eb = NULL;
 
 		/*
 		 * Unlike try_release_extent_buffer() which uses folio private
-		 * to grab buffer, for subpage case we rely on radix tree, thus
-		 * we need to ensure radix tree consistency.
+		 * to grab buffer, for subpage case we rely on xarray, thus we
+		 * need to ensure xarray tree consistency.
 		 *
-		 * We also want an atomic snapshot of the radix tree, thus go
+		 * We also want an atomic snapshot of the xarray tree, thus go
 		 * with spinlock rather than RCU.
 		 */
-		spin_lock(&fs_info->buffer_lock);
-		eb = get_next_extent_buffer(fs_info, folio, cur);
+		xa_lock_irq(&fs_info->buffer_xarray);
+		do {
+			eb = xas_find(&xas, end >> fs_info->sectorsize_bits);
+		} while (xas_retry(&xas, eb));
+
 		if (!eb) {
 			/* No more eb in the page range after or at cur */
-			spin_unlock(&fs_info->buffer_lock);
+			xa_unlock(&fs_info->buffer_xarray);
 			break;
 		}
 		cur = eb->start + eb->len;
@@ -4332,10 +4306,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 		spin_lock(&eb->refs_lock);
 		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
 			spin_unlock(&eb->refs_lock);
-			spin_unlock(&fs_info->buffer_lock);
+			xa_unlock(&fs_info->buffer_xarray);
 			break;
 		}
-		spin_unlock(&fs_info->buffer_lock);
+		xa_unlock_irq(&fs_info->buffer_xarray);
 
 		/*
 		 * If tree ref isn't set then we know the ref on this eb is a
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index bcca43046064..d5d94977860c 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -776,10 +776,8 @@ struct btrfs_fs_info {
 
 	struct btrfs_delayed_root *delayed_root;
 
-	/* Extent buffer radix tree */
-	spinlock_t buffer_lock;
 	/* Entries are eb->start / sectorsize */
-	struct radix_tree_root buffer_radix;
+	struct xarray buffer_xarray;
 
 	/* Next backup root to be overwritten */
 	int backup_root_index;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 02a915eb51fb..741117ce7d3f 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -157,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
 
 void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 {
-	struct radix_tree_iter iter;
-	void **slot;
+	XA_STATE(xas, &fs_info->buffer_xarray, 0);
 	struct btrfs_device *dev, *tmp;
+	struct extent_buffer *eb;
 
 	if (!fs_info)
 		return;
@@ -169,25 +169,16 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 
 	test_mnt->mnt_sb->s_fs_info = NULL;
 
-	spin_lock(&fs_info->buffer_lock);
-	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
-		struct extent_buffer *eb;
-
-		eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
-		if (!eb)
+	xa_lock_irq(&fs_info->buffer_xarray);
+	xas_for_each(&xas, eb, ULONG_MAX) {
+		if (xas_retry(&xas, eb))
 			continue;
-		/* Shouldn't happen but that kind of thinking creates CVE's */
-		if (radix_tree_exception(eb)) {
-			if (radix_tree_deref_retry(eb))
-				slot = radix_tree_iter_retry(&iter);
-			continue;
-		}
-		slot = radix_tree_iter_resume(slot, &iter);
-		spin_unlock(&fs_info->buffer_lock);
+		xas_pause(&xas);
+		xa_unlock_irq(&fs_info->buffer_xarray);
 		free_extent_buffer_stale(eb);
-		spin_lock(&fs_info->buffer_lock);
+		xa_lock_irq(&fs_info->buffer_xarray);
 	}
-	spin_unlock(&fs_info->buffer_lock);
+	xa_unlock_irq(&fs_info->buffer_xarray);
 
 	btrfs_mapping_tree_free(fs_info);
 	list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 7b30700ec930..7ed19ca399f3 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2170,28 +2170,22 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 static void wait_eb_writebacks(struct btrfs_block_group *block_group)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	XA_STATE(xas, &fs_info->buffer_xarray,
+		 block_group->start >> fs_info->sectorsize_bits);
 	const u64 end = block_group->start + block_group->length;
-	struct radix_tree_iter iter;
 	struct extent_buffer *eb;
-	void __rcu **slot;
 
 	rcu_read_lock();
-	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
-				 block_group->start >> fs_info->sectorsize_bits) {
-		eb = radix_tree_deref_slot(slot);
-		if (!eb)
+	xas_for_each(&xas, eb, end >> fs_info->sectorsize_bits) {
+		if (xas_retry(&xas, eb))
 			continue;
-		if (radix_tree_deref_retry(eb)) {
-			slot = radix_tree_iter_retry(&iter);
-			continue;
-		}
 		if (eb->start < block_group->start)
 			continue;
 		if (eb->start >= end)
 			break;
-		slot = radix_tree_iter_resume(slot, &iter);
+		xas_pause(&xas);
 		rcu_read_unlock();
 		wait_on_extent_buffer_writeback(eb);
 		rcu_read_lock();
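
For reference, the xarray usage the patch relies on boils down to three small
patterns. The sketches below are illustrative only -- struct item, lookup_item(),
insert_item(), drop_all_items() and put_item() are made-up names, not btrfs code --
but each mirrors one of the call sites in the diff above. The first is the lookup
side (find_extent_buffer_nolock()): xas_load() under rcu_read_lock(), retrying on
internal entries, then a refcount grab that may fail if the object is being freed
concurrently.

#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>

struct item {
	atomic_t refs;
	/* payload ... */
};

/* RCU lookup: load the entry, skip internal retry entries, then try to pin it. */
static struct item *lookup_item(struct xarray *xa, unsigned long index)
{
	XA_STATE(xas, xa, index);
	struct item *it;

	rcu_read_lock();
	do {
		it = xas_load(&xas);
	} while (xas_retry(&xas, it));

	/* The entry may be mid-free; only return it if we actually got a reference. */
	if (it && !atomic_inc_not_zero(&it->refs))
		it = NULL;
	rcu_read_unlock();
	return it;
}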
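The second pattern is the insertion side (alloc_extent_buffer() and
alloc_test_extent_buffer()): the radix_tree_preload()/radix_tree_insert() dance
becomes a single __xa_cmpxchg() against a NULL slot while holding the xa_lock with
IRQs disabled. A sketch under the same assumptions as above (insert_item() and its
return convention are hypothetical, not btrfs code):

#include <linux/err.h>
#include <linux/gfp.h>

/*
 * Insert @new at @index unless something is already there.  Returns @new on
 * success, a pinned pre-existing entry, or an ERR_PTR() on allocation failure.
 */
static struct item *insert_item(struct xarray *xa, unsigned long index,
				struct item *new)
{
	struct item *existing;

again:
	xa_lock_irq(xa);
	existing = __xa_cmpxchg(xa, index, NULL, new, GFP_NOFS);
	if (xa_is_err(existing)) {
		xa_unlock_irq(xa);
		return ERR_PTR(xa_err(existing));
	}
	if (existing) {
		if (!atomic_inc_not_zero(&existing->refs)) {
			/* The old entry is being freed, wait for it to go away. */
			xa_unlock_irq(xa);
			goto again;
		}
		xa_unlock_irq(xa);
		return existing;	/* caller is expected to free @new */
	}
	xa_unlock_irq(xa);
	return new;			/* @new is now visible to lookups */
}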
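The third pattern is iteration (btrfs_free_dummy_fs_info() and wait_eb_writebacks()):
xas_for_each() with xas_retry(), and xas_pause() before dropping the lock around
anything that may sleep, so the walk can be resumed afterwards. Again a sketch only;
put_item() is a hypothetical release helper that may sleep.

void put_item(struct item *it);	/* hypothetical: drops the reference, may sleep */

/*
 * Walk every entry, dropping the lock around put_item().  xas_pause() makes it
 * safe to release the lock and later continue the iteration from the next index.
 */
static void drop_all_items(struct xarray *xa)
{
	XA_STATE(xas, xa, 0);
	struct item *it;

	xa_lock_irq(xa);
	xas_for_each(&xas, it, ULONG_MAX) {
		if (xas_retry(&xas, it))
			continue;
		xas_pause(&xas);
		xa_unlock_irq(xa);
		put_item(it);
		xa_lock_irq(xa);
	}
	xa_unlock_irq(xa);
}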