[2/2] btrfs: use buffer radix for extent buffer writeback operations

Message ID	ebb84aae51d174b292693b389d28d758271c9ad7.1744822090.git.josef@toxicpanda.com (mailing list archive)
State	New
Headers	show Received: from mail-yb1-f179.google.com (mail-yb1-f179.google.com [209.85.219.179]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 163AF211497 for <linux-btrfs@vger.kernel.org>; Wed, 16 Apr 2025 16:51:18 +0000 (UTC) From: Josef Bacik <josef@toxicpanda.com> To: linux-btrfs@vger.kernel.org, kernel-team@fb.com Subject: [PATCH 2/2] btrfs: use buffer radix for extent buffer writeback operations Date: Wed, 16 Apr 2025 12:51:07 -0400 Message-ID: <ebb84aae51d174b292693b389d28d758271c9ad7.1744822090.git.josef@toxicpanda.com> In-Reply-To: <cover.1744822090.git.josef@toxicpanda.com> References: <cover.1744822090.git.josef@toxicpanda.com> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	btrfs: simplify extent buffer writeback \| expand [0/2] btrfs: simplify extent buffer writeback [1/2] btrfs: set DIRTY and WRITEBACK tags on the buffer_radix [2/2] btrfs: use buffer radix for extent buffer writeback operations

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 59da809b7d57..9e32b574b32b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2762,10 +2762,20 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } +static struct lock_class_key buffer_radix_lock_key; + void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + + /* + * We have to do this because we also use a radix tree for the delayed + * inodes, and lockdep will sometimes confuse that radix tree for this + * one and then complain about IRQ locking differences. + */ + lockdep_set_class(&fs_info->buffer_radix.xa_lock, + &buffer_radix_lock_key); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e5de0f57cf7e..780120add4d5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1926,6 +1926,117 @@ static void buffer_radix_clear_mark(const struct extent_buffer *eb, xas_unlock_irqrestore(&xas, flags); } +static void buffer_radix_tag_for_writeback(struct btrfs_fs_info *fs_info, + unsigned long start, unsigned long end) +{ + XA_STATE(xas, &fs_info->buffer_radix, start); + unsigned int tagged = 0; + void *eb; + + xas_lock_irq(&xas); + xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) + continue; + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); + } + xas_unlock_irq(&xas); +} + +struct eb_batch { + unsigned int nr; + unsigned int cur; + struct extent_buffer *ebs[PAGEVEC_SIZE]; +}; + +static inline bool eb_batch_add(struct eb_batch *batch, + struct extent_buffer *eb) +{ + batch->ebs[batch->nr++] = eb; + return (batch->nr < PAGEVEC_SIZE); +} + +static inline void eb_batch_init(struct eb_batch *batch) +{ + batch->nr = 0; 
+ batch->cur = 0; +} + +static inline unsigned int eb_batch_count(struct eb_batch *batch) +{ + return batch->nr; +} + +static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch) +{ + if (batch->cur >= batch->nr) + return NULL; + return batch->ebs[batch->cur++]; +} + +static inline void eb_batch_release(struct eb_batch *batch) +{ + for (unsigned int i = 0; i < batch->nr; i++) + free_extent_buffer(batch->ebs[i]); + eb_batch_init(batch); +} + +static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max, + xa_mark_t mark) +{ + struct extent_buffer *eb; + +retry: + eb = xas_find_marked(xas, max, mark); + + if (xas_retry(xas, eb)) + goto retry; + + if (!eb) + return NULL; + + if (!atomic_inc_not_zero(&eb->refs)) + goto reset; + + if (unlikely(eb != xas_reload(xas))) { + free_extent_buffer(eb); + goto reset; + } + + return eb; +reset: + xas_reset(xas); + goto retry; +} + +static unsigned int buffer_radix_get_ebs_tag(struct btrfs_fs_info *fs_info, + unsigned long *start, + unsigned long end, xa_mark_t tag, + struct eb_batch *batch) +{ + XA_STATE(xas, &fs_info->buffer_radix, *start); + struct extent_buffer *eb; + + rcu_read_lock(); + while ((eb = find_get_eb(&xas, end, tag)) != NULL) { + if (!eb_batch_add(batch, eb)) { + *start = (eb->start + eb->len) >> fs_info->sectorsize_bits; + goto out; + } + } + if (end == (unsigned long)-1) + *start = (unsigned long)-1; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return eb_batch_count(batch); +} + /* * The endio specific version which won't touch any unsafe spinlock in endio * context. @@ -2030,163 +2141,37 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, } /* - * Submit one subpage btree page. + * Wait for all eb writeback in the given range to finish. * - * The main difference to submit_eb_page() is: - * - Page locking - * For subpage, we don't rely on page locking at all. 
- * - * - Flush write bio - * We only flush bio if we may be unable to fit current extent buffers into - * current bio. - * - * Return >=0 for the number of submitted extent buffers. - * Return <0 for fatal error. + * @fs_info: the fs_info for this file system + * @start: the offset of the range to start waiting on writeback + * @end: the end of the range, inclusive. This is meant to be used in + * conjunction with wait_marked_extents, so this will usually be + * the_next_eb->start - 1. */ -static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) +void btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { - struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - int submitted = 0; - u64 folio_start = folio_pos(folio); - int bit_start = 0; - int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; - const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); + struct eb_batch batch; + unsigned long start_index = start >> fs_info->sectorsize_bits; + unsigned long end_index = end >> fs_info->sectorsize_bits; - /* Lock and write each dirty extent buffers in the range */ - while (bit_start < blocks_per_folio) { - struct btrfs_subpage *subpage = folio_get_private(folio); + eb_batch_init(&batch); + while (start_index <= end_index) { struct extent_buffer *eb; - unsigned long flags; - u64 start; + unsigned int nr_ebs; - /* - * Take private lock to ensure the subpage won't be detached - * in the meantime. 
- */ - spin_lock(&folio->mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&folio->mapping->i_private_lock); + nr_ebs = buffer_radix_get_ebs_tag(fs_info, &start_index, + end_index, + PAGECACHE_TAG_WRITEBACK, + &batch); + if (!nr_ebs) break; - } - spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * blocks_per_folio, - subpage->bitmaps)) { - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - bit_start += sectors_per_node; - continue; - } - start = folio_start + bit_start * fs_info->sectorsize; - bit_start += sectors_per_node; - - /* - * Here we just want to grab the eb without touching extra - * spin locks, so call find_extent_buffer_nolock(). - */ - eb = find_extent_buffer_nolock(fs_info, start); - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - - /* - * The eb has already reached 0 refs thus find_extent_buffer() - * doesn't return it. We don't need to write back such eb - * anyway. - */ - if (!eb) - continue; - - if (lock_extent_buffer_for_io(eb, wbc)) { - write_one_eb(eb, wbc); - submitted++; - } - free_extent_buffer(eb); + while ((eb = eb_batch_next(&batch)) != NULL) + wait_on_extent_buffer_writeback(eb); + eb_batch_release(&batch); + cond_resched(); } - return submitted; -} - -/* - * Submit all page(s) of one extent buffer. - * - * @page: the page of one extent buffer - * @eb_context: to determine if we need to submit this page, if current page - * belongs to this eb, we don't need to submit - * - * The caller should pass each page in their bytenr order, and here we use - * @eb_context to determine if we have submitted pages of one extent buffer. - * - * If we have, we just skip until we hit a new page that doesn't belong to - * current @eb_context. - * - * If not, we submit all the page(s) of the extent buffer. - * - * Return >0 if we have submitted the extent buffer successfully. 
- * Return 0 if we don't need to submit the page, as it's already submitted by - * previous call. - * Return <0 for fatal error. - */ -static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) -{ - struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = folio->mapping; - struct extent_buffer *eb; - int ret; - - if (!folio_test_private(folio)) - return 0; - - if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) - return submit_eb_subpage(folio, wbc); - - spin_lock(&mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - eb = folio_get_private(folio); - - /* - * Shouldn't happen and normally this would be a BUG_ON but no point - * crashing the machine for something we can survive anyway. - */ - if (WARN_ON(!eb)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - if (eb == ctx->eb) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->i_private_lock); - if (!ret) - return 0; - - ctx->eb = eb; - - ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx); - if (ret) { - if (ret == -EBUSY) - ret = 0; - free_extent_buffer(eb); - return ret; - } - - if (!lock_extent_buffer_for_io(eb, wbc)) { - free_extent_buffer(eb); - return 0; - } - /* Implies write in zoned mode. */ - if (ctx->zoned_bg) { - /* Mark the last eb in the block group. 
*/ - btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); - ctx->zoned_bg->meta_write_pointer += eb->len; - } - write_one_eb(eb, wbc); - free_extent_buffer(eb); - return 1; } int btree_write_cache_pages(struct address_space *mapping, @@ -2197,25 +2182,27 @@ int btree_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct folio_batch fbatch; - unsigned int nr_folios; - pgoff_t index; - pgoff_t end; /* Inclusive */ + struct eb_batch batch; + unsigned int nr_ebs; + unsigned long index; + unsigned long end; int scanned = 0; xa_mark_t tag; - folio_batch_init(&fbatch); + eb_batch_init(&batch); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + index = (mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits; end = -1; + /* * Start from the beginning does not need to cycle over the * range, mark it as scanned. */ scanned = (index == 0); } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; + index = wbc->range_start >> fs_info->sectorsize_bits; + end = wbc->range_end >> fs_info->sectorsize_bits; + scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -2225,31 +2212,40 @@ int btree_write_cache_pages(struct address_space *mapping, btrfs_zoned_meta_io_lock(fs_info); retry: if (wbc->sync_mode == WB_SYNC_ALL) - tag_pages_for_writeback(mapping, index, end); + buffer_radix_tag_for_writeback(fs_info, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_folios = filemap_get_folios_tag(mapping, &index, end, - tag, &fbatch))) { - unsigned i; + (nr_ebs = buffer_radix_get_ebs_tag(fs_info, &index, end, tag, + &batch))) { + struct extent_buffer *eb; - for (i = 0; i < nr_folios; i++) { - struct folio *folio = fbatch.folios[i]; + while ((eb = eb_batch_next(&batch)) != NULL) { + ctx.eb = eb; - ret = submit_eb_page(folio, &ctx); - if (ret == 0) + ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); + if (ret) { + if (ret == 
-EBUSY) + ret = 0; + if (ret) { + done = 1; + break; + } + free_extent_buffer(eb); continue; - if (ret < 0) { - done = 1; - break; } - /* - * the filesystem may choose to bump up nr_to_write. - * We have to make sure to honor the new nr_to_write - * at any time - */ - nr_to_write_done = wbc->nr_to_write <= 0; + if (!lock_extent_buffer_for_io(eb, wbc)) + continue; + + /* Implies write in zoned mode. */ + if (ctx.zoned_bg) { + /* Mark the last eb in the block group. */ + btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb); + ctx.zoned_bg->meta_write_pointer += eb->len; + } + write_one_eb(eb, wbc); } - folio_batch_release(&fbatch); + nr_to_write_done = wbc->nr_to_write <= 0; + eb_batch_release(&batch); cond_resched(); } if (!scanned && !done) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b344162f790c..4f0cf5b0d38f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -240,6 +240,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); +void btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); void clear_folio_extent_mapped(struct folio *folio); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 39e48bf610a1..b72ac8b70e0e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1155,7 +1155,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, if (!ret) ret = filemap_fdatawrite_range(mapping, start, end); if (!ret && wait_writeback) - ret = filemap_fdatawait_range(mapping, start, end); + btree_wait_writeback_range(fs_info, start, end); btrfs_free_extent_state(cached_state); if (ret) break; @@ -1175,7 +1175,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, 
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { - struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; @@ -1196,7 +1195,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, if (ret == -ENOMEM) ret = 0; if (!ret) - ret = filemap_fdatawait_range(mapping, start, end); + btree_wait_writeback_range(fs_info, start, end); btrfs_free_extent_state(cached_state); if (ret) break;

[2/2] btrfs: use buffer radix for extent buffer writeback operations

Commit Message

Patch