| Message ID | f5a3cbb2f51851dc55ff51647a214f912d6d5043.1744931654.git.boris@bur.io |
|---|---|
| State | New |
| Series | [v2] btrfs: fix broken drop_caches on extent_buffer folios |
On Fri, 18 Apr 2025 at 01:15, Boris Burkov <boris@bur.io> wrote:
>
> The (correct) commit
> e41c81d0d30e ("mm/truncate: Replace page_mapped() call in invalidate_inode_page()")
> replaced the page_mapped(page) check with a refcount check. However,
> this refcount check does not work as expected with drop_caches for
> btrfs's metadata pages.
>
> Btrfs has a per-sb metadata inode with cached pages, and when not in
> active use by btrfs, they have a refcount of 3. One from the initial
> call to alloc_pages, one (nr_pages == 1) from filemap_add_folio, and one
> from folio_attach_private. We would expect such pages to get dropped by
> drop_caches. However, drop_caches calls into mapping_evict_folio via
> mapping_try_invalidate which gets a reference on the folio with
> find_lock_entries(). As a result, these pages have a refcount of 4, and
> fail this check.
>
> For what it's worth, such pages do get reclaimed under memory pressure,
> so I would say that while this behavior is surprising, it is not really
> dangerously broken.
>
> When I asked the mm folks about the expected refcount in this case, I
> was told that the correct thing to do is to donate the refcount from the
> original allocation to the page cache after inserting it.
> https://lore.kernel.org/linux-mm/ZrwhTXKzgDnCK76Z@casper.infradead.org/
>
> Therefore, attempt to fix this by adding a put_folio() to the critical
> spot in alloc_extent_buffer where we are sure that we have really
> allocated and attached new pages. We must also adjust
> folio_detach_private to properly handle being the last reference to the
> folio and not do a UAF after folio_detach_private().
>
> Finally, extent_buffers allocated by clone_extent_buffer() and
> alloc_dummy_extent_buffer() are unmapped, so this transfer of ownership
> from allocation to insertion in the mapping does not apply to them.
> Therefore, they still need a folio_put() as before.
>
> Reviewed-by: Filipe Manana <fdmanana@suse.com>
> Signed-off-by: Boris Burkov <boris@bur.io>
> ---
> Changelog:
> v2:
> * Used Filipe's suggestion to simplify detach_extent_buffer_folio and
>   lose the extra folio_get()/folio_put() pair
> * Noticed a memory leak causing failures in fstests on smaller vms.
>   Fixed a bug where I was missing a folio_put() for ebs that never got
>   their folios added to the mapping.
>
>  fs/btrfs/extent_io.c | 28 ++++++++++++++++++++--------
>  1 file changed, 20 insertions(+), 8 deletions(-)
>
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 5f08615b334f..90499124b8a5 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -2752,6 +2752,7 @@ static bool folio_range_has_eb(struct folio *folio)
>  static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
>  {
>  	struct btrfs_fs_info *fs_info = eb->fs_info;
> +	struct address_space *mapping = folio->mapping;
>  	const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
>
>  	/*
> @@ -2759,11 +2760,11 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
>  	 * be done under the i_private_lock.
>  	 */
>  	if (mapped)
> -		spin_lock(&folio->mapping->i_private_lock);
> +		spin_lock(&mapping->i_private_lock);
>
>  	if (!folio_test_private(folio)) {
>  		if (mapped)
> -			spin_unlock(&folio->mapping->i_private_lock);
> +			spin_unlock(&mapping->i_private_lock);
>  		return;
>  	}
>
> @@ -2783,7 +2784,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
>  			folio_detach_private(folio);
>  		}
>  		if (mapped)
> -			spin_unlock(&folio->mapping->i_private_lock);
> +			spin_unlock(&mapping->i_private_lock);
>  		return;
>  	}
>
> @@ -2806,7 +2807,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
>  	if (!folio_range_has_eb(folio))
>  		btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
>
> -	spin_unlock(&folio->mapping->i_private_lock);
> +	spin_unlock(&mapping->i_private_lock);
>  }
>
>  /* Release all folios attached to the extent buffer */
> @@ -2821,9 +2822,13 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
>  			continue;
>
>  		detach_extent_buffer_folio(eb, folio);
> -
> -		/* One for when we allocated the folio. */
> -		folio_put(folio);

So far so good, but...

> +		/*
> +		 * We only release the allocated refcount for mapped extent_buffer
> +		 * folios. If the folio is unmapped, we have to release its allocated
> +		 * refcount here.
> +		 */
> +		if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))
> +			folio_put(folio);

...is this really correct? I'd do rather this instead:

@@ -3393,22 +3393,21 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	 * case. If we left eb->folios[i] populated in the subpage case we'd
 	 * double put our reference and be super sad.
 	 */
-	for (int i = 0; i < attached; i++) {
-		ASSERT(eb->folios[i]);
-		detach_extent_buffer_folio(eb, eb->folios[i]);
+	for (int i = 0; i < num_extent_folios(eb); i++) {
+		if (i < attached) {
+			ASSERT(eb->folios[i]);
+			detach_extent_buffer_folio(eb, eb->folios[i]);
+		} else if (!eb->folios[i])
+			continue;
 		folio_unlock(eb->folios[i]);
 		folio_put(eb->folios[i]);
 		eb->folios[i] = NULL;

And perhaps `struct folio *folio = eb->folios[i];` first and substitute.

Or is this unrelated and we need both?

And honestly, IMO, there's no reason to set the EXTENT_BUFFER_UNMAPPED
at all after this loop as it's just going to be freed.

 	}
-	/*
-	 * Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
-	 * so it can be cleaned up without utilizing folio->mapping.
-	 */
-	set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
-
 	btrfs_release_extent_buffer(eb);
+
 	if (ret < 0)
 		return ERR_PTR(ret);
+
 	ASSERT(existing_eb);
 	return existing_eb;
 }

>  	}
>  }
>
> @@ -3365,8 +3370,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  	 * btree_release_folio will correctly detect that a page belongs to a
>  	 * live buffer and won't free them prematurely.
>  	 */
> -	for (int i = 0; i < num_extent_folios(eb); i++)
> +	for (int i = 0; i < num_extent_folios(eb); i++) {
>  		folio_unlock(eb->folios[i]);
> +		/*
> +		 * A folio that has been added to an address_space mapping
> +		 * should not continue holding the refcount from its original
> +		 * allocation indefinitely.
> +		 */
> +		folio_put(eb->folios[i]);
> +	}
>  	return eb;
>
> out:
> --
> 2.49.0
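A minimal sketch of the reference-donation pattern recommended in the mm thread linked above may help frame the discussion that follows. The helper name add_metadata_folio() and its parameters are invented for illustration and are not btrfs code; folio_alloc(), filemap_add_folio(), folio_attach_private() and folio_put() are real kernel APIs, and the refcount comments track the counts described in the commit message (assuming a single 0-order folio).

#include <linux/gfp.h>
#include <linux/pagemap.h>

/* Illustrative only, not from the patch. */
static int add_metadata_folio(struct address_space *mapping, pgoff_t index,
			      void *private)
{
	struct folio *folio;
	int ret;

	folio = folio_alloc(GFP_NOFS, 0);	/* refcount == 1 (allocation) */
	if (!folio)
		return -ENOMEM;

	ret = filemap_add_folio(mapping, folio, index, GFP_NOFS);
	if (ret) {
		/* Not inserted: we still own the allocation reference. */
		folio_put(folio);
		return ret;
	}
	/* refcount == 2: allocation + page cache (nr_pages == 1). */

	folio_attach_private(folio, private);	/* refcount == 3 */

	/*
	 * Donate the allocation reference to the page cache. At steady
	 * state the folio is then held by the mapping plus the private
	 * attachment, which is the count mapping_evict_folio() expects.
	 */
	folio_put(folio);			/* refcount == 2 */
	return 0;
}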
On Fri, Apr 18, 2025 at 10:24 AM Daniel Vacek <neelx@suse.com> wrote:
>
> On Fri, 18 Apr 2025 at 01:15, Boris Burkov <boris@bur.io> wrote:
> >
> > The (correct) commit
> > e41c81d0d30e ("mm/truncate: Replace page_mapped() call in invalidate_inode_page()")
> > replaced the page_mapped(page) check with a refcount check. However,
> > this refcount check does not work as expected with drop_caches for
> > btrfs's metadata pages.
> >
> > [...]
> >
> >  /* Release all folios attached to the extent buffer */
> > @@ -2821,9 +2822,13 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
> >  			continue;
> >
> >  		detach_extent_buffer_folio(eb, folio);
> > -
> > -		/* One for when we allocated the folio. */
> > -		folio_put(folio);
>
> So far so good, but...
>
> > +		/*
> > +		 * We only release the allocated refcount for mapped extent_buffer
> > +		 * folios. If the folio is unmapped, we have to release its allocated
> > +		 * refcount here.
> > +		 */
> > +		if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))
> > +			folio_put(folio);
>
> ...is this really correct? I'd do rather this instead:

Yes, it is correct. It does deal with cloned extent buffers, where we
need the extra put since we didn't do it after attaching the folio to
it.
Alternatively we could probably do the put after the attach at
btrfs_clone_extent_buffer(), just like for regular extent buffers, and
get rid of this special casing.

> @@ -3393,22 +3393,21 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  	 * case. If we left eb->folios[i] populated in the subpage case we'd
>  	 * double put our reference and be super sad.
>  	 */
> -	for (int i = 0; i < attached; i++) {
> -		ASSERT(eb->folios[i]);
> -		detach_extent_buffer_folio(eb, eb->folios[i]);
> +	for (int i = 0; i < num_extent_folios(eb); i++) {
> +		if (i < attached) {
> +			ASSERT(eb->folios[i]);
> +			detach_extent_buffer_folio(eb, eb->folios[i]);
> +		} else if (!eb->folios[i])
> +			continue;
>  		folio_unlock(eb->folios[i]);
>  		folio_put(eb->folios[i]);
>  		eb->folios[i] = NULL;
>
> And perhaps `struct folio *folio = eb->folios[i];` first and substitute.
>
> Or is this unrelated and we need both?

This is unrelated.
What you're trying to do here is to simplify the error path of the
allocation of a regular extent buffer.
Nothing to do with Boris' fix, and out of the scope of the fix.

> And honestly, IMO, there's no reason to set the EXTENT_BUFFER_UNMAPPED
> at all after this loop as it's just going to be freed.
>
> [...]
On Fri, 18 Apr 2025 at 12:50, Filipe Manana <fdmanana@kernel.org> wrote:
>
> On Fri, Apr 18, 2025 at 10:24 AM Daniel Vacek <neelx@suse.com> wrote:
> >
> > On Fri, 18 Apr 2025 at 01:15, Boris Burkov <boris@bur.io> wrote:
> > >
> > > The (correct) commit
> > > e41c81d0d30e ("mm/truncate: Replace page_mapped() call in invalidate_inode_page()")
> > > replaced the page_mapped(page) check with a refcount check. However,
> > > this refcount check does not work as expected with drop_caches for
> > > btrfs's metadata pages.
> > >
> > > [...]
> >
> > So far so good, but...
> >
> > > [...]
> >
> > ...is this really correct? I'd do rather this instead:
>
> Yes, it is correct. It does deal with cloned extent buffers, where we
> need the extra put since we didn't do it after attaching the folio to
> it.
> Alternatively we could probably do the put after the attach at
> btrfs_clone_extent_buffer(), just like for regular extent buffers, and
> get rid of this special casing.

Right. That sounds more correct actually.

> > [...]
> >
> > And perhaps `struct folio *folio = eb->folios[i];` first and substitute.
> >
> > Or is this unrelated and we need both?
>
> This is unrelated.
> What you're trying to do here is to simplify the error path of the
> allocation of a regular extent buffer.
> Nothing to do with Boris' fix, and out of the scope of the fix.

Thanks for the explanation. I'll send it as a separate patch then.

> > And honestly, IMO, there's no reason to set the EXTENT_BUFFER_UNMAPPED
> > at all after this loop as it's just going to be freed.
> >
> > [...]
On Fri, Apr 18, 2025 at 11:49:50AM +0100, Filipe Manana wrote:
> On Fri, Apr 18, 2025 at 10:24 AM Daniel Vacek <neelx@suse.com> wrote:
> >
> > On Fri, 18 Apr 2025 at 01:15, Boris Burkov <boris@bur.io> wrote:
> > >
> > > [...]
> >
> > So far so good, but...
> >
> > > +		/*
> > > +		 * We only release the allocated refcount for mapped extent_buffer
> > > +		 * folios. If the folio is unmapped, we have to release its allocated
> > > +		 * refcount here.
> > > +		 */
> > > +		if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))
> > > +			folio_put(folio);
> >
> > ...is this really correct? I'd do rather this instead:
>
> Yes, it is correct. It does deal with cloned extent buffers, where we
> need the extra put since we didn't do it after attaching the folio to
> it.
> Alternatively we could probably do the put after the attach at
> btrfs_clone_extent_buffer(), just like for regular extent buffers, and
> get rid of this special casing.
>

I didn't do it this way originally because I was hewing to the model
that we were donating the allocated refcount to the mapping when
inserting it (modeled after readahead) and I did not have a
corresponding model for these ebs which are not added to an
address_space.

I agree that your proposal results in simpler code so I will run it
through fstests as well. It's difficult to predict if there are any
negative side effects to these eb folios having one lower ref count at
steady state. It _seems_ like it ought to be fine :)

Thanks to both of you for the review,
Boris

> > [...]
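To make the alternative that Boris agrees to try above concrete, here is a hedged sketch, assuming a single 0-order folio and the regular (non-subpage) case where the folio private points at the extent buffer itself. attach_cloned_folio() is a hypothetical helper, not a function in fs/btrfs; folio_attach_private() and folio_put() are the real APIs.

#include <linux/pagemap.h>

#include "extent_io.h"

/*
 * Hypothetical: attach a freshly allocated folio to a cloned
 * (EXTENT_BUFFER_UNMAPPED) extent buffer and immediately hand the
 * allocation reference over to the private attachment, so the release
 * path would need no UNMAPPED special case.
 */
static void attach_cloned_folio(struct extent_buffer *eb, int i,
				struct folio *folio)
{
	folio_attach_private(folio, eb);	/* takes its own reference */
	eb->folios[i] = folio;
	/* Drop the allocation reference right after the attach. */
	folio_put(folio);
}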
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5f08615b334f..90499124b8a5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2752,6 +2752,7 @@ static bool folio_range_has_eb(struct folio *folio)
 static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
+	struct address_space *mapping = folio->mapping;
 	const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
 
 	/*
@@ -2759,11 +2760,11 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
 	 * be done under the i_private_lock.
 	 */
 	if (mapped)
-		spin_lock(&folio->mapping->i_private_lock);
+		spin_lock(&mapping->i_private_lock);
 
 	if (!folio_test_private(folio)) {
 		if (mapped)
-			spin_unlock(&folio->mapping->i_private_lock);
+			spin_unlock(&mapping->i_private_lock);
 		return;
 	}
 
@@ -2783,7 +2784,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
 			folio_detach_private(folio);
 		}
 		if (mapped)
-			spin_unlock(&folio->mapping->i_private_lock);
+			spin_unlock(&mapping->i_private_lock);
 		return;
 	}
 
@@ -2806,7 +2807,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
 	if (!folio_range_has_eb(folio))
 		btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
 
-	spin_unlock(&folio->mapping->i_private_lock);
+	spin_unlock(&mapping->i_private_lock);
 }
 
 /* Release all folios attached to the extent buffer */
@@ -2821,9 +2822,13 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
 			continue;
 
 		detach_extent_buffer_folio(eb, folio);
-
-		/* One for when we allocated the folio. */
-		folio_put(folio);
+		/*
+		 * We only release the allocated refcount for mapped extent_buffer
+		 * folios. If the folio is unmapped, we have to release its allocated
+		 * refcount here.
+		 */
+		if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))
+			folio_put(folio);
 	}
 }
 
@@ -3365,8 +3370,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	 * btree_release_folio will correctly detect that a page belongs to a
 	 * live buffer and won't free them prematurely.
 	 */
-	for (int i = 0; i < num_extent_folios(eb); i++)
+	for (int i = 0; i < num_extent_folios(eb); i++) {
 		folio_unlock(eb->folios[i]);
+		/*
+		 * A folio that has been added to an address_space mapping
+		 * should not continue holding the refcount from its original
+		 * allocation indefinitely.
+		 */
+		folio_put(eb->folios[i]);
+	}
 	return eb;
 
 out:
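One subtlety in the first hunk deserves spelling out: once the allocation reference has been donated, the private attachment may be the last thing pinning the folio, so folio_detach_private() can leave the folio free to disappear. That is why detach_extent_buffer_folio() now caches folio->mapping in a local before taking the lock. A condensed sketch of the hazard follows; detach_private_and_unlock() is a made-up function used purely to isolate the pattern, not code from the patch.

#include <linux/pagemap.h>
#include <linux/spinlock.h>

static void detach_private_and_unlock(struct folio *folio)
{
	/* Safe to read here: our private reference keeps the folio alive. */
	struct address_space *mapping = folio->mapping;

	spin_lock(&mapping->i_private_lock);
	folio_detach_private(folio);	/* drops our reference... */
	/*
	 * ...after which the folio may be freed at any moment, so
	 * folio->mapping must not be dereferenced again. Unlock through
	 * the cached pointer instead; the address_space belongs to the
	 * inode and outlives the folio.
	 */
	spin_unlock(&mapping->i_private_lock);
}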