diff mbox series

[v2,5/5] btrfs: prepare btrfs_punch_hole_lock_range() for large data folios

Message ID 86cc5b6bd21e489ea6838b01bb0948c0a19b2cb5.1743239672.git.wqu@suse.com (mailing list archive)
State New
Headers show
Series btrfs: add the missing preparations exposed by initial large data folio support | expand

Commit Message

Qu Wenruo March 29, 2025, 9:19 a.m. UTC
The function btrfs_punch_hole_lock_range() needs to make sure there is
no other folio in the range, thus it goes with filemap_range_has_page(),
which works pretty fine.

But if we have large folios, under the following case
filemap_range_has_page() will always return true, forcing
btrfs_punch_hole_lock_range() to do a very time consuming busy loop:

        start                            end
        |                                |
  |//|//|//|//|  |  |  |  |  |  |  |  |//|//|
   \         /                         \   /
    Folio A                            Folio B

In above case, folio A and B contain our start/end indexes, and there
are no other folios in the range.
Thus we do not need to retry inside btrfs_punch_hole_lock_range().

To prepare for large data folios, introduce a helper,
check_range_has_page(), which will:

- Shrink the search range towards page boundaries
  If the rounded down end (exclusive, otherwise it can underflow when @end
  is inside the folio at file offset 0) is no larger than the rounded up
  start, it means the range contains no other pages other than the ones
  covering @start and @end.

  Can return false directly in that case.

- Grab all the folios inside the range

- Skip any large folios that cover the start and end indexes

- If any other folios are found return true

- Otherwise return false

This new helper is going to handle both large folios and regular ones.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/file.c | 69 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 11 deletions(-)

Comments

Filipe Manana March 31, 2025, 11:50 a.m. UTC | #1
On Sat, Mar 29, 2025 at 9:20 AM Qu Wenruo <wqu@suse.com> wrote:
>
> The function btrfs_punch_hole_lock_range() needs to make sure there is
> no other folio in the range, thus it goes with filemap_range_has_page(),
> which works pretty fine.
>
> But if we have large folios, under the following case
> filemap_range_has_page() will always return true, forcing
> btrfs_punch_hole_lock_range() to do a very time consuming busy loop:
>
>         start                            end
>         |                                |
>   |//|//|//|//|  |  |  |  |  |  |  |  |//|//|
>    \         /                         \   /
>     Folio A                            Folio B
>
> In above case, folio A and B contain our start/end indexes, and there
> are no other folios in the range.
> Thus we do not need to retry inside btrfs_punch_hole_lock_range().
>
> To prepare for large data folios, introduce a helper,
> check_range_has_page(), which will:
>
> - Shrink the search range towards page boundaries
>   If the rounded down end (exclusive, otherwise it can underflow when @end
>   is inside the folio at file offset 0) is no larger than the rounded up
>   start, it means the range contains no other pages other than the ones
>   covering @start and @end.
>
>   Can return false directly in that case.
>
> - Grab all the folios inside the range
>
> - Skip any large folios that cover the start and end indexes
>
> - If any other folios are found return true
>
> - Otherwise return false
>
> This new helper is going to handle both large folios and regular ones.
>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/file.c | 69 +++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 58 insertions(+), 11 deletions(-)
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index a7afc55bab2a..bd0bb7aea99d 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2159,11 +2159,29 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
>         return ret;
>  }
>
> -static void btrfs_punch_hole_lock_range(struct inode *inode,
> -                                       const u64 lockstart,
> -                                       const u64 lockend,
> -                                       struct extent_state **cached_state)
> +/*
> + * The helper to check if there is no folio in the range.
> + *
> + * We can not utilized filemap_range_has_page() in a filemap with large folios
> + * as we can hit the following false positive:
> + *
> + *        start                            end
> + *        |                                |
> + *  |//|//|//|//|  |  |  |  |  |  |  |  |//|//|
> + *   \         /                         \   /
> + *    Folio A                            Folio B
> + *
> + * That large folio A and B cover the start and end indexes.
> + * In that case filemap_range_has_page() will always return true, but the above
> + * case is fine for btrfs_punch_hole_lock_range() usage.
> + *
> + * So here we only ensure that no other folios is in the range, excluding the
> + * head/tail large folio.
> + */
> +static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
>  {
> +       struct folio_batch fbatch;
> +       bool ret = false;
>         /*
>          * For subpage case, if the range is not at page boundary, we could
>          * have pages at the leading/tailing part of the range.
> @@ -2174,17 +2192,47 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
>          *
>          * And do not decrease page_lockend right now, as it can be 0.
>          */
> -       const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
> -       const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE);
> +       const u64 page_lockstart = round_up(start, PAGE_SIZE);
> +       const u64 page_lockend = round_down(end+ 1, PAGE_SIZE);

Missing space between 'end' and '+ 1'.

> +       const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
> +       const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
> +       pgoff_t tmp = start_index;
> +       int found_folios;
>
> +       /* The same page or adjacent pages. */
> +       if (page_lockend <= page_lockstart)
> +               return false;
> +
> +       folio_batch_init(&fbatch);
> +       found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index,
> +                                         &fbatch);
> +       for (int i = 0; i < found_folios; i++) {
> +               struct folio *folio = fbatch.folios[i];
> +
> +               /* A large folio begins before the start. Not a target. */
> +               if (folio->index < start_index)
> +                       continue;

We passed start_index (via tmp) to filemap_get_folios(). Isn't the
function supposed to return folios only at an index >= start_index?
It's what the documentation says and the implementation seems to
behave that way too.

Removing that we could also use start_index to pass to
filemap_get_folios(), making it non-const, and remove the tmp
variable.

Either way it looks good and that's a minor thing:

Reviewed-by: Filipe Manana <fdmanana@suse.com>

Thanks.

> +               /* A large folio extends beyond the end. Not a target. */
> +               if (folio->index + folio_nr_pages(folio) > end_index)
> +                       continue;
> +               /* A folio doesn't cover the head/tail index. Found a target. */
> +               ret = true;
> +               break;
> +       }
> +       folio_batch_release(&fbatch);
> +       return ret;
> +}
> +
> +static void btrfs_punch_hole_lock_range(struct inode *inode,
> +                                       const u64 lockstart,
> +                                       const u64 lockend,
> +                                       struct extent_state **cached_state)
> +{
>         while (1) {
>                 truncate_pagecache_range(inode, lockstart, lockend);
>
>                 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>                             cached_state);
> -               /* The same page or adjacent pages. */
> -               if (page_lockend <= page_lockstart)
> -                       break;
>                 /*
>                  * We can't have ordered extents in the range, nor dirty/writeback
>                  * pages, because we have locked the inode's VFS lock in exclusive
> @@ -2195,8 +2243,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
>                  * locking the range check if we have pages in the range, and if
>                  * we do, unlock the range and retry.
>                  */
> -               if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
> -                                           page_lockend - 1))
> +               if (!check_range_has_page(inode, lockstart, lockend))
>                         break;
>
>                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> --
> 2.49.0
>
>
Qu Wenruo March 31, 2025, 9:19 p.m. UTC | #2
在 2025/3/31 22:20, Filipe Manana 写道:
> On Sat, Mar 29, 2025 at 9:20 AM Qu Wenruo <wqu@suse.com> wrote:
[...]
>> +       found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index,
>> +                                         &fbatch);
>> +       for (int i = 0; i < found_folios; i++) {
>> +               struct folio *folio = fbatch.folios[i];
>> +
>> +               /* A large folio begins before the start. Not a target. */
>> +               if (folio->index < start_index)
>> +                       continue;
> 
> We passed start_index (via tmp) to filemap_get_folios(). Isn't the
> function supposed to return folios only at an index >= start_index?
> It's what the documentation says and the implementation seems to
> behave that way too.

Not exactly, comments of filemap_get_folios_tag() shows exactly this:

  * The first folio may start before @start; if it does, it will contain
  * @start.  The final folio may extend beyond @end; if it does, it will
  * contain @end.

This is exactly what we need to support large folios.

> 
> Removing that we could also use start_index to pass to
> filemap_get_folios(), making it non-const, and remove the tmp
> variable.

The reason I use @tmp is, filemap_get_folios() can update the @tmp to 
the next folio's start index, which makes us unable to use the original 
start_index to compare.

Thanks,
Qu

> 
> Either way it looks good and that's a minor thing:
> 
> Reviewed-by: Filipe Manana <fdmanana@suse.com>
> 
> Thanks.
> 
>> +               /* A large folio extends beyond the end. Not a target. */
>> +               if (folio->index + folio_nr_pages(folio) > end_index)
>> +                       continue;
>> +               /* A folio doesn't cover the head/tail index. Found a target. */
>> +               ret = true;
>> +               break;
>> +       }
>> +       folio_batch_release(&fbatch);
>> +       return ret;
>> +}
>> +
>> +static void btrfs_punch_hole_lock_range(struct inode *inode,
>> +                                       const u64 lockstart,
>> +                                       const u64 lockend,
>> +                                       struct extent_state **cached_state)
>> +{
>>          while (1) {
>>                  truncate_pagecache_range(inode, lockstart, lockend);
>>
>>                  lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>>                              cached_state);
>> -               /* The same page or adjacent pages. */
>> -               if (page_lockend <= page_lockstart)
>> -                       break;
>>                  /*
>>                   * We can't have ordered extents in the range, nor dirty/writeback
>>                   * pages, because we have locked the inode's VFS lock in exclusive
>> @@ -2195,8 +2243,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
>>                   * locking the range check if we have pages in the range, and if
>>                   * we do, unlock the range and retry.
>>                   */
>> -               if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
>> -                                           page_lockend - 1))
>> +               if (!check_range_has_page(inode, lockstart, lockend))
>>                          break;
>>
>>                  unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>> --
>> 2.49.0
>>
>>\
diff mbox series

Patch

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a7afc55bab2a..bd0bb7aea99d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2159,11 +2159,29 @@  static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
 	return ret;
 }
 
-static void btrfs_punch_hole_lock_range(struct inode *inode,
-					const u64 lockstart,
-					const u64 lockend,
-					struct extent_state **cached_state)
+/*
+ * The helper to check if there is no folio in the range.
+ *
+ * We can not utilized filemap_range_has_page() in a filemap with large folios
+ * as we can hit the following false positive:
+ *
+ *        start                            end
+ *        |                                |
+ *  |//|//|//|//|  |  |  |  |  |  |  |  |//|//|
+ *   \         /                         \   /
+ *    Folio A                            Folio B
+ *
+ * That large folio A and B cover the start and end indexes.
+ * In that case filemap_range_has_page() will always return true, but the above
+ * case is fine for btrfs_punch_hole_lock_range() usage.
+ *
+ * So here we only ensure that no other folios is in the range, excluding the
+ * head/tail large folio.
+ */
+static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
 {
+	struct folio_batch fbatch;
+	bool ret = false;
 	/*
 	 * For subpage case, if the range is not at page boundary, we could
 	 * have pages at the leading/tailing part of the range.
@@ -2174,17 +2192,47 @@  static void btrfs_punch_hole_lock_range(struct inode *inode,
 	 *
 	 * And do not decrease page_lockend right now, as it can be 0.
 	 */
-	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
-	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE);
+	const u64 page_lockstart = round_up(start, PAGE_SIZE);
+	const u64 page_lockend = round_down(end+ 1, PAGE_SIZE);
+	const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
+	const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
+	pgoff_t tmp = start_index;
+	int found_folios;
 
+	/* The same page or adjacent pages. */
+	if (page_lockend <= page_lockstart)
+		return false;
+
+	folio_batch_init(&fbatch);
+	found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index,
+					  &fbatch);
+	for (int i = 0; i < found_folios; i++) {
+		struct folio *folio = fbatch.folios[i];
+
+		/* A large folio begins before the start. Not a target. */
+		if (folio->index < start_index)
+			continue;
+		/* A large folio extends beyond the end. Not a target. */
+		if (folio->index + folio_nr_pages(folio) > end_index)
+			continue;
+		/* A folio doesn't cover the head/tail index. Found a target. */
+		ret = true;
+		break;
+	}
+	folio_batch_release(&fbatch);
+	return ret;
+}
+
+static void btrfs_punch_hole_lock_range(struct inode *inode,
+					const u64 lockstart,
+					const u64 lockend,
+					struct extent_state **cached_state)
+{
 	while (1) {
 		truncate_pagecache_range(inode, lockstart, lockend);
 
 		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			    cached_state);
-		/* The same page or adjacent pages. */
-		if (page_lockend <= page_lockstart)
-			break;
 		/*
 		 * We can't have ordered extents in the range, nor dirty/writeback
 		 * pages, because we have locked the inode's VFS lock in exclusive
@@ -2195,8 +2243,7 @@  static void btrfs_punch_hole_lock_range(struct inode *inode,
 		 * locking the range check if we have pages in the range, and if
 		 * we do, unlock the range and retry.
 		 */
-		if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
-					    page_lockend - 1))
+		if (!check_range_has_page(inode, lockstart, lockend))
 			break;
 
 		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,