diff mbox series

[1/7] btrfs: prevent inline data extents read from touching blocks beyond its range

Message ID 31a8b02ec99fd3bf6399b7f543ef6de786665fd2.1740354271.git.wqu@suse.com (mailing list archive)
State New
Headers show
Series btrfs: make subpage handling be feature full | expand

Commit Message

Qu Wenruo Feb. 23, 2025, 11:46 p.m. UTC
Currently reading an inline data extent will zero out all the remaining
range in the page.

This is not yet causing problems even for block size < page size
(subpage) cases because:

1) An inline data extent always starts at file offset 0
   Meaning at page read, we always read the inline extent first, before
   any other blocks in the page. Then later blocks are properly read out
   and re-fill the zeroed out ranges.

2) Currently btrfs will read out the whole page if a buffered write is
   not page aligned
   So a page is either fully uptodate at buffered write time (covers the
   whole page), or we will read out the whole page first.
   Meaning there is nothing to lose for such an inline extent read.

But it's still not ideal:

- We're zeroing out the page twice
  One done by read_inline_extent()/uncompress_inline(), one done by
  btrfs_do_readpage() for ranges beyond i_size.

- We're touching blocks that don't belong to the inline extent
  In the incoming patches, we can have a partially uptodate folio, where
  some dirty blocks can exist while the page is not fully uptodate:

  The page size is 16K and block size is 4K:

  0         4K        8K        12K        16K
  |         |         |/////////|          |

  And range [8K, 12K) is dirtied by a buffered write, while the remaining
  blocks are not uptodate.

  If range [0, 4K) contains an inline data extent, and we try to read
  the whole page, the current behavior will overwrite range [8K, 12K)
  with zero and cause data loss.

So to make the behavior more consistent and in preparation for future
changes, limit inline data extent reads to only zero out the range
inside the first block, not the whole page.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/inode.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

Comments

Filipe Manana Feb. 25, 2025, 3:07 p.m. UTC | #1
On Sun, Feb 23, 2025 at 11:46 PM Qu Wenruo <wqu@suse.com> wrote:
>
> Currently reading an inline data extent will zero out all the remaining
> range in the page.
>
> This is not yet causing problems even for block size < page size
> (subpage) cases because:
>
> 1) An inline data extent always starts at file offset 0
>    Meaning at page read, we always read the inline extent first, before
>    any other blocks in the page. Then later blocks are properly read out
>    and re-fill the zeroed out ranges.
>
> 2) Currently btrfs will read out the whole page if a buffered write is
>    not page aligned
>    So a page is either fully uptodate at buffered write time (covers the
>    whole page), or we will read out the whole page first.
>    Meaning there is nothing to lose for such an inline extent read.
>
> But it's still not ideal:
>
> - We're zeroing out the page twice
>   One done by read_inline_extent()/uncompress_inline(), one done by
>   btrfs_do_readpage() for ranges beyond i_size.
>
> - We're touching blocks that doesn't belong to the inline extent
>   In the incoming patches, we can have a partial uptodate folio, that
>   some dirty blocks can exist while the page is not fully uptodate:
>
>   The page size is 16K and block size is 4K:
>
>   0         4K        8K        12K        16K
>   |         |         |/////////|          |
>
>   And range [8K, 12K) is dirtied by a buffered write, the remaining
>   blocks are not uptodate.
>
>   If range [0, 4K) contains an inline data extent, and we try to read
>   the whole page, the current behavior will overwrite range [8K, 12K)
>   with zero and cause data loss.
>
> So to make the behavior more consistent and in preparation for future
> changes, limit the inline data extents read to only zero out the range
> inside the first block, not the whole page.
>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/inode.c | 15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index c432ccfba56e..fe2c6038064a 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -6788,6 +6788,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
>  {
>         int ret;
>         struct extent_buffer *leaf = path->nodes[0];
> +       const u32 sectorsize = leaf->fs_info->sectorsize;
>         char *tmp;
>         size_t max_size;
>         unsigned long inline_size;
> @@ -6816,17 +6817,19 @@ static noinline int uncompress_inline(struct btrfs_path *path,
>          * cover that region here.
>          */
>
> -       if (max_size < PAGE_SIZE)
> -               folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
> +       if (max_size < sectorsize)
> +               folio_zero_range(folio, max_size, sectorsize - max_size);
>         kfree(tmp);
>         return ret;
>  }
>
> -static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
> +static int read_inline_extent(struct btrfs_fs_info *fs_info,
> +                             struct btrfs_path *path, struct folio *folio)
>  {
>         struct btrfs_file_extent_item *fi;
>         void *kaddr;
>         size_t copy_size;
> +       const u32 sectorsize = fs_info->sectorsize;

There's no need to add the fs_info argument just to get the sectorsize.
We can get it like this:

const u32 sectorsize = path->nodes[0]->fs_info->sectorsize;

Otherwise it looks good, so:

Reviewed-by: Filipe Manana <fdmanana@suse.com>

Thanks.



>
>         if (!folio || folio_test_uptodate(folio))
>                 return 0;
> @@ -6844,8 +6847,8 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
>         read_extent_buffer(path->nodes[0], kaddr,
>                            btrfs_file_extent_inline_start(fi), copy_size);
>         kunmap_local(kaddr);
> -       if (copy_size < PAGE_SIZE)
> -               folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
> +       if (copy_size < sectorsize)
> +               folio_zero_range(folio, copy_size, sectorsize - copy_size);
>         return 0;
>  }
>
> @@ -7020,7 +7023,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
>                 ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
>                 ASSERT(em->len == fs_info->sectorsize);
>
> -               ret = read_inline_extent(path, folio);
> +               ret = read_inline_extent(fs_info, path, folio);
>                 if (ret < 0)
>                         goto out;
>                 goto insert;
> --
> 2.48.1
>
>
Filipe Manana Feb. 25, 2025, 4:10 p.m. UTC | #2
On Tue, Feb 25, 2025 at 3:07 PM Filipe Manana <fdmanana@kernel.org> wrote:
>
> On Sun, Feb 23, 2025 at 11:46 PM Qu Wenruo <wqu@suse.com> wrote:
> >
> > Currently reading an inline data extent will zero out all the remaining
> > range in the page.
> >
> > This is not yet causing problems even for block size < page size
> > (subpage) cases because:
> >
> > 1) An inline data extent always starts at file offset 0
> >    Meaning at page read, we always read the inline extent first, before
> >    any other blocks in the page. Then later blocks are properly read out
> >    and re-fill the zeroed out ranges.
> >
> > 2) Currently btrfs will read out the whole page if a buffered write is
> >    not page aligned
> >    So a page is either fully uptodate at buffered write time (covers the
> >    whole page), or we will read out the whole page first.
> >    Meaning there is nothing to lose for such an inline extent read.
> >
> > But it's still not ideal:
> >
> > - We're zeroing out the page twice
> >   One done by read_inline_extent()/uncompress_inline(), one done by
> >   btrfs_do_readpage() for ranges beyond i_size.
> >
> > - We're touching blocks that doesn't belong to the inline extent
> >   In the incoming patches, we can have a partial uptodate folio, that
> >   some dirty blocks can exist while the page is not fully uptodate:
> >
> >   The page size is 16K and block size is 4K:
> >
> >   0         4K        8K        12K        16K
> >   |         |         |/////////|          |
> >
> >   And range [8K, 12K) is dirtied by a buffered write, the remaining
> >   blocks are not uptodate.
> >
> >   If range [0, 4K) contains an inline data extent, and we try to read
> >   the whole page, the current behavior will overwrite range [8K, 12K)
> >   with zero and cause data loss.
> >
> > So to make the behavior more consistent and in preparation for future
> > changes, limit the inline data extents read to only zero out the range
> > inside the first block, not the whole page.
> >
> > Signed-off-by: Qu Wenruo <wqu@suse.com>
> > ---
> >  fs/btrfs/inode.c | 15 +++++++++------
> >  1 file changed, 9 insertions(+), 6 deletions(-)
> >
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index c432ccfba56e..fe2c6038064a 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -6788,6 +6788,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
> >  {
> >         int ret;
> >         struct extent_buffer *leaf = path->nodes[0];
> > +       const u32 sectorsize = leaf->fs_info->sectorsize;
> >         char *tmp;
> >         size_t max_size;
> >         unsigned long inline_size;
> > @@ -6816,17 +6817,19 @@ static noinline int uncompress_inline(struct btrfs_path *path,
> >          * cover that region here.
> >          */
> >
> > -       if (max_size < PAGE_SIZE)
> > -               folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
> > +       if (max_size < sectorsize)
> > +               folio_zero_range(folio, max_size, sectorsize - max_size);

Right above this we have:

max_size = min_t(unsigned long, PAGE_SIZE, max_size);
ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
       max_size);

Shouldn't we replace PAGE_SIZE with sectorsize too? Why not?

And there's a similar case at read_inline_extent() too.

Thanks.


> >         kfree(tmp);
> >         return ret;
> >  }
> >
> > -static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
> > +static int read_inline_extent(struct btrfs_fs_info *fs_info,
> > +                             struct btrfs_path *path, struct folio *folio)
> >  {
> >         struct btrfs_file_extent_item *fi;
> >         void *kaddr;
> >         size_t copy_size;
> > +       const u32 sectorsize = fs_info->sectorsize;
>
> There's no need to add the fs_info argument just to get the sectorsize.
> We can get it like this:
>
> const u32 sectorsize = path->nodes[0]->fs_info->sectorsize;
>
> Otherwise it looks good, so:
>
> Reviewed-by: Filipe Manana <fdmanana@suse.com>
>
> Thanks.
>
>
>
> >
> >         if (!folio || folio_test_uptodate(folio))
> >                 return 0;
> > @@ -6844,8 +6847,8 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
> >         read_extent_buffer(path->nodes[0], kaddr,
> >                            btrfs_file_extent_inline_start(fi), copy_size);
> >         kunmap_local(kaddr);
> > -       if (copy_size < PAGE_SIZE)
> > -               folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
> > +       if (copy_size < sectorsize)
> > +               folio_zero_range(folio, copy_size, sectorsize - copy_size);
> >         return 0;
> >  }
> >
> > @@ -7020,7 +7023,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
> >                 ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
> >                 ASSERT(em->len == fs_info->sectorsize);
> >
> > -               ret = read_inline_extent(path, folio);
> > +               ret = read_inline_extent(fs_info, path, folio);
> >                 if (ret < 0)
> >                         goto out;
> >                 goto insert;
> > --
> > 2.48.1
> >
> >
diff mbox series

Patch

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c432ccfba56e..fe2c6038064a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6788,6 +6788,7 @@  static noinline int uncompress_inline(struct btrfs_path *path,
 {
 	int ret;
 	struct extent_buffer *leaf = path->nodes[0];
+	const u32 sectorsize = leaf->fs_info->sectorsize;
 	char *tmp;
 	size_t max_size;
 	unsigned long inline_size;
@@ -6816,17 +6817,19 @@  static noinline int uncompress_inline(struct btrfs_path *path,
 	 * cover that region here.
 	 */
 
-	if (max_size < PAGE_SIZE)
-		folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
+	if (max_size < sectorsize)
+		folio_zero_range(folio, max_size, sectorsize - max_size);
 	kfree(tmp);
 	return ret;
 }
 
-static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
+static int read_inline_extent(struct btrfs_fs_info *fs_info,
+			      struct btrfs_path *path, struct folio *folio)
 {
 	struct btrfs_file_extent_item *fi;
 	void *kaddr;
 	size_t copy_size;
+	const u32 sectorsize = fs_info->sectorsize;
 
 	if (!folio || folio_test_uptodate(folio))
 		return 0;
@@ -6844,8 +6847,8 @@  static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
 	read_extent_buffer(path->nodes[0], kaddr,
 			   btrfs_file_extent_inline_start(fi), copy_size);
 	kunmap_local(kaddr);
-	if (copy_size < PAGE_SIZE)
-		folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
+	if (copy_size < sectorsize)
+		folio_zero_range(folio, copy_size, sectorsize - copy_size);
 	return 0;
 }
 
@@ -7020,7 +7023,7 @@  struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 		ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
 		ASSERT(em->len == fs_info->sectorsize);
 
-		ret = read_inline_extent(path, folio);
+		ret = read_inline_extent(fs_info, path, folio);
 		if (ret < 0)
 			goto out;
 		goto insert;