diff mbox series

[v4] iomap: support tail packing inline read

Message ID 20210720133554.44058-1-hsiangkao@linux.alibaba.com (mailing list archive)
State New, archived
Headers show
Series [v4] iomap: support tail packing inline read | expand

Commit Message

Gao Xiang July 20, 2021, 1:35 p.m. UTC
This tries to add tail packing inline read to iomap, which can support
several inline tail blocks. Similar to the previous approach, it cleans
post-EOF in one iteration.

The write path remains untouched since EROFS cannot be used for testing.
It'd be better to be implemented if upcoming real users care rather than
leave untested dead code around.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Andreas Gruenbacher <andreas.gruenbacher@gmail.com>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
v3: https://lore.kernel.org/r/20210719144747.189634-1-hsiangkao@linux.alibaba.com
changes since v3:
 - update return value type of iomap_read_inline_data to int;
 - fix iomap_write_begin_inline() pointed out by Andreas.

 fs/iomap/buffered-io.c | 52 ++++++++++++++++++++++++++----------------
 fs/iomap/direct-io.c   | 11 +++++----
 2 files changed, 39 insertions(+), 24 deletions(-)

Comments

Darrick J. Wong July 20, 2021, 8:42 p.m. UTC | #1
On Tue, Jul 20, 2021 at 09:35:54PM +0800, Gao Xiang wrote:
> This tries to add tail packing inline read to iomap, which can support
> several inline tail blocks. Similar to the previous approach, it cleans
> post-EOF in one iteration.
> 
> The write path remains untouched since EROFS cannot be used for testing.
> It'd be better to be implemented if upcoming real users care rather than
> leave untested dead code around.
> 
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Darrick J. Wong <djwong@kernel.org>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Andreas Gruenbacher <andreas.gruenbacher@gmail.com>
> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> ---
> v3: https://lore.kernel.org/r/20210719144747.189634-1-hsiangkao@linux.alibaba.com
> changes since v3:
>  - update return value type of iomap_read_inline_data to int;
>  - fix iomap_write_begin_inline() pointed out by Andreas.
> 
>  fs/iomap/buffered-io.c | 52 ++++++++++++++++++++++++++----------------
>  fs/iomap/direct-io.c   | 11 +++++----
>  2 files changed, 39 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 87ccb3438bec..0edc8bbb35d1 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -205,25 +205,25 @@ struct iomap_readpage_ctx {
>  	struct readahead_control *rac;
>  };
>  
> -static void
> +static int
>  iomap_read_inline_data(struct inode *inode, struct page *page,
> -		struct iomap *iomap)
> +		struct iomap *iomap, loff_t pos)
>  {
> -	size_t size = i_size_read(inode);
> +	unsigned int size, poff = offset_in_page(pos);
>  	void *addr;
>  
> -	if (PageUptodate(page))
> -		return;
> -
> -	BUG_ON(page_has_private(page));
> -	BUG_ON(page->index);
> -	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
> +	/* inline source data must be inside a single page */
> +	BUG_ON(iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data));

Can we reduce the strength of these checks to a warning and an -EIO
return?

> +	/* handle tail-packing blocks cross the current page into the next */
> +	size = min_t(unsigned int, iomap->length + pos - iomap->offset,
> +		     PAGE_SIZE - poff);
>  
>  	addr = kmap_atomic(page);
> -	memcpy(addr, iomap->inline_data, size);
> -	memset(addr + size, 0, PAGE_SIZE - size);
> +	memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size);
> +	memset(addr + poff + size, 0, PAGE_SIZE - poff - size);

Hmm, so I guess the point of this is to support reading data from a
tail-packing block, where each file gets some arbitrary byte range
within the tp-block, and the range isn't aligned to an fs block?  Hence
you have to use the inline data code to read the relevant bytes and copy
them into the pagecache?

Aka this thing from the v3 discussion:

> The other one is actual file tail blocks, I think it can cross pages due
> to multiple tail inline blocks.
>
>                             |<---------- inline data ------------->|
>   _________________________________________________________________
>   | file block | file block | file block | file block | file block |
>   |<----------------    page   ---------------------->|<---  page

Except ... is this diagram a little misleading?  Each of these "file
blocks" isn't i_blocksize bytes in size, right?  Because if they were,
you could use the standard iomap codepaths?

So the real layout might look a bit more like this?

                                |<--- inline data ---->|
  _________________________________________________________________
  | file1 |     file2     |file3|  file4  |   file4    |
  |<-------------   page   -------------->|<---  page ----...

(Sorry, /me isn't all that familiar with erofs layout...)

>  	kunmap_atomic(addr);
> -	SetPageUptodate(page);
> +	iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff);
> +	return PAGE_SIZE - poff;
>  }
>  
>  static inline bool iomap_block_needs_zeroing(struct inode *inode,
> @@ -246,18 +246,18 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  	unsigned poff, plen;
>  	sector_t sector;
>  
> -	if (iomap->type == IOMAP_INLINE) {
> -		WARN_ON_ONCE(pos);
> -		iomap_read_inline_data(inode, page, iomap);
> -		return PAGE_SIZE;
> -	}
> -
> -	/* zero post-eof blocks as the page may be mapped */
>  	iop = iomap_page_create(inode, page);
> +	/* needs to skip some leading uptodate blocks */
>  	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
>  	if (plen == 0)
>  		goto done;
>  
> +	if (iomap->type == IOMAP_INLINE) {
> +		plen = iomap_read_inline_data(inode, page, iomap, pos);
> +		goto done;
> +	}
> +
> +	/* zero post-eof blocks as the page may be mapped */
>  	if (iomap_block_needs_zeroing(inode, iomap, pos)) {
>  		zero_user(page, poff, plen);
>  		iomap_set_range_uptodate(page, poff, plen);
> @@ -589,6 +589,18 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
>  	return 0;
>  }
>  
> +static int iomap_write_begin_inline(struct inode *inode, loff_t pos,
> +		struct page *page, struct iomap *srcmap)
> +{
> +	/* needs more work for the tailpacking case, disable for now */
> +	if (WARN_ON_ONCE(srcmap->offset != 0))
> +		return -EIO;
> +	if (PageUptodate(page))
> +		return 0;
> +	iomap_read_inline_data(inode, page, srcmap, 0);
> +	return 0;
> +}
> +
>  static int
>  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
>  		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
> @@ -618,7 +630,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
>  	}
>  
>  	if (srcmap->type == IOMAP_INLINE)
> -		iomap_read_inline_data(inode, page, srcmap);
> +		status = iomap_write_begin_inline(inode, pos, page, srcmap);
>  	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
>  		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
>  	else
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 9398b8c31323..ee6309967b77 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -379,22 +379,25 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
>  {
>  	struct iov_iter *iter = dio->submit.iter;
>  	size_t copied;
> +	void *dst = iomap->inline_data + pos - iomap->offset;
>  
> -	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
> +	/* inline data must be inside a single page */
> +	BUG_ON(length > PAGE_SIZE - offset_in_page(iomap->inline_data));

Same here, can we convert these to warnings + EIO return?

--D

>  	if (dio->flags & IOMAP_DIO_WRITE) {
>  		loff_t size = inode->i_size;
>  
>  		if (pos > size)
> -			memset(iomap->inline_data + size, 0, pos - size);
> -		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
> +			memset(iomap->inline_data + size - iomap->offset,
> +			       0, pos - size);
> +		copied = copy_from_iter(dst, length, iter);
>  		if (copied) {
>  			if (pos + copied > size)
>  				i_size_write(inode, pos + copied);
>  			mark_inode_dirty(inode);
>  		}
>  	} else {
> -		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
> +		copied = copy_to_iter(dst, length, iter);
>  	}
>  	dio->size += copied;
>  	return copied;
> -- 
> 2.24.4
>
Matthew Wilcox (Oracle) July 20, 2021, 9:18 p.m. UTC | #2
On Tue, Jul 20, 2021 at 01:42:24PM -0700, Darrick J. Wong wrote:
> > -	BUG_ON(page_has_private(page));
> > -	BUG_ON(page->index);
> > -	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
> > +	/* inline source data must be inside a single page */
> > +	BUG_ON(iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data));
> 
> Can we reduce the strength of these checks to a warning and an -EIO
> return?

I'm not entirely sure that we need this check, tbh.

> > +	/* handle tail-packing blocks cross the current page into the next */
> > +	size = min_t(unsigned int, iomap->length + pos - iomap->offset,
> > +		     PAGE_SIZE - poff);
> >  
> >  	addr = kmap_atomic(page);
> > -	memcpy(addr, iomap->inline_data, size);
> > -	memset(addr + size, 0, PAGE_SIZE - size);
> > +	memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size);
> > +	memset(addr + poff + size, 0, PAGE_SIZE - poff - size);
> 
> Hmm, so I guess the point of this is to support reading data from a
> tail-packing block, where each file gets some arbitrary byte range
> within the tp-block, and the range isn't aligned to an fs block?  Hence
> you have to use the inline data code to read the relevant bytes and copy
> them into the pagecache?

I think there are two distinct cases for IOMAP_INLINE.  One is
where the tail of the file is literally embedded into the inode.
Like ext4 fast symbolic links.  Taking the ext4 i_blocks layout
as an example, you could have a 4kB block stored in i_block[0]
and then store bytes 4096-4151 in i_block[1-14] (although reading
https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html
makes me think that ext4 only supports storing 0-59 in the i_blocks;
it doesn't support 0-4095 in i_block[0] and then 4096-4151 in i_blocks)

The other is what I think erofs is doing where, for example, you'd
specify in i_block[1] the block which contains the tail and then in
i_block[2] what offset of the block the tail starts at.
Gao Xiang July 20, 2021, 11:46 p.m. UTC | #3
Hi Darrick,

On Tue, Jul 20, 2021 at 01:42:24PM -0700, Darrick J. Wong wrote:
> On Tue, Jul 20, 2021 at 09:35:54PM +0800, Gao Xiang wrote:
> > This tries to add tail packing inline read to iomap, which can support
> > several inline tail blocks. Similar to the previous approach, it cleans
> > post-EOF in one iteration.
> > 
> > The write path remains untouched since EROFS cannot be used for testing.
> > It'd be better to be implemented if upcoming real users care rather than
> > leave untested dead code around.
> > 
> > Cc: Christoph Hellwig <hch@lst.de>
> > Cc: Darrick J. Wong <djwong@kernel.org>
> > Cc: Matthew Wilcox <willy@infradead.org>
> > Cc: Andreas Gruenbacher <andreas.gruenbacher@gmail.com>
> > Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> > ---
> > v3: https://lore.kernel.org/r/20210719144747.189634-1-hsiangkao@linux.alibaba.com
> > changes since v3:
> >  - update return value type of iomap_read_inline_data to int;
> >  - fix iomap_write_begin_inline() pointed out by Andreas.
> > 
> >  fs/iomap/buffered-io.c | 52 ++++++++++++++++++++++++++----------------
> >  fs/iomap/direct-io.c   | 11 +++++----
> >  2 files changed, 39 insertions(+), 24 deletions(-)
> > 
> > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> > index 87ccb3438bec..0edc8bbb35d1 100644
> > --- a/fs/iomap/buffered-io.c
> > +++ b/fs/iomap/buffered-io.c
> > @@ -205,25 +205,25 @@ struct iomap_readpage_ctx {
> >  	struct readahead_control *rac;
> >  };
> >  
> > -static void
> > +static int
> >  iomap_read_inline_data(struct inode *inode, struct page *page,
> > -		struct iomap *iomap)
> > +		struct iomap *iomap, loff_t pos)
> >  {
> > -	size_t size = i_size_read(inode);
> > +	unsigned int size, poff = offset_in_page(pos);
> >  	void *addr;
> >  
> > -	if (PageUptodate(page))
> > -		return;
> > -
> > -	BUG_ON(page_has_private(page));
> > -	BUG_ON(page->index);
> > -	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
> > +	/* inline source data must be inside a single page */
> > +	BUG_ON(iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data));
> 
> Can we reduce the strength of these checks to a warning and an -EIO
> return?

Ok, will update it.

> 
> > +	/* handle tail-packing blocks cross the current page into the next */
> > +	size = min_t(unsigned int, iomap->length + pos - iomap->offset,
> > +		     PAGE_SIZE - poff);
> >  
> >  	addr = kmap_atomic(page);
> > -	memcpy(addr, iomap->inline_data, size);
> > -	memset(addr + size, 0, PAGE_SIZE - size);
> > +	memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size);
> > +	memset(addr + poff + size, 0, PAGE_SIZE - poff - size);
> 
> Hmm, so I guess the point of this is to support reading data from a
> tail-packing block, where each file gets some arbitrary byte range
> within the tp-block, and the range isn't aligned to an fs block?  Hence
> you have to use the inline data code to read the relevant bytes and copy
> them into the pagecache?

Yes, the source data isn't aligned to an fs block.

> 
> Aka this thing from the v3 discussion:
> 
> > The other one is actual file tail blocks, I think it can cross pages due
> > to multiple tail inline blocks.
> >
> >                             |<---------- inline data ------------->|
> >   _________________________________________________________________
> >   | file block | file block | file block | file block | file block |
> >   |<----------------    page   ---------------------->|<---  page
> 
> Except ... is this diagram a little misleading?  Each of these "file
> blocks" isn't i_blocksize bytes in size, right?  Because if they were,
> you could use the standard iomap codepaths?

The disgram above describe logical file extents.

The real physical layout is like this:
  _________________________________________________________
 | ... | inode |     inline data     | other inodes | .... |
               |<- arbitary length ->|
> 
> So the real layout might look a bit more like this?
> 
>                                 |<--- inline data ---->|
>   _________________________________________________________________
>   | file1 |     file2     |file3|  file4  |   file4    |
>   |<-------------   page   -------------->|<---  page ----...
> 
> (Sorry, /me isn't all that familiar with erofs layout...)

Nope, that is what erofs is like, erofs tail packing data inline with
inode itself, so when reading inode, the cache page itself can read
the tail blocks as well. When it read tail blocks again, it can save
I/O due to buffered before. Also this approach can save storage space
since it saves entire tail blocks as well.

> 
> >  	kunmap_atomic(addr);
> > -	SetPageUptodate(page);
> > +	iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff);
> > +	return PAGE_SIZE - poff;
> >  }
> >  
> >  static inline bool iomap_block_needs_zeroing(struct inode *inode,
> > @@ -246,18 +246,18 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> >  	unsigned poff, plen;
> >  	sector_t sector;
> >  
> > -	if (iomap->type == IOMAP_INLINE) {
> > -		WARN_ON_ONCE(pos);
> > -		iomap_read_inline_data(inode, page, iomap);
> > -		return PAGE_SIZE;
> > -	}
> > -
> > -	/* zero post-eof blocks as the page may be mapped */
> >  	iop = iomap_page_create(inode, page);
> > +	/* needs to skip some leading uptodate blocks */
> >  	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
> >  	if (plen == 0)
> >  		goto done;
> >  
> > +	if (iomap->type == IOMAP_INLINE) {
> > +		plen = iomap_read_inline_data(inode, page, iomap, pos);
> > +		goto done;
> > +	}
> > +
> > +	/* zero post-eof blocks as the page may be mapped */
> >  	if (iomap_block_needs_zeroing(inode, iomap, pos)) {
> >  		zero_user(page, poff, plen);
> >  		iomap_set_range_uptodate(page, poff, plen);
> > @@ -589,6 +589,18 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
> >  	return 0;
> >  }
> >  
> > +static int iomap_write_begin_inline(struct inode *inode, loff_t pos,
> > +		struct page *page, struct iomap *srcmap)
> > +{
> > +	/* needs more work for the tailpacking case, disable for now */
> > +	if (WARN_ON_ONCE(srcmap->offset != 0))
> > +		return -EIO;
> > +	if (PageUptodate(page))
> > +		return 0;
> > +	iomap_read_inline_data(inode, page, srcmap, 0);
> > +	return 0;
> > +}
> > +
> >  static int
> >  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
> >  		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
> > @@ -618,7 +630,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
> >  	}
> >  
> >  	if (srcmap->type == IOMAP_INLINE)
> > -		iomap_read_inline_data(inode, page, srcmap);
> > +		status = iomap_write_begin_inline(inode, pos, page, srcmap);
> >  	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
> >  		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
> >  	else
> > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> > index 9398b8c31323..ee6309967b77 100644
> > --- a/fs/iomap/direct-io.c
> > +++ b/fs/iomap/direct-io.c
> > @@ -379,22 +379,25 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
> >  {
> >  	struct iov_iter *iter = dio->submit.iter;
> >  	size_t copied;
> > +	void *dst = iomap->inline_data + pos - iomap->offset;
> >  
> > -	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
> > +	/* inline data must be inside a single page */
> > +	BUG_ON(length > PAGE_SIZE - offset_in_page(iomap->inline_data));
> 
> Same here, can we convert these to warnings + EIO return?

Sure, I could update it as well.

Thanks,
Gao Xiang

> 
> --D
> 
> >  	if (dio->flags & IOMAP_DIO_WRITE) {
> >  		loff_t size = inode->i_size;
> >  
> >  		if (pos > size)
> > -			memset(iomap->inline_data + size, 0, pos - size);
> > -		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
> > +			memset(iomap->inline_data + size - iomap->offset,
> > +			       0, pos - size);
> > +		copied = copy_from_iter(dst, length, iter);
> >  		if (copied) {
> >  			if (pos + copied > size)
> >  				i_size_write(inode, pos + copied);
> >  			mark_inode_dirty(inode);
> >  		}
> >  	} else {
> > -		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
> > +		copied = copy_to_iter(dst, length, iter);
> >  	}
> >  	dio->size += copied;
> >  	return copied;
> > -- 
> > 2.24.4
> >
Gao Xiang July 21, 2021, 12:03 a.m. UTC | #4
On Tue, Jul 20, 2021 at 10:18:54PM +0100, Matthew Wilcox wrote:
> On Tue, Jul 20, 2021 at 01:42:24PM -0700, Darrick J. Wong wrote:
> > > -	BUG_ON(page_has_private(page));
> > > -	BUG_ON(page->index);
> > > -	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
> > > +	/* inline source data must be inside a single page */
> > > +	BUG_ON(iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data));
> > 
> > Can we reduce the strength of these checks to a warning and an -EIO
> > return?
> 
> I'm not entirely sure that we need this check, tbh.

I'm fine to get rid of this check, it just inherited from:
 - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));

It has no real effect, but when reading INLINE extent, its .iomap_begin()
does:
	iomap->private = erofs_get_meta_page()	/* get meta page */

and in the .iomap_end(), it does:
	struct page *ipage = iomap->private;
	if (ipage) {
		unlock_page(ipage);
		put_page(ipage);
	}

> 
> > > +	/* handle tail-packing blocks cross the current page into the next */
> > > +	size = min_t(unsigned int, iomap->length + pos - iomap->offset,
> > > +		     PAGE_SIZE - poff);
> > >  
> > >  	addr = kmap_atomic(page);
> > > -	memcpy(addr, iomap->inline_data, size);
> > > -	memset(addr + size, 0, PAGE_SIZE - size);
> > > +	memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size);
> > > +	memset(addr + poff + size, 0, PAGE_SIZE - poff - size);
> > 
> > Hmm, so I guess the point of this is to support reading data from a
> > tail-packing block, where each file gets some arbitrary byte range
> > within the tp-block, and the range isn't aligned to an fs block?  Hence
> > you have to use the inline data code to read the relevant bytes and copy
> > them into the pagecache?
> 
> I think there are two distinct cases for IOMAP_INLINE.  One is
> where the tail of the file is literally embedded into the inode.
> Like ext4 fast symbolic links.  Taking the ext4 i_blocks layout
> as an example, you could have a 4kB block stored in i_block[0]
> and then store bytes 4096-4151 in i_block[1-14] (although reading
> https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html
> makes me think that ext4 only supports storing 0-59 in the i_blocks;
> it doesn't support 0-4095 in i_block[0] and then 4096-4151 in i_blocks)
> 
> The other is what I think erofs is doing where, for example, you'd
> specify in i_block[1] the block which contains the tail and then in
> i_block[2] what offset of the block the tail starts at.

Nope, EROFS inline data is embedded into the inode in order to save
I/O as well as space (maybe I didn't express clear before [1]). 

I understand the other one, but it can only save storage space but
cannot save I/O (we still need another independent I/O to read its
meta buffered page).

In the view of INLINE extent itself, I think both ways can be
supported with this approach.

[1] https://www.kernel.org/doc/html/latest/filesystems/erofs.html
    "On-disk details" section.

Thanks,
Gao Xiang
Darrick J. Wong July 21, 2021, 12:17 a.m. UTC | #5
On Wed, Jul 21, 2021 at 08:03:44AM +0800, Gao Xiang wrote:
> On Tue, Jul 20, 2021 at 10:18:54PM +0100, Matthew Wilcox wrote:
> > On Tue, Jul 20, 2021 at 01:42:24PM -0700, Darrick J. Wong wrote:
> > > > -	BUG_ON(page_has_private(page));
> > > > -	BUG_ON(page->index);
> > > > -	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
> > > > +	/* inline source data must be inside a single page */
> > > > +	BUG_ON(iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data));
> > > 
> > > Can we reduce the strength of these checks to a warning and an -EIO
> > > return?
> > 
> > I'm not entirely sure that we need this check, tbh.
> 
> I'm fine to get rid of this check, it just inherited from:
>  - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
> 
> It has no real effect, but when reading INLINE extent, its .iomap_begin()
> does:
> 	iomap->private = erofs_get_meta_page()	/* get meta page */
> 
> and in the .iomap_end(), it does:
> 	struct page *ipage = iomap->private;
> 	if (ipage) {
> 		unlock_page(ipage);
> 		put_page(ipage);
> 	}
> 
> > 
> > > > +	/* handle tail-packing blocks cross the current page into the next */
> > > > +	size = min_t(unsigned int, iomap->length + pos - iomap->offset,
> > > > +		     PAGE_SIZE - poff);
> > > >  
> > > >  	addr = kmap_atomic(page);
> > > > -	memcpy(addr, iomap->inline_data, size);
> > > > -	memset(addr + size, 0, PAGE_SIZE - size);
> > > > +	memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size);
> > > > +	memset(addr + poff + size, 0, PAGE_SIZE - poff - size);
> > > 
> > > Hmm, so I guess the point of this is to support reading data from a
> > > tail-packing block, where each file gets some arbitrary byte range
> > > within the tp-block, and the range isn't aligned to an fs block?  Hence
> > > you have to use the inline data code to read the relevant bytes and copy
> > > them into the pagecache?
> > 
> > I think there are two distinct cases for IOMAP_INLINE.  One is
> > where the tail of the file is literally embedded into the inode.
> > Like ext4 fast symbolic links.  Taking the ext4 i_blocks layout
> > as an example, you could have a 4kB block stored in i_block[0]
> > and then store bytes 4096-4151 in i_block[1-14] (although reading
> > https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html
> > makes me think that ext4 only supports storing 0-59 in the i_blocks;
> > it doesn't support 0-4095 in i_block[0] and then 4096-4151 in i_blocks)
> > 
> > The other is what I think erofs is doing where, for example, you'd
> > specify in i_block[1] the block which contains the tail and then in
> > i_block[2] what offset of the block the tail starts at.
> 
> Nope, EROFS inline data is embedded into the inode in order to save
> I/O as well as space (maybe I didn't express clear before [1]). 
> 
> I understand the other one, but it can only save storage space but
> cannot save I/O (we still need another independent I/O to read its
> meta buffered page).
> 
> In the view of INLINE extent itself, I think both ways can be
> supported with this approach.

OH, I see, so you need the multi-page inline data support because the
ondisk layout is something like this:

+----------- page one ---------+----------- page two...
V                              V
+-------+-----------------------------+---------
| inode |   inline data               | inode...
+-------+-----------------------------+---------

And since you can only kmap one page at a time, an inline read grabs the
first part of the data in "page one" and then we have to call
iomap_begin a second time get a new address so that we can read the rest
from "page two"?

--D

> 
> [1] https://www.kernel.org/doc/html/latest/filesystems/erofs.html
>     "On-disk details" section.
> 
> Thanks,
> Gao Xiang
Gao Xiang July 21, 2021, 12:33 a.m. UTC | #6
On Tue, Jul 20, 2021 at 05:17:20PM -0700, Darrick J. Wong wrote:
> On Wed, Jul 21, 2021 at 08:03:44AM +0800, Gao Xiang wrote:
> > On Tue, Jul 20, 2021 at 10:18:54PM +0100, Matthew Wilcox wrote:
> > > On Tue, Jul 20, 2021 at 01:42:24PM -0700, Darrick J. Wong wrote:
> > > > > -	BUG_ON(page_has_private(page));
> > > > > -	BUG_ON(page->index);
> > > > > -	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
> > > > > +	/* inline source data must be inside a single page */
> > > > > +	BUG_ON(iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data));
> > > > 
> > > > Can we reduce the strength of these checks to a warning and an -EIO
> > > > return?
> > > 
> > > I'm not entirely sure that we need this check, tbh.
> > 
> > I'm fine to get rid of this check, it just inherited from:
> >  - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
> > 
> > It has no real effect, but when reading INLINE extent, its .iomap_begin()
> > does:
> > 	iomap->private = erofs_get_meta_page()	/* get meta page */
> > 
> > and in the .iomap_end(), it does:
> > 	struct page *ipage = iomap->private;
> > 	if (ipage) {
> > 		unlock_page(ipage);
> > 		put_page(ipage);
> > 	}
> > 
> > > 
> > > > > +	/* handle tail-packing blocks cross the current page into the next */
> > > > > +	size = min_t(unsigned int, iomap->length + pos - iomap->offset,
> > > > > +		     PAGE_SIZE - poff);
> > > > >  
> > > > >  	addr = kmap_atomic(page);
> > > > > -	memcpy(addr, iomap->inline_data, size);
> > > > > -	memset(addr + size, 0, PAGE_SIZE - size);
> > > > > +	memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size);
> > > > > +	memset(addr + poff + size, 0, PAGE_SIZE - poff - size);
> > > > 
> > > > Hmm, so I guess the point of this is to support reading data from a
> > > > tail-packing block, where each file gets some arbitrary byte range
> > > > within the tp-block, and the range isn't aligned to an fs block?  Hence
> > > > you have to use the inline data code to read the relevant bytes and copy
> > > > them into the pagecache?
> > > 
> > > I think there are two distinct cases for IOMAP_INLINE.  One is
> > > where the tail of the file is literally embedded into the inode.
> > > Like ext4 fast symbolic links.  Taking the ext4 i_blocks layout
> > > as an example, you could have a 4kB block stored in i_block[0]
> > > and then store bytes 4096-4151 in i_block[1-14] (although reading
> > > https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html
> > > makes me think that ext4 only supports storing 0-59 in the i_blocks;
> > > it doesn't support 0-4095 in i_block[0] and then 4096-4151 in i_blocks)
> > > 
> > > The other is what I think erofs is doing where, for example, you'd
> > > specify in i_block[1] the block which contains the tail and then in
> > > i_block[2] what offset of the block the tail starts at.
> > 
> > Nope, EROFS inline data is embedded into the inode in order to save
> > I/O as well as space (maybe I didn't express clear before [1]). 
> > 
> > I understand the other one, but it can only save storage space but
> > cannot save I/O (we still need another independent I/O to read its
> > meta buffered page).
> > 
> > In the view of INLINE extent itself, I think both ways can be
> > supported with this approach.
> 
> OH, I see, so you need the multi-page inline data support because the
> ondisk layout is something like this:
> 
> +----------- page one ---------+----------- page two...
> V                              V
> +-------+-----------------------------+---------
> | inode |   inline data               | inode...
> +-------+-----------------------------+---------
> 
> And since you can only kmap one page at a time, an inline read grabs the
> first part of the data in "page one" and then we have to call
> iomap_begin a second time get a new address so that we can read the rest
> from "page two"?

Nope, currently EROFS inline data won't cross page like this.

But in principle, yes, I don't want to limit it to the current
EROFS or gfs2 usage. I think we could make this iomap function
more generally (I mean, I'd like to make the INLINE extent
functionity as general as possible, my v1 original approach
in principle can support any inline extent in the middle of
file rather than just tail blocks, but zeroing out post-EOF
needs another iteration) and I don't see it add more code and
complexity.

Thanks,
Gao Xiang

> 
> --D
> 
> > 
> > [1] https://www.kernel.org/doc/html/latest/filesystems/erofs.html
> >     "On-disk details" section.
> > 
> > Thanks,
> > Gao Xiang
Andreas Grünbacher July 21, 2021, 2:26 a.m. UTC | #7
Am Mi., 21. Juli 2021 um 02:33 Uhr schrieb Gao Xiang
<hsiangkao@linux.alibaba.com>:
> > And since you can only kmap one page at a time, an inline read grabs the
> > first part of the data in "page one" and then we have to call
> > iomap_begin a second time get a new address so that we can read the rest
> > from "page two"?
>
> Nope, currently EROFS inline data won't cross page like this.
>
> But in principle, yes, I don't want to limit it to the current
> EROFS or gfs2 usage. I think we could make this iomap function
> more generally (I mean, I'd like to make the INLINE extent
> functionity as general as possible,

Nono. Can we please limit this patch what we actually need right now,
and worry about extending it later?

> my v1 original approach
> in principle can support any inline extent in the middle of
> file rather than just tail blocks, but zeroing out post-EOF
> needs another iteration) and I don't see it add more code and
> complexity.

Thanks,
Andreas
Gao Xiang July 21, 2021, 2:53 a.m. UTC | #8
Hi Andreas,

On Wed, Jul 21, 2021 at 04:26:47AM +0200, Andreas Grünbacher wrote:
> Am Mi., 21. Juli 2021 um 02:33 Uhr schrieb Gao Xiang
> <hsiangkao@linux.alibaba.com>:
> > > And since you can only kmap one page at a time, an inline read grabs the
> > > first part of the data in "page one" and then we have to call
> > > iomap_begin a second time get a new address so that we can read the rest
> > > from "page two"?
> >
> > Nope, currently EROFS inline data won't cross page like this.
> >
> > But in principle, yes, I don't want to limit it to the current
> > EROFS or gfs2 usage. I think we could make this iomap function
> > more generally (I mean, I'd like to make the INLINE extent
> > functionity as general as possible,
> 
> Nono. Can we please limit this patch what we actually need right now,
> and worry about extending it later?

Can you elaborate what it will benefit us if we only support one tail
block for iomap_read_inline_data()? (I mean it has similar LOC changes,
similar implementation / complexity.) The only concern I think is if
it causes gfs2 regression, so that is what I'd like to confirm.

In contrast, I'd like to avoid iomap_write_begin() tail-packing because
it's complex and no fs user interests in it for now. So I leave it
untouched for now.

Another concern I really like to convert EROFS to iomap is I'd like to
support sub-page blocksize for EROFS after converting. I don't want to
touch iomap inline code again like this since it interacts 2 directories
thus cause too much coupling.

Thanks,
Gao Xiang

> 
> > my v1 original approach
> > in principle can support any inline extent in the middle of
> > file rather than just tail blocks, but zeroing out post-EOF
> > needs another iteration) and I don't see it add more code and
> > complexity.
> 
> Thanks,
> Andreas
Andreas Grünbacher July 21, 2021, 6:33 a.m. UTC | #9
Am Di., 20. Juli 2021 um 23:19 Uhr schrieb Matthew Wilcox <willy@infradead.org>:
> On Tue, Jul 20, 2021 at 01:42:24PM -0700, Darrick J. Wong wrote:
> > > -   BUG_ON(page_has_private(page));
> > > -   BUG_ON(page->index);
> > > -   BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
> > > +   /* inline source data must be inside a single page */
> > > +   BUG_ON(iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data));
> >
> > Can we reduce the strength of these checks to a warning and an -EIO
> > return?

Yes, we could do that.

> I'm not entirely sure that we need this check, tbh.

I wanted to make sure the memcpy / memset won't corrupt random kernel
memory when the filesystem gets the iomap_begin wrong.

> > > +   /* handle tail-packing blocks cross the current page into the next */
> > > +   size = min_t(unsigned int, iomap->length + pos - iomap->offset,
> > > +                PAGE_SIZE - poff);
> > >
> > >     addr = kmap_atomic(page);
> > > -   memcpy(addr, iomap->inline_data, size);
> > > -   memset(addr + size, 0, PAGE_SIZE - size);
> > > +   memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size);
> > > +   memset(addr + poff + size, 0, PAGE_SIZE - poff - size);
> >
> > Hmm, so I guess the point of this is to support reading data from a
> > tail-packing block, where each file gets some arbitrary byte range
> > within the tp-block, and the range isn't aligned to an fs block?  Hence
> > you have to use the inline data code to read the relevant bytes and copy
> > them into the pagecache?
>
> I think there are two distinct cases for IOMAP_INLINE.  One is
> where the tail of the file is literally embedded into the inode.
> Like ext4 fast symbolic links.  Taking the ext4 i_blocks layout
> as an example, you could have a 4kB block stored in i_block[0]
> and then store bytes 4096-4151 in i_block[1-14] (although reading
> https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html
> makes me think that ext4 only supports storing 0-59 in the i_blocks;
> it doesn't support 0-4095 in i_block[0] and then 4096-4151 in i_blocks)
>
> The other is what I think erofs is doing where, for example, you'd
> specify in i_block[1] the block which contains the tail and then in
> i_block[2] what offset of the block the tail starts at.

Andreas
Andreas Grünbacher July 21, 2021, 6:43 a.m. UTC | #10
Am Mi., 21. Juli 2021 um 04:54 Uhr schrieb Gao Xiang
<hsiangkao@linux.alibaba.com>:
> Hi Andreas,
>
> On Wed, Jul 21, 2021 at 04:26:47AM +0200, Andreas Grünbacher wrote:
> > Am Mi., 21. Juli 2021 um 02:33 Uhr schrieb Gao Xiang
> > <hsiangkao@linux.alibaba.com>:
> > > > And since you can only kmap one page at a time, an inline read grabs the
> > > > first part of the data in "page one" and then we have to call
> > > > iomap_begin a second time get a new address so that we can read the rest
> > > > from "page two"?
> > >
> > > Nope, currently EROFS inline data won't cross page like this.
> > >
> > > But in principle, yes, I don't want to limit it to the current
> > > EROFS or gfs2 usage. I think we could make this iomap function
> > > more generally (I mean, I'd like to make the INLINE extent
> > > functionity as general as possible,
> >
> > Nono. Can we please limit this patch what we actually need right now,
> > and worry about extending it later?
>
> Can you elaborate what it will benefit us if we only support one tail
> block for iomap_read_inline_data()? (I mean it has similar LOC changes,
> similar implementation / complexity.) The only concern I think is if
> it causes gfs2 regression, so that is what I'd like to confirm.

iomap_read_inline_data supports one inline page now (i.e., from the
beginning of the file). It seems that you don't need more than one
tail page in EROFS, right?

You were speculating about inline data in the middle of a file. That
sounds like a really, really bad idea to me, and I don't think we
should waste any time on it.

> In contrast, I'd like to avoid iomap_write_begin() tail-packing because
> it's complex and no fs user interests in it for now. So I leave it
> untouched for now.

I have no idea what you mean by that.

> Another concern I really like to convert EROFS to iomap is I'd like to
> support sub-page blocksize for EROFS after converting. I don't want to
> touch iomap inline code again like this since it interacts 2 directories
> thus cause too much coupling.
>
> Thanks,
> Gao Xiang
>
> >
> > > my v1 original approach
> > > in principle can support any inline extent in the middle of
> > > file rather than just tail blocks, but zeroing out post-EOF
> > > needs another iteration) and I don't see it add more code and
> > > complexity.
> >
> > Thanks,
> > Andreas

Andreas
Gao Xiang July 21, 2021, 7:28 a.m. UTC | #11
On Wed, Jul 21, 2021 at 08:43:00AM +0200, Andreas Grünbacher wrote:
> Am Mi., 21. Juli 2021 um 04:54 Uhr schrieb Gao Xiang
> <hsiangkao@linux.alibaba.com>:
> > Hi Andreas,
> >
> > On Wed, Jul 21, 2021 at 04:26:47AM +0200, Andreas Grünbacher wrote:
> > > Am Mi., 21. Juli 2021 um 02:33 Uhr schrieb Gao Xiang
> > > <hsiangkao@linux.alibaba.com>:
> > > > > And since you can only kmap one page at a time, an inline read grabs the
> > > > > first part of the data in "page one" and then we have to call
> > > > > iomap_begin a second time get a new address so that we can read the rest
> > > > > from "page two"?
> > > >
> > > > Nope, currently EROFS inline data won't cross page like this.
> > > >
> > > > But in principle, yes, I don't want to limit it to the current
> > > > EROFS or gfs2 usage. I think we could make this iomap function
> > > > more generally (I mean, I'd like to make the INLINE extent
> > > > functionity as general as possible,
> > >
> > > Nono. Can we please limit this patch what we actually need right now,
> > > and worry about extending it later?
> >
> > Can you elaborate what it will benefit us if we only support one tail
> > block for iomap_read_inline_data()? (I mean it has similar LOC changes,
> > similar implementation / complexity.) The only concern I think is if
> > it causes gfs2 regression, so that is what I'd like to confirm.
> 
> iomap_read_inline_data supports one inline page now (i.e., from the
> beginning of the file). It seems that you don't need more than one
> tail page in EROFS, right?
> 
> You were speculating about inline data in the middle of a file. That
> sounds like a really, really bad idea to me, and I don't think we
> should waste any time on it.

Huh? why do you think it's a bad idea? I could give real example to you.

At least, it can be used for some encoded data or repeated pattern (such
as AABBAABBAABB...) in a packed way (marked in extent metadata).
Is that enough?

Again, I don't see what the benefits if limiting it to one tail block,
it (maybe) just modifies:
+	/* handle tail-packing blocks cross the current page into the next */
+	size = min_t(unsigned int, iomap->length + pos - iomap->offset,
+		     PAGE_SIZE - poff);

to
+	size = min_t(unsigned int, i_blocksize(inode),
+		     PAGE_SIZE - poff);

And which has the similar complexity, so why not using iomap->length
here instead?

Thanks,
Gao Xiang
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 87ccb3438bec..0edc8bbb35d1 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -205,25 +205,25 @@  struct iomap_readpage_ctx {
 	struct readahead_control *rac;
 };
 
-static void
+static int
 iomap_read_inline_data(struct inode *inode, struct page *page,
-		struct iomap *iomap)
+		struct iomap *iomap, loff_t pos)
 {
-	size_t size = i_size_read(inode);
+	unsigned int size, poff = offset_in_page(pos);
 	void *addr;
 
-	if (PageUptodate(page))
-		return;
-
-	BUG_ON(page_has_private(page));
-	BUG_ON(page->index);
-	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
+	/* inline source data must be inside a single page */
+	BUG_ON(iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data));
+	/* handle tail-packing blocks cross the current page into the next */
+	size = min_t(unsigned int, iomap->length + pos - iomap->offset,
+		     PAGE_SIZE - poff);
 
 	addr = kmap_atomic(page);
-	memcpy(addr, iomap->inline_data, size);
-	memset(addr + size, 0, PAGE_SIZE - size);
+	memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size);
+	memset(addr + poff + size, 0, PAGE_SIZE - poff - size);
 	kunmap_atomic(addr);
-	SetPageUptodate(page);
+	iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff);
+	return PAGE_SIZE - poff;
 }
 
 static inline bool iomap_block_needs_zeroing(struct inode *inode,
@@ -246,18 +246,18 @@  iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	unsigned poff, plen;
 	sector_t sector;
 
-	if (iomap->type == IOMAP_INLINE) {
-		WARN_ON_ONCE(pos);
-		iomap_read_inline_data(inode, page, iomap);
-		return PAGE_SIZE;
-	}
-
-	/* zero post-eof blocks as the page may be mapped */
 	iop = iomap_page_create(inode, page);
+	/* needs to skip some leading uptodate blocks */
 	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
 	if (plen == 0)
 		goto done;
 
+	if (iomap->type == IOMAP_INLINE) {
+		plen = iomap_read_inline_data(inode, page, iomap, pos);
+		goto done;
+	}
+
+	/* zero post-eof blocks as the page may be mapped */
 	if (iomap_block_needs_zeroing(inode, iomap, pos)) {
 		zero_user(page, poff, plen);
 		iomap_set_range_uptodate(page, poff, plen);
@@ -589,6 +589,18 @@  __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
 	return 0;
 }
 
+static int iomap_write_begin_inline(struct inode *inode, loff_t pos,
+		struct page *page, struct iomap *srcmap)
+{
+	/* needs more work for the tailpacking case, disable for now */
+	if (WARN_ON_ONCE(srcmap->offset != 0))
+		return -EIO;
+	if (PageUptodate(page))
+		return 0;
+	iomap_read_inline_data(inode, page, srcmap, 0);
+	return 0;
+}
+
 static int
 iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
@@ -618,7 +630,7 @@  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 	}
 
 	if (srcmap->type == IOMAP_INLINE)
-		iomap_read_inline_data(inode, page, srcmap);
+		status = iomap_write_begin_inline(inode, pos, page, srcmap);
 	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
 		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
 	else
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 9398b8c31323..ee6309967b77 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -379,22 +379,25 @@  iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
 {
 	struct iov_iter *iter = dio->submit.iter;
 	size_t copied;
+	void *dst = iomap->inline_data + pos - iomap->offset;
 
-	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
+	/* inline data must be inside a single page */
+	BUG_ON(length > PAGE_SIZE - offset_in_page(iomap->inline_data));
 
 	if (dio->flags & IOMAP_DIO_WRITE) {
 		loff_t size = inode->i_size;
 
 		if (pos > size)
-			memset(iomap->inline_data + size, 0, pos - size);
-		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
+			memset(iomap->inline_data + size - iomap->offset,
+			       0, pos - size);
+		copied = copy_from_iter(dst, length, iter);
 		if (copied) {
 			if (pos + copied > size)
 				i_size_write(inode, pos + copied);
 			mark_inode_dirty(inode);
 		}
 	} else {
-		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
+		copied = copy_to_iter(dst, length, iter);
 	}
 	dio->size += copied;
 	return copied;