Message ID | 20210716050724.225041-2-hsiangkao@linux.alibaba.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | erofs: iomap support for tailpacking cases | expand |
I'm pretty sure gfs2 supports direct writes to inline data, so we should not disable it. I also think we should share the code rather than duplicating it. Suggested version against the iomap-for-next branch attached, but this needs careful check from Andreas (please keep him on CC). --- From 6067cd3462cea80cb2739602862296db41fc5638 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 16 Jul 2021 10:52:48 +0200 Subject: iomap: support tail packing inline read This tries to add tail packing inline read to iomap. Different from the previous approach, it only marks the block range uptodate in the page it covers. The write path remains untouched since EROFS cannot be used for testing. It'd be better to be implemented if upcoming real users care rather than leave untested dead code around. Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com> --- fs/iomap/buffered-io.c | 56 ++++++++++++++++++++++++++++-------------- fs/iomap/direct-io.c | 6 +++-- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 87ccb3438becd9..2efd4bc0328995 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -207,29 +207,28 @@ struct iomap_readpage_ctx { static void iomap_read_inline_data(struct inode *inode, struct page *page, - struct iomap *iomap) + struct iomap *iomap, loff_t pos, unsigned int size) { - size_t size = i_size_read(inode); + unsigned int block_aligned_size = round_up(size, i_blocksize(inode)); + unsigned int poff = offset_in_page(pos); void *addr; - if (PageUptodate(page)) - return; - - BUG_ON(page_has_private(page)); - BUG_ON(page->index); + /* make sure that inline_data doesn't cross page boundary */ BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); + BUG_ON(size != i_size_read(inode) - pos); addr = kmap_atomic(page); - memcpy(addr, iomap->inline_data, size); - memset(addr + size, 0, PAGE_SIZE - size); + memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, 
size); + memset(addr + poff + size, 0, block_aligned_size - size); kunmap_atomic(addr); - SetPageUptodate(page); + + iomap_set_range_uptodate(page, poff, block_aligned_size); } static inline bool iomap_block_needs_zeroing(struct inode *inode, struct iomap *iomap, loff_t pos) { - return iomap->type != IOMAP_MAPPED || + return (iomap->type != IOMAP_MAPPED && iomap->type != IOMAP_INLINE) || (iomap->flags & IOMAP_F_NEW) || pos >= i_size_read(inode); } @@ -240,20 +239,18 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, { struct iomap_readpage_ctx *ctx = data; struct page *page = ctx->cur_page; - struct iomap_page *iop; + struct iomap_page *iop = NULL; bool same_page = false, is_contig = false; loff_t orig_pos = pos; unsigned poff, plen; sector_t sector; - if (iomap->type == IOMAP_INLINE) { - WARN_ON_ONCE(pos); - iomap_read_inline_data(inode, page, iomap); - return PAGE_SIZE; - } + if (iomap->type == IOMAP_INLINE && !pos) + WARN_ON_ONCE(to_iomap_page(page) != NULL); + else + iop = iomap_page_create(inode, page); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(inode, page); iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen); if (plen == 0) goto done; @@ -264,6 +261,15 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, goto done; } + if (iomap->type == IOMAP_INLINE) { + iomap_read_inline_data(inode, page, iomap, pos, plen); + /* + * TODO: the old code used to return PAGE_SIZE here + * unconditionally. I think the actual i_size return should + * be fine for gfs2 as well, but please double check. 
+ */ + goto done; + } ctx->cur_page_in_bio = true; if (iop) atomic_add(plen, &iop->read_bytes_pending); @@ -589,6 +595,18 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, return 0; } +static int iomap_write_begin_inline(struct inode *inode, loff_t pos, + struct page *page, struct iomap *srcmap) +{ + /* needs more work for the tailpacking case, disable for now */ + if (WARN_ON_ONCE(pos != 0)) + return -EIO; + if (PageUptodate(page)) + return 0; + iomap_read_inline_data(inode, page, srcmap, pos, i_size_read(inode)); + return 0; +} + static int iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, struct page **pagep, struct iomap *iomap, struct iomap *srcmap) @@ -618,7 +636,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, } if (srcmap->type == IOMAP_INLINE) - iomap_read_inline_data(inode, page, srcmap); + status = iomap_write_begin_inline(inode, pos, page, srcmap); else if (iomap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(page, pos, len, NULL, srcmap); else diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9398b8c31323b3..a70a8632df226f 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -380,7 +380,8 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, struct iov_iter *iter = dio->submit.iter; size_t copied; - BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); + /* inline data must be inside a single page */ + BUG_ON(length > PAGE_SIZE - offset_in_page(iomap->inline_data)); if (dio->flags & IOMAP_DIO_WRITE) { loff_t size = inode->i_size; @@ -394,7 +395,8 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, mark_inode_dirty(inode); } } else { - copied = copy_to_iter(iomap->inline_data + pos, length, iter); + copied = copy_to_iter(iomap->inline_data + pos - iomap->offset, + length, iter); } dio->size += copied; return copied;
Hi Christoph, On Fri, Jul 16, 2021 at 10:19:09AM +0100, Christoph Hellwig wrote: > I'm pretty sure gfs2 supports direct writes to inline data, so we should > not disable it. I also think we should share the code rather than > duplicating it. Suggested version against the iomap-for-next branch > attached, but this needs careful check from Andreas (please keep him on > CC). Thanks for your time and revising, I once thought using an unique iomap_read_inline_data() as well but then I thought maybe leaving iomap_read_inline_page() could make gfs2 logic easier... I'm fine with this modification, and will re-test on my side as well... (hopefully Andreas could also check this and then targeting for the next merge window since it's quite a small change....) Thanks, Gao Xiang > > --- > From 6067cd3462cea80cb2739602862296db41fc5638 Mon Sep 17 00:00:00 2001 > From: Christoph Hellwig <hch@lst.de> > Date: Fri, 16 Jul 2021 10:52:48 +0200 > Subject: iomap: support tail packing inline read > > This tries to add tail packing inline read to iomap. Different from > the previous approach, it only marks the block range uptodate in the > page it covers. > > The write path remains untouched since EROFS cannot be used for > testing. It'd be better to be implemented if upcoming real users care > rather than leave untested dead code around. 
> > Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com> > --- > fs/iomap/buffered-io.c | 56 ++++++++++++++++++++++++++++-------------- > fs/iomap/direct-io.c | 6 +++-- > 2 files changed, 41 insertions(+), 21 deletions(-) > > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c > index 87ccb3438becd9..2efd4bc0328995 100644 > --- a/fs/iomap/buffered-io.c > +++ b/fs/iomap/buffered-io.c > @@ -207,29 +207,28 @@ struct iomap_readpage_ctx { > > static void > iomap_read_inline_data(struct inode *inode, struct page *page, > - struct iomap *iomap) > + struct iomap *iomap, loff_t pos, unsigned int size) > { > - size_t size = i_size_read(inode); > + unsigned int block_aligned_size = round_up(size, i_blocksize(inode)); > + unsigned int poff = offset_in_page(pos); > void *addr; > > - if (PageUptodate(page)) > - return; > - > - BUG_ON(page_has_private(page)); > - BUG_ON(page->index); > + /* make sure that inline_data doesn't cross page boundary */ > BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); > + BUG_ON(size != i_size_read(inode) - pos); > > addr = kmap_atomic(page); > - memcpy(addr, iomap->inline_data, size); > - memset(addr + size, 0, PAGE_SIZE - size); > + memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size); > + memset(addr + poff + size, 0, block_aligned_size - size); > kunmap_atomic(addr); > - SetPageUptodate(page); > + > + iomap_set_range_uptodate(page, poff, block_aligned_size); > } > > static inline bool iomap_block_needs_zeroing(struct inode *inode, > struct iomap *iomap, loff_t pos) > { > - return iomap->type != IOMAP_MAPPED || > + return (iomap->type != IOMAP_MAPPED && iomap->type != IOMAP_INLINE) || > (iomap->flags & IOMAP_F_NEW) || > pos >= i_size_read(inode); > } > @@ -240,20 +239,18 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > { > struct iomap_readpage_ctx *ctx = data; > struct page *page = ctx->cur_page; > - struct iomap_page *iop; > + struct iomap_page *iop = NULL; > bool 
same_page = false, is_contig = false; > loff_t orig_pos = pos; > unsigned poff, plen; > sector_t sector; > > - if (iomap->type == IOMAP_INLINE) { > - WARN_ON_ONCE(pos); > - iomap_read_inline_data(inode, page, iomap); > - return PAGE_SIZE; > - } > + if (iomap->type == IOMAP_INLINE && !pos) > + WARN_ON_ONCE(to_iomap_page(page) != NULL); > + else > + iop = iomap_page_create(inode, page); > > /* zero post-eof blocks as the page may be mapped */ > - iop = iomap_page_create(inode, page); > iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen); > if (plen == 0) > goto done; > @@ -264,6 +261,15 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > goto done; > } > > + if (iomap->type == IOMAP_INLINE) { > + iomap_read_inline_data(inode, page, iomap, pos, plen); > + /* > + * TODO: the old code used to return PAGE_SIZE here > + * unconditionally. I think the actual i_size return should > + * be fine for gfs2 as well, but please double check. > + */ > + goto done; > + } > ctx->cur_page_in_bio = true; > if (iop) > atomic_add(plen, &iop->read_bytes_pending); > @@ -589,6 +595,18 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, > return 0; > } > > +static int iomap_write_begin_inline(struct inode *inode, loff_t pos, > + struct page *page, struct iomap *srcmap) > +{ > + /* needs more work for the tailpacking case, disable for now */ > + if (WARN_ON_ONCE(pos != 0)) > + return -EIO; > + if (PageUptodate(page)) > + return 0; > + iomap_read_inline_data(inode, page, srcmap, pos, i_size_read(inode)); > + return 0; > +} > + > static int > iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, > struct page **pagep, struct iomap *iomap, struct iomap *srcmap) > @@ -618,7 +636,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, > } > > if (srcmap->type == IOMAP_INLINE) > - iomap_read_inline_data(inode, page, srcmap); > + status = 
iomap_write_begin_inline(inode, pos, page, srcmap); > else if (iomap->flags & IOMAP_F_BUFFER_HEAD) > status = __block_write_begin_int(page, pos, len, NULL, srcmap); > else > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c > index 9398b8c31323b3..a70a8632df226f 100644 > --- a/fs/iomap/direct-io.c > +++ b/fs/iomap/direct-io.c > @@ -380,7 +380,8 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, > struct iov_iter *iter = dio->submit.iter; > size_t copied; > > - BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); > + /* inline data must be inside a single page */ > + BUG_ON(length > PAGE_SIZE - offset_in_page(iomap->inline_data)); > > if (dio->flags & IOMAP_DIO_WRITE) { > loff_t size = inode->i_size; > @@ -394,7 +395,8 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, > mark_inode_dirty(inode); > } > } else { > - copied = copy_to_iter(iomap->inline_data + pos, length, iter); > + copied = copy_to_iter(iomap->inline_data + pos - iomap->offset, > + length, iter); > } > dio->size += copied; > return copied; > -- > 2.30.2
On Fri, Jul 16, 2021 at 01:07:23PM +0800, Gao Xiang wrote: > This tries to add tail packing inline read to iomap. Different from > the previous approach, it only marks the block range uptodate in the > page it covers. Why? This path is called under two circumstances: readahead and readpage. In both cases, we're trying to bring the entire page uptodate. The inline extent is always the tail of the file, so we may as well zero the part of the page past the end of file and mark the entire page uptodate, instead of leaving the end of the page !uptodate. I see the case where, eg, we have the first 2048 bytes of the file out-of-inode and then 20 bytes in the inode. So we'll create the iop for the head of the file, but then we may as well finish the entire PAGE_SIZE chunk as part of this iteration rather than update 2048-3071 as being uptodate and leave the 3072-4095 block for a future iteration.
On Fri, Jul 16, 2021 at 10:19:09AM +0100, Christoph Hellwig wrote: > static void > iomap_read_inline_data(struct inode *inode, struct page *page, > - struct iomap *iomap) > + struct iomap *iomap, loff_t pos, unsigned int size) > { > - size_t size = i_size_read(inode); > + unsigned int block_aligned_size = round_up(size, i_blocksize(inode)); > + unsigned int poff = offset_in_page(pos); > void *addr; > > - if (PageUptodate(page)) > - return; > - > - BUG_ON(page_has_private(page)); > - BUG_ON(page->index); > + /* make sure that inline_data doesn't cross page boundary */ > BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); > + BUG_ON(size != i_size_read(inode) - pos); > > addr = kmap_atomic(page); > - memcpy(addr, iomap->inline_data, size); > - memset(addr + size, 0, PAGE_SIZE - size); > + memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size); > + memset(addr + poff + size, 0, block_aligned_size - size); > kunmap_atomic(addr); > - SetPageUptodate(page); > + > + iomap_set_range_uptodate(page, poff, block_aligned_size); > } This should be relatively straightforward to port to folios. I think it looks something like this ... 
@@ -211,23 +211,18 @@ struct iomap_readpage_ctx { }; static void iomap_read_inline_data(struct inode *inode, struct folio *folio, - struct iomap *iomap) + struct iomap *iomap, loff_t pos, size_t size) { - size_t size = i_size_read(inode); void *addr; + size_t offset = offset_in_folio(folio, pos); - if (folio_test_uptodate(folio)) - return; + BUG_ON(size != i_size_read(inode) - pos); - BUG_ON(folio->index); - BUG_ON(folio_multi(folio)); - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); - - addr = kmap_local_folio(folio, 0); + addr = kmap_local_folio(folio, offset); memcpy(addr, iomap->inline_data, size); memset(addr + size, 0, PAGE_SIZE - size); kunmap_local(addr); - folio_mark_uptodate(folio); + iomap_set_range_uptodate(folio, to_iomap_page(folio), pos, size); } > - if (iomap->type == IOMAP_INLINE) { > - WARN_ON_ONCE(pos); > - iomap_read_inline_data(inode, page, iomap); > - return PAGE_SIZE; > - } > + if (iomap->type == IOMAP_INLINE && !pos) > + WARN_ON_ONCE(to_iomap_page(page) != NULL); > + else > + iop = iomap_page_create(inode, page); This WARN_ON doesn't make sense to me. If a file contains bytes 0-2047 that are !INLINE and then bytes 2048-2050 that are INLINE, we're going to trigger it. Perhaps just make this: if (iomap->type != IOMAP_INLINE || pos) iop = iomap_page_create(inode, page);
Hi Matthew, On Fri, Jul 16, 2021 at 02:02:29PM +0100, Matthew Wilcox wrote: > On Fri, Jul 16, 2021 at 01:07:23PM +0800, Gao Xiang wrote: > > This tries to add tail packing inline read to iomap. Different from > > the previous approach, it only marks the block range uptodate in the > > page it covers. > > Why? This path is called under two circumstances: readahead and readpage. > In both cases, we're trying to bring the entire page uptodate. The inline > extent is always the tail of the file, so we may as well zero the part of > the page past the end of file and mark the entire page uptodate instead > and leaving the end of the page !uptodate. > > I see the case where, eg, we have the first 2048 bytes of the file > out-of-inode and then 20 bytes in the inode. So we'll create the iop > for the head of the file, but then we may as well finish the entire > PAGE_SIZE chunk as part of this iteration rather than update 2048-3071 > as being uptodate and leave the 3072-4095 block for a future iteration. Thanks for your comments. Hmm... If I understand the words above correctly, what I'd like to do is to cover the inline extents (blocks) only reported by iomap_begin() rather than handling other (maybe) logical-not-strictly-relevant areas such as post-EOF (even pages will be finally entirely uptodated), I think such zeroed area should be handled by from the point of view of the extent itself if (iomap_block_needs_zeroing(inode, iomap, pos)) { zero_user(page, poff, plen); iomap_set_range_uptodate(page, poff, plen); goto done; } The benefits I can think out are 1) it makes the logic understand easier and no special cases just for tail-packing handling 2) it can be then used for any inline extent cases (I mean e.g. in the middle of the file) rather than just tail-packing inline blocks although currently there is a BUG_ON to prevent this but it's easier to extend even further. 
3) it can be used as part of the later partial-page uptodate logic, in order to match the legacy buffer_head behavior (if my memory serves, I remember something about this...) Thanks, Gao Xiang
On Fri, Jul 16, 2021 at 02:47:35PM +0100, Matthew Wilcox wrote: > I think it looks something like this ... > > @@ -211,23 +211,18 @@ struct iomap_readpage_ctx { > }; > > static void iomap_read_inline_data(struct inode *inode, struct folio *folio, > - struct iomap *iomap) > + struct iomap *iomap, loff_t pos, size_t size) > { > - size_t size = i_size_read(inode); > void *addr; > + size_t offset = offset_in_folio(folio, pos); > > - if (folio_test_uptodate(folio)) > - return; > + BUG_ON(size != i_size_read(inode) - pos); > > - BUG_ON(folio->index); > - BUG_ON(folio_multi(folio)); > - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); > - > - addr = kmap_local_folio(folio, 0); > + addr = kmap_local_folio(folio, offset); > memcpy(addr, iomap->inline_data, size); > memset(addr + size, 0, PAGE_SIZE - size); > kunmap_local(addr); > - folio_mark_uptodate(folio); > + iomap_set_range_uptodate(folio, to_iomap_page(folio), pos, size); That should be s/size/PAGE_SIZE - offset/ (because this has just zeroed all the bytes in the page after end-of-file)
On Fri, Jul 16, 2021 at 09:56:23PM +0800, Gao Xiang wrote: > Hi Matthew, > > On Fri, Jul 16, 2021 at 02:02:29PM +0100, Matthew Wilcox wrote: > > On Fri, Jul 16, 2021 at 01:07:23PM +0800, Gao Xiang wrote: > > > This tries to add tail packing inline read to iomap. Different from > > > the previous approach, it only marks the block range uptodate in the > > > page it covers. > > > > Why? This path is called under two circumstances: readahead and readpage. > > In both cases, we're trying to bring the entire page uptodate. The inline > > extent is always the tail of the file, so we may as well zero the part of > > the page past the end of file and mark the entire page uptodate instead > > and leaving the end of the page !uptodate. > > > > I see the case where, eg, we have the first 2048 bytes of the file > > out-of-inode and then 20 bytes in the inode. So we'll create the iop > > for the head of the file, but then we may as well finish the entire > > PAGE_SIZE chunk as part of this iteration rather than update 2048-3071 > > as being uptodate and leave the 3072-4095 block for a future iteration. > > Thanks for your comments. Hmm... If I understand the words above correctly, > what I'd like to do is to cover the inline extents (blocks) only > reported by iomap_begin() rather than handling other (maybe) > logical-not-strictly-relevant areas such as post-EOF (even pages > will be finally entirely uptodated), I think such zeroed area should > be handled by from the point of view of the extent itself > > if (iomap_block_needs_zeroing(inode, iomap, pos)) { > zero_user(page, poff, plen); > iomap_set_range_uptodate(page, poff, plen); > goto done; > } That does work. But we already mapped the page to write to it, and we already have to zero to the end of the block. Why not zero to the end of the page? It saves an iteration around the loop, it saves a mapping of the page, and it saves a call to flush_dcache_page(). 
> The benefits I can think out are 1) it makes the logic understand > easier and no special cases just for tail-packing handling 2) it can > be then used for any inline extent cases (I mean e.g. in the middle of > the file) rather than just tail-packing inline blocks although currently > there is a BUG_ON to prevent this but it's easier to extend even further. > 3) it can be used as a part for later partial page uptodate logic in > order to match the legacy buffer_head logic (I remember something if my > memory is not broken about this...) Hopefully the legacy buffer_head logic will go away soon.
On Fri, Jul 16, 2021 at 03:44:04PM +0100, Matthew Wilcox wrote: > On Fri, Jul 16, 2021 at 09:56:23PM +0800, Gao Xiang wrote: > > Hi Matthew, > > > > On Fri, Jul 16, 2021 at 02:02:29PM +0100, Matthew Wilcox wrote: > > > On Fri, Jul 16, 2021 at 01:07:23PM +0800, Gao Xiang wrote: > > > > This tries to add tail packing inline read to iomap. Different from > > > > the previous approach, it only marks the block range uptodate in the > > > > page it covers. > > > > > > Why? This path is called under two circumstances: readahead and readpage. > > > In both cases, we're trying to bring the entire page uptodate. The inline > > > extent is always the tail of the file, so we may as well zero the part of > > > the page past the end of file and mark the entire page uptodate instead > > > and leaving the end of the page !uptodate. > > > > > > I see the case where, eg, we have the first 2048 bytes of the file > > > out-of-inode and then 20 bytes in the inode. So we'll create the iop > > > for the head of the file, but then we may as well finish the entire > > > PAGE_SIZE chunk as part of this iteration rather than update 2048-3071 > > > as being uptodate and leave the 3072-4095 block for a future iteration. > > > > Thanks for your comments. Hmm... If I understand the words above correctly, > > what I'd like to do is to cover the inline extents (blocks) only > > reported by iomap_begin() rather than handling other (maybe) > > logical-not-strictly-relevant areas such as post-EOF (even pages > > will be finally entirely uptodated), I think such zeroed area should > > be handled by from the point of view of the extent itself > > > > if (iomap_block_needs_zeroing(inode, iomap, pos)) { > > zero_user(page, poff, plen); > > iomap_set_range_uptodate(page, poff, plen); > > goto done; > > } > > That does work. But we already mapped the page to write to it, and > we already have to zero to the end of the block. Why not zero to > the end of the page? 
It saves an iteration around the loop, it saves > a mapping of the page, and it saves a call to flush_dcache_page(). I completely understand your concern, and that's also (sort of) why I left iomap_read_inline_page() to make the old !pos behavior as before. Anyway, I could update Christoph's patch to behave like what you suggested. Will do later since I'm now taking some rest... > > > The benefits I can think out are 1) it makes the logic understand > > easier and no special cases just for tail-packing handling 2) it can > > be then used for any inline extent cases (I mean e.g. in the middle of > > the file) rather than just tail-packing inline blocks although currently > > there is a BUG_ON to prevent this but it's easier to extend even further. > > 3) it can be used as a part for later partial page uptodate logic in > > order to match the legacy buffer_head logic (I remember something if my > > memory is not broken about this...) > > Hopefully the legacy buffer_head logic will go away soon. Hmmm.. I partially agree on this (I agree buffer_head is a legacy stuff but...), considering some big PAGE_SIZE like 64kb or bigger, partial uptodate can save I/O for random file read pattern in general (not mmap read, yes, also considering readahead, but I received some regression due to I/O amplification like this when I was at the previous * 2 company). Thanks, Gao Xiang
Am Fr., 16. Juli 2021 um 17:03 Uhr schrieb Gao Xiang <hsiangkao@linux.alibaba.com>: > On Fri, Jul 16, 2021 at 03:44:04PM +0100, Matthew Wilcox wrote: > > On Fri, Jul 16, 2021 at 09:56:23PM +0800, Gao Xiang wrote: > > > Hi Matthew, > > > > > > On Fri, Jul 16, 2021 at 02:02:29PM +0100, Matthew Wilcox wrote: > > > > On Fri, Jul 16, 2021 at 01:07:23PM +0800, Gao Xiang wrote: > > > > > This tries to add tail packing inline read to iomap. Different from > > > > > the previous approach, it only marks the block range uptodate in the > > > > > page it covers. > > > > > > > > Why? This path is called under two circumstances: readahead and readpage. > > > > In both cases, we're trying to bring the entire page uptodate. The inline > > > > extent is always the tail of the file, so we may as well zero the part of > > > > the page past the end of file and mark the entire page uptodate instead > > > > and leaving the end of the page !uptodate. > > > > > > > > I see the case where, eg, we have the first 2048 bytes of the file > > > > out-of-inode and then 20 bytes in the inode. So we'll create the iop > > > > for the head of the file, but then we may as well finish the entire > > > > PAGE_SIZE chunk as part of this iteration rather than update 2048-3071 > > > > as being uptodate and leave the 3072-4095 block for a future iteration. > > > > > > Thanks for your comments. Hmm... If I understand the words above correctly, > > > what I'd like to do is to cover the inline extents (blocks) only > > > reported by iomap_begin() rather than handling other (maybe) > > > logical-not-strictly-relevant areas such as post-EOF (even pages > > > will be finally entirely uptodated), I think such zeroed area should > > > be handled by from the point of view of the extent itself > > > > > > if (iomap_block_needs_zeroing(inode, iomap, pos)) { > > > zero_user(page, poff, plen); > > > iomap_set_range_uptodate(page, poff, plen); > > > goto done; > > > } > > > > That does work. 
But we already mapped the page to write to it, and > > we already have to zero to the end of the block. Why not zero to > > the end of the page? It saves an iteration around the loop, it saves > > a mapping of the page, and it saves a call to flush_dcache_page(). > > I completely understand your concern, and that's also (sort of) why I > left iomap_read_inline_page() to make the old !pos behavior as before. > > Anyway, I could update Christoph's patch to behave like what you > suggested. Will do later since I'm now taking some rest... Looking forward to that for some testing; Christoph's version was already looking pretty good. This code is a bit brittle, hopefully less so with the recent iop fixes on iomap-for-next. > > > The benefits I can think out are 1) it makes the logic understand > > > easier and no special cases just for tail-packing handling 2) it can > > > be then used for any inline extent cases (I mean e.g. in the middle of > > > the file) rather than just tail-packing inline blocks although currently > > > there is a BUG_ON to prevent this but it's easier to extend even further. > > > 3) it can be used as a part for later partial page uptodate logic in > > > order to match the legacy buffer_head logic (I remember something if my > > > memory is not broken about this...) > > > > Hopefully the legacy buffer_head logic will go away soon. > > Hmmm.. I partially agree on this (I agree buffer_head is a legacy stuff > but...), considering some big PAGE_SIZE like 64kb or bigger, partial > uptodate can save I/O for random file read pattern in general (not mmap > read, yes, also considering readahead, but I received some regression > due to I/O amplification like this when I was at the previous * 2 company). Thanks, Andreas
Hi Andreas, Christoph, Matthew, and all, On Fri, Jul 16, 2021 at 05:53:19PM +0200, Andreas Grünbacher wrote: > Am Fr., 16. Juli 2021 um 17:03 Uhr schrieb Gao Xiang > <hsiangkao@linux.alibaba.com>: > > On Fri, Jul 16, 2021 at 03:44:04PM +0100, Matthew Wilcox wrote: > > > On Fri, Jul 16, 2021 at 09:56:23PM +0800, Gao Xiang wrote: > > > > Hi Matthew, > > > > > > > > On Fri, Jul 16, 2021 at 02:02:29PM +0100, Matthew Wilcox wrote: > > > > > On Fri, Jul 16, 2021 at 01:07:23PM +0800, Gao Xiang wrote: > > > > > > This tries to add tail packing inline read to iomap. Different from > > > > > > the previous approach, it only marks the block range uptodate in the > > > > > > page it covers. > > > > > > > > > > Why? This path is called under two circumstances: readahead and readpage. > > > > > In both cases, we're trying to bring the entire page uptodate. The inline > > > > > extent is always the tail of the file, so we may as well zero the part of > > > > > the page past the end of file and mark the entire page uptodate instead > > > > > and leaving the end of the page !uptodate. > > > > > > > > > > I see the case where, eg, we have the first 2048 bytes of the file > > > > > out-of-inode and then 20 bytes in the inode. So we'll create the iop > > > > > for the head of the file, but then we may as well finish the entire > > > > > PAGE_SIZE chunk as part of this iteration rather than update 2048-3071 > > > > > as being uptodate and leave the 3072-4095 block for a future iteration. > > > > > > > > Thanks for your comments. Hmm... 
If I understand the words above correctly, > > > > what I'd like to do is to cover the inline extents (blocks) only > > > > reported by iomap_begin() rather than handling other (maybe) > > > > logical-not-strictly-relevant areas such as post-EOF (even pages > > > > will be finally entirely uptodated), I think such zeroed area should > > > > be handled by from the point of view of the extent itself > > > > > > > > if (iomap_block_needs_zeroing(inode, iomap, pos)) { > > > > zero_user(page, poff, plen); > > > > iomap_set_range_uptodate(page, poff, plen); > > > > goto done; > > > > } > > > > > > That does work. But we already mapped the page to write to it, and > > > we already have to zero to the end of the block. Why not zero to > > > the end of the page? It saves an iteration around the loop, it saves > > > a mapping of the page, and it saves a call to flush_dcache_page(). > > > > I completely understand your concern, and that's also (sort of) why I > > left iomap_read_inline_page() to make the old !pos behavior as before. > > > > Anyway, I could update Christoph's patch to behave like what you > > suggested. Will do later since I'm now taking some rest... > > Looking forward to that for some testing; Christoph's version was > already looking pretty good. > > This code is a bit brittle, hopefully less so with the recent iop > fixes on iomap-for-next. > Sorry about some late. I've revised a version based on Christoph's version and Matthew's thought above. I've preliminary checked with EROFS, if it does make sense, please kindly help check on the gfs2 side as well.. Thanks, Gao Xiang From 62f367245492e389711bcebbf7da5adae586299f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 16 Jul 2021 10:52:48 +0200 Subject: [PATCH] iomap: support tail packing inline read This tries to add tail packing inline read to iomap, which can support several inline tail blocks. Similar to the previous approach, it cleans post-EOF in one iteration. 
The write path remains untouched since EROFS cannot be used for testing. It'd be better to be implemented if upcoming real users care rather than leave untested dead code around. Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com> --- fs/iomap/buffered-io.c | 59 ++++++++++++++++++++++++++++-------------- fs/iomap/direct-io.c | 6 +++-- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 87ccb3438bec..95d4d0a76dbc 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -207,23 +207,25 @@ struct iomap_readpage_ctx { static void iomap_read_inline_data(struct inode *inode, struct page *page, - struct iomap *iomap) + struct iomap *iomap, loff_t pos) { - size_t size = i_size_read(inode); + unsigned int size = iomap->length + pos - iomap->offset; + unsigned int poff = offset_in_page(pos); void *addr; - if (PageUptodate(page)) - return; - - BUG_ON(page_has_private(page)); - BUG_ON(page->index); - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); + /* inline source data must be inside a single page */ + BUG_ON(iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data)); + /* handle tail-packing blocks cross the current page into the next */ + if (size > PAGE_SIZE - poff) + size = PAGE_SIZE - poff; addr = kmap_atomic(page); - memcpy(addr, iomap->inline_data, size); - memset(addr + size, 0, PAGE_SIZE - size); + memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size); + memset(addr + poff + size, 0, PAGE_SIZE - poff - size); kunmap_atomic(addr); - SetPageUptodate(page); + flush_dcache_page(page); + + iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff); } static inline bool iomap_block_needs_zeroing(struct inode *inode, @@ -240,24 +242,29 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, { struct iomap_readpage_ctx *ctx = data; struct page *page = ctx->cur_page; - struct iomap_page *iop; + struct iomap_page *iop = NULL; bool same_page 
= false, is_contig = false; loff_t orig_pos = pos; unsigned poff, plen; sector_t sector; - if (iomap->type == IOMAP_INLINE) { - WARN_ON_ONCE(pos); - iomap_read_inline_data(inode, page, iomap); - return PAGE_SIZE; - } + if (iomap->type == IOMAP_INLINE && !pos) + WARN_ON_ONCE(to_iomap_page(page) != NULL); + else + iop = iomap_page_create(inode, page); - /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(inode, page); + /* needs to skip some leading uptodated blocks */ iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen); if (plen == 0) goto done; + if (iomap->type == IOMAP_INLINE) { + iomap_read_inline_data(inode, page, iomap, pos); + plen = PAGE_SIZE - poff; + goto done; + } + + /* zero post-eof blocks as the page may be mapped */ if (iomap_block_needs_zeroing(inode, iomap, pos)) { zero_user(page, poff, plen); iomap_set_range_uptodate(page, poff, plen); @@ -589,6 +596,18 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, return 0; } +static int iomap_write_begin_inline(struct inode *inode, loff_t pos, + struct page *page, struct iomap *srcmap) +{ + /* needs more work for the tailpacking case, disable for now */ + if (WARN_ON_ONCE(pos != 0)) + return -EIO; + if (PageUptodate(page)) + return 0; + iomap_read_inline_data(inode, page, srcmap, pos); + return 0; +} + static int iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, struct page **pagep, struct iomap *iomap, struct iomap *srcmap) @@ -618,7 +637,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, } if (srcmap->type == IOMAP_INLINE) - iomap_read_inline_data(inode, page, srcmap); + status = iomap_write_begin_inline(inode, pos, page, srcmap); else if (iomap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(page, pos, len, NULL, srcmap); else diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9398b8c31323..a70a8632df22 100644 --- a/fs/iomap/direct-io.c +++ 
b/fs/iomap/direct-io.c @@ -380,7 +380,8 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, struct iov_iter *iter = dio->submit.iter; size_t copied; - BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); + /* inline data must be inside a single page */ + BUG_ON(length > PAGE_SIZE - offset_in_page(iomap->inline_data)); if (dio->flags & IOMAP_DIO_WRITE) { loff_t size = inode->i_size; @@ -394,7 +395,8 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, mark_inode_dirty(inode); } } else { - copied = copy_to_iter(iomap->inline_data + pos, length, iter); + copied = copy_to_iter(iomap->inline_data + pos - iomap->offset, + length, iter); } dio->size += copied; return copied;
On Sat, Jul 17, 2021 at 09:38:18PM +0800, Gao Xiang wrote: > Sorry about some late. I've revised a version based on Christoph's > version and Matthew's thought above. I've preliminary checked with > EROFS, if it does make sense, please kindly help check on the gfs2 > side as well.. I don't understand how this bit works: > struct page *page = ctx->cur_page; > - struct iomap_page *iop; > + struct iomap_page *iop = NULL; > bool same_page = false, is_contig = false; > loff_t orig_pos = pos; > unsigned poff, plen; > sector_t sector; > > - if (iomap->type == IOMAP_INLINE) { > - WARN_ON_ONCE(pos); > - iomap_read_inline_data(inode, page, iomap); > - return PAGE_SIZE; > - } > + if (iomap->type == IOMAP_INLINE && !pos) > + WARN_ON_ONCE(to_iomap_page(page) != NULL); > + else > + iop = iomap_page_create(inode, page); Imagine you have a file with bytes 0-2047 in an extent which is !INLINE and bytes 2048-2051 in the INLINE extent. When you read the page, first you create an iop for the !INLINE extent. Then this function is called again for the INLINE extent and you'll hit the WARN_ON_ONCE. No?
Hi Matthew, On Sat, Jul 17, 2021 at 04:01:38PM +0100, Matthew Wilcox wrote: > On Sat, Jul 17, 2021 at 09:38:18PM +0800, Gao Xiang wrote: > > Sorry about some late. I've revised a version based on Christoph's > > version and Matthew's thought above. I've preliminary checked with > > EROFS, if it does make sense, please kindly help check on the gfs2 > > side as well.. > > I don't understand how this bit works: This part inherited from the Christoph version without change. The following thoughts are just my own understanding... > > > struct page *page = ctx->cur_page; > > - struct iomap_page *iop; > > + struct iomap_page *iop = NULL; > > bool same_page = false, is_contig = false; > > loff_t orig_pos = pos; > > unsigned poff, plen; > > sector_t sector; > > > > - if (iomap->type == IOMAP_INLINE) { > > - WARN_ON_ONCE(pos); > > - iomap_read_inline_data(inode, page, iomap); > > - return PAGE_SIZE; > > - } > > + if (iomap->type == IOMAP_INLINE && !pos) > > + WARN_ON_ONCE(to_iomap_page(page) != NULL); > > + else > > + iop = iomap_page_create(inode, page); > > Imagine you have a file with bytes 0-2047 in an extent which is !INLINE > and bytes 2048-2051 in the INLINE extent. When you read the page, first > you create an iop for the !INLINE extent. Then this function is called Yes, it first created an iop for the !INLINE extent. > again for the INLINE extent and you'll hit the WARN_ON_ONCE. No? If it is called again with another INLINE extent, pos will be non-0? so (!pos) == false. Am I missing something? Thanks, Gao Xiang >
On Sat, Jul 17, 2021 at 11:15:58PM +0800, Gao Xiang wrote: > Hi Matthew, > > On Sat, Jul 17, 2021 at 04:01:38PM +0100, Matthew Wilcox wrote: > > On Sat, Jul 17, 2021 at 09:38:18PM +0800, Gao Xiang wrote: > > > Sorry about some late. I've revised a version based on Christoph's > > > version and Matthew's thought above. I've preliminary checked with > > > EROFS, if it does make sense, please kindly help check on the gfs2 > > > side as well.. > > > > I don't understand how this bit works: > > This part inherited from the Christoph version without change. > The following thoughts are just my own understanding... > > > > > > struct page *page = ctx->cur_page; > > > - struct iomap_page *iop; > > > + struct iomap_page *iop = NULL; > > > bool same_page = false, is_contig = false; > > > loff_t orig_pos = pos; > > > unsigned poff, plen; > > > sector_t sector; > > > > > > - if (iomap->type == IOMAP_INLINE) { > > > - WARN_ON_ONCE(pos); > > > - iomap_read_inline_data(inode, page, iomap); > > > - return PAGE_SIZE; > > > - } > > > + if (iomap->type == IOMAP_INLINE && !pos) > > > + WARN_ON_ONCE(to_iomap_page(page) != NULL); > > > + else > > > + iop = iomap_page_create(inode, page); > > > > Imagine you have a file with bytes 0-2047 in an extent which is !INLINE > > and bytes 2048-2051 in the INLINE extent. When you read the page, first > > you create an iop for the !INLINE extent. Then this function is called > > Yes, it first created an iop for the !INLINE extent. > > > again for the INLINE extent and you'll hit the WARN_ON_ONCE. No? > > If it is called again with another INLINE extent, pos will be non-0? > so (!pos) == false. Am I missing something? Well, either sense of a WARN_ON is wrong. For a file which is PAGE_SIZE + 3 bytes in size, to_iomap_page() will be NULL. For a file which is PAGE_SIZE/2 + 3 bytes in size, to_iomap_page() will not be NULL. (assuming the block size is <= PAGE_SIZE / 2). 
I think we need a prep patch that looks something like this: +++ b/fs/iomap/buffered-io.c @@ -252,8 +252,12 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, return PAGE_SIZE; } + if (offset_in_page(pos) || length < PAGE_SIZE) + iop = iomap_page_create(inode, page); + else + iop = NULL; + /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(inode, page); iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen); if (plen == 0) goto done; ie first get the conditions right under which we should create an iop.
On Sat, Jul 17, 2021 at 09:38:18PM +0800, Gao Xiang wrote: > >From 62f367245492e389711bcebbf7da5adae586299f Mon Sep 17 00:00:00 2001 > From: Christoph Hellwig <hch@lst.de> > Date: Fri, 16 Jul 2021 10:52:48 +0200 > Subject: [PATCH] iomap: support tail packing inline read I'd still credit this to you as you did all the hard work. > + /* inline source data must be inside a single page */ > + BUG_ON(iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data)); > + /* handle tail-packing blocks cross the current page into the next */ > + if (size > PAGE_SIZE - poff) > + size = PAGE_SIZE - poff; This should probably use min or min_t. > > addr = kmap_atomic(page); > - memcpy(addr, iomap->inline_data, size); > - memset(addr + size, 0, PAGE_SIZE - size); > + memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size); > + memset(addr + poff + size, 0, PAGE_SIZE - poff - size); > kunmap_atomic(addr); > - SetPageUptodate(page); > + flush_dcache_page(page); The flush_dcache_page addition should be a separate patch. > > if (dio->flags & IOMAP_DIO_WRITE) { > loff_t size = inode->i_size; > @@ -394,7 +395,8 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, > mark_inode_dirty(inode); > } > } else { > - copied = copy_to_iter(iomap->inline_data + pos, length, iter); > + copied = copy_to_iter(iomap->inline_data + pos - iomap->offset, > + length, iter); We also need to take the offset into account for the write side. I guess it would be nice to have a local variable for the inline address to not duplicate that calculation multiple times.
On Sat, Jul 17, 2021 at 07:40:41PM +0100, Matthew Wilcox wrote: > Well, either sense of a WARN_ON is wrong. > > For a file which is PAGE_SIZE + 3 bytes in size, to_iomap_page() will > be NULL. For a file which is PAGE_SIZE/2 + 3 bytes in size, > to_iomap_page() will not be NULL. (assuming the block size is <= > PAGE_SIZE / 2). > > I think we need a prep patch that looks something like this: Something like this is where we should eventually end up, but it also affects the read from disk case so we need to be careful.
On Mon, Jul 19, 2021 at 12:15:47PM +0100, Christoph Hellwig wrote: > On Sat, Jul 17, 2021 at 09:38:18PM +0800, Gao Xiang wrote: > > >From 62f367245492e389711bcebbf7da5adae586299f Mon Sep 17 00:00:00 2001 > > From: Christoph Hellwig <hch@lst.de> > > Date: Fri, 16 Jul 2021 10:52:48 +0200 > > Subject: [PATCH] iomap: support tail packing inline read > > I'd still credit this to you as you did all the hard work. Ok. > > > + /* inline source data must be inside a single page */ > > + BUG_ON(iomap->length > PAGE_SIZE - offset_in_page(iomap->inline_data)); > > + /* handle tail-packing blocks cross the current page into the next */ > > + if (size > PAGE_SIZE - poff) > > + size = PAGE_SIZE - poff; > > This should probably use min or min_t. Okay, will update. > > > > > addr = kmap_atomic(page); > > - memcpy(addr, iomap->inline_data, size); > > - memset(addr + size, 0, PAGE_SIZE - size); > > + memcpy(addr + poff, iomap->inline_data - iomap->offset + pos, size); > > + memset(addr + poff + size, 0, PAGE_SIZE - poff - size); > > kunmap_atomic(addr); > > - SetPageUptodate(page); > > + flush_dcache_page(page); > > The flush_dcache_page addition should be a separate patch. I wondered what is the target order of recent iomap patches, since this is a quite small change, so I'd like to just rebase on for-next for now . So ok, I won't touch this in this patchset and I think flush_dcache_page() does no harm to x86(_64) and arm(64) at least if I remember correctly. > > > > > if (dio->flags & IOMAP_DIO_WRITE) { > > loff_t size = inode->i_size; > > @@ -394,7 +395,8 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, > > mark_inode_dirty(inode); > > } > > } else { > > - copied = copy_to_iter(iomap->inline_data + pos, length, iter); > > + copied = copy_to_iter(iomap->inline_data + pos - iomap->offset, > > + length, iter); > > We also need to take the offset into account for the write side. 
> I guess it would be nice to have a local variable for the inline > address to not duplicate that calculation multiple times. ok. Will update. Thanks, Gao Xiang
On Mon, Jul 19, 2021 at 12:19:34PM +0100, Christoph Hellwig wrote: > On Sat, Jul 17, 2021 at 07:40:41PM +0100, Matthew Wilcox wrote: > > Well, either sense of a WARN_ON is wrong. > > > > For a file which is PAGE_SIZE + 3 bytes in size, to_iomap_page() will > > be NULL. For a file which is PAGE_SIZE/2 + 3 bytes in size, > > to_iomap_page() will not be NULL. (assuming the block size is <= > > PAGE_SIZE / 2). > > > > I think we need a prep patch that looks something like this: > > Something like this is where we should eventually end up, but it > also affects the read from disk case so we need to be careful. I also think it'd be better to leave this hunk as it-is (don't touch it in this patch), I mean just iop = iomap_page_create(inode, page); as https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/iomap/buffered-io.c?id=229adf3c64dbeae4e2f45fb561907ada9fcc0d0c#n256 since iomap_read_inline_data() now calls iomap_set_range_uptodate() to set blocks uptodate rather than SetPageUptodate() directly and we also have iomap_page_release() as well. Some follow-up optimized patch can be raised up independently since it's somewhat out of current topic for now. Thanks, Gao Xiang
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 9023717c5188..c6d6d7f9d5a6 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -206,7 +206,7 @@ struct iomap_readpage_ctx { }; static void -iomap_read_inline_data(struct inode *inode, struct page *page, +iomap_read_inline_page(struct inode *inode, struct page *page, struct iomap *iomap) { size_t size = i_size_read(inode); @@ -225,10 +225,33 @@ iomap_read_inline_data(struct inode *inode, struct page *page, SetPageUptodate(page); } +/* + * Different from iomap_read_inline_page, which makes the range of + * some tail blocks in the page uptodate and doesn't clean post-EOF. + */ +static void +iomap_read_inline_data(struct inode *inode, struct page *page, + struct iomap *iomap, loff_t pos, unsigned int plen) +{ + unsigned int poff = offset_in_page(pos); + unsigned int delta = pos - iomap->offset; + unsigned int alignedsize = roundup(plen, i_blocksize(inode)); + void *addr; + + /* make sure that inline_data doesn't cross page boundary */ + BUG_ON(plen > PAGE_SIZE - offset_in_page(iomap->inline_data)); + BUG_ON(plen != i_size_read(inode) - pos); + addr = kmap_atomic(page); + memcpy(addr + poff, iomap->inline_data + delta, plen); + memset(addr + poff + plen, 0, alignedsize - plen); + kunmap_atomic(addr); + iomap_set_range_uptodate(page, poff, alignedsize); +} + static inline bool iomap_block_needs_zeroing(struct inode *inode, struct iomap *iomap, loff_t pos) { - return iomap->type != IOMAP_MAPPED || + return (iomap->type != IOMAP_MAPPED && iomap->type != IOMAP_INLINE) || (iomap->flags & IOMAP_F_NEW) || pos >= i_size_read(inode); } @@ -245,9 +268,8 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, unsigned poff, plen; sector_t sector; - if (iomap->type == IOMAP_INLINE) { - WARN_ON_ONCE(pos); - iomap_read_inline_data(inode, page, iomap); + if (iomap->type == IOMAP_INLINE && !pos) { + iomap_read_inline_page(inode, page, iomap); return PAGE_SIZE; } @@ -262,6 
+284,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, goto done; } + if (iomap->type == IOMAP_INLINE) { + iomap_read_inline_data(inode, page, iomap, pos, plen); + goto done; + } ctx->cur_page_in_bio = true; if (iop) atomic_add(plen, &iop->read_bytes_pending); @@ -598,6 +624,9 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, BUG_ON(pos + len > iomap->offset + iomap->length); if (srcmap != iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); + /* no available tail-packing write user yet, never allow it for now */ + if (WARN_ON_ONCE(srcmap->type == IOMAP_INLINE && iomap->offset)) + return -EIO; if (fatal_signal_pending(current)) return -EINTR; @@ -616,7 +645,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, } if (srcmap->type == IOMAP_INLINE) - iomap_read_inline_data(inode, page, srcmap); + iomap_read_inline_page(inode, page, srcmap); else if (iomap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(page, pos, len, NULL, srcmap); else diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9398b8c31323..a905939dea4e 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -380,7 +380,10 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, struct iov_iter *iter = dio->submit.iter; size_t copied; - BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); + if (WARN_ON_ONCE(pos && (dio->flags & IOMAP_DIO_WRITE))) + return -EIO; + /* inline data should be in the same page boundary */ + BUG_ON(length > PAGE_SIZE - offset_in_page(iomap->inline_data)); if (dio->flags & IOMAP_DIO_WRITE) { loff_t size = inode->i_size; @@ -394,7 +397,8 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, mark_inode_dirty(inode); } } else { - copied = copy_to_iter(iomap->inline_data + pos, length, iter); + copied = copy_to_iter(iomap->inline_data + pos - iomap->offset, + length, iter); } dio->size += 
copied; return copied;
This tries to add tail packing inline read to iomap. Different from the previous approach, it only marks the block range uptodate in the page it covers. Also, the original pos == 0 case is kept as a fast path, renamed to iomap_read_inline_page(). The write path remains untouched since it cannot be tested with EROFS. It'd be better to implement it once real users show up, rather than leave untested dead code around. Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com> --- fs/iomap/buffered-io.c | 41 +++++++++++++++++++++++++++++++++++------ fs/iomap/direct-io.c | 8 ++++++-- 2 files changed, 41 insertions(+), 8 deletions(-)