Message ID | 20240607143919.2622319-3-john.g.garry@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | block atomic writes for xfs | expand |
On Fri, Jun 07, 2024 at 02:38:59PM +0000, John Garry wrote: > Allow filesystems to set the io_block_size for sub-fs block size zeroing, > as in future we will want to extend this feature to support zeroing of > block sizes of larger than the inode block size. > > The value in io_block_size does not have to be a power-of-2, so fix up > zeroing code to handle that. > > Signed-off-by: John Garry <john.g.garry@oracle.com> > --- > block/fops.c | 1 + > fs/btrfs/inode.c | 1 + > fs/erofs/data.c | 1 + > fs/erofs/zmap.c | 1 + > fs/ext2/inode.c | 1 + > fs/ext4/extents.c | 1 + > fs/ext4/inode.c | 1 + > fs/f2fs/data.c | 1 + > fs/fuse/dax.c | 1 + > fs/gfs2/bmap.c | 1 + > fs/hpfs/file.c | 1 + > fs/iomap/direct-io.c | 23 +++++++++++++++++++---- > fs/xfs/xfs_iomap.c | 1 + > fs/zonefs/file.c | 2 ++ > include/linux/iomap.h | 2 ++ > 15 files changed, 35 insertions(+), 4 deletions(-) > > diff --git a/block/fops.c b/block/fops.c > index 9d6d86ebefb9..020443078630 100644 > --- a/block/fops.c > +++ b/block/fops.c > @@ -402,6 +402,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, > iomap->addr = iomap->offset; > iomap->length = isize - iomap->offset; > iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ > + iomap->io_block_size = i_blocksize(inode); > return 0; > } > <snip a bunch of filesystems setting io_block_size to i_blocksize> > diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c > index 1bb8d97cd9ae..5d2718faf520 100644 > --- a/fs/hpfs/file.c > +++ b/fs/hpfs/file.c > @@ -149,6 +149,7 @@ static int hpfs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, > iomap->addr = IOMAP_NULL_ADDR; > iomap->length = 1 << blkbits; > } > + iomap->io_block_size = i_blocksize(inode); HPFS does iomap now? Yikes. 
> > hpfs_unlock(sb); > return 0; > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c > index f3b43d223a46..5be8d886ab4a 100644 > --- a/fs/iomap/direct-io.c > +++ b/fs/iomap/direct-io.c > @@ -277,7 +277,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, > { > const struct iomap *iomap = &iter->iomap; > struct inode *inode = iter->inode; > - unsigned int fs_block_size = i_blocksize(inode), pad; > + u64 io_block_size = iomap->io_block_size; I wonder, should iomap be nice and not require filesystems to set io_block_size themselves unless they really need it? Anyone working on an iomap port while this patchset is in progress may or may not remember to add this bit if they get their port merged after atomicwrites is merged; and you might not remember to prevent the bitrot if the reverse order happens. u64 io_block_size = iomap->io_block_size ?: i_blocksize(inode); > loff_t length = iomap_length(iter); > loff_t pos = iter->pos; > blk_opf_t bio_opf; > @@ -287,6 +287,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, > int nr_pages, ret = 0; > size_t copied = 0; > size_t orig_count; > + unsigned int pad; > > if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || > !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) > @@ -355,7 +356,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, > > if (need_zeroout) { > /* zero out from the start of the block to the write offset */ > - pad = pos & (fs_block_size - 1); > + if (is_power_of_2(io_block_size)) { > + pad = pos & (io_block_size - 1); > + } else { > + loff_t _pos = pos; > + > + pad = do_div(_pos, io_block_size); > + } Please don't opencode this twice. static unsigned int offset_in_block(loff_t pos, u64 blocksize) { if (likely(is_power_of_2(blocksize))) return pos & (blocksize - 1); return do_div(pos, blocksize); } pad = offset_in_block(pos, io_block_size); if (pad) ... Also, what happens if pos-pad points to a byte before the mapping? 
> + > if (pad) > iomap_dio_zero(iter, dio, pos - pad, pad); > } > @@ -429,9 +437,16 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, > if (need_zeroout || > ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { > /* zero out from the end of the write to the end of the block */ > - pad = pos & (fs_block_size - 1); > + if (is_power_of_2(io_block_size)) { > + pad = pos & (io_block_size - 1); > + } else { > + loff_t _pos = pos; > + > + pad = do_div(_pos, io_block_size); > + } > + > if (pad) > - iomap_dio_zero(iter, dio, pos, fs_block_size - pad); > + iomap_dio_zero(iter, dio, pos, io_block_size - pad); What if pos + io_block_size - pad points to a byte after the end of the mapping? > } > out: > /* Undo iter limitation to current extent */ > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c > index 378342673925..ecb4cae88248 100644 > --- a/fs/xfs/xfs_iomap.c > +++ b/fs/xfs/xfs_iomap.c > @@ -127,6 +127,7 @@ xfs_bmbt_to_iomap( > } > iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); > iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); > + iomap->io_block_size = i_blocksize(VFS_I(ip)); > if (mapping_flags & IOMAP_DAX) > iomap->dax_dev = target->bt_daxdev; > else > diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c > index 3b103715acc9..bf2cc4bee309 100644 > --- a/fs/zonefs/file.c > +++ b/fs/zonefs/file.c > @@ -50,6 +50,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, > iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; > iomap->length = isize - iomap->offset; > } > + iomap->io_block_size = i_blocksize(inode); > mutex_unlock(&zi->i_truncate_mutex); > > trace_zonefs_iomap_begin(inode, iomap); > @@ -99,6 +100,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, > iomap->type = IOMAP_MAPPED; > iomap->length = isize - iomap->offset; > } > + iomap->io_block_size = i_blocksize(inode); > mutex_unlock(&zi->i_truncate_mutex); > > trace_zonefs_iomap_begin(inode, iomap); > diff --git 
a/include/linux/iomap.h b/include/linux/iomap.h > index 6fc1c858013d..d63a35b77907 100644 > --- a/include/linux/iomap.h > +++ b/include/linux/iomap.h > @@ -103,6 +103,8 @@ struct iomap { > void *private; /* filesystem private */ > const struct iomap_folio_ops *folio_ops; > u64 validity_cookie; /* used with .iomap_valid() */ > + /* io block zeroing size, not necessarily a power-of-2 */ size in bytes? I'm not sure what "io block zeroing" means. What are you trying to accomplish here? Let's say the fsblock size is 4k and the allocation unit (aka the atomic write size) is 16k. Userspace wants a direct write to file offset 8192-12287, and that space is unwritten: uuuu ^ Currently we'd just write the 4k and run the io completion handler, so the final state is: uuWu Instead, if the fs sets io_block_size to 16384, does this direct write now amplify into a full 16k write? With the end result being: ZZWZ only.... I don't see the unwritten areas being converted to written? I guess for an atomic write you'd require the user to write 0-16383? <still confused about why we need to do this, maybe i'll figure it out as I go along> --D > + u64 io_block_size; > }; > > static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) > -- > 2.31.1 > >
On 12/06/2024 22:32, Darrick J. Wong wrote: >> unsigned int fs_block_size = i_blocksize(inode), pad; >> + u64 io_block_size = iomap->io_block_size; > I wonder, should iomap be nice and not require filesystems to set > io_block_size themselves unless they really need it? That's what I had in v3, like: if (iomap->io_block_size) io_block_size = iomap->io_block_size; else io_block_size = i_block_size(inode) but it was suggested to change that (to like what I have here). > Anyone working on > an iomap port while this patchset is in progress may or may not remember > to add this bit if they get their port merged after atomicwrites is > merged; and you might not remember to prevent the bitrot if the reverse > order happens. Sure, I get your point. However, OTOH, if we check xfs_bmbt_to_iomap(), it does set all or close to all members of struct iomap, so we are just continuing that trend, i.e. it is the job of the FS callback to set all these members. > > u64 io_block_size = iomap->io_block_size ?: i_blocksize(inode); > >> loff_t length = iomap_length(iter); >> loff_t pos = iter->pos; >> blk_opf_t bio_opf; >> @@ -287,6 +287,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, >> int nr_pages, ret = 0; >> size_t copied = 0; >> size_t orig_count; >> + unsigned int pad; >> >> if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || >> !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) >> @@ -355,7 +356,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, >> >> if (need_zeroout) { >> /* zero out from the start of the block to the write offset */ >> - pad = pos & (fs_block_size - 1); >> + if (is_power_of_2(io_block_size)) { >> + pad = pos & (io_block_size - 1); >> + } else { >> + loff_t _pos = pos; >> + >> + pad = do_div(_pos, io_block_size); >> + } > Please don't opencode this twice. 
> > static unsigned int offset_in_block(loff_t pos, u64 blocksize) > { > if (likely(is_power_of_2(blocksize))) > return pos & (blocksize - 1); > return do_div(pos, blocksize); > } ok, fine > > pad = offset_in_block(pos, io_block_size); > if (pad) > ... > > Also, what happens if pos-pad points to a byte before the mapping? It's the job of the FS to map in something aligned to io_block_size. Having said that, I don't think we are doing that for XFS (which sets io_block_size > i_block_size(inode)), so I need to check that. > >> + >> if (pad) >> iomap_dio_zero(iter, dio, pos - pad, pad); >> } >> @@ -429,9 +437,16 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, >> if (need_zeroout || >> ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { >> /* zero out from the end of the write to the end of the block */ >> - pad = pos & (fs_block_size - 1); >> + if (is_power_of_2(io_block_size)) { >> + pad = pos & (io_block_size - 1); >> + } else { >> + loff_t _pos = pos; >> + >> + pad = do_div(_pos, io_block_size); >> + } >> + >> if (pad) >> - iomap_dio_zero(iter, dio, pos, fs_block_size - pad); >> + iomap_dio_zero(iter, dio, pos, io_block_size - pad); > What if pos + io_block_size - pad points to a byte after the end of the > mapping? 
as above, we expect this to be mapped in (so ok to zero) > >> } >> out: >> /* Undo iter limitation to current extent */ >> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c >> index 378342673925..ecb4cae88248 100644 >> --- a/fs/xfs/xfs_iomap.c >> +++ b/fs/xfs/xfs_iomap.c >> @@ -127,6 +127,7 @@ xfs_bmbt_to_iomap( >> } >> iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); >> iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); >> + iomap->io_block_size = i_blocksize(VFS_I(ip)); >> if (mapping_flags & IOMAP_DAX) >> iomap->dax_dev = target->bt_daxdev; >> else >> diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c >> index 3b103715acc9..bf2cc4bee309 100644 >> --- a/fs/zonefs/file.c >> +++ b/fs/zonefs/file.c >> @@ -50,6 +50,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, >> iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; >> iomap->length = isize - iomap->offset; >> } >> + iomap->io_block_size = i_blocksize(inode); >> mutex_unlock(&zi->i_truncate_mutex); >> >> trace_zonefs_iomap_begin(inode, iomap); >> @@ -99,6 +100,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, >> iomap->type = IOMAP_MAPPED; >> iomap->length = isize - iomap->offset; >> } >> + iomap->io_block_size = i_blocksize(inode); >> mutex_unlock(&zi->i_truncate_mutex); >> >> trace_zonefs_iomap_begin(inode, iomap); >> diff --git a/include/linux/iomap.h b/include/linux/iomap.h >> index 6fc1c858013d..d63a35b77907 100644 >> --- a/include/linux/iomap.h >> +++ b/include/linux/iomap.h >> @@ -103,6 +103,8 @@ struct iomap { >> void *private; /* filesystem private */ >> const struct iomap_folio_ops *folio_ops; >> u64 validity_cookie; /* used with .iomap_valid() */ >> + /* io block zeroing size, not necessarily a power-of-2 */ > size in bytes? > > I'm not sure what "io block zeroing" means. Naming is hard. Essentially we are trying to reuse the sub-fs block zeroing code for sub-extent granule writes. More below. 
> What are you trying to > accomplish here? Let's say the fsblock size is 4k and the allocation > unit (aka the atomic write size) is 16k. ok, so I say here that the extent granule is 16k > Userspace wants a direct write > to file offset 8192-12287, and that space is unwritten: > > uuuu > ^ > > Currently we'd just write the 4k and run the io completion handler, so > the final state is: > > uuWu > > Instead, if the fs sets io_block_size to 16384, does this direct write > now amplify into a full 16k write? Yes, but only when the extent is newly allocated and we require zeroing. > With the end result being: > ZZWZ Yes > > only.... I don't see the unwritten areas being converted to written? See xfs_iomap_write_unwritten() change in the next patch > I guess for an atomic write you'd require the user to write 0-16383? Not exactly > > <still confused about why we need to do this, maybe i'll figure it out > as I go along> This zeroing is just really required for atomic writes. The purpose is to zero the extent granule for any write within a newly allocated granule. Consider we have uuWu, above. If the user then attempts to write the full 16K as an atomic write, the iomap iter code would generate writes for sizes 8k, 4k, and 4k, i.e. not a single 16K write. This is not acceptable. So the idea is to zero the full extent granule when allocated, so we have ZZWZ after the 4k write at offset 8192, above. As such, if we then attempt this 16K atomic write, we get a single 16K BIO, i.e. there is no unwritten extent conversion. I am not sure if we should be doing this only for atomic writes inodes, or also forcealign only or RT. Thanks, John
On Thu, Jun 13, 2024 at 11:31:35AM +0100, John Garry wrote: > On 12/06/2024 22:32, Darrick J. Wong wrote: > > > unsigned int fs_block_size = i_blocksize(inode), pad; > > > + u64 io_block_size = iomap->io_block_size; > > I wonder, should iomap be nice and not require filesystems to set > > io_block_size themselves unless they really need it? > > That's what I had in v3, like: > > if (iomap->io_block_size) > io_block_size = iomap->io_block_size; > else > io_block_size = i_block_size(inode) > > but it was suggested to change that (to like what I have here). oh, ok. Ignore that comment, then. :) > > Anyone working on > > an iomap port while this patchset is in progress may or may not remember > > to add this bit if they get their port merged after atomicwrites is > > merged; and you might not remember to prevent the bitrot if the reverse > > order happens. > > Sure, I get your point. > > However, OTOH, if we check xfs_bmbt_to_iomap(), it does set all or close to > all members of struct iomap, so we are just continuing that trend, i.e. it > is the job of the FS callback to set all these members. 
> > > > > u64 io_block_size = iomap->io_block_size ?: i_blocksize(inode); > > > > > loff_t length = iomap_length(iter); > > > loff_t pos = iter->pos; > > > blk_opf_t bio_opf; > > > @@ -287,6 +287,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, > > > int nr_pages, ret = 0; > > > size_t copied = 0; > > > size_t orig_count; > > > + unsigned int pad; > > > if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || > > > !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) > > > @@ -355,7 +356,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, > > > if (need_zeroout) { > > > /* zero out from the start of the block to the write offset */ > > > - pad = pos & (fs_block_size - 1); > > > + if (is_power_of_2(io_block_size)) { > > > + pad = pos & (io_block_size - 1); > > > + } else { > > > + loff_t _pos = pos; > > > + > > > + pad = do_div(_pos, io_block_size); > > > + } > > Please don't opencode this twice. > > > > static unsigned int offset_in_block(loff_t pos, u64 blocksize) > > { > > if (likely(is_power_of_2(blocksize))) > > return pos & (blocksize - 1); > > return do_div(pos, blocksize); > > } > > ok, fine > > > > > pad = offset_in_block(pos, io_block_size); > > if (pad) > > ... > > > > Also, what happens if pos-pad points to a byte before the mapping? > > It's the job of the FS to map in something aligned to io_block_size. Having > said that, I don't think we are doing that for XFS (which sets io_block_size > > i_block_size(inode)), so I need to check that. <nod> You can only play with the mapping that the fs gave you. If xfs doesn't give you a big enough mapping, then that's a programming bug to WARN_ON_ONCE about and return EIO. I hadn't realized that the ->iomap_begin function is required to provide mappings that are aligned to io_block_size. 
> > > > > > + > > > if (pad) > > > iomap_dio_zero(iter, dio, pos - pad, pad); > > > } > > > @@ -429,9 +437,16 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, > > > if (need_zeroout || > > > ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { > > > /* zero out from the end of the write to the end of the block */ > > > - pad = pos & (fs_block_size - 1); > > > + if (is_power_of_2(io_block_size)) { > > > + pad = pos & (io_block_size - 1); > > > + } else { > > > + loff_t _pos = pos; > > > + > > > + pad = do_div(_pos, io_block_size); > > > + } > > > + > > > if (pad) > > > - iomap_dio_zero(iter, dio, pos, fs_block_size - pad); > > > + iomap_dio_zero(iter, dio, pos, io_block_size - pad); > > What if pos + io_block_size - pad points to a byte after the end of the > > mapping? > > as above, we expect this to be mapped in (so ok to zero) > > > > > > } > > > out: > > > /* Undo iter limitation to current extent */ > > > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c > > > index 378342673925..ecb4cae88248 100644 > > > --- a/fs/xfs/xfs_iomap.c > > > +++ b/fs/xfs/xfs_iomap.c > > > @@ -127,6 +127,7 @@ xfs_bmbt_to_iomap( > > > } > > > iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); > > > iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); > > > + iomap->io_block_size = i_blocksize(VFS_I(ip)); > > > if (mapping_flags & IOMAP_DAX) > > > iomap->dax_dev = target->bt_daxdev; > > > else > > > diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c > > > index 3b103715acc9..bf2cc4bee309 100644 > > > --- a/fs/zonefs/file.c > > > +++ b/fs/zonefs/file.c > > > @@ -50,6 +50,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, > > > iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; > > > iomap->length = isize - iomap->offset; > > > } > > > + iomap->io_block_size = i_blocksize(inode); > > > mutex_unlock(&zi->i_truncate_mutex); > > > trace_zonefs_iomap_begin(inode, iomap); > > > @@ -99,6 +100,7 @@ static int 
zonefs_write_iomap_begin(struct inode *inode, loff_t offset, > > > iomap->type = IOMAP_MAPPED; > > > iomap->length = isize - iomap->offset; > > > } > > > + iomap->io_block_size = i_blocksize(inode); > > > mutex_unlock(&zi->i_truncate_mutex); > > > trace_zonefs_iomap_begin(inode, iomap); > > > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > > > index 6fc1c858013d..d63a35b77907 100644 > > > --- a/include/linux/iomap.h > > > +++ b/include/linux/iomap.h > > > @@ -103,6 +103,8 @@ struct iomap { > > > void *private; /* filesystem private */ > > > const struct iomap_folio_ops *folio_ops; > > > u64 validity_cookie; /* used with .iomap_valid() */ > > > + /* io block zeroing size, not necessarily a power-of-2 */ > > size in bytes? > > > > I'm not sure what "io block zeroing" means. > > Naming is hard. Essentally we are trying to reuse the sub-fs block zeroing > code for sub-extent granule writes. More below. Yeah. For sub-fsblock zeroing we issue (chained) bios to write zeroes to the sectors surrounding the part we're actually writing, then we're issuing the write itself, and finally the ioend converts the mapping to unwritten. For untorn writes we're doing the same thing, but now on the level of multiple fsblocks. I guess this is all going to support a <nod> "IO granularity" ? For untorn writes I guess you want mappings that are aligned to a supported untorn write granularity; for bs > ps filesystems I guess the IO granularity is > > What are you trying to > > accomplish here? Let's say the fsblock size is 4k and the allocation > > unit (aka the atomic write size) is 16k. 
> > ok, so I say here that the extent granule is 16k > > > Userspace wants a direct write > > to file offset 8192-12287, and that space is unwritten: > > > > uuuu > > ^ > > > > Currently we'd just write the 4k and run the io completion handler, so > > the final state is: > > > > uuWu > > > > Instead, if the fs sets io_block_size to 16384, does this direct write > > now amplify into a full 16k write? > > Yes, but only when the extent is newly allocated and we require zeroing. > > > With the end result being: > > ZZWZ > > Yes > > > > > only.... I don't see the unwritten areas being converted to written? > > See xfs_iomap_write_unwritten() change in the next patch > > > I guess for an atomic write you'd require the user to write 0-16383? > > Not exactly > > > > > <still confused about why we need to do this, maybe i'll figure it out > > as I go along> > > This zeroing is just really required for atomic writes. The purpose is to > zero the extent granule for any write within a newly allocated granule. > > Consider we have uuWu, above. If the user then attempts to write the full > 16K as an atomic write, the iomap iter code would generate writes for sizes > 8k, 4k, and 4k, i.e. not a single 16K write. This is not acceptable. So the > idea is to zero the full extent granule when allocated, so we have ZZWZ > after the 4k write at offset 8192, above. As such, if we then attempt this > 16K atomic write, we get a single 16K BIO, i.e. there is no unwritten extent > conversion. Wait, are we issuing zeroing writes for 0-8191 and 12288-16383, then issuing a single atomic write for 0-16383? That won't work, because all the bios attached to an iomap_dio are submitted and execute asynchronously. I think you need ->iomap_begin to do XFS_BMAPI_ZERO allocations if the writes aren't aligned to the minimum untorn write granularity. > I am not sure if we should be doing this only for atomic writes inodes, or > also forcealign only or RT. 
I think it only applies to untorn writes because the default behavior everywhere is that writes can tear. --D > Thanks, > John > > >
On 21/06/2024 22:18, Darrick J. Wong wrote: > On Thu, Jun 13, 2024 at 11:31:35AM +0100, John Garry wrote: >> On 12/06/2024 22:32, Darrick J. Wong wrote: >>>> unsigned int fs_block_size = i_blocksize(inode), pad; >>>> + u64 io_block_size = iomap->io_block_size; >>> I wonder, should iomap be nice and not require filesystems to set >>> io_block_size themselves unless they really need it? >> >> That's what I had in v3, like: >> >> if (iomap->io_block_size) >> io_block_size = iomap->io_block_size; >> else >> io_block_size = i_block_size(inode) >> >> but it was suggested to change that (to like what I have here). > > oh, ok. Ignore that comment, then. :) > To be clear, Dave actually suggested that change. >>> Anyone working on >>> an iomap port while this patchset is in progress may or may not remember >>> to add this bit if they get their port merged after atomicwrites is >>> merged; and you might not remember to prevent the bitrot if the reverse >>> order happens. >> >> Sure, I get your point. >> >> However, OTOH, if we check xfs_bmbt_to_iomap(), it does set all or close to >> all members of struct iomap, so we are just continuing that trend, i.e. it >> is the job of the FS callback to set all these members. 
>> >>> >>> u64 io_block_size = iomap->io_block_size ?: i_blocksize(inode); >>> >>>> loff_t length = iomap_length(iter); >>>> loff_t pos = iter->pos; >>>> blk_opf_t bio_opf; >>>> @@ -287,6 +287,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, >>>> int nr_pages, ret = 0; >>>> size_t copied = 0; >>>> size_t orig_count; >>>> + unsigned int pad; >>>> if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || >>>> !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) >>>> @@ -355,7 +356,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, >>>> if (need_zeroout) { >>>> /* zero out from the start of the block to the write offset */ >>>> - pad = pos & (fs_block_size - 1); >>>> + if (is_power_of_2(io_block_size)) { >>>> + pad = pos & (io_block_size - 1); >>>> + } else { >>>> + loff_t _pos = pos; >>>> + >>>> + pad = do_div(_pos, io_block_size); >>>> + } >>> Please don't opencode this twice. >>> >>> static unsigned int offset_in_block(loff_t pos, u64 blocksize) >>> { >>> if (likely(is_power_of_2(blocksize))) >>> return pos & (blocksize - 1); >>> return do_div(pos, blocksize); >>> } >> >> ok, fine >> >>> >>> pad = offset_in_block(pos, io_block_size); >>> if (pad) >>> ... >>> >>> Also, what happens if pos-pad points to a byte before the mapping? >> >> It's the job of the FS to map in something aligned to io_block_size. Having >> said that, I don't think we are doing that for XFS (which sets io_block_size >>> i_block_size(inode)), so I need to check that. > > <nod> You can only play with the mapping that the fs gave you. > If xfs doesn't give you a big enough mapping, then that's a programming > bug to WARN_ON_ONCE about and return EIO. I think that this is pretty easy to solve by just ensuring that for an atomic write inode, the call xfs_direct_write_iomap_begin() -> xfs_bmapi_read(bno, len) is such that bno and len are extent size aligned. >> >> Naming is hard. 
Essentially we are trying to reuse the sub-fs block zeroing >> code for sub-extent granule writes. More below. > > Yeah. For sub-fsblock zeroing we issue (chained) bios to write zeroes > to the sectors surrounding the part we're actually writing, then we're > issuing the write itself, and finally the ioend converts the mapping to > unwritten. > > For untorn writes we're doing the same thing, but now on the level of > multiple fsblocks. I guess this is all going to support a > > > <nod> "IO granularity" ? For untorn writes I guess you want mappings > that are aligned to a supported untorn write granularity; for bs > ps > filesystems I guess the IO granularity is For LBS, it's still going to be inode block size. >>> >>> <still confused about why we need to do this, maybe i'll figure it out >>> as I go along> >> >> This zeroing is just really required for atomic writes. The purpose is to >> zero the extent granule for any write within a newly allocated granule. >> >> Consider we have uuWu, above. If the user then attempts to write the full >> 16K as an atomic write, the iomap iter code would generate writes for sizes >> 8k, 4k, and 4k, i.e. not a single 16K write. This is not acceptable. So the >> idea is to zero the full extent granule when allocated, so we have ZZWZ >> after the 4k write at offset 8192, above. As such, if we then attempt this >> 16K atomic write, we get a single 16K BIO, i.e. there is no unwritten extent >> conversion. > > Wait, are we issuing zeroing writes for 0-8191 and 12288-16383, then > issuing a single atomic write for 0-16383? When we have uuuu and attempt the first 4k write @ offset 8k, we also issue zeroes for 0-8191 and 12288-16383. But this is done synchronously. We are leveraging the existing code to issue the write with the exclusive IOLOCK in xfs_file_dio_write_unaligned(), so no one else can access that data while we do that initial write+zeroing to the extent. 
> That won't work, because all > the bios attached to an iomap_dio are submitted and execute > asynchronously. I think you need ->iomap_begin to do XFS_BMAPI_ZERO > allocations if the writes aren't aligned to the minimum untorn write > granularity. > >> I am not sure if we should be doing this only for atomic writes inodes, or >> also forcealign only or RT. > > I think it only applies to untorn writes because the default behavior > everywhere is that writes can tear. > ok, fine. Thanks, John
diff --git a/block/fops.c b/block/fops.c index 9d6d86ebefb9..020443078630 100644 --- a/block/fops.c +++ b/block/fops.c @@ -402,6 +402,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ + iomap->io_block_size = i_blocksize(inode); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 753db965f7c0..665811b1578b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7740,6 +7740,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; + iomap->io_block_size = i_blocksize(inode); free_extent_map(em); return 0; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 8be60797ea2f..ea9d2f3eadb3 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -305,6 +305,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (flags & IOMAP_DAX) iomap->addr += mdev.m_dax_part_off; } + iomap->io_block_size = i_blocksize(inode); return 0; } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 9b248ee5fef2..6ee89f6a078c 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -749,6 +749,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, if (iomap->offset >= inode->i_size) iomap->length = length + offset - map.m_la; } + iomap->io_block_size = i_blocksize(inode); iomap->flags = 0; return 0; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0caa1650cee8..7a5539a52844 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -862,6 +862,7 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->length = (u64)ret << blkbits; iomap->flags |= IOMAP_F_MERGED; } + iomap->io_block_size = i_blocksize(inode); if (new) iomap->flags |= IOMAP_F_NEW; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 
e067f2dd0335..ce3269874fde 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4933,6 +4933,7 @@ static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap) iomap->length = length; iomap->type = iomap_type; iomap->flags = 0; + iomap->io_block_size = i_blocksize(inode); out: return error; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4bae9ccf5fe0..3ec82e4d71c4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3235,6 +3235,7 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; + iomap->io_block_size = i_blocksize(inode); if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b9b0debc6b3d..6c12641b9a7b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4233,6 +4233,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, } iomap->addr = IOMAP_NULL_ADDR; } + iomap->io_block_size = i_blocksize(inode); if (map.m_flags & F2FS_MAP_NEW) iomap->flags |= IOMAP_F_NEW; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 12ef91d170bb..68ddc74cb31e 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -577,6 +577,7 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, iomap->flags = 0; iomap->bdev = NULL; iomap->dax_dev = fc->dax->dev; + iomap->io_block_size = i_blocksize(inode); /* * Both read/write and mmap path can race here. 
So we need something diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1795c4e8dbf6..8d2de42b1da9 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -927,6 +927,7 @@ static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, out: iomap->bdev = inode->i_sb->s_bdev; + iomap->io_block_size = i_blocksize(inode); unlock: up_read(&ip->i_rw_mutex); return ret; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 1bb8d97cd9ae..5d2718faf520 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -149,6 +149,7 @@ static int hpfs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->addr = IOMAP_NULL_ADDR; iomap->length = 1 << blkbits; } + iomap->io_block_size = i_blocksize(inode); hpfs_unlock(sb); return 0; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f3b43d223a46..5be8d886ab4a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -277,7 +277,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, { const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; - unsigned int fs_block_size = i_blocksize(inode), pad; + u64 io_block_size = iomap->io_block_size; loff_t length = iomap_length(iter); loff_t pos = iter->pos; blk_opf_t bio_opf; @@ -287,6 +287,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, int nr_pages, ret = 0; size_t copied = 0; size_t orig_count; + unsigned int pad; if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) @@ -355,7 +356,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, if (need_zeroout) { /* zero out from the start of the block to the write offset */ - pad = pos & (fs_block_size - 1); + if (is_power_of_2(io_block_size)) { + pad = pos & (io_block_size - 1); + } else { + loff_t _pos = pos; + + pad = do_div(_pos, io_block_size); + } + if (pad) iomap_dio_zero(iter, dio, pos - pad, pad); } @@ -429,9 +437,16 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter 
*iter, if (need_zeroout || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ - pad = pos & (fs_block_size - 1); + if (is_power_of_2(io_block_size)) { + pad = pos & (io_block_size - 1); + } else { + loff_t _pos = pos; + + pad = do_div(_pos, io_block_size); + } + if (pad) - iomap_dio_zero(iter, dio, pos, fs_block_size - pad); + iomap_dio_zero(iter, dio, pos, io_block_size - pad); } out: /* Undo iter limitation to current extent */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 378342673925..ecb4cae88248 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -127,6 +127,7 @@ xfs_bmbt_to_iomap( } iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomap->io_block_size = i_blocksize(VFS_I(ip)); if (mapping_flags & IOMAP_DAX) iomap->dax_dev = target->bt_daxdev; else diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 3b103715acc9..bf2cc4bee309 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -50,6 +50,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; iomap->length = isize - iomap->offset; } + iomap->io_block_size = i_blocksize(inode); mutex_unlock(&zi->i_truncate_mutex); trace_zonefs_iomap_begin(inode, iomap); @@ -99,6 +100,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, iomap->type = IOMAP_MAPPED; iomap->length = isize - iomap->offset; } + iomap->io_block_size = i_blocksize(inode); mutex_unlock(&zi->i_truncate_mutex); trace_zonefs_iomap_begin(inode, iomap); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6fc1c858013d..d63a35b77907 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -103,6 +103,8 @@ struct iomap { void *private; /* filesystem private */ const struct iomap_folio_ops *folio_ops; u64 validity_cookie; /* used with .iomap_valid() */ + /* io block 
zeroing size, not necessarily a power-of-2 */ + u64 io_block_size; }; static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
Allow filesystems to set the io_block_size for sub-fs block size zeroing, as in future we will want to extend this feature to support zeroing of block sizes larger than the inode block size. The value in io_block_size does not have to be a power-of-2, so fix up zeroing code to handle that. Signed-off-by: John Garry <john.g.garry@oracle.com> --- block/fops.c | 1 + fs/btrfs/inode.c | 1 + fs/erofs/data.c | 1 + fs/erofs/zmap.c | 1 + fs/ext2/inode.c | 1 + fs/ext4/extents.c | 1 + fs/ext4/inode.c | 1 + fs/f2fs/data.c | 1 + fs/fuse/dax.c | 1 + fs/gfs2/bmap.c | 1 + fs/hpfs/file.c | 1 + fs/iomap/direct-io.c | 23 +++++++++++++++++++---- fs/xfs/xfs_iomap.c | 1 + fs/zonefs/file.c | 2 ++ include/linux/iomap.h | 2 ++ 15 files changed, 35 insertions(+), 4 deletions(-)