diff mbox series

[v5,3/7] fs: iomap: Atomic write support

Message ID 20240817094800.776408-4-john.g.garry@oracle.com (mailing list archive)
State New, archived
Headers show
Series block atomic writes for xfs | expand

Commit Message

John Garry Aug. 17, 2024, 9:47 a.m. UTC
Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC
flag set.

We rely on the FS to guarantee extent alignment, such that an atomic write
should never straddle two or more extents. The FS should also check for
validity of an atomic write length/alignment.

For each iter, data is appended to the single bio. That bio is allocated
on the first iter.

If that total bio data does not match the expected total, then error and
do not submit the bio as we cannot tolerate a partial write.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/iomap/direct-io.c  | 122 +++++++++++++++++++++++++++++++++++-------
 fs/iomap/trace.h      |   3 +-
 include/linux/iomap.h |   1 +
 3 files changed, 106 insertions(+), 20 deletions(-)
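
For context, the user-facing path that this series enables looks roughly like the
following. This is an illustration only, not part of the patch: it assumes a kernel
carrying this series plus a filesystem and block device that advertise atomic write
support (the permitted sizes are reported by statx() in stx_atomic_write_unit_min/max),
and it defines RWF_ATOMIC locally in case the installed uapi headers predate it.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* value from the 6.11 uapi */
#endif

int main(void)
{
	struct iovec iov;
	void *buf;
	int fd;

	fd = open("mnt/file", O_RDWR | O_DIRECT);
	if (fd < 0)
		return 1;

	/* 16k buffer, aligned for O_DIRECT; length and file offset must fall
	 * within the advertised atomic write unit min/max for this file */
	if (posix_memalign(&buf, 4096, 16384))
		return 1;
	memset(buf, 0xab, 16384);

	iov.iov_base = buf;
	iov.iov_len = 16384;

	/* all-or-nothing: either the full 16k reaches the disk or none of it */
	if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) != 16384)
		return 1;

	close(fd);
	return 0;
}

With RWF_ATOMIC set on an O_DIRECT write whose length and alignment satisfy those
limits, this patch turns the request into a single bio with REQ_ATOMIC set, so the
write either lands in full or not at all.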

Comments

Darrick J. Wong Aug. 21, 2024, 4:58 p.m. UTC | #1
On Sat, Aug 17, 2024 at 09:47:56AM +0000, John Garry wrote:
> Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC
> flag set.
> 
> We rely on the FS to guarantee extent alignment, such that an atomic write
> should never straddle two or more extents. The FS should also check for
> validity of an atomic write length/alignment.
> 
> For each iter, data is appended to the single bio. That bio is allocated
> on the first iter.
> 
> If that total bio data does not match the expected total, then error and
> do not submit the bio as we cannot tolerate a partial write.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/iomap/direct-io.c  | 122 +++++++++++++++++++++++++++++++++++-------
>  fs/iomap/trace.h      |   3 +-
>  include/linux/iomap.h |   1 +
>  3 files changed, 106 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index f3b43d223a46..72f28d53ab03 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -37,6 +37,7 @@ struct iomap_dio {
>  	int			error;
>  	size_t			done_before;
>  	bool			wait_for_completion;
> +	struct bio		*atomic_bio;
>  
>  	union {
>  		/* used during submission and for synchronous completion: */
> @@ -61,6 +62,24 @@ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
>  	return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
>  }
>  
> +static struct bio *iomap_dio_alloc_bio_data(const struct iomap_iter *iter,
> +		struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf,
> +		loff_t pos)
> +{
> +	struct bio *bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, opf);
> +	struct inode *inode = iter->inode;
> +
> +	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
> +				  GFP_KERNEL);
> +	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
> +	bio->bi_write_hint = inode->i_write_hint;
> +	bio->bi_ioprio = dio->iocb->ki_ioprio;
> +	bio->bi_private = dio;
> +	bio->bi_end_io = iomap_dio_bio_end_io;
> +
> +	return bio;
> +}
> +
>  static void iomap_dio_submit_bio(const struct iomap_iter *iter,
>  		struct iomap_dio *dio, struct bio *bio, loff_t pos)
>  {
> @@ -256,7 +275,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
>   * clearing the WRITE_THROUGH flag in the dio request.
>   */
>  static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> -		const struct iomap *iomap, bool use_fua)
> +		const struct iomap *iomap, bool use_fua, bool atomic)
>  {
>  	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
>  
> @@ -268,6 +287,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>  		opflags |= REQ_FUA;
>  	else
>  		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
> +	if (atomic)
> +		opflags |= REQ_ATOMIC;
>  
>  	return opflags;
>  }
> @@ -275,21 +296,23 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		struct iomap_dio *dio)
>  {
> +	bool atomic = dio->iocb->ki_flags & IOCB_ATOMIC;
>  	const struct iomap *iomap = &iter->iomap;
>  	struct inode *inode = iter->inode;
>  	unsigned int fs_block_size = i_blocksize(inode), pad;
> +	struct iov_iter *i = dio->submit.iter;

If you're going to pull this out into a convenience variable, please do
that as a separate patch so that the actual untorn write additions don't
get mixed in.

>  	loff_t length = iomap_length(iter);
>  	loff_t pos = iter->pos;
>  	blk_opf_t bio_opf;
>  	struct bio *bio;
>  	bool need_zeroout = false;
>  	bool use_fua = false;
> -	int nr_pages, ret = 0;
> +	int nr_pages, orig_nr_pages, ret = 0;
>  	size_t copied = 0;
>  	size_t orig_count;
>  
>  	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
> -	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
> +	    !bdev_iter_is_aligned(iomap->bdev, i))
>  		return -EINVAL;
>  
>  	if (iomap->type == IOMAP_UNWRITTEN) {
> @@ -322,15 +345,35 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
>  	}
>  
> +	if (dio->atomic_bio) {
> +		/*
> +		 * These should not fail, but check just in case.
> +		 * Caller takes care of freeing the bio.
> +		 */
> +		if (iter->iomap.bdev != dio->atomic_bio->bi_bdev) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +
> +		if (dio->atomic_bio->bi_iter.bi_sector +
> +		    (dio->atomic_bio->bi_iter.bi_size >> SECTOR_SHIFT) !=

Hmm, so I guess you stash an untorn write bio in the iomap_dio so that
multiple iomap_dio_bio_iter can try to combine a mixed mapping into a
single contiguous untorn write that can be completed all at once?
I suppose that works as long as the iomap->type is the same across all
the _iter calls, but I think that needs explicit checking here.
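
A sketch of what that check could look like, purely as an illustration ("atomic_type"
here would be a new field in struct iomap_dio recording the type of the first mapping;
it is not something the posted patch has):

	if (dio->atomic_bio) {
		if (iter->iomap.bdev != dio->atomic_bio->bi_bdev ||
		    iter->iomap.type != dio->atomic_type) {
			ret = -EINVAL;
			goto out;
		}
		/* existing bi_sector contiguity check stays as-is */
	} else if (atomic) {
		dio->atomic_type = iter->iomap.type;
		orig_nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
	}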

> +			iomap_sector(iomap, pos)) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +	} else if (atomic) {
> +		orig_nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
> +	}
> +
>  	/*
>  	 * Save the original count and trim the iter to just the extent we
>  	 * are operating on right now.  The iter will be re-expanded once
>  	 * we are done.
>  	 */
> -	orig_count = iov_iter_count(dio->submit.iter);
> -	iov_iter_truncate(dio->submit.iter, length);
> +	orig_count = iov_iter_count(i);
> +	iov_iter_truncate(i, length);
>  
> -	if (!iov_iter_count(dio->submit.iter))
> +	if (!iov_iter_count(i))
>  		goto out;
>  
>  	/*
> @@ -365,27 +408,46 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	 * can set up the page vector appropriately for a ZONE_APPEND
>  	 * operation.
>  	 */
> -	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
> +	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
> +
> +	if (atomic) {
> +		size_t orig_atomic_size;
> +
> +		if (!dio->atomic_bio) {
> +			dio->atomic_bio = iomap_dio_alloc_bio_data(iter,
> +					dio, orig_nr_pages, bio_opf, pos);
> +		}
> +		orig_atomic_size = dio->atomic_bio->bi_iter.bi_size;
> +
> +		/*
> +		 * In case of error, caller takes care of freeing the bio. The
> +		 * smallest size of atomic write is i_node size, so no need for

What is "i_node size"?  Are you referring to i_blocksize?

> +		 * tail zeroing out.
> +		 */
> +		ret = bio_iov_iter_get_pages(dio->atomic_bio, i);
> +		if (!ret) {
> +			copied = dio->atomic_bio->bi_iter.bi_size -
> +				orig_atomic_size;
> +		}
>  
> -	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
> +		dio->size += copied;
> +		goto out;
> +	}
> +
> +	nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
>  	do {
>  		size_t n;
>  		if (dio->error) {
> -			iov_iter_revert(dio->submit.iter, copied);
> +			iov_iter_revert(i, copied);
>  			copied = ret = 0;
>  			goto out;
>  		}
>  
> -		bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
> +		bio = iomap_dio_alloc_bio_data(iter, dio, nr_pages, bio_opf, pos);
>  		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
>  					  GFP_KERNEL);
> -		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
> -		bio->bi_write_hint = inode->i_write_hint;
> -		bio->bi_ioprio = dio->iocb->ki_ioprio;
> -		bio->bi_private = dio;
> -		bio->bi_end_io = iomap_dio_bio_end_io;

I see two places (here and iomap_dio_zero) that allocate a bio and
perform some initialization of it.  Can you move the common pieces to
iomap_dio_alloc_bio instead of adding a iomap_dio_alloc_bio_data
variant, and move all that to a separate cleanup patch?

> -		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
> +		ret = bio_iov_iter_get_pages(bio, i);
>  		if (unlikely(ret)) {
>  			/*
>  			 * We have to stop part way through an IO. We must fall
> @@ -408,8 +470,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		dio->size += n;
>  		copied += n;
>  
> -		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
> -						 BIO_MAX_VECS);
> +		nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
>  		/*
>  		 * We can only poll for single bio I/Os.
>  		 */
> @@ -435,7 +496,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	}
>  out:
>  	/* Undo iter limitation to current extent */
> -	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
> +	iov_iter_reexpand(i, orig_count - copied);
>  	if (copied)
>  		return copied;
>  	return ret;
> @@ -555,6 +616,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	struct blk_plug plug;
>  	struct iomap_dio *dio;
>  	loff_t ret = 0;
> +	size_t orig_count = iov_iter_count(iter);
>  
>  	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
>  
> @@ -580,6 +642,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	if (iocb->ki_flags & IOCB_NOWAIT)
>  		iomi.flags |= IOMAP_NOWAIT;
>  
> +	if (iocb->ki_flags & IOCB_ATOMIC) {
> +		if (bio_iov_vecs_to_alloc(iter, INT_MAX) > BIO_MAX_VECS)
> +			return ERR_PTR(-EINVAL);
> +		iomi.flags |= IOMAP_ATOMIC;
> +	}
> +	dio->atomic_bio = NULL;
> +
>  	if (iov_iter_rw(iter) == READ) {
>  		/* reads can always complete inline */
>  		dio->flags |= IOMAP_DIO_INLINE_COMP;
> @@ -665,6 +734,21 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		iocb->ki_flags &= ~IOCB_HIPRI;
>  	}
>  
> +	if (iocb->ki_flags & IOCB_ATOMIC) {
> +		if (ret >= 0) {
> +			if (dio->size == orig_count) {
> +				iomap_dio_submit_bio(&iomi, dio,
> +					dio->atomic_bio, iocb->ki_pos);

Does this need to do task_io_account_write like regular direct writes
do?
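
For reference, the non-atomic loop in iomap_dio_bio_iter() already calls
task_io_account_write() per bio, so a minimal fix in the atomic branch could be
along these lines (a sketch against the hunk quoted above):

		ret = bio_iov_iter_get_pages(dio->atomic_bio, i);
		if (!ret) {
			copied = dio->atomic_bio->bi_iter.bi_size -
				orig_atomic_size;
			/* mirror the accounting done in the non-atomic loop */
			task_io_account_write(copied);
		}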

> +			} else {
> +				if (dio->atomic_bio)
> +					bio_put(dio->atomic_bio);
> +				ret = -EINVAL;
> +			}
> +		} else if (dio->atomic_bio) {
> +			bio_put(dio->atomic_bio);

This ought to null out dio->atomic_bio to prevent accidental UAF.
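
That is, something like the following in both bio_put() branches (sketch only):

			} else {
				if (dio->atomic_bio) {
					bio_put(dio->atomic_bio);
					dio->atomic_bio = NULL;
				}
				ret = -EINVAL;
			}
		} else if (dio->atomic_bio) {
			bio_put(dio->atomic_bio);
			dio->atomic_bio = NULL;
		}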

--D

> +		}
> +	}
> +
>  	blk_finish_plug(&plug);
>  
>  	/*
> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
> index 0a991c4ce87d..4118a42cdab0 100644
> --- a/fs/iomap/trace.h
> +++ b/fs/iomap/trace.h
> @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
>  	{ IOMAP_REPORT,		"REPORT" }, \
>  	{ IOMAP_FAULT,		"FAULT" }, \
>  	{ IOMAP_DIRECT,		"DIRECT" }, \
> -	{ IOMAP_NOWAIT,		"NOWAIT" }
> +	{ IOMAP_NOWAIT,		"NOWAIT" }, \
> +	{ IOMAP_ATOMIC,		"ATOMIC" }
>  
>  #define IOMAP_F_FLAGS_STRINGS \
>  	{ IOMAP_F_NEW,		"NEW" }, \
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 6fc1c858013d..8fd949442262 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -178,6 +178,7 @@ struct iomap_folio_ops {
>  #else
>  #define IOMAP_DAX		0
>  #endif /* CONFIG_FS_DAX */
> +#define IOMAP_ATOMIC		(1 << 9)
>  
>  struct iomap_ops {
>  	/*
> -- 
> 2.31.1
> 
>
John Garry Aug. 22, 2024, 3:29 p.m. UTC | #2
>> +
>>   static void iomap_dio_submit_bio(const struct iomap_iter *iter,
>>   		struct iomap_dio *dio, struct bio *bio, loff_t pos)
>>   {
>> @@ -256,7 +275,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
>>    * clearing the WRITE_THROUGH flag in the dio request.
>>    */
>>   static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>> -		const struct iomap *iomap, bool use_fua)
>> +		const struct iomap *iomap, bool use_fua, bool atomic)
>>   {
>>   	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
>>   
>> @@ -268,6 +287,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>>   		opflags |= REQ_FUA;
>>   	else
>>   		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
>> +	if (atomic)
>> +		opflags |= REQ_ATOMIC;
>>   
>>   	return opflags;
>>   }
>> @@ -275,21 +296,23 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>>   static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   		struct iomap_dio *dio)
>>   {
>> +	bool atomic = dio->iocb->ki_flags & IOCB_ATOMIC;
>>   	const struct iomap *iomap = &iter->iomap;
>>   	struct inode *inode = iter->inode;
>>   	unsigned int fs_block_size = i_blocksize(inode), pad;
>> +	struct iov_iter *i = dio->submit.iter;
> 
> If you're going to pull this out into a convenience variable, please do
> that as a separate patch so that the actual untorn write additions don't
> get mixed in.

Yeah, I was thinking of doing that, so ok.

> 
>>   	loff_t length = iomap_length(iter);
>>   	loff_t pos = iter->pos;
>>   	blk_opf_t bio_opf;
>>   	struct bio *bio;
>>   	bool need_zeroout = false;
>>   	bool use_fua = false;
>> -	int nr_pages, ret = 0;
>> +	int nr_pages, orig_nr_pages, ret = 0;
>>   	size_t copied = 0;
>>   	size_t orig_count;
>>   
>>   	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
>> -	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
>> +	    !bdev_iter_is_aligned(iomap->bdev, i))
>>   		return -EINVAL;
>>   
>>   	if (iomap->type == IOMAP_UNWRITTEN) {
>> @@ -322,15 +345,35 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
>>   	}
>>   
>> +	if (dio->atomic_bio) {
>> +		/*
>> +		 * These should not fail, but check just in case.
>> +		 * Caller takes care of freeing the bio.
>> +		 */
>> +		if (iter->iomap.bdev != dio->atomic_bio->bi_bdev) {
>> +			ret = -EINVAL;
>> +			goto out;
>> +		}
>> +
>> +		if (dio->atomic_bio->bi_iter.bi_sector +
>> +		    (dio->atomic_bio->bi_iter.bi_size >> SECTOR_SHIFT) !=
> 
> Hmm, so I guess you stash an untorn write bio in the iomap_dio so that
> multiple iomap_dio_bio_iter can try to combine a mixed mapping into a
> single contiguous untorn write that can be completed all at once?

Right, we are writing to a contiguous LBA address range with a single 
bio that happens to cover many different extents.

> I suppose that works as long as the iomap->type is the same across all
> the _iter calls, but I think that needs explicit checking here.

As a sample, if we try to atomically write over the data in the 
following file:

# xfs_bmap -vvp mnt/file
mnt/file:
EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..127]:        hole                                   128
   1: [128..135]:      256..263          0 (256..263)           8 010000
   2: [136..143]:      264..271          0 (264..271)           8 000000
   3: [144..255]:      272..383          0 (272..383)         112 010000
FLAG Values:
    0100000 Shared extent
    0010000 Unwritten preallocated extent
    0001000 Doesn't begin on stripe unit
    0000100 Doesn't end   on stripe unit
    0000010 Doesn't begin on stripe width
    0000001 Doesn't end   on stripe width
#


Then, the iomap->type/flag is either IOMAP_UNWRITTEN/IOMAP_F_DIRTY or 
IOMAP_MAPPED/IOMAP_F_DIRTY per iter. So the type is not consistent. 
However, we will set IOMAP_DIO_UNWRITTEN in dio->flags, so we call 
xfs_dio_write_end_io() -> xfs_iomap_write_unwritten() for the complete 
FSB range.

Do you see a problem with this?

Please see this also for some more background:
https://lore.kernel.org/linux-xfs/20240726171358.GA27612@lst.de/
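
FWIW, a layout like the one above can be reproduced with something like the
following (illustration only; it assumes a fresh file on XFS with a 4k block size,
where fallocate() leaves 64k of unwritten blocks at offset 64k and the buffered 4k
write at 68k converts a single block in the middle of that range once writeback
completes):

#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("mnt/file", O_CREAT | O_RDWR, 0644);

	if (fd < 0)
		return 1;
	memset(buf, 0xab, sizeof(buf));
	/* 64k of unwritten preallocation at offset 64k */
	if (fallocate(fd, 0, 64 * 1024, 64 * 1024))
		return 1;
	/* write one 4k block in the middle of that range */
	if (pwrite(fd, buf, sizeof(buf), 68 * 1024) != (ssize_t)sizeof(buf))
		return 1;
	/* force writeback so that block converts to written */
	fsync(fd);
	close(fd);
	return 0;
}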


> 
>> +			iomap_sector(iomap, pos)) {
>> +			ret = -EINVAL;
>> +			goto out;
>> +		}
>> +	} else if (atomic) {
>> +		orig_nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
>> +	}
>> +
>>   	/*
>>   	 * Save the original count and trim the iter to just the extent we
>>   	 * are operating on right now.  The iter will be re-expanded once
>>   	 * we are done.
>>   	 */
>> -	orig_count = iov_iter_count(dio->submit.iter);
>> -	iov_iter_truncate(dio->submit.iter, length);
>> +	orig_count = iov_iter_count(i);
>> +	iov_iter_truncate(i, length);
>>   
>> -	if (!iov_iter_count(dio->submit.iter))
>> +	if (!iov_iter_count(i))
>>   		goto out;
>>   
>>   	/*
>> @@ -365,27 +408,46 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   	 * can set up the page vector appropriately for a ZONE_APPEND
>>   	 * operation.
>>   	 */
>> -	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
>> +	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
>> +
>> +	if (atomic) {
>> +		size_t orig_atomic_size;
>> +
>> +		if (!dio->atomic_bio) {
>> +			dio->atomic_bio = iomap_dio_alloc_bio_data(iter,
>> +					dio, orig_nr_pages, bio_opf, pos);
>> +		}
>> +		orig_atomic_size = dio->atomic_bio->bi_iter.bi_size;
>> +
>> +		/*
>> +		 * In case of error, caller takes care of freeing the bio. The
>> +		 * smallest size of atomic write is i_node size, so no need for
> 
> What is "i_node size"?  Are you referring to i_blocksize?

Yes, I meant i_blocksize()

> 
>> +		 * tail zeroing out.
>> +		 */
>> +		ret = bio_iov_iter_get_pages(dio->atomic_bio, i);
>> +		if (!ret) {
>> +			copied = dio->atomic_bio->bi_iter.bi_size -
>> +				orig_atomic_size;
>> +		}
>>   
>> -	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
>> +		dio->size += copied;
>> +		goto out;
>> +	}
>> +
>> +	nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
>>   	do {
>>   		size_t n;
>>   		if (dio->error) {
>> -			iov_iter_revert(dio->submit.iter, copied);
>> +			iov_iter_revert(i, copied);
>>   			copied = ret = 0;
>>   			goto out;
>>   		}
>>   
>> -		bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
>> +		bio = iomap_dio_alloc_bio_data(iter, dio, nr_pages, bio_opf, pos);
>>   		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
>>   					  GFP_KERNEL);
>> -		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>> -		bio->bi_write_hint = inode->i_write_hint;
>> -		bio->bi_ioprio = dio->iocb->ki_ioprio;
>> -		bio->bi_private = dio;
>> -		bio->bi_end_io = iomap_dio_bio_end_io;
> 
> I see two places (here and iomap_dio_zero) that allocate a bio and
> perform some initialization of it.  Can you move the common pieces to
> iomap_dio_alloc_bio instead of adding a iomap_dio_alloc_bio_data
> variant, and move all that to a separate cleanup patch?

Sure

So can it cause harm if we set bio->bi_write_hint and ->bi_ioprio with 
the same values as iomap_dio_alloc_bio() for iomap_dio_zero()? If no, 
this would help make all the bio alloc code common

> 
>> -		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
>> +		ret = bio_iov_iter_get_pages(bio, i);
>>   		if (unlikely(ret)) {
>>   			/*
>>   			 * We have to stop part way through an IO. We must fall
>> @@ -408,8 +470,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   		dio->size += n;
>>   		copied += n;
>>   
>> -		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
>> -						 BIO_MAX_VECS);
>> +		nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
>>   		/*
>>   		 * We can only poll for single bio I/Os.
>>   		 */
>> @@ -435,7 +496,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   	}
>>   out:
>>   	/* Undo iter limitation to current extent */
>> -	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
>> +	iov_iter_reexpand(i, orig_count - copied);
>>   	if (copied)
>>   		return copied;
>>   	return ret;
>> @@ -555,6 +616,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>>   	struct blk_plug plug;
>>   	struct iomap_dio *dio;
>>   	loff_t ret = 0;
>> +	size_t orig_count = iov_iter_count(iter);
>>   
>>   	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
>>   
>> @@ -580,6 +642,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>>   	if (iocb->ki_flags & IOCB_NOWAIT)
>>   		iomi.flags |= IOMAP_NOWAIT;
>>   
>> +	if (iocb->ki_flags & IOCB_ATOMIC) {
>> +		if (bio_iov_vecs_to_alloc(iter, INT_MAX) > BIO_MAX_VECS)
>> +			return ERR_PTR(-EINVAL);
>> +		iomi.flags |= IOMAP_ATOMIC;
>> +	}
>> +	dio->atomic_bio = NULL;
>> +
>>   	if (iov_iter_rw(iter) == READ) {
>>   		/* reads can always complete inline */
>>   		dio->flags |= IOMAP_DIO_INLINE_COMP;
>> @@ -665,6 +734,21 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>>   		iocb->ki_flags &= ~IOCB_HIPRI;
>>   	}
>>   
>> +	if (iocb->ki_flags & IOCB_ATOMIC) {
>> +		if (ret >= 0) {
>> +			if (dio->size == orig_count) {
>> +				iomap_dio_submit_bio(&iomi, dio,
>> +					dio->atomic_bio, iocb->ki_pos);
> 
> Does this need to do task_io_account_write like regular direct writes
> do?

yes, I missed that, will fix

> 
>> +			} else {
>> +				if (dio->atomic_bio)
>> +					bio_put(dio->atomic_bio);
>> +				ret = -EINVAL;
>> +			}
>> +		} else if (dio->atomic_bio) {
>> +			bio_put(dio->atomic_bio);
> 
> This ought to null out dio->atomic_bio to prevent accidental UAF.

ok, fine

Thanks,
John
Darrick J. Wong Aug. 22, 2024, 8:30 p.m. UTC | #3
On Thu, Aug 22, 2024 at 04:29:34PM +0100, John Garry wrote:
> 
> > > +
> > >   static void iomap_dio_submit_bio(const struct iomap_iter *iter,
> > >   		struct iomap_dio *dio, struct bio *bio, loff_t pos)
> > >   {
> > > @@ -256,7 +275,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
> > >    * clearing the WRITE_THROUGH flag in the dio request.
> > >    */
> > >   static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> > > -		const struct iomap *iomap, bool use_fua)
> > > +		const struct iomap *iomap, bool use_fua, bool atomic)
> > >   {
> > >   	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
> > > @@ -268,6 +287,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> > >   		opflags |= REQ_FUA;
> > >   	else
> > >   		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
> > > +	if (atomic)
> > > +		opflags |= REQ_ATOMIC;
> > >   	return opflags;
> > >   }
> > > @@ -275,21 +296,23 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> > >   static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > >   		struct iomap_dio *dio)
> > >   {
> > > +	bool atomic = dio->iocb->ki_flags & IOCB_ATOMIC;
> > >   	const struct iomap *iomap = &iter->iomap;
> > >   	struct inode *inode = iter->inode;
> > >   	unsigned int fs_block_size = i_blocksize(inode), pad;
> > > +	struct iov_iter *i = dio->submit.iter;
> > 
> > If you're going to pull this out into a convenience variable, please do
> > that as a separate patch so that the actual untorn write additions don't
> > get mixed in.
> 
> Yeah, I was thinking of doing that, so ok.
> 
> > 
> > >   	loff_t length = iomap_length(iter);
> > >   	loff_t pos = iter->pos;
> > >   	blk_opf_t bio_opf;
> > >   	struct bio *bio;
> > >   	bool need_zeroout = false;
> > >   	bool use_fua = false;
> > > -	int nr_pages, ret = 0;
> > > +	int nr_pages, orig_nr_pages, ret = 0;
> > >   	size_t copied = 0;
> > >   	size_t orig_count;
> > >   	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
> > > -	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
> > > +	    !bdev_iter_is_aligned(iomap->bdev, i))
> > >   		return -EINVAL;
> > >   	if (iomap->type == IOMAP_UNWRITTEN) {
> > > @@ -322,15 +345,35 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > >   			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
> > >   	}
> > > +	if (dio->atomic_bio) {
> > > +		/*
> > > +		 * These should not fail, but check just in case.
> > > +		 * Caller takes care of freeing the bio.
> > > +		 */
> > > +		if (iter->iomap.bdev != dio->atomic_bio->bi_bdev) {
> > > +			ret = -EINVAL;
> > > +			goto out;
> > > +		}
> > > +
> > > +		if (dio->atomic_bio->bi_iter.bi_sector +
> > > +		    (dio->atomic_bio->bi_iter.bi_size >> SECTOR_SHIFT) !=
> > 
> > Hmm, so I guess you stash an untorn write bio in the iomap_dio so that
> > multiple iomap_dio_bio_iter can try to combine a mixed mapping into a
> > single contiguous untorn write that can be completed all at once?
> 
> Right, we are writing to a contiguous LBA address range with a single bio
> that happens to cover many different extents.
> 
> > I suppose that works as long as the iomap->type is the same across all
> > the _iter calls, but I think that needs explicit checking here.
> 
> As an sample, if we try to atomically write over the data in the following
> file:
> 
> # xfs_bmap -vvp mnt/file
> mnt/file:
> EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
>   0: [0..127]:        hole                                   128
>   1: [128..135]:      256..263          0 (256..263)           8 010000
>   2: [136..143]:      264..271          0 (264..271)           8 000000
>   3: [144..255]:      272..383          0 (272..383)         112 010000
> FLAG Values:
>    0100000 Shared extent
>    0010000 Unwritten preallocated extent
>    0001000 Doesn't begin on stripe unit
>    0000100 Doesn't end   on stripe unit
>    0000010 Doesn't begin on stripe width
>    0000001 Doesn't end   on stripe width
> #
> 
> 
> Then, the iomap->type/flag is either IOMAP_UNWRITTEN/IOMAP_F_DIRTY or
> IOMAP_MAPPED/IOMAP_F_DIRTY per iter. So the type is not consistent. However
> we will set IOMAP_DIO_UNWRITTEN in dio->flags, so call xfs_dio_write_endio()
> -> xfs_iomap_write_unwritten() for the complete FSB range.
> 
> Do you see a problem with this?
> 
> Please see this also for some more background:
> https://lore.kernel.org/linux-xfs/20240726171358.GA27612@lst.de/

Yes -- if you have a mix of written and unwritten blocks for the same
chunk of physical space:

0      7
WUWUWUWU

the directio ioend function will start four separate transactions to
convert blocks 1, 3, 5, and 7 to written status.  If the system crashes
midway through, they will see this afterwards:

WWWWW0W0

IOWs, although the *disk write* was completed successfully, the mapping
updates were torn, and the user program sees a torn write.

The most performant/painful way to fix this would be to make the whole
ioend completion a logged operation so that we could commit to updating
all the unwritten mappings and restart it after a crash.

The least performant of course is to write zeroes at allocation time,
like we do for fsdax.

A possible middle ground would be to detect IOMAP_ATOMIC in the
->iomap_begin method, notice that there are mixed mappings under the
proposed untorn IO, and pre-convert the unwritten blocks by writing
zeroes to disk and updating the mappings before handing the one single
mapping back to iomap_dio_rw to stage the untorn writes bio.  At least
you'd only be suffering that penalty for the (probable) corner case of
someone creating mixed mappings.

> 
> > 
> > > +			iomap_sector(iomap, pos)) {
> > > +			ret = -EINVAL;
> > > +			goto out;
> > > +		}
> > > +	} else if (atomic) {
> > > +		orig_nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
> > > +	}
> > > +
> > >   	/*
> > >   	 * Save the original count and trim the iter to just the extent we
> > >   	 * are operating on right now.  The iter will be re-expanded once
> > >   	 * we are done.
> > >   	 */
> > > -	orig_count = iov_iter_count(dio->submit.iter);
> > > -	iov_iter_truncate(dio->submit.iter, length);
> > > +	orig_count = iov_iter_count(i);
> > > +	iov_iter_truncate(i, length);
> > > -	if (!iov_iter_count(dio->submit.iter))
> > > +	if (!iov_iter_count(i))
> > >   		goto out;
> > >   	/*
> > > @@ -365,27 +408,46 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > >   	 * can set up the page vector appropriately for a ZONE_APPEND
> > >   	 * operation.
> > >   	 */
> > > -	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
> > > +	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
> > > +
> > > +	if (atomic) {
> > > +		size_t orig_atomic_size;
> > > +
> > > +		if (!dio->atomic_bio) {
> > > +			dio->atomic_bio = iomap_dio_alloc_bio_data(iter,
> > > +					dio, orig_nr_pages, bio_opf, pos);
> > > +		}
> > > +		orig_atomic_size = dio->atomic_bio->bi_iter.bi_size;
> > > +
> > > +		/*
> > > +		 * In case of error, caller takes care of freeing the bio. The
> > > +		 * smallest size of atomic write is i_node size, so no need for
> > 
> > What is "i_node size"?  Are you referring to i_blocksize?
> 
> Yes, I meant i_blocksize()
> 
> > 
> > > +		 * tail zeroing out.
> > > +		 */
> > > +		ret = bio_iov_iter_get_pages(dio->atomic_bio, i);
> > > +		if (!ret) {
> > > +			copied = dio->atomic_bio->bi_iter.bi_size -
> > > +				orig_atomic_size;
> > > +		}
> > > -	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
> > > +		dio->size += copied;
> > > +		goto out;
> > > +	}
> > > +
> > > +	nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
> > >   	do {
> > >   		size_t n;
> > >   		if (dio->error) {
> > > -			iov_iter_revert(dio->submit.iter, copied);
> > > +			iov_iter_revert(i, copied);
> > >   			copied = ret = 0;
> > >   			goto out;
> > >   		}
> > > -		bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
> > > +		bio = iomap_dio_alloc_bio_data(iter, dio, nr_pages, bio_opf, pos);
> > >   		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
> > >   					  GFP_KERNEL);
> > > -		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
> > > -		bio->bi_write_hint = inode->i_write_hint;
> > > -		bio->bi_ioprio = dio->iocb->ki_ioprio;
> > > -		bio->bi_private = dio;
> > > -		bio->bi_end_io = iomap_dio_bio_end_io;
> > 
> > I see two places (here and iomap_dio_zero) that allocate a bio and
> > perform some initialization of it.  Can you move the common pieces to
> > iomap_dio_alloc_bio instead of adding a iomap_dio_alloc_bio_data
> > variant, and move all that to a separate cleanup patch?
> 
> Sure
> 
> So can it cause harm if we set bio->bi_write_hint and ->bi_ioprio with the
> same values as iomap_dio_alloc_bio() for iomap_dio_zero()? If no, this would
> help make all the bio alloc code common

I'd leave the bi_write_hint and bi_ioprio initialization out of the
common function.
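
Roughly like the below, then (a sketch only, not an actual patch): the pieces shared
by iomap_dio_zero() and the data path move into iomap_dio_alloc_bio(), and the data
path keeps setting bi_write_hint/bi_ioprio itself after calling it.

static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
		struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf,
		loff_t pos)
{
	struct bio *bio;

	if (dio->dops && dio->dops->bio_set)
		bio = bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
				GFP_KERNEL, dio->dops->bio_set);
	else
		bio = bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);

	fscrypt_set_bio_crypt_ctx(bio, iter->inode,
			pos >> iter->inode->i_blkbits, GFP_KERNEL);
	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;
	return bio;
}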

--D

> > 
> > > -		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
> > > +		ret = bio_iov_iter_get_pages(bio, i);
> > >   		if (unlikely(ret)) {
> > >   			/*
> > >   			 * We have to stop part way through an IO. We must fall
> > > @@ -408,8 +470,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > >   		dio->size += n;
> > >   		copied += n;
> > > -		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
> > > -						 BIO_MAX_VECS);
> > > +		nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
> > >   		/*
> > >   		 * We can only poll for single bio I/Os.
> > >   		 */
> > > @@ -435,7 +496,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > >   	}
> > >   out:
> > >   	/* Undo iter limitation to current extent */
> > > -	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
> > > +	iov_iter_reexpand(i, orig_count - copied);
> > >   	if (copied)
> > >   		return copied;
> > >   	return ret;
> > > @@ -555,6 +616,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> > >   	struct blk_plug plug;
> > >   	struct iomap_dio *dio;
> > >   	loff_t ret = 0;
> > > +	size_t orig_count = iov_iter_count(iter);
> > >   	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
> > > @@ -580,6 +642,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> > >   	if (iocb->ki_flags & IOCB_NOWAIT)
> > >   		iomi.flags |= IOMAP_NOWAIT;
> > > +	if (iocb->ki_flags & IOCB_ATOMIC) {
> > > +		if (bio_iov_vecs_to_alloc(iter, INT_MAX) > BIO_MAX_VECS)
> > > +			return ERR_PTR(-EINVAL);
> > > +		iomi.flags |= IOMAP_ATOMIC;
> > > +	}
> > > +	dio->atomic_bio = NULL;
> > > +
> > >   	if (iov_iter_rw(iter) == READ) {
> > >   		/* reads can always complete inline */
> > >   		dio->flags |= IOMAP_DIO_INLINE_COMP;
> > > @@ -665,6 +734,21 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> > >   		iocb->ki_flags &= ~IOCB_HIPRI;
> > >   	}
> > > +	if (iocb->ki_flags & IOCB_ATOMIC) {
> > > +		if (ret >= 0) {
> > > +			if (dio->size == orig_count) {
> > > +				iomap_dio_submit_bio(&iomi, dio,
> > > +					dio->atomic_bio, iocb->ki_pos);
> > 
> > Does this need to do task_io_account_write like regular direct writes
> > do?
> 
> yes, I missed that, will fix
> 
> > 
> > > +			} else {
> > > +				if (dio->atomic_bio)
> > > +					bio_put(dio->atomic_bio);
> > > +				ret = -EINVAL;
> > > +			}
> > > +		} else if (dio->atomic_bio) {
> > > +			bio_put(dio->atomic_bio);
> > 
> > This ought to null out dio->atomic_bio to prevent accidental UAF.
> 
> ok, fine
> 
> Thanks,
> John
>
John Garry Aug. 30, 2024, 3:48 p.m. UTC | #4
On 22/08/2024 21:30, Darrick J. Wong wrote:
>> Then, the iomap->type/flag is either IOMAP_UNWRITTEN/IOMAP_F_DIRTY or
>> IOMAP_MAPPED/IOMAP_F_DIRTY per iter. So the type is not consistent. However
>> we will set IOMAP_DIO_UNWRITTEN in dio->flags, so call xfs_dio_write_endio()
>> -> xfs_iomap_write_unwritten() for the complete FSB range.
>>
>> Do you see a problem with this?

Sorry again for the slow response.

>>
>> Please see this also for some more background:
>> https://lore.kernel.org/linux-xfs/20240726171358.GA27612@lst.de/
> Yes -- if you have a mix of written and unwritten blocks for the same
> chunk of physical space:
> 
> 0      7
> WUWUWUWU
> 
> the directio ioend function will start four separate transactions to
> convert blocks 1, 3, 5, and 7 to written status.  If the system crashes
> midway through, they will see this afterwards:
> 
> WWWWW0W0
> 
> IOWs, although the *disk write* was completed successfully, the mapping
> updates were torn, and the user program sees a torn write.
> 
> The most performant/painful way to fix this would be to make the whole
> ioend completion a logged operation so that we could commit to updating
> all the unwritten mappings and restart it after a crash.

Could we make it logged only for those special cases which we are 
interested in?

> 
> The least performant of course is to write zeroes at allocation time,
> like we do for fsdax.

That idea was already proposed:
https://lore.kernel.org/linux-xfs/ZcGIPlNCkL6EDx3Z@dread.disaster.area/

> 
> A possible middle ground would be to detect IOMAP_ATOMIC in the
> ->iomap_begin method, notice that there are mixed mappings under the
> proposed untorn IO, and pre-convert the unwritten blocks by writing
> zeroes to disk and updating the mappings 

Won't that have the same issue as using XFS_BMAPI_ZERO, above i.e. 
zeroing during allocation?

> before handing the one single
> mapping back to iomap_dio_rw to stage the untorn writes bio.  At least
> you'd only be suffering that penalty for the (probable) corner case of
> someone creating mixed mappings.

BTW, one issue I have with the sub-extent(or -alloc unit) zeroing from 
v4 series is how the unwritten conversion has changed, like:

xfs_iomap_write_unwritten()
{
	unsigned int rounding;

	/* when converting anything unwritten, we must be spanning an alloc 
unit, so round up/down */
	if (rounding > 1) {
		offset_fsb = rounddown(rounding);
		count_fsb = roundup(rounding);
	}

	...
	do {
		xfs_bmapi_write();
		...
		xfs_trans_commit();
	} while ();
}

I'm not too happy with it and it seems a bit of a bodge, as I would 
rather we report the complete size written (user data and zeroes); then 
xfs_iomap_write_unwritten() would do proper individual block conversion. 
However, we do something similar for zeroing for sub-FSB writes. I am 
not sure if that is the same thing really, as we only round up to FSB 
size. Opinion?

Thanks,
John
Darrick J. Wong Aug. 30, 2024, 11:56 p.m. UTC | #5
On Fri, Aug 30, 2024 at 04:48:36PM +0100, John Garry wrote:
> On 22/08/2024 21:30, Darrick J. Wong wrote:
> > > Then, the iomap->type/flag is either IOMAP_UNWRITTEN/IOMAP_F_DIRTY or
> > > IOMAP_MAPPED/IOMAP_F_DIRTY per iter. So the type is not consistent. However
> > > we will set IOMAP_DIO_UNWRITTEN in dio->flags, so call xfs_dio_write_endio()
> > > -> xfs_iomap_write_unwritten() for the complete FSB range.
> > > 
> > > Do you see a problem with this?
> 
> Sorry again for the slow response.
> 
> > > 
> > > Please see this also for some more background:
> > > https://lore.kernel.org/linux-xfs/20240726171358.GA27612@lst.de/
> > Yes -- if you have a mix of written and unwritten blocks for the same
> > chunk of physical space:
> > 
> > 0      7
> > WUWUWUWU
> > 
> > the directio ioend function will start four separate transactions to
> > convert blocks 1, 3, 5, and 7 to written status.  If the system crashes
> > midway through, they will see this afterwards:
> > 
> > WWWWW0W0
> > 
> > IOWs, although the *disk write* was completed successfully, the mapping
> > updates were torn, and the user program sees a torn write.
> > 
> > The most performant/painful way to fix this would be to make the whole
> > ioend completion a logged operation so that we could commit to updating
> > all the unwritten mappings and restart it after a crash.
> 
> could we make it logged for those special cases which we are interested in
> only?

Yes, though this is the long route -- you get to define a new ondisk log
item, build all the incore structures to process them, and then define a
new high level operation that uses the state encoded in that new log
item to run all the ioend completion transactions within that framework.
Also you get to add a new log incompat feature bit for this.

Perhaps we should analyze the cost of writing and QA'ing all that vs.
the amount of time saved in the handling of this corner case using one
of the less exciting options.

> > The least performant of course is to write zeroes at allocation time,
> > like we do for fsdax.
> 
> That idea was already proposed:
> https://lore.kernel.org/linux-xfs/ZcGIPlNCkL6EDx3Z@dread.disaster.area/

Yes, I'm aware.

> > A possible middle ground would be to detect IOMAP_ATOMIC in the
> > ->iomap_begin method, notice that there are mixed mappings under the
> > proposed untorn IO, and pre-convert the unwritten blocks by writing
> > zeroes to disk and updating the mappings
> 
> Won't that have the same issue as using XFS_BMAPI_ZERO, above i.e. zeroing
> during allocation?

Only if you set the forcealign size to > 1fsb and fail to write new
file data in forcealign units, even for non-untorn writes.  If all
writes to the file are aligned to the forcealign size then there's only
one extent conversion to be done, and that cannot be torn.

> > before handing the one single
> > mapping back to iomap_dio_rw to stage the untorn writes bio.  At least
> > you'd only be suffering that penalty for the (probable) corner case of
> > someone creating mixed mappings.
> 
> BTW, one issue I have with the sub-extent(or -alloc unit) zeroing from v4
> series is how the unwritten conversion has changed, like:
> 
> xfs_iomap_write_unwritten()
> {
> 	unsigned int rounding;
> 
> 	/* when converting anything unwritten, we must be spanning an alloc unit,
> so round up/down */
> 	if (rounding > 1) {
> 		offset_fsb = rounddown(rounding);
> 		count_fsb = roundup(rounding);
> 	}
> 
> 	...
> 	do {
> 		xfs_bmapi_write();
> 		...
> 		xfs_trans_commit();
> 	} while ();
> }
> 
> I'm not too happy with it and it seems a bit of a bodge, as I would rather
> we report the complete size written (user data and zeroes); then
> xfs_iomap_write_unwritten() would do proper individual block conversion.
> However, we do something similar for zeroing for sub-FSB writes. I am not
> sure if that is the same thing really, as we only round up to FSB size.
> Opinion?

xfs_iomap_write_unwritten is in the ioend path; that's not what I was
talking about.

I'm talking about a separate change to the xfs_direct_write_iomap_begin
function that would detect the case where the bmapi_read returns an
@imap that doesn't span the whole forcealign region, then repeatedly
calls bmapi_write(BMAPI_ZERO | BMAPI_CONVERT) on any unwritten mappings
within that file range until the original bmapi_read would return a
single written mapping.
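
A very rough sketch of that loop, just to show the shape (locking, transaction setup,
hole handling and the zeroing itself are all elided, and
xfs_atomic_write_convert_unwritten() is a hypothetical helper, not an existing
function):

	/* in xfs_direct_write_iomap_begin(), when IOMAP_ATOMIC is set */
	for (;;) {
		struct xfs_bmbt_irec imap;
		int nimaps = 1;

		error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				&imap, &nimaps, 0);
		if (error)
			return error;

		/* one written mapping spanning the whole range: done */
		if (imap.br_state == XFS_EXT_NORM &&
		    imap.br_startoff <= offset_fsb &&
		    imap.br_startoff + imap.br_blockcount >= end_fsb)
			break;

		/*
		 * Hypothetical helper: zero and convert the unwritten part,
		 * e.g. via xfs_bmapi_write() with
		 * XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT in its own transaction.
		 */
		error = xfs_atomic_write_convert_unwritten(ip, &imap,
				offset_fsb, end_fsb);
		if (error)
			return error;
	}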

--D

> 
> Thanks,
> John
> 
> 
>
John Garry Sept. 3, 2024, 12:43 p.m. UTC | #6
On 31/08/2024 00:56, Darrick J. Wong wrote:
>>> IOWs, although the *disk write* was completed successfully, the mapping
>>> updates were torn, and the user program sees a torn write.
>>>
>>> The most performant/painful way to fix this would be to make the whole
>>> ioend completion a logged operation so that we could commit to updating
>>> all the unwritten mappings and restart it after a crash.
>> could we make it logged for those special cases which we are interested in
>> only?
> Yes, though this is the long route -- you get to define a new ondisk log
> item, build all the incore structures to process them, and then define a
> new high level operation that uses the state encoded in that new log
> item to run all the ioend completion transactions within that framework.
> Also you get to add a new log incompat feature bit for this.
> 
> Perhaps we should analyze the cost of writing and QA'ing all that vs.
> the amount of time saved in the handling of this corner case using one
> of the less exciting options.

 From the sound of all the changes required, I am not too keen on that 
option...

> 
>>> The least performant of course is to write zeroes at allocation time,
>>> like we do for fsdax.
>> That idea was already proposed:
>> https://lore.kernel.org/linux-xfs/ZcGIPlNCkL6EDx3Z@dread.disaster.area/
> Yes, I'm aware.
> 
>>> A possible middle ground would be to detect IOMAP_ATOMIC in the
>>> ->iomap_begin method, notice that there are mixed mappings under the
>>> proposed untorn IO, and pre-convert the unwritten blocks by writing
>>> zeroes to disk and updating the mappings
>> Won't that have the same issue as using XFS_BMAPI_ZERO, above i.e. zeroing
>> during allocation?
> Only if you set the forcealign size to > 1fsb and fail to write new
> file data in forcealign units, even for non-untorn writes.  If all
> writes to the file are aligned to the forcealign size then there's only
> one extent conversion to be done, and that cannot be torn.
> 
>>> before handing the one single
>>> mapping back to iomap_dio_rw to stage the untorn writes bio.  At least
>>> you'd only be suffering that penalty for the (probable) corner case of
>>> someone creating mixed mappings.
>> BTW, one issue I have with the sub-extent(or -alloc unit) zeroing from v4
>> series is how the unwritten conversion has changed, like:
>>
>> xfs_iomap_write_unwritten()
>> {
>> 	unsigned int rounding;
>>
>> 	/* when converting anything unwritten, we must be spanning an alloc unit,
>> so round up/down */
>> 	if (rounding > 1) {
>> 		offset_fsb = rounddown(rounding);
>> 		count_fsb = roundup(rounding);
>> 	}
>>
>> 	...
>> 	do {
>> 		xfs_bmapi_write();
>> 		...
>> 		xfs_trans_commit();
>> 	} while ();
>> }
>>
>> I'm not too happy with it and it seems a bit of a bodge, as I would rather
>> we report the complete size written (user data and zeroes); then
>> xfs_iomap_write_unwritten() would do proper individual block conversion.
>> However, we do something similar for zeroing for sub-FSB writes. I am not
>> sure if that is the same thing really, as we only round up to FSB size.
>> Opinion?
> xfs_iomap_write_unwritten is in the ioend path; that's not what I was
> talking about.

Sure, it's not the same as what you are talking about, but I am just 
mentioning it as it was included in my sub-FS extent zeroing solution 
and I am not too happy about it. It's just a concern there.

> 
> I'm talking about a separate change to the xfs_direct_write_iomap_begin
> function that would detect the case where the bmapi_read returns an
> @imap that doesn't span the whole forcealign region, then repeatedly
> calls bmapi_write(BMAPI_ZERO | BMAPI_CONVERT) on any unwritten mappings
> within that file range until the original bmapi_read would return a
> single written mapping.

Right, I get the idea. I'll check it further.

Cheers,
John
diff mbox series

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f3b43d223a46..72f28d53ab03 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -37,6 +37,7 @@  struct iomap_dio {
 	int			error;
 	size_t			done_before;
 	bool			wait_for_completion;
+	struct bio		*atomic_bio;
 
 	union {
 		/* used during submission and for synchronous completion: */
@@ -61,6 +62,24 @@  static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
 	return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
 }
 
+static struct bio *iomap_dio_alloc_bio_data(const struct iomap_iter *iter,
+		struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf,
+		loff_t pos)
+{
+	struct bio *bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, opf);
+	struct inode *inode = iter->inode;
+
+	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
+				  GFP_KERNEL);
+	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
+	bio->bi_write_hint = inode->i_write_hint;
+	bio->bi_ioprio = dio->iocb->ki_ioprio;
+	bio->bi_private = dio;
+	bio->bi_end_io = iomap_dio_bio_end_io;
+
+	return bio;
+}
+
 static void iomap_dio_submit_bio(const struct iomap_iter *iter,
 		struct iomap_dio *dio, struct bio *bio, loff_t pos)
 {
@@ -256,7 +275,7 @@  static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
  * clearing the WRITE_THROUGH flag in the dio request.
  */
 static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
-		const struct iomap *iomap, bool use_fua)
+		const struct iomap *iomap, bool use_fua, bool atomic)
 {
 	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
 
@@ -268,6 +287,8 @@  static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
 		opflags |= REQ_FUA;
 	else
 		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+	if (atomic)
+		opflags |= REQ_ATOMIC;
 
 	return opflags;
 }
@@ -275,21 +296,23 @@  static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
 static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		struct iomap_dio *dio)
 {
+	bool atomic = dio->iocb->ki_flags & IOCB_ATOMIC;
 	const struct iomap *iomap = &iter->iomap;
 	struct inode *inode = iter->inode;
 	unsigned int fs_block_size = i_blocksize(inode), pad;
+	struct iov_iter *i = dio->submit.iter;
 	loff_t length = iomap_length(iter);
 	loff_t pos = iter->pos;
 	blk_opf_t bio_opf;
 	struct bio *bio;
 	bool need_zeroout = false;
 	bool use_fua = false;
-	int nr_pages, ret = 0;
+	int nr_pages, orig_nr_pages, ret = 0;
 	size_t copied = 0;
 	size_t orig_count;
 
 	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
-	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
+	    !bdev_iter_is_aligned(iomap->bdev, i))
 		return -EINVAL;
 
 	if (iomap->type == IOMAP_UNWRITTEN) {
@@ -322,15 +345,35 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
 	}
 
+	if (dio->atomic_bio) {
+		/*
+		 * These should not fail, but check just in case.
+		 * Caller takes care of freeing the bio.
+		 */
+		if (iter->iomap.bdev != dio->atomic_bio->bi_bdev) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (dio->atomic_bio->bi_iter.bi_sector +
+		    (dio->atomic_bio->bi_iter.bi_size >> SECTOR_SHIFT) !=
+			iomap_sector(iomap, pos)) {
+			ret = -EINVAL;
+			goto out;
+		}
+	} else if (atomic) {
+		orig_nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
+	}
+
 	/*
 	 * Save the original count and trim the iter to just the extent we
 	 * are operating on right now.  The iter will be re-expanded once
 	 * we are done.
 	 */
-	orig_count = iov_iter_count(dio->submit.iter);
-	iov_iter_truncate(dio->submit.iter, length);
+	orig_count = iov_iter_count(i);
+	iov_iter_truncate(i, length);
 
-	if (!iov_iter_count(dio->submit.iter))
+	if (!iov_iter_count(i))
 		goto out;
 
 	/*
@@ -365,27 +408,46 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	 * can set up the page vector appropriately for a ZONE_APPEND
 	 * operation.
 	 */
-	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
+
+	if (atomic) {
+		size_t orig_atomic_size;
+
+		if (!dio->atomic_bio) {
+			dio->atomic_bio = iomap_dio_alloc_bio_data(iter,
+					dio, orig_nr_pages, bio_opf, pos);
+		}
+		orig_atomic_size = dio->atomic_bio->bi_iter.bi_size;
+
+		/*
+		 * In case of error, caller takes care of freeing the bio. The
+		 * smallest size of atomic write is i_node size, so no need for
+		 * tail zeroing out.
+		 */
+		ret = bio_iov_iter_get_pages(dio->atomic_bio, i);
+		if (!ret) {
+			copied = dio->atomic_bio->bi_iter.bi_size -
+				orig_atomic_size;
+		}
 
-	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
+		dio->size += copied;
+		goto out;
+	}
+
+	nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
 	do {
 		size_t n;
 		if (dio->error) {
-			iov_iter_revert(dio->submit.iter, copied);
+			iov_iter_revert(i, copied);
 			copied = ret = 0;
 			goto out;
 		}
 
-		bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
+		bio = iomap_dio_alloc_bio_data(iter, dio, nr_pages, bio_opf, pos);
 		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
 					  GFP_KERNEL);
-		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
-		bio->bi_write_hint = inode->i_write_hint;
-		bio->bi_ioprio = dio->iocb->ki_ioprio;
-		bio->bi_private = dio;
-		bio->bi_end_io = iomap_dio_bio_end_io;
 
-		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
+		ret = bio_iov_iter_get_pages(bio, i);
 		if (unlikely(ret)) {
 			/*
 			 * We have to stop part way through an IO. We must fall
@@ -408,8 +470,7 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		dio->size += n;
 		copied += n;
 
-		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
-						 BIO_MAX_VECS);
+		nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
 		/*
 		 * We can only poll for single bio I/Os.
 		 */
@@ -435,7 +496,7 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	}
 out:
 	/* Undo iter limitation to current extent */
-	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
+	iov_iter_reexpand(i, orig_count - copied);
 	if (copied)
 		return copied;
 	return ret;
@@ -555,6 +616,7 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	struct blk_plug plug;
 	struct iomap_dio *dio;
 	loff_t ret = 0;
+	size_t orig_count = iov_iter_count(iter);
 
 	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
 
@@ -580,6 +642,13 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		iomi.flags |= IOMAP_NOWAIT;
 
+	if (iocb->ki_flags & IOCB_ATOMIC) {
+		if (bio_iov_vecs_to_alloc(iter, INT_MAX) > BIO_MAX_VECS)
+			return ERR_PTR(-EINVAL);
+		iomi.flags |= IOMAP_ATOMIC;
+	}
+	dio->atomic_bio = NULL;
+
 	if (iov_iter_rw(iter) == READ) {
 		/* reads can always complete inline */
 		dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -665,6 +734,21 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		iocb->ki_flags &= ~IOCB_HIPRI;
 	}
 
+	if (iocb->ki_flags & IOCB_ATOMIC) {
+		if (ret >= 0) {
+			if (dio->size == orig_count) {
+				iomap_dio_submit_bio(&iomi, dio,
+					dio->atomic_bio, iocb->ki_pos);
+			} else {
+				if (dio->atomic_bio)
+					bio_put(dio->atomic_bio);
+				ret = -EINVAL;
+			}
+		} else if (dio->atomic_bio) {
+			bio_put(dio->atomic_bio);
+		}
+	}
+
 	blk_finish_plug(&plug);
 
 	/*
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 0a991c4ce87d..4118a42cdab0 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@  DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
 	{ IOMAP_REPORT,		"REPORT" }, \
 	{ IOMAP_FAULT,		"FAULT" }, \
 	{ IOMAP_DIRECT,		"DIRECT" }, \
-	{ IOMAP_NOWAIT,		"NOWAIT" }
+	{ IOMAP_NOWAIT,		"NOWAIT" }, \
+	{ IOMAP_ATOMIC,		"ATOMIC" }
 
 #define IOMAP_F_FLAGS_STRINGS \
 	{ IOMAP_F_NEW,		"NEW" }, \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6fc1c858013d..8fd949442262 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -178,6 +178,7 @@  struct iomap_folio_ops {
 #else
 #define IOMAP_DAX		0
 #endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC		(1 << 9)
 
 struct iomap_ops {
 	/*