[8/8] iomap: support IOCB_DIO_DEFER

Message ID: 20230720181310.71589-9-axboe@kernel.dk (mailing list archive)
State: New
Series: Improve async iomap DIO performance

Commit Message

Jens Axboe July 20, 2023, 6:13 p.m. UTC
If IOCB_DIO_DEFER is set, utilize that to set the kiocb->dio_complete handler
and data for that callback. Rather than punt the completion to a
workqueue, we pass back the handler and data to the issuer and will get a
callback from a safe task context.
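
As an illustration (not part of this patch), a sketch of what the issuer
side might look like: the ki_complete handler notices that ->dio_complete
has been set and defers the real completion to task context. The helpers
defer_to_task_context() and finish_request() are hypothetical stand-ins
for the issuer's own machinery (io_uring uses task_work for this):

static void issuer_ki_complete(struct kiocb *iocb, long res)
{
	/* may be called from IRQ context; do no filesystem work here */
	if (iocb->dio_complete) {
		/* 'res' is a dummy; the real result comes from dio_complete */
		defer_to_task_context(iocb);	/* hypothetical helper */
		return;
	}
	finish_request(iocb, res);		/* hypothetical helper */
}

/* later, invoked from safe task context */
static void issuer_run_deferred(struct kiocb *iocb)
{
	ssize_t ret = iocb->dio_complete(iocb->private);

	finish_request(iocb, ret);		/* hypothetical helper */
}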

Using the following fio job to randomly dio write 4k blocks at
queue depths of 1..16:

fio --name=dio-write --filename=/data1/file --time_based=1 \
--runtime=10 --bs=4096 --rw=randwrite --norandommap --buffered=0 \
--cpus_allowed=4 --ioengine=io_uring --iodepth=$depth

shows the following results before and after this patch:

	Stock	Patched		Diff
=======================================
QD1	155K	162K		+ 4.5%
QD2	290K	313K		+ 7.9%
QD4	533K	597K		+12.0%
QD8	604K	827K		+36.9%
QD16	615K	845K		+37.4%

which shows nice wins all around. If we factor in per-IOP efficiency,
the wins look even nicer. This becomes apparent as queue depth rises
and the offloaded workqueue completions run out of steam.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/iomap/direct-io.c | 54 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

Comments

Christoph Hellwig July 21, 2023, 6:19 a.m. UTC | #1
Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Darrick J. Wong July 21, 2023, 4:01 p.m. UTC | #2
On Thu, Jul 20, 2023 at 12:13:10PM -0600, Jens Axboe wrote:
> If IOCB_DIO_DEFER is set, utilize that to set the kiocb->dio_complete handler
> and data for that callback. Rather than punt the completion to a
> workqueue, we pass back the handler and data to the issuer and will get a
> callback from a safe task context.
> 
> Using the following fio job to randomly dio write 4k blocks at
> queue depths of 1..16:
> 
> fio --name=dio-write --filename=/data1/file --time_based=1 \
> --runtime=10 --bs=4096 --rw=randwrite --norandommap --buffered=0 \
> --cpus_allowed=4 --ioengine=io_uring --iodepth=$depth
> 
> shows the following results before and after this patch:
> 
> 	Stock	Patched		Diff
> =======================================
> QD1	155K	162K		+ 4.5%
> QD2	290K	313K		+ 7.9%
> QD4	533K	597K		+12.0%
> QD8	604K	827K		+36.9%
> QD16	615K	845K		+37.4%

Nice!

> which shows nice wins all around. If we factor in per-IOP efficiency,
> the wins look even nicer. This becomes apparent as queue depth rises
> and the offloaded workqueue completions run out of steam.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/iomap/direct-io.c | 54 +++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 53 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index cce9af019705..de86680968a4 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -20,6 +20,7 @@
>   * Private flags for iomap_dio, must not overlap with the public ones in
>   * iomap.h:
>   */
> +#define IOMAP_DIO_DEFER_COMP	(1 << 26)

IOMAP_DIO_CALLER_COMP, to go with IOCB_CALLER_COMP?

>  #define IOMAP_DIO_INLINE_COMP	(1 << 27)
>  #define IOMAP_DIO_STABLE_WRITE	(1 << 28)
>  #define IOMAP_DIO_NEED_SYNC	(1 << 29)
> @@ -132,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  }
>  EXPORT_SYMBOL_GPL(iomap_dio_complete);
>  
> +static ssize_t iomap_dio_deferred_complete(void *data)
> +{
> +	return iomap_dio_complete(data);
> +}
> +
>  static void iomap_dio_complete_work(struct work_struct *work)
>  {
>  	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
> @@ -192,6 +198,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
>  		goto release_bio;
>  	}
>  
> +	/*
> +	 * If this dio is flagged with IOMAP_DIO_DEFER_COMP, then schedule
> +	 * our completion that way to avoid an async punt to a workqueue.
> +	 */
> +	if (dio->flags & IOMAP_DIO_DEFER_COMP) {
> +		/* only polled IO cares about private cleared */
> +		iocb->private = dio;
> +		iocb->dio_complete = iomap_dio_deferred_complete;
> +
> +		/*
> +		 * Invoke ->ki_complete() directly. We've assigned out

"We've assigned our..."

> +		 * dio_complete callback handler, and since the issuer set
> +		 * IOCB_DIO_DEFER, we know their ki_complete handler will
> +		 * notice ->dio_complete being set and will defer calling that
> +		 * handler until it can be done from a safe task context.
> +		 *
> +		 * Note that the 'res' being passed in here is not important
> +		 * for this case. The actual completion value of the request
> +		 * will be gotten from dio_complete when that is run by the
> +		 * issuer.
> +		 */
> +		iocb->ki_complete(iocb, 0);
> +		goto release_bio;
> +	}
> +
>  	/*
>  	 * Async DIO completion that requires filesystem level completion work
>  	 * gets punted to a work queue to complete as the operation may require
> @@ -288,12 +319,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		 * after IO completion such as unwritten extent conversion) and
>  		 * the underlying device either supports FUA or doesn't have
>  		 * a volatile write cache. This allows us to avoid cache flushes
> -		 * on IO completion.
> +		 * on IO completion. If we can't use stable writes and need to

"If we can't use writethrough and need to sync..."

> +		 * sync, disable in-task completions as dio completion will
> +		 * need to call generic_write_sync() which will do a blocking
> +		 * fsync / cache flush call.
>  		 */
>  		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
>  		    (dio->flags & IOMAP_DIO_STABLE_WRITE) &&
>  		    (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
>  			use_fua = true;
> +		else if (dio->flags & IOMAP_DIO_NEED_SYNC)
> +			dio->flags &= ~IOMAP_DIO_DEFER_COMP;
>  	}
>  
>  	/*
> @@ -319,6 +355,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		pad = pos & (fs_block_size - 1);
>  		if (pad)
>  			iomap_dio_zero(iter, dio, pos - pad, pad);
> +
> +		/*
> +		 * If need_zeroout is set, then this is a new or unwritten
> +		 * extent. These need extra handling at completion time, so

"...then this is a new or unwritten extent, or dirty file metadata have
not been persisted to disk."

> +		 * disable in-task deferred completion for those.
> +		 */
> +		dio->flags &= ~IOMAP_DIO_DEFER_COMP;
>  	}
>  
>  	/*
> @@ -557,6 +600,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		iomi.flags |= IOMAP_WRITE;
>  		dio->flags |= IOMAP_DIO_WRITE;
>  
> +		/*
> +		 * Flag as supporting deferred completions, if the issuer
> +		 * groks it. This can avoid a workqueue punt for writes.
> +		 * We may later clear this flag if we need to do other IO
> +		 * as part of this IO completion.
> +		 */
> +		if (iocb->ki_flags & IOCB_DIO_DEFER)
> +			dio->flags |= IOMAP_DIO_DEFER_COMP;
> +

With those comment clarifications added,

Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

>  		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
>  			ret = -EAGAIN;
>  			if (iomi.pos >= dio->i_size ||
> -- 
> 2.40.1
>
Jens Axboe July 21, 2023, 4:30 p.m. UTC | #3
On 7/21/23 10:01 AM, Darrick J. Wong wrote:
>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>> index cce9af019705..de86680968a4 100644
>> --- a/fs/iomap/direct-io.c
>> +++ b/fs/iomap/direct-io.c
>> @@ -20,6 +20,7 @@
>>   * Private flags for iomap_dio, must not overlap with the public ones in
>>   * iomap.h:
>>   */
>> +#define IOMAP_DIO_DEFER_COMP	(1 << 26)
> 
> IOMAP_DIO_CALLER_COMP, to go with IOCB_CALLER_COMP?

Yep, already made that change in conjunction with the other rename.

>>  #define IOMAP_DIO_INLINE_COMP	(1 << 27)
>> +	/*
>> +	 * If this dio is flagged with IOMAP_DIO_DEFER_COMP, then schedule
>> +	 * our completion that way to avoid an async punt to a workqueue.
>> +	 */
>> +	if (dio->flags & IOMAP_DIO_DEFER_COMP) {
>> +		/* only polled IO cares about private cleared */
>> +		iocb->private = dio;
>> +		iocb->dio_complete = iomap_dio_deferred_complete;
>> +
>> +		/*
>> +		 * Invoke ->ki_complete() directly. We've assigned out
> 
> "We've assigned our..."

Fixed.

>> @@ -288,12 +319,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>  		 * after IO completion such as unwritten extent conversion) and
>>  		 * the underlying device either supports FUA or doesn't have
>>  		 * a volatile write cache. This allows us to avoid cache flushes
>> -		 * on IO completion.
>> +		 * on IO completion. If we can't use stable writes and need to
> 
> "If we can't use writethrough and need to sync..."

Fixed.

>> @@ -319,6 +355,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>  		pad = pos & (fs_block_size - 1);
>>  		if (pad)
>>  			iomap_dio_zero(iter, dio, pos - pad, pad);
>> +
>> +		/*
>> +		 * If need_zeroout is set, then this is a new or unwritten
>> +		 * extent. These need extra handling at completion time, so
> 
> "...then this is a new or unwritten extent, or dirty file metadata have
> not been persisted to disk."

Fixed.
Dave Chinner July 21, 2023, 10:05 p.m. UTC | #4
On Thu, Jul 20, 2023 at 12:13:10PM -0600, Jens Axboe wrote:
> If IOCB_DIO_DEFER is set, utilize that to set the kiocb->dio_complete handler
> and data for that callback. Rather than punt the completion to a
> workqueue, we pass back the handler and data to the issuer and will get a
> callback from a safe task context.
....
> @@ -288,12 +319,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		 * after IO completion such as unwritten extent conversion) and
>  		 * the underlying device either supports FUA or doesn't have
>  		 * a volatile write cache. This allows us to avoid cache flushes
> -		 * on IO completion.
> +		 * on IO completion. If we can't use stable writes and need to
> +		 * sync, disable in-task completions as dio completion will
> +		 * need to call generic_write_sync() which will do a blocking
> +		 * fsync / cache flush call.
>  		 */
>  		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
>  		    (dio->flags & IOMAP_DIO_STABLE_WRITE) &&
>  		    (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
>  			use_fua = true;
> +		else if (dio->flags & IOMAP_DIO_NEED_SYNC)
> +			dio->flags &= ~IOMAP_DIO_DEFER_COMP;
>  	}
>  
>  	/*
> @@ -319,6 +355,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		pad = pos & (fs_block_size - 1);
>  		if (pad)
>  			iomap_dio_zero(iter, dio, pos - pad, pad);
> +
> +		/*
> +		 * If need_zeroout is set, then this is a new or unwritten
> +		 * extent. These need extra handling at completion time, so
> +		 * disable in-task deferred completion for those.
> +		 */
> +		dio->flags &= ~IOMAP_DIO_DEFER_COMP;
>  	}

I don't think these are quite right. They miss the file extension
case that I pointed out in an earlier patch (i.e. where IOCB_HIPRI
gets cleared).

Fundamentally, I don't like having three different sets of logic which
all end up being set/cleared for the same situation - polled bios
and deferred completion should only be used in situations where
inline iomap completion can be run.

IOWs, I think the iomap_dio_bio_iter() code needs to first decide
whether IOMAP_DIO_INLINE_COMP can be set, and if it cannot be set,
we then clear both IOCB_HIPRI and IOMAP_DIO_DEFER_COMP, because
neither should be used for an IO that can not do inline completion.

i.e. this all comes down to something like this:

-	/*
-	 * We can only poll for single bio I/Os.
-	 */
-	if (need_zeroout ||
-	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
-		dio->iocb->ki_flags &= ~IOCB_HIPRI;
+	/*
+	 * We can only do inline completion for pure overwrites that
+	 * don't require additional IO at completion. This rules out
+	 * writes that need zeroing or extent conversion, extend
+	 * the file size, or issue journal IO or cache flushes
+	 * during completion processing.
+	 */
+	if (need_zeroout ||
+	    ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
+	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+		dio->flags &= ~IOMAP_DIO_INLINE_COMP;
+
+	/*
+	 * We can only use polled completions for single bio IOs, and can
+	 * only defer completion for IOs that will run inline completion.
+	 */
+	if (!(dio->flags & IOMAP_DIO_INLINE_COMP)) {
+		dio->iocb->ki_flags &= ~IOCB_HIPRI;
+		dio->flags &= ~IOMAP_DIO_DEFER_COMP;
+	}

This puts the iomap inline completion decision logic all in one
place in the submission code and clearly keys the fast path IO
completion cases to the inline completion paths.

Cheers,

Dave.
Jens Axboe July 22, 2023, 3:12 a.m. UTC | #5
On 7/21/23 4:05 PM, Dave Chinner wrote:
> On Thu, Jul 20, 2023 at 12:13:10PM -0600, Jens Axboe wrote:
>> If IOCB_DIO_DEFER is set, utilize that to set the kiocb->dio_complete handler
>> and data for that callback. Rather than punt the completion to a
>> workqueue, we pass back the handler and data to the issuer and will get a
>> callback from a safe task context.
> ....
>> @@ -288,12 +319,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>  		 * after IO completion such as unwritten extent conversion) and
>>  		 * the underlying device either supports FUA or doesn't have
>>  		 * a volatile write cache. This allows us to avoid cache flushes
>> -		 * on IO completion.
>> +		 * on IO completion. If we can't use stable writes and need to
>> +		 * sync, disable in-task completions as dio completion will
>> +		 * need to call generic_write_sync() which will do a blocking
>> +		 * fsync / cache flush call.
>>  		 */
>>  		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
>>  		    (dio->flags & IOMAP_DIO_STABLE_WRITE) &&
>>  		    (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
>>  			use_fua = true;
>> +		else if (dio->flags & IOMAP_DIO_NEED_SYNC)
>> +			dio->flags &= ~IOMAP_DIO_DEFER_COMP;
>>  	}
>>  
>>  	/*
>> @@ -319,6 +355,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>  		pad = pos & (fs_block_size - 1);
>>  		if (pad)
>>  			iomap_dio_zero(iter, dio, pos - pad, pad);
>> +
>> +		/*
>> +		 * If need_zeroout is set, then this is a new or unwritten
>> +		 * extent. These need extra handling at completion time, so
>> +		 * disable in-task deferred completion for those.
>> +		 */
>> +		dio->flags &= ~IOMAP_DIO_DEFER_COMP;
>>  	}
> 
> I don't think these are quite right. They miss the file extension
> case that I pointed out in an earlier patch (i.e. where IOCB_HIPRI
> gets cleared).
> 
> Fundamentally, I don't like having three different sets of logic which
> all end up being set/cleared for the same situation - polled bios
> and deferred completion should only be used in situations where
> inline iomap completion can be run.
> 
> IOWs, I think the iomap_dio_bio_iter() code needs to first decide
> whether IOMAP_DIO_INLINE_COMP can be set, and if it cannot be set,
> we then clear both IOCB_HIPRI and IOMAP_DIO_DEFER_COMP, because
> neither should be used for an IO that can not do inline completion.
> 
> i.e. this all comes down to something like this:
> 
> -	/*
> -	 * We can only poll for single bio I/Os.
> -	 */
> -	if (need_zeroout ||
> -	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
> -		dio->iocb->ki_flags &= ~IOCB_HIPRI;
> +	/*
> +	 * We can only do inline completion for pure overwrites that
> +	 * don't require additional IO at completion. This rules out
> +	 * writes that need zeroing or extent conversion, extend
> +	 * the file size, or issue journal IO or cache flushes
> +	 * during completion processing.
> +	 */
> +	if (need_zeroout ||
> +	    ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
> +	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
> +		dio->flags &= ~IOMAP_DIO_INLINE_COMP;
> +
> +	/*
> +	 * We can only use polled completions for single bio IOs, and can
> +	 * only defer completion for IOs that will run inline completion.
> +	 */
> +	if (!(dio->flags & IOMAP_DIO_INLINE_COMP)) {
> +		dio->iocb->ki_flags &= ~IOCB_HIPRI;
> +		dio->flags &= ~IOMAP_DIO_DEFER_COMP;
> +	}
> 
> This puts the iomap inline completion decision logic all in one
> place in the submission code and clearly keys the fast path IO
> completion cases to the inline completion paths.

I do like the suggestion of figuring out the inline part, and then
clearing HIPRI if the iocb was marked for polling and we don't have the
inline flag set. That makes it easier to follow than juggling two
sets of logic.

I'll make that change.
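
For reference, a sketch of what that respin might look like, assuming
both Dave's restructuring and the IOMAP_DIO_CALLER_COMP rename are
folded in - this is a guess at the v2, not the committed code:

	/*
	 * Sketch only: decide inline-completion eligibility first, then
	 * derive both polling and caller completion from that one answer.
	 */
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
		dio->flags &= ~IOMAP_DIO_INLINE_COMP;

	if (!(dio->flags & IOMAP_DIO_INLINE_COMP)) {
		dio->iocb->ki_flags &= ~IOCB_HIPRI;
		dio->flags &= ~IOMAP_DIO_CALLER_COMP;
	}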

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index cce9af019705..de86680968a4 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -20,6 +20,7 @@ 
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
+#define IOMAP_DIO_DEFER_COMP	(1 << 26)
 #define IOMAP_DIO_INLINE_COMP	(1 << 27)
 #define IOMAP_DIO_STABLE_WRITE	(1 << 28)
 #define IOMAP_DIO_NEED_SYNC	(1 << 29)
@@ -132,6 +133,11 @@  ssize_t iomap_dio_complete(struct iomap_dio *dio)
 }
 EXPORT_SYMBOL_GPL(iomap_dio_complete);
 
+static ssize_t iomap_dio_deferred_complete(void *data)
+{
+	return iomap_dio_complete(data);
+}
+
 static void iomap_dio_complete_work(struct work_struct *work)
 {
 	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -192,6 +198,31 @@  void iomap_dio_bio_end_io(struct bio *bio)
 		goto release_bio;
 	}
 
+	/*
+	 * If this dio is flagged with IOMAP_DIO_DEFER_COMP, then schedule
+	 * our completion that way to avoid an async punt to a workqueue.
+	 */
+	if (dio->flags & IOMAP_DIO_DEFER_COMP) {
+		/* only polled IO cares about private cleared */
+		iocb->private = dio;
+		iocb->dio_complete = iomap_dio_deferred_complete;
+
+		/*
+		 * Invoke ->ki_complete() directly. We've assigned out
+		 * dio_complete callback handler, and since the issuer set
+		 * IOCB_DIO_DEFER, we know their ki_complete handler will
+		 * notice ->dio_complete being set and will defer calling that
+		 * handler until it can be done from a safe task context.
+		 *
+		 * Note that the 'res' being passed in here is not important
+		 * for this case. The actual completion value of the request
+		 * will be gotten from dio_complete when that is run by the
+		 * issuer.
+		 */
+		iocb->ki_complete(iocb, 0);
+		goto release_bio;
+	}
+
 	/*
 	 * Async DIO completion that requires filesystem level completion work
 	 * gets punted to a work queue to complete as the operation may require
@@ -288,12 +319,17 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		 * after IO completion such as unwritten extent conversion) and
 		 * the underlying device either supports FUA or doesn't have
 		 * a volatile write cache. This allows us to avoid cache flushes
-		 * on IO completion.
+		 * on IO completion. If we can't use stable writes and need to
+		 * sync, disable in-task completions as dio completion will
+		 * need to call generic_write_sync() which will do a blocking
+		 * fsync / cache flush call.
 		 */
 		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
 		    (dio->flags & IOMAP_DIO_STABLE_WRITE) &&
 		    (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
 			use_fua = true;
+		else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+			dio->flags &= ~IOMAP_DIO_DEFER_COMP;
 	}
 
 	/*
@@ -319,6 +355,13 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		pad = pos & (fs_block_size - 1);
 		if (pad)
 			iomap_dio_zero(iter, dio, pos - pad, pad);
+
+		/*
+		 * If need_zeroout is set, then this is a new or unwritten
+		 * extent. These need extra handling at completion time, so
+		 * disable in-task deferred completion for those.
+		 */
+		dio->flags &= ~IOMAP_DIO_DEFER_COMP;
 	}
 
 	/*
@@ -557,6 +600,15 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		iomi.flags |= IOMAP_WRITE;
 		dio->flags |= IOMAP_DIO_WRITE;
 
+		/*
+		 * Flag as supporting deferred completions, if the issuer
+		 * groks it. This can avoid a workqueue punt for writes.
+		 * We may later clear this flag if we need to do other IO
+		 * as part of this IO completion.
+		 */
+		if (iocb->ki_flags & IOCB_DIO_DEFER)
+			dio->flags |= IOMAP_DIO_DEFER_COMP;
+
 		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
 			ret = -EAGAIN;
 			if (iomi.pos >= dio->i_size ||
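
To tie the pieces together, an illustrative submit-side sketch (not from
this series): an issuer that can run completions from task context opts
in by setting IOCB_DIO_DEFER before calling into the filesystem.
call_write_iter() is the real VFS helper; issuer_submit_dio_write() is a
hypothetical wrapper:

static ssize_t issuer_submit_dio_write(struct kiocb *iocb,
				       struct iov_iter *from)
{
	/* opt in: we promise a later callback from safe task context */
	iocb->ki_flags |= IOCB_DIO_DEFER;

	/*
	 * The filesystem's ->write_iter() eventually reaches
	 * __iomap_dio_rw(), which translates IOCB_DIO_DEFER into
	 * IOMAP_DIO_DEFER_COMP, clearing it again if completion needs
	 * zeroing or a blocking sync.
	 */
	return call_write_iter(iocb->ki_filp, iocb, from);
}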