diff mbox series

[1/5] iomap: simplify logic for when a dio can get completed inline

Message ID 20230718194920.1472184-3-axboe@kernel.dk (mailing list archive)
State New
Headers show
Series Improve async iomap DIO performance | expand

Commit Message

Jens Axboe July 18, 2023, 7:49 p.m. UTC
Currently iomap gates this on !IOMAP_DIO_WRITE, but this isn't entirely
accurate. Some writes can complete just fine inline. One such example is
polled IO, where the completion always happens in task context.

Add IOMAP_DIO_INLINE_COMP which tells the completion side if we can
complete this dio inline, or if it needs punting to a workqueue. We set
this flag by default for any dio, and turn it off for unwritten extents
or blocks that require a sync at completion time.

Gate the inline completion on whether we're in a task or not as well.
This will always be true for polled IO, but for IRQ driven IO, the
completion context may not allow for inline completions.

Testing a basic QD 1..8 dio random write with polled IO with the
following fio job:

fio --name=polled-dio-write --filename=/data1/file --time_based=1 \
--runtime=10 --bs=4096 --rw=randwrite --norandommap --buffered=0 \
--cpus_allowed=4 --ioengine=io_uring --iodepth=$depth --hipri=1

yields:

        Stock   Patched         Diff
=======================================
QD1     180K    201K            +11%
QD2     356K    394K            +10%
QD4     608K    650K            +7%
QD8     827K    831K            +0.5%

which shows a nice win, particularly for lower queue depth writes.
This is expected, as higher queue depths will be busy polling
completions while the offloaded workqueue completions can happen in
parallel.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/iomap/direct-io.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

Comments

Dave Chinner July 18, 2023, 10:56 p.m. UTC | #1
On Tue, Jul 18, 2023 at 01:49:16PM -0600, Jens Axboe wrote:
> Currently iomap gates this on !IOMAP_DIO_WRITE, but this isn't entirely
> accurate. Some writes can complete just fine inline. One such example is
> polled IO, where the completion always happens in task context.
> 
> Add IOMAP_DIO_INLINE_COMP which tells the completion side if we can
> complete this dio inline, or if it needs punting to a workqueue. We set
> this flag by default for any dio, and turn it off for unwritten extents
> or blocks that require a sync at completion time.

Ignoring the O_DSYNC case (I'll get to that at the end), this is
still wrong - it misses extending writes that need to change file
size at IO completion. For some filesystems, file extension at IO
completion has the same constraints as unwritten extent conversion
(i.e. requires locking and transactions), but the iomap
infrastructure has no idea whether the filesystem performing the IO
requires this or not.

i.e. if iomap always punts unwritten extent IO to a workqueue, we
also have to punt extending writes to a workqueue.  Fundamentally,
the iomap code itself cannot make a correct determination of whether
IO completion of any specific write IO requires completion in task
context.

Only the filesystem knows that,

However, the filesystem knows if the IO is going to need IO
completion processing at submission time. It tells iomap that it
needs completion processing via the IOMAP_F_DIRTY flag. This allows
filesystems to determine what IOs iomap can consider as "writes that
don't need filesystem completion processing".

With this flag, iomap can optimise the IO appropriately. We can use
REQ_FUA for O_DSYNC writes if IOMAP_F_DIRTY is not set. We can do
inline completion if IOMAP_F_DIRTY is not set. But if IOMAP_F_DIRTY
is set, the filesystem needs to run it's own completion processing,
and so iomap cannot run that write with an inline completion.

> Gate the inline completion on whether we're in a task or not as well.
> This will always be true for polled IO, but for IRQ driven IO, the
> completion context may not allow for inline completions.

Again, context does not matter for pure overwrites - we can complete
them inline regardless of completion context. The task context only
matters when the filesystem needs to do completion work, and we've
already established that we are not doing inline completion
for polled IO for unwritten, O_DSYNC or extending file writes.

IOWs, we already avoid polled completions for all the situations
where IOMAP_F_DIRTY is set by the filesystem to indicate the
operation is not a pure overwrite....


> ---
>  fs/iomap/direct-io.c | 14 +++++++++-----
>  1 file changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index ea3b868c8355..6fa77094cf0a 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -20,6 +20,7 @@
>   * Private flags for iomap_dio, must not overlap with the public ones in
>   * iomap.h:
>   */
> +#define IOMAP_DIO_INLINE_COMP	(1 << 27)
>  #define IOMAP_DIO_WRITE_FUA	(1 << 28)
>  #define IOMAP_DIO_NEED_SYNC	(1 << 29)
>  #define IOMAP_DIO_WRITE		(1 << 30)
> @@ -161,15 +162,15 @@ void iomap_dio_bio_end_io(struct bio *bio)
>  			struct task_struct *waiter = dio->submit.waiter;
>  			WRITE_ONCE(dio->submit.waiter, NULL);
>  			blk_wake_io_task(waiter);
> -		} else if (dio->flags & IOMAP_DIO_WRITE) {
> +		} else if ((dio->flags & IOMAP_DIO_INLINE_COMP) && in_task()) {

Regardless of whether the code is correct or not, this needs a
comment explaining what problem the in_task() check is working
around...

> +			WRITE_ONCE(dio->iocb->private, NULL);
> +			iomap_dio_complete_work(&dio->aio.work);
> +		} else {
>  			struct inode *inode = file_inode(dio->iocb->ki_filp);
>  
>  			WRITE_ONCE(dio->iocb->private, NULL);
>  			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
>  			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
> -		} else {
> -			WRITE_ONCE(dio->iocb->private, NULL);
> -			iomap_dio_complete_work(&dio->aio.work);
>  		}
>  	}
>  
> @@ -244,6 +245,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  
>  	if (iomap->type == IOMAP_UNWRITTEN) {
>  		dio->flags |= IOMAP_DIO_UNWRITTEN;
> +		dio->flags &= ~IOMAP_DIO_INLINE_COMP;
>  		need_zeroout = true;
>  	}
>  
> @@ -500,7 +502,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	dio->i_size = i_size_read(inode);
>  	dio->dops = dops;
>  	dio->error = 0;
> -	dio->flags = 0;
> +	/* default to inline completion, turned off when not supported */
> +	dio->flags = IOMAP_DIO_INLINE_COMP;
>  	dio->done_before = done_before;

I think this is poorly coded. If we get the clearing logic
wrong (as is the case in this patch) then bad things will
happen when we run inline completion in an irq context when
the filesystem needs to run a transaction. e.g. file extension.

It looks to me like you hacked around this "default is wrong" case
with the "in_task()" check in completion, but given that check is
completely undocumented....

>  	dio->submit.iter = iter;
> @@ -535,6 +538,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		/* for data sync or sync, we need sync completion processing */
>  		if (iocb_is_dsync(iocb)) {
>  			dio->flags |= IOMAP_DIO_NEED_SYNC;
> +			dio->flags &= ~IOMAP_DIO_INLINE_COMP;

This is looks wrong, too. We set IOMAP_DIO_WRITE_FUA ca couple of
lines later, and during bio submission we check if REQ_FUA can be
used if IOMAP_F_DIRTY is not set. If all the bios we submit use
REQ_FUA, then we clear IOMAP_DIO_NEED_SYNC before we drop the dio
submission reference.

For such a REQ_FUA bio chains, we can now safely do inline
completion because we don't run generic_write_sync() in IO
completion now. The filesystem does not need to perform blocking or
IO operations in completion, either, so these IOs can be completed
in line like any other pure overwrite DIO....

-Dave.
Jens Axboe July 19, 2023, 3:23 p.m. UTC | #2
On 7/18/23 4:56?PM, Dave Chinner wrote:
> On Tue, Jul 18, 2023 at 01:49:16PM -0600, Jens Axboe wrote:
>> Currently iomap gates this on !IOMAP_DIO_WRITE, but this isn't entirely
>> accurate. Some writes can complete just fine inline. One such example is
>> polled IO, where the completion always happens in task context.
>>
>> Add IOMAP_DIO_INLINE_COMP which tells the completion side if we can
>> complete this dio inline, or if it needs punting to a workqueue. We set
>> this flag by default for any dio, and turn it off for unwritten extents
>> or blocks that require a sync at completion time.
> 
> Ignoring the O_DSYNC case (I'll get to that at the end), this is
> still wrong - it misses extending writes that need to change file
> size at IO completion. For some filesystems, file extension at IO
> completion has the same constraints as unwritten extent conversion
> (i.e. requires locking and transactions), but the iomap
> infrastructure has no idea whether the filesystem performing the IO
> requires this or not.
> 
> i.e. if iomap always punts unwritten extent IO to a workqueue, we
> also have to punt extending writes to a workqueue.  Fundamentally,
> the iomap code itself cannot make a correct determination of whether
> IO completion of any specific write IO requires completion in task
> context.
> 
> Only the filesystem knows that,
> 
> However, the filesystem knows if the IO is going to need IO
> completion processing at submission time. It tells iomap that it
> needs completion processing via the IOMAP_F_DIRTY flag. This allows
> filesystems to determine what IOs iomap can consider as "writes that
> don't need filesystem completion processing".
> 
> With this flag, iomap can optimise the IO appropriately. We can use
> REQ_FUA for O_DSYNC writes if IOMAP_F_DIRTY is not set. We can do
> inline completion if IOMAP_F_DIRTY is not set. But if IOMAP_F_DIRTY
> is set, the filesystem needs to run it's own completion processing,
> and so iomap cannot run that write with an inline completion.

Gotcha, so we need to gate INLINE_COMP on !IOMAP_F_DIRTY as well. I'll
make that change.

>> Gate the inline completion on whether we're in a task or not as well.
>> This will always be true for polled IO, but for IRQ driven IO, the
>> completion context may not allow for inline completions.
> 
> Again, context does not matter for pure overwrites - we can complete
> them inline regardless of completion context. The task context only
> matters when the filesystem needs to do completion work, and we've
> already established that we are not doing inline completion
> for polled IO for unwritten, O_DSYNC or extending file writes.

Right, looks like I was just missing that bit as well, I assumed that
the previous case would co er it.

> IOWs, we already avoid polled completions for all the situations
> where IOMAP_F_DIRTY is set by the filesystem to indicate the
> operation is not a pure overwrite....

Yep

>>  fs/iomap/direct-io.c | 14 +++++++++-----
>>  1 file changed, 9 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>> index ea3b868c8355..6fa77094cf0a 100644
>> --- a/fs/iomap/direct-io.c
>> +++ b/fs/iomap/direct-io.c
>> @@ -20,6 +20,7 @@
>>   * Private flags for iomap_dio, must not overlap with the public ones in
>>   * iomap.h:
>>   */
>> +#define IOMAP_DIO_INLINE_COMP	(1 << 27)
>>  #define IOMAP_DIO_WRITE_FUA	(1 << 28)
>>  #define IOMAP_DIO_NEED_SYNC	(1 << 29)
>>  #define IOMAP_DIO_WRITE		(1 << 30)
>> @@ -161,15 +162,15 @@ void iomap_dio_bio_end_io(struct bio *bio)
>>  			struct task_struct *waiter = dio->submit.waiter;
>>  			WRITE_ONCE(dio->submit.waiter, NULL);
>>  			blk_wake_io_task(waiter);
>> -		} else if (dio->flags & IOMAP_DIO_WRITE) {
>> +		} else if ((dio->flags & IOMAP_DIO_INLINE_COMP) && in_task()) {
> 
> Regardless of whether the code is correct or not, this needs a
> comment explaining what problem the in_task() check is working
> around...

It's meant to catch cases where we're doing polled IO, but it got
cleared/disabled in the block layer. We cannot catch this at submission
time, it has to be checked at completion time. There are a few ways we
could check for that, one would be in_task(), another would be to check
the bio REQ_POLLED flag like v1 did. I don't have a strong preference
here, though it did seem like a saner check to use in_task() as generic
catch-all for if we're doing this from soft/hard irq processing or not,
unexpectedly.


>> +			WRITE_ONCE(dio->iocb->private, NULL);
>> +			iomap_dio_complete_work(&dio->aio.work);
>> +		} else {
>>  			struct inode *inode = file_inode(dio->iocb->ki_filp);
>>  
>>  			WRITE_ONCE(dio->iocb->private, NULL);
>>  			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
>>  			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
>> -		} else {
>> -			WRITE_ONCE(dio->iocb->private, NULL);
>> -			iomap_dio_complete_work(&dio->aio.work);
>>  		}
>>  	}
>>  
>> @@ -244,6 +245,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>  
>>  	if (iomap->type == IOMAP_UNWRITTEN) {
>>  		dio->flags |= IOMAP_DIO_UNWRITTEN;
>> +		dio->flags &= ~IOMAP_DIO_INLINE_COMP;
>>  		need_zeroout = true;
>>  	}
>>  
>> @@ -500,7 +502,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>>  	dio->i_size = i_size_read(inode);
>>  	dio->dops = dops;
>>  	dio->error = 0;
>> -	dio->flags = 0;
>> +	/* default to inline completion, turned off when not supported */
>> +	dio->flags = IOMAP_DIO_INLINE_COMP;
>>  	dio->done_before = done_before;
> 
> I think this is poorly coded. If we get the clearing logic
> wrong (as is the case in this patch) then bad things will
> happen when we run inline completion in an irq context when
> the filesystem needs to run a transaction. e.g. file extension.

Agree, it seems a bit fragile. The alternative is doing it the other way
around, enabling it for cases that we know it'll work for instead. I'll
take a stab at that approach along with the other changes.

> It looks to me like you hacked around this "default is wrong" case
> with the "in_task()" check in completion, but given that check is
> completely undocumented....

It's not a hacky work-around, it's a known case that could go wrong.
> 
>>  	dio->submit.iter = iter;
>> @@ -535,6 +538,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>>  		/* for data sync or sync, we need sync completion processing */
>>  		if (iocb_is_dsync(iocb)) {
>>  			dio->flags |= IOMAP_DIO_NEED_SYNC;
>> +			dio->flags &= ~IOMAP_DIO_INLINE_COMP;
> 
> This is looks wrong, too. We set IOMAP_DIO_WRITE_FUA ca couple of
> lines later, and during bio submission we check if REQ_FUA can be
> used if IOMAP_F_DIRTY is not set. If all the bios we submit use
> REQ_FUA, then we clear IOMAP_DIO_NEED_SYNC before we drop the dio
> submission reference.
> 
> For such a REQ_FUA bio chains, we can now safely do inline
> completion because we don't run generic_write_sync() in IO
> completion now. The filesystem does not need to perform blocking or
> IO operations in completion, either, so these IOs can be completed
> in line like any other pure overwrite DIO....

True, non-extending FUA writes would be fine as well.

Thanks for the review!
diff mbox series

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index ea3b868c8355..6fa77094cf0a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -20,6 +20,7 @@ 
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
+#define IOMAP_DIO_INLINE_COMP	(1 << 27)
 #define IOMAP_DIO_WRITE_FUA	(1 << 28)
 #define IOMAP_DIO_NEED_SYNC	(1 << 29)
 #define IOMAP_DIO_WRITE		(1 << 30)
@@ -161,15 +162,15 @@  void iomap_dio_bio_end_io(struct bio *bio)
 			struct task_struct *waiter = dio->submit.waiter;
 			WRITE_ONCE(dio->submit.waiter, NULL);
 			blk_wake_io_task(waiter);
-		} else if (dio->flags & IOMAP_DIO_WRITE) {
+		} else if ((dio->flags & IOMAP_DIO_INLINE_COMP) && in_task()) {
+			WRITE_ONCE(dio->iocb->private, NULL);
+			iomap_dio_complete_work(&dio->aio.work);
+		} else {
 			struct inode *inode = file_inode(dio->iocb->ki_filp);
 
 			WRITE_ONCE(dio->iocb->private, NULL);
 			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
 			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
-		} else {
-			WRITE_ONCE(dio->iocb->private, NULL);
-			iomap_dio_complete_work(&dio->aio.work);
 		}
 	}
 
@@ -244,6 +245,7 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 
 	if (iomap->type == IOMAP_UNWRITTEN) {
 		dio->flags |= IOMAP_DIO_UNWRITTEN;
+		dio->flags &= ~IOMAP_DIO_INLINE_COMP;
 		need_zeroout = true;
 	}
 
@@ -500,7 +502,8 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	dio->i_size = i_size_read(inode);
 	dio->dops = dops;
 	dio->error = 0;
-	dio->flags = 0;
+	/* default to inline completion, turned off when not supported */
+	dio->flags = IOMAP_DIO_INLINE_COMP;
 	dio->done_before = done_before;
 
 	dio->submit.iter = iter;
@@ -535,6 +538,7 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		/* for data sync or sync, we need sync completion processing */
 		if (iocb_is_dsync(iocb)) {
 			dio->flags |= IOMAP_DIO_NEED_SYNC;
+			dio->flags &= ~IOMAP_DIO_INLINE_COMP;
 
 		       /*
 			* For datasync only writes, we optimistically try