
[5/8] iomap: optionally use ioends for direct I/O

Message ID 20241211085420.1380396-6-hch@lst.de (mailing list archive)
State New
Series [1/8] iomap: allow the file system to submit the writeback bios

Commit Message

Christoph Hellwig Dec. 11, 2024, 8:53 a.m. UTC
struct iomap_ioend currently tracks outstanding buffered writes and has
some really nice code in core iomap and XFS to merge contiguous I/Os
and defer them to user context for completion in a very efficient way.

For zoned writes we'll also need a per-bio user context completion to
record the written blocks, and the infrastructure for that would look
basically like the ioend handling for buffered I/O.

So instead of reinventing the wheel, reuse the existing infrastructure.
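
Concretely, the shared completion entry point just grows a branch on a
new IOMAP_IOEND_DIRECT flag, so buffered and direct I/O share the
merging and batching machinery and only diverge at the very end.  The
tail of iomap_finish_ioend() after this patch (from the buffered-io.c
hunk below):

	/* the ioend completes only when the last reference is dropped */
	if (!atomic_dec_and_test(&ioend->io_remaining))
		return 0;
	/* dispatch to the direct or buffered completion handler */
	if (ioend->io_flags & IOMAP_IOEND_DIRECT)
		return iomap_finish_ioend_direct(ioend);
	return iomap_finish_ioend_buffered(ioend);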

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/buffered-io.c |  3 +++
 fs/iomap/direct-io.c   | 50 +++++++++++++++++++++++++++++++++++++++++-
 fs/iomap/internal.h    |  7 ++++++
 include/linux/iomap.h  |  4 +++-
 4 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 fs/iomap/internal.h

Comments

Brian Foster Dec. 12, 2024, 1:29 p.m. UTC | #1
On Wed, Dec 11, 2024 at 09:53:45AM +0100, Christoph Hellwig wrote:
> struct iomap_ioend currently tracks outstanding buffered writes and has
> some really nice code in core iomap and XFS to merge contiguous I/Os
> and defer them to user context for completion in a very efficient way.
> 
> For zoned writes we'll also need a per-bio user context completion to
> record the written blocks, and the infrastructure for that would look
> basically like the ioend handling for buffered I/O.
> 
> So instead of reinventing the wheel, reuse the existing infrastructure.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap/buffered-io.c |  3 +++
>  fs/iomap/direct-io.c   | 50 +++++++++++++++++++++++++++++++++++++++++-
>  fs/iomap/internal.h    |  7 ++++++
>  include/linux/iomap.h  |  4 +++-
>  4 files changed, 62 insertions(+), 2 deletions(-)
>  create mode 100644 fs/iomap/internal.h
> 
...
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index b521eb15759e..b5466361cafe 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
...
> @@ -163,6 +166,51 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
>  	cmpxchg(&dio->error, 0, ret);
>  }
>  
> +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
> +{
> +	struct iomap_dio *dio = ioend->io_bio.bi_private;
> +	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
> +	struct kiocb *iocb = dio->iocb;
> +	u32 vec_count = ioend->io_bio.bi_vcnt;
> +
> +	if (ioend->io_error)
> +		iomap_dio_set_error(dio, ioend->io_error);
> +
> +	if (atomic_dec_and_test(&dio->ref)) {
> +		struct inode *inode = file_inode(iocb->ki_filp);
> +
> +		if (dio->wait_for_completion) {
> +			struct task_struct *waiter = dio->submit.waiter;
> +
> +			WRITE_ONCE(dio->submit.waiter, NULL);
> +			blk_wake_io_task(waiter);
> +		} else if (!inode->i_mapping->nrpages) {
> +			WRITE_ONCE(iocb->private, NULL);
> +
> +			/*
> +			 * We must never invalidate pages from this thread to
> +			 * avoid deadlocks with buffered I/O completions.
> +			 * Tough luck if you hit the tiny race with someone
> +			 * dirtying the range now.
> +			 */
> +			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
> +			iomap_dio_complete_work(&dio->aio.work);
> +		} else {
> +			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
> +			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
> +		}
> +	}
> +
> +	if (should_dirty) {
> +		bio_check_pages_dirty(&ioend->io_bio);
> +	} else {
> +		bio_release_pages(&ioend->io_bio, false);
> +		bio_put(&ioend->io_bio);
> +	}
> +

Not that it matters all that much, but I'm a little curious about the
reasoning for using vec_count here. AFAICS this correlates to per-folio
writeback completions for buffered I/O, but that doesn't seem to apply
to direct I/O. Is there a reason to have the caller throttle based on
vec_counts, or are we just pulling some non-zero value for consistency's
sake?

Brian

> +	return vec_count;
> +}
> +
>  void iomap_dio_bio_end_io(struct bio *bio)
>  {
>  	struct iomap_dio *dio = bio->bi_private;
> diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
> new file mode 100644
> index 000000000000..20cccfc3bb13
> --- /dev/null
> +++ b/fs/iomap/internal.h
> @@ -0,0 +1,7 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _IOMAP_INTERNAL_H
> +#define _IOMAP_INTERNAL_H 1
> +
> +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
> +
> +#endif /* _IOMAP_INTERNAL_H */
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index eaa8cb9083eb..f6943c80e5fd 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -343,9 +343,11 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
>  #define IOMAP_IOEND_UNWRITTEN		(1U << 1)
>  /* don't merge into previous ioend */
>  #define IOMAP_IOEND_BOUNDARY		(1U << 2)
> +/* is direct I/O */
> +#define IOMAP_IOEND_DIRECT		(1U << 3)
>  
>  #define IOMAP_IOEND_NOMERGE_FLAGS \
> -	(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN)
> +	(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
>  
>  /*
>   * Structure for writeback I/O completions.
> -- 
> 2.45.2
> 
>
Christoph Hellwig Dec. 12, 2024, 3:12 p.m. UTC | #2
On Thu, Dec 12, 2024 at 08:29:36AM -0500, Brian Foster wrote:
> > +	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
> > +	struct kiocb *iocb = dio->iocb;
> > +	u32 vec_count = ioend->io_bio.bi_vcnt;
> > +
> > +	if (ioend->io_error)
> > +		iomap_dio_set_error(dio, ioend->io_error);
> > +
> > +	if (atomic_dec_and_test(&dio->ref)) {
> > +		struct inode *inode = file_inode(iocb->ki_filp);
> > +
> > +		if (dio->wait_for_completion) {
> > +			struct task_struct *waiter = dio->submit.waiter;
> > +
> > +			WRITE_ONCE(dio->submit.waiter, NULL);
> > +			blk_wake_io_task(waiter);
> > +		} else if (!inode->i_mapping->nrpages) {
> > +			WRITE_ONCE(iocb->private, NULL);
> > +
> > +			/*
> > +			 * We must never invalidate pages from this thread to
> > +			 * avoid deadlocks with buffered I/O completions.
> > +			 * Tough luck if you hit the tiny race with someone
> > +			 * dirtying the range now.
> > +			 */
> > +			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
> > +			iomap_dio_complete_work(&dio->aio.work);
> > +		} else {
> > +			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
> > +			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
> > +		}
> > +	}
> > +
> > +	if (should_dirty) {
> > +		bio_check_pages_dirty(&ioend->io_bio);
> > +	} else {
> > +		bio_release_pages(&ioend->io_bio, false);
> > +		bio_put(&ioend->io_bio);
> > +	}
> > +
> 
> Not that it matters all that much, but I'm a little curious about the
> reasoning for using vec_count here. AFAICS this correlates to per-folio
> writeback completions for buffered I/O, but that doesn't seem to apply
> to direct I/O. Is there a reason to have the caller throttle based on
> vec_counts, or are we just pulling some non-zero value for consistency's
> sake?

So direct I/O also iterates over all folios in the bio to unpin them,
and, in the case of reads, to dirty all of them.

I wanted to plug something useful into the cond_resched() condition in
the caller.  Now the number of bvecs isn't the number of folios, as we
can physically merge outside the folio context, but I think this is
about as good as it gets without changing the block code to return the
number of folios processed from __bio_release_pages() and
bio_check_pages_dirty().
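
For reference, the throttling in question is the cond_resched() in
iomap_finish_ioends(); schematically (a simplified sketch of the loop
in fs/iomap/buffered-io.c, the exact batch constant and structure may
differ by tree):

	/*
	 * The u32 returned by iomap_finish_ioend() (folios completed
	 * for buffered I/O, bvecs for direct I/O after this patch)
	 * feeds a running count that periodically yields the CPU while
	 * walking a long chain of merged ioends.
	 */
	completions = iomap_finish_ioend(ioend, error);
	while (!list_empty(&tmp)) {
		if (completions > IOEND_BATCH_SIZE * 8) {
			cond_resched();
			completions = 0;
		}
		ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
		list_del_init(&ioend->io_list);
		completions += iomap_finish_ioend(ioend, error);
	}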
Darrick J. Wong Dec. 12, 2024, 7:56 p.m. UTC | #3
On Wed, Dec 11, 2024 at 09:53:45AM +0100, Christoph Hellwig wrote:
> struct iomap_ioend currently tracks outstanding buffered writes and has
> some really nice code in core iomap and XFS to merge contiguous I/Os
> and defer them to user context for completion in a very efficient way.
> 
> For zoned writes we'll also need a per-bio user context completion to
> record the written blocks, and the infrastructure for that would look
> basically like the ioend handling for buffered I/O.
> 
> So instead of reinventing the wheel, reuse the existing infrastructure.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap/buffered-io.c |  3 +++
>  fs/iomap/direct-io.c   | 50 +++++++++++++++++++++++++++++++++++++++++-
>  fs/iomap/internal.h    |  7 ++++++
>  include/linux/iomap.h  |  4 +++-
>  4 files changed, 62 insertions(+), 2 deletions(-)
>  create mode 100644 fs/iomap/internal.h
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 8125f758a99d..ceca9473a09c 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -17,6 +17,7 @@
>  #include <linux/bio.h>
>  #include <linux/sched/signal.h>
>  #include <linux/migrate.h>
> +#include "internal.h"
>  #include "trace.h"
>  
>  #include "../internal.h"
> @@ -1582,6 +1583,8 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
>  
>  	if (!atomic_dec_and_test(&ioend->io_remaining))
>  		return 0;
> +	if (ioend->io_flags & IOMAP_IOEND_DIRECT)
> +		return iomap_finish_ioend_direct(ioend);
>  	return iomap_finish_ioend_buffered(ioend);
>  }

I'm a little surprised that more of the iomap_ioend* functions didn't
end up in ioend.c.

> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index b521eb15759e..b5466361cafe 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -12,6 +12,7 @@
>  #include <linux/backing-dev.h>
>  #include <linux/uio.h>
>  #include <linux/task_io_accounting_ops.h>
> +#include "internal.h"
>  #include "trace.h"
>  
>  #include "../internal.h"
> @@ -20,6 +21,7 @@
>   * Private flags for iomap_dio, must not overlap with the public ones in
>   * iomap.h:
>   */
> +#define IOMAP_DIO_NO_INVALIDATE	(1U << 25)
>  #define IOMAP_DIO_CALLER_COMP	(1U << 26)
>  #define IOMAP_DIO_INLINE_COMP	(1U << 27)
>  #define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
> @@ -117,7 +119,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  	 * ->end_io() when necessary, otherwise a racing buffer read would cache
>  	 * zeros from unwritten extents.
>  	 */
> -	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
> +	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
> +	    !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
>  		kiocb_invalidate_post_direct_write(iocb, dio->size);
>  
>  	inode_dio_end(file_inode(iocb->ki_filp));
> @@ -163,6 +166,51 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
>  	cmpxchg(&dio->error, 0, ret);
>  }
>  
> +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
> +{
> +	struct iomap_dio *dio = ioend->io_bio.bi_private;
> +	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
> +	struct kiocb *iocb = dio->iocb;
> +	u32 vec_count = ioend->io_bio.bi_vcnt;
> +
> +	if (ioend->io_error)
> +		iomap_dio_set_error(dio, ioend->io_error);
> +
> +	if (atomic_dec_and_test(&dio->ref)) {
> +		struct inode *inode = file_inode(iocb->ki_filp);
> +
> +		if (dio->wait_for_completion) {
> +			struct task_struct *waiter = dio->submit.waiter;
> +
> +			WRITE_ONCE(dio->submit.waiter, NULL);
> +			blk_wake_io_task(waiter);
> +		} else if (!inode->i_mapping->nrpages) {
> +			WRITE_ONCE(iocb->private, NULL);
> +
> +			/*
> +			 * We must never invalidate pages from this thread to
> +			 * avoid deadlocks with buffered I/O completions.
> +			 * Tough luck if you hit the tiny race with someone
> +			 * dirtying the range now.

What happens, exactly?  Does that mean that the dirty pagecache always
survives?

--D

> +			 */
> +			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
> +			iomap_dio_complete_work(&dio->aio.work);
> +		} else {
> +			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
> +			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
> +		}
> +	}
> +
> +	if (should_dirty) {
> +		bio_check_pages_dirty(&ioend->io_bio);
> +	} else {
> +		bio_release_pages(&ioend->io_bio, false);
> +		bio_put(&ioend->io_bio);
> +	}
> +
> +	return vec_count;
> +}
> +
>  void iomap_dio_bio_end_io(struct bio *bio)
>  {
>  	struct iomap_dio *dio = bio->bi_private;
> diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
> new file mode 100644
> index 000000000000..20cccfc3bb13
> --- /dev/null
> +++ b/fs/iomap/internal.h
> @@ -0,0 +1,7 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _IOMAP_INTERNAL_H
> +#define _IOMAP_INTERNAL_H 1
> +
> +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
> +
> +#endif /* _IOMAP_INTERNAL_H */
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index eaa8cb9083eb..f6943c80e5fd 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -343,9 +343,11 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
>  #define IOMAP_IOEND_UNWRITTEN		(1U << 1)
>  /* don't merge into previous ioend */
>  #define IOMAP_IOEND_BOUNDARY		(1U << 2)
> +/* is direct I/O */
> +#define IOMAP_IOEND_DIRECT		(1U << 3)
>  
>  #define IOMAP_IOEND_NOMERGE_FLAGS \
> -	(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN)
> +	(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
>  
>  /*
>   * Structure for writeback I/O completions.
> -- 
> 2.45.2
> 
>
Christoph Hellwig Dec. 13, 2024, 4:51 a.m. UTC | #4
On Thu, Dec 12, 2024 at 11:56:24AM -0800, Darrick J. Wong wrote:
> >  		return 0;
> > +	if (ioend->io_flags & IOMAP_IOEND_DIRECT)
> > +		return iomap_finish_ioend_direct(ioend);
> >  	return iomap_finish_ioend_buffered(ioend);
> >  }
> 
> I'm a little surprised that more of the iomap_ioend* functions didn't
> end up in ioend.c.

See the cover letter.  For development I wanted to avoid churn.  Once
we have general approval for the concept I'd like to move more code.

> > +			WRITE_ONCE(dio->submit.waiter, NULL);
> > +			blk_wake_io_task(waiter);
> > +		} else if (!inode->i_mapping->nrpages) {
> > +			WRITE_ONCE(iocb->private, NULL);
> > +
> > +			/*
> > +			 * We must never invalidate pages from this thread to
> > +			 * avoid deadlocks with buffered I/O completions.
> > +			 * Tough luck if you hit the tiny race with someone
> > +			 * dirtying the range now.
> 
> What happens, exactly?  Does that mean that the dirty pagecache always
> survives?

Yes.
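
(The mechanics are in the iomap_dio_complete() hunk below: with
IOMAP_DIO_NO_INVALIDATE set, the post-write pagecache invalidation is
simply skipped, so a page dirtied in that window stays in the cache:)

	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
	    !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
		kiocb_invalidate_post_direct_write(iocb, dio->size);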

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8125f758a99d..ceca9473a09c 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -17,6 +17,7 @@ 
 #include <linux/bio.h>
 #include <linux/sched/signal.h>
 #include <linux/migrate.h>
+#include "internal.h"
 #include "trace.h"
 
 #include "../internal.h"
@@ -1582,6 +1583,8 @@  iomap_finish_ioend(struct iomap_ioend *ioend, int error)
 
 	if (!atomic_dec_and_test(&ioend->io_remaining))
 		return 0;
+	if (ioend->io_flags & IOMAP_IOEND_DIRECT)
+		return iomap_finish_ioend_direct(ioend);
 	return iomap_finish_ioend_buffered(ioend);
 }
 
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b521eb15759e..b5466361cafe 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -12,6 +12,7 @@ 
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
 #include <linux/task_io_accounting_ops.h>
+#include "internal.h"
 #include "trace.h"
 
 #include "../internal.h"
@@ -20,6 +21,7 @@ 
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
+#define IOMAP_DIO_NO_INVALIDATE	(1U << 25)
 #define IOMAP_DIO_CALLER_COMP	(1U << 26)
 #define IOMAP_DIO_INLINE_COMP	(1U << 27)
 #define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
@@ -117,7 +119,8 @@  ssize_t iomap_dio_complete(struct iomap_dio *dio)
 	 * ->end_io() when necessary, otherwise a racing buffer read would cache
 	 * zeros from unwritten extents.
 	 */
-	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
+	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
+	    !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
 		kiocb_invalidate_post_direct_write(iocb, dio->size);
 
 	inode_dio_end(file_inode(iocb->ki_filp));
@@ -163,6 +166,51 @@  static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
 	cmpxchg(&dio->error, 0, ret);
 }
 
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
+{
+	struct iomap_dio *dio = ioend->io_bio.bi_private;
+	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+	struct kiocb *iocb = dio->iocb;
+	u32 vec_count = ioend->io_bio.bi_vcnt;
+
+	if (ioend->io_error)
+		iomap_dio_set_error(dio, ioend->io_error);
+
+	if (atomic_dec_and_test(&dio->ref)) {
+		struct inode *inode = file_inode(iocb->ki_filp);
+
+		if (dio->wait_for_completion) {
+			struct task_struct *waiter = dio->submit.waiter;
+
+			WRITE_ONCE(dio->submit.waiter, NULL);
+			blk_wake_io_task(waiter);
+		} else if (!inode->i_mapping->nrpages) {
+			WRITE_ONCE(iocb->private, NULL);
+
+			/*
+			 * We must never invalidate pages from this thread to
+			 * avoid deadlocks with buffered I/O completions.
+			 * Tough luck if you hit the tiny race with someone
+			 * dirtying the range now.
+			 */
+			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+			iomap_dio_complete_work(&dio->aio.work);
+		} else {
+			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+		}
+	}
+
+	if (should_dirty) {
+		bio_check_pages_dirty(&ioend->io_bio);
+	} else {
+		bio_release_pages(&ioend->io_bio, false);
+		bio_put(&ioend->io_bio);
+	}
+
+	return vec_count;
+}
+
 void iomap_dio_bio_end_io(struct bio *bio)
 {
 	struct iomap_dio *dio = bio->bi_private;
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
new file mode 100644
index 000000000000..20cccfc3bb13
--- /dev/null
+++ b/fs/iomap/internal.h
@@ -0,0 +1,7 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IOMAP_INTERNAL_H
+#define _IOMAP_INTERNAL_H 1
+
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
+
+#endif /* _IOMAP_INTERNAL_H */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index eaa8cb9083eb..f6943c80e5fd 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -343,9 +343,11 @@  sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
 #define IOMAP_IOEND_UNWRITTEN		(1U << 1)
 /* don't merge into previous ioend */
 #define IOMAP_IOEND_BOUNDARY		(1U << 2)
+/* is direct I/O */
+#define IOMAP_IOEND_DIRECT		(1U << 3)
 
 #define IOMAP_IOEND_NOMERGE_FLAGS \
-	(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN)
+	(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
 
 /*
  * Structure for writeback I/O completions.