Message ID | 20241211085420.1380396-6-hch@lst.de (mailing list archive) |
---|---|
State | New |
Series | [1/8] iomap: allow the file system to submit the writeback bios |

On Wed, Dec 11, 2024 at 09:53:45AM +0100, Christoph Hellwig wrote:
> struct iomap_ioend currently tracks outstanding buffered writes and has
> some really nice code in core iomap and XFS to merge contiguous I/Os
> and defer them to userspace for completion in a very efficient way.
>
> For zoned writes we'll also need a per-bio user context completion to
> record the written blocks, and the infrastructure for that would look
> basically like the ioend handling for buffered I/O.
>
> So instead of reinventing the wheel, reuse the existing infrastructure.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
...
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index b521eb15759e..b5466361cafe 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
...
> @@ -163,6 +166,51 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
>  	cmpxchg(&dio->error, 0, ret);
>  }
>
> +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
> +{
> +	struct iomap_dio *dio = ioend->io_bio.bi_private;
> +	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
> +	struct kiocb *iocb = dio->iocb;
> +	u32 vec_count = ioend->io_bio.bi_vcnt;
> +
> +	if (ioend->io_error)
> +		iomap_dio_set_error(dio, ioend->io_error);
> +
> +	if (atomic_dec_and_test(&dio->ref)) {
...
> +	}
> +
> +	if (should_dirty) {
> +		bio_check_pages_dirty(&ioend->io_bio);
> +	} else {
> +		bio_release_pages(&ioend->io_bio, false);
> +		bio_put(&ioend->io_bio);
> +	}
> +

Not that it matters all that much, but I'm a little curious about the
reasoning for using vec_count here. AFAICS this correlates to per-folio
writeback completions for buffered I/O, but that doesn't seem to apply
to direct I/O. Is there a reason to have the caller throttle based on
vec_counts, or are we just pulling some non-zero value for consistency's
sake?

Brian

> +	return vec_count;
> +}
...

On Thu, Dec 12, 2024 at 08:29:36AM -0500, Brian Foster wrote:
> > +	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
> > +	struct kiocb *iocb = dio->iocb;
> > +	u32 vec_count = ioend->io_bio.bi_vcnt;
...
> > +	if (should_dirty) {
> > +		bio_check_pages_dirty(&ioend->io_bio);
> > +	} else {
> > +		bio_release_pages(&ioend->io_bio, false);
> > +		bio_put(&ioend->io_bio);
> > +	}
> > +
>
> Not that it matters all that much, but I'm a little curious about the
> reasoning for using vec_count here. AFAICS this correlates to per-folio
> writeback completions for buffered I/O, but that doesn't seem to apply
> to direct I/O. Is there a reason to have the caller throttle based on
> vec_counts, or are we just pulling some non-zero value for consistency's
> sake?

So direct I/O also iterates over all folios for the bio to unpin them,
and in the case of reads dirties all of them.  I wanted to plug something
useful into the cond_resched() condition in the caller.

Now the number of bvecs isn't the number of folios, as we can physically
merge outside the folio context, but I think this is about as good as it
gets without changing the block code to return the number of folios
processed from __bio_release_pages and bio_check_pages_dirty.

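For reference, the caller-side throttling discussed here is the completion
batching loop in fs/iomap/buffered-io.c, which only uses the value returned
from iomap_finish_ioend() as a rough "work done" counter for cond_resched().
A sketch, roughly as in current mainline (exact constant and naming may
differ slightly):

void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
	struct list_head tmp;
	u32 completions;

	might_sleep();

	list_replace_init(&ioend->io_list, &tmp);
	completions = iomap_finish_ioend(ioend, error);

	while (!list_empty(&tmp)) {
		/*
		 * After enough per-ioend work (folios for buffered I/O,
		 * bi_vcnt for direct I/O with this patch), give the
		 * scheduler a chance to run something else.
		 */
		if (completions > IOEND_BATCH_SIZE * 8) {
			cond_resched();
			completions = 0;
		}
		ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
		list_del_init(&ioend->io_list);
		completions += iomap_finish_ioend(ioend, error);
	}
}

So the return value only needs to approximate how much work a single
completion did, which is why a non-zero bvec count is good enough here.
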
On Wed, Dec 11, 2024 at 09:53:45AM +0100, Christoph Hellwig wrote:
> struct iomap_ioend currently tracks outstanding buffered writes and has
> some really nice code in core iomap and XFS to merge contiguous I/Os
> and defer them to userspace for completion in a very efficient way.
...
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 8125f758a99d..ceca9473a09c 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
...
> @@ -1582,6 +1583,8 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
>  
>  	if (!atomic_dec_and_test(&ioend->io_remaining))
>  		return 0;
> +	if (ioend->io_flags & IOMAP_IOEND_DIRECT)
> +		return iomap_finish_ioend_direct(ioend);
>  	return iomap_finish_ioend_buffered(ioend);
>  }

I'm a little surprised that more of the iomap_ioend* functions didn't
end up in ioend.c.

...
> +		} else if (!inode->i_mapping->nrpages) {
> +			WRITE_ONCE(iocb->private, NULL);
> +
> +			/*
> +			 * We must never invalidate pages from this thread to
> +			 * avoid deadlocks with buffered I/O completions.
> +			 * Tough luck if you hit the tiny race with someone
> +			 * dirtying the range now.

What happens, exactly?  Does that mean that the dirty pagecache always
survives?

--D

> +			 */
> +			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
> +			iomap_dio_complete_work(&dio->aio.work);
...

On Thu, Dec 12, 2024 at 11:56:24AM -0800, Darrick J. Wong wrote:
> >  		return 0;
> > +	if (ioend->io_flags & IOMAP_IOEND_DIRECT)
> > +		return iomap_finish_ioend_direct(ioend);
> >  	return iomap_finish_ioend_buffered(ioend);
> >  }
>
> I'm a little surprised that more of the iomap_ioend* functions didn't
> end up in ioend.c.

See the cover letter.  For development I wanted to avoid churn.  Once we
have general approval for the concept I'd like to move more code.

> > +			WRITE_ONCE(dio->submit.waiter, NULL);
> > +			blk_wake_io_task(waiter);
> > +		} else if (!inode->i_mapping->nrpages) {
> > +			WRITE_ONCE(iocb->private, NULL);
> > +
> > +			/*
> > +			 * We must never invalidate pages from this thread to
> > +			 * avoid deadlocks with buffered I/O completions.
> > +			 * Tough luck if you hit the tiny race with someone
> > +			 * dirtying the range now.
>
> What happens, exactly?  Does that mean that the dirty pagecache always
> survives?

Yes.

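To spell out the answer: the inline completion path above is only taken when
the mapping had no cached pages at the time of the check, and setting
IOMAP_DIO_NO_INVALIDATE makes iomap_dio_complete() skip
kiocb_invalidate_post_direct_write() entirely.  A condensed view of the two
branches from the patch, with comments added here purely for illustration:

	} else if (!inode->i_mapping->nrpages) {
		/*
		 * The mapping looked empty, so complete inline.  If someone
		 * dirties pages in the range after this check, those pages
		 * are simply left alone: the later invalidation in
		 * iomap_dio_complete() is skipped, so the dirty pagecache
		 * survives the direct write completion.
		 */
		dio->flags |= IOMAP_DIO_NO_INVALIDATE;
		iomap_dio_complete_work(&dio->aio.work);
	} else {
		/*
		 * Pages are present: defer to the workqueue, where the
		 * normal post-write invalidation can run.
		 */
		INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
		queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
	}
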
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8125f758a99d..ceca9473a09c 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -17,6 +17,7 @@
 #include <linux/bio.h>
 #include <linux/sched/signal.h>
 #include <linux/migrate.h>
+#include "internal.h"
 #include "trace.h"
 
 #include "../internal.h"
@@ -1582,6 +1583,8 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
 
 	if (!atomic_dec_and_test(&ioend->io_remaining))
 		return 0;
+	if (ioend->io_flags & IOMAP_IOEND_DIRECT)
+		return iomap_finish_ioend_direct(ioend);
 	return iomap_finish_ioend_buffered(ioend);
 }
 
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b521eb15759e..b5466361cafe 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -12,6 +12,7 @@
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
 #include <linux/task_io_accounting_ops.h>
+#include "internal.h"
 #include "trace.h"
 
 #include "../internal.h"
@@ -20,6 +21,7 @@
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
+#define IOMAP_DIO_NO_INVALIDATE	(1U << 25)
 #define IOMAP_DIO_CALLER_COMP		(1U << 26)
 #define IOMAP_DIO_INLINE_COMP		(1U << 27)
 #define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
@@ -117,7 +119,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 	 * ->end_io() when necessary, otherwise a racing buffer read would cache
 	 * zeros from unwritten extents.
 	 */
-	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
+	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
+	    !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
 		kiocb_invalidate_post_direct_write(iocb, dio->size);
 
 	inode_dio_end(file_inode(iocb->ki_filp));
@@ -163,6 +166,51 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
 	cmpxchg(&dio->error, 0, ret);
 }
 
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
+{
+	struct iomap_dio *dio = ioend->io_bio.bi_private;
+	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+	struct kiocb *iocb = dio->iocb;
+	u32 vec_count = ioend->io_bio.bi_vcnt;
+
+	if (ioend->io_error)
+		iomap_dio_set_error(dio, ioend->io_error);
+
+	if (atomic_dec_and_test(&dio->ref)) {
+		struct inode *inode = file_inode(iocb->ki_filp);
+
+		if (dio->wait_for_completion) {
+			struct task_struct *waiter = dio->submit.waiter;
+
+			WRITE_ONCE(dio->submit.waiter, NULL);
+			blk_wake_io_task(waiter);
+		} else if (!inode->i_mapping->nrpages) {
+			WRITE_ONCE(iocb->private, NULL);
+
+			/*
+			 * We must never invalidate pages from this thread to
+			 * avoid deadlocks with buffered I/O completions.
+			 * Tough luck if you hit the tiny race with someone
+			 * dirtying the range now.
+			 */
+			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+			iomap_dio_complete_work(&dio->aio.work);
+		} else {
+			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+		}
+	}
+
+	if (should_dirty) {
+		bio_check_pages_dirty(&ioend->io_bio);
+	} else {
+		bio_release_pages(&ioend->io_bio, false);
+		bio_put(&ioend->io_bio);
+	}
+
+	return vec_count;
+}
+
 void iomap_dio_bio_end_io(struct bio *bio)
 {
 	struct iomap_dio *dio = bio->bi_private;
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
new file mode 100644
index 000000000000..20cccfc3bb13
--- /dev/null
+++ b/fs/iomap/internal.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IOMAP_INTERNAL_H
+#define _IOMAP_INTERNAL_H 1
+
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
+
+#endif /* _IOMAP_INTERNAL_H */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index eaa8cb9083eb..f6943c80e5fd 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -343,9 +343,11 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
 #define IOMAP_IOEND_UNWRITTEN		(1U << 1)
 /* don't merge into previous ioend */
 #define IOMAP_IOEND_BOUNDARY		(1U << 2)
+/* is direct I/O */
+#define IOMAP_IOEND_DIRECT		(1U << 3)
 
 #define IOMAP_IOEND_NOMERGE_FLAGS \
-	(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN)
+	(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
 
 /*
  * Structure for writeback I/O completions.

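For completeness, here is a sketch of how submission-side code might prime an
ioend so that iomap_finish_ioend() routes its completion to the new direct I/O
handler.  The actual submission-side wiring lives in other patches of this
series; the helper name below is invented for illustration, and only the field
usage follows what this patch expects:

/* Hypothetical helper, for illustration only. */
static void example_prep_dio_ioend(struct iomap_ioend *ioend,
		struct iomap_dio *dio)
{
	/* iomap_finish_ioend() dispatches on this flag ... */
	ioend->io_flags |= IOMAP_IOEND_DIRECT;
	/* ... and iomap_finish_ioend_direct() recovers the dio from here. */
	ioend->io_bio.bi_private = dio;
}

Note that IOMAP_IOEND_DIRECT is also added to IOMAP_IOEND_NOMERGE_FLAGS, so
direct I/O ioends are never merged with adjacent ioends.
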
struct iomap_ioend currently tracks outstanding buffered writes and has
some really nice code in core iomap and XFS to merge contiguous I/Os
and defer them to userspace for completion in a very efficient way.

For zoned writes we'll also need a per-bio user context completion to
record the written blocks, and the infrastructure for that would look
basically like the ioend handling for buffered I/O.

So instead of reinventing the wheel, reuse the existing infrastructure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/buffered-io.c | 3 +++
 fs/iomap/direct-io.c   | 50 +++++++++++++++++++++++++++++++++++++++++-
 fs/iomap/internal.h    |  7 ++++++
 include/linux/iomap.h  |  4 +++-
 4 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 fs/iomap/internal.h