diff mbox series

[PATCHv9,4/7] block, fs: add write hint to kiocb

Message ID 20241025213645.3464331-5-kbusch@meta.com (mailing list archive)
State New, archived
Headers show
Series write hints with nvme fdp, scsi streams | expand

Commit Message

Keith Busch Oct. 25, 2024, 9:36 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

This prepares for sources other than the inode to provide a write hint.
The block layer will use it for direct IO if the requested hint is
within the block device's capabilities.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/fops.c       | 26 +++++++++++++++++++++++---
 include/linux/fs.h |  1 +
 2 files changed, 24 insertions(+), 3 deletions(-)

Comments

Christoph Hellwig Oct. 28, 2024, 11:59 a.m. UTC | #1
On Fri, Oct 25, 2024 at 02:36:42PM -0700, Keith Busch wrote:
> +static u16 blkdev_write_hint(struct kiocb *iocb, struct block_device *bdev)
> +{
> +	u16 hint = iocb->ki_write_hint;
> +
> +	if (!hint)
> +		return file_inode(iocb->ki_filp)->i_write_hint;
> +
> +	if (hint > bdev_max_write_hints(bdev))
> +		return file_inode(iocb->ki_filp)->i_write_hint;
> +
> +	if (bdev_is_partition(bdev) &&
> +	    !test_bit(hint - 1, bdev->write_hint_mask))
> +		return file_inode(iocb->ki_filp)->i_write_hint;

I would have expected an error when using an invalid stream identifier.

That of course requires telling the application how many are available
through e.g. statx as requested last time.
Keith Busch Oct. 28, 2024, 2:38 p.m. UTC | #2
On Mon, Oct 28, 2024 at 12:59:32PM +0100, Christoph Hellwig wrote:
> On Fri, Oct 25, 2024 at 02:36:42PM -0700, Keith Busch wrote:
> > +static u16 blkdev_write_hint(struct kiocb *iocb, struct block_device *bdev)
> > +{
> > +	u16 hint = iocb->ki_write_hint;
> > +
> > +	if (!hint)
> > +		return file_inode(iocb->ki_filp)->i_write_hint;
> > +
> > +	if (hint > bdev_max_write_hints(bdev))
> > +		return file_inode(iocb->ki_filp)->i_write_hint;
> > +
> > +	if (bdev_is_partition(bdev) &&
> > +	    !test_bit(hint - 1, bdev->write_hint_mask))
> > +		return file_inode(iocb->ki_filp)->i_write_hint;
> 
> I would have expected an error when using an invalid stream identifier.

It's a hint. fcntl doesn't error if you give an unusable hint, so
neither should this. You get sane default behavior.
Christoph Hellwig Oct. 28, 2024, 4:08 p.m. UTC | #3
On Mon, Oct 28, 2024 at 08:38:05AM -0600, Keith Busch wrote:
> > > +	if (bdev_is_partition(bdev) &&
> > > +	    !test_bit(hint - 1, bdev->write_hint_mask))
> > > +		return file_inode(iocb->ki_filp)->i_write_hint;
> > 
> > I would have expected an error when using an invalid stream identifier.
> 
> It's a hint. fcntl doesn't error if you give an unusable hint, so
> neither should this. You get sane default behavior.

Well, why does it have to be a hint?

If I have a data placement aware application I really want to know
into how many buckets I can sort and adjust my algorithm for it.
And I'd rather have an error checked interface to pass that down
as far as I can.

Same for my file system use case.  I guess you have a use case where
a hint would be enough, at least with good enough knowledge of the
underlying implementation.  But would there be an actual downside
in not having a hint?  Because historically speaking everything we've
done as a not error checked vaguely defined hint has not been all
the useful, but it in hardware or software interfaces.
diff mbox series

Patch

diff --git a/block/fops.c b/block/fops.c
index 2d01c90076813..e3f3f1957d86d 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -71,7 +71,7 @@  static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 		bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
 	}
 	bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
-	bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+	bio.bi_write_hint = iocb->ki_write_hint;
 	bio.bi_ioprio = iocb->ki_ioprio;
 	if (iocb->ki_flags & IOCB_ATOMIC)
 		bio.bi_opf |= REQ_ATOMIC;
@@ -200,7 +200,7 @@  static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	for (;;) {
 		bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
-		bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+		bio->bi_write_hint = iocb->ki_write_hint;
 		bio->bi_private = dio;
 		bio->bi_end_io = blkdev_bio_end_io;
 		bio->bi_ioprio = iocb->ki_ioprio;
@@ -316,7 +316,7 @@  static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	dio->flags = 0;
 	dio->iocb = iocb;
 	bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
-	bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+	bio->bi_write_hint = iocb->ki_write_hint;
 	bio->bi_end_io = blkdev_bio_end_io_async;
 	bio->bi_ioprio = iocb->ki_ioprio;
 
@@ -362,6 +362,23 @@  static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	return -EIOCBQUEUED;
 }
 
+static u16 blkdev_write_hint(struct kiocb *iocb, struct block_device *bdev)
+{
+	u16 hint = iocb->ki_write_hint;
+
+	if (!hint)
+		return file_inode(iocb->ki_filp)->i_write_hint;
+
+	if (hint > bdev_max_write_hints(bdev))
+		return file_inode(iocb->ki_filp)->i_write_hint;
+
+	if (bdev_is_partition(bdev) &&
+	    !test_bit(hint - 1, bdev->write_hint_mask))
+		return file_inode(iocb->ki_filp)->i_write_hint;
+
+	return hint;
+}
+
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
@@ -373,6 +390,9 @@  static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (blkdev_dio_invalid(bdev, iocb, iter))
 		return -EINVAL;
 
+	if (iov_iter_rw(iter) == WRITE)
+		iocb->ki_write_hint = blkdev_write_hint(iocb, bdev);
+
 	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
 	if (likely(nr_pages <= BIO_MAX_VECS)) {
 		if (is_sync_kiocb(iocb))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4b5cad44a1268..1a00accf412e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -370,6 +370,7 @@  struct kiocb {
 	void			*private;
 	int			ki_flags;
 	u16			ki_ioprio; /* See linux/ioprio.h */
+	u16			ki_write_hint;
 	union {
 		/*
 		 * Only used for async buffered reads, where it denotes the