diff mbox series

[3/8] iomap: add a IOMAP_F_ZONE_APPEND flag

Message ID 20241211085420.1380396-4-hch@lst.de (mailing list archive)
State New
Headers show
Series [1/8] iomap: allow the file system to submit the writeback bios | expand

Commit Message

Christoph Hellwig Dec. 11, 2024, 8:53 a.m. UTC
This doesn't much - just always returns the start block number for each
iomap instead of increasing it.  This is because we'll keep building bios
unconstrained by the hardware limits and just split them in file system
submission handler.

Maybe we should find another name for it, because it might be useful for
btrfs compressed bio submissions as well, but I can't come up with a
good one.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/buffered-io.c | 19 ++++++++++++++++---
 include/linux/iomap.h  |  7 +++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

Comments

Darrick J. Wong Dec. 12, 2024, 6:05 p.m. UTC | #1
On Wed, Dec 11, 2024 at 09:53:43AM +0100, Christoph Hellwig wrote:
> This doesn't much - just always returns the start block number for each

"This doesn't much" - I don't understand the sentence very well.  How
about:

"Add a new IOMAP_F_ZONE_APPEND flag for the filesystem to indicate that
the storage device must inform us where it wrote the data, so that the
filesystem can update its own internal mapping metadata.  The filesystem
should set the starting address of the zone in iomap::addr, and extract
the LBA address from the bio during ioend processing.  iomap builds
bios unconstrained by the hardware limits and will split them in the bio
submission handler."

The splitting happens whenever IOMAP_F_BOUNDARY gets set by
->map_blocks, right?

> iomap instead of increasing it.  This is because we'll keep building bios
> unconstrained by the hardware limits and just split them in file system
> submission handler.
> 
> Maybe we should find another name for it, because it might be useful for
> btrfs compressed bio submissions as well, but I can't come up with a
> good one.

Since you have to tell the device the starting LBA of the zone you want
to write to, I think _ZONE_APPEND is a reasonable name.  The code looks
correct though.

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap/buffered-io.c | 19 ++++++++++++++++---
>  include/linux/iomap.h  |  7 +++++++
>  2 files changed, 23 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 3176dc996fb7..129cd96c6c96 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1744,9 +1744,22 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
>  		return false;
>  	if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
>  		return false;
> -	if (iomap_sector(&wpc->iomap, pos) !=
> -	    bio_end_sector(&wpc->ioend->io_bio))
> -		return false;
> +	if (wpc->iomap.flags & IOMAP_F_ZONE_APPEND) {
> +		/*
> +		 * For Zone Append command, bi_sector points to the zone start
> +		 * before submission.  We can merge all I/O for the same zone.
> +		 */
> +		if (iomap_sector(&wpc->iomap, pos) !=
> +		    wpc->ioend->io_bio.bi_iter.bi_sector)
> +			return false;
> +	} else {
> +		/*
> +		 * For regular writes, the disk blocks needs to be contiguous.
> +		 */
> +		if (iomap_sector(&wpc->iomap, pos) !=
> +		    bio_end_sector(&wpc->ioend->io_bio))
> +			return false;
> +	}
>  	/*
>  	 * Limit ioend bio chain lengths to minimise IO completion latency. This
>  	 * also prevents long tight loops ending page writeback on all the
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 1d8658c7beb8..173d490c20ba 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -56,6 +56,10 @@ struct vm_fault;
>   *
>   * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must
>   * never be merged with the mapping before it.
> + *
> + * IOMAP_F_ZONE_APPEND indicates that (write) I/O should be done as a zone
> + * append command for zoned devices.  Note that the file system needs to
> + * override the bi_end_io handler to record the actual written sector.
>   */
>  #define IOMAP_F_NEW		(1U << 0)
>  #define IOMAP_F_DIRTY		(1U << 1)
> @@ -68,6 +72,7 @@ struct vm_fault;
>  #endif /* CONFIG_BUFFER_HEAD */
>  #define IOMAP_F_XATTR		(1U << 5)
>  #define IOMAP_F_BOUNDARY	(1U << 6)
> +#define IOMAP_F_ZONE_APPEND	(1U << 7)

Needs a corresponding update in Documentation/iomap/ before we merge
this series.

--D

>  
>  /*
>   * Flags set by the core iomap code during operations:
> @@ -111,6 +116,8 @@ struct iomap {
>  
>  static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
>  {
> +	if (iomap->flags & IOMAP_F_ZONE_APPEND)
> +		return iomap->addr >> SECTOR_SHIFT;
>  	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
>  }
>  
> -- 
> 2.45.2
> 
>
Christoph Hellwig Dec. 13, 2024, 4:55 a.m. UTC | #2
On Thu, Dec 12, 2024 at 10:05:47AM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:53:43AM +0100, Christoph Hellwig wrote:
> > This doesn't much - just always returns the start block number for each
> 
> "This doesn't much" - I don't understand the sentence very well.  How
> about:
> 
> "Add a new IOMAP_F_ZONE_APPEND flag for the filesystem to indicate that
> the storage device must inform us where it wrote the data, so that the
> filesystem can update its own internal mapping metadata.  The filesystem
> should set the starting address of the zone in iomap::addr, and extract
> the LBA address from the bio during ioend processing.  iomap builds
> bios unconstrained by the hardware limits and will split them in the bio
> submission handler."

Sounds reasonable.

> The splitting happens whenever IOMAP_F_BOUNDARY gets set by
> ->map_blocks, right?

The splitting happens when:

 (a) we run out of enough space in one zone and have to switch to another
     one.  That then most likely picks an new zone, in which case
     IOMAP_F_BOUNDARY will be set
 (b) if the hardware I/O constraints require splitting the I/O (typically
     because it's too larger)
Christoph Hellwig Dec. 16, 2024, 4:55 a.m. UTC | #3
On Thu, Dec 12, 2024 at 10:05:47AM -0800, Darrick J. Wong wrote:
> >  #endif /* CONFIG_BUFFER_HEAD */
> >  #define IOMAP_F_XATTR		(1U << 5)
> >  #define IOMAP_F_BOUNDARY	(1U << 6)
> > +#define IOMAP_F_ZONE_APPEND	(1U << 7)
> 
> Needs a corresponding update in Documentation/iomap/ before we merge
> this series.

Btw, while doing these updates I start to really despise
Documentation/filesystems/iomap/.  It basically just duplicates what's
alredy in the code comments, just far away from the code to which it
belongs in that awkward RST format.  I'm not sure what it is supposed
to buy us?
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 3176dc996fb7..129cd96c6c96 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1744,9 +1744,22 @@  static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
 		return false;
 	if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
 		return false;
-	if (iomap_sector(&wpc->iomap, pos) !=
-	    bio_end_sector(&wpc->ioend->io_bio))
-		return false;
+	if (wpc->iomap.flags & IOMAP_F_ZONE_APPEND) {
+		/*
+		 * For Zone Append command, bi_sector points to the zone start
+		 * before submission.  We can merge all I/O for the same zone.
+		 */
+		if (iomap_sector(&wpc->iomap, pos) !=
+		    wpc->ioend->io_bio.bi_iter.bi_sector)
+			return false;
+	} else {
+		/*
+		 * For regular writes, the disk blocks needs to be contiguous.
+		 */
+		if (iomap_sector(&wpc->iomap, pos) !=
+		    bio_end_sector(&wpc->ioend->io_bio))
+			return false;
+	}
 	/*
 	 * Limit ioend bio chain lengths to minimise IO completion latency. This
 	 * also prevents long tight loops ending page writeback on all the
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 1d8658c7beb8..173d490c20ba 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -56,6 +56,10 @@  struct vm_fault;
  *
  * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must
  * never be merged with the mapping before it.
+ *
+ * IOMAP_F_ZONE_APPEND indicates that (write) I/O should be done as a zone
+ * append command for zoned devices.  Note that the file system needs to
+ * override the bi_end_io handler to record the actual written sector.
  */
 #define IOMAP_F_NEW		(1U << 0)
 #define IOMAP_F_DIRTY		(1U << 1)
@@ -68,6 +72,7 @@  struct vm_fault;
 #endif /* CONFIG_BUFFER_HEAD */
 #define IOMAP_F_XATTR		(1U << 5)
 #define IOMAP_F_BOUNDARY	(1U << 6)
+#define IOMAP_F_ZONE_APPEND	(1U << 7)
 
 /*
  * Flags set by the core iomap code during operations:
@@ -111,6 +116,8 @@  struct iomap {
 
 static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
 {
+	if (iomap->flags & IOMAP_F_ZONE_APPEND)
+		return iomap->addr >> SECTOR_SHIFT;
 	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
 }