[RFC,01/16] block: Add atomic write operations to request_queue limits

Message ID: 20230503183821.1473305-2-john.g.garry@oracle.com
State: Handled Elsewhere
Series: block atomic writes

Commit Message

John Garry May 3, 2023, 6:38 p.m. UTC
From: Himanshu Madhani <himanshu.madhani@oracle.com>

Add the following limits:
- atomic_write_boundary
- atomic_write_max_bytes
- atomic_write_unit_max
- atomic_write_unit_min

Signed-off-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 Documentation/ABI/stable/sysfs-block | 42 +++++++++++++++++++++
 block/blk-settings.c                 | 56 ++++++++++++++++++++++++++++
 block/blk-sysfs.c                    | 33 ++++++++++++++++
 include/linux/blkdev.h               | 23 ++++++++++++
 4 files changed, 154 insertions(+)

Comments

Dave Chinner May 3, 2023, 9:39 p.m. UTC | #1
On Wed, May 03, 2023 at 06:38:06PM +0000, John Garry wrote:
> From: Himanshu Madhani <himanshu.madhani@oracle.com>
> 
> Add the following limits:
> - atomic_write_boundary
> - atomic_write_max_bytes
> - atomic_write_unit_max
> - atomic_write_unit_min
> 
> Signed-off-by: Himanshu Madhani <himanshu.madhani@oracle.com>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  Documentation/ABI/stable/sysfs-block | 42 +++++++++++++++++++++
>  block/blk-settings.c                 | 56 ++++++++++++++++++++++++++++
>  block/blk-sysfs.c                    | 33 ++++++++++++++++
>  include/linux/blkdev.h               | 23 ++++++++++++
>  4 files changed, 154 insertions(+)
> 
> diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
> index 282de3680367..f3ed9890e03b 100644
> --- a/Documentation/ABI/stable/sysfs-block
> +++ b/Documentation/ABI/stable/sysfs-block
> @@ -21,6 +21,48 @@ Description:
>  		device is offset from the internal allocation unit's
>  		natural alignment.
>  
> +What:		/sys/block/<disk>/atomic_write_max_bytes
> +Date:		May 2023
> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> +Description:
> +		[RO] This parameter specifies the maximum atomic write
> +		size reported by the device. An atomic write operation
> +		must not exceed this number of bytes.
> +
> +
> +What:		/sys/block/<disk>/atomic_write_unit_min
> +Date:		May 2023
> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> +Description:
> +		[RO] This parameter specifies the smallest block which can
> +		be written atomically with an atomic write operation. All
> +		atomic write operations must begin at an
> +		atomic_write_unit_min boundary and must be multiples of
> +		atomic_write_unit_min. This value must be a power-of-two.

What units is this defined to use? Bytes?

> +
> +
> +What:		/sys/block/<disk>/atomic_write_unit_max
> +Date:		January 2023
> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> +Description:
> +		[RO] This parameter defines the largest block which can be
> +		written atomically with an atomic write operation. This
> +		value must be a multiple of atomic_write_unit_min and must
> +		be a power-of-two.

Same question. Also, how is this different to
atomic_write_max_bytes?

> +
> +
> +What:		/sys/block/<disk>/atomic_write_boundary
> +Date:		May 2023
> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> +Description:
> +		[RO] A device may need to internally split I/Os which
> +		straddle a given logical block address boundary. In that
> +		case a single atomic write operation will be processed as
> +		one or more sub-operations which each complete atomically.
> +		This parameter specifies the size in bytes of the atomic
> +		boundary if one is reported by the device. This value must
> +		be a power-of-two.

How are users/filesystems supposed to use this?

> +
>  
>  What:		/sys/block/<disk>/diskseq
>  Date:		February 2021
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 896b4654ab00..e21731715a12 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -59,6 +59,9 @@ void blk_set_default_limits(struct queue_limits *lim)
>  	lim->zoned = BLK_ZONED_NONE;
>  	lim->zone_write_granularity = 0;
>  	lim->dma_alignment = 511;
> +	lim->atomic_write_unit_min = lim->atomic_write_unit_max = 1;

A value of "1" isn't obviously a power of 2, nor does it tell me
what units these values use.

> +	lim->atomic_write_max_bytes = 512;
> +	lim->atomic_write_boundary = 0;

The behaviour when the value is zero is not defined by the sysfs
description above.

>  }
>  
>  /**
> @@ -183,6 +186,59 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
>  }
>  EXPORT_SYMBOL(blk_queue_max_discard_sectors);
>  
> +/**
> + * blk_queue_atomic_write_max_bytes - set max bytes supported by
> + * the device for atomic write operations.
> + * @q:  the request queue for the device
> + * @size: maximum bytes supported
> + */
> +void blk_queue_atomic_write_max_bytes(struct request_queue *q,
> +				      unsigned int size)
> +{
> +	q->limits.atomic_write_max_bytes = size;
> +}
> +EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes);
> +
> +/**
> + * blk_queue_atomic_write_boundary - Device's logical block address space
> + * which an atomic write should not cross.

I have no idea what "logical block address space which an atomic
write should not cross" means, especially as the unit is in bytes
and not in sectors (which are the units LBAs are expressed in).

> + * @q:  the request queue for the device
> + * @size: size in bytes. Must be a power-of-two.
> + */
> +void blk_queue_atomic_write_boundary(struct request_queue *q,
> +				     unsigned int size)
> +{
> +	q->limits.atomic_write_boundary = size;
> +}
> +EXPORT_SYMBOL(blk_queue_atomic_write_boundary);
> +
> +/**
> + * blk_queue_atomic_write_unit_min - smallest unit that can be written
> + *				     atomically to the device.
> + * @q:  the request queue for the device
> + * @sectors: must be a power-of-two.
> + */
> +void blk_queue_atomic_write_unit_min(struct request_queue *q,
> +				     unsigned int sectors)
> +{
> +	q->limits.atomic_write_unit_min = sectors;
> +}
> +EXPORT_SYMBOL(blk_queue_atomic_write_unit_min);

Oh, these are sectors?

What size sector? Are we talking about fixed size 512 byte basic
block units, or are we talking about physical device sector sizes
(e.g. 4kB, maybe larger in future?)

These really should be in bytes, as they are directly exposed to
userspace applications via statx and applications will have no idea
what the sector size actually is without having to query the block
device directly...

> +
> +/*
> + * blk_queue_atomic_write_unit_max - largest unit that can be written
> + * atomically to the device.
> + * @q: the request queue for the device
> + * @sectors: must be a power-of-two.
> + */
> +void blk_queue_atomic_write_unit_max(struct request_queue *q,
> +				     unsigned int sectors)
> +{
> +	struct queue_limits *limits = &q->limits;
> +	limits->atomic_write_unit_max = sectors;
> +}
> +EXPORT_SYMBOL(blk_queue_atomic_write_unit_max);
> +
>  /**
>   * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase
>   * @q:  the request queue for the device
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index f1fce1c7fa44..1025beff2281 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -132,6 +132,30 @@ static ssize_t queue_max_discard_segments_show(struct request_queue *q,
>  	return queue_var_show(queue_max_discard_segments(q), page);
>  }
>  
> +static ssize_t queue_atomic_write_max_bytes_show(struct request_queue *q,
> +						char *page)
> +{
> +	return queue_var_show(q->limits.atomic_write_max_bytes, page);
> +}
> +
> +static ssize_t queue_atomic_write_boundary_show(struct request_queue *q,
> +						char *page)
> +{
> +	return queue_var_show(q->limits.atomic_write_boundary, page);
> +}
> +
> +static ssize_t queue_atomic_write_unit_min_show(struct request_queue *q,
> +						char *page)
> +{
> +	return queue_var_show(queue_atomic_write_unit_min(q), page);
> +}
> +
> +static ssize_t queue_atomic_write_unit_max_show(struct request_queue *q,
> +						char *page)
> +{
> +	return queue_var_show(queue_atomic_write_unit_max(q), page);
> +}
> +
>  static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
>  {
>  	return queue_var_show(q->limits.max_integrity_segments, page);
> @@ -604,6 +628,11 @@ QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
>  QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
>  QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
>  
> +QUEUE_RO_ENTRY(queue_atomic_write_max_bytes, "atomic_write_max_bytes");
> +QUEUE_RO_ENTRY(queue_atomic_write_boundary, "atomic_write_boundary");
> +QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max");
> +QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min");
> +
>  QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
>  QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
>  QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
> @@ -661,6 +690,10 @@ static struct attribute *queue_attrs[] = {
>  	&queue_discard_max_entry.attr,
>  	&queue_discard_max_hw_entry.attr,
>  	&queue_discard_zeroes_data_entry.attr,
> +	&queue_atomic_write_max_bytes_entry.attr,
> +	&queue_atomic_write_boundary_entry.attr,
> +	&queue_atomic_write_unit_min_entry.attr,
> +	&queue_atomic_write_unit_max_entry.attr,
>  	&queue_write_same_max_entry.attr,
>  	&queue_write_zeroes_max_entry.attr,
>  	&queue_zone_append_max_entry.attr,
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 941304f17492..6b6f2992338c 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -304,6 +304,11 @@ struct queue_limits {
>  	unsigned int		discard_alignment;
>  	unsigned int		zone_write_granularity;
>  
> +	unsigned int		atomic_write_boundary;
> +	unsigned int		atomic_write_max_bytes;
> +	unsigned int		atomic_write_unit_min;
> +	unsigned int		atomic_write_unit_max;
> +
>  	unsigned short		max_segments;
>  	unsigned short		max_integrity_segments;
>  	unsigned short		max_discard_segments;
> @@ -929,6 +934,14 @@ void blk_queue_zone_write_granularity(struct request_queue *q,
>  				      unsigned int size);
>  extern void blk_queue_alignment_offset(struct request_queue *q,
>  				       unsigned int alignment);
> +extern void blk_queue_atomic_write_max_bytes(struct request_queue *q,
> +					     unsigned int size);
> +extern void blk_queue_atomic_write_unit_max(struct request_queue *q,
> +					    unsigned int sectors);
> +extern void blk_queue_atomic_write_unit_min(struct request_queue *q,
> +					    unsigned int sectors);
> +extern void blk_queue_atomic_write_boundary(struct request_queue *q,
> +					    unsigned int size);
>  void disk_update_readahead(struct gendisk *disk);
>  extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
>  extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
> @@ -1331,6 +1344,16 @@ static inline int queue_dma_alignment(const struct request_queue *q)
>  	return q ? q->limits.dma_alignment : 511;
>  }
>  
> +static inline unsigned int queue_atomic_write_unit_max(const struct request_queue *q)
> +{
> +	return q->limits.atomic_write_unit_max << SECTOR_SHIFT;
> +}
> +
> +static inline unsigned int queue_atomic_write_unit_min(const struct request_queue *q)
> +{
> +	return q->limits.atomic_write_unit_min << SECTOR_SHIFT;
> +}

Ah, what? This undocumented interface reports "unit limits" in
bytes, but it's not using the physical device sector size to convert
between sector units and bytes. This really needs some more
documentation and work to make it present all units consistently and
not result in confusion when devices have 4kB sector sizes and not
512 byte sectors...

Also, I think all the byte ranges should support full 64 bit values,
otherwise there will be silent overflows in converting 32 bit sector
counts to byte ranges. And, eventually, something will want to do
larger than 4GB atomic IOs

Cheers,

Dave.
John Garry May 4, 2023, 6:14 p.m. UTC | #2
Hi Dave,

>> diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
>> index 282de3680367..f3ed9890e03b 100644
>> --- a/Documentation/ABI/stable/sysfs-block
>> +++ b/Documentation/ABI/stable/sysfs-block
>> @@ -21,6 +21,48 @@ Description:
>>   		device is offset from the internal allocation unit's
>>   		natural alignment.
>>   
>> +What:		/sys/block/<disk>/atomic_write_max_bytes
>> +Date:		May 2023
>> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
>> +Description:
>> +		[RO] This parameter specifies the maximum atomic write
>> +		size reported by the device. An atomic write operation
>> +		must not exceed this number of bytes.
>> +
>> +
>> +What:		/sys/block/<disk>/atomic_write_unit_min
>> +Date:		May 2023
>> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
>> +Description:
>> +		[RO] This parameter specifies the smallest block which can
>> +		be written atomically with an atomic write operation. All
>> +		atomic write operations must begin at an
>> +		atomic_write_unit_min boundary and must be multiples of
>> +		atomic_write_unit_min. This value must be a power-of-two.
> 
> What units is this defined to use? Bytes?

Bytes

> 
>> +
>> +
>> +What:		/sys/block/<disk>/atomic_write_unit_max
>> +Date:		January 2023
>> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
>> +Description:
>> +		[RO] This parameter defines the largest block which can be
>> +		written atomically with an atomic write operation. This
>> +		value must be a multiple of atomic_write_unit_min and must
>> +		be a power-of-two.
> 
> Same question. Also, how is this different to
> atomic_write_max_bytes?

Again, this is bytes. We can add "bytes" to the name of these other 
files if people think it's better. Unfortunately request_queue sysfs 
file naming isn't consistent here to begin with.

atomic_write_unit_max is the largest application block size which we can 
support, while atomic_write_max_bytes is the max size of an atomic 
operation which the HW supports.

 From your review on the iomap patch, I assume that you now realise that 
we are proposing a write which may include multiple application data 
blocks (each limited in size to atomic_write_unit_max), and that the 
limit on the total size of that write is atomic_write_max_bytes.
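
To put that distinction in rough, illustrative code - this is not the 
actual iomap/block code from the series, and submit_atomic_write()/
submit_atomic_block() are made-up helper names:

	/*
	 * Illustrative sketch only: one RWF_ATOMIC write made up of several
	 * application data blocks. Each block is bounded by
	 * atomic_write_unit_max, while the IO as a whole is bounded by
	 * atomic_write_max_bytes.
	 */
	static int submit_atomic_write(struct request_queue *q, loff_t pos,
				       size_t len, unsigned int app_block_size)
	{
		loff_t off;

		if (app_block_size > queue_atomic_write_unit_max(q))
			return -EINVAL;	/* a block could be split by the device */
		if (len > q->limits.atomic_write_max_bytes)
			return -EINVAL;	/* larger than the HW atomic op limit */

		for (off = pos; off < pos + len; off += app_block_size)
			submit_atomic_block(q, off, app_block_size); /* hypothetical */

		return 0;
	}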

User applications should only pay attention to what we return from 
statx, that being atomic_write_unit_min and atomic_write_unit_max.

atomic_write_max_bytes and atomic_write_boundary are only relevant to the 
block layer.

> 
>> +
>> +
>> +What:		/sys/block/<disk>/atomic_write_boundary
>> +Date:		May 2023
>> +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
>> +Description:
>> +		[RO] A device may need to internally split I/Os which
>> +		straddle a given logical block address boundary. In that
>> +		case a single atomic write operation will be processed as
>> +		one or more sub-operations which each complete atomically.
>> +		This parameter specifies the size in bytes of the atomic
>> +		boundary if one is reported by the device. This value must
>> +		be a power-of-two.
> 
> How are users/filesystems supposed to use this?

As above, this is not relevant to the user.

> 
>> +
>>   
>>   What:		/sys/block/<disk>/diskseq
>>   Date:		February 2021
>> diff --git a/block/blk-settings.c b/block/blk-settings.c
>> index 896b4654ab00..e21731715a12 100644
>> --- a/block/blk-settings.c
>> +++ b/block/blk-settings.c
>> @@ -59,6 +59,9 @@ void blk_set_default_limits(struct queue_limits *lim)
>>   	lim->zoned = BLK_ZONED_NONE;
>>   	lim->zone_write_granularity = 0;
>>   	lim->dma_alignment = 511;
>> +	lim->atomic_write_unit_min = lim->atomic_write_unit_max = 1;
> 
> A value of "1" isn't obviously a power of 2, nor does it tell me
> what units these values use.

I think that we should store these in bytes.

> 
>> +	lim->atomic_write_max_bytes = 512;
>> +	lim->atomic_write_boundary = 0;
> 
>> The behaviour when the value is zero is not defined by the sysfs
> description above.

I'll add it. A value of zero means no atomic boundary.

> 
>>   }
>>   
>>   /**
>> @@ -183,6 +186,59 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
>>   }
>>   EXPORT_SYMBOL(blk_queue_max_discard_sectors);
>>   
>> +/**
>> + * blk_queue_atomic_write_max_bytes - set max bytes supported by
>> + * the device for atomic write operations.
>> + * @q:  the request queue for the device
>> + * @size: maximum bytes supported
>> + */
>> +void blk_queue_atomic_write_max_bytes(struct request_queue *q,
>> +				      unsigned int size)
>> +{
>> +	q->limits.atomic_write_max_bytes = size;
>> +}
>> +EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes);
>> +
>> +/**
>> + * blk_queue_atomic_write_boundary - Device's logical block address space
>> + * which an atomic write should not cross.
> 
> I have no idea what "logical block address space which an atomic
> write should not cross" means, especially as the unit is in bytes
> and not in sectors (which are the units LBAs are expressed in).

It means that an atomic operation which straddles the atomic boundary is 
not guaranteed to be atomic by the device, so we should (must) not cross 
it to maintain atomic behaviour for an application block. That's one 
reason that we have all these size and alignment rules.

> 
>> + * @q:  the request queue for the device
>> + * @size: size in bytes. Must be a power-of-two.
>> + */
>> +void blk_queue_atomic_write_boundary(struct request_queue *q,
>> +				     unsigned int size)
>> +{
>> +	q->limits.atomic_write_boundary = size;
>> +}
>> +EXPORT_SYMBOL(blk_queue_atomic_write_boundary);
>> +
>> +/**
>> + * blk_queue_atomic_write_unit_min - smallest unit that can be written
>> + *				     atomically to the device.
>> + * @q:  the request queue for the device
>> + * @sectors: must be a power-of-two.
>> + */
>> +void blk_queue_atomic_write_unit_min(struct request_queue *q,
>> +				     unsigned int sectors)
>> +{
>> +	q->limits.atomic_write_unit_min = sectors;
>> +}
>> +EXPORT_SYMBOL(blk_queue_atomic_write_unit_min);
> 
> Oh, these are sectors?

Again, we'll change to bytes.

> 
> What size sector? Are we talking about fixed size 512 byte basic
> block units,

Normally we would be referring to the fixed size 512 byte basic
block unit.

> or are we talking about physical device sector sizes
> (e.g. 4kB, maybe larger in future?)
> 
> These really should be in bytes, as they are directly exposed to
> userspace applications via statx and applications will have no idea
> what the sector size actually is without having to query the block
> device directly...

ok

> 
>> +
>> +/*
>> + * blk_queue_atomic_write_unit_max - largest unit that can be written
>> + * atomically to the device.
>> + * @q: the request queue for the device
>> + * @sectors: must be a power-of-two.
>> + */
>> +void blk_queue_atomic_write_unit_max(struct request_queue *q,
>> +				     unsigned int sectors)
>> +{
>> +	struct queue_limits *limits = &q->limits;
>> +	limits->atomic_write_unit_max = sectors;
>> +}
>> +EXPORT_SYMBOL(blk_queue_atomic_write_unit_max);
>> +
>>   /**
>>    * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase
>>    * @q:  the request queue for the device
>> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
>> index f1fce1c7fa44..1025beff2281 100644
>> --- a/block/blk-sysfs.c
>> +++ b/block/blk-sysfs.c
>> @@ -132,6 +132,30 @@ static ssize_t queue_max_discard_segments_show(struct request_queue *q,
>>   	return queue_var_show(queue_max_discard_segments(q), page);
>>   }
>>   

...

>>   
>> +static inline unsigned int queue_atomic_write_unit_max(const struct request_queue *q)
>> +{
>> +	return q->limits.atomic_write_unit_max << SECTOR_SHIFT;
>> +}
>> +
>> +static inline unsigned int queue_atomic_write_unit_min(const struct request_queue *q)
>> +{
>> +	return q->limits.atomic_write_unit_min << SECTOR_SHIFT;
>> +}
> 
> Ah, what? This undocumented interface reports "unit limits" in
> bytes, but it's not using the physical device sector size to convert
> between sector units and bytes. This really needs some more
> documentation and work to make it present all units consistently and
> not result in confusion when devices have 4kB sector sizes and not
> 512 byte sectors...

ok, we'll look to fix this up to give a coherent and clear interface.

> 
> Also, I think all the byte ranges should support full 64 bit values,
> otherwise there will be silent overflows in converting 32 bit sector
> counts to byte ranges. And, eventually, something will want to do
> larger than 4GB atomic IOs
> 

ok, we can do that but it would also then make the statx field 64b. I'm fine 
with that if it is wise to do so - I don't want to wastefully use 
up an extra 2 x 32b in struct statx.

Thanks,
John
Dave Chinner May 4, 2023, 10:26 p.m. UTC | #3
On Thu, May 04, 2023 at 07:14:29PM +0100, John Garry wrote:
> Hi Dave,
> 
> > > diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
> > > index 282de3680367..f3ed9890e03b 100644
> > > --- a/Documentation/ABI/stable/sysfs-block
> > > +++ b/Documentation/ABI/stable/sysfs-block
> > > @@ -21,6 +21,48 @@ Description:
> > >   		device is offset from the internal allocation unit's
> > >   		natural alignment.
> > > +What:		/sys/block/<disk>/atomic_write_max_bytes
> > > +Date:		May 2023
> > > +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> > > +Description:
> > > +		[RO] This parameter specifies the maximum atomic write
> > > +		size reported by the device. An atomic write operation
> > > +		must not exceed this number of bytes.
> > > +
> > > +
> > > +What:		/sys/block/<disk>/atomic_write_unit_min
> > > +Date:		May 2023
> > > +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> > > +Description:
> > > +		[RO] This parameter specifies the smallest block which can
> > > +		be written atomically with an atomic write operation. All
> > > +		atomic write operations must begin at an
> > > +		atomic_write_unit_min boundary and must be multiples of
> > > +		atomic_write_unit_min. This value must be a power-of-two.
> > 
> > What units is this defined to use? Bytes?
> 
> Bytes
> 
> > 
> > > +
> > > +
> > > +What:		/sys/block/<disk>/atomic_write_unit_max
> > > +Date:		January 2023
> > > +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> > > +Description:
> > > +		[RO] This parameter defines the largest block which can be
> > > +		written atomically with an atomic write operation. This
> > > +		value must be a multiple of atomic_write_unit_min and must
> > > +		be a power-of-two.
> > 
> > Same question. Also, how is this different to
> > atomic_write_max_bytes?
> 
> Again, this is bytes. We can add "bytes" to the name of these other files if
> people think it's better. Unfortunately request_queue sysfs file naming
> isn't consistent here to begin with.
> 
> atomic_write_unit_max is largest application block size which we can
> support, while atomic_write_max_bytes is the max size of an atomic operation
> which the HW supports.

Why are these different? If the hardware supports 128kB atomic
writes, why limit applications to something smaller?

> From your review on the iomap patch, I assume that now you realise that we
> are proposing a write which may include multiple application data blocks
> (each limited in size to atomic_write_unit_max), and the limit in total size
> of that write is atomic_write_max_bytes.

I still don't get it - you haven't explained why/what an application
atomic block write might be, nor why the block device should be
determining the size of application data blocks, etc.  If the block
device can do 128kB atomic writes, why wouldn't the device allow the
application to do 128kB atomic writes if they've aligned the atomic
write correctly?

What happens when we get hardware that can do atomic writes at any
alignment, of any size up to atomic_write_max_bytes? Because this
interface defines atomic writes as "must be a multiple of 2 of
atomic_write_unit_min" then hardware that can do atomic writes of
any size can not be effectively utilised by this interface....

> user applications should only pay attention to what we return from statx,
> that being atomic_write_unit_min and atomic_write_unit_max.
> 
> atomic_write_max_bytes and atomic_write_boundary is only relevant to the
> block layer.

If applications can issue multi-atomic_write_unit_max-block
writes as a single, non-atomic, multi-bio RWF_ATOMIC pwritev2() IO
and such IO is constrained to atomic_write_max_bytes, then
atomic_write_max_bytes is most definitely relevant to user
applications.


> > > +What:		/sys/block/<disk>/atomic_write_boundary
> > > +Date:		May 2023
> > > +Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
> > > +Description:
> > > +		[RO] A device may need to internally split I/Os which
> > > +		straddle a given logical block address boundary. In that
> > > +		case a single atomic write operation will be processed as
> > > +		one or more sub-operations which each complete atomically.
> > > +		This parameter specifies the size in bytes of the atomic
> > > +		boundary if one is reported by the device. This value must
> > > +		be a power-of-two.
> > 
> > How are users/filesystems supposed to use this?
> 
> As above, this is not relevant to the user.

Applications will greatly care if their atomic IO gets split into
multiple IOs whose persistence order is undefined. I think it also
matters for filesystems when it comes to allocation, because we are
going to have to be very careful not to have extents straddle ranges
that will cause an atomic write to be split.

e.g. how does this work with striped devices? e.g. we have a stripe
unit of 16kB, but the devices support atomic_write_unit_max = 32kB.
Instantly, we have a configuration where atomic writes need to be
split at 16kB boundaries, and so the maximum atomic write size that
can be supported is actually 16kB - the stripe unit of RAID device.

This means the filesystem must, at minimum, align all allocations
for atomic IO to 16kB stripe unit alignment, and must not allow
atomic IOs that are not stripe unit aligned or sized to proceed
because they can't be processed as an atomic IO....


> > >   /**
> > > @@ -183,6 +186,59 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
> > >   }
> > >   EXPORT_SYMBOL(blk_queue_max_discard_sectors);
> > > +/**
> > > + * blk_queue_atomic_write_max_bytes - set max bytes supported by
> > > + * the device for atomic write operations.
> > > + * @q:  the request queue for the device
> > > + * @size: maximum bytes supported
> > > + */
> > > +void blk_queue_atomic_write_max_bytes(struct request_queue *q,
> > > +				      unsigned int size)
> > > +{
> > > +	q->limits.atomic_write_max_bytes = size;
> > > +}
> > > +EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes);
> > > +
> > > +/**
> > > + * blk_queue_atomic_write_boundary - Device's logical block address space
> > > + * which an atomic write should not cross.
> > 
> > I have no idea what "logical block address space which an atomic
> > write should not cross" means, especially as the unit is in bytes
> > and not in sectors (which are the units LBAs are expressed in).
> 
> It means that an atomic operation which straddles the atomic boundary is not
> guaranteed to be atomic by the device, so we should (must) not cross it to
> maintain atomic behaviour for an application block. That's one reason that
> we have all these size and alignment rules.

Yes, that much is obvious. What I have no idea about is what
this means in practice. When is this ever going to be non-zero, and
what should we be doing at the filesystem allocation level when it
is non-zero to ensure that allocations for atomic writes never cross
such a boundary. i.e. how do we prevent applications from ever
needing this functionality to be triggered? i.e. so the filesystem
can guarantee a single RWF_ATOMIC user IO is actually dispatched
as a single REQ_ATOMIC IO....

> ...
> 
> > > +static inline unsigned int queue_atomic_write_unit_max(const struct request_queue *q)
> > > +{
> > > +	return q->limits.atomic_write_unit_max << SECTOR_SHIFT;
> > > +}
> > > +
> > > +static inline unsigned int queue_atomic_write_unit_min(const struct request_queue *q)
> > > +{
> > > +	return q->limits.atomic_write_unit_min << SECTOR_SHIFT;
> > > +}
> > 
> > Ah, what? This undocumented interface reports "unit limits" in
> > bytes, but it's not using the physical device sector size to convert
> > between sector units and bytes. This really needs some more
> > documentation and work to make it present all units consistently and
> > not result in confusion when devices have 4kB sector sizes and not
> > 512 byte sectors...
> 
> ok, we'll look to fix this up to give a coherent and clear interface.
> 
> > 
> > Also, I think all the byte ranges should support full 64 bit values,
> > otherwise there will be silent overflows in converting 32 bit sector
> > counts to byte ranges. And, eventually, something will want to do
> > larger than 4GB atomic IOs
> > 
> 
> ok, we can do that but would also then make statx field 64b. I'm fine with
> that if it is wise to do so - I don't want to wastefully use up an
> extra 2 x 32b in struct statx.

Why do we need specific variables for DIO atomic write alignment
limits? We already have direct IO alignment and size constraints in statx(),
so why wouldn't we just reuse those variables when the user requests
atomic limits for DIO?

i.e. if STATX_DIOALIGN is set, we return normal DIO alignment
constraints. If STATX_DIOALIGN_ATOMIC is set, we return the atomic
DIO alignment requirements in those variables.....

Yes, we probably need the dio max size to be added to statx for
this. Historically speaking, I wanted statx to support this in the
first place because that's what we were already giving userspace
with XFS_IOC_DIOINFO and we already knew that atomic IO when it came
along would require a bound maximum IO size much smaller than normal
DIO limits.  i.e.:

struct dioattr {
        __u32           d_mem;          /* data buffer memory alignment */
        __u32           d_miniosz;      /* min xfer size                */
        __u32           d_maxiosz;      /* max xfer size                */
};

where d_miniosz defined the alignment and size constraints for DIOs.

If we simply document that STATX_DIOALIGN_ATOMIC returns minimum
(unit) atomic IO size and alignment in statx->dio_offset_align (as
per STATX_DIOALIGN) and the maximum atomic IO size in
statx->dio_max_iosize, then we don't burn up anywhere near as much
space in the statx structure....
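
For illustration, userspace usage of that would look something like the 
sketch below. Note that statx(), STATX_DIOALIGN and stx_dio_offset_align 
already exist, but STATX_DIOALIGN_ATOMIC and stx_dio_max_iosize are just 
the names proposed above and do not exist yet:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/stat.h>

	/* Hypothetical: query atomic DIO limits through the existing DIO fields */
	static int query_atomic_dio_limits(int fd, unsigned int *unit_min,
					   unsigned int *unit_max)
	{
		struct statx stx;

		/* STATX_DIOALIGN_ATOMIC is the proposed, not-yet-existing mask bit */
		if (statx(fd, "", AT_EMPTY_PATH, STATX_DIOALIGN_ATOMIC, &stx) < 0)
			return -1;

		*unit_min = stx.stx_dio_offset_align;	/* reused: atomic unit min */
		*unit_max = stx.stx_dio_max_iosize;	/* proposed new field */
		return 0;
	}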

Cheers,

Dave.
John Garry May 5, 2023, 7:54 a.m. UTC | #4
On 04/05/2023 23:26, Dave Chinner wrote:

Hi Dave,

>> atomic_write_unit_max is largest application block size which we can
>> support, while atomic_write_max_bytes is the max size of an atomic operation
>> which the HW supports.
> Why are these different? If the hardware supports 128kB atomic
> writes, why limit applications to something smaller?

Two reasons:
a. If you look at patch 6/16, we need to apply a limit on 
atomic_write_unit_max based on what we can guarantee will fit in a bio 
without it needing to be split when submitted (a rough sketch of this 
capping follows point b., below).

Consider that iomap generates an atomic write bio for a single userspace 
block and submits it to the block layer - if the block layer needs to 
split it due to block driver request_queue limits, like max_segments, 
then we're in trouble. So we need to limit atomic_write_unit_max such 
that this will not occur. That same limit should not apply to 
atomic_write_max_bytes.

b. For NVMe, atomic_write_unit_max and atomic_write_max_bytes which the 
host reports will be the same (ignoring a.).

However, for SCSI they may be different. SCSI has its own concept of 
boundary and it is relevant here. This is confusing, as it is very 
different from the NVMe boundary; the NVMe boundary is really a media 
boundary. For SCSI, a boundary is a sub-segment into which the device may 
split an atomic write operation. For a SCSI device which only supports 
this boundary mode of
operation, we limit atomic_write_unit_max to the max boundary segment 
size (such that we don't get splitting of an atomic write by the device) 
and then limit atomic_write_max_bytes to what is known in the spec as 
"maximum atomic transfer length with boundary". So in this device mode 
of operation, atomic_write_max_bytes and atomic_write_unit_max should be 
different.
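
To make (a) above concrete, here is a rough sketch of the kind of capping 
meant there (the real logic is in patch 6/16; the helper and exact formula 
below are only an illustration of the idea, not necessarily what the 
series does):

	/* Cap the advertised unit max by what must fit in a single bio. */
	static unsigned int atomic_write_unit_max_cap(struct request_queue *q,
						      unsigned int hw_unit_max)
	{
		unsigned int segs = min_t(unsigned int, q->limits.max_segments,
					  BIO_MAX_VECS);
		unsigned int bio_fit = segs * PAGE_SIZE;

		return rounddown_pow_of_two(min(hw_unit_max, bio_fit));
	}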

> 
>>  From your review on the iomap patch, I assume that now you realise that we
>> are proposing a write which may include multiple application data blocks
>> (each limited in size to atomic_write_unit_max), and the limit in total size
>> of that write is atomic_write_max_bytes.
> I still don't get it - you haven't explained why/what an application
> atomic block write might be, nor why the block device should be
> determining the size of application data blocks, etc.  If the block
> device can do 128kB atomic writes, why wouldn't the device allow the
> application to do 128kB atomic writes if they've aligned the atomic
> write correctly?

An application block needs to be:
- sized at a power-of-two
- sized between atomic_write_unit_min and atomic_write_unit_max, inclusive
- naturally aligned

Please consider that the application does not explicitly tell the kernel 
the size of its data blocks, it's implied from the size of the write and 
file offset. So, assuming that userspace follows the rules properly when 
issuing a write, the kernel may deduce the application block size and 
ensure only that each individual user data block is not split.

If userspace wants a guarantee of no splitting at all in its write, then 
it may issue a write for a single userspace data block, e.g. userspace 
block size is 16KB, then write at a file offset aligned to 16KB and a 
total write size of 16KB will be guaranteed to be written atomically by 
the device.
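
Expressed as a sketch (illustrative only; it assumes the unit limits end 
up being reported in bytes, as discussed above):

	/* Does this write describe exactly one valid application data block? */
	static bool valid_atomic_app_block(struct request_queue *q,
					   loff_t pos, size_t len)
	{
		unsigned int unit_min = queue_atomic_write_unit_min(q);
		unsigned int unit_max = queue_atomic_write_unit_max(q);

		return is_power_of_2(len) &&			/* power-of-two size */
		       len >= unit_min && len <= unit_max &&	/* within unit limits */
		       !(pos & (len - 1));			/* naturally aligned */
	}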

> 
> What happens when we get hardware that can do atomic writes at any
> alignment, of any size up to atomic_write_max_bytes? Because this
> interface defines atomic writes as "must be a multiple of 2 of
> atomic_write_unit_min" then hardware that can do atomic writes of
> any size can not be effectively utilised by this interface....
> 
>> user applications should only pay attention to what we return from statx,
>> that being atomic_write_unit_min and atomic_write_unit_max.
>>
>> atomic_write_max_bytes and atomic_write_boundary is only relevant to the
>> block layer.
> If applications can issue multi-atomic_write_unit_max-block
> writes as a single, non-atomic, multi-bio RWF_ATOMIC pwritev2() IO
> and such IO is constrained to atomic_write_max_bytes, then
> atomic_write_max_bytes is most definitely relevant to user
> applications.

But we still do not guarantee that a multi-atomic_write_unit_max-block 
write issued as a single, non-atomic, multi-bio RWF_ATOMIC pwritev2() IO 
and constrained to atomic_write_max_bytes will be written atomically by 
the device.

Three things may happen in the kernel:
- we may need to split due to atomic boundary
- we may need to split due to the write spanning discontig extents
- atomic_write_max_bytes may be much larger than what we could fit in a 
bio, so we may need multiple bios

And maybe more which does not come to mind.

So I am not sure what value there is in reporting atomic_write_max_bytes 
to the user. The description would need to be something like "we 
guarantee that if the total write length is greater than 
atomic_write_max_bytes, then all data will never be submitted to the 
device atomically. Otherwise it might be".

> 
> 
>>>> +What:		/sys/block/<disk>/atomic_write_boundary
>>>> +Date:		May 2023
>>>> +Contact:	Himanshu Madhani<himanshu.madhani@oracle.com>
>>>> +Description:
>>>> +		[RO] A device may need to internally split I/Os which
>>>> +		straddle a given logical block address boundary. In that
>>>> +		case a single atomic write operation will be processed as
>>>> +		one or more sub-operations which each complete atomically.
>>>> +		This parameter specifies the size in bytes of the atomic
>>>> +		boundary if one is reported by the device. This value must
>>>> +		be a power-of-two.
>>> How are users/filesystems supposed to use this?
>> As above, this is not relevant to the user.
> Applications will greatly care if their atomic IO gets split into
> multiple IOs whose persistence order is undefined.

Sure, so maybe then we need to define and support persistence ordering 
rules. But still, any atomic_write_boundary is already taken into 
account when we report atomic_write_unit_min and atomic_write_unit_max 
to the user.

> I think it also
> matters for filesystems when it comes to allocation, because we are
> going to have to be very careful not to have extents straddle ranges
> that will cause an atomic write to be split.

Note that block drivers need to ensure that they report the following:
- atomic_write_unit_max is a power-of-2
- atomic_write_boundary is a power-of-2 (and naturally it would need to 
be greater than or equal to atomic_write_unit_max)
[sidenote: I actually think that atomic_write_boundary needs to be just 
a multiple of atomic_write_unit_max, but let's stick with these rules 
for the moment]

As such, if we split a write due to a boundary, we would still always be 
able to split such that we don't need to split an individual userspace 
data block.
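
The split arithmetic which follows from those rules, as an illustrative 
sketch only (byte units assumed, zero meaning "no boundary"):

	/* How many bytes from pos to the next atomic boundary? */
	static u64 bytes_to_atomic_boundary(struct queue_limits *lim, u64 pos)
	{
		unsigned int boundary = lim->atomic_write_boundary;

		if (!boundary)
			return U64_MAX;	/* no boundary constraint */
		/*
		 * boundary is a power-of-two and >= atomic_write_unit_max, so
		 * a split here is also aligned to every smaller power-of-two
		 * unit size, and a naturally aligned application data block
		 * is never cut in half.
		 */
		return boundary - (pos & (boundary - 1));
	}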

> 
> e.g. how does this work with striped devices? e.g. we have a stripe
> unit of 16kB, but the devices support atomic_write_unit_max = 32kB.
> Instantly, we have a configuration where atomic writes need to be
> split at 16kB boundaries, and so the maximum atomic write size that
> can be supported is actually 16kB - the stripe unit of RAID device.

OK, so in that case, I think that we would need to limit the reported 
atomic_write_unit_max value to the stripe value in a RAID config.
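
As a sketch of what that capping could look like when stacking limits for 
a striped device (illustrative only - MD/DM stacking is not part of this 
series, and the bytes-based units follow the scheme discussed above):

	/* Don't advertise a unit max larger than the stripe chunk. */
	static void stack_atomic_write_unit_max(struct queue_limits *t,
						unsigned int chunk_bytes)
	{
		if (!chunk_bytes)
			return;
		t->atomic_write_unit_max =
			min_t(unsigned int, t->atomic_write_unit_max,
			      rounddown_pow_of_two(chunk_bytes));
	}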

> 
> This means the filesystem must, at minimum, align all allocations
> for atomic IO to 16kB stripe unit alignment, and must not allow
> atomic IOs that are not stripe unit aligned or sized to proceed
> because they can't be processed as an atomic IO....

As above. Martin may be able to comment more on this.

> 
> 
>>>>    /**
>>>> @@ -183,6 +186,59 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
>>>>    }
>>>>    EXPORT_SYMBOL(blk_queue_max_discard_sectors);
>>>> +/**
>>>> + * blk_queue_atomic_write_max_bytes - set max bytes supported by
>>>> + * the device for atomic write operations.
>>>> + * @q:  the request queue for the device
>>>> + * @size: maximum bytes supported
>>>> + */
>>>> +void blk_queue_atomic_write_max_bytes(struct request_queue *q,
>>>> +				      unsigned int size)
>>>> +{
>>>> +	q->limits.atomic_write_max_bytes = size;
>>>> +}
>>>> +EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes);
>>>> +
>>>> +/**
>>>> + * blk_queue_atomic_write_boundary - Device's logical block address space
>>>> + * which an atomic write should not cross.
>>> I have no idea what "logical block address space which an atomic
>>> write should not cross" means, especially as the unit is in bytes
>>> and not in sectors (which are the units LBAs are expressed in).
>> It means that an atomic operation which straddles the atomic boundary is not
>> guaranteed to be atomic by the device, so we should (must) not cross it to
>> maintain atomic behaviour for an application block. That's one reason that
>> we have all these size and alignment rules.
> Yes, that much is obvious. What I have no idea about is what
> this means in practice. When is this ever going to be non-zero, and
> what should we be doing at the filesystem allocation level when it
> is non-zero to ensure that allocations for atomic writes never cross
> such a boundary. i.e. how do we prevent applications from ever
> needing this functionality to be triggered? i.e. so the filesystem
> can guarantee a single RWF_ATOMIC user IO is actually dispatched
> as a single REQ_ATOMIC IO....

We only guarantee that a single user data block will not be split. So to 
avoid any splitting at all, all you can do is write a single user data 
block. That's the best which we can offer.

As mentioned earlier, the atomic boundary is only relevant to NVMe. If the 
device reports an atomic boundary which is not compliant with the rules, 
then we cannot support atomic writes for that device.

> 
>> ...
>>
>>>> +static inline unsigned int queue_atomic_write_unit_max(const struct request_queue *q)
>>>> +{
>>>> +	return q->limits.atomic_write_unit_max << SECTOR_SHIFT;
>>>> +}
>>>> +
>>>> +static inline unsigned int queue_atomic_write_unit_min(const struct request_queue *q)
>>>> +{
>>>> +	return q->limits.atomic_write_unit_min << SECTOR_SHIFT;
>>>> +}
>>> Ah, what? This undocumented interface reports "unit limits" in
>>> bytes, but it's not using the physical device sector size to convert
>>> between sector units and bytes. This really needs some more
>>> documentation and work to make it present all units consistently and
>>> not result in confusion when devices have 4kB sector sizes and not
>>> 512 byte sectors...
>> ok, we'll look to fix this up to give a coherent and clear interface.
>>
>>> Also, I think all the byte ranges should support full 64 bit values,
>>> otherwise there will be silent overflows in converting 32 bit sector
>>> counts to byte ranges. And, eventually, something will want to do
>>> larger than 4GB atomic IOs
>>>
>> ok, we can do that but would also then make statx field 64b. I'm fine with
> > > that if it is wise to do so - I don't want to wastefully use up an
>> extra 2 x 32b in struct statx.
> Why do we need specific variables for DIO atomic write alignment
> limits?

I guess that we don't

> We already have direct IO alignment and size constraints in statx(),
> so why wouldn't we just reuse those variables when the user requests
> atomic limits for DIO?
> 
> i.e. if STATX_DIOALIGN is set, we return normal DIO alignment
> constraints. If STATX_DIOALIGN_ATOMIC is set, we return the atomic
> DIO alignment requirements in those variables.....
> 
> Yes, we probably need the dio max size to be added to statx for
> this. Historically speaking, I wanted statx to support this in the
> first place because that's what we were already giving userspace
> with XFS_IOC_DIOINFO and we already knew that atomic IO when it came
> along would require a bound maximum IO size much smaller than normal
> DIO limits.  i.e.:
> 
> struct dioattr {
>          __u32           d_mem;          /* data buffer memory alignment */
>          __u32           d_miniosz;      /* min xfer size                */
>          __u32           d_maxiosz;      /* max xfer size                */
> };
> 
> where d_miniosz defined the alignment and size constraints for DIOs.
> 
> If we simply document that STATX_DIOALIGN_ATOMIC returns minimum
> (unit) atomic IO size and alignment in statx->dio_offset_align (as
> per STATX_DIOALIGN) and the maximum atomic IO size in
> statx->dio_max_iosize, then we don't burn up anywhere near as much
> space in the statx structure....

ok, so you are saying to unionize them, right? That would seem 
reasonable to me.

Thanks,
John
Darrick J. Wong May 5, 2023, 10 p.m. UTC | #5
On Fri, May 05, 2023 at 08:54:12AM +0100, John Garry wrote:
> On 04/05/2023 23:26, Dave Chinner wrote:
> 
> Hi Dave,
> 
> > > atomic_write_unit_max is largest application block size which we can
> > > support, while atomic_write_max_bytes is the max size of an atomic operation
> > > which the HW supports.
> > Why are these different? If the hardware supports 128kB atomic
> > writes, why limit applications to something smaller?
> 
> Two reasons:
> a. If you see patch 6/16, we need to apply a limit on atomic_write_unit_max
> from what is guaranteed we can fit in a bio without it being required to be
> split when submitted.
> 
> Consider iomap generates an atomic write bio for a single userspace block
> and submits to the block layer - if the block layer needs to split due to
> block driver request_queue limits, like max_segments, then we're in trouble.
> So we need to limit atomic_write_unit_max such that this will not occur.
> That same limit should not apply to atomic_write_max_bytes.
> 
> b. For NVMe, atomic_write_unit_max and atomic_write_max_bytes which the host
> reports will be the same (ignoring a.).
> 
> However for SCSI they may be different. SCSI has its own concept of boundary
> and it is relevant here. This is confusing as it is very different from NVMe
> boundary. NVMe is a media boundary really. For SCSI, a boundary is a
> sub-segment which the device may split an atomic write operation. For a SCSI
> device which only supports this boundary mode of operation, we limit
> atomic_write_unit_max to the max boundary segment size (such that we don't
> get splitting of an atomic write by the device) and then limit
> atomic_write_max_bytes to what is known in the spec as "maximum atomic
> transfer length with boundary". So in this device mode of operation,
> atomic_write_max_bytes and atomic_write_unit_max should be different.

Hmm, maybe some concrete examples would be useful here?  I find the
queue limits stuff pretty confusing too.

Could a SCSI device advertise 512b LBAs, 4096b physical blocks, a
64k atomic_write_unit_max, and a 1MB maximum transfer length
(atomic_write_max_bytes)?  And does that mean that application software
can send one 64k-aligned write and expect it either to be persisted
completely or not at all?

And, does that mean that the application can send up to 16 of these
64k-aligned blocks as a single 1MB IO and expect that each of those 16
blocks will either be persisted entirely or not at all?  There doesn't
seem to be any means for the device to report /which/ of the 16 were
persisted, which is disappointing.  But maybe the application encodes
LSNs and can tell after the fact that something went wrong, and recover?

If the same device reports a 2048b atomic_write_unit_min, does that mean
that I can send between 2 and 64k of data as a single atomic write and
that's ok?  I assume that this weird situation (512b LBA, 4k physical,
2k atomic unit min) requires some fancy RMW but that the device is
prepared to cr^Wpersist that correctly?

What if the device also advertises a 128k atomic_write_boundary?
That means that a 2k atomic block write will fail if it starts at 127k,
but if it starts at 126k then that's ok.  Right?

As for avoiding splits in the block layer, I guess that also means that
someone needs to reduce atomic_write_unit_max and atomic_write_boundary
if (say) some sysadmin decides to create a raid0 of these devices with a
32k stripe size?

It sounds like NVME is simpler in that it would report 64k for both the
max unit and the max transfer length?  And for the 1M write I mentioned
above, the application must send 16 individual writes?

(Did I get all that correctly?)

> > 
> > >  From your review on the iomap patch, I assume that now you realise that we
> > > are proposing a write which may include multiple application data blocks
> > > (each limited in size to atomic_write_unit_max), and the limit in total size
> > > of that write is atomic_write_max_bytes.
> > I still don't get it - you haven't explained why/what an application
> > atomic block write might be, nor why the block device should be
> > determining the size of application data blocks, etc.  If the block
> > device can do 128kB atomic writes, why wouldn't the device allow the
> > application to do 128kB atomic writes if they've aligned the atomic
> > write correctly?
> 
> An application block needs to be:
> - sized at a power-of-two
> - sized between atomic_write_unit_min and atomic_write_unit_max, inclusive
> - naturally aligned
> 
> Please consider that the application does not explicitly tell the kernel the
> size of its data blocks, it's implied from the size of the write and file
> offset. So, assuming that userspace follows the rules properly when issuing
> a write, the kernel may deduce the application block size and ensure only
> that each individual user data block is not split.
> 
> If userspace wants a guarantee of no splitting at all in its write, then it
> may issue a write for a single userspace data block, e.g. userspace block
> size is 16KB, then write at a file offset aligned to 16KB and a total write
> size of 16KB will be guaranteed to be written atomically by the device.

I'm ... not sure what the userspace block size is?

With my app developer hat on, the simplest mental model of this is that
if I want to persist a blob of data that is larger than one device LBA,
then atomic_write_unit_min <= blob size <= atomic_write_unit_max must be
true, and the LBA range for the write cannot cross a atomic_write_boundary.

Does that sound right?

Going back to my sample device above, the XFS buffer cache could write
individual 4k filesystem metadata blocks using REQ_ATOMIC because 4k is
between the atomic write unit min/max, 4k metadata blocks will never
cross a 128k boundary, and we'd never have to worry about torn writes
in metadata ever again?

Furthermore, if I want to persist a bunch of blobs in a contiguous LBA
range and atomic_write_max_bytes > atomic_write_unit_max, then I can do
that with a single direct write?  I'm assuming that the blobs in the
middle of the range must all be exactly atomic_write_unit_max bytes in
size?  And I had better be prepared to (I guess) re-read the entire
range after the system goes down to find out if any of them did or did
not persist?

(This part sounds like a PITA.)

> > 
> > What happens when we get hardware that can do atomic writes at any
> > alignment, of any size up to atomic_write_max_bytes? Because this
> > interface defines atomic writes as "must be a multiple of 2 of
> > atomic_write_unit_min" then hardware that can do atomic writes of
> > any size can not be effectively utilised by this interface....
> > 
> > > user applications should only pay attention to what we return from statx,
> > > that being atomic_write_unit_min and atomic_write_unit_max.
> > > 
> > > atomic_write_max_bytes and atomic_write_boundary is only relevant to the
> > > block layer.
> > If applications can issue multi-atomic_write_unit_max-block
> > writes as a single, non-atomic, multi-bio RWF_ATOMIC pwritev2() IO
> > and such IO is constrained to atomic_write_max_bytes, then
> > atomic_write_max_bytes is most definitely relevant to user
> > applications.
> 
> But we still do not guarantee that multi-atomic_write_unit_max-block writes
> as a single, non-atomic, multi-bio RWF_ATOMIC pwritev2() IO and such IO is
> constrained to atomic_write_max_bytes will be written atomically by the
> device.
> 
> Three things may happen in the kernel:
> - we may need to split due to atomic boundary
> - we may need to split due to the write spanning discontig extents
> - atomic_write_max_bytes may be much larger than what we could fit in a bio,
> so may need multiple bios
> 
> And maybe more which does not come to mind.
> 
> So I am not sure what value there is in reporting atomic_write_max_bytes to
> the user. The description would need to be something like "we guarantee that
> if the total write length is greater than atomic_write_max_bytes, then all
> data will never be submitted to the device atomically. Otherwise it might
> be".
> 
> > 
> > 
> > > > > +What:		/sys/block/<disk>/atomic_write_boundary
> > > > > +Date:		May 2023
> > > > > +Contact:	Himanshu Madhani<himanshu.madhani@oracle.com>
> > > > > +Description:
> > > > > +		[RO] A device may need to internally split I/Os which
> > > > > +		straddle a given logical block address boundary. In that
> > > > > +		case a single atomic write operation will be processed as
> > > > > +		one or more sub-operations which each complete atomically.
> > > > > +		This parameter specifies the size in bytes of the atomic
> > > > > +		boundary if one is reported by the device. This value must
> > > > > +		be a power-of-two.
> > > > How are users/filesystems supposed to use this?
> > > As above, this is not relevant to the user.
> > Applications will greatly care if their atomic IO gets split into
> > multiple IOs whose persistence order is undefined.
> 
> Sure, so maybe then we need to define and support persistence ordering
> rules. But still, any atomic_write_boundary is already taken into account
> when we report atomic_write_unit_min and atomic_write_unit_max to the user.
> 
> > I think it also
> > matters for filesystems when it comes to allocation, because we are
> > going to have to be very careful not to have extents straddle ranges
> > that will cause an atomic write to be split.
> 
> Note that block drivers need to ensure that they report the following:
> - atomic_write_unit_max is a power-of-2
> - atomic_write_boundary is a power-of-2 (and naturally it would need to be
> greater or equal to atomic_write_unit_max)
> [sidenote: I actually think that atomic_write_boundary needs to be just a
> multiple of atomic_write_unit_max, but let's stick with these rules for the
> moment]
> 
> As such, if we split a write due to a boundary, we would still always be
> able to split such that we don't need to split an individual userspace data
> block.

...but only if userspace data blocks (whatever those are) don't
themselves straddle an atomic_write_boundary.

> > 
> > e.g. how does this work with striped devices? e.g. we have a stripe
> > unit of 16kB, but the devices support atomic_write_unit_max = 32kB.
> > Instantly, we have a configuration where atomic writes need to be
> > split at 16kB boundaries, and so the maximum atomic write size that
> > can be supported is actually 16kB - the stripe unit of RAID device.
> 
> OK, so in that case, I think that we would need to limit the reported
> atomic_write_unit_max value to the stripe value in a RAID config.
> 
> > 
> > This means the filesystem must, at minimum, align all allocations
> > for atomic IO to 16kB stripe unit alignment, and must not allow
> > atomic IOs that are not stripe unit aligned or sized to proceed
> > because they can't be processed as an atomic IO....
> 
> As above. Martin may be able to comment more on this.
> 
> > 
> > 
> > > > >    /**
> > > > > @@ -183,6 +186,59 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
> > > > >    }
> > > > >    EXPORT_SYMBOL(blk_queue_max_discard_sectors);
> > > > > +/**
> > > > > + * blk_queue_atomic_write_max_bytes - set max bytes supported by
> > > > > + * the device for atomic write operations.
> > > > > + * @q:  the request queue for the device
> > > > > + * @size: maximum bytes supported
> > > > > + */
> > > > > +void blk_queue_atomic_write_max_bytes(struct request_queue *q,
> > > > > +				      unsigned int size)
> > > > > +{
> > > > > +	q->limits.atomic_write_max_bytes = size;
> > > > > +}
> > > > > +EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes);
> > > > > +
> > > > > +/**
> > > > > + * blk_queue_atomic_write_boundary - Device's logical block address space
> > > > > + * which an atomic write should not cross.
> > > > I have no idea what "logical block address space which an atomic
> > > > write should not cross" means, especially as the unit is in bytes
> > > > and not in sectors (which are the units LBAs are expressed in).
> > > It means that an atomic operation which straddles the atomic boundary is not
> > > guaranteed to be atomic by the device, so we should (must) not cross it to
> > > maintain atomic behaviour for an application block. That's one reason that
> > > we have all these size and alignment rules.
> > Yes, that much is obvious. What I have no idea about is what
> > this means in practice. When is this ever going to be non-zero, and
> > what should we be doing at the filesystem allocation level when it
> > is non-zero to ensure that allocations for atomic writes never cross
> > such a boundary. i.e. how do we prevent applications from ever
> > needing this functionality to be triggered? i.e. so the filesystem
> > can guarantee a single RWF_ATOMIC user IO is actually dispatched
> > as a single REQ_ATOMIC IO....
> 
> We only guarantee that a single user data block will not be split. So to
> avoid any splitting at all, all you can do is write a single user data
> block. That's the best which we can offer.
> 
> As mentioned earlier, the atomic boundary is only relevant to NVMe. If the
> device reports an atomic boundary which is not compliant with the rules,
> then we cannot support atomic writes for that device.

I guess here that any device advertising an atomic_write_boundary > 0
internally splits its LBA address space into chunks of that size and can
only persist full chunks.  The descriptions of how flash storage works
would seem to fit that description to me.  <shrug>

> > 
> > > ...
> > > 
> > > > > +static inline unsigned int queue_atomic_write_unit_max(const struct request_queue *q)
> > > > > +{
> > > > > +	return q->limits.atomic_write_unit_max << SECTOR_SHIFT;
> > > > > +}
> > > > > +
> > > > > +static inline unsigned int queue_atomic_write_unit_min(const struct request_queue *q)
> > > > > +{
> > > > > +	return q->limits.atomic_write_unit_min << SECTOR_SHIFT;
> > > > > +}
> > > > Ah, what? This undocumented interface reports "unit limits" in
> > > > bytes, but it's not using the physical device sector size to convert
> > > > between sector units and bytes. This really needs some more
> > > > documentation and work to make it present all units consistently and
> > > > not result in confusion when devices have 4kB sector sizes and not
> > > > 512 byte sectors...
> > > ok, we'll look to fix this up to give a coherent and clear interface.
> > > 
> > > > Also, I think all the byte ranges should support full 64 bit values,
> > > > otherwise there will be silent overflows in converting 32 bit sector
> > > > counts to byte ranges. And, eventually, something will want to do
> > > > larger than 4GB atomic IOs
> > > > 
> > > ok, we can do that but would also then make statx field 64b. I'm fine with
> > > that if it is wise to do so - I don't want to wastefully use up an
> > > extra 2 x 32b in struct statx.
> > Why do we need specific variables for DIO atomic write alignment
> > limits?
> 
> I guess that we don't
> 
> > We already have direct IO alignment and size constraints in statx(),
> > so why wouldn't we just reuse those variables when the user requests
> > atomic limits for DIO?
> > 
> > i.e. if STATX_DIOALIGN is set, we return normal DIO alignment
> > constraints. If STATX_DIOALIGN_ATOMIC is set, we return the atomic
> > DIO alignment requirements in those variables.....
> > 
> > Yes, we probably need the dio max size to be added to statx for
> > this. Historically speaking, I wanted statx to support this in the
> > first place because that's what we were already giving userspace
> > with XFS_IOC_DIOINFO and we already knew that atomic IO when it came
> > along would require a bound maximum IO size much smaller than normal
> > DIO limits.  i.e.:
> > 
> > struct dioattr {
> >          __u32           d_mem;          /* data buffer memory alignment */
> >          __u32           d_miniosz;      /* min xfer size                */
> >          __u32           d_maxiosz;      /* max xfer size                */
> > };
> > 
> > where d_miniosz defined the alignment and size constraints for DIOs.
> > 
> > If we simply document that STATX_DIOALIGN_ATOMIC returns minimum
> > (unit) atomic IO size and alignment in statx->dio_offset_align (as
> > per STATX_DIOALIGN) and the maximum atomic IO size in
> > statx->dio_max_iosize, then we don't burn up anywhere near as much
> > space in the statx structure....
> 
> ok, so you are saying to unionize them, right? That would seem reasonable to
> me.
> Thanks,
> John
>
Eric Biggers May 5, 2023, 10:47 p.m. UTC | #6
On Fri, May 05, 2023 at 08:26:23AM +1000, Dave Chinner wrote:
> > ok, we can do that but would also then make statx field 64b. I'm fine with
> > that if it is wise to do so - I don't want to wastefully use up an
> > extra 2 x 32b in struct statx.
> 
> Why do we need specific variables for DIO atomic write alignment
> limits? We already have direct IO alignment and size constraints in statx(),
> so why wouldn't we just reuse those variables when the user requests
> atomic limits for DIO?
> 
> i.e. if STATX_DIOALIGN is set, we return normal DIO alignment
> constraints. If STATX_DIOALIGN_ATOMIC is set, we return the atomic
> DIO alignment requirements in those variables.....
> 
> Yes, we probably need the dio max size to be added to statx for
> this. Historically speaking, I wanted statx to support this in the
> first place because that's what we were already giving userspace
> with XFS_IOC_DIOINFO and we already knew that atomic IO when it came
> along would require a bound maximum IO size much smaller than normal
> DIO limits.  i.e.:
> 
> struct dioattr {
>         __u32           d_mem;          /* data buffer memory alignment */
>         __u32           d_miniosz;      /* min xfer size                */
>         __u32           d_maxiosz;      /* max xfer size                */
> };
> 
> where d_miniosz defined the alignment and size constraints for DIOs.
> 
> If we simply document that STATX_DIOALIGN_ATOMIC returns minimum
> (unit) atomic IO size and alignment in statx->dio_offset_align (as
> per STATX_DIOALIGN) and the maximum atomic IO size in
> statx->dio_max_iosize, then we don't burn up anywhere near as much
> space in the statx structure....

I don't think that's how statx() is meant to work.  The request mask is a bitmask, and the user can
request an arbitrary combination of different items.  For example, the user could request both
STATX_DIOALIGN and STATX_WRITE_ATOMIC at the same time.  That doesn't work if different items share
the same fields.

- Eric
Dave Chinner May 5, 2023, 11:18 p.m. UTC | #7
On Fri, May 05, 2023 at 08:54:12AM +0100, John Garry wrote:
> On 04/05/2023 23:26, Dave Chinner wrote:
> 
> Hi Dave,
> 
> > > atomic_write_unit_max is largest application block size which we can
> > > support, while atomic_write_max_bytes is the max size of an atomic operation
> > > which the HW supports.
> > Why are these different? If the hardware supports 128kB atomic
> > writes, why limit applications to something smaller?
> 
> Two reasons:
> a. If you see patch 6/16, we need to apply a limit on atomic_write_unit_max
> from what is guaranteed we can fit in a bio without it being required to be
> split when submitted.

Yes, that's obvious behaviour for an atomic IO.

> Consider iomap generates an atomic write bio for a single userspace block
> and submits to the block layer - if the block layer needs to split due to
> block driver request_queue limits, like max_segments, then we're in trouble.
> So we need to limit atomic_write_unit_max such that this will not occur.
> That same limit should not apply to atomic_write_max_bytes.

Except the block layer doesn't provide any mechanism to do
REQ_ATOMIC IOs larger than atomic_write_unit_max. So in what case
will atomic_write_max_bytes > atomic_write_unit_max ever be
relevant to anyone?

> b. For NVMe, atomic_write_unit_max and atomic_write_max_bytes which the host
> reports will be the same (ignoring a.).
> 
> However for SCSI they may be different. SCSI has its own concept of boundary
> and it is relevant here. This is confusing as it is very different from NVMe
> boundary. NVMe is a media boundary really. For SCSI, a boundary is a
> sub-segment into which the device may split an atomic write operation. For a SCSI
> device which only supports this boundary mode of operation, we limit
> atomic_write_unit_max to the max boundary segment size (such that we don't
> get splitting of an atomic write by the device) and then limit
> atomic_write_max_bytes to what is known in the spec as "maximum atomic
> transfer length with boundary". So in this device mode of operation,
> atomic_write_max_bytes and atomic_write_unit_max should be different.

But if the application is limited to atomic_write_unit_max sized
IOs, and that is always less than or equal to the size of the atomic
write boundary, why does the block layer even need to care about
this whacky quirk of the SCSI protocol implementation?

The block layer shouldn't even need to be aware that SCSI can split
"atomic" IOs into smaller individual IOs that result in the larger
requested IO being non-atomic. The SCSI layer should just expose 
"write with boundary" as the max atomic IO size it supports to the
block layer. 

At this point, both atomic_write_max_bytes and atomic write
boundary size are completely irrelevant to anything in the block
layer or above. If userspace is limited to atomic_write_unit_max IO
sizes and it is enforced at the ->write_iter() layer, then the block
layer will never need to split REQ_ATOMIC bios because the entire
stack has already stated that it guarantees atomic_write_unit_max
bios will not get split....

In what cases is hardware that supports atomic_write_max_bytes >
atomic_write_unit_max actually useful? I can see one situation,
and one situation only: merging adjacent small REQ_ATOMIC write
requests into single larger IOs before issuing them to the hardware.

This is exactly the sort of optimisation the block layers should be
doing - it fits perfectly with the SCSI "write with boundary"
behaviour - the merged bios can be split by the hardware at the
point where they were merged by the block layer, and everything is
fine because they are independent IOs, not a single RWF_ATOMIC IO
from userspace. And for NVMe, it allows IOs from small atomic write
limits (because, say, 16kB RAID stripe unit) to be merged into
larger atomic IOs with no penalty...


> > >  From your review on the iomap patch, I assume that now you realise that we
> > > are proposing a write which may include multiple application data blocks
> > > (each limited in size to atomic_write_unit_max), and the limit in total size
> > > of that write is atomic_write_max_bytes.
> > I still don't get it - you haven't explained why/what an application
> > atomic block write might be, nor why the block device should be
> > determining the size of application data blocks, etc.  If the block
> > device can do 128kB atomic writes, why wouldn't the device allow the
> > application to do 128kB atomic writes if they've aligned the atomic
> > write correctly?
> 
> An application block needs to be:
> - sized at a power-of-two
> - sized between atomic_write_unit_min and atomic_write_unit_max, inclusive
> - naturally aligned
> 
> Please consider that the application does not explicitly tell the kernel the
> size of its data blocks, it's implied from the size of the write and file
> offset. So, assuming that userspace follows the rules properly when issuing
> a write, the kernel may deduce the application block size and ensure only
> that each individual user data block is not split.

That's just *gross*. The kernel has no business assuming anything
about the data layout inside an IO request. The kernel cannot assume
that the application uses a single IO size for atomic writes when it
explicitly provides a range of IO sizes that the application can use.

e.g. min unit = 4kB, max unit = 128kB allows IO sizes of 4kB, 8kB,
16kB, 32kB, 64kB and 128kB. How does the kernel infer what that
application data block size is based on a 32kB atomic write vs a
128kB atomic write?

The kernel can't use file offset alignment to infer application
block size, either. e.g. a 16kB write at 128kB could be a single
16kB data block, it could be 2x8kB data blocks, or it could be 4x4kB
data blocks - they all follow the rules you set above. So how does
the kernel know that for two of these cases it is safe to split the
IO at 8kB, and for one it isn't safe at all?

AFAICS, there is no way the kernel can accurately derive this sort
of information, so any assumptions that the "kernel can infer the
application data layout" to split IOs correctly simply won't work.
And that is very important because we are talking about operations that
provide data persistence guarantees....

> If userspace wants a guarantee of no splitting at all in its write, then it
> may issue a write for a single userspace data block, e.g. userspace block
> size is 16KB, then write at a file offset aligned to 16KB and a total write
> size of 16KB will be guaranteed to be written atomically by the device.

Exactly what has "userspace block size" got to do with the kernel
providing a guarantee that a RWF_ATOMIC write of a 16kB buffer at
offset 16kB will be written atomically?

> > > What happens when we get hardware that can do atomic writes at any
> > alignment, of any size up to atomic_write_max_bytes? Because this
> > interface defines atomic writes as "must be a multiple of 2 of
> > atomic_write_unit_min" then hardware that can do atomic writes of
> > any size can not be effectively utilised by this interface....
> > 
> > > user applications should only pay attention to what we return from statx,
> > > that being atomic_write_unit_min and atomic_write_unit_max.
> > > 
> > > atomic_write_max_bytes and atomic_write_boundary is only relevant to the
> > > block layer.
> > If applications can issue multi-atomic_write_unit_max-block
> > writes as a single, non-atomic, multi-bio RWF_ATOMIC pwritev2() IO
> > and such IO is constrained to atomic_write_max_bytes, then
> > atomic_write_max_bytes is most definitely relevant to user
> > applications.
> 
> But we still do not guarantee that multi-atomic_write_unit_max-block writes
> as a single, non-atomic, multi-bio RWF_ATOMIC pwritev2() IO and such IO is
> constrained to atomic_write_max_bytes will be written atomically by the
> device.
>
> Three things may happen in the kernel:
> - we may need to split due to atomic boundary

Block layer rejects the IO - cannot be performed atomically.

> - we may need to split due to the write spanning discontig extents

Filesystem rejects the IO - cannot be performed atomically.

> - atomic_write_max_bytes may be much larger than what we could fit in a bio,
> so may need multiple bios

Filesystem/blockdev rejects the IO - cannot be performed atomically.

> And maybe more which does not come to mind.

Relevant layer rejects the IO - cannot be performed atomically.

> So I am not sure what value there is in reporting atomic_write_max_bytes to
> the user. The description would need to be something like "we guarantee that
> if the total write length is greater than atomic_write_max_bytes, then all
> data will never be submitted to the device atomically. Otherwise it might
> be".

Exactly my point - there's a change of guarantee that the kernel
provides userspace at that point, and hence application developers
need to know it exists and, likely, be able to discover that
threshold programmatically.

But this, to me, is a just another symptom of what I see as the
wider issue here: trying to allow RWF_ATOMIC IO to do more than a
*single atomic IO*.

This reeks of premature API optimisation. We should make
RWF_ATOMIC do one thing, and one thing only: guaranteed single
atomic IO submission.

It doesn't matter what data userspace is trying to write atomically;
it only matters that the kernel submits the write as a single atomic
unit to the hardware which then guarantees that it completes the
whole IO as a single atomic unit.

What functionality the hardware can provide is largely irrelevant
here; it's the IO semantics that we guarantee userspace that matter.
The kernel interface needs to have simple, well defined behaviour
and provide clear data persistence guarantees.

Once we have that, we can optimise both the applications and the
kernel implementation around that behaviour and guarantees. e.g.
adjacent IO merging (either in the application or in the block
layer), using AIO/io_uring with completion to submission ordering,
etc.

There are many well known IO optimisation techniques that do not
require the kernel to infer or assume the format of the data in the
user buffers as this current API does. Make the API simple and hard
to get wrong first, then optimise from there....



> > We already have direct IO alignment and size constraints in statx(),
> > so why wouldn't we just reuse those variables when the user requests
> > atomic limits for DIO?
> > 
> > i.e. if STATX_DIOALIGN is set, we return normal DIO alignment
> > constraints. If STATX_DIOALIGN_ATOMIC is set, we return the atomic
> > DIO alignment requirements in those variables.....
> > 
> > Yes, we probably need the dio max size to be added to statx for
> > this. Historically speaking, I wanted statx to support this in the
> > first place because that's what we were already giving userspace
> > with XFS_IOC_DIOINFO and we already knew that atomic IO when it came
> > along would require a bound maximum IO size much smaller than normal
> > DIO limits.  i.e.:
> > 
> > struct dioattr {
> >          __u32           d_mem;          /* data buffer memory alignment */
> >          __u32           d_miniosz;      /* min xfer size                */
> >          __u32           d_maxiosz;      /* max xfer size                */
> > };
> > 
> > where d_miniosz defined the alignment and size constraints for DIOs.
> > 
> > If we simply document that STATX_DIOALIGN_ATOMIC returns minimum
> > (unit) atomic IO size and alignment in statx->dio_offset_align (as
> > per STATX_DIOALIGN) and the maximum atomic IO size in
> > statx->dio_max_iosize, then we don't burn up anywhere near as much
> > space in the statx structure....
> 
> ok, so you are saying to unionize them, right? That would seem reasonable to
> me.

No, I don't recommend unionising them. RWF_ATOMIC only applies to
direct IO, so if the application asks for ATOMIC DIO limits, we put
the atomic dio limits in the dio limits variables rather than the
looser non-atomic dio limits......

Cheers,

Dave.
Dave Chinner May 5, 2023, 11:31 p.m. UTC | #8
On Fri, May 05, 2023 at 10:47:19PM +0000, Eric Biggers wrote:
> On Fri, May 05, 2023 at 08:26:23AM +1000, Dave Chinner wrote:
> > > ok, we can do that but would also then make statx field 64b. I'm fine with
> > > that if it is wise to do so - I don't want to wastefully use up an
> > > extra 2 x 32b in struct statx.
> > 
> > Why do we need specific variables for DIO atomic write alignment
> > limits? We already have direct IO alignment and size constraints in statx(),
> > so why wouldn't we just reuse those variables when the user requests
> > atomic limits for DIO?
> > 
> > i.e. if STATX_DIOALIGN is set, we return normal DIO alignment
> > constraints. If STATX_DIOALIGN_ATOMIC is set, we return the atomic
> > DIO alignment requirements in those variables.....
> > 
> > Yes, we probably need the dio max size to be added to statx for
> > this. Historically speaking, I wanted statx to support this in the
> > first place because that's what we were already giving userspace
> > with XFS_IOC_DIOINFO and we already knew that atomic IO when it came
> > along would require a bound maximum IO size much smaller than normal
> > DIO limits.  i.e.:
> > 
> > struct dioattr {
> >         __u32           d_mem;          /* data buffer memory alignment */
> >         __u32           d_miniosz;      /* min xfer size                */
> >         __u32           d_maxiosz;      /* max xfer size                */
> > };
> > 
> > where d_miniosz defined the alignment and size constraints for DIOs.
> > 
> > If we simply document that STATX_DIOALIGN_ATOMIC returns minimum
> > (unit) atomic IO size and alignment in statx->dio_offset_align (as
> > per STATX_DIOALIGN) and the maximum atomic IO size in
> > statx->dio_max_iosize, then we don't burn up anywhere near as much
> > space in the statx structure....
> 
> I don't think that's how statx() is meant to work.  The request mask is a bitmask, and the user can
> request an arbitrary combination of different items.  For example, the user could request both
> STATX_DIOALIGN and STATX_WRITE_ATOMIC at the same time.  That doesn't work if different items share
> the same fields.

Sure it does - what is contained in the field on return is defined
by the result mask. In this case, whatever the filesystem puts in
the DIO fields will match which flag it asserts in the result mask.

i.e. if the application wants RWF_ATOMIC and so asks for STATX_DIOALIGN |
STATX_DIOALIGN_ATOMIC in the request mask then:

- if the filesystem does not support RWF_ATOMIC it fills in the
  normal DIO alignment values and puts STATX_DIOALIGN in the result
  mask.

  Now the application knows that it can't use RWF_ATOMIC, and it
  doesn't need to do another statx() call to get the dio alignment
  values it needs.

- if the filesystem supports RWF_ATOMIC, it fills in the values with
  the atomic DIO constraints and puts STATX_DIOALIGN_ATOMIC in the
  result mask.

  Now the application knows it can use RWF_ATOMIC and has the atomic
  DIO constraints in the dio alignment fields returned.

This uses the request/result masks exactly as intended, yes?
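
As a purely illustrative userspace sketch of that flow - STATX_DIOALIGN
and the stx_dio_* fields exist today (6.1+ kernel/libc headers), while
STATX_DIOALIGN_ATOMIC is only the name proposed in this thread, so it is
defined below with a made-up value, and query_dio_limits() is just an
example helper:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

#ifndef STATX_DIOALIGN_ATOMIC
#define STATX_DIOALIGN_ATOMIC   0x00010000U     /* hypothetical bit */
#endif

static int query_dio_limits(int fd)
{
        struct statx stx;

        if (statx(fd, "", AT_EMPTY_PATH,
                  STATX_DIOALIGN | STATX_DIOALIGN_ATOMIC, &stx) < 0)
                return -1;

        if (stx.stx_mask & STATX_DIOALIGN_ATOMIC)
                printf("RWF_ATOMIC supported, atomic DIO alignment %u\n",
                       stx.stx_dio_offset_align);
        else if (stx.stx_mask & STATX_DIOALIGN)
                printf("no RWF_ATOMIC, plain DIO alignment %u\n",
                       stx.stx_dio_offset_align);
        return 0;
}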

Cheers,

Dave.
Eric Biggers May 6, 2023, 12:08 a.m. UTC | #9
On Sat, May 06, 2023 at 09:31:52AM +1000, Dave Chinner wrote:
> On Fri, May 05, 2023 at 10:47:19PM +0000, Eric Biggers wrote:
> > On Fri, May 05, 2023 at 08:26:23AM +1000, Dave Chinner wrote:
> > > > ok, we can do that but would also then make statx field 64b. I'm fine with
> > > > that if it is wise to do so - I don't want to wastefully use up an
> > > > extra 2 x 32b in struct statx.
> > > 
> > > Why do we need specific variables for DIO atomic write alignment
> > > limits? We already have direct IO alignment and size constraints in statx(),
> > > so why wouldn't we just reuse those variables when the user requests
> > > atomic limits for DIO?
> > > 
> > > i.e. if STATX_DIOALIGN is set, we return normal DIO alignment
> > > constraints. If STATX_DIOALIGN_ATOMIC is set, we return the atomic
> > > DIO alignment requirements in those variables.....
> > > 
> > > Yes, we probably need the dio max size to be added to statx for
> > > this. Historically speaking, I wanted statx to support this in the
> > > first place because that's what we were already giving userspace
> > > with XFS_IOC_DIOINFO and we already knew that atomic IO when it came
> > > along would require a bound maximum IO size much smaller than normal
> > > DIO limits.  i.e.:
> > > 
> > > struct dioattr {
> > >         __u32           d_mem;          /* data buffer memory alignment */
> > >         __u32           d_miniosz;      /* min xfer size                */
> > >         __u32           d_maxiosz;      /* max xfer size                */
> > > };
> > > 
> > > where d_miniosz defined the alignment and size constraints for DIOs.
> > > 
> > > If we simply document that STATX_DIOALIGN_ATOMIC returns minimum
> > > (unit) atomic IO size and alignment in statx->dio_offset_align (as
> > > per STATX_DIOALIGN) and the maximum atomic IO size in
> > > statx->dio_max_iosize, then we don't burn up anywhere near as much
> > > space in the statx structure....
> > 
> > I don't think that's how statx() is meant to work.  The request mask is a bitmask, and the user can
> > request an arbitrary combination of different items.  For example, the user could request both
> > STATX_DIOALIGN and STATX_WRITE_ATOMIC at the same time.  That doesn't work if different items share
> > the same fields.
> 
> Sure it does - what is contained in the field on return is defined
> by the result mask. In this case, whatever the filesystem puts in
> the DIO fields will match which flag it asserts in the result mask.
> 
> i.e. if the application wants RWF_ATOMIC and so asks for STATX_DIOALIGN |
> STATX_DIOALIGN_ATOMIC in the request mask then:
> 
> - if the filesystem does not support RWF_ATOMIC it fills in the
>   normal DIO alignment values and puts STATX_DIOALIGN in the result
>   mask.
> 
>   Now the application knows that it can't use RWF_ATOMIC, and it
>   doesn't need to do another statx() call to get the dio alignment
>   values it needs.
> 
> - if the filesystem supports RWF_ATOMIC, it fills in the values with
>   the atomic DIO constraints and puts STATX_DIOALIGN_ATOMIC in the
>   result mask.
> 
>   Now the application knows it can use RWF_ATOMIC and has the atomic
>   DIO constraints in the dio alignment fields returned.
> 
> This uses the request/result masks exactly as intended, yes?
> 

We could certainly implement some scheme like that, but I don't think that was
how statx() was intended to work.  I think that each bit in the mask was
intended to correspond to an independent piece of information.

- Eric
John Garry May 6, 2023, 9:38 a.m. UTC | #10
On 06/05/2023 00:18, Dave Chinner wrote:

Hi Dave,

>> Consider iomap generates an atomic write bio for a single userspace block
>> and submits to the block layer - if the block layer needs to split due to
>> block driver request_queue limits, like max_segments, then we're in trouble.
>> So we need to limit atomic_write_unit_max such that this will not occur.
>> That same limit should not apply to atomic_write_max_bytes.
> Except the block layer doesn't provide any mechanism to do
> REQ_ATOMIC IOs larger than atomic_write_unit_max. So in what case
> will atomic_write_max_bytes > atomic_write_unit_max ever be
> relevant to anyone?

atomic_write_max_bytes is only relevant to the block layer.

Consider userspace does a single pwritev2 RWF_ATOMIC call of size 1M and 
offset aligned to 8K, which is 128x 8k userspace data blocks. It so 
happens that this write spans 2x extents right in the middle (for 
simplicity of the example), so iomap produces 2x bios, each of size 
512K. In this example atomic_write_max_bytes is 256K. So when those 2x 
bios are submitted to the block layer, they each need to be split into 
2x 256K bios - so we end up with a total of 4x 256K bios being submitted 
to the device. Splitting into these 4x 256K bios will satisfy the 
guarantee to not split any 8k data blocks.

When the kernel handles this pwritev2 RWF_ATOMIC call, it deduces the 
userspace block size, but does not always split into multiple bios each 
of that size. When iomap or block fops creates a bio, it will still fill 
the bio as large as it can, but also it needs to trim such that it is 
always a multiple of the userspace block size. And when we submit this 
bio, the block layer will only split it when it needs to, e.g. when bio 
exceeds atomic_write_max_bytes in size, or crosses a boundary, or any 
other reason you can see in bio_split_rw(). Please see patch 8/16 for 
how this is done.
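
Roughly, the effect is the following (an illustrative model only, not
the actual patch 8/16 code): each REQ_ATOMIC split is capped by
atomic_write_max_bytes and by the distance to the next boundary, and
then rounded down to a whole number of userspace blocks:

/*
 * Sketch of the split rule described above; all values in bytes,
 * boundary == 0 means the device reports no boundary.
 */
static unsigned int atomic_split_len(unsigned long long start,
                                     unsigned int left,
                                     unsigned int unit,
                                     unsigned int max_bytes,
                                     unsigned int boundary)
{
        unsigned int len = left < max_bytes ? left : max_bytes;

        if (boundary) {
                unsigned int to_boundary = boundary - (start % boundary);

                if (len > to_boundary)
                        len = to_boundary;
        }
        return len - (len % unit);      /* never split a userspace block */
}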

> 
>> b. For NVMe, atomic_write_unit_max and atomic_write_max_bytes which the host
>> reports will be the same (ignoring a.).
>>
>> However for SCSI they may be different. SCSI has its own concept of boundary
>> and it is relevant here. This is confusing as it is very different from NVMe
>> boundary. NVMe is a media boundary really. For SCSI, a boundary is a
>> sub-segment into which the device may split an atomic write operation. For a SCSI
>> device which only supports this boundary mode of operation, we limit
>> atomic_write_unit_max to the max boundary segment size (such that we don't
>> get splitting of an atomic write by the device) and then limit
>> atomic_write_max_bytes to what is known in the spec as "maximum atomic
>> transfer length with boundary". So in this device mode of operation,
>> atomic_write_max_bytes and atomic_write_unit_max should be different.
> But if the application is limited to atomic_write_unit_max sized
> IOs, and that is always less than or equal to the size of the atomic
> write boundary, why does the block layer even need to care about
> this whacky quirk of the SCSI protocol implementation?

The block layer just exposes some atomic write queue limits to the block 
device driver to be set. We tried to make the limits generic so that 
they fit the orthogonal features and wackiness of both NVMe and SCSI.

> 
> The block layer shouldn't even need to be aware that SCSI can split
> "atomic" IOs into smaller individual IOs that result in the larger
> requested IO being non-atomic. The SCSI layer should just expose
> "write with boundary" as the max atomic IO size it supports to the
> block layer.
> 
> At this point, both atomic_write_max_bytes and atomic write
> boundary size are completely irrelevant to anything in the block
> layer or above. If userspace is limited to atomic_write_unit_max IO
> sizes and it is enforced at the ->write_iter() layer, then the block
> layer will never need to split REQ_ATOMIC bios because the entire
> stack has already stated that it guarantees atomic_write_unit_max
> bios will not get split....

Please refer to my first point on this.

> 
> In what cases is hardware that supports atomic_write_max_bytes >
> atomic_write_unit_max actually useful?

This is only possible for SCSI. As I mentioned before, SCSI supports its 
own boundary feature, and it is very different from NVMe. For SCSI, it 
is a sub-segment into which a device may split an atomic write operation.

So consider an example where the max boundary size of the SCSI device is 
8K, but the max atomic length with boundary is 256K. This would mean 
that an atomic write operation for the device supports up to 32x 8K 
segments. Each of the sub-segments may atomically complete in the 
device. So, in terms of fitting our atomic write request_queue limits 
for this device, atomic_write_max_bytes would be 256K and 
atomic_write_unit_max would be 8k.
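
As a driver-side sketch only (not from this series; the 4K unit_min is
just an assumed value for the example), the example above would be fed
into the limits added by this patch along these lines, with
unit_{min,max} passed in 512B sectors per the current revision:

#include <linux/blkdev.h>

static void example_set_atomic_limits(struct request_queue *q)
{
        blk_queue_atomic_write_max_bytes(q, 256 * 1024);           /* 256K */
        blk_queue_atomic_write_unit_max(q, 8192 >> SECTOR_SHIFT);  /* 8K   */
        blk_queue_atomic_write_unit_min(q, 4096 >> SECTOR_SHIFT);  /* 4K   */
        blk_queue_atomic_write_boundary(q, 0);  /* SCSI: per-I/O, no LBA boundary */
}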

> I can see one situation,
> and one situation only: merging adjacent small REQ_ATOMIC write
> requests into single larger IOs before issuing them to the hardware.
> 
> This is exactly the sort of optimisation the block layers should be
> doing - it fits perfectly with the SCSI "write with boundary"
> behaviour - the merged bios can be split by the hardware at the
> point where they were merged by the block layer, and everything is
> fine because they are independent IOs, not a single RWF_ATOMIC IO
> from userspace. And for NVMe, it allows IOs from small atomic write
> limits (because, say, 16kB RAID stripe unit) to be merged into
> larger atomic IOs with no penalty...

Yes, that is another scenario. FWIW, we disallow merging currently, but 
it should be possible to support it.

> 
> 
>>>>   From your review on the iomap patch, I assume that now you realise that we
>>>> are proposing a write which may include multiple application data blocks
>>>> (each limited in size to atomic_write_unit_max), and the limit in total size
>>>> of that write is atomic_write_max_bytes.
>>> I still don't get it - you haven't explained why/what an application
>>> atomic block write might be, nor why the block device should be
>>> determining the size of application data blocks, etc.  If the block
>>> device can do 128kB atomic writes, why wouldn't the device allow the
>>> application to do 128kB atomic writes if they've aligned the atomic
>>> write correctly?
>> An application block needs to be:
>> - sized at a power-of-two
>> - sized between atomic_write_unit_min and atomic_write_unit_max, inclusive
>> - naturally aligned
>>
>> Please consider that the application does not explicitly tell the kernel the
>> size of its data blocks, it's implied from the size of the write and file
>> offset. So, assuming that userspace follows the rules properly when issuing
>> a write, the kernel may deduce the application block size and ensure only
>> that each individual user data block is not split.
> That's just *gross*. The kernel has no business assuming anything
> about the data layout inside an IO request. The kernel cannot assume
> that the application uses a single IO size for atomic writes when it
> explicitly provides a range of IO sizes that the application can use.
> 
> e.g. min unit = 4kB, max unit = 128kB allows IO sizes of 4kB, 8kB,
> 16kB, 32kB, 64kB and 128kB. How does the kernel infer what that
> application data block size is based on a 32kB atomic write vs a
> 128kB atomic write?

 From the requirement that userspace always naturally aligns writes to 
its own block size.

If a write so happens to be sized and aligned such that it could be 16x 
4k, or 8x 8k, or 4x 16K, or 2x 32K, or 1x 64K, then the kernel will 
always assume the largest size, i.e. we will assume 64K in this example, 
and always submit a 64K atomic write operation to the device.
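
As an illustration only (not the series code), that deduction is just
the largest power of two dividing both the file offset and the length,
clamped to the advertised unit limits:

#include <stdint.h>

static uint32_t deduce_unit(uint64_t pos, uint64_t len,
                            uint32_t unit_min, uint32_t unit_max)
{
        uint64_t align = pos | len;
        uint64_t unit;

        if (!len)
                return 0;
        unit = align & -align;          /* lowest set bit */
        if (unit > unit_max)
                unit = unit_max;
        return unit >= unit_min ? (uint32_t)unit : 0;   /* 0: reject */
}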

We're coming at this from the database perspective, which generally uses 
fixed block sizes.

> 
> The kernel can't use file offset alignment to infer application
> block size, either. e.g. a 16kB write at 128kB could be a single
> 16kB data block, it could be 2x8kB data blocks, or it could be 4x4kB
> data blocks - they all follow the rules you set above. So how does
> the kernel know that for two of these cases it is safe to split the
> IO at 8kB, and for one it isn't safe at all?

As above, the kernel assumes the largest block size which fits the 
length and alignment of the write. In doing so, it will always satisfy 
requirement to atomically write userspace data blocks.

> 
> AFAICS, there is no way the kernel can accurately derive this sort
> of information, so any assumptions that the "kernel can infer the
> application data layout" to split IOs correctly simply won't work.
> And that is very important because we are talking about operations that
> provide data persistence guarantees....

What I describe is not ideal, and it would be nice for userspace to be 
able to explicitly tell the kernel its block size, so as to avoid any doubt 
and catch any userspace misbehavior.

Could there be a way to do this for both block devices and FS files, like 
fcntl?

> 
>> If userspace wants a guarantee of no splitting at all in its write, then it
>> may issue a write for a single userspace data block, e.g. userspace block
>> size is 16KB, then write at a file offset aligned to 16KB and a total write
>> size of 16KB will be guaranteed to be written atomically by the device.
> Exactly what has "userspace block size" got to do with the kernel
> providing a guarantee that a RWF_ATOMIC write of a 16kB buffer at
> offset 16kB will be written atomically?

Please let me know if I have not explained this well enough.

> 
>>> What happens when we get hardware that can do atomic writes at any
>>> alignment, of any size up to atomic_write_max_bytes? Because this
>>> interface defines atomic writes as "must be a multiple of 2 of
>>> atomic_write_unit_min" then hardware that can do atomic writes of
>>> any size can not be effectively utilised by this interface....
>>>
>>>> user applications should only pay attention to what we return from statx,
>>>> that being atomic_write_unit_min and atomic_write_unit_max.
>>>>
>>>> atomic_write_max_bytes and atomic_write_boundary is only relevant to the
>>>> block layer.
>>> If applications can issue multi-atomic_write_unit_max-block
>>> writes as a single, non-atomic, multi-bio RWF_ATOMIC pwritev2() IO
>>> and such IO is constrained to atomic_write_max_bytes, then
>>> atomic_write_max_bytes is most definitely relevant to user
>>> applications.
>> But we still do not guarantee that multi-atomic_write_unit_max-block writes
>> as a single, non-atomic, multi-bio RWF_ATOMIC pwritev2() IO and such IO is
>> constrained to atomic_write_max_bytes will be written atomically by the
>> device.
>>
>> Three things may happen in the kernel:
>> - we may need to split due to atomic boundary
> Block layer rejects the IO - cannot be performed atomically.

But what good is that to the user?

As far as the user is concerned, they tried to write some data and the 
kernel rejected it. They don't know why. Even worse is that there is 
nothing which they can do about it, apart from trying smaller sized 
writes, which may not suit them.

In an ideal world we would not have atomic write boundaries or discontig 
FS extents or bio limits, but that is not what we have.

> 
>> - we may need to split due to the write spanning discontig extents
> Filesystem rejects the IO - cannot be performed atomically.
> 
>> - atomic_write_max_bytes may be much larger than what we could fit in a bio,
>> so may need multiple bios
> Filesystem/blockdev rejects the IO - cannot be performed atomically.
> 
>> And maybe more which does not come to mind.
> Relevant layer rejects the IO - cannot be performed atomically.
> 
>> So I am not sure what value there is in reporting atomic_write_max_bytes to
>> the user. The description would need to be something like "we guarantee that
>> if the total write length is greater than atomic_write_max_bytes, then all
>> data will never be submitted to the device atomically. Otherwise it might
>> be".
> Exactly my point - there's a change of guarantee that the kernel
> provides userspace at that point, and hence application developers
> need to know it exists and, likely, be able to discover that
> threshold programmatically.

hmmm, ok, if you think it would help the userspace programmer.

> 
> But this, to me, is a just another symptom of what I see as the
> wider issue here: trying to allow RWF_ATOMIC IO to do more than a
> *single atomic IO*.
> 
> This reeks of premature API optimisation. We should make
> RWF_ATOMIC do one thing, and one thing only: guaranteed single
> atomic IO submission.
> 
> It doesn't matter what data userspace is trying to write atomically;
> it only matters that the kernel submits the write as a single atomic
> unit to the hardware which then guarantees that it completes the
> whole IO as a single atomic unit.
> 
> What functionality the hardware can provide is largely irrelevant
> here; it's the IO semantics that we guarantee userspace that matter.
> The kernel interface needs to have simple, well defined behaviour
> and provide clear data persistence guarantees.

ok, I'm hearing you. So we could just say to the user: the rule is that 
you are allowed to submit a power-of-2 sized write of size between 
atomic_write_unit_min and atomic_write_unit_max and it must be naturally 
aligned. Nothing else is permitted for RWF_ATOMIC.
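
In userspace terms a permitted write would then look something like the
sketch below (RWF_ATOMIC is the flag proposed by this series, so the
define is only a placeholder, and atomic_write_block() is just an
illustrative helper name):

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/uio.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC      0x00000040      /* placeholder value */
#endif

/* write one naturally aligned, power-of-two sized application block */
static ssize_t atomic_write_block(int fd, const void *buf, size_t blksz,
                                  off_t offset)
{
        struct iovec iov = {
                .iov_base = (void *)buf,
                .iov_len  = blksz,      /* unit_min <= blksz <= unit_max */
        };

        /* offset must be a multiple of blksz (natural alignment) */
        return pwritev2(fd, &iov, 1, offset, RWF_ATOMIC);
}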

But then for a 10M write of 8K blocks, we have userspace issuing 1280x 
pwritev2() calls, which isn't efficient. However, if for example 
atomic_write_unit_max was 128K, the userspace app could issue 80x 
pwritev2() calls. Still, not ideal. The block layer could be merging a 
lot of those writes, but we still have overhead per pwritev2(). And 
block layer merging takes many CPU cycles also.

In our proposal, we're just giving userspace the option to write a large 
range of blocks in a single pwritev2() call, hopefully improving 
performance. Most often, we would not be getting any splitting - 
splitting should only really happen at the extremes, so it should give 
good performance. We don't have performance figures yet, though, to 
reinforce this point.

> 
> Once we have that, we can optimise both the applications and the
> kernel implementation around that behaviour and guarantees. e.g.
> adjacent IO merging (either in the application or in the block
> layer), using AIO/io_uring with completion to submission ordering,
> etc.
> 
> There are many well known IO optimisation techniques that do not
> require the kernel to infer or assume the format of the data in the
> user buffers as this current API does. Make the API simple and hard
> to get wrong first, then optimise from there....
> 

OK, so these could help. We need performance figures...

> 
> 
>>> We already have direct IO alignment and size constraints in statx(),
>>> so why wouldn't we just reuse those variables when the user requests
>>> atomic limits for DIO?
>>>
>>> i.e. if STATX_DIOALIGN is set, we return normal DIO alignment
>>> constraints. If STATX_DIOALIGN_ATOMIC is set, we return the atomic
>>> DIO alignment requirements in those variables.....
>>>
>>> Yes, we probably need the dio max size to be added to statx for
>>> this. Historically speaking, I wanted statx to support this in the
>>> first place because that's what we were already giving userspace
>>> with XFS_IOC_DIOINFO and we already knew that atomic IO when it came
>>> along would require a bound maximum IO size much smaller than normal
>>> DIO limits.  i.e.:
>>>
>>> struct dioattr {
>>>           __u32           d_mem;          /* data buffer memory alignment */
>>>           __u32           d_miniosz;      /* min xfer size                */
>>>           __u32           d_maxiosz;      /* max xfer size                */
>>> };
>>>
>>> where d_miniosz defined the alignment and size constraints for DIOs.
>>>
>>> If we simply document that STATX_DIOALIGN_ATOMIC returns minimum
>>> (unit) atomic IO size and alignment in statx->dio_offset_align (as
>>> per STATX_DIOALIGN) and the maximum atomic IO size in
>>> statx->dio_max_iosize, then we don't burn up anywhere near as much
>>> space in the statx structure....
>> ok, so you are saying to unionize them, right? That would seem reasonable to
>> me.
> No, I don't recommend unionising them.

ah, ok, I assumed that we would not support asking for both 
STATX_DIOALIGN and STATX_DIOATOMIC. I should have read your proposal 
more closely.

> RWF_ATOMIC only applies to
> direct IO, so if the application asks for ATOMIC DIO limits, we put
> the atomic dio limits in the dio limits variables rather than the
> looser non-atomic dio limits......

I see. TBH, I'm not sure about this and will let other experts comment.

Thanks,
John
Martin K. Petersen May 7, 2023, 1:59 a.m. UTC | #11
Darrick,

> Could a SCSI device could advertise 512b LBAs, 4096b physical blocks,
> a 64k atomic_write_unit_max, and a 1MB maximum transfer length
> (atomic_write_max_bytes)?

Yes.

> And does that mean that application software can send one 64k-aligned
> write and expect it either to be persisted completely or not at all?

Yes.

> And, does that mean that the application can send up to 16 of these
> 64k-aligned blocks as a single 1MB IO and expect that each of those 16
> blocks will either be persisted entirely or not at all?

Yes.

> There doesn't seem to be any means for the device to report /which/ of
> the 16 were persisted, which is disappointing. But maybe the
> application encodes LSNs and can tell after the fact that something
> went wrong, and recover?

Correct. Although we traditionally haven't had too much fun with partial
completion for sequential I/O either.

> If the same device reports a 2048b atomic_write_unit_min, does that mean
> that I can send between 2 and 64k of data as a single atomic write and
> that's ok?  I assume that this weird situation (512b LBA, 4k physical,
> 2k atomic unit min) requires some fancy RMW but that the device is
> prepared to cr^Wpersist that correctly?

Yes.

It would not make much sense for a device to report a minimum atomic
granularity smaller than the reported physical block size. But in theory
it could.

> What if the device also advertises a 128k atomic_write_boundary?
> That means that a 2k atomic block write will fail if it starts at 127k,
> but if it starts at 126k then that's ok.  Right?

Correct.

> As for avoiding splits in the block layer, I guess that also means that
> someone needs to reduce atomic_write_unit_max and atomic_write_boundary
> if (say) some sysadmin decides to create a raid0 of these devices with a
> 32k stripe size?

Correct. Atomic limits will need to be stacked for MD and DM like we do
with the remaining queue limits.

> It sounds like NVME is simpler in that it would report 64k for both the
> max unit and the max transfer length?  And for the 1M write I mentioned
> above, the application must send 16 individual writes?

Correct.

> With my app developer hat on, the simplest mental model of this is that
> if I want to persist a blob of data that is larger than one device LBA,
> then atomic_write_unit_min <= blob size <= atomic_write_unit_max must be
> true, and the LBA range for the write cannot cross an atomic_write_boundary.
>
> Does that sound right?

Yep.
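
In code terms that mental model amounts to something like this purely
illustrative helper (the power-of-two size and natural alignment checks
come from the rules discussed elsewhere in the thread, not from your
summary above):

#include <stdbool.h>
#include <stdint.h>

static bool atomic_write_ok(uint64_t pos, uint32_t len,
                            uint32_t unit_min, uint32_t unit_max,
                            uint32_t boundary)
{
        if (len < unit_min || len > unit_max)
                return false;
        if (len & (len - 1))                    /* power-of-two size */
                return false;
        if (pos & (len - 1))                    /* naturally aligned offset */
                return false;
        if (boundary && pos / boundary != (pos + len - 1) / boundary)
                return false;                   /* crosses a boundary */
        return true;
}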

> Going back to my sample device above, the XFS buffer cache could write
> individual 4k filesystem metadata blocks using REQ_ATOMIC because 4k is
> between the atomic write unit min/max, 4k metadata blocks will never
> cross a 128k boundary, and we'd never have to worry about torn writes
> in metadata ever again?

Correct.

> Furthermore, if I want to persist a bunch of blobs in a contiguous LBA
> range and atomic_write_max_bytes > atomic_write_unit_max, then I can do
> that with a single direct write?

Yes.

> I'm assuming that the blobs in the middle of the range must all be
> exactly atomic_write_unit_max bytes in size?

If you care about each blob being written atomically, yes.

> And I had better be prepared to (I guess) re-read the entire range
> after the system goes down to find out if any of them did or did not
> persist?

If you crash or get an I/O error, then yes. There is no way to inquire
which blobs were written. Just like we don't know which LBAs were
written if the OS crashes in the middle of a regular write operation.
Martin K. Petersen May 7, 2023, 2:35 a.m. UTC | #12
Dave,

> But if the application is limited to atomic_write_unit_max sized
> IOs, and that is always less than or equal to the size of the atomic
> write boundary, why does the block layer even need to care about
> this whacky quirk of the SCSI protocol implementation?

Dealing with boundaries is mainly an NVMe issue. NVMe boundaries are
fixed in LBA space. SCSI boundaries are per-I/O.

> In what cases is hardware that supports atomic_write_max_bytes >
> atomic_write_unit_max actually useful?

The common case is a database using 16K blocks and wanting to do 1M
writes for performance reasons.

> There are many well known IO optimisation techniques that do not
> require the kernel to infer or assume the format of the data in the
> user buffers as this current API does. Make the API simple and hard
> to get wrong first, then optimise from there....

We discussed whether it made sense to have an explicit interface to set
an "application" block size when creating a file. I am not against it,
but our experience is that it doesn't buy you anything over what the
careful alignment of powers-of-two provides. As long as everything is
properly aligned, there is no need for the kernel to infer or assume
anything. It's the application's business what it is doing inside the
file.
Mike Snitzer May 9, 2023, 12:19 a.m. UTC | #13
On Wed, May 3, 2023 at 2:40 PM John Garry <john.g.garry@oracle.com> wrote:
>
> From: Himanshu Madhani <himanshu.madhani@oracle.com>
>
> Add the following limits:
> - atomic_write_boundary
> - atomic_write_max_bytes
> - atomic_write_unit_max
> - atomic_write_unit_min
>
> Signed-off-by: Himanshu Madhani <himanshu.madhani@oracle.com>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  Documentation/ABI/stable/sysfs-block | 42 +++++++++++++++++++++
>  block/blk-settings.c                 | 56 ++++++++++++++++++++++++++++
>  block/blk-sysfs.c                    | 33 ++++++++++++++++
>  include/linux/blkdev.h               | 23 ++++++++++++
>  4 files changed, 154 insertions(+)
>

...

> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 896b4654ab00..e21731715a12 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -59,6 +59,9 @@ void blk_set_default_limits(struct queue_limits *lim)
>         lim->zoned = BLK_ZONED_NONE;
>         lim->zone_write_granularity = 0;
>         lim->dma_alignment = 511;
> +       lim->atomic_write_unit_min = lim->atomic_write_unit_max = 1;
> +       lim->atomic_write_max_bytes = 512;
> +       lim->atomic_write_boundary = 0;
>  }

Not seeing required changes to blk_set_stacking_limits() nor blk_stack_limits().

Sorry to remind you of DM and MD limits stacking requirements. ;)

Mike
John Garry May 17, 2023, 5:02 p.m. UTC | #14
On 09/05/2023 01:19, Mike Snitzer wrote:
> On Wed, May 3, 2023 at 2:40 PM John Garry <john.g.garry@oracle.com> wrote:
>>
>> From: Himanshu Madhani <himanshu.madhani@oracle.com>
>>
>> Add the following limits:
>> - atomic_write_boundary
>> - atomic_write_max_bytes
>> - atomic_write_unit_max
>> - atomic_write_unit_min
>>
>> Signed-off-by: Himanshu Madhani <himanshu.madhani@oracle.com>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   Documentation/ABI/stable/sysfs-block | 42 +++++++++++++++++++++
>>   block/blk-settings.c                 | 56 ++++++++++++++++++++++++++++
>>   block/blk-sysfs.c                    | 33 ++++++++++++++++
>>   include/linux/blkdev.h               | 23 ++++++++++++
>>   4 files changed, 154 insertions(+)
>>
> 
> ...
> 
>> diff --git a/block/blk-settings.c b/block/blk-settings.c
>> index 896b4654ab00..e21731715a12 100644
>> --- a/block/blk-settings.c
>> +++ b/block/blk-settings.c
>> @@ -59,6 +59,9 @@ void blk_set_default_limits(struct queue_limits *lim)
>>          lim->zoned = BLK_ZONED_NONE;
>>          lim->zone_write_granularity = 0;
>>          lim->dma_alignment = 511;
>> +       lim->atomic_write_unit_min = lim->atomic_write_unit_max = 1;
>> +       lim->atomic_write_max_bytes = 512;
>> +       lim->atomic_write_boundary = 0;
>>   }
> 
> Not seeing required changes to blk_set_stacking_limits() nor blk_stack_limits().
> 
> Sorry to remind you of DM and MD limits stacking requirements. ;)
> 

Hi Mike,

Sorry for the slow response.

The idea is that initially we would not be adding stacked device 
support, so we can leave the atomic defaults at the smallest unit we 
always consider atomic, i.e. the logical block size / fixed 512B sector 
size.
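
If/when that stacking support is added, I would expect the combination
to look roughly like the sketch below (purely illustrative, not part of
this series, and the exact semantics still need to be worked out):

#include <stdint.h>

struct atomic_limits {
        uint32_t unit_min;
        uint32_t unit_max;
        uint32_t max_bytes;
        uint32_t boundary;
};

/* fold a bottom device's limits into the stacked (top) limits */
static void stack_atomic_limits(struct atomic_limits *t,
                                const struct atomic_limits *b)
{
        if (b->unit_min > t->unit_min)
                t->unit_min = b->unit_min;
        if (b->unit_max < t->unit_max)
                t->unit_max = b->unit_max;
        if (b->max_bytes < t->max_bytes)
                t->max_bytes = b->max_bytes;
        if (b->boundary && (!t->boundary || b->boundary < t->boundary))
                t->boundary = b->boundary;
}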

Thanks,
John

Patch

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 282de3680367..f3ed9890e03b 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -21,6 +21,48 @@  Description:
 		device is offset from the internal allocation unit's
 		natural alignment.
 
+What:		/sys/block/<disk>/atomic_write_max_bytes
+Date:		May 2023
+Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
+Description:
+		[RO] This parameter specifies the maximum atomic write
+		size reported by the device. An atomic write operation
+		must not exceed this number of bytes.
+
+
+What:		/sys/block/<disk>/atomic_write_unit_min
+Date:		May 2023
+Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
+Description:
+		[RO] This parameter specifies the smallest block which can
+		be written atomically with an atomic write operation. All
+		atomic write operations must begin at an
+		atomic_write_unit_min boundary and must be multiples of
+		atomic_write_unit_min. This value must be a power-of-two.
+
+
+What:		/sys/block/<disk>/atomic_write_unit_max
+Date:		January 2023
+Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
+Description:
+		[RO] This parameter defines the largest block which can be
+		written atomically with an atomic write operation. This
+		value must be a multiple of atomic_write_unit_min and must
+		be a power-of-two.
+
+
+What:		/sys/block/<disk>/atomic_write_boundary
+Date:		May 2023
+Contact:	Himanshu Madhani <himanshu.madhani@oracle.com>
+Description:
+		[RO] A device may need to internally split I/Os which
+		straddle a given logical block address boundary. In that
+		case a single atomic write operation will be processed as
+		one or more sub-operations which each complete atomically.
+		This parameter specifies the size in bytes of the atomic
+		boundary if one is reported by the device. This value must
+		be a power-of-two.
+
 
 What:		/sys/block/<disk>/diskseq
 Date:		February 2021
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 896b4654ab00..e21731715a12 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -59,6 +59,9 @@  void blk_set_default_limits(struct queue_limits *lim)
 	lim->zoned = BLK_ZONED_NONE;
 	lim->zone_write_granularity = 0;
 	lim->dma_alignment = 511;
+	lim->atomic_write_unit_min = lim->atomic_write_unit_max = 1;
+	lim->atomic_write_max_bytes = 512;
+	lim->atomic_write_boundary = 0;
 }
 
 /**
@@ -183,6 +186,59 @@  void blk_queue_max_discard_sectors(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_queue_max_discard_sectors);
 
+/**
+ * blk_queue_atomic_write_max_bytes - set max bytes supported by
+ * the device for atomic write operations.
+ * @q:  the request queue for the device
+ * @size: maximum bytes supported
+ */
+void blk_queue_atomic_write_max_bytes(struct request_queue *q,
+				      unsigned int size)
+{
+	q->limits.atomic_write_max_bytes = size;
+}
+EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes);
+
+/**
+ * blk_queue_atomic_write_boundary - Device's logical block address space
+ * which an atomic write should not cross.
+ * @q:  the request queue for the device
+ * @size: size in bytes. Must be a power-of-two.
+ */
+void blk_queue_atomic_write_boundary(struct request_queue *q,
+				     unsigned int size)
+{
+	q->limits.atomic_write_boundary = size;
+}
+EXPORT_SYMBOL(blk_queue_atomic_write_boundary);
+
+/**
+ * blk_queue_atomic_write_unit_min - smallest unit that can be written
+ *				     atomically to the device.
+ * @q:  the request queue for the device
+ * @sectors: must be a power-of-two.
+ */
+void blk_queue_atomic_write_unit_min(struct request_queue *q,
+				     unsigned int sectors)
+{
+	q->limits.atomic_write_unit_min = sectors;
+}
+EXPORT_SYMBOL(blk_queue_atomic_write_unit_min);
+
+/**
+ * blk_queue_atomic_write_unit_max - largest unit that can be written
+ * atomically to the device.
+ * @q: the request queue for the device
+ * @sectors: must be a power-of-two.
+ */
+void blk_queue_atomic_write_unit_max(struct request_queue *q,
+				     unsigned int sectors)
+{
+	struct queue_limits *limits = &q->limits;
+	limits->atomic_write_unit_max = sectors;
+}
+EXPORT_SYMBOL(blk_queue_atomic_write_unit_max);
+
 /**
  * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase
  * @q:  the request queue for the device
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f1fce1c7fa44..1025beff2281 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -132,6 +132,30 @@  static ssize_t queue_max_discard_segments_show(struct request_queue *q,
 	return queue_var_show(queue_max_discard_segments(q), page);
 }
 
+static ssize_t queue_atomic_write_max_bytes_show(struct request_queue *q,
+						char *page)
+{
+	return queue_var_show(q->limits.atomic_write_max_bytes, page);
+}
+
+static ssize_t queue_atomic_write_boundary_show(struct request_queue *q,
+						char *page)
+{
+	return queue_var_show(q->limits.atomic_write_boundary, page);
+}
+
+static ssize_t queue_atomic_write_unit_min_show(struct request_queue *q,
+						char *page)
+{
+	return queue_var_show(queue_atomic_write_unit_min(q), page);
+}
+
+static ssize_t queue_atomic_write_unit_max_show(struct request_queue *q,
+						char *page)
+{
+	return queue_var_show(queue_atomic_write_unit_max(q), page);
+}
+
 static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(q->limits.max_integrity_segments, page);
@@ -604,6 +628,11 @@  QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
 QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
 QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
 
+QUEUE_RO_ENTRY(queue_atomic_write_max_bytes, "atomic_write_max_bytes");
+QUEUE_RO_ENTRY(queue_atomic_write_boundary, "atomic_write_boundary");
+QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max");
+QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min");
+
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
@@ -661,6 +690,10 @@  static struct attribute *queue_attrs[] = {
 	&queue_discard_max_entry.attr,
 	&queue_discard_max_hw_entry.attr,
 	&queue_discard_zeroes_data_entry.attr,
+	&queue_atomic_write_max_bytes_entry.attr,
+	&queue_atomic_write_boundary_entry.attr,
+	&queue_atomic_write_unit_min_entry.attr,
+	&queue_atomic_write_unit_max_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_write_zeroes_max_entry.attr,
 	&queue_zone_append_max_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 941304f17492..6b6f2992338c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -304,6 +304,11 @@  struct queue_limits {
 	unsigned int		discard_alignment;
 	unsigned int		zone_write_granularity;
 
+	unsigned int		atomic_write_boundary;
+	unsigned int		atomic_write_max_bytes;
+	unsigned int		atomic_write_unit_min;
+	unsigned int		atomic_write_unit_max;
+
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
 	unsigned short		max_discard_segments;
@@ -929,6 +934,14 @@  void blk_queue_zone_write_granularity(struct request_queue *q,
 				      unsigned int size);
 extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
+extern void blk_queue_atomic_write_max_bytes(struct request_queue *q,
+					     unsigned int size);
+extern void blk_queue_atomic_write_unit_max(struct request_queue *q,
+					    unsigned int sectors);
+extern void blk_queue_atomic_write_unit_min(struct request_queue *q,
+					    unsigned int sectors);
+extern void blk_queue_atomic_write_boundary(struct request_queue *q,
+					    unsigned int size);
 void disk_update_readahead(struct gendisk *disk);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
@@ -1331,6 +1344,16 @@  static inline int queue_dma_alignment(const struct request_queue *q)
 	return q ? q->limits.dma_alignment : 511;
 }
 
+static inline unsigned int queue_atomic_write_unit_max(const struct request_queue *q)
+{
+	return q->limits.atomic_write_unit_max << SECTOR_SHIFT;
+}
+
+static inline unsigned int queue_atomic_write_unit_min(const struct request_queue *q)
+{
+	return q->limits.atomic_write_unit_min << SECTOR_SHIFT;
+}
+
 static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
 {
 	return queue_dma_alignment(bdev_get_queue(bdev));