diff mbox series

[v5,4/7] xfs: Support FS_XFLAG_ATOMICWRITES for forcealign

Message ID 20240817094800.776408-5-john.g.garry@oracle.com (mailing list archive)
State New, archived
Headers show
Series block atomic writes for xfs | expand

Commit Message

John Garry Aug. 17, 2024, 9:47 a.m. UTC
Add initial support for new flag FS_XFLAG_ATOMICWRITES for forcealign
enabled.

This flag is a file attribute that mirrors an ondisk inode flag.  Actual
support for untorn file writes (for now) depends on both the iflag and the
underlying storage devices, which we can only really check at statx and
pwritev2() time.  This is the same story as FS_XFLAG_DAX, which signals to
the fs that we should try to enable the fsdax IO path on the file (instead
of the regular page cache), but applications have to query STAT_ATTR_DAX to
find out if they really got that IO path.

Current kernel support for atomic writes is based on HW support (for atomic
writes). As such, it is required to ensure extent alignment with
atomic_write_unit_max so that an atomic write can result in a single
HW-compliant IO operation.

rtvol also guarantees extent alignment, but we are basing support initially
on forcealign, which is not supported for rtvol yet.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/libxfs/xfs_format.h     | 11 +++++--
 fs/xfs/libxfs/xfs_inode_buf.c  | 52 ++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_inode_util.c |  4 +++
 fs/xfs/libxfs/xfs_sb.c         |  2 ++
 fs/xfs/xfs_buf.c               | 15 +++++++++-
 fs/xfs/xfs_buf.h               |  4 ++-
 fs/xfs/xfs_buf_mem.c           |  2 +-
 fs/xfs/xfs_inode.h             |  5 ++++
 fs/xfs/xfs_ioctl.c             | 52 ++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_mount.h             |  2 ++
 fs/xfs/xfs_super.c             | 12 ++++++++
 include/uapi/linux/fs.h        |  1 +
 12 files changed, 157 insertions(+), 5 deletions(-)

Comments

Darrick J. Wong Aug. 21, 2024, 5:07 p.m. UTC | #1
On Sat, Aug 17, 2024 at 09:47:57AM +0000, John Garry wrote:
> Add initial support for new flag FS_XFLAG_ATOMICWRITES for forcealign
> enabled.
> 
> This flag is a file attribute that mirrors an ondisk inode flag.  Actual
> support for untorn file writes (for now) depends on both the iflag and the
> underlying storage devices, which we can only really check at statx and
> pwritev2() time.  This is the same story as FS_XFLAG_DAX, which signals to
> the fs that we should try to enable the fsdax IO path on the file (instead
> of the regular page cache), but applications have to query STAT_ATTR_DAX to
> find out if they really got that IO path.
> 
> Current kernel support for atomic writes is based on HW support (for atomic
> writes). As such, it is required to ensure extent alignment with
> atomic_write_unit_max so that an atomic write can result in a single
> HW-compliant IO operation.
> 
> rtvol also guarantees extent alignment, but we are basing support initially
> on forcealign, which is not supported for rtvol yet.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/libxfs/xfs_format.h     | 11 +++++--
>  fs/xfs/libxfs/xfs_inode_buf.c  | 52 ++++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_inode_util.c |  4 +++
>  fs/xfs/libxfs/xfs_sb.c         |  2 ++
>  fs/xfs/xfs_buf.c               | 15 +++++++++-
>  fs/xfs/xfs_buf.h               |  4 ++-
>  fs/xfs/xfs_buf_mem.c           |  2 +-
>  fs/xfs/xfs_inode.h             |  5 ++++
>  fs/xfs/xfs_ioctl.c             | 52 ++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_mount.h             |  2 ++
>  fs/xfs/xfs_super.c             | 12 ++++++++
>  include/uapi/linux/fs.h        |  1 +
>  12 files changed, 157 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index 04c6cbc943c2..a9f3389438a6 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -353,12 +353,16 @@ xfs_sb_has_compat_feature(
>  #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
>  #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
>  #define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
> +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 31)	/* atomicwrites enabled */

Do you ever see test failures in xfs/270?

> +
>  #define XFS_SB_FEAT_RO_COMPAT_ALL \
>  		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
>  		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
>  		 XFS_SB_FEAT_RO_COMPAT_REFLINK| \
>  		 XFS_SB_FEAT_RO_COMPAT_INOBTCNT | \
> -		 XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
> +		 XFS_SB_FEAT_RO_COMPAT_FORCEALIGN | \
> +		 XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES)
> +
>  #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	~XFS_SB_FEAT_RO_COMPAT_ALL
>  static inline bool
>  xfs_sb_has_ro_compat_feature(
> @@ -1097,6 +1101,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
>  #define XFS_DIFLAG2_NREXT64_BIT 4	/* large extent counters */
>  /* data extent mappings for regular files must be aligned to extent size hint */
>  #define XFS_DIFLAG2_FORCEALIGN_BIT 5
> +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6
>  
>  #define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
>  #define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
> @@ -1104,10 +1109,12 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
>  #define XFS_DIFLAG2_BIGTIME	(1 << XFS_DIFLAG2_BIGTIME_BIT)
>  #define XFS_DIFLAG2_NREXT64	(1 << XFS_DIFLAG2_NREXT64_BIT)
>  #define XFS_DIFLAG2_FORCEALIGN	(1 << XFS_DIFLAG2_FORCEALIGN_BIT)
> +#define XFS_DIFLAG2_ATOMICWRITES	(1 << XFS_DIFLAG2_ATOMICWRITES_BIT)
>  
>  #define XFS_DIFLAG2_ANY \
>  	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
> -	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN)
> +	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN | \
> +	 XFS_DIFLAG2_ATOMICWRITES)
>  
>  static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
>  {
> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
> index 1c59891fa9e2..59933c7df56d 100644
> --- a/fs/xfs/libxfs/xfs_inode_buf.c
> +++ b/fs/xfs/libxfs/xfs_inode_buf.c
> @@ -178,7 +178,10 @@ xfs_inode_from_disk(
>  	struct xfs_inode	*ip,
>  	struct xfs_dinode	*from)
>  {
> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
>  	struct inode		*inode = VFS_I(ip);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct xfs_sb		*sbp = &mp->m_sb;
>  	int			error;
>  	xfs_failaddr_t		fa;
>  
> @@ -261,6 +264,13 @@ xfs_inode_from_disk(
>  	}
>  	if (xfs_is_reflink_inode(ip))
>  		xfs_ifork_init_cow(ip);
> +
> +	if (xfs_inode_has_atomicwrites(ip)) {
> +		if (sbp->sb_blocksize < target->bt_bdev_awu_min ||
> +		    sbp->sb_blocksize * ip->i_extsize > target->bt_bdev_awu_max)

Can this multiplication trigger integer overflows?

> +			ip->i_diflags2 &= ~XFS_DIFLAG2_ATOMICWRITES;

Ondisk iflag updates must use transactions.

Or you can fail IOCB_ATOMIC writes if XFS_DIFLAG2_ATOMICWRITES is set
but the forcealign blocksize doesn't fit with awu_min/max.

> +	}
> +
>  	return 0;
>  
>  out_destroy_data_fork:
> @@ -483,6 +493,40 @@ xfs_dinode_verify_nrext64(
>  	return NULL;
>  }
>  
> +static xfs_failaddr_t
> +xfs_inode_validate_atomicwrites(
> +	struct xfs_mount	*mp,
> +	uint32_t		extsize,
> +	uint64_t		flags2)
> +{
> +	/* superblock rocompat feature flag */
> +	if (!xfs_has_atomicwrites(mp))
> +		return __this_address;
> +
> +	/*
> +	 * forcealign is required, so rely on sanity checks in
> +	 * xfs_inode_validate_forcealign()
> +	 */
> +	if (!(flags2 & XFS_DIFLAG2_FORCEALIGN))
> +		return __this_address;
> +
> +	if (!is_power_of_2(extsize))
> +		return __this_address;
> +
> +	/* Required to guarantee data block alignment */
> +	if (mp->m_sb.sb_agblocks % extsize)
> +		return __this_address;
> +
> +	/* Requires stripe unit+width be a multiple of extsize */
> +	if (mp->m_dalign && (mp->m_dalign % extsize))
> +		return __this_address;
> +
> +	if (mp->m_swidth && (mp->m_swidth % extsize))

IIRC m_dalign and m_swidth can be set at mount time, which can result in
inode verifiers logging corruption errors if those parameters change.  I
think we should validate these two congruencies when setting
FMODE_CAN_ATOMIC_WRITE.

> +		return __this_address;
> +
> +	return NULL;
> +}
> +
>  xfs_failaddr_t
>  xfs_dinode_verify(
>  	struct xfs_mount	*mp,
> @@ -666,6 +710,14 @@ xfs_dinode_verify(
>  			return fa;
>  	}
>  
> +	if (flags2 & XFS_DIFLAG2_ATOMICWRITES) {
> +		fa = xfs_inode_validate_atomicwrites(mp,
> +				be32_to_cpu(dip->di_extsize),
> +				flags2);
> +		if (fa)
> +			return fa;
> +	}
> +
>  	return NULL;
>  }
>  
> diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
> index b264939d8855..dbd5b16e1844 100644
> --- a/fs/xfs/libxfs/xfs_inode_util.c
> +++ b/fs/xfs/libxfs/xfs_inode_util.c
> @@ -82,6 +82,8 @@ xfs_flags2diflags2(
>  		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
>  	if (xflags & FS_XFLAG_FORCEALIGN)
>  		di_flags2 |= XFS_DIFLAG2_FORCEALIGN;
> +	if (xflags & FS_XFLAG_ATOMICWRITES)
> +		di_flags2 |= XFS_DIFLAG2_ATOMICWRITES;
>  
>  	return di_flags2;
>  }
> @@ -130,6 +132,8 @@ xfs_ip2xflags(
>  			flags |= FS_XFLAG_COWEXTSIZE;
>  		if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
>  			flags |= FS_XFLAG_FORCEALIGN;
> +		if (ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES)
> +			flags |= FS_XFLAG_ATOMICWRITES;
>  	}
>  
>  	if (xfs_inode_has_attr_fork(ip))
> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> index e56911553edd..5de8725bf93a 100644
> --- a/fs/xfs/libxfs/xfs_sb.c
> +++ b/fs/xfs/libxfs/xfs_sb.c
> @@ -166,6 +166,8 @@ xfs_sb_version_to_features(
>  		features |= XFS_FEAT_INOBTCNT;
>  	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
>  		features |= XFS_FEAT_FORCEALIGN;
> +	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES)
> +		features |= XFS_FEAT_ATOMICWRITES;
>  	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
>  		features |= XFS_FEAT_FTYPE;
>  	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index aa4dbda7b536..44bee3e2b2bb 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -2060,6 +2060,8 @@ int
>  xfs_init_buftarg(
>  	struct xfs_buftarg		*btp,
>  	size_t				logical_sectorsize,
> +	unsigned int			awu_min,
> +	unsigned int			awu_max,
>  	const char			*descr)
>  {
>  	/* Set up device logical sector size mask */
> @@ -2086,6 +2088,9 @@ xfs_init_buftarg(
>  	btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
>  	btp->bt_shrinker->private_data = btp;
>  	shrinker_register(btp->bt_shrinker);
> +
> +	btp->bt_bdev_awu_min = awu_min;
> +	btp->bt_bdev_awu_max = awu_max;
>  	return 0;
>  
>  out_destroy_io_count:
> @@ -2102,6 +2107,7 @@ xfs_alloc_buftarg(
>  {
>  	struct xfs_buftarg	*btp;
>  	const struct dax_holder_operations *ops = NULL;
> +	unsigned int awu_min = 0, awu_max = 0;
>  
>  #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
>  	ops = &xfs_dax_holder_operations;
> @@ -2115,6 +2121,13 @@ xfs_alloc_buftarg(
>  	btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
>  					    mp, ops);
>  
> +	if (bdev_can_atomic_write(btp->bt_bdev)) {
> +		struct request_queue *q = bdev_get_queue(btp->bt_bdev);
> +
> +		awu_min = queue_atomic_write_unit_min_bytes(q);
> +		awu_max = queue_atomic_write_unit_max_bytes(q);
> +	}
> +
>  	/*
>  	 * When allocating the buftargs we have not yet read the super block and
>  	 * thus don't know the file system sector size yet.
> @@ -2122,7 +2135,7 @@ xfs_alloc_buftarg(
>  	if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
>  		goto error_free;
>  	if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
> -			mp->m_super->s_id))
> +			awu_min, awu_max, mp->m_super->s_id))
>  		goto error_free;
>  
>  	return btp;
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index b1580644501f..3bcd8137d739 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -124,6 +124,8 @@ struct xfs_buftarg {
>  	struct percpu_counter	bt_io_count;
>  	struct ratelimit_state	bt_ioerror_rl;
>  
> +	unsigned int		bt_bdev_awu_min, bt_bdev_awu_max;

Please add a comment here about what these mean.  Not everyone is going
to know what "awu" abbreviates.

> +
>  	/* built-in cache, if we're not using the perag one */
>  	struct xfs_buf_cache	bt_cache[];
>  };
> @@ -393,7 +395,7 @@ bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
>  
>  /* for xfs_buf_mem.c only: */
>  int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize,
> -		const char *descr);
> +		unsigned int awu_min, unsigned int awu_max, const char *descr);
>  void xfs_destroy_buftarg(struct xfs_buftarg *btp);
>  
>  #endif	/* __XFS_BUF_H__ */
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index 9bb2d24de709..af48a8da2f0f 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -93,7 +93,7 @@ xmbuf_alloc(
>  	btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE;
>  	btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1;
>  
> -	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
> +	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, 0, 0, descr);
>  	if (error)
>  		goto out_bcache;
>  
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 336124105c47..cfcb67da12cb 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -321,6 +321,11 @@ static inline bool xfs_inode_has_forcealign(struct xfs_inode *ip)
>  	return ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN;
>  }
>  
> +static inline bool xfs_inode_has_atomicwrites(struct xfs_inode *ip)
> +{
> +	return ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES;
> +}
> +
>  /*
>   * Decide if this file is a realtime file whose data allocation unit is larger
>   * than a single filesystem block.
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index a68ec68e5b92..7af2837779e8 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -502,6 +502,49 @@ xfs_ioctl_setattr_forcealign(
>  	return 0;
>  }
>  
> +
> +/*
> + * Forcealign requires a power-of-2 extent size hint.
> + */
> +static int
> +xfs_ioctl_setattr_atomicwrites(
> +	struct xfs_inode	*ip,
> +	struct fileattr		*fa)
> +{
> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	uint32_t		extsize = XFS_B_TO_FSB(mp, fa->fsx_extsize);
> +	struct xfs_sb		*sbp = &mp->m_sb;
> +
> +	if (!xfs_has_atomicwrites(mp))
> +		return -EINVAL;
> +
> +	if (!(fa->fsx_xflags & FS_XFLAG_FORCEALIGN))
> +		return -EINVAL;
> +
> +	if (!is_power_of_2(extsize))
> +		return -EINVAL;
> +
> +	/* Required to guarantee data block alignment */
> +	if (mp->m_sb.sb_agblocks % extsize)
> +		return -EINVAL;
> +
> +	/* Requires stripe unit+width be a multiple of extsize */
> +	if (mp->m_dalign && (mp->m_dalign % extsize))
> +		return -EINVAL;
> +
> +	if (mp->m_swidth && (mp->m_swidth % extsize))
> +		return -EINVAL;
> +
> +	if (target->bt_bdev_awu_min > sbp->sb_blocksize)
> +		return -EINVAL;
> +
> +	if (target->bt_bdev_awu_max < fa->fsx_extsize)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static int
>  xfs_ioctl_setattr_xflags(
>  	struct xfs_trans	*tp,
> @@ -511,9 +554,12 @@ xfs_ioctl_setattr_xflags(
>  	struct xfs_mount	*mp = ip->i_mount;
>  	bool			rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
>  	bool			forcealign = fa->fsx_xflags & FS_XFLAG_FORCEALIGN;
> +	bool			atomic_writes;
>  	uint64_t		i_flags2;
>  	int			error;
>  
> +	atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES;
> +
>  	/* Can't change RT or forcealign flags if any extents are allocated. */
>  	if (rtflag != XFS_IS_REALTIME_INODE(ip) ||
>  	    forcealign != xfs_inode_has_forcealign(ip)) {
> @@ -554,6 +600,12 @@ xfs_ioctl_setattr_xflags(
>  			return error;
>  	}
>  
> +	if (atomic_writes) {
> +		error = xfs_ioctl_setattr_atomicwrites(ip, fa);
> +		if (error)
> +			return error;
> +	}
> +
>  	ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
>  	ip->i_diflags2 = i_flags2;
>  
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 30228fea908d..0c5a3ae3cdaf 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -300,6 +300,7 @@ typedef struct xfs_mount {
>  #define XFS_FEAT_NREXT64	(1ULL << 26)	/* large extent counters */
>  #define XFS_FEAT_EXCHANGE_RANGE	(1ULL << 27)	/* exchange range */
>  #define XFS_FEAT_FORCEALIGN	(1ULL << 28)	/* aligned file data extents */
> +#define XFS_FEAT_ATOMICWRITES	(1ULL << 29)	/* atomic writes support */
>  
>  /* Mount features */
>  #define XFS_FEAT_NOATTR2	(1ULL << 48)	/* disable attr2 creation */
> @@ -387,6 +388,7 @@ __XFS_HAS_V4_FEAT(v3inodes, V3INODES)
>  __XFS_HAS_V4_FEAT(crc, CRC)
>  __XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
>  __XFS_HAS_FEAT(forcealign, FORCEALIGN)
> +__XFS_HAS_FEAT(atomicwrites, ATOMICWRITES)
>  
>  /*
>   * Mount features
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b52a01b50387..5352b90b2bb6 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1721,6 +1721,18 @@ xfs_fs_fill_super(
>  		mp->m_features &= ~XFS_FEAT_DISCARD;
>  	}
>  
> +	if (xfs_has_atomicwrites(mp)) {
> +		if (!xfs_has_forcealign(mp)) {
> +			xfs_alert(mp,
> +	"forcealign required for atomicwrites!");

This (atomicwrites && !forcealign) ought to be checked in the superblock
verifier.

--D

> +			error = -EINVAL;
> +			goto out_filestream_unmount;
> +		}
> +
> +		xfs_warn(mp,
> +"EXPERIMENTAL atomicwrites feature in use. Use at your own risk!");
> +	}
> +
>  	if (xfs_has_reflink(mp)) {
>  		if (mp->m_sb.sb_rblocks) {
>  			xfs_alert(mp,
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index f55d650f904a..c416f549e94d 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -160,6 +160,7 @@ struct fsxattr {
>  #define FS_XFLAG_COWEXTSIZE	0x00010000	/* CoW extent size allocator hint */
>  /* data extent mappings for regular files must be aligned to extent size hint */
>  #define FS_XFLAG_FORCEALIGN	0x00020000
> +#define FS_XFLAG_ATOMICWRITES	0x00040000	/* atomic writes enabled */
>  #define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
>  
>  /* the read-only stuff doesn't really belong here, but any other place is
> -- 
> 2.31.1
> 
>
John Garry Aug. 22, 2024, 5:45 p.m. UTC | #2
On 21/08/2024 18:07, Darrick J. Wong wrote:
> On Sat, Aug 17, 2024 at 09:47:57AM +0000, John Garry wrote:
>> Add initial support for new flag FS_XFLAG_ATOMICWRITES for forcealign
>> enabled.
>>
>> This flag is a file attribute that mirrors an ondisk inode flag.  Actual
>> support for untorn file writes (for now) depends on both the iflag and the
>> underlying storage devices, which we can only really check at statx and
>> pwritev2() time.  This is the same story as FS_XFLAG_DAX, which signals to
>> the fs that we should try to enable the fsdax IO path on the file (instead
>> of the regular page cache), but applications have to query STAT_ATTR_DAX to
>> find out if they really got that IO path.
>>
>> Current kernel support for atomic writes is based on HW support (for atomic
>> writes). As such, it is required to ensure extent alignment with
>> atomic_write_unit_max so that an atomic write can result in a single
>> HW-compliant IO operation.
>>
>> rtvol also guarantees extent alignment, but we are basing support initially
>> on forcealign, which is not supported for rtvol yet.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/xfs/libxfs/xfs_format.h     | 11 +++++--
>>   fs/xfs/libxfs/xfs_inode_buf.c  | 52 ++++++++++++++++++++++++++++++++++
>>   fs/xfs/libxfs/xfs_inode_util.c |  4 +++
>>   fs/xfs/libxfs/xfs_sb.c         |  2 ++
>>   fs/xfs/xfs_buf.c               | 15 +++++++++-
>>   fs/xfs/xfs_buf.h               |  4 ++-
>>   fs/xfs/xfs_buf_mem.c           |  2 +-
>>   fs/xfs/xfs_inode.h             |  5 ++++
>>   fs/xfs/xfs_ioctl.c             | 52 ++++++++++++++++++++++++++++++++++
>>   fs/xfs/xfs_mount.h             |  2 ++
>>   fs/xfs/xfs_super.c             | 12 ++++++++
>>   include/uapi/linux/fs.h        |  1 +
>>   12 files changed, 157 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
>> index 04c6cbc943c2..a9f3389438a6 100644
>> --- a/fs/xfs/libxfs/xfs_format.h
>> +++ b/fs/xfs/libxfs/xfs_format.h
>> @@ -353,12 +353,16 @@ xfs_sb_has_compat_feature(
>>   #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
>>   #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
>>   #define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
>> +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 31)	/* atomicwrites enabled */
> 
> Do you ever see test failures in xfs/270?

Well it wasn't with forcealign only. I'll check again for atomic writes.

> 
>> +
>>   #define XFS_SB_FEAT_RO_COMPAT_ALL \
>>   		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
>>   		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
>>   		 XFS_SB_FEAT_RO_COMPAT_REFLINK| \
>>   		 XFS_SB_FEAT_RO_COMPAT_INOBTCNT | \
>> -		 XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
>> +		 XFS_SB_FEAT_RO_COMPAT_FORCEALIGN | \
>> +		 XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES)
>> +
>>   #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	~XFS_SB_FEAT_RO_COMPAT_ALL
>>   static inline bool
>>   xfs_sb_has_ro_compat_feature(
>> @@ -1097,6 +1101,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
>>   #define XFS_DIFLAG2_NREXT64_BIT 4	/* large extent counters */
>>   /* data extent mappings for regular files must be aligned to extent size hint */
>>   #define XFS_DIFLAG2_FORCEALIGN_BIT 5
>> +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6
>>   
>>   #define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
>>   #define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
>> @@ -1104,10 +1109,12 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
>>   #define XFS_DIFLAG2_BIGTIME	(1 << XFS_DIFLAG2_BIGTIME_BIT)
>>   #define XFS_DIFLAG2_NREXT64	(1 << XFS_DIFLAG2_NREXT64_BIT)
>>   #define XFS_DIFLAG2_FORCEALIGN	(1 << XFS_DIFLAG2_FORCEALIGN_BIT)
>> +#define XFS_DIFLAG2_ATOMICWRITES	(1 << XFS_DIFLAG2_ATOMICWRITES_BIT)
>>   
>>   #define XFS_DIFLAG2_ANY \
>>   	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
>> -	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN)
>> +	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN | \
>> +	 XFS_DIFLAG2_ATOMICWRITES)
>>   
>>   static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
>>   {
>> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
>> index 1c59891fa9e2..59933c7df56d 100644
>> --- a/fs/xfs/libxfs/xfs_inode_buf.c
>> +++ b/fs/xfs/libxfs/xfs_inode_buf.c
>> @@ -178,7 +178,10 @@ xfs_inode_from_disk(
>>   	struct xfs_inode	*ip,
>>   	struct xfs_dinode	*from)
>>   {
>> +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
>>   	struct inode		*inode = VFS_I(ip);
>> +	struct xfs_mount	*mp = ip->i_mount;
>> +	struct xfs_sb		*sbp = &mp->m_sb;
>>   	int			error;
>>   	xfs_failaddr_t		fa;
>>   
>> @@ -261,6 +264,13 @@ xfs_inode_from_disk(
>>   	}
>>   	if (xfs_is_reflink_inode(ip))
>>   		xfs_ifork_init_cow(ip);
>> +
>> +	if (xfs_inode_has_atomicwrites(ip)) {
>> +		if (sbp->sb_blocksize < target->bt_bdev_awu_min ||
>> +		    sbp->sb_blocksize * ip->i_extsize > target->bt_bdev_awu_max)
> 
> Can this multiplication trigger integer overflows?

I should just use xfs_inode_alloc_unitsize()

> 
>> +			ip->i_diflags2 &= ~XFS_DIFLAG2_ATOMICWRITES;
> 
> Ondisk iflag updates must use transactions.

I want to change this.

The idea was to runtime clear this flag in case the bdev cannot satisfy 
the FS atomic write range, but that does not work.

> 
> Or you can fail IOCB_ATOMIC writes if XFS_DIFLAG2_ATOMICWRITES is set
> but the forcealign blocksize doesn't fit with awu_min/max.

I'd rather just not set FMODE_CAN_ATOMIC_WRITE

> 
>> +	}
>> +
>>   	return 0;
>>   
>>   out_destroy_data_fork:
>> @@ -483,6 +493,40 @@ xfs_dinode_verify_nrext64(
>>   	return NULL;
>>   }
>>   
>> +static xfs_failaddr_t
>> +xfs_inode_validate_atomicwrites(
>> +	struct xfs_mount	*mp,
>> +	uint32_t		extsize,
>> +	uint64_t		flags2)
>> +{
>> +	/* superblock rocompat feature flag */
>> +	if (!xfs_has_atomicwrites(mp))
>> +		return __this_address;
>> +
>> +	/*
>> +	 * forcealign is required, so rely on sanity checks in
>> +	 * xfs_inode_validate_forcealign()
>> +	 */
>> +	if (!(flags2 & XFS_DIFLAG2_FORCEALIGN))
>> +		return __this_address;
>> +
>> +	if (!is_power_of_2(extsize))
>> +		return __this_address;
>> +
>> +	/* Required to guarantee data block alignment */
>> +	if (mp->m_sb.sb_agblocks % extsize)
>> +		return __this_address;
>> +
>> +	/* Requires stripe unit+width be a multiple of extsize */
>> +	if (mp->m_dalign && (mp->m_dalign % extsize))
>> +		return __this_address;
>> +
>> +	if (mp->m_swidth && (mp->m_swidth % extsize))
> 
> IIRC m_dalign and m_swidth can be set at mount time,

I thought that these were fixed at mkfs time, however I see that the 
comment for xfs_update_alignment() mentions "values based on mount 
options". And we are reading mp values, and not sbp values, which is a 
good clue...

> which can result in
> inode verifiers logging corruption errors if those parameters change.  I
> think we should validate these two congruencies when setting
> FMODE_CAN_ATOMIC_WRITE.

That would make sense.

> 
>> +		return __this_address;
>> +
>> +	return NULL;
>> +}
>> +
>>   xfs_failaddr_t
>>   xfs_dinode_verify(
>>   	struct xfs_mount	*mp,
>> @@ -666,6 +710,14 @@ xfs_dinode_verify(
>>   			return fa;
>>   	}
>>   
>> +	if (flags2 & XFS_DIFLAG2_ATOMICWRITES) {
>> +		fa = xfs_inode_validate_atomicwrites(mp,
>> +				be32_to_cpu(dip->di_extsize),
>> +				flags2);
>> +		if (fa)
>> +			return fa;
>> +	}
>> +
>>   	return NULL;
>>   }
>>   
>> diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
>> index b264939d8855..dbd5b16e1844 100644
>> --- a/fs/xfs/libxfs/xfs_inode_util.c
>> +++ b/fs/xfs/libxfs/xfs_inode_util.c
>> @@ -82,6 +82,8 @@ xfs_flags2diflags2(
>>   		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
>>   	if (xflags & FS_XFLAG_FORCEALIGN)
>>   		di_flags2 |= XFS_DIFLAG2_FORCEALIGN;
>> +	if (xflags & FS_XFLAG_ATOMICWRITES)
>> +		di_flags2 |= XFS_DIFLAG2_ATOMICWRITES;
>>   
>>   	return di_flags2;
>>   }
>> @@ -130,6 +132,8 @@ xfs_ip2xflags(
>>   			flags |= FS_XFLAG_COWEXTSIZE;
>>   		if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
>>   			flags |= FS_XFLAG_FORCEALIGN;
>> +		if (ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES)
>> +			flags |= FS_XFLAG_ATOMICWRITES;
>>   	}
>>   
>>   	if (xfs_inode_has_attr_fork(ip))
>> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
>> index e56911553edd..5de8725bf93a 100644
>> --- a/fs/xfs/libxfs/xfs_sb.c
>> +++ b/fs/xfs/libxfs/xfs_sb.c
>> @@ -166,6 +166,8 @@ xfs_sb_version_to_features(
>>   		features |= XFS_FEAT_INOBTCNT;
>>   	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
>>   		features |= XFS_FEAT_FORCEALIGN;
>> +	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES)
>> +		features |= XFS_FEAT_ATOMICWRITES;
>>   	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
>>   		features |= XFS_FEAT_FTYPE;
>>   	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
>> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
>> index aa4dbda7b536..44bee3e2b2bb 100644
>> --- a/fs/xfs/xfs_buf.c
>> +++ b/fs/xfs/xfs_buf.c
>> @@ -2060,6 +2060,8 @@ int
>>   xfs_init_buftarg(
>>   	struct xfs_buftarg		*btp,
>>   	size_t				logical_sectorsize,
>> +	unsigned int			awu_min,
>> +	unsigned int			awu_max,
>>   	const char			*descr)
>>   {
>>   	/* Set up device logical sector size mask */
>> @@ -2086,6 +2088,9 @@ xfs_init_buftarg(
>>   	btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
>>   	btp->bt_shrinker->private_data = btp;
>>   	shrinker_register(btp->bt_shrinker);
>> +
>> +	btp->bt_bdev_awu_min = awu_min;
>> +	btp->bt_bdev_awu_max = awu_max;
>>   	return 0;
>>   
>>   out_destroy_io_count:
>> @@ -2102,6 +2107,7 @@ xfs_alloc_buftarg(
>>   {
>>   	struct xfs_buftarg	*btp;
>>   	const struct dax_holder_operations *ops = NULL;
>> +	unsigned int awu_min = 0, awu_max = 0;
>>   
>>   #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
>>   	ops = &xfs_dax_holder_operations;
>> @@ -2115,6 +2121,13 @@ xfs_alloc_buftarg(
>>   	btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
>>   					    mp, ops);
>>   
>> +	if (bdev_can_atomic_write(btp->bt_bdev)) {
>> +		struct request_queue *q = bdev_get_queue(btp->bt_bdev);
>> +
>> +		awu_min = queue_atomic_write_unit_min_bytes(q);
>> +		awu_max = queue_atomic_write_unit_max_bytes(q);
>> +	}
>> +
>>   	/*
>>   	 * When allocating the buftargs we have not yet read the super block and
>>   	 * thus don't know the file system sector size yet.
>> @@ -2122,7 +2135,7 @@ xfs_alloc_buftarg(
>>   	if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
>>   		goto error_free;
>>   	if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
>> -			mp->m_super->s_id))
>> +			awu_min, awu_max, mp->m_super->s_id))
>>   		goto error_free;
>>   
>>   	return btp;
>> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
>> index b1580644501f..3bcd8137d739 100644
>> --- a/fs/xfs/xfs_buf.h
>> +++ b/fs/xfs/xfs_buf.h
>> @@ -124,6 +124,8 @@ struct xfs_buftarg {
>>   	struct percpu_counter	bt_io_count;
>>   	struct ratelimit_state	bt_ioerror_rl;
>>   
>> +	unsigned int		bt_bdev_awu_min, bt_bdev_awu_max;
> 
> Please add a comment here about what these mean.  Not everyone is going
> to know what "awu" abbreviates.

sure

>> +
>>   static int
>>   xfs_ioctl_setattr_xflags(
>>   	struct xfs_trans	*tp,
>> @@ -511,9 +554,12 @@ xfs_ioctl_setattr_xflags(
>>   	struct xfs_mount	*mp = ip->i_mount;
>>   	bool			rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
>>   	bool			forcealign = fa->fsx_xflags & FS_XFLAG_FORCEALIGN;
>> +	bool			atomic_writes;
>>   	uint64_t		i_flags2;
>>   	int			error;
>>   
>> +	atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES;
>> +
>>   	/* Can't change RT or forcealign flags if any extents are allocated. */
>>   	if (rtflag != XFS_IS_REALTIME_INODE(ip) ||
>>   	    forcealign != xfs_inode_has_forcealign(ip)) {
>> @@ -554,6 +600,12 @@ xfs_ioctl_setattr_xflags(
>>   			return error;
>>   	}
>>   
>> +	if (atomic_writes) {
>> +		error = xfs_ioctl_setattr_atomicwrites(ip, fa);
>> +		if (error)
>> +			return error;
>> +	}
>> +
>>   	ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
>>   	ip->i_diflags2 = i_flags2;
>>   
>> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
>> index 30228fea908d..0c5a3ae3cdaf 100644
>> --- a/fs/xfs/xfs_mount.h
>> +++ b/fs/xfs/xfs_mount.h
>> @@ -300,6 +300,7 @@ typedef struct xfs_mount {
>>   #define XFS_FEAT_NREXT64	(1ULL << 26)	/* large extent counters */
>>   #define XFS_FEAT_EXCHANGE_RANGE	(1ULL << 27)	/* exchange range */
>>   #define XFS_FEAT_FORCEALIGN	(1ULL << 28)	/* aligned file data extents */
>> +#define XFS_FEAT_ATOMICWRITES	(1ULL << 29)	/* atomic writes support */
>>   
>>   /* Mount features */
>>   #define XFS_FEAT_NOATTR2	(1ULL << 48)	/* disable attr2 creation */
>> @@ -387,6 +388,7 @@ __XFS_HAS_V4_FEAT(v3inodes, V3INODES)
>>   __XFS_HAS_V4_FEAT(crc, CRC)
>>   __XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
>>   __XFS_HAS_FEAT(forcealign, FORCEALIGN)
>> +__XFS_HAS_FEAT(atomicwrites, ATOMICWRITES)
>>   
>>   /*
>>    * Mount features
>> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
>> index b52a01b50387..5352b90b2bb6 100644
>> --- a/fs/xfs/xfs_super.c
>> +++ b/fs/xfs/xfs_super.c
>> @@ -1721,6 +1721,18 @@ xfs_fs_fill_super(
>>   		mp->m_features &= ~XFS_FEAT_DISCARD;
>>   	}
>>   
>> +	if (xfs_has_atomicwrites(mp)) {
>> +		if (!xfs_has_forcealign(mp)) {
>> +			xfs_alert(mp,
>> +	"forcealign required for atomicwrites!");
> 
> This (atomicwrites && !forcealign) ought to be checked in the superblock
> verifier.

You mean in xfs_fs_validate_params(), right?

Thanks,
John

>
Darrick J. Wong Aug. 22, 2024, 8:38 p.m. UTC | #3
On Thu, Aug 22, 2024 at 06:45:16PM +0100, John Garry wrote:
> On 21/08/2024 18:07, Darrick J. Wong wrote:
> > On Sat, Aug 17, 2024 at 09:47:57AM +0000, John Garry wrote:
> > > Add initial support for new flag FS_XFLAG_ATOMICWRITES for forcealign
> > > enabled.
> > > 
> > > This flag is a file attribute that mirrors an ondisk inode flag.  Actual
> > > support for untorn file writes (for now) depends on both the iflag and the
> > > underlying storage devices, which we can only really check at statx and
> > > pwritev2() time.  This is the same story as FS_XFLAG_DAX, which signals to
> > > the fs that we should try to enable the fsdax IO path on the file (instead
> > > of the regular page cache), but applications have to query STAT_ATTR_DAX to
> > > find out if they really got that IO path.
> > > 
> > > Current kernel support for atomic writes is based on HW support (for atomic
> > > writes). As such, it is required to ensure extent alignment with
> > > atomic_write_unit_max so that an atomic write can result in a single
> > > HW-compliant IO operation.
> > > 
> > > rtvol also guarantees extent alignment, but we are basing support initially
> > > on forcealign, which is not supported for rtvol yet.
> > > 
> > > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > > ---
> > >   fs/xfs/libxfs/xfs_format.h     | 11 +++++--
> > >   fs/xfs/libxfs/xfs_inode_buf.c  | 52 ++++++++++++++++++++++++++++++++++
> > >   fs/xfs/libxfs/xfs_inode_util.c |  4 +++
> > >   fs/xfs/libxfs/xfs_sb.c         |  2 ++
> > >   fs/xfs/xfs_buf.c               | 15 +++++++++-
> > >   fs/xfs/xfs_buf.h               |  4 ++-
> > >   fs/xfs/xfs_buf_mem.c           |  2 +-
> > >   fs/xfs/xfs_inode.h             |  5 ++++
> > >   fs/xfs/xfs_ioctl.c             | 52 ++++++++++++++++++++++++++++++++++
> > >   fs/xfs/xfs_mount.h             |  2 ++
> > >   fs/xfs/xfs_super.c             | 12 ++++++++
> > >   include/uapi/linux/fs.h        |  1 +
> > >   12 files changed, 157 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> > > index 04c6cbc943c2..a9f3389438a6 100644
> > > --- a/fs/xfs/libxfs/xfs_format.h
> > > +++ b/fs/xfs/libxfs/xfs_format.h
> > > @@ -353,12 +353,16 @@ xfs_sb_has_compat_feature(
> > >   #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
> > >   #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
> > >   #define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
> > > +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 31)	/* atomicwrites enabled */
> > 
> > Do you ever see test failures in xfs/270?
> 
> Well it wasn't with forcealign only. I'll check again for atomic writes.
> 
> > 
> > > +
> > >   #define XFS_SB_FEAT_RO_COMPAT_ALL \
> > >   		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
> > >   		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
> > >   		 XFS_SB_FEAT_RO_COMPAT_REFLINK| \
> > >   		 XFS_SB_FEAT_RO_COMPAT_INOBTCNT | \
> > > -		 XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
> > > +		 XFS_SB_FEAT_RO_COMPAT_FORCEALIGN | \
> > > +		 XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES)
> > > +
> > >   #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	~XFS_SB_FEAT_RO_COMPAT_ALL
> > >   static inline bool
> > >   xfs_sb_has_ro_compat_feature(
> > > @@ -1097,6 +1101,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
> > >   #define XFS_DIFLAG2_NREXT64_BIT 4	/* large extent counters */
> > >   /* data extent mappings for regular files must be aligned to extent size hint */
> > >   #define XFS_DIFLAG2_FORCEALIGN_BIT 5
> > > +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6
> > >   #define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
> > >   #define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
> > > @@ -1104,10 +1109,12 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
> > >   #define XFS_DIFLAG2_BIGTIME	(1 << XFS_DIFLAG2_BIGTIME_BIT)
> > >   #define XFS_DIFLAG2_NREXT64	(1 << XFS_DIFLAG2_NREXT64_BIT)
> > >   #define XFS_DIFLAG2_FORCEALIGN	(1 << XFS_DIFLAG2_FORCEALIGN_BIT)
> > > +#define XFS_DIFLAG2_ATOMICWRITES	(1 << XFS_DIFLAG2_ATOMICWRITES_BIT)
> > >   #define XFS_DIFLAG2_ANY \
> > >   	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
> > > -	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN)
> > > +	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN | \
> > > +	 XFS_DIFLAG2_ATOMICWRITES)
> > >   static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
> > >   {
> > > diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
> > > index 1c59891fa9e2..59933c7df56d 100644
> > > --- a/fs/xfs/libxfs/xfs_inode_buf.c
> > > +++ b/fs/xfs/libxfs/xfs_inode_buf.c
> > > @@ -178,7 +178,10 @@ xfs_inode_from_disk(
> > >   	struct xfs_inode	*ip,
> > >   	struct xfs_dinode	*from)
> > >   {
> > > +	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
> > >   	struct inode		*inode = VFS_I(ip);
> > > +	struct xfs_mount	*mp = ip->i_mount;
> > > +	struct xfs_sb		*sbp = &mp->m_sb;
> > >   	int			error;
> > >   	xfs_failaddr_t		fa;
> > > @@ -261,6 +264,13 @@ xfs_inode_from_disk(
> > >   	}
> > >   	if (xfs_is_reflink_inode(ip))
> > >   		xfs_ifork_init_cow(ip);
> > > +
> > > +	if (xfs_inode_has_atomicwrites(ip)) {
> > > +		if (sbp->sb_blocksize < target->bt_bdev_awu_min ||
> > > +		    sbp->sb_blocksize * ip->i_extsize > target->bt_bdev_awu_max)
> > 
> > Can this multiplication trigger integer overflows?
> 
> I should just use xfs_inode_alloc_unitsize()
> 
> > 
> > > +			ip->i_diflags2 &= ~XFS_DIFLAG2_ATOMICWRITES;
> > 
> > Ondisk iflag updates must use transactions.
> 
> I want to change this.
> 
> The idea was to runtime clear this flag in case the bdev cannot satisfy the
> FS atomic write range, but that does not work.

Yeah, the ondisk and incore state have to be separate when we're playing
with hardware features because the fs image can be created on a device
that supports $feature and then moved to a device that does not.  For
that case you don't enable the incore state.  Though I suppose for
untorn writes you /could/ just let things EIO...

> > Or you can fail IOCB_ATOMIC writes if XFS_DIFLAG2_ATOMICWRITES is set
> > but the forcealign blocksize doesn't fit with awu_min/max.
> 
> I'd rather just not set FMODE_CAN_ATOMIC_WRITE

...but I'll move that discussion to that email.

> > > +	}
> > > +
> > >   	return 0;
> > >   out_destroy_data_fork:
> > > @@ -483,6 +493,40 @@ xfs_dinode_verify_nrext64(
> > >   	return NULL;
> > >   }
> > > +static xfs_failaddr_t
> > > +xfs_inode_validate_atomicwrites(
> > > +	struct xfs_mount	*mp,
> > > +	uint32_t		extsize,
> > > +	uint64_t		flags2)
> > > +{
> > > +	/* superblock rocompat feature flag */
> > > +	if (!xfs_has_atomicwrites(mp))
> > > +		return __this_address;
> > > +
> > > +	/*
> > > +	 * forcealign is required, so rely on sanity checks in
> > > +	 * xfs_inode_validate_forcealign()
> > > +	 */
> > > +	if (!(flags2 & XFS_DIFLAG2_FORCEALIGN))
> > > +		return __this_address;
> > > +
> > > +	if (!is_power_of_2(extsize))
> > > +		return __this_address;
> > > +
> > > +	/* Required to guarantee data block alignment */
> > > +	if (mp->m_sb.sb_agblocks % extsize)
> > > +		return __this_address;
> > > +
> > > +	/* Requires stripe unit+width be a multiple of extsize */
> > > +	if (mp->m_dalign && (mp->m_dalign % extsize))
> > > +		return __this_address;
> > > +
> > > +	if (mp->m_swidth && (mp->m_swidth % extsize))
> > 
> > IIRC m_dalign and m_swidth can be set at mount time,
> 
> I thought that these were fixed at mkfs time, however I see that the comment
> for xfs_update_alignment() mentions "values based on mount options". And we
> are reading mp values, and not sbp values, which is a good clue...

Yes.

> > which can result in
> > inode verifiers logging corruption errors if those parameters change.  I
> > think we should validate these two congruencies when setting
> > FMODE_CAN_ATOMIC_WRITE.
> 
> That would make sense.
> 
> > 
> > > +		return __this_address;
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > >   xfs_failaddr_t
> > >   xfs_dinode_verify(
> > >   	struct xfs_mount	*mp,
> > > @@ -666,6 +710,14 @@ xfs_dinode_verify(
> > >   			return fa;
> > >   	}
> > > +	if (flags2 & XFS_DIFLAG2_ATOMICWRITES) {
> > > +		fa = xfs_inode_validate_atomicwrites(mp,
> > > +				be32_to_cpu(dip->di_extsize),
> > > +				flags2);
> > > +		if (fa)
> > > +			return fa;
> > > +	}
> > > +
> > >   	return NULL;
> > >   }
> > > diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
> > > index b264939d8855..dbd5b16e1844 100644
> > > --- a/fs/xfs/libxfs/xfs_inode_util.c
> > > +++ b/fs/xfs/libxfs/xfs_inode_util.c
> > > @@ -82,6 +82,8 @@ xfs_flags2diflags2(
> > >   		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
> > >   	if (xflags & FS_XFLAG_FORCEALIGN)
> > >   		di_flags2 |= XFS_DIFLAG2_FORCEALIGN;
> > > +	if (xflags & FS_XFLAG_ATOMICWRITES)
> > > +		di_flags2 |= XFS_DIFLAG2_ATOMICWRITES;
> > >   	return di_flags2;
> > >   }
> > > @@ -130,6 +132,8 @@ xfs_ip2xflags(
> > >   			flags |= FS_XFLAG_COWEXTSIZE;
> > >   		if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
> > >   			flags |= FS_XFLAG_FORCEALIGN;
> > > +		if (ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES)
> > > +			flags |= FS_XFLAG_ATOMICWRITES;
> > >   	}
> > >   	if (xfs_inode_has_attr_fork(ip))
> > > diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> > > index e56911553edd..5de8725bf93a 100644
> > > --- a/fs/xfs/libxfs/xfs_sb.c
> > > +++ b/fs/xfs/libxfs/xfs_sb.c
> > > @@ -166,6 +166,8 @@ xfs_sb_version_to_features(
> > >   		features |= XFS_FEAT_INOBTCNT;
> > >   	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
> > >   		features |= XFS_FEAT_FORCEALIGN;
> > > +	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES)
> > > +		features |= XFS_FEAT_ATOMICWRITES;
> > >   	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
> > >   		features |= XFS_FEAT_FTYPE;
> > >   	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
> > > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> > > index aa4dbda7b536..44bee3e2b2bb 100644
> > > --- a/fs/xfs/xfs_buf.c
> > > +++ b/fs/xfs/xfs_buf.c
> > > @@ -2060,6 +2060,8 @@ int
> > >   xfs_init_buftarg(
> > >   	struct xfs_buftarg		*btp,
> > >   	size_t				logical_sectorsize,
> > > +	unsigned int			awu_min,
> > > +	unsigned int			awu_max,
> > >   	const char			*descr)
> > >   {
> > >   	/* Set up device logical sector size mask */
> > > @@ -2086,6 +2088,9 @@ xfs_init_buftarg(
> > >   	btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
> > >   	btp->bt_shrinker->private_data = btp;
> > >   	shrinker_register(btp->bt_shrinker);
> > > +
> > > +	btp->bt_bdev_awu_min = awu_min;
> > > +	btp->bt_bdev_awu_max = awu_max;
> > >   	return 0;
> > >   out_destroy_io_count:
> > > @@ -2102,6 +2107,7 @@ xfs_alloc_buftarg(
> > >   {
> > >   	struct xfs_buftarg	*btp;
> > >   	const struct dax_holder_operations *ops = NULL;
> > > +	unsigned int awu_min = 0, awu_max = 0;
> > >   #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
> > >   	ops = &xfs_dax_holder_operations;
> > > @@ -2115,6 +2121,13 @@ xfs_alloc_buftarg(
> > >   	btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
> > >   					    mp, ops);
> > > +	if (bdev_can_atomic_write(btp->bt_bdev)) {
> > > +		struct request_queue *q = bdev_get_queue(btp->bt_bdev);
> > > +
> > > +		awu_min = queue_atomic_write_unit_min_bytes(q);
> > > +		awu_max = queue_atomic_write_unit_max_bytes(q);
> > > +	}
> > > +
> > >   	/*
> > >   	 * When allocating the buftargs we have not yet read the super block and
> > >   	 * thus don't know the file system sector size yet.
> > > @@ -2122,7 +2135,7 @@ xfs_alloc_buftarg(
> > >   	if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
> > >   		goto error_free;
> > >   	if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
> > > -			mp->m_super->s_id))
> > > +			awu_min, awu_max, mp->m_super->s_id))
> > >   		goto error_free;
> > >   	return btp;
> > > diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> > > index b1580644501f..3bcd8137d739 100644
> > > --- a/fs/xfs/xfs_buf.h
> > > +++ b/fs/xfs/xfs_buf.h
> > > @@ -124,6 +124,8 @@ struct xfs_buftarg {
> > >   	struct percpu_counter	bt_io_count;
> > >   	struct ratelimit_state	bt_ioerror_rl;
> > > +	unsigned int		bt_bdev_awu_min, bt_bdev_awu_max;
> > 
> > Please add a comment here about what these mean.  Not everyone is going
> > to know what "awu" abbreviates.
> 
> sure
> 
> > > +
> > >   static int
> > >   xfs_ioctl_setattr_xflags(
> > >   	struct xfs_trans	*tp,
> > > @@ -511,9 +554,12 @@ xfs_ioctl_setattr_xflags(
> > >   	struct xfs_mount	*mp = ip->i_mount;
> > >   	bool			rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
> > >   	bool			forcealign = fa->fsx_xflags & FS_XFLAG_FORCEALIGN;
> > > +	bool			atomic_writes;
> > >   	uint64_t		i_flags2;
> > >   	int			error;
> > > +	atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES;
> > > +
> > >   	/* Can't change RT or forcealign flags if any extents are allocated. */
> > >   	if (rtflag != XFS_IS_REALTIME_INODE(ip) ||
> > >   	    forcealign != xfs_inode_has_forcealign(ip)) {
> > > @@ -554,6 +600,12 @@ xfs_ioctl_setattr_xflags(
> > >   			return error;
> > >   	}
> > > +	if (atomic_writes) {
> > > +		error = xfs_ioctl_setattr_atomicwrites(ip, fa);
> > > +		if (error)
> > > +			return error;
> > > +	}
> > > +
> > >   	ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
> > >   	ip->i_diflags2 = i_flags2;
> > > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > > index 30228fea908d..0c5a3ae3cdaf 100644
> > > --- a/fs/xfs/xfs_mount.h
> > > +++ b/fs/xfs/xfs_mount.h
> > > @@ -300,6 +300,7 @@ typedef struct xfs_mount {
> > >   #define XFS_FEAT_NREXT64	(1ULL << 26)	/* large extent counters */
> > >   #define XFS_FEAT_EXCHANGE_RANGE	(1ULL << 27)	/* exchange range */
> > >   #define XFS_FEAT_FORCEALIGN	(1ULL << 28)	/* aligned file data extents */
> > > +#define XFS_FEAT_ATOMICWRITES	(1ULL << 29)	/* atomic writes support */
> > >   /* Mount features */
> > >   #define XFS_FEAT_NOATTR2	(1ULL << 48)	/* disable attr2 creation */
> > > @@ -387,6 +388,7 @@ __XFS_HAS_V4_FEAT(v3inodes, V3INODES)
> > >   __XFS_HAS_V4_FEAT(crc, CRC)
> > >   __XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
> > >   __XFS_HAS_FEAT(forcealign, FORCEALIGN)
> > > +__XFS_HAS_FEAT(atomicwrites, ATOMICWRITES)
> > >   /*
> > >    * Mount features
> > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > > index b52a01b50387..5352b90b2bb6 100644
> > > --- a/fs/xfs/xfs_super.c
> > > +++ b/fs/xfs/xfs_super.c
> > > @@ -1721,6 +1721,18 @@ xfs_fs_fill_super(
> > >   		mp->m_features &= ~XFS_FEAT_DISCARD;
> > >   	}
> > > +	if (xfs_has_atomicwrites(mp)) {
> > > +		if (!xfs_has_forcealign(mp)) {
> > > +			xfs_alert(mp,
> > > +	"forcealign required for atomicwrites!");
> > 
> > This (atomicwrites && !forcealign) ought to be checked in the superblock
> > verifier.
> 
> You mean in xfs_fs_validate_params(), right?

xfs_validate_sb_common, where we do all the other ondisk superblock
validation.

--D

> 
> Thanks,
> John
> 
> > 
>
John Garry Aug. 23, 2024, 8:39 a.m. UTC | #4
On 22/08/2024 21:38, Darrick J. Wong wrote:
>>> This (atomicwrites && !forcealign) ought to be checked in the superblock
>>> verifier.
>> You mean in xfs_fs_validate_params(), right?
> xfs_validate_sb_common, where we do all the other ondisk superblock
> validation.

I don't see any other xfs_has_XXX checks in xfs_validate_sb_common(), 
but this could be the first...

The only other place in which I see a pattern of similar SB feature flag 
checks is in xfs_finish_flags() for checking xfs_has_crc() && 
xfs_has_noattr2().

So if we go with xfs_validate_sb_common(), then should the check in 
xfs_fs_fill_super() for xfs_has_forcealign() && 
xfs_has_realtime()/reflink() be relocated to xfs_validate_sb_common() also:

https://lore.kernel.org/linux-xfs/20240813163638.3751939-8-john.g.garry@oracle.com/

Cheers,
John
Darrick J. Wong Aug. 23, 2024, 4:03 p.m. UTC | #5
On Fri, Aug 23, 2024 at 09:39:44AM +0100, John Garry wrote:
> On 22/08/2024 21:38, Darrick J. Wong wrote:
> > > > This (atomicwrites && !forcealign) ought to be checked in the superblock
> > > > verifier.
> > > You mean in xfs_fs_validate_params(), right?
> > xfs_validate_sb_common, where we do all the other ondisk superblock
> > validation.
> 
> I don't see any other xfs_has_XXX checks in xfs_validate_sb_common(), but
> this could be the first...

The superblock verifier runs at a lower level in the filesystem -- it
checks that the ondisk superblock doesn't contain any inconsistent
fields or impossible feature combinations, etc.  Once the ondisk
superblock is verified, the information there is used to set XFS_FEAT_*
bits in m_features, which is what the xfs_has_* predicates access.

Therefore, you have to look at the raw superblock fields, not the
xfs_has_ predicates:

	if ((sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) &&
	    !(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)) {
		xfs_warn(mp, "atomic writes feature requires force align feature.");
		return -EINVAL;
	}

The reason for checking this state here is that atomicwrites absolutely
requires forcealign and that dependency will always be true.

> The only other place in which I see a pattern of similar SB feature flag
> checks is in xfs_finish_flags() for checking xfs_has_crc() &&
> xfs_has_noattr2().
> 
> So if we go with xfs_validate_sb_common(), then should the check in
> xfs_fs_fill_super() for xfs_has_forcealign() && xfs_has_realtime()/reflink()
> be relocated to xfs_validate_sb_common() also:

No.  Contrast the above with (forcealign && !realtime), which at least
in theory is temporary, so that should live in xfs_fs_fill_super.  Or
put another way, xfs_fs_fill_super is where we screen out the kernel
being too stupid to support something it found on disk.

--D

> 
> https://lore.kernel.org/linux-xfs/20240813163638.3751939-8-john.g.garry@oracle.com/
> 
> Cheers,
> John
>
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 04c6cbc943c2..a9f3389438a6 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -353,12 +353,16 @@  xfs_sb_has_compat_feature(
 #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
 #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
 #define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
+#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 31)	/* atomicwrites enabled */
+
 #define XFS_SB_FEAT_RO_COMPAT_ALL \
 		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
 		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
 		 XFS_SB_FEAT_RO_COMPAT_REFLINK| \
 		 XFS_SB_FEAT_RO_COMPAT_INOBTCNT | \
-		 XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
+		 XFS_SB_FEAT_RO_COMPAT_FORCEALIGN | \
+		 XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES)
+
 #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	~XFS_SB_FEAT_RO_COMPAT_ALL
 static inline bool
 xfs_sb_has_ro_compat_feature(
@@ -1097,6 +1101,7 @@  static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
 #define XFS_DIFLAG2_NREXT64_BIT 4	/* large extent counters */
 /* data extent mappings for regular files must be aligned to extent size hint */
 #define XFS_DIFLAG2_FORCEALIGN_BIT 5
+#define XFS_DIFLAG2_ATOMICWRITES_BIT 6
 
 #define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
 #define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
@@ -1104,10 +1109,12 @@  static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
 #define XFS_DIFLAG2_BIGTIME	(1 << XFS_DIFLAG2_BIGTIME_BIT)
 #define XFS_DIFLAG2_NREXT64	(1 << XFS_DIFLAG2_NREXT64_BIT)
 #define XFS_DIFLAG2_FORCEALIGN	(1 << XFS_DIFLAG2_FORCEALIGN_BIT)
+#define XFS_DIFLAG2_ATOMICWRITES	(1 << XFS_DIFLAG2_ATOMICWRITES_BIT)
 
 #define XFS_DIFLAG2_ANY \
 	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
-	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN)
+	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN | \
+	 XFS_DIFLAG2_ATOMICWRITES)
 
 static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
 {
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 1c59891fa9e2..59933c7df56d 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -178,7 +178,10 @@  xfs_inode_from_disk(
 	struct xfs_inode	*ip,
 	struct xfs_dinode	*from)
 {
+	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
 	struct inode		*inode = VFS_I(ip);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_sb		*sbp = &mp->m_sb;
 	int			error;
 	xfs_failaddr_t		fa;
 
@@ -261,6 +264,13 @@  xfs_inode_from_disk(
 	}
 	if (xfs_is_reflink_inode(ip))
 		xfs_ifork_init_cow(ip);
+
+	if (xfs_inode_has_atomicwrites(ip)) {
+		if (sbp->sb_blocksize < target->bt_bdev_awu_min ||
+		    sbp->sb_blocksize * ip->i_extsize > target->bt_bdev_awu_max)
+			ip->i_diflags2 &= ~XFS_DIFLAG2_ATOMICWRITES;
+	}
+
 	return 0;
 
 out_destroy_data_fork:
@@ -483,6 +493,40 @@  xfs_dinode_verify_nrext64(
 	return NULL;
 }
 
+static xfs_failaddr_t
+xfs_inode_validate_atomicwrites(
+	struct xfs_mount	*mp,
+	uint32_t		extsize,
+	uint64_t		flags2)
+{
+	/* superblock rocompat feature flag */
+	if (!xfs_has_atomicwrites(mp))
+		return __this_address;
+
+	/*
+	 * forcealign is required, so rely on sanity checks in
+	 * xfs_inode_validate_forcealign()
+	 */
+	if (!(flags2 & XFS_DIFLAG2_FORCEALIGN))
+		return __this_address;
+
+	if (!is_power_of_2(extsize))
+		return __this_address;
+
+	/* Required to guarantee data block alignment */
+	if (mp->m_sb.sb_agblocks % extsize)
+		return __this_address;
+
+	/* Requires stripe unit+width be a multiple of extsize */
+	if (mp->m_dalign && (mp->m_dalign % extsize))
+		return __this_address;
+
+	if (mp->m_swidth && (mp->m_swidth % extsize))
+		return __this_address;
+
+	return NULL;
+}
+
 xfs_failaddr_t
 xfs_dinode_verify(
 	struct xfs_mount	*mp,
@@ -666,6 +710,14 @@  xfs_dinode_verify(
 			return fa;
 	}
 
+	if (flags2 & XFS_DIFLAG2_ATOMICWRITES) {
+		fa = xfs_inode_validate_atomicwrites(mp,
+				be32_to_cpu(dip->di_extsize),
+				flags2);
+		if (fa)
+			return fa;
+	}
+
 	return NULL;
 }
 
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index b264939d8855..dbd5b16e1844 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -82,6 +82,8 @@  xfs_flags2diflags2(
 		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
 	if (xflags & FS_XFLAG_FORCEALIGN)
 		di_flags2 |= XFS_DIFLAG2_FORCEALIGN;
+	if (xflags & FS_XFLAG_ATOMICWRITES)
+		di_flags2 |= XFS_DIFLAG2_ATOMICWRITES;
 
 	return di_flags2;
 }
@@ -130,6 +132,8 @@  xfs_ip2xflags(
 			flags |= FS_XFLAG_COWEXTSIZE;
 		if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
 			flags |= FS_XFLAG_FORCEALIGN;
+		if (ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES)
+			flags |= FS_XFLAG_ATOMICWRITES;
 	}
 
 	if (xfs_inode_has_attr_fork(ip))
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index e56911553edd..5de8725bf93a 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -166,6 +166,8 @@  xfs_sb_version_to_features(
 		features |= XFS_FEAT_INOBTCNT;
 	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
 		features |= XFS_FEAT_FORCEALIGN;
+	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES)
+		features |= XFS_FEAT_ATOMICWRITES;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
 		features |= XFS_FEAT_FTYPE;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index aa4dbda7b536..44bee3e2b2bb 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -2060,6 +2060,8 @@  int
 xfs_init_buftarg(
 	struct xfs_buftarg		*btp,
 	size_t				logical_sectorsize,
+	unsigned int			awu_min,
+	unsigned int			awu_max,
 	const char			*descr)
 {
 	/* Set up device logical sector size mask */
@@ -2086,6 +2088,9 @@  xfs_init_buftarg(
 	btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
 	btp->bt_shrinker->private_data = btp;
 	shrinker_register(btp->bt_shrinker);
+
+	btp->bt_bdev_awu_min = awu_min;
+	btp->bt_bdev_awu_max = awu_max;
 	return 0;
 
 out_destroy_io_count:
@@ -2102,6 +2107,7 @@  xfs_alloc_buftarg(
 {
 	struct xfs_buftarg	*btp;
 	const struct dax_holder_operations *ops = NULL;
+	unsigned int awu_min = 0, awu_max = 0;
 
 #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
 	ops = &xfs_dax_holder_operations;
@@ -2115,6 +2121,13 @@  xfs_alloc_buftarg(
 	btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
 					    mp, ops);
 
+	if (bdev_can_atomic_write(btp->bt_bdev)) {
+		struct request_queue *q = bdev_get_queue(btp->bt_bdev);
+
+		awu_min = queue_atomic_write_unit_min_bytes(q);
+		awu_max = queue_atomic_write_unit_max_bytes(q);
+	}
+
 	/*
 	 * When allocating the buftargs we have not yet read the super block and
 	 * thus don't know the file system sector size yet.
@@ -2122,7 +2135,7 @@  xfs_alloc_buftarg(
 	if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
 		goto error_free;
 	if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
-			mp->m_super->s_id))
+			awu_min, awu_max, mp->m_super->s_id))
 		goto error_free;
 
 	return btp;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b1580644501f..3bcd8137d739 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -124,6 +124,8 @@  struct xfs_buftarg {
 	struct percpu_counter	bt_io_count;
 	struct ratelimit_state	bt_ioerror_rl;
 
+	unsigned int		bt_bdev_awu_min, bt_bdev_awu_max;
+
 	/* built-in cache, if we're not using the perag one */
 	struct xfs_buf_cache	bt_cache[];
 };
@@ -393,7 +395,7 @@  bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
 
 /* for xfs_buf_mem.c only: */
 int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize,
-		const char *descr);
+		unsigned int awu_min, unsigned int awu_max, const char *descr);
 void xfs_destroy_buftarg(struct xfs_buftarg *btp);
 
 #endif	/* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 9bb2d24de709..af48a8da2f0f 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -93,7 +93,7 @@  xmbuf_alloc(
 	btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE;
 	btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1;
 
-	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
+	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, 0, 0, descr);
 	if (error)
 		goto out_bcache;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 336124105c47..cfcb67da12cb 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -321,6 +321,11 @@  static inline bool xfs_inode_has_forcealign(struct xfs_inode *ip)
 	return ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN;
 }
 
+static inline bool xfs_inode_has_atomicwrites(struct xfs_inode *ip)
+{
+	return ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES;
+}
+
 /*
  * Decide if this file is a realtime file whose data allocation unit is larger
  * than a single filesystem block.
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a68ec68e5b92..7af2837779e8 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -502,6 +502,49 @@  xfs_ioctl_setattr_forcealign(
 	return 0;
 }
 
+
+/*
+ * Forcealign requires a power-of-2 extent size hint.
+ */
+static int
+xfs_ioctl_setattr_atomicwrites(
+	struct xfs_inode	*ip,
+	struct fileattr		*fa)
+{
+	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
+	struct xfs_mount	*mp = ip->i_mount;
+	uint32_t		extsize = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+	struct xfs_sb		*sbp = &mp->m_sb;
+
+	if (!xfs_has_atomicwrites(mp))
+		return -EINVAL;
+
+	if (!(fa->fsx_xflags & FS_XFLAG_FORCEALIGN))
+		return -EINVAL;
+
+	if (!is_power_of_2(extsize))
+		return -EINVAL;
+
+	/* Required to guarantee data block alignment */
+	if (mp->m_sb.sb_agblocks % extsize)
+		return -EINVAL;
+
+	/* Requires stripe unit+width be a multiple of extsize */
+	if (mp->m_dalign && (mp->m_dalign % extsize))
+		return -EINVAL;
+
+	if (mp->m_swidth && (mp->m_swidth % extsize))
+		return -EINVAL;
+
+	if (target->bt_bdev_awu_min > sbp->sb_blocksize)
+		return -EINVAL;
+
+	if (target->bt_bdev_awu_max < fa->fsx_extsize)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int
 xfs_ioctl_setattr_xflags(
 	struct xfs_trans	*tp,
@@ -511,9 +554,12 @@  xfs_ioctl_setattr_xflags(
 	struct xfs_mount	*mp = ip->i_mount;
 	bool			rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
 	bool			forcealign = fa->fsx_xflags & FS_XFLAG_FORCEALIGN;
+	bool			atomic_writes;
 	uint64_t		i_flags2;
 	int			error;
 
+	atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES;
+
 	/* Can't change RT or forcealign flags if any extents are allocated. */
 	if (rtflag != XFS_IS_REALTIME_INODE(ip) ||
 	    forcealign != xfs_inode_has_forcealign(ip)) {
@@ -554,6 +600,12 @@  xfs_ioctl_setattr_xflags(
 			return error;
 	}
 
+	if (atomic_writes) {
+		error = xfs_ioctl_setattr_atomicwrites(ip, fa);
+		if (error)
+			return error;
+	}
+
 	ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
 	ip->i_diflags2 = i_flags2;
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 30228fea908d..0c5a3ae3cdaf 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -300,6 +300,7 @@  typedef struct xfs_mount {
 #define XFS_FEAT_NREXT64	(1ULL << 26)	/* large extent counters */
 #define XFS_FEAT_EXCHANGE_RANGE	(1ULL << 27)	/* exchange range */
 #define XFS_FEAT_FORCEALIGN	(1ULL << 28)	/* aligned file data extents */
+#define XFS_FEAT_ATOMICWRITES	(1ULL << 29)	/* atomic writes support */
 
 /* Mount features */
 #define XFS_FEAT_NOATTR2	(1ULL << 48)	/* disable attr2 creation */
@@ -387,6 +388,7 @@  __XFS_HAS_V4_FEAT(v3inodes, V3INODES)
 __XFS_HAS_V4_FEAT(crc, CRC)
 __XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
 __XFS_HAS_FEAT(forcealign, FORCEALIGN)
+__XFS_HAS_FEAT(atomicwrites, ATOMICWRITES)
 
 /*
  * Mount features
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b52a01b50387..5352b90b2bb6 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1721,6 +1721,18 @@  xfs_fs_fill_super(
 		mp->m_features &= ~XFS_FEAT_DISCARD;
 	}
 
+	if (xfs_has_atomicwrites(mp)) {
+		if (!xfs_has_forcealign(mp)) {
+			xfs_alert(mp,
+	"forcealign required for atomicwrites!");
+			error = -EINVAL;
+			goto out_filestream_unmount;
+		}
+
+		xfs_warn(mp,
+"EXPERIMENTAL atomicwrites feature in use. Use at your own risk!");
+	}
+
 	if (xfs_has_reflink(mp)) {
 		if (mp->m_sb.sb_rblocks) {
 			xfs_alert(mp,
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index f55d650f904a..c416f549e94d 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -160,6 +160,7 @@  struct fsxattr {
 #define FS_XFLAG_COWEXTSIZE	0x00010000	/* CoW extent size allocator hint */
 /* data extent mappings for regular files must be aligned to extent size hint */
 #define FS_XFLAG_FORCEALIGN	0x00020000
+#define FS_XFLAG_ATOMICWRITES	0x00040000	/* atomic writes enabled */
 #define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
 
 /* the read-only stuff doesn't really belong here, but any other place is