diff mbox series

[v6,6/9] nfsd: use the getattr operation to fetch i_version

Message ID 20220930111840.10695-7-jlayton@kernel.org (mailing list archive)
State New, archived
Headers show
Series vfs/nfsd: clean up handling of i_version counter | expand

Commit Message

Jeff Layton Sept. 30, 2022, 11:18 a.m. UTC
Now that we can call into vfs_getattr to get the i_version field, use
that facility to fetch it instead of doing it in nfsd4_change_attribute.

Neil also pointed out recently that IS_I_VERSION directory operations
are always logged, and so we only need to mitigate the rollback problem
on regular files. Also, we don't need to factor in the ctime when
reexporting NFS or Ceph.

Set the STATX_VERSION (and BTIME) bits in the request when we're dealing
with a v4 request. Then, instead of looking at IS_I_VERSION when
generating the change attr, look at the result mask and only use it if
STATX_VERSION is set. With this change, we can drop the fetch_iversion
export operation as well.

Move nfsd4_change_attribute into nfsfh.c, and change it to only factor
in the ctime if it's a regular file and the fs doesn't advertise
STATX_ATTR_VERSION_MONOTONIC.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfs/export.c          |  7 -------
 fs/nfsd/nfs4xdr.c        |  4 +++-
 fs/nfsd/nfsfh.c          | 40 ++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfsfh.h          | 29 +----------------------------
 fs/nfsd/vfs.h            |  7 ++++++-
 include/linux/exportfs.h |  1 -
 6 files changed, 50 insertions(+), 38 deletions(-)

Comments

Chuck Lever III Sept. 30, 2022, 2:34 p.m. UTC | #1
> On Sep 30, 2022, at 7:18 AM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> Now that we can call into vfs_getattr to get the i_version field, use
> that facility to fetch it instead of doing it in nfsd4_change_attribute.
> 
> Neil also pointed out recently that IS_I_VERSION directory operations
> are always logged,

^logged^synchronous maybe?

> and so we only need to mitigate the rollback problem
> on regular files. Also, we don't need to factor in the ctime when
> reexporting NFS or Ceph.
> 
> Set the STATX_VERSION (and BTIME) bits in the request when we're dealing
> with a v4 request. Then, instead of looking at IS_I_VERSION when
> generating the change attr, look at the result mask and only use it if
> STATX_VERSION is set. With this change, we can drop the fetch_iversion
> export operation as well.
> 
> Move nfsd4_change_attribute into nfsfh.c, and change it to only factor
> in the ctime if it's a regular file and the fs doesn't advertise
> STATX_ATTR_VERSION_MONOTONIC.

This patch is doing some heavy lifting. I'd prefer to see it split
up to improve bisectability:

1. Move nfsd4_change_attribute() to fs/nfsd/nfsfh.c -- no functional changes

2. Change nfsd4_change_attribute() and vfs_getattr() call sites

3. Eliminate .fetch_iversion


> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
> fs/nfs/export.c          |  7 -------
> fs/nfsd/nfs4xdr.c        |  4 +++-
> fs/nfsd/nfsfh.c          | 40 ++++++++++++++++++++++++++++++++++++++++
> fs/nfsd/nfsfh.h          | 29 +----------------------------
> fs/nfsd/vfs.h            |  7 ++++++-
> include/linux/exportfs.h |  1 -
> 6 files changed, 50 insertions(+), 38 deletions(-)
> 
> diff --git a/fs/nfs/export.c b/fs/nfs/export.c
> index 01596f2d0a1e..1a9d5aa51dfb 100644
> --- a/fs/nfs/export.c
> +++ b/fs/nfs/export.c
> @@ -145,17 +145,10 @@ nfs_get_parent(struct dentry *dentry)
> 	return parent;
> }
> 
> -static u64 nfs_fetch_iversion(struct inode *inode)
> -{
> -	nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
> -	return inode_peek_iversion_raw(inode);
> -}
> -
> const struct export_operations nfs_export_ops = {
> 	.encode_fh = nfs_encode_fh,
> 	.fh_to_dentry = nfs_fh_to_dentry,
> 	.get_parent = nfs_get_parent,
> -	.fetch_iversion = nfs_fetch_iversion,
> 	.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
> 		EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
> 		EXPORT_OP_NOATOMIC_ATTR,
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 1e9690a061ec..779c009314c6 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2869,7 +2869,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
> 			goto out;
> 	}
> 
> -	err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
> +	err = vfs_getattr(&path, &stat,
> +			  STATX_BASIC_STATS | STATX_BTIME | STATX_VERSION,
> +			  AT_STATX_SYNC_AS_STAT);
> 	if (err)
> 		goto out_nfserr;
> 	if (!(stat.result_mask & STATX_BTIME))
> diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> index a5b71526cee0..9168bc657378 100644
> --- a/fs/nfsd/nfsfh.c
> +++ b/fs/nfsd/nfsfh.c
> @@ -634,6 +634,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
> 		stat.mtime = inode->i_mtime;
> 		stat.ctime = inode->i_ctime;
> 		stat.size  = inode->i_size;
> +		if (v4 && IS_I_VERSION(inode)) {
> +			stat.version = inode_query_iversion(inode);
> +			stat.result_mask |= STATX_VERSION;
> +		}
> 	}
> 	if (v4)
> 		fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
> @@ -665,6 +669,8 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
> 	if (err) {
> 		fhp->fh_post_saved = false;
> 		fhp->fh_post_attr.ctime = inode->i_ctime;
> +		if (v4 && IS_I_VERSION(inode))
> +			fhp->fh_post_attr.version = inode_query_iversion(inode);

Since fh_fill_post_attrs() calls nfsd4_change_attribute() as its
next step, don't you need to set STATX_VERSION here too?

I kind of hate the way we're handling both NFSv3 and NFSv4 post_attr
here, it's pretty difficult to reason about.


> 	} else
> 		fhp->fh_post_saved = true;
> 	if (v4)
> @@ -754,3 +760,37 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
> 		return FSIDSOURCE_UUID;
> 	return FSIDSOURCE_DEV;
> }
> +
> +/*
> + * We could use i_version alone as the change attribute.  However, i_version
> + * can go backwards on a regular file after an unclean shutdown.  On its own
> + * that doesn't necessarily cause a problem, but if i_version goes backwards
> + * and then is incremented again it could reuse a value that was previously
> + * used before boot, and a client who queried the two values might incorrectly
> + * assume nothing changed.
> + *
> + * By using both ctime and the i_version counter we guarantee that as long as
> + * time doesn't go backwards we never reuse an old value. If the filesystem
> + * advertises STATX_ATTR_VERSION_MONOTONIC, then this mitigation is not needed.
> + *
> + * We only need to do this for regular files as well. For directories, we
> + * assume that the new change attr is always logged to stable storage in some
> + * fashion before the results can be seen.
> + */

Let's make this a kdoc-style comment. Thanks!


> +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)

I'd prefer to use "const" pointers for both function parameters.


> +{
> +	u64 chattr;
> +
> +	if (stat->result_mask & STATX_VERSION) {
> +		chattr = stat->version;
> +
> +		if (S_ISREG(inode->i_mode) &&
> +		    !(stat->attributes & STATX_ATTR_VERSION_MONOTONIC)) {
> +			chattr += (u64)stat->ctime.tv_sec << 30;
> +			chattr += stat->ctime.tv_nsec;
> +		}
> +	} else {
> +		chattr = time_to_chattr(&stat->ctime);
> +	}
> +	return chattr;
> +}
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index c3ae6414fc5c..4c223a7a91d4 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -291,34 +291,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
> 	fhp->fh_pre_saved = false;
> }
> 
> -/*
> - * We could use i_version alone as the change attribute.  However,
> - * i_version can go backwards after a reboot.  On its own that doesn't
> - * necessarily cause a problem, but if i_version goes backwards and then
> - * is incremented again it could reuse a value that was previously used
> - * before boot, and a client who queried the two values might
> - * incorrectly assume nothing changed.
> - *
> - * By using both ctime and the i_version counter we guarantee that as
> - * long as time doesn't go backwards we never reuse an old value.
> - */
> -static inline u64 nfsd4_change_attribute(struct kstat *stat,
> -					 struct inode *inode)
> -{
> -	if (inode->i_sb->s_export_op->fetch_iversion)
> -		return inode->i_sb->s_export_op->fetch_iversion(inode);
> -	else if (IS_I_VERSION(inode)) {
> -		u64 chattr;
> -
> -		chattr =  stat->ctime.tv_sec;
> -		chattr <<= 30;
> -		chattr += stat->ctime.tv_nsec;
> -		chattr += inode_query_iversion(inode);
> -		return chattr;
> -	} else
> -		return time_to_chattr(&stat->ctime);
> -}
> -
> +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
> extern void fh_fill_pre_attrs(struct svc_fh *fhp);
> extern void fh_fill_post_attrs(struct svc_fh *fhp);
> extern void fh_fill_both_attrs(struct svc_fh *fhp);
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index c95cd414b4bb..a905f59481ee 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -168,9 +168,14 @@ static inline void fh_drop_write(struct svc_fh *fh)
> 
> static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
> {
> +	u32 request_mask = STATX_BASIC_STATS;

Reverse christmas tree, please.


> 	struct path p = {.mnt = fh->fh_export->ex_path.mnt,
> 			 .dentry = fh->fh_dentry};
> -	return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS,
> +
> +	if (fh->fh_maxsize == NFS4_FHSIZE)

Would it hurt to set BTIME and VERSION unconditionally?


> +		request_mask |= (STATX_BTIME | STATX_VERSION);
> +
> +	return nfserrno(vfs_getattr(&p, stat, request_mask,
> 				    AT_STATX_SYNC_AS_STAT));
> }
> 
> diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
> index fe848901fcc3..9f4d4bcbf251 100644
> --- a/include/linux/exportfs.h
> +++ b/include/linux/exportfs.h
> @@ -213,7 +213,6 @@ struct export_operations {
> 			  bool write, u32 *device_generation);
> 	int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
> 			     int nr_iomaps, struct iattr *iattr);
> -	u64 (*fetch_iversion)(struct inode *);
> #define	EXPORT_OP_NOWCC			(0x1) /* don't collect v3 wcc data */
> #define	EXPORT_OP_NOSUBTREECHK		(0x2) /* no subtree checking */
> #define	EXPORT_OP_CLOSE_BEFORE_UNLINK	(0x4) /* close files before unlink */
> -- 
> 2.37.3
> 

--
Chuck Lever
Dave Chinner Sept. 30, 2022, 10:32 p.m. UTC | #2
On Fri, Sep 30, 2022 at 02:34:51PM +0000, Chuck Lever III wrote:
> 
> 
> > On Sep 30, 2022, at 7:18 AM, Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > Now that we can call into vfs_getattr to get the i_version field, use
> > that facility to fetch it instead of doing it in nfsd4_change_attribute.
> > 
> > Neil also pointed out recently that IS_I_VERSION directory operations
> > are always logged,
> 
> ^logged^synchronous maybe?

A pedantic note, but I think necessary because so many people still
get this wrong when it comes to filesystems and IO: synchronous !=
persistent.

Ext4 and XFS both use *asynchronous journalling* - they journal
changes first to memory buffers, and only make those recorded
changes persistent when they hit internal checkpoint thresholds or
something external requires persistence to be guaranteed.

->commit_metadata is the operation filesystems provide the NFS
server to *guarantee persistence*. This allows filesystems to use
asynchronous journalling for most operations, right up to the point
the NFS server requires a change to be persistent. "synchronous
operation" is a side effect of guaranteeing persistence on some
filesytems and storage media, whereas "synchronous operation"
does not provide any guarantee of persistence...

IOWs, please talk about persistence guarantees the NFS server
application requires and implements, not about the operations (or
the nature of the operations) that may be performed by the
underlying filesystems to provide that persistence guarantee.

Cheers,

Dave.
NeilBrown Oct. 3, 2022, 11:39 p.m. UTC | #3
On Fri, 30 Sep 2022, Jeff Layton wrote:
> Now that we can call into vfs_getattr to get the i_version field, use
> that facility to fetch it instead of doing it in nfsd4_change_attribute.
> 
> Neil also pointed out recently that IS_I_VERSION directory operations
> are always logged, and so we only need to mitigate the rollback problem
> on regular files. Also, we don't need to factor in the ctime when
> reexporting NFS or Ceph.
> 
> Set the STATX_VERSION (and BTIME) bits in the request when we're dealing
> with a v4 request. Then, instead of looking at IS_I_VERSION when
> generating the change attr, look at the result mask and only use it if
> STATX_VERSION is set. With this change, we can drop the fetch_iversion
> export operation as well.
> 
> Move nfsd4_change_attribute into nfsfh.c, and change it to only factor
> in the ctime if it's a regular file and the fs doesn't advertise
> STATX_ATTR_VERSION_MONOTONIC.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfs/export.c          |  7 -------
>  fs/nfsd/nfs4xdr.c        |  4 +++-
>  fs/nfsd/nfsfh.c          | 40 ++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfsfh.h          | 29 +----------------------------
>  fs/nfsd/vfs.h            |  7 ++++++-
>  include/linux/exportfs.h |  1 -
>  6 files changed, 50 insertions(+), 38 deletions(-)
> 
> diff --git a/fs/nfs/export.c b/fs/nfs/export.c
> index 01596f2d0a1e..1a9d5aa51dfb 100644
> --- a/fs/nfs/export.c
> +++ b/fs/nfs/export.c
> @@ -145,17 +145,10 @@ nfs_get_parent(struct dentry *dentry)
>  	return parent;
>  }
>  
> -static u64 nfs_fetch_iversion(struct inode *inode)
> -{
> -	nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
> -	return inode_peek_iversion_raw(inode);
> -}
> -
>  const struct export_operations nfs_export_ops = {
>  	.encode_fh = nfs_encode_fh,
>  	.fh_to_dentry = nfs_fh_to_dentry,
>  	.get_parent = nfs_get_parent,
> -	.fetch_iversion = nfs_fetch_iversion,
>  	.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
>  		EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
>  		EXPORT_OP_NOATOMIC_ATTR,
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 1e9690a061ec..779c009314c6 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2869,7 +2869,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
>  			goto out;
>  	}
>  
> -	err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
> +	err = vfs_getattr(&path, &stat,
> +			  STATX_BASIC_STATS | STATX_BTIME | STATX_VERSION,
> +			  AT_STATX_SYNC_AS_STAT);
>  	if (err)
>  		goto out_nfserr;
>  	if (!(stat.result_mask & STATX_BTIME))
> diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> index a5b71526cee0..9168bc657378 100644
> --- a/fs/nfsd/nfsfh.c
> +++ b/fs/nfsd/nfsfh.c
> @@ -634,6 +634,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
>  		stat.mtime = inode->i_mtime;
>  		stat.ctime = inode->i_ctime;
>  		stat.size  = inode->i_size;
> +		if (v4 && IS_I_VERSION(inode)) {
> +			stat.version = inode_query_iversion(inode);
> +			stat.result_mask |= STATX_VERSION;
> +		}

This is increasingly ugly.  I wonder if it is justified at all...

>  	}
>  	if (v4)
>  		fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
> @@ -665,6 +669,8 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
>  	if (err) {
>  		fhp->fh_post_saved = false;
>  		fhp->fh_post_attr.ctime = inode->i_ctime;
> +		if (v4 && IS_I_VERSION(inode))
> +			fhp->fh_post_attr.version = inode_query_iversion(inode);

... ditto ...

>  	} else
>  		fhp->fh_post_saved = true;
>  	if (v4)
> @@ -754,3 +760,37 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
>  		return FSIDSOURCE_UUID;
>  	return FSIDSOURCE_DEV;
>  }
> +
> +/*
> + * We could use i_version alone as the change attribute.  However, i_version
> + * can go backwards on a regular file after an unclean shutdown.  On its own
> + * that doesn't necessarily cause a problem, but if i_version goes backwards
> + * and then is incremented again it could reuse a value that was previously
> + * used before boot, and a client who queried the two values might incorrectly
> + * assume nothing changed.
> + *
> + * By using both ctime and the i_version counter we guarantee that as long as
> + * time doesn't go backwards we never reuse an old value. If the filesystem
> + * advertises STATX_ATTR_VERSION_MONOTONIC, then this mitigation is not needed.
> + *
> + * We only need to do this for regular files as well. For directories, we
> + * assume that the new change attr is always logged to stable storage in some
> + * fashion before the results can be seen.
> + */
> +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
> +{
> +	u64 chattr;
> +
> +	if (stat->result_mask & STATX_VERSION) {
> +		chattr = stat->version;
> +
> +		if (S_ISREG(inode->i_mode) &&
> +		    !(stat->attributes & STATX_ATTR_VERSION_MONOTONIC)) {

I would really rather that the fs got to make this decision.
If it can guarantee that the i_version is monotonic even over a crash
(which is probably can for directory, and might need changes to do for
files) then it sets STATX_ATTR_VERSION_MONOTONIC and nfsd trusts it
completely.
If it cannot, then it doesn't set the flag.
i.e. the S_ISREG() test should be in the filesystem, not in nfsd.


> +			chattr += (u64)stat->ctime.tv_sec << 30;
> +			chattr += stat->ctime.tv_nsec;
> +		}
> +	} else {
> +		chattr = time_to_chattr(&stat->ctime);
> +	}
> +	return chattr;
> +}
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index c3ae6414fc5c..4c223a7a91d4 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -291,34 +291,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
>  	fhp->fh_pre_saved = false;
>  }
>  
> -/*
> - * We could use i_version alone as the change attribute.  However,
> - * i_version can go backwards after a reboot.  On its own that doesn't
> - * necessarily cause a problem, but if i_version goes backwards and then
> - * is incremented again it could reuse a value that was previously used
> - * before boot, and a client who queried the two values might
> - * incorrectly assume nothing changed.
> - *
> - * By using both ctime and the i_version counter we guarantee that as
> - * long as time doesn't go backwards we never reuse an old value.
> - */
> -static inline u64 nfsd4_change_attribute(struct kstat *stat,
> -					 struct inode *inode)
> -{
> -	if (inode->i_sb->s_export_op->fetch_iversion)
> -		return inode->i_sb->s_export_op->fetch_iversion(inode);
> -	else if (IS_I_VERSION(inode)) {
> -		u64 chattr;
> -
> -		chattr =  stat->ctime.tv_sec;
> -		chattr <<= 30;
> -		chattr += stat->ctime.tv_nsec;
> -		chattr += inode_query_iversion(inode);
> -		return chattr;
> -	} else
> -		return time_to_chattr(&stat->ctime);
> -}
> -
> +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
>  extern void fh_fill_pre_attrs(struct svc_fh *fhp);
>  extern void fh_fill_post_attrs(struct svc_fh *fhp);
>  extern void fh_fill_both_attrs(struct svc_fh *fhp);
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index c95cd414b4bb..a905f59481ee 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -168,9 +168,14 @@ static inline void fh_drop_write(struct svc_fh *fh)
>  
>  static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
>  {
> +	u32 request_mask = STATX_BASIC_STATS;
>  	struct path p = {.mnt = fh->fh_export->ex_path.mnt,
>  			 .dentry = fh->fh_dentry};
> -	return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS,
> +
> +	if (fh->fh_maxsize == NFS4_FHSIZE)
> +		request_mask |= (STATX_BTIME | STATX_VERSION);
> +
> +	return nfserrno(vfs_getattr(&p, stat, request_mask,
>  				    AT_STATX_SYNC_AS_STAT));
>  }
>  
> diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
> index fe848901fcc3..9f4d4bcbf251 100644
> --- a/include/linux/exportfs.h
> +++ b/include/linux/exportfs.h
> @@ -213,7 +213,6 @@ struct export_operations {
>  			  bool write, u32 *device_generation);
>  	int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
>  			     int nr_iomaps, struct iattr *iattr);
> -	u64 (*fetch_iversion)(struct inode *);
>  #define	EXPORT_OP_NOWCC			(0x1) /* don't collect v3 wcc data */
>  #define	EXPORT_OP_NOSUBTREECHK		(0x2) /* no subtree checking */
>  #define	EXPORT_OP_CLOSE_BEFORE_UNLINK	(0x4) /* close files before unlink */
> -- 
> 2.37.3
> 
> 

Definitely more to like than to dislike here, so

Reviewed-by: NeilBrown <neilb@suse.de>

Thanks,
NeilBrown
Jeff Layton Oct. 5, 2022, 10:06 a.m. UTC | #4
On Tue, 2022-10-04 at 10:39 +1100, NeilBrown wrote:
> On Fri, 30 Sep 2022, Jeff Layton wrote:
> > Now that we can call into vfs_getattr to get the i_version field, use
> > that facility to fetch it instead of doing it in nfsd4_change_attribute.
> > 
> > Neil also pointed out recently that IS_I_VERSION directory operations
> > are always logged, and so we only need to mitigate the rollback problem
> > on regular files. Also, we don't need to factor in the ctime when
> > reexporting NFS or Ceph.
> > 
> > Set the STATX_VERSION (and BTIME) bits in the request when we're dealing
> > with a v4 request. Then, instead of looking at IS_I_VERSION when
> > generating the change attr, look at the result mask and only use it if
> > STATX_VERSION is set. With this change, we can drop the fetch_iversion
> > export operation as well.
> > 
> > Move nfsd4_change_attribute into nfsfh.c, and change it to only factor
> > in the ctime if it's a regular file and the fs doesn't advertise
> > STATX_ATTR_VERSION_MONOTONIC.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/nfs/export.c          |  7 -------
> >  fs/nfsd/nfs4xdr.c        |  4 +++-
> >  fs/nfsd/nfsfh.c          | 40 ++++++++++++++++++++++++++++++++++++++++
> >  fs/nfsd/nfsfh.h          | 29 +----------------------------
> >  fs/nfsd/vfs.h            |  7 ++++++-
> >  include/linux/exportfs.h |  1 -
> >  6 files changed, 50 insertions(+), 38 deletions(-)
> > 
> > diff --git a/fs/nfs/export.c b/fs/nfs/export.c
> > index 01596f2d0a1e..1a9d5aa51dfb 100644
> > --- a/fs/nfs/export.c
> > +++ b/fs/nfs/export.c
> > @@ -145,17 +145,10 @@ nfs_get_parent(struct dentry *dentry)
> >  	return parent;
> >  }
> >  
> > -static u64 nfs_fetch_iversion(struct inode *inode)
> > -{
> > -	nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
> > -	return inode_peek_iversion_raw(inode);
> > -}
> > -
> >  const struct export_operations nfs_export_ops = {
> >  	.encode_fh = nfs_encode_fh,
> >  	.fh_to_dentry = nfs_fh_to_dentry,
> >  	.get_parent = nfs_get_parent,
> > -	.fetch_iversion = nfs_fetch_iversion,
> >  	.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
> >  		EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
> >  		EXPORT_OP_NOATOMIC_ATTR,
> > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > index 1e9690a061ec..779c009314c6 100644
> > --- a/fs/nfsd/nfs4xdr.c
> > +++ b/fs/nfsd/nfs4xdr.c
> > @@ -2869,7 +2869,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
> >  			goto out;
> >  	}
> >  
> > -	err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
> > +	err = vfs_getattr(&path, &stat,
> > +			  STATX_BASIC_STATS | STATX_BTIME | STATX_VERSION,
> > +			  AT_STATX_SYNC_AS_STAT);
> >  	if (err)
> >  		goto out_nfserr;
> >  	if (!(stat.result_mask & STATX_BTIME))
> > diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> > index a5b71526cee0..9168bc657378 100644
> > --- a/fs/nfsd/nfsfh.c
> > +++ b/fs/nfsd/nfsfh.c
> > @@ -634,6 +634,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
> >  		stat.mtime = inode->i_mtime;
> >  		stat.ctime = inode->i_ctime;
> >  		stat.size  = inode->i_size;
> > +		if (v4 && IS_I_VERSION(inode)) {
> > +			stat.version = inode_query_iversion(inode);
> > +			stat.result_mask |= STATX_VERSION;
> > +		}
> 
> This is increasingly ugly.  I wonder if it is justified at all...
> 

I'm fine with dropping that. So if the getattrs fail, we should just not
offer up pre/post attrs?

> >  	}
> >  	if (v4)
> >  		fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
> > @@ -665,6 +669,8 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
> >  	if (err) {
> >  		fhp->fh_post_saved = false;
> >  		fhp->fh_post_attr.ctime = inode->i_ctime;
> > +		if (v4 && IS_I_VERSION(inode))
> > +			fhp->fh_post_attr.version = inode_query_iversion(inode);
> 
> ... ditto ...
> 
> >  	} else
> >  		fhp->fh_post_saved = true;
> >  	if (v4)
> > @@ -754,3 +760,37 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
> >  		return FSIDSOURCE_UUID;
> >  	return FSIDSOURCE_DEV;
> >  }
> > +
> > +/*
> > + * We could use i_version alone as the change attribute.  However, i_version
> > + * can go backwards on a regular file after an unclean shutdown.  On its own
> > + * that doesn't necessarily cause a problem, but if i_version goes backwards
> > + * and then is incremented again it could reuse a value that was previously
> > + * used before boot, and a client who queried the two values might incorrectly
> > + * assume nothing changed.
> > + *
> > + * By using both ctime and the i_version counter we guarantee that as long as
> > + * time doesn't go backwards we never reuse an old value. If the filesystem
> > + * advertises STATX_ATTR_VERSION_MONOTONIC, then this mitigation is not needed.
> > + *
> > + * We only need to do this for regular files as well. For directories, we
> > + * assume that the new change attr is always logged to stable storage in some
> > + * fashion before the results can be seen.
> > + */
> > +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
> > +{
> > +	u64 chattr;
> > +
> > +	if (stat->result_mask & STATX_VERSION) {
> > +		chattr = stat->version;
> > +
> > +		if (S_ISREG(inode->i_mode) &&
> > +		    !(stat->attributes & STATX_ATTR_VERSION_MONOTONIC)) {
> 
> I would really rather that the fs got to make this decision.
> If it can guarantee that the i_version is monotonic even over a crash
> (which is probably can for directory, and might need changes to do for
> files) then it sets STATX_ATTR_VERSION_MONOTONIC and nfsd trusts it
> completely.
> If it cannot, then it doesn't set the flag.
> i.e. the S_ISREG() test should be in the filesystem, not in nfsd.
> 

This sounds reasonable, but for one thing.

From RFC 7862:

   While Section 5.4 of [RFC5661] discusses
   per-file system attributes, it is expected that the value of
   change_attr_type will not depend on the value of "homogeneous" and
   will only change in the event of a migration.

The change_attr_type4 must be the same for all filehandles under a
particular filesystem.

If we do what you suggest though, then it's easily possible for the fs
to set STATX_ATTR_VERSION_MONOTONIC on directories but not files. If we
later want to allow nfsd to advertise a change_attr_type4, we won't be
able to rely on the STATX_ATTR_VERSION_MONOTONIC to tell us how to fill
that out.

Maybe that's ok. I suppose we could add a new field to the export
options that filesystems can set to advertise what sort of change attr
they offer?

> 
> > +			chattr += (u64)stat->ctime.tv_sec << 30;
> > +			chattr += stat->ctime.tv_nsec;
> > +		}
> > +	} else {
> > +		chattr = time_to_chattr(&stat->ctime);
> > +	}
> > +	return chattr;
> > +}
> > diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> > index c3ae6414fc5c..4c223a7a91d4 100644
> > --- a/fs/nfsd/nfsfh.h
> > +++ b/fs/nfsd/nfsfh.h
> > @@ -291,34 +291,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
> >  	fhp->fh_pre_saved = false;
> >  }
> >  
> > -/*
> > - * We could use i_version alone as the change attribute.  However,
> > - * i_version can go backwards after a reboot.  On its own that doesn't
> > - * necessarily cause a problem, but if i_version goes backwards and then
> > - * is incremented again it could reuse a value that was previously used
> > - * before boot, and a client who queried the two values might
> > - * incorrectly assume nothing changed.
> > - *
> > - * By using both ctime and the i_version counter we guarantee that as
> > - * long as time doesn't go backwards we never reuse an old value.
> > - */
> > -static inline u64 nfsd4_change_attribute(struct kstat *stat,
> > -					 struct inode *inode)
> > -{
> > -	if (inode->i_sb->s_export_op->fetch_iversion)
> > -		return inode->i_sb->s_export_op->fetch_iversion(inode);
> > -	else if (IS_I_VERSION(inode)) {
> > -		u64 chattr;
> > -
> > -		chattr =  stat->ctime.tv_sec;
> > -		chattr <<= 30;
> > -		chattr += stat->ctime.tv_nsec;
> > -		chattr += inode_query_iversion(inode);
> > -		return chattr;
> > -	} else
> > -		return time_to_chattr(&stat->ctime);
> > -}
> > -
> > +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
> >  extern void fh_fill_pre_attrs(struct svc_fh *fhp);
> >  extern void fh_fill_post_attrs(struct svc_fh *fhp);
> >  extern void fh_fill_both_attrs(struct svc_fh *fhp);
> > diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> > index c95cd414b4bb..a905f59481ee 100644
> > --- a/fs/nfsd/vfs.h
> > +++ b/fs/nfsd/vfs.h
> > @@ -168,9 +168,14 @@ static inline void fh_drop_write(struct svc_fh *fh)
> >  
> >  static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
> >  {
> > +	u32 request_mask = STATX_BASIC_STATS;
> >  	struct path p = {.mnt = fh->fh_export->ex_path.mnt,
> >  			 .dentry = fh->fh_dentry};
> > -	return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS,
> > +
> > +	if (fh->fh_maxsize == NFS4_FHSIZE)
> > +		request_mask |= (STATX_BTIME | STATX_VERSION);
> > +
> > +	return nfserrno(vfs_getattr(&p, stat, request_mask,
> >  				    AT_STATX_SYNC_AS_STAT));
> >  }
> >  
> > diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
> > index fe848901fcc3..9f4d4bcbf251 100644
> > --- a/include/linux/exportfs.h
> > +++ b/include/linux/exportfs.h
> > @@ -213,7 +213,6 @@ struct export_operations {
> >  			  bool write, u32 *device_generation);
> >  	int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
> >  			     int nr_iomaps, struct iattr *iattr);
> > -	u64 (*fetch_iversion)(struct inode *);
> >  #define	EXPORT_OP_NOWCC			(0x1) /* don't collect v3 wcc data */
> >  #define	EXPORT_OP_NOSUBTREECHK		(0x2) /* no subtree checking */
> >  #define	EXPORT_OP_CLOSE_BEFORE_UNLINK	(0x4) /* close files before unlink */
> > -- 
> > 2.37.3
> > 
> > 
> 
> Definitely more to like than to dislike here, so
> 
> Reviewed-by: NeilBrown <neilb@suse.de>
> 
> Thanks,
> NeilBrown
Chuck Lever III Oct. 5, 2022, 1:33 p.m. UTC | #5
> On Oct 5, 2022, at 6:06 AM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> On Tue, 2022-10-04 at 10:39 +1100, NeilBrown wrote:
>> On Fri, 30 Sep 2022, Jeff Layton wrote:
>>> 
>>> diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
>>> index a5b71526cee0..9168bc657378 100644
>>> --- a/fs/nfsd/nfsfh.c
>>> +++ b/fs/nfsd/nfsfh.c
>>> @@ -634,6 +634,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
>>> 		stat.mtime = inode->i_mtime;
>>> 		stat.ctime = inode->i_ctime;
>>> 		stat.size  = inode->i_size;
>>> +		if (v4 && IS_I_VERSION(inode)) {
>>> +			stat.version = inode_query_iversion(inode);
>>> +			stat.result_mask |= STATX_VERSION;
>>> +		}
>> 
>> This is increasingly ugly.  I wonder if it is justified at all...
>> 
> 
> I'm fine with dropping that. So if the getattrs fail, we should just not
> offer up pre/post attrs?

That sounds good to me.


--
Chuck Lever
Trond Myklebust Oct. 5, 2022, 1:34 p.m. UTC | #6
On Wed, 2022-10-05 at 06:06 -0400, Jeff Layton wrote:
> On Tue, 2022-10-04 at 10:39 +1100, NeilBrown wrote:
> > On Fri, 30 Sep 2022, Jeff Layton wrote:
> > > Now that we can call into vfs_getattr to get the i_version field,
> > > use
> > > that facility to fetch it instead of doing it in
> > > nfsd4_change_attribute.
> > > 
> > > Neil also pointed out recently that IS_I_VERSION directory
> > > operations
> > > are always logged, and so we only need to mitigate the rollback
> > > problem
> > > on regular files. Also, we don't need to factor in the ctime when
> > > reexporting NFS or Ceph.
> > > 
> > > Set the STATX_VERSION (and BTIME) bits in the request when we're
> > > dealing
> > > with a v4 request. Then, instead of looking at IS_I_VERSION when
> > > generating the change attr, look at the result mask and only use
> > > it if
> > > STATX_VERSION is set. With this change, we can drop the
> > > fetch_iversion
> > > export operation as well.
> > > 
> > > Move nfsd4_change_attribute into nfsfh.c, and change it to only
> > > factor
> > > in the ctime if it's a regular file and the fs doesn't advertise
> > > STATX_ATTR_VERSION_MONOTONIC.
> > > 
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/nfs/export.c          |  7 -------
> > >  fs/nfsd/nfs4xdr.c        |  4 +++-
> > >  fs/nfsd/nfsfh.c          | 40
> > > ++++++++++++++++++++++++++++++++++++++++
> > >  fs/nfsd/nfsfh.h          | 29 +----------------------------
> > >  fs/nfsd/vfs.h            |  7 ++++++-
> > >  include/linux/exportfs.h |  1 -
> > >  6 files changed, 50 insertions(+), 38 deletions(-)
> > > 
> > > diff --git a/fs/nfs/export.c b/fs/nfs/export.c
> > > index 01596f2d0a1e..1a9d5aa51dfb 100644
> > > --- a/fs/nfs/export.c
> > > +++ b/fs/nfs/export.c
> > > @@ -145,17 +145,10 @@ nfs_get_parent(struct dentry *dentry)
> > >         return parent;
> > >  }
> > >  
> > > -static u64 nfs_fetch_iversion(struct inode *inode)
> > > -{
> > > -       nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
> > > -       return inode_peek_iversion_raw(inode);
> > > -}
> > > -
> > >  const struct export_operations nfs_export_ops = {
> > >         .encode_fh = nfs_encode_fh,
> > >         .fh_to_dentry = nfs_fh_to_dentry,
> > >         .get_parent = nfs_get_parent,
> > > -       .fetch_iversion = nfs_fetch_iversion,
> > >         .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
> > >                 EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS
> > > |
> > >                 EXPORT_OP_NOATOMIC_ATTR,
> > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > index 1e9690a061ec..779c009314c6 100644
> > > --- a/fs/nfsd/nfs4xdr.c
> > > +++ b/fs/nfsd/nfs4xdr.c
> > > @@ -2869,7 +2869,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr,
> > > struct svc_fh *fhp,
> > >                         goto out;
> > >         }
> > >  
> > > -       err = vfs_getattr(&path, &stat, STATX_BASIC_STATS,
> > > AT_STATX_SYNC_AS_STAT);
> > > +       err = vfs_getattr(&path, &stat,
> > > +                         STATX_BASIC_STATS | STATX_BTIME |
> > > STATX_VERSION,
> > > +                         AT_STATX_SYNC_AS_STAT);
> > >         if (err)
> > >                 goto out_nfserr;
> > >         if (!(stat.result_mask & STATX_BTIME))
> > > diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> > > index a5b71526cee0..9168bc657378 100644
> > > --- a/fs/nfsd/nfsfh.c
> > > +++ b/fs/nfsd/nfsfh.c
> > > @@ -634,6 +634,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
> > >                 stat.mtime = inode->i_mtime;
> > >                 stat.ctime = inode->i_ctime;
> > >                 stat.size  = inode->i_size;
> > > +               if (v4 && IS_I_VERSION(inode)) {
> > > +                       stat.version =
> > > inode_query_iversion(inode);
> > > +                       stat.result_mask |= STATX_VERSION;
> > > +               }
> > 
> > This is increasingly ugly.  I wonder if it is justified at all...
> > 
> 
> I'm fine with dropping that. So if the getattrs fail, we should just
> not
> offer up pre/post attrs?
> 
> > >         }
> > >         if (v4)
> > >                 fhp->fh_pre_change =
> > > nfsd4_change_attribute(&stat, inode);
> > > @@ -665,6 +669,8 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
> > >         if (err) {
> > >                 fhp->fh_post_saved = false;
> > >                 fhp->fh_post_attr.ctime = inode->i_ctime;
> > > +               if (v4 && IS_I_VERSION(inode))
> > > +                       fhp->fh_post_attr.version =
> > > inode_query_iversion(inode);
> > 
> > ... ditto ...
> > 
> > >         } else
> > >                 fhp->fh_post_saved = true;
> > >         if (v4)
> > > @@ -754,3 +760,37 @@ enum fsid_source fsid_source(const struct
> > > svc_fh *fhp)
> > >                 return FSIDSOURCE_UUID;
> > >         return FSIDSOURCE_DEV;
> > >  }
> > > +
> > > +/*
> > > + * We could use i_version alone as the change attribute. 
> > > However, i_version
> > > + * can go backwards on a regular file after an unclean
> > > shutdown.  On its own
> > > + * that doesn't necessarily cause a problem, but if i_version
> > > goes backwards
> > > + * and then is incremented again it could reuse a value that was
> > > previously
> > > + * used before boot, and a client who queried the two values
> > > might incorrectly
> > > + * assume nothing changed.
> > > + *
> > > + * By using both ctime and the i_version counter we guarantee
> > > that as long as
> > > + * time doesn't go backwards we never reuse an old value. If the
> > > filesystem
> > > + * advertises STATX_ATTR_VERSION_MONOTONIC, then this mitigation
> > > is not needed.
> > > + *
> > > + * We only need to do this for regular files as well. For
> > > directories, we
> > > + * assume that the new change attr is always logged to stable
> > > storage in some
> > > + * fashion before the results can be seen.
> > > + */
> > > +u64 nfsd4_change_attribute(struct kstat *stat, struct inode
> > > *inode)
> > > +{
> > > +       u64 chattr;
> > > +
> > > +       if (stat->result_mask & STATX_VERSION) {
> > > +               chattr = stat->version;
> > > +
> > > +               if (S_ISREG(inode->i_mode) &&
> > > +                   !(stat->attributes &
> > > STATX_ATTR_VERSION_MONOTONIC)) {
> > 
> > I would really rather that the fs got to make this decision.
> > If it can guarantee that the i_version is monotonic even over a
> > crash
> > (which is probably can for directory, and might need changes to do
> > for
> > files) then it sets STATX_ATTR_VERSION_MONOTONIC and nfsd trusts it
> > completely.
> > If it cannot, then it doesn't set the flag.
> > i.e. the S_ISREG() test should be in the filesystem, not in nfsd.
> > 
> 
> This sounds reasonable, but for one thing.
> 
> From RFC 7862:
> 
>    While Section 5.4 of [RFC5661] discusses
>    per-file system attributes, it is expected that the value of
>    change_attr_type will not depend on the value of "homogeneous" and
>    will only change in the event of a migration.
> 
> The change_attr_type4 must be the same for all filehandles under a
> particular filesystem.
> 
> If we do what you suggest though, then it's easily possible for the
> fs
> to set STATX_ATTR_VERSION_MONOTONIC on directories but not files. If
> we
> later want to allow nfsd to advertise a change_attr_type4, we won't
> be
> able to rely on the STATX_ATTR_VERSION_MONOTONIC to tell us how to
> fill
> that out.

That will break clients. So no, that's not acceptable.
Jeff Layton Oct. 5, 2022, 1:57 p.m. UTC | #7
On Wed, 2022-10-05 at 13:34 +0000, Trond Myklebust wrote:
> On Wed, 2022-10-05 at 06:06 -0400, Jeff Layton wrote:
> > On Tue, 2022-10-04 at 10:39 +1100, NeilBrown wrote:
> > > On Fri, 30 Sep 2022, Jeff Layton wrote:
> > > > Now that we can call into vfs_getattr to get the i_version field,
> > > > use
> > > > that facility to fetch it instead of doing it in
> > > > nfsd4_change_attribute.
> > > > 
> > > > Neil also pointed out recently that IS_I_VERSION directory
> > > > operations
> > > > are always logged, and so we only need to mitigate the rollback
> > > > problem
> > > > on regular files. Also, we don't need to factor in the ctime when
> > > > reexporting NFS or Ceph.
> > > > 
> > > > Set the STATX_VERSION (and BTIME) bits in the request when we're
> > > > dealing
> > > > with a v4 request. Then, instead of looking at IS_I_VERSION when
> > > > generating the change attr, look at the result mask and only use
> > > > it if
> > > > STATX_VERSION is set. With this change, we can drop the
> > > > fetch_iversion
> > > > export operation as well.
> > > > 
> > > > Move nfsd4_change_attribute into nfsfh.c, and change it to only
> > > > factor
> > > > in the ctime if it's a regular file and the fs doesn't advertise
> > > > STATX_ATTR_VERSION_MONOTONIC.
> > > > 
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > >  fs/nfs/export.c          |  7 -------
> > > >  fs/nfsd/nfs4xdr.c        |  4 +++-
> > > >  fs/nfsd/nfsfh.c          | 40
> > > > ++++++++++++++++++++++++++++++++++++++++
> > > >  fs/nfsd/nfsfh.h          | 29 +----------------------------
> > > >  fs/nfsd/vfs.h            |  7 ++++++-
> > > >  include/linux/exportfs.h |  1 -
> > > >  6 files changed, 50 insertions(+), 38 deletions(-)
> > > > 
> > > > diff --git a/fs/nfs/export.c b/fs/nfs/export.c
> > > > index 01596f2d0a1e..1a9d5aa51dfb 100644
> > > > --- a/fs/nfs/export.c
> > > > +++ b/fs/nfs/export.c
> > > > @@ -145,17 +145,10 @@ nfs_get_parent(struct dentry *dentry)
> > > >         return parent;
> > > >  }
> > > >  
> > > > -static u64 nfs_fetch_iversion(struct inode *inode)
> > > > -{
> > > > -       nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
> > > > -       return inode_peek_iversion_raw(inode);
> > > > -}
> > > > -
> > > >  const struct export_operations nfs_export_ops = {
> > > >         .encode_fh = nfs_encode_fh,
> > > >         .fh_to_dentry = nfs_fh_to_dentry,
> > > >         .get_parent = nfs_get_parent,
> > > > -       .fetch_iversion = nfs_fetch_iversion,
> > > >         .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
> > > >                 EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS
> > > > > 
> > > >                 EXPORT_OP_NOATOMIC_ATTR,
> > > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > > index 1e9690a061ec..779c009314c6 100644
> > > > --- a/fs/nfsd/nfs4xdr.c
> > > > +++ b/fs/nfsd/nfs4xdr.c
> > > > @@ -2869,7 +2869,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr,
> > > > struct svc_fh *fhp,
> > > >                         goto out;
> > > >         }
> > > >  
> > > > -       err = vfs_getattr(&path, &stat, STATX_BASIC_STATS,
> > > > AT_STATX_SYNC_AS_STAT);
> > > > +       err = vfs_getattr(&path, &stat,
> > > > +                         STATX_BASIC_STATS | STATX_BTIME |
> > > > STATX_VERSION,
> > > > +                         AT_STATX_SYNC_AS_STAT);
> > > >         if (err)
> > > >                 goto out_nfserr;
> > > >         if (!(stat.result_mask & STATX_BTIME))
> > > > diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> > > > index a5b71526cee0..9168bc657378 100644
> > > > --- a/fs/nfsd/nfsfh.c
> > > > +++ b/fs/nfsd/nfsfh.c
> > > > @@ -634,6 +634,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
> > > >                 stat.mtime = inode->i_mtime;
> > > >                 stat.ctime = inode->i_ctime;
> > > >                 stat.size  = inode->i_size;
> > > > +               if (v4 && IS_I_VERSION(inode)) {
> > > > +                       stat.version =
> > > > inode_query_iversion(inode);
> > > > +                       stat.result_mask |= STATX_VERSION;
> > > > +               }
> > > 
> > > This is increasingly ugly.  I wonder if it is justified at all...
> > > 
> > 
> > I'm fine with dropping that. So if the getattrs fail, we should just
> > not
> > offer up pre/post attrs?
> > 
> > > >         }
> > > >         if (v4)
> > > >                 fhp->fh_pre_change =
> > > > nfsd4_change_attribute(&stat, inode);
> > > > @@ -665,6 +669,8 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
> > > >         if (err) {
> > > >                 fhp->fh_post_saved = false;
> > > >                 fhp->fh_post_attr.ctime = inode->i_ctime;
> > > > +               if (v4 && IS_I_VERSION(inode))
> > > > +                       fhp->fh_post_attr.version =
> > > > inode_query_iversion(inode);
> > > 
> > > ... ditto ...
> > > 
> > > >         } else
> > > >                 fhp->fh_post_saved = true;
> > > >         if (v4)
> > > > @@ -754,3 +760,37 @@ enum fsid_source fsid_source(const struct
> > > > svc_fh *fhp)
> > > >                 return FSIDSOURCE_UUID;
> > > >         return FSIDSOURCE_DEV;
> > > >  }
> > > > +
> > > > +/*
> > > > + * We could use i_version alone as the change attribute. 
> > > > However, i_version
> > > > + * can go backwards on a regular file after an unclean
> > > > shutdown.  On its own
> > > > + * that doesn't necessarily cause a problem, but if i_version
> > > > goes backwards
> > > > + * and then is incremented again it could reuse a value that was
> > > > previously
> > > > + * used before boot, and a client who queried the two values
> > > > might incorrectly
> > > > + * assume nothing changed.
> > > > + *
> > > > + * By using both ctime and the i_version counter we guarantee
> > > > that as long as
> > > > + * time doesn't go backwards we never reuse an old value. If the
> > > > filesystem
> > > > + * advertises STATX_ATTR_VERSION_MONOTONIC, then this mitigation
> > > > is not needed.
> > > > + *
> > > > + * We only need to do this for regular files as well. For
> > > > directories, we
> > > > + * assume that the new change attr is always logged to stable
> > > > storage in some
> > > > + * fashion before the results can be seen.
> > > > + */
> > > > +u64 nfsd4_change_attribute(struct kstat *stat, struct inode
> > > > *inode)
> > > > +{
> > > > +       u64 chattr;
> > > > +
> > > > +       if (stat->result_mask & STATX_VERSION) {
> > > > +               chattr = stat->version;
> > > > +
> > > > +               if (S_ISREG(inode->i_mode) &&
> > > > +                   !(stat->attributes &
> > > > STATX_ATTR_VERSION_MONOTONIC)) {
> > > 
> > > I would really rather that the fs got to make this decision.
> > > If it can guarantee that the i_version is monotonic even over a
> > > crash
> > > (which is probably can for directory, and might need changes to do
> > > for
> > > files) then it sets STATX_ATTR_VERSION_MONOTONIC and nfsd trusts it
> > > completely.
> > > If it cannot, then it doesn't set the flag.
> > > i.e. the S_ISREG() test should be in the filesystem, not in nfsd.
> > > 
> > 
> > This sounds reasonable, but for one thing.
> > 
> > From RFC 7862:
> > 
> >    While Section 5.4 of [RFC5661] discusses
> >    per-file system attributes, it is expected that the value of
> >    change_attr_type will not depend on the value of "homogeneous" and
> >    will only change in the event of a migration.
> > 
> > The change_attr_type4 must be the same for all filehandles under a
> > particular filesystem.
> > 
> > If we do what you suggest though, then it's easily possible for the
> > fs
> > to set STATX_ATTR_VERSION_MONOTONIC on directories but not files. If
> > we
> > later want to allow nfsd to advertise a change_attr_type4, we won't
> > be
> > able to rely on the STATX_ATTR_VERSION_MONOTONIC to tell us how to
> > fill
> > that out.
> 
> That will break clients. So no, that's not acceptable.
> 

Yeah. This is why I mentioned that this flag would have been better
advertised via fsinfo(), had that been a thing.

One option is to just document that an fs must advertise the same flag
value for all inodes.

Alternately, we could allow the fs to set the STATX_ATTR_* flag with
per-inode granularity, and for nfsd, just add a new change_attr_type()
op to export_operations. Most filesystems would just have that return a
hardcoded value, but an nfs reexport could just pass through whatever
value it got from the server.
NeilBrown Oct. 5, 2022, 9:14 p.m. UTC | #8
On Wed, 05 Oct 2022, Jeff Layton wrote:
> On Tue, 2022-10-04 at 10:39 +1100, NeilBrown wrote:
> > On Fri, 30 Sep 2022, Jeff Layton wrote:
> > > Now that we can call into vfs_getattr to get the i_version field, use
> > > that facility to fetch it instead of doing it in nfsd4_change_attribute.
> > > 
> > > Neil also pointed out recently that IS_I_VERSION directory operations
> > > are always logged, and so we only need to mitigate the rollback problem
> > > on regular files. Also, we don't need to factor in the ctime when
> > > reexporting NFS or Ceph.
> > > 
> > > Set the STATX_VERSION (and BTIME) bits in the request when we're dealing
> > > with a v4 request. Then, instead of looking at IS_I_VERSION when
> > > generating the change attr, look at the result mask and only use it if
> > > STATX_VERSION is set. With this change, we can drop the fetch_iversion
> > > export operation as well.
> > > 
> > > Move nfsd4_change_attribute into nfsfh.c, and change it to only factor
> > > in the ctime if it's a regular file and the fs doesn't advertise
> > > STATX_ATTR_VERSION_MONOTONIC.
> > > 
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/nfs/export.c          |  7 -------
> > >  fs/nfsd/nfs4xdr.c        |  4 +++-
> > >  fs/nfsd/nfsfh.c          | 40 ++++++++++++++++++++++++++++++++++++++++
> > >  fs/nfsd/nfsfh.h          | 29 +----------------------------
> > >  fs/nfsd/vfs.h            |  7 ++++++-
> > >  include/linux/exportfs.h |  1 -
> > >  6 files changed, 50 insertions(+), 38 deletions(-)
> > > 
> > > diff --git a/fs/nfs/export.c b/fs/nfs/export.c
> > > index 01596f2d0a1e..1a9d5aa51dfb 100644
> > > --- a/fs/nfs/export.c
> > > +++ b/fs/nfs/export.c
> > > @@ -145,17 +145,10 @@ nfs_get_parent(struct dentry *dentry)
> > >  	return parent;
> > >  }
> > >  
> > > -static u64 nfs_fetch_iversion(struct inode *inode)
> > > -{
> > > -	nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
> > > -	return inode_peek_iversion_raw(inode);
> > > -}
> > > -
> > >  const struct export_operations nfs_export_ops = {
> > >  	.encode_fh = nfs_encode_fh,
> > >  	.fh_to_dentry = nfs_fh_to_dentry,
> > >  	.get_parent = nfs_get_parent,
> > > -	.fetch_iversion = nfs_fetch_iversion,
> > >  	.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
> > >  		EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
> > >  		EXPORT_OP_NOATOMIC_ATTR,
> > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > index 1e9690a061ec..779c009314c6 100644
> > > --- a/fs/nfsd/nfs4xdr.c
> > > +++ b/fs/nfsd/nfs4xdr.c
> > > @@ -2869,7 +2869,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
> > >  			goto out;
> > >  	}
> > >  
> > > -	err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
> > > +	err = vfs_getattr(&path, &stat,
> > > +			  STATX_BASIC_STATS | STATX_BTIME | STATX_VERSION,
> > > +			  AT_STATX_SYNC_AS_STAT);
> > >  	if (err)
> > >  		goto out_nfserr;
> > >  	if (!(stat.result_mask & STATX_BTIME))
> > > diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> > > index a5b71526cee0..9168bc657378 100644
> > > --- a/fs/nfsd/nfsfh.c
> > > +++ b/fs/nfsd/nfsfh.c
> > > @@ -634,6 +634,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
> > >  		stat.mtime = inode->i_mtime;
> > >  		stat.ctime = inode->i_ctime;
> > >  		stat.size  = inode->i_size;
> > > +		if (v4 && IS_I_VERSION(inode)) {
> > > +			stat.version = inode_query_iversion(inode);
> > > +			stat.result_mask |= STATX_VERSION;
> > > +		}
> > 
> > This is increasingly ugly.  I wonder if it is justified at all...
> > 
> 
> I'm fine with dropping that. So if the getattrs fail, we should just not
> offer up pre/post attrs?
> 
> > >  	}
> > >  	if (v4)
> > >  		fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
> > > @@ -665,6 +669,8 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
> > >  	if (err) {
> > >  		fhp->fh_post_saved = false;
> > >  		fhp->fh_post_attr.ctime = inode->i_ctime;
> > > +		if (v4 && IS_I_VERSION(inode))
> > > +			fhp->fh_post_attr.version = inode_query_iversion(inode);
> > 
> > ... ditto ...
> > 
> > >  	} else
> > >  		fhp->fh_post_saved = true;
> > >  	if (v4)
> > > @@ -754,3 +760,37 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
> > >  		return FSIDSOURCE_UUID;
> > >  	return FSIDSOURCE_DEV;
> > >  }
> > > +
> > > +/*
> > > + * We could use i_version alone as the change attribute.  However, i_version
> > > + * can go backwards on a regular file after an unclean shutdown.  On its own
> > > + * that doesn't necessarily cause a problem, but if i_version goes backwards
> > > + * and then is incremented again it could reuse a value that was previously
> > > + * used before boot, and a client who queried the two values might incorrectly
> > > + * assume nothing changed.
> > > + *
> > > + * By using both ctime and the i_version counter we guarantee that as long as
> > > + * time doesn't go backwards we never reuse an old value. If the filesystem
> > > + * advertises STATX_ATTR_VERSION_MONOTONIC, then this mitigation is not needed.
> > > + *
> > > + * We only need to do this for regular files as well. For directories, we
> > > + * assume that the new change attr is always logged to stable storage in some
> > > + * fashion before the results can be seen.
> > > + */
> > > +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
> > > +{
> > > +	u64 chattr;
> > > +
> > > +	if (stat->result_mask & STATX_VERSION) {
> > > +		chattr = stat->version;
> > > +
> > > +		if (S_ISREG(inode->i_mode) &&
> > > +		    !(stat->attributes & STATX_ATTR_VERSION_MONOTONIC)) {
> > 
> > I would really rather that the fs got to make this decision.
> > If it can guarantee that the i_version is monotonic even over a crash
> > (which is probably can for directory, and might need changes to do for
> > files) then it sets STATX_ATTR_VERSION_MONOTONIC and nfsd trusts it
> > completely.
> > If it cannot, then it doesn't set the flag.
> > i.e. the S_ISREG() test should be in the filesystem, not in nfsd.
> > 
> 
> This sounds reasonable, but for one thing.
> 
> From RFC 7862:
> 
>    While Section 5.4 of [RFC5661] discusses
>    per-file system attributes, it is expected that the value of
>    change_attr_type will not depend on the value of "homogeneous" and
>    will only change in the event of a migration.
> 
> The change_attr_type4 must be the same for all filehandles under a
> particular filesystem.
> 
> If we do what you suggest though, then it's easily possible for the fs
> to set STATX_ATTR_VERSION_MONOTONIC on directories but not files. If we
> later want to allow nfsd to advertise a change_attr_type4, we won't be
> able to rely on the STATX_ATTR_VERSION_MONOTONIC to tell us how to fill
> that out.
> 
> Maybe that's ok. I suppose we could add a new field to the export
> options that filesystems can set to advertise what sort of change attr
> they offer?
> 

There are 3 cases:
1/ a file/dir which advertises MONOTONIC is easy to handle.
2/ an IS_I_VERSION file/dir that does not advertise MONOTONIC will only fail
   to be MONOTONIC across unclean restart (correct?).  nfsd can
   compensate using an xattr on the root to count crashes, or just adding ctime.
3/ a non-IS_I_VERSION fs that does not advertise MONOTONIC cannot
   be compensated for by nfsd.

If we ever want nfsd to advertise MONOTONIC, then we must be able to
reject non-IS_I_VERSION filesystems that don't advertise MONOTONIC on
all files.

Maybe we need a global nfsd option which defaults to "monotoric" and
causes those files to be rejected, but can be set to "non-monotonic" and
then allows all files to be exported.

It would be nice to make it easy to run multiple nfsd instances each on a
different IP address.  Each can then have different options.  This could
also be used to reexport an NFS mount using unmodified filehandles.

Currently you need a network namespace to create a new nfsd.  I wonder
if that is a little too much of a barrier.  But maybe we could automate
the creation of working network namespaces for nfsd....

NeilBrown
Jeff Layton Oct. 6, 2022, 11:15 a.m. UTC | #9
On Thu, 2022-10-06 at 08:14 +1100, NeilBrown wrote:
> On Wed, 05 Oct 2022, Jeff Layton wrote:
> > On Tue, 2022-10-04 at 10:39 +1100, NeilBrown wrote:
> > > On Fri, 30 Sep 2022, Jeff Layton wrote:
> > > > Now that we can call into vfs_getattr to get the i_version field, use
> > > > that facility to fetch it instead of doing it in nfsd4_change_attribute.
> > > > 
> > > > Neil also pointed out recently that IS_I_VERSION directory operations
> > > > are always logged, and so we only need to mitigate the rollback problem
> > > > on regular files. Also, we don't need to factor in the ctime when
> > > > reexporting NFS or Ceph.
> > > > 
> > > > Set the STATX_VERSION (and BTIME) bits in the request when we're dealing
> > > > with a v4 request. Then, instead of looking at IS_I_VERSION when
> > > > generating the change attr, look at the result mask and only use it if
> > > > STATX_VERSION is set. With this change, we can drop the fetch_iversion
> > > > export operation as well.
> > > > 
> > > > Move nfsd4_change_attribute into nfsfh.c, and change it to only factor
> > > > in the ctime if it's a regular file and the fs doesn't advertise
> > > > STATX_ATTR_VERSION_MONOTONIC.
> > > > 
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > >  fs/nfs/export.c          |  7 -------
> > > >  fs/nfsd/nfs4xdr.c        |  4 +++-
> > > >  fs/nfsd/nfsfh.c          | 40 ++++++++++++++++++++++++++++++++++++++++
> > > >  fs/nfsd/nfsfh.h          | 29 +----------------------------
> > > >  fs/nfsd/vfs.h            |  7 ++++++-
> > > >  include/linux/exportfs.h |  1 -
> > > >  6 files changed, 50 insertions(+), 38 deletions(-)
> > > > 
> > > > diff --git a/fs/nfs/export.c b/fs/nfs/export.c
> > > > index 01596f2d0a1e..1a9d5aa51dfb 100644
> > > > --- a/fs/nfs/export.c
> > > > +++ b/fs/nfs/export.c
> > > > @@ -145,17 +145,10 @@ nfs_get_parent(struct dentry *dentry)
> > > >  	return parent;
> > > >  }
> > > >  
> > > > -static u64 nfs_fetch_iversion(struct inode *inode)
> > > > -{
> > > > -	nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
> > > > -	return inode_peek_iversion_raw(inode);
> > > > -}
> > > > -
> > > >  const struct export_operations nfs_export_ops = {
> > > >  	.encode_fh = nfs_encode_fh,
> > > >  	.fh_to_dentry = nfs_fh_to_dentry,
> > > >  	.get_parent = nfs_get_parent,
> > > > -	.fetch_iversion = nfs_fetch_iversion,
> > > >  	.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
> > > >  		EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
> > > >  		EXPORT_OP_NOATOMIC_ATTR,
> > > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > > index 1e9690a061ec..779c009314c6 100644
> > > > --- a/fs/nfsd/nfs4xdr.c
> > > > +++ b/fs/nfsd/nfs4xdr.c
> > > > @@ -2869,7 +2869,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
> > > >  			goto out;
> > > >  	}
> > > >  
> > > > -	err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
> > > > +	err = vfs_getattr(&path, &stat,
> > > > +			  STATX_BASIC_STATS | STATX_BTIME | STATX_VERSION,
> > > > +			  AT_STATX_SYNC_AS_STAT);
> > > >  	if (err)
> > > >  		goto out_nfserr;
> > > >  	if (!(stat.result_mask & STATX_BTIME))
> > > > diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> > > > index a5b71526cee0..9168bc657378 100644
> > > > --- a/fs/nfsd/nfsfh.c
> > > > +++ b/fs/nfsd/nfsfh.c
> > > > @@ -634,6 +634,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
> > > >  		stat.mtime = inode->i_mtime;
> > > >  		stat.ctime = inode->i_ctime;
> > > >  		stat.size  = inode->i_size;
> > > > +		if (v4 && IS_I_VERSION(inode)) {
> > > > +			stat.version = inode_query_iversion(inode);
> > > > +			stat.result_mask |= STATX_VERSION;
> > > > +		}
> > > 
> > > This is increasingly ugly.  I wonder if it is justified at all...
> > > 
> > 
> > I'm fine with dropping that. So if the getattrs fail, we should just not
> > offer up pre/post attrs?
> > 
> > > >  	}
> > > >  	if (v4)
> > > >  		fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
> > > > @@ -665,6 +669,8 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
> > > >  	if (err) {
> > > >  		fhp->fh_post_saved = false;
> > > >  		fhp->fh_post_attr.ctime = inode->i_ctime;
> > > > +		if (v4 && IS_I_VERSION(inode))
> > > > +			fhp->fh_post_attr.version = inode_query_iversion(inode);
> > > 
> > > ... ditto ...
> > > 
> > > >  	} else
> > > >  		fhp->fh_post_saved = true;
> > > >  	if (v4)
> > > > @@ -754,3 +760,37 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
> > > >  		return FSIDSOURCE_UUID;
> > > >  	return FSIDSOURCE_DEV;
> > > >  }
> > > > +
> > > > +/*
> > > > + * We could use i_version alone as the change attribute.  However, i_version
> > > > + * can go backwards on a regular file after an unclean shutdown.  On its own
> > > > + * that doesn't necessarily cause a problem, but if i_version goes backwards
> > > > + * and then is incremented again it could reuse a value that was previously
> > > > + * used before boot, and a client who queried the two values might incorrectly
> > > > + * assume nothing changed.
> > > > + *
> > > > + * By using both ctime and the i_version counter we guarantee that as long as
> > > > + * time doesn't go backwards we never reuse an old value. If the filesystem
> > > > + * advertises STATX_ATTR_VERSION_MONOTONIC, then this mitigation is not needed.
> > > > + *
> > > > + * We only need to do this for regular files as well. For directories, we
> > > > + * assume that the new change attr is always logged to stable storage in some
> > > > + * fashion before the results can be seen.
> > > > + */
> > > > +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
> > > > +{
> > > > +	u64 chattr;
> > > > +
> > > > +	if (stat->result_mask & STATX_VERSION) {
> > > > +		chattr = stat->version;
> > > > +
> > > > +		if (S_ISREG(inode->i_mode) &&
> > > > +		    !(stat->attributes & STATX_ATTR_VERSION_MONOTONIC)) {
> > > 
> > > I would really rather that the fs got to make this decision.
> > > If it can guarantee that the i_version is monotonic even over a crash
> > > (which is probably can for directory, and might need changes to do for
> > > files) then it sets STATX_ATTR_VERSION_MONOTONIC and nfsd trusts it
> > > completely.
> > > If it cannot, then it doesn't set the flag.
> > > i.e. the S_ISREG() test should be in the filesystem, not in nfsd.
> > > 
> > 
> > This sounds reasonable, but for one thing.
> > 
> > From RFC 7862:
> > 
> >    While Section 5.4 of [RFC5661] discusses
> >    per-file system attributes, it is expected that the value of
> >    change_attr_type will not depend on the value of "homogeneous" and
> >    will only change in the event of a migration.
> > 
> > The change_attr_type4 must be the same for all filehandles under a
> > particular filesystem.
> > 
> > If we do what you suggest though, then it's easily possible for the fs
> > to set STATX_ATTR_VERSION_MONOTONIC on directories but not files. If we
> > later want to allow nfsd to advertise a change_attr_type4, we won't be
> > able to rely on the STATX_ATTR_VERSION_MONOTONIC to tell us how to fill
> > that out.
> > 
> > Maybe that's ok. I suppose we could add a new field to the export
> > options that filesystems can set to advertise what sort of change attr
> > they offer?
> > 
> 
> There are 3 cases:
> 1/ a file/dir which advertises MONOTONIC is easy to handle.
> 2/ an IS_I_VERSION file/dir that does not advertise MONOTONIC will only fail
>    to be MONOTONIC across unclean restart (correct?).  nfsd can
>    compensate using an xattr on the root to count crashes, or just adding ctime.
> 3/ a non-IS_I_VERSION fs that does not advertise MONOTONIC cannot
>    be compensated for by nfsd.
> 
> If we ever want nfsd to advertise MONOTONIC, then we must be able to
> reject non-IS_I_VERSION filesystems that don't advertise MONOTONIC on
> all files.
> 
> Maybe we need a global nfsd option which defaults to "monotoric" and
> causes those files to be rejected, but can be set to "non-monotonic" and
> then allows all files to be exported.
> 
> It would be nice to make it easy to run multiple nfsd instances each on a
> different IP address.  Each can then have different options.  This could
> also be used to reexport an NFS mount using unmodified filehandles.
> 
> Currently you need a network namespace to create a new nfsd.  I wonder
> if that is a little too much of a barrier.  But maybe we could automate
> the creation of working network namespaces for nfsd....
> 

My current thinking is to just allow the filesystem to set
STATX_ATTR_VERSION_MONOTONIC flag on a per-inode basis, and create a new
change_attr_type() export operation and leave it up to the filesystem to
fill that out appropriately.

I think that'll give us maximum flexibility, and would also allow NFS to
pass the change attr type from the server directly through to the client
when reexporting.
Dave Chinner Oct. 6, 2022, 9:17 p.m. UTC | #10
On Tue, Oct 04, 2022 at 10:39:09AM +1100, NeilBrown wrote:
> On Fri, 30 Sep 2022, Jeff Layton wrote:
> > Now that we can call into vfs_getattr to get the i_version field, use
> > that facility to fetch it instead of doing it in nfsd4_change_attribute.
> > 
> > Neil also pointed out recently that IS_I_VERSION directory operations
> > are always logged, and so we only need to mitigate the rollback problem
> > on regular files. Also, we don't need to factor in the ctime when
> > reexporting NFS or Ceph.
> > 
> > Set the STATX_VERSION (and BTIME) bits in the request when we're dealing
> > with a v4 request. Then, instead of looking at IS_I_VERSION when
> > generating the change attr, look at the result mask and only use it if
> > STATX_VERSION is set. With this change, we can drop the fetch_iversion
> > export operation as well.
> > 
> > Move nfsd4_change_attribute into nfsfh.c, and change it to only factor
> > in the ctime if it's a regular file and the fs doesn't advertise
> > STATX_ATTR_VERSION_MONOTONIC.
....
> > +
> > +/*
> > + * We could use i_version alone as the change attribute.  However, i_version
> > + * can go backwards on a regular file after an unclean shutdown.  On its own
> > + * that doesn't necessarily cause a problem, but if i_version goes backwards
> > + * and then is incremented again it could reuse a value that was previously
> > + * used before boot, and a client who queried the two values might incorrectly
> > + * assume nothing changed.
> > + *
> > + * By using both ctime and the i_version counter we guarantee that as long as
> > + * time doesn't go backwards we never reuse an old value. If the filesystem
> > + * advertises STATX_ATTR_VERSION_MONOTONIC, then this mitigation is not needed.
> > + *
> > + * We only need to do this for regular files as well. For directories, we
> > + * assume that the new change attr is always logged to stable storage in some
> > + * fashion before the results can be seen.
> > + */
> > +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
> > +{
> > +	u64 chattr;
> > +
> > +	if (stat->result_mask & STATX_VERSION) {
> > +		chattr = stat->version;
> > +
> > +		if (S_ISREG(inode->i_mode) &&
> > +		    !(stat->attributes & STATX_ATTR_VERSION_MONOTONIC)) {
> 
> I would really rather that the fs got to make this decision.
> If it can guarantee that the i_version is monotonic even over a crash
> (which is probably can for directory, and might need changes to do for
> files) then it sets STATX_ATTR_VERSION_MONOTONIC and nfsd trusts it
> completely.

If you want a internal-to-nfsd solution to this monotonic iversion
problem for file data writes, then all we need to do is take the NFS
server back to NFSv2 days where all server side data writes writes
were performed as O_DSYNC writes. The NFSv3 protocol modifications
for (unstable writes + COMMIT) to allow the NFS server to use
volatile caching is the source of all the i_version persistent
guarantees we are talking about here....

So if the NFS server uses O_DSYNC writes again, by the time the
WRITE response is sent to the client (which can now use
NFS_DATA_SYNC instead of NFS_UNSTABLE so the client can mark pages
clean immediately) both the file data and the inode metadata have
been persisted.

Further, if the nfsd runs ->commit_metadata after the write has
completed, the nfsd now has a guarantee that i_version it places in
the post-ops is both persistent and covers the data change that was
just made. Problem solved, yes?

Ideally we'd want writethrough IO operation with ->commit_metadata
then guaranteeing both completed data and metadata changes are
persistent (XFS already provides this guarantee) for efficiency,
but the point remains that STATX_ATTR_VERSION_MONOTONIC behaviour
can already be implemented by the NFS server application regardless
of when the underlying filesystem bumps i_version for data writes...

Yeah, I know writethrough can result in some workloads being a bit
slower, but NFS clients are already implemented to hide write
latency from the applications. Also, NFS servers are being built
from fast SSDs these days, so this mitigates the worst of the
latency issues that writethrough IO creates....

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index 01596f2d0a1e..1a9d5aa51dfb 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -145,17 +145,10 @@  nfs_get_parent(struct dentry *dentry)
 	return parent;
 }
 
-static u64 nfs_fetch_iversion(struct inode *inode)
-{
-	nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
-	return inode_peek_iversion_raw(inode);
-}
-
 const struct export_operations nfs_export_ops = {
 	.encode_fh = nfs_encode_fh,
 	.fh_to_dentry = nfs_fh_to_dentry,
 	.get_parent = nfs_get_parent,
-	.fetch_iversion = nfs_fetch_iversion,
 	.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
 		EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
 		EXPORT_OP_NOATOMIC_ATTR,
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1e9690a061ec..779c009314c6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2869,7 +2869,9 @@  nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 			goto out;
 	}
 
-	err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
+	err = vfs_getattr(&path, &stat,
+			  STATX_BASIC_STATS | STATX_BTIME | STATX_VERSION,
+			  AT_STATX_SYNC_AS_STAT);
 	if (err)
 		goto out_nfserr;
 	if (!(stat.result_mask & STATX_BTIME))
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index a5b71526cee0..9168bc657378 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -634,6 +634,10 @@  void fh_fill_pre_attrs(struct svc_fh *fhp)
 		stat.mtime = inode->i_mtime;
 		stat.ctime = inode->i_ctime;
 		stat.size  = inode->i_size;
+		if (v4 && IS_I_VERSION(inode)) {
+			stat.version = inode_query_iversion(inode);
+			stat.result_mask |= STATX_VERSION;
+		}
 	}
 	if (v4)
 		fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
@@ -665,6 +669,8 @@  void fh_fill_post_attrs(struct svc_fh *fhp)
 	if (err) {
 		fhp->fh_post_saved = false;
 		fhp->fh_post_attr.ctime = inode->i_ctime;
+		if (v4 && IS_I_VERSION(inode))
+			fhp->fh_post_attr.version = inode_query_iversion(inode);
 	} else
 		fhp->fh_post_saved = true;
 	if (v4)
@@ -754,3 +760,37 @@  enum fsid_source fsid_source(const struct svc_fh *fhp)
 		return FSIDSOURCE_UUID;
 	return FSIDSOURCE_DEV;
 }
+
+/*
+ * We could use i_version alone as the change attribute.  However, i_version
+ * can go backwards on a regular file after an unclean shutdown.  On its own
+ * that doesn't necessarily cause a problem, but if i_version goes backwards
+ * and then is incremented again it could reuse a value that was previously
+ * used before boot, and a client who queried the two values might incorrectly
+ * assume nothing changed.
+ *
+ * By using both ctime and the i_version counter we guarantee that as long as
+ * time doesn't go backwards we never reuse an old value. If the filesystem
+ * advertises STATX_ATTR_VERSION_MONOTONIC, then this mitigation is not needed.
+ *
+ * We only need to do this for regular files as well. For directories, we
+ * assume that the new change attr is always logged to stable storage in some
+ * fashion before the results can be seen.
+ */
+u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
+{
+	u64 chattr;
+
+	if (stat->result_mask & STATX_VERSION) {
+		chattr = stat->version;
+
+		if (S_ISREG(inode->i_mode) &&
+		    !(stat->attributes & STATX_ATTR_VERSION_MONOTONIC)) {
+			chattr += (u64)stat->ctime.tv_sec << 30;
+			chattr += stat->ctime.tv_nsec;
+		}
+	} else {
+		chattr = time_to_chattr(&stat->ctime);
+	}
+	return chattr;
+}
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index c3ae6414fc5c..4c223a7a91d4 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -291,34 +291,7 @@  static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
 	fhp->fh_pre_saved = false;
 }
 
-/*
- * We could use i_version alone as the change attribute.  However,
- * i_version can go backwards after a reboot.  On its own that doesn't
- * necessarily cause a problem, but if i_version goes backwards and then
- * is incremented again it could reuse a value that was previously used
- * before boot, and a client who queried the two values might
- * incorrectly assume nothing changed.
- *
- * By using both ctime and the i_version counter we guarantee that as
- * long as time doesn't go backwards we never reuse an old value.
- */
-static inline u64 nfsd4_change_attribute(struct kstat *stat,
-					 struct inode *inode)
-{
-	if (inode->i_sb->s_export_op->fetch_iversion)
-		return inode->i_sb->s_export_op->fetch_iversion(inode);
-	else if (IS_I_VERSION(inode)) {
-		u64 chattr;
-
-		chattr =  stat->ctime.tv_sec;
-		chattr <<= 30;
-		chattr += stat->ctime.tv_nsec;
-		chattr += inode_query_iversion(inode);
-		return chattr;
-	} else
-		return time_to_chattr(&stat->ctime);
-}
-
+u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
 extern void fh_fill_pre_attrs(struct svc_fh *fhp);
 extern void fh_fill_post_attrs(struct svc_fh *fhp);
 extern void fh_fill_both_attrs(struct svc_fh *fhp);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index c95cd414b4bb..a905f59481ee 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -168,9 +168,14 @@  static inline void fh_drop_write(struct svc_fh *fh)
 
 static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
 {
+	u32 request_mask = STATX_BASIC_STATS;
 	struct path p = {.mnt = fh->fh_export->ex_path.mnt,
 			 .dentry = fh->fh_dentry};
-	return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS,
+
+	if (fh->fh_maxsize == NFS4_FHSIZE)
+		request_mask |= (STATX_BTIME | STATX_VERSION);
+
+	return nfserrno(vfs_getattr(&p, stat, request_mask,
 				    AT_STATX_SYNC_AS_STAT));
 }
 
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index fe848901fcc3..9f4d4bcbf251 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -213,7 +213,6 @@  struct export_operations {
 			  bool write, u32 *device_generation);
 	int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
 			     int nr_iomaps, struct iattr *iattr);
-	u64 (*fetch_iversion)(struct inode *);
 #define	EXPORT_OP_NOWCC			(0x1) /* don't collect v3 wcc data */
 #define	EXPORT_OP_NOSUBTREECHK		(0x2) /* no subtree checking */
 #define	EXPORT_OP_CLOSE_BEFORE_UNLINK	(0x4) /* close files before unlink */