diff mbox

[RFC,v1,00/30] fs: inode->i_version rework and optimization

Message ID 20170511185942.GD25434@fieldses.org (mailing list archive)
State Not Applicable
Headers show

Commit Message

J. Bruce Fields May 11, 2017, 6:59 p.m. UTC
On Wed, Apr 05, 2017 at 02:14:09PM -0400, J. Bruce Fields wrote:
> On Wed, Apr 05, 2017 at 10:05:51AM +0200, Jan Kara wrote:
> > 1) Keep i_version as is, make clients also check for i_ctime.
> 
> That would be a protocol revision, which we'd definitely rather avoid.
> 
> But can't we accomplish the same by using something like
> 
> 	ctime * (some constant) + i_version
> 
> ?
> 
> >    Pro: No on-disk format changes.
> >    Cons: After a crash, i_version can go backwards (but when file changes
> >    i_version, i_ctime pair should be still different) or not, data can be
> >    old or not.
> 
> This is probably good enough for NFS purposes: typically on an NFS
> filesystem, results of a read in the face of a concurrent write open are
> undefined.  And writers sync before close.
> 
> So after a crash with a dirty inode, we're in a situation where an NFS
> client still needs to resend some writes, sync, and close.  I'm OK with
> things being inconsistent during this window.
> 
> I do expect things to return to normal once that client has resent its
> writes--hence the worry about actually reusing old values after boot
> (such as if i_version regresses on boot and then increments back to the
> same value after further writes).  Factoring in ctime fixes that.

So for now I'm thinking of just doing something like the following.

Only nfsd needs it for now, but it could be moved to a vfs helper for
statx, or for individual filesystems that want to do something
different.  (The NFSv4 client will want to use the server's change
attribute instead, I think.  And other filesystems might want to try
something more ambitious like Neil's proposal.)

--b.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

NeilBrown May 11, 2017, 10:22 p.m. UTC | #1
On Thu, May 11 2017, J. Bruce Fields wrote:

> On Wed, Apr 05, 2017 at 02:14:09PM -0400, J. Bruce Fields wrote:
>> On Wed, Apr 05, 2017 at 10:05:51AM +0200, Jan Kara wrote:
>> > 1) Keep i_version as is, make clients also check for i_ctime.
>> 
>> That would be a protocol revision, which we'd definitely rather avoid.
>> 
>> But can't we accomplish the same by using something like
>> 
>> 	ctime * (some constant) + i_version
>> 
>> ?
>> 
>> >    Pro: No on-disk format changes.
>> >    Cons: After a crash, i_version can go backwards (but when file changes
>> >    i_version, i_ctime pair should be still different) or not, data can be
>> >    old or not.
>> 
>> This is probably good enough for NFS purposes: typically on an NFS
>> filesystem, results of a read in the face of a concurrent write open are
>> undefined.  And writers sync before close.
>> 
>> So after a crash with a dirty inode, we're in a situation where an NFS
>> client still needs to resend some writes, sync, and close.  I'm OK with
>> things being inconsistent during this window.
>> 
>> I do expect things to return to normal once that client has resent its
>> writes--hence the worry about actually reusing old values after boot
>> (such as if i_version regresses on boot and then increments back to the
>> same value after further writes).  Factoring in ctime fixes that.
>
> So for now I'm thinking of just doing something like the following.
>
> Only nfsd needs it for now, but it could be moved to a vfs helper for
> statx, or for individual filesystems that want to do something
> different.  (The NFSv4 client will want to use the server's change
> attribute instead, I think.  And other filesystems might want to try
> something more ambitious like Neil's proposal.)
>
> --b.
>
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index 12feac6ee2fd..9636c9a60aba 100644
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index f84fe6bf9aee..14f09f1ef605 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -240,6 +240,16 @@ fh_clear_wcc(struct svc_fh *fhp)
>  	fhp->fh_pre_saved = false;
>  }
>  
> +static inline u64 nfsd4_change_attribute(struct inode *inode)
> +{
> +	u64 chattr;
> +
> +	chattr = inode->i_ctime.tv_sec << 30;
> +	chattr += inode->i_ctime.tv_nsec;
> +	chattr += inode->i_version;
> +	return chattr;

So if I chmod a file, all clients will need to flush the content from their cache?
Maybe they already do?  Maybe it is a boring corner case?

> +}
> +
>  /*
>   * Fill in the pre_op attr for the wcc data
>   */
> @@ -253,7 +263,7 @@ fill_pre_wcc(struct svc_fh *fhp)
>  		fhp->fh_pre_mtime = inode->i_mtime;
>  		fhp->fh_pre_ctime = inode->i_ctime;
>  		fhp->fh_pre_size  = inode->i_size;
> -		fhp->fh_pre_change = inode->i_version;
> +		fhp->fh_pre_change = nfsd4_change_attribute(inode);
>  		fhp->fh_pre_saved = true;
>  	}
>  }
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -260,7 +260,7 @@ void fill_post_wcc(struct svc_fh *fhp)
>  		printk("nfsd: inode locked twice during operation.\n");
>  
>  	err = fh_getattr(fhp, &fhp->fh_post_attr);
> -	fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
> +	fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
>  	if (err) {
>  		fhp->fh_post_saved = false;
>  		/* Grab the ctime anyway - set_change_info might use it */
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 26780d53a6f9..a09532d4a383 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -1973,7 +1973,7 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
>  		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
>  		*p++ = 0;
>  	} else if (IS_I_VERSION(inode)) {
> -		p = xdr_encode_hyper(p, inode->i_version);
> +		p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
>  	} else {
>  		*p++ = cpu_to_be32(stat->ctime.tv_sec);
>  		*p++ = cpu_to_be32(stat->ctime.tv_nsec);

It is *really* confusing to find that fh_post_change is only set in nfs3
code, and only used in nfs4 code.
It is probably time to get a 'version' field in 'struct kstat'.
That would allow this code to get a little cleaner.

(to me, this exercise is just a reminder that the NFSv4 change attribute
is poorly designed ... so it just makes me grumpy).

NeilBrown


> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara May 12, 2017, 8:27 a.m. UTC | #2
On Thu 11-05-17 14:59:43, J. Bruce Fields wrote:
> On Wed, Apr 05, 2017 at 02:14:09PM -0400, J. Bruce Fields wrote:
> > On Wed, Apr 05, 2017 at 10:05:51AM +0200, Jan Kara wrote:
> > > 1) Keep i_version as is, make clients also check for i_ctime.
> > 
> > That would be a protocol revision, which we'd definitely rather avoid.
> > 
> > But can't we accomplish the same by using something like
> > 
> > 	ctime * (some constant) + i_version
> > 
> > ?
> > 
> > >    Pro: No on-disk format changes.
> > >    Cons: After a crash, i_version can go backwards (but when file changes
> > >    i_version, i_ctime pair should be still different) or not, data can be
> > >    old or not.
> > 
> > This is probably good enough for NFS purposes: typically on an NFS
> > filesystem, results of a read in the face of a concurrent write open are
> > undefined.  And writers sync before close.
> > 
> > So after a crash with a dirty inode, we're in a situation where an NFS
> > client still needs to resend some writes, sync, and close.  I'm OK with
> > things being inconsistent during this window.
> > 
> > I do expect things to return to normal once that client has resent its
> > writes--hence the worry about actually reusing old values after boot
> > (such as if i_version regresses on boot and then increments back to the
> > same value after further writes).  Factoring in ctime fixes that.
> 
> So for now I'm thinking of just doing something like the following.
> 
> Only nfsd needs it for now, but it could be moved to a vfs helper for
> statx, or for individual filesystems that want to do something
> different.  (The NFSv4 client will want to use the server's change
> attribute instead, I think.  And other filesystems might want to try
> something more ambitious like Neil's proposal.)
> 
> --b.
> 
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index 12feac6ee2fd..9636c9a60aba 100644
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index f84fe6bf9aee..14f09f1ef605 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -240,6 +240,16 @@ fh_clear_wcc(struct svc_fh *fhp)
>  	fhp->fh_pre_saved = false;
>  }
>  
> +static inline u64 nfsd4_change_attribute(struct inode *inode)
> +{
> +	u64 chattr;
> +
> +	chattr = inode->i_ctime.tv_sec << 30;

Won't this overflow on 32-bit archs? tv_sec seems to be defined as long?
Probably you need explicit (u64) cast... Otherwise I'm fine with this.

								Honza

> +	chattr += inode->i_ctime.tv_nsec;
> +	chattr += inode->i_version;
> +	return chattr;
> +}
> +
>  /*
>   * Fill in the pre_op attr for the wcc data
>   */
> @@ -253,7 +263,7 @@ fill_pre_wcc(struct svc_fh *fhp)
>  		fhp->fh_pre_mtime = inode->i_mtime;
>  		fhp->fh_pre_ctime = inode->i_ctime;
>  		fhp->fh_pre_size  = inode->i_size;
> -		fhp->fh_pre_change = inode->i_version;
> +		fhp->fh_pre_change = nfsd4_change_attribute(inode);
>  		fhp->fh_pre_saved = true;
>  	}
>  }
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -260,7 +260,7 @@ void fill_post_wcc(struct svc_fh *fhp)
>  		printk("nfsd: inode locked twice during operation.\n");
>  
>  	err = fh_getattr(fhp, &fhp->fh_post_attr);
> -	fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
> +	fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
>  	if (err) {
>  		fhp->fh_post_saved = false;
>  		/* Grab the ctime anyway - set_change_info might use it */
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 26780d53a6f9..a09532d4a383 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -1973,7 +1973,7 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
>  		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
>  		*p++ = 0;
>  	} else if (IS_I_VERSION(inode)) {
> -		p = xdr_encode_hyper(p, inode->i_version);
> +		p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
>  	} else {
>  		*p++ = cpu_to_be32(stat->ctime.tv_sec);
>  		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
Jeff Layton May 12, 2017, 11:01 a.m. UTC | #3
On Thu, 2017-05-11 at 14:59 -0400, J. Bruce Fields wrote:
> On Wed, Apr 05, 2017 at 02:14:09PM -0400, J. Bruce Fields wrote:
> > On Wed, Apr 05, 2017 at 10:05:51AM +0200, Jan Kara wrote:
> > > 1) Keep i_version as is, make clients also check for i_ctime.
> > 
> > That would be a protocol revision, which we'd definitely rather avoid.
> > 
> > But can't we accomplish the same by using something like
> > 
> > 	ctime * (some constant) + i_version
> > 
> > ?
> > 
> > >    Pro: No on-disk format changes.
> > >    Cons: After a crash, i_version can go backwards (but when file changes
> > >    i_version, i_ctime pair should be still different) or not, data can be
> > >    old or not.
> > 
> > This is probably good enough for NFS purposes: typically on an NFS
> > filesystem, results of a read in the face of a concurrent write open are
> > undefined.  And writers sync before close.
> > 
> > So after a crash with a dirty inode, we're in a situation where an NFS
> > client still needs to resend some writes, sync, and close.  I'm OK with
> > things being inconsistent during this window.
> > 
> > I do expect things to return to normal once that client has resent its
> > writes--hence the worry about actually reusing old values after boot
> > (such as if i_version regresses on boot and then increments back to the
> > same value after further writes).  Factoring in ctime fixes that.
> 
> So for now I'm thinking of just doing something like the following.
> 
> Only nfsd needs it for now, but it could be moved to a vfs helper for
> statx, or for individual filesystems that want to do something
> different.  (The NFSv4 client will want to use the server's change
> attribute instead, I think.  And other filesystems might want to try
> something more ambitious like Neil's proposal.)
> 
> --b.
> 
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index 12feac6ee2fd..9636c9a60aba 100644
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index f84fe6bf9aee..14f09f1ef605 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -240,6 +240,16 @@ fh_clear_wcc(struct svc_fh *fhp)
>  	fhp->fh_pre_saved = false;
>  }
>  
> +static inline u64 nfsd4_change_attribute(struct inode *inode)
> +{
> +	u64 chattr;
> +
> +	chattr = inode->i_ctime.tv_sec << 30;
> +	chattr += inode->i_ctime.tv_nsec;
> +	chattr += inode->i_version;
> +	return chattr;
> +}
> +
>  /*
>   * Fill in the pre_op attr for the wcc data
>   */
> @@ -253,7 +263,7 @@ fill_pre_wcc(struct svc_fh *fhp)
>  		fhp->fh_pre_mtime = inode->i_mtime;
>  		fhp->fh_pre_ctime = inode->i_ctime;
>  		fhp->fh_pre_size  = inode->i_size;
> -		fhp->fh_pre_change = inode->i_version;
> +		fhp->fh_pre_change = nfsd4_change_attribute(inode);
>  		fhp->fh_pre_saved = true;
>  	}
>  }
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -260,7 +260,7 @@ void fill_post_wcc(struct svc_fh *fhp)
>  		printk("nfsd: inode locked twice during operation.\n");
>  
>  	err = fh_getattr(fhp, &fhp->fh_post_attr);
> -	fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
> +	fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
>  	if (err) {
>  		fhp->fh_post_saved = false;
>  		/* Grab the ctime anyway - set_change_info might use it */
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 26780d53a6f9..a09532d4a383 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -1973,7 +1973,7 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
>  		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
>  		*p++ = 0;
>  	} else if (IS_I_VERSION(inode)) {
> -		p = xdr_encode_hyper(p, inode->i_version);
> +		p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
>  	} else {
>  		*p++ = cpu_to_be32(stat->ctime.tv_sec);
>  		*p++ = cpu_to_be32(stat->ctime.tv_nsec);


Sorry I've been MIA on this discussion. I've had a very busy spring...

This looks reasonable to me (modulo Jan's comment about casting tv_sec
to u64).

To be clear, I think this is mostly orthogonal to the changes that I was
originally proposing, right? I think we can still benefit from only
bumping and storing i_version values after they've been queried.
J. Bruce Fields May 12, 2017, 3:56 p.m. UTC | #4
On Fri, May 12, 2017 at 10:27:54AM +0200, Jan Kara wrote:
> On Thu 11-05-17 14:59:43, J. Bruce Fields wrote:
> > On Wed, Apr 05, 2017 at 02:14:09PM -0400, J. Bruce Fields wrote:
> > > On Wed, Apr 05, 2017 at 10:05:51AM +0200, Jan Kara wrote:
> > > > 1) Keep i_version as is, make clients also check for i_ctime.
> > > 
> > > That would be a protocol revision, which we'd definitely rather avoid.
> > > 
> > > But can't we accomplish the same by using something like
> > > 
> > > 	ctime * (some constant) + i_version
> > > 
> > > ?
> > > 
> > > >    Pro: No on-disk format changes.
> > > >    Cons: After a crash, i_version can go backwards (but when file changes
> > > >    i_version, i_ctime pair should be still different) or not, data can be
> > > >    old or not.
> > > 
> > > This is probably good enough for NFS purposes: typically on an NFS
> > > filesystem, results of a read in the face of a concurrent write open are
> > > undefined.  And writers sync before close.
> > > 
> > > So after a crash with a dirty inode, we're in a situation where an NFS
> > > client still needs to resend some writes, sync, and close.  I'm OK with
> > > things being inconsistent during this window.
> > > 
> > > I do expect things to return to normal once that client has resent its
> > > writes--hence the worry about actually reusing old values after boot
> > > (such as if i_version regresses on boot and then increments back to the
> > > same value after further writes).  Factoring in ctime fixes that.
> > 
> > So for now I'm thinking of just doing something like the following.
> > 
> > Only nfsd needs it for now, but it could be moved to a vfs helper for
> > statx, or for individual filesystems that want to do something
> > different.  (The NFSv4 client will want to use the server's change
> > attribute instead, I think.  And other filesystems might want to try
> > something more ambitious like Neil's proposal.)
> > 
> > --b.
> > 
> > diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> > index 12feac6ee2fd..9636c9a60aba 100644
> > diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> > index f84fe6bf9aee..14f09f1ef605 100644
> > --- a/fs/nfsd/nfsfh.h
> > +++ b/fs/nfsd/nfsfh.h
> > @@ -240,6 +240,16 @@ fh_clear_wcc(struct svc_fh *fhp)
> >  	fhp->fh_pre_saved = false;
> >  }
> >  
> > +static inline u64 nfsd4_change_attribute(struct inode *inode)
> > +{
> > +	u64 chattr;
> > +
> > +	chattr = inode->i_ctime.tv_sec << 30;
> 
> Won't this overflow on 32-bit archs? tv_sec seems to be defined as long?
> Probably you need explicit (u64) cast... Otherwise I'm fine with this.

Whoops, yes.  Or just assign to chattr as a separate step.  I'll fix
that.

--b.

> > +	chattr += inode->i_ctime.tv_nsec;
> > +	chattr += inode->i_version;
> > +	return chattr;
> > +}
> > +
> >  /*
> >   * Fill in the pre_op attr for the wcc data
> >   */
> > @@ -253,7 +263,7 @@ fill_pre_wcc(struct svc_fh *fhp)
> >  		fhp->fh_pre_mtime = inode->i_mtime;
> >  		fhp->fh_pre_ctime = inode->i_ctime;
> >  		fhp->fh_pre_size  = inode->i_size;
> > -		fhp->fh_pre_change = inode->i_version;
> > +		fhp->fh_pre_change = nfsd4_change_attribute(inode);
> >  		fhp->fh_pre_saved = true;
> >  	}
> >  }
> > --- a/fs/nfsd/nfs3xdr.c
> > +++ b/fs/nfsd/nfs3xdr.c
> > @@ -260,7 +260,7 @@ void fill_post_wcc(struct svc_fh *fhp)
> >  		printk("nfsd: inode locked twice during operation.\n");
> >  
> >  	err = fh_getattr(fhp, &fhp->fh_post_attr);
> > -	fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
> > +	fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
> >  	if (err) {
> >  		fhp->fh_post_saved = false;
> >  		/* Grab the ctime anyway - set_change_info might use it */
> > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > index 26780d53a6f9..a09532d4a383 100644
> > --- a/fs/nfsd/nfs4xdr.c
> > +++ b/fs/nfsd/nfs4xdr.c
> > @@ -1973,7 +1973,7 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
> >  		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
> >  		*p++ = 0;
> >  	} else if (IS_I_VERSION(inode)) {
> > -		p = xdr_encode_hyper(p, inode->i_version);
> > +		p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
> >  	} else {
> >  		*p++ = cpu_to_be32(stat->ctime.tv_sec);
> >  		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields May 12, 2017, 3:57 p.m. UTC | #5
On Fri, May 12, 2017 at 07:01:25AM -0400, Jeff Layton wrote:
> This looks reasonable to me (modulo Jan's comment about casting tv_sec
> to u64).
> 
> To be clear, I think this is mostly orthogonal to the changes that I was
> originally proposing, right? I think we can still benefit from only
> bumping and storing i_version values after they've been queried.

Definitely, yes.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields May 12, 2017, 4:21 p.m. UTC | #6
On Fri, May 12, 2017 at 08:22:23AM +1000, NeilBrown wrote:
> On Thu, May 11 2017, J. Bruce Fields wrote:
> > +static inline u64 nfsd4_change_attribute(struct inode *inode)
> > +{
> > +	u64 chattr;
> > +
> > +	chattr = inode->i_ctime.tv_sec << 30;
> > +	chattr += inode->i_ctime.tv_nsec;
> > +	chattr += inode->i_version;
> > +	return chattr;
> 
> So if I chmod a file, all clients will need to flush the content from their cache?
> Maybe they already do?  Maybe it is a boring corner case?

Yeah, that's the assumption, maybe it's wrong.  I can't recall
complaints about anyone bitten by that case.

> >  /*
> >   * Fill in the pre_op attr for the wcc data
> >   */
> > @@ -253,7 +263,7 @@ fill_pre_wcc(struct svc_fh *fhp)
> >  		fhp->fh_pre_mtime = inode->i_mtime;
> >  		fhp->fh_pre_ctime = inode->i_ctime;
> >  		fhp->fh_pre_size  = inode->i_size;
> > -		fhp->fh_pre_change = inode->i_version;
> > +		fhp->fh_pre_change = nfsd4_change_attribute(inode);
> >  		fhp->fh_pre_saved = true;
> >  	}
> >  }
> > --- a/fs/nfsd/nfs3xdr.c
> > +++ b/fs/nfsd/nfs3xdr.c
> > @@ -260,7 +260,7 @@ void fill_post_wcc(struct svc_fh *fhp)
> >  		printk("nfsd: inode locked twice during operation.\n");
> >  
> >  	err = fh_getattr(fhp, &fhp->fh_post_attr);
> > -	fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
> > +	fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
> >  	if (err) {
> >  		fhp->fh_post_saved = false;
> >  		/* Grab the ctime anyway - set_change_info might use it */
> > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > index 26780d53a6f9..a09532d4a383 100644
> > --- a/fs/nfsd/nfs4xdr.c
> > +++ b/fs/nfsd/nfs4xdr.c
> > @@ -1973,7 +1973,7 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
> >  		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
> >  		*p++ = 0;
> >  	} else if (IS_I_VERSION(inode)) {
> > -		p = xdr_encode_hyper(p, inode->i_version);
> > +		p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
> >  	} else {
> >  		*p++ = cpu_to_be32(stat->ctime.tv_sec);
> >  		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
> 
> It is *really* confusing to find that fh_post_change is only set in nfs3
> code, and only used in nfs4 code.

Yup.

> It is probably time to get a 'version' field in 'struct kstat'.

The pre/post_wcc code doesn't seem to be doing an explicit stat, I
wonder if that matters?

--b.

> That would allow this code to get a little cleaner.
> 
> (to me, this exercise is just a reminder that the NFSv4 change attribute
> is poorly designed ... so it just makes me grumpy).
> 
> NeilBrown
> 
> 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton Oct. 30, 2017, 1:21 p.m. UTC | #7
On Fri, 2017-05-12 at 12:21 -0400, J. Bruce Fields wrote:
> On Fri, May 12, 2017 at 08:22:23AM +1000, NeilBrown wrote:
> > On Thu, May 11 2017, J. Bruce Fields wrote:
> > > +static inline u64 nfsd4_change_attribute(struct inode *inode)
> > > +{
> > > +	u64 chattr;
> > > +
> > > +	chattr = inode->i_ctime.tv_sec << 30;
> > > +	chattr += inode->i_ctime.tv_nsec;
> > > +	chattr += inode->i_version;
> > > +	return chattr;
> > 
> > So if I chmod a file, all clients will need to flush the content from their cache?
> > Maybe they already do?  Maybe it is a boring corner case?
> 
> Yeah, that's the assumption, maybe it's wrong.  I can't recall
> complaints about anyone bitten by that case.
> 

I'm pretty sure that's required by the RFC. The change attribute changes
with both data and metadata changes, and there is no way to tell what
sort of change it was. You have to dump everything out of the cache when
it changes.

> > >  /*
> > >   * Fill in the pre_op attr for the wcc data
> > >   */
> > > @@ -253,7 +263,7 @@ fill_pre_wcc(struct svc_fh *fhp)
> > >  		fhp->fh_pre_mtime = inode->i_mtime;
> > >  		fhp->fh_pre_ctime = inode->i_ctime;
> > >  		fhp->fh_pre_size  = inode->i_size;
> > > -		fhp->fh_pre_change = inode->i_version;
> > > +		fhp->fh_pre_change = nfsd4_change_attribute(inode);
> > >  		fhp->fh_pre_saved = true;
> > >  	}
> > >  }
> > > --- a/fs/nfsd/nfs3xdr.c
> > > +++ b/fs/nfsd/nfs3xdr.c
> > > @@ -260,7 +260,7 @@ void fill_post_wcc(struct svc_fh *fhp)
> > >  		printk("nfsd: inode locked twice during operation.\n");
> > >  
> > >  	err = fh_getattr(fhp, &fhp->fh_post_attr);
> > > -	fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
> > > +	fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
> > >  	if (err) {
> > >  		fhp->fh_post_saved = false;
> > >  		/* Grab the ctime anyway - set_change_info might use it */
> > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > index 26780d53a6f9..a09532d4a383 100644
> > > --- a/fs/nfsd/nfs4xdr.c
> > > +++ b/fs/nfsd/nfs4xdr.c
> > > @@ -1973,7 +1973,7 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
> > >  		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
> > >  		*p++ = 0;
> > >  	} else if (IS_I_VERSION(inode)) {
> > > -		p = xdr_encode_hyper(p, inode->i_version);
> > > +		p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
> > >  	} else {
> > >  		*p++ = cpu_to_be32(stat->ctime.tv_sec);
> > >  		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
> > 
> > It is *really* confusing to find that fh_post_change is only set in nfs3
> > code, and only used in nfs4 code.
> 
> Yup.
> 
> > It is probably time to get a 'version' field in 'struct kstat'.
> 
> The pre/post_wcc code doesn't seem to be doing an explicit stat, I
> wonder if that matters?
> 

Probably not for now. We only use this for namespace altering operations
anyway (create, link, unlink, and rename).

The post code does do a fh_getattr. It's only the pre-op i_version that
comes out of the cache. Only btrfs, xfs, and ext4 have a real i_version
counter today, and they just scrape that info out of the in-core inode.
So while not completely atomic, you should see a difference in the
change_info4 during any of those operations.

FWIW, userland cephfs now supports a cluster-coherent change attribute,
though the kernel client still needs some work before we can implement
it there. Eventually we'll add that, and at that point we might need to
have nfsd do a getattr in the pre part as well.
diff mbox

Patch

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 12feac6ee2fd..9636c9a60aba 100644
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index f84fe6bf9aee..14f09f1ef605 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -240,6 +240,16 @@  fh_clear_wcc(struct svc_fh *fhp)
 	fhp->fh_pre_saved = false;
 }
 
+/* Mix ctime into i_version; tv_sec is cast so << 30 is done in 64 bits. */
+static inline u64 nfsd4_change_attribute(struct inode *inode)
+{
+	u64 chattr = (u64)inode->i_ctime.tv_sec << 30;
+
+	chattr += inode->i_ctime.tv_nsec;
+	chattr += inode->i_version;
+	return chattr;
+}
+
 /*
  * Fill in the pre_op attr for the wcc data
  */
@@ -253,7 +263,7 @@  fill_pre_wcc(struct svc_fh *fhp)
 		fhp->fh_pre_mtime = inode->i_mtime;
 		fhp->fh_pre_ctime = inode->i_ctime;
 		fhp->fh_pre_size  = inode->i_size;
-		fhp->fh_pre_change = inode->i_version;
+		fhp->fh_pre_change = nfsd4_change_attribute(inode);
 		fhp->fh_pre_saved = true;
 	}
 }
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -260,7 +260,7 @@  void fill_post_wcc(struct svc_fh *fhp)
 		printk("nfsd: inode locked twice during operation.\n");
 
 	err = fh_getattr(fhp, &fhp->fh_post_attr);
-	fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
+	fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
 	if (err) {
 		fhp->fh_post_saved = false;
 		/* Grab the ctime anyway - set_change_info might use it */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 26780d53a6f9..a09532d4a383 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1973,7 +1973,7 @@  static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
 		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
 		*p++ = 0;
 	} else if (IS_I_VERSION(inode)) {
-		p = xdr_encode_hyper(p, inode->i_version);
+		p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
 	} else {
 		*p++ = cpu_to_be32(stat->ctime.tv_sec);
 		*p++ = cpu_to_be32(stat->ctime.tv_nsec);