diff mbox series

[v4,3/7] NFSD: Add an NFSD_FILE_GC flag to enable nfsd_file garbage collection

Message ID 166612311828.1291.6808456808715954510.stgit@manet.1015granger.net (mailing list archive)
State New, archived
Headers show
Series A course adjustment, for sure | expand

Commit Message

Chuck Lever Oct. 18, 2022, 7:58 p.m. UTC
NFSv4 operations manage the lifetime of nfsd_file items they use by
means of NFSv4 OPEN and CLOSE. Hence there's no need for them to be
garbage collected.

Introduce a mechanism to enable garbage collection for nfsd_file
items used only by NFSv2/3 callers.

Note that the change in nfsd_file_put() ensures that both CLOSE and
DELEGRETURN will actually close out and free an nfsd_file on last
reference of a non-garbage-collected file.

Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=394
Suggested-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/filecache.c |   61 +++++++++++++++++++++++++++++++++++++++++++++------
 fs/nfsd/filecache.h |    3 +++
 fs/nfsd/nfs3proc.c  |    4 ++-
 fs/nfsd/trace.h     |    3 ++-
 fs/nfsd/vfs.c       |    4 ++-
 5 files changed, 63 insertions(+), 12 deletions(-)

Comments

Jeff Layton Oct. 19, 2022, 11:24 a.m. UTC | #1
On Tue, 2022-10-18 at 15:58 -0400, Chuck Lever wrote:
> NFSv4 operations manage the lifetime of nfsd_file items they use by
> means of NFSv4 OPEN and CLOSE. Hence there's no need for them to be
> garbage collected.
> 
> Introduce a mechanism to enable garbage collection for nfsd_file
> items used only by NFSv2/3 callers.
> 
> Note that the change in nfsd_file_put() ensures that both CLOSE and
> DELEGRETURN will actually close out and free an nfsd_file on last
> reference of a non-garbage-collected file.
> 
> Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=394
> Suggested-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> Tested-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  fs/nfsd/filecache.c |   61 +++++++++++++++++++++++++++++++++++++++++++++------
>  fs/nfsd/filecache.h |    3 +++
>  fs/nfsd/nfs3proc.c  |    4 ++-
>  fs/nfsd/trace.h     |    3 ++-
>  fs/nfsd/vfs.c       |    4 ++-
>  5 files changed, 63 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index b7aa523c2010..87fce5c95726 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -63,6 +63,7 @@ struct nfsd_file_lookup_key {
>  	struct net			*net;
>  	const struct cred		*cred;
>  	unsigned char			need;
> +	unsigned char			gc:1;

Is it worth it to mess around with bitfields here? There are exising
holes in this struct, so you're really not saving anything by using one
here.

Also, it seems like "need" is used as a bool anyway. Maybe both of those
fields should just be bools? It looks like changing that won't change
the size of the struct anyway. You might even be able to shrink it by
moving the "type" above "need".
 
>  	enum nfsd_file_lookup_type	type;
>  };
>  
> @@ -162,6 +163,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
>  			return 1;
>  		if (!nfsd_match_cred(nf->nf_cred, key->cred))
>  			return 1;
> +		if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
> +			return 1;
>  		if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
>  			return 1;
>  		break;
> @@ -297,6 +300,8 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
>  		nf->nf_flags = 0;
>  		__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
>  		__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
> +		if (key->gc)
> +			__set_bit(NFSD_FILE_GC, &nf->nf_flags);
>  		nf->nf_inode = key->inode;
>  		/* nf_ref is pre-incremented for hash table */
>  		refcount_set(&nf->nf_ref, 2);
> @@ -428,16 +433,27 @@ nfsd_file_put_noref(struct nfsd_file *nf)
>  	}
>  }
>  
> +static void
> +nfsd_file_unhash_and_put(struct nfsd_file *nf)
> +{
> +	if (nfsd_file_unhash(nf))
> +		nfsd_file_put_noref(nf);
> +}
> +
>  void
>  nfsd_file_put(struct nfsd_file *nf)
>  {
>  	might_sleep();
>  
> -	nfsd_file_lru_add(nf);
> +	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
> +		nfsd_file_lru_add(nf);
> +	else if (refcount_read(&nf->nf_ref) == 2)
> +		nfsd_file_unhash_and_put(nf);
> +
>  	if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
>  		nfsd_file_flush(nf);
>  		nfsd_file_put_noref(nf);
> -	} else if (nf->nf_file) {
> +	} else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1) {
>  		nfsd_file_put_noref(nf);
>  		nfsd_file_schedule_laundrette();
>  	} else
> @@ -1017,12 +1033,14 @@ nfsd_file_is_cached(struct inode *inode)
>  
>  static __be32
>  nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> -		     unsigned int may_flags, struct nfsd_file **pnf, bool open)
> +		     unsigned int may_flags, struct nfsd_file **pnf,
> +		     bool open, int want_gc)
>  {
>  	struct nfsd_file_lookup_key key = {
>  		.type	= NFSD_FILE_KEY_FULL,
>  		.need	= may_flags & NFSD_FILE_MAY_MASK,
>  		.net	= SVC_NET(rqstp),
> +		.gc	= want_gc,
>  	};
>  	bool open_retry = true;
>  	struct nfsd_file *nf;
> @@ -1117,14 +1135,35 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  	 * then unhash.
>  	 */
>  	if (status != nfs_ok || key.inode->i_nlink == 0)
> -		if (nfsd_file_unhash(nf))
> -			nfsd_file_put_noref(nf);
> +		nfsd_file_unhash_and_put(nf);
>  	clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
>  	smp_mb__after_atomic();
>  	wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
>  	goto out;
>  }
>  
> +/**
> + * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file
> + * @rqstp: the RPC transaction being executed
> + * @fhp: the NFS filehandle of the file to be opened
> + * @may_flags: NFSD_MAY_ settings for the file
> + * @pnf: OUT: new or found "struct nfsd_file" object
> + *
> + * The nfsd_file object returned by this API is reference-counted
> + * and garbage-collected. The object is retained for a few
> + * seconds after the final nfsd_file_put() in case the caller
> + * wants to re-use it.
> + *
> + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
> + * network byte order is returned.
> + */
> +__be32
> +nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
> +		     unsigned int may_flags, struct nfsd_file **pnf)
> +{
> +	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, 1);
> +}
> +
>  /**
>   * nfsd_file_acquire - Get a struct nfsd_file with an open file
>   * @rqstp: the RPC transaction being executed
> @@ -1132,6 +1171,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>   * @may_flags: NFSD_MAY_ settings for the file
>   * @pnf: OUT: new or found "struct nfsd_file" object
>   *
> + * The nfsd_file_object returned by this API is reference-counted
> + * but not garbage-collected. The object is unhashed after the
> + * final nfsd_file_put().
> + *
>   * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
>   * network byte order is returned.
>   */
> @@ -1139,7 +1182,7 @@ __be32
>  nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		  unsigned int may_flags, struct nfsd_file **pnf)
>  {
> -	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true);
> +	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, 0);
>  }
>  
>  /**
> @@ -1149,6 +1192,10 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>   * @may_flags: NFSD_MAY_ settings for the file
>   * @pnf: OUT: new or found "struct nfsd_file" object
>   *
> + * The nfsd_file_object returned by this API is reference-counted
> + * but not garbage-collected. The object is released immediately
> + * one RCU grace period after the final nfsd_file_put().
> + *
>   * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
>   * network byte order is returned.
>   */
> @@ -1156,7 +1203,7 @@ __be32
>  nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		 unsigned int may_flags, struct nfsd_file **pnf)
>  {
> -	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false);
> +	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false, 0);
>  }
>  
>  /*
> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> index f81c198f4ed6..0f6546bcd3e0 100644
> --- a/fs/nfsd/filecache.h
> +++ b/fs/nfsd/filecache.h
> @@ -38,6 +38,7 @@ struct nfsd_file {
>  #define NFSD_FILE_HASHED	(0)
>  #define NFSD_FILE_PENDING	(1)
>  #define NFSD_FILE_REFERENCED	(2)
> +#define NFSD_FILE_GC		(3)
>  	unsigned long		nf_flags;
>  	struct inode		*nf_inode;	/* don't deref */
>  	refcount_t		nf_ref;
> @@ -55,6 +56,8 @@ void nfsd_file_put(struct nfsd_file *nf);
>  struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
>  void nfsd_file_close_inode_sync(struct inode *inode);
>  bool nfsd_file_is_cached(struct inode *inode);
> +__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
> +		  unsigned int may_flags, struct nfsd_file **nfp);
>  __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		  unsigned int may_flags, struct nfsd_file **nfp);
>  __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
> diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
> index d12823371857..6a5ad6c91d8e 100644
> --- a/fs/nfsd/nfs3proc.c
> +++ b/fs/nfsd/nfs3proc.c
> @@ -779,8 +779,8 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
>  				(unsigned long long) argp->offset);
>  
>  	fh_copy(&resp->fh, &argp->fh);
> -	resp->status = nfsd_file_acquire(rqstp, &resp->fh, NFSD_MAY_WRITE |
> -					 NFSD_MAY_NOT_BREAK_LEASE, &nf);
> +	resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE |
> +					    NFSD_MAY_NOT_BREAK_LEASE, &nf);
>  	if (resp->status)
>  		goto out;
>  	resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset,
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index 9ebd67d461f9..4921144880d3 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -742,7 +742,8 @@ DEFINE_CLID_EVENT(confirmed_r);
>  	__print_flags(val, "|",						\
>  		{ 1 << NFSD_FILE_HASHED,	"HASHED" },		\
>  		{ 1 << NFSD_FILE_PENDING,	"PENDING" },		\
> -		{ 1 << NFSD_FILE_REFERENCED,	"REFERENCED"})
> +		{ 1 << NFSD_FILE_REFERENCED,	"REFERENCED"},		\
> +		{ 1 << NFSD_FILE_GC,		"GC"})
>  
>  DECLARE_EVENT_CLASS(nfsd_file_class,
>  	TP_PROTO(struct nfsd_file *nf),
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 44f210ba17cc..89d682a56fc4 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -1060,7 +1060,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  	__be32 err;
>  
>  	trace_nfsd_read_start(rqstp, fhp, offset, *count);
> -	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
> +	err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf);
>  	if (err)
>  		return err;
>  
> @@ -1092,7 +1092,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
>  
>  	trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
>  
> -	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
> +	err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf);
>  	if (err)
>  		goto out;
>  
> 
> 

Patch itself looks fine though. You can add:

Reviewed-by: Jeff Layton <jlayton@kernel.org>

...but I'd probably prefer to see the flags in that struct as bools
instead if you're amenable to the change.
Chuck Lever Oct. 19, 2022, 2:07 p.m. UTC | #2
> On Oct 19, 2022, at 7:24 AM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> On Tue, 2022-10-18 at 15:58 -0400, Chuck Lever wrote:
>> NFSv4 operations manage the lifetime of nfsd_file items they use by
>> means of NFSv4 OPEN and CLOSE. Hence there's no need for them to be
>> garbage collected.
>> 
>> Introduce a mechanism to enable garbage collection for nfsd_file
>> items used only by NFSv2/3 callers.
>> 
>> Note that the change in nfsd_file_put() ensures that both CLOSE and
>> DELEGRETURN will actually close out and free an nfsd_file on last
>> reference of a non-garbage-collected file.
>> 
>> Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=394
>> Suggested-by: Trond Myklebust <trond.myklebust@hammerspace.com>
>> Tested-by: Jeff Layton <jlayton@kernel.org>
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> fs/nfsd/filecache.c |   61 +++++++++++++++++++++++++++++++++++++++++++++------
>> fs/nfsd/filecache.h |    3 +++
>> fs/nfsd/nfs3proc.c  |    4 ++-
>> fs/nfsd/trace.h     |    3 ++-
>> fs/nfsd/vfs.c       |    4 ++-
>> 5 files changed, 63 insertions(+), 12 deletions(-)
>> 
>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
>> index b7aa523c2010..87fce5c95726 100644
>> --- a/fs/nfsd/filecache.c
>> +++ b/fs/nfsd/filecache.c
>> @@ -63,6 +63,7 @@ struct nfsd_file_lookup_key {
>> 	struct net			*net;
>> 	const struct cred		*cred;
>> 	unsigned char			need;
>> +	unsigned char			gc:1;
> 
> Is it worth it to mess around with bitfields here? There are exising
> holes in this struct, so you're really not saving anything by using one
> here.
> 
> Also, it seems like "need" is used as a bool anyway.

AFAICS, @need is a set of NFSD_MAY flags, not a bool.


> Maybe both of those
> fields should just be bools? It looks like changing that won't change
> the size of the struct anyway. You might even be able to shrink it by
> moving the "type" above "need".

I forgot to check this with pahole. I assumed the compiler would
squeeze these fields into a single 32-bit word, but in fact it
does not do that.

But, I don't think there's any arrangement of these fields that
avoids a hole. We can revisit if more fields are added to the
lookup key struct.


>> 	enum nfsd_file_lookup_type	type;
>> };
>> 
>> @@ -162,6 +163,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
>> 			return 1;
>> 		if (!nfsd_match_cred(nf->nf_cred, key->cred))
>> 			return 1;
>> +		if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
>> +			return 1;
>> 		if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
>> 			return 1;
>> 		break;
>> @@ -297,6 +300,8 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
>> 		nf->nf_flags = 0;
>> 		__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
>> 		__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
>> +		if (key->gc)
>> +			__set_bit(NFSD_FILE_GC, &nf->nf_flags);
>> 		nf->nf_inode = key->inode;
>> 		/* nf_ref is pre-incremented for hash table */
>> 		refcount_set(&nf->nf_ref, 2);
>> @@ -428,16 +433,27 @@ nfsd_file_put_noref(struct nfsd_file *nf)
>> 	}
>> }
>> 
>> +static void
>> +nfsd_file_unhash_and_put(struct nfsd_file *nf)
>> +{
>> +	if (nfsd_file_unhash(nf))
>> +		nfsd_file_put_noref(nf);
>> +}
>> +
>> void
>> nfsd_file_put(struct nfsd_file *nf)
>> {
>> 	might_sleep();
>> 
>> -	nfsd_file_lru_add(nf);
>> +	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
>> +		nfsd_file_lru_add(nf);
>> +	else if (refcount_read(&nf->nf_ref) == 2)
>> +		nfsd_file_unhash_and_put(nf);
>> +
>> 	if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
>> 		nfsd_file_flush(nf);
>> 		nfsd_file_put_noref(nf);
>> -	} else if (nf->nf_file) {
>> +	} else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1) {
>> 		nfsd_file_put_noref(nf);
>> 		nfsd_file_schedule_laundrette();
>> 	} else
>> @@ -1017,12 +1033,14 @@ nfsd_file_is_cached(struct inode *inode)
>> 
>> static __be32
>> nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> -		     unsigned int may_flags, struct nfsd_file **pnf, bool open)
>> +		     unsigned int may_flags, struct nfsd_file **pnf,
>> +		     bool open, int want_gc)
>> {
>> 	struct nfsd_file_lookup_key key = {
>> 		.type	= NFSD_FILE_KEY_FULL,
>> 		.need	= may_flags & NFSD_FILE_MAY_MASK,
>> 		.net	= SVC_NET(rqstp),
>> +		.gc	= want_gc,
>> 	};
>> 	bool open_retry = true;
>> 	struct nfsd_file *nf;
>> @@ -1117,14 +1135,35 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> 	 * then unhash.
>> 	 */
>> 	if (status != nfs_ok || key.inode->i_nlink == 0)
>> -		if (nfsd_file_unhash(nf))
>> -			nfsd_file_put_noref(nf);
>> +		nfsd_file_unhash_and_put(nf);
>> 	clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
>> 	smp_mb__after_atomic();
>> 	wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
>> 	goto out;
>> }
>> 
>> +/**
>> + * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file
>> + * @rqstp: the RPC transaction being executed
>> + * @fhp: the NFS filehandle of the file to be opened
>> + * @may_flags: NFSD_MAY_ settings for the file
>> + * @pnf: OUT: new or found "struct nfsd_file" object
>> + *
>> + * The nfsd_file object returned by this API is reference-counted
>> + * and garbage-collected. The object is retained for a few
>> + * seconds after the final nfsd_file_put() in case the caller
>> + * wants to re-use it.
>> + *
>> + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
>> + * network byte order is returned.
>> + */
>> +__be32
>> +nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> +		     unsigned int may_flags, struct nfsd_file **pnf)
>> +{
>> +	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, 1);
>> +}
>> +
>> /**
>>  * nfsd_file_acquire - Get a struct nfsd_file with an open file
>>  * @rqstp: the RPC transaction being executed
>> @@ -1132,6 +1171,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>  * @may_flags: NFSD_MAY_ settings for the file
>>  * @pnf: OUT: new or found "struct nfsd_file" object
>>  *
>> + * The nfsd_file_object returned by this API is reference-counted
>> + * but not garbage-collected. The object is unhashed after the
>> + * final nfsd_file_put().
>> + *
>>  * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
>>  * network byte order is returned.
>>  */
>> @@ -1139,7 +1182,7 @@ __be32
>> nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> 		  unsigned int may_flags, struct nfsd_file **pnf)
>> {
>> -	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true);
>> +	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, 0);
>> }
>> 
>> /**
>> @@ -1149,6 +1192,10 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>  * @may_flags: NFSD_MAY_ settings for the file
>>  * @pnf: OUT: new or found "struct nfsd_file" object
>>  *
>> + * The nfsd_file_object returned by this API is reference-counted
>> + * but not garbage-collected. The object is released immediately
>> + * one RCU grace period after the final nfsd_file_put().
>> + *
>>  * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
>>  * network byte order is returned.
>>  */
>> @@ -1156,7 +1203,7 @@ __be32
>> nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> 		 unsigned int may_flags, struct nfsd_file **pnf)
>> {
>> -	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false);
>> +	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false, 0);
>> }
>> 
>> /*
>> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
>> index f81c198f4ed6..0f6546bcd3e0 100644
>> --- a/fs/nfsd/filecache.h
>> +++ b/fs/nfsd/filecache.h
>> @@ -38,6 +38,7 @@ struct nfsd_file {
>> #define NFSD_FILE_HASHED	(0)
>> #define NFSD_FILE_PENDING	(1)
>> #define NFSD_FILE_REFERENCED	(2)
>> +#define NFSD_FILE_GC		(3)
>> 	unsigned long		nf_flags;
>> 	struct inode		*nf_inode;	/* don't deref */
>> 	refcount_t		nf_ref;
>> @@ -55,6 +56,8 @@ void nfsd_file_put(struct nfsd_file *nf);
>> struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
>> void nfsd_file_close_inode_sync(struct inode *inode);
>> bool nfsd_file_is_cached(struct inode *inode);
>> +__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> +		  unsigned int may_flags, struct nfsd_file **nfp);
>> __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> 		  unsigned int may_flags, struct nfsd_file **nfp);
>> __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
>> index d12823371857..6a5ad6c91d8e 100644
>> --- a/fs/nfsd/nfs3proc.c
>> +++ b/fs/nfsd/nfs3proc.c
>> @@ -779,8 +779,8 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
>> 				(unsigned long long) argp->offset);
>> 
>> 	fh_copy(&resp->fh, &argp->fh);
>> -	resp->status = nfsd_file_acquire(rqstp, &resp->fh, NFSD_MAY_WRITE |
>> -					 NFSD_MAY_NOT_BREAK_LEASE, &nf);
>> +	resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE |
>> +					    NFSD_MAY_NOT_BREAK_LEASE, &nf);
>> 	if (resp->status)
>> 		goto out;
>> 	resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset,
>> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
>> index 9ebd67d461f9..4921144880d3 100644
>> --- a/fs/nfsd/trace.h
>> +++ b/fs/nfsd/trace.h
>> @@ -742,7 +742,8 @@ DEFINE_CLID_EVENT(confirmed_r);
>> 	__print_flags(val, "|",						\
>> 		{ 1 << NFSD_FILE_HASHED,	"HASHED" },		\
>> 		{ 1 << NFSD_FILE_PENDING,	"PENDING" },		\
>> -		{ 1 << NFSD_FILE_REFERENCED,	"REFERENCED"})
>> +		{ 1 << NFSD_FILE_REFERENCED,	"REFERENCED"},		\
>> +		{ 1 << NFSD_FILE_GC,		"GC"})
>> 
>> DECLARE_EVENT_CLASS(nfsd_file_class,
>> 	TP_PROTO(struct nfsd_file *nf),
>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>> index 44f210ba17cc..89d682a56fc4 100644
>> --- a/fs/nfsd/vfs.c
>> +++ b/fs/nfsd/vfs.c
>> @@ -1060,7 +1060,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> 	__be32 err;
>> 
>> 	trace_nfsd_read_start(rqstp, fhp, offset, *count);
>> -	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
>> +	err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf);
>> 	if (err)
>> 		return err;
>> 
>> @@ -1092,7 +1092,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
>> 
>> 	trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
>> 
>> -	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
>> +	err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf);
>> 	if (err)
>> 		goto out;
>> 
>> 
>> 
> 
> Patch itself looks fine though. You can add:
> 
> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> 
> ...but I'd probably prefer to see the flags in that struct as bools
> instead if you're amenable to the change.

--
Chuck Lever
NeilBrown Oct. 24, 2022, 2:33 a.m. UTC | #3
On Wed, 19 Oct 2022, Chuck Lever wrote:
> NFSv4 operations manage the lifetime of nfsd_file items they use by
> means of NFSv4 OPEN and CLOSE. Hence there's no need for them to be
> garbage collected.
> 
> Introduce a mechanism to enable garbage collection for nfsd_file
> items used only by NFSv2/3 callers.
> 
> Note that the change in nfsd_file_put() ensures that both CLOSE and
> DELEGRETURN will actually close out and free an nfsd_file on last
> reference of a non-garbage-collected file.
> 
> Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=394
> Suggested-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> Tested-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  fs/nfsd/filecache.c |   61 +++++++++++++++++++++++++++++++++++++++++++++------
>  fs/nfsd/filecache.h |    3 +++
>  fs/nfsd/nfs3proc.c  |    4 ++-
>  fs/nfsd/trace.h     |    3 ++-
>  fs/nfsd/vfs.c       |    4 ++-
>  5 files changed, 63 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index b7aa523c2010..87fce5c95726 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -63,6 +63,7 @@ struct nfsd_file_lookup_key {
>  	struct net			*net;
>  	const struct cred		*cred;
>  	unsigned char			need;
> +	unsigned char			gc:1;
>  	enum nfsd_file_lookup_type	type;
>  };
>  
> @@ -162,6 +163,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
>  			return 1;
>  		if (!nfsd_match_cred(nf->nf_cred, key->cred))
>  			return 1;
> +		if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
> +			return 1;
>  		if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
>  			return 1;
>  		break;
> @@ -297,6 +300,8 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
>  		nf->nf_flags = 0;
>  		__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
>  		__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
> +		if (key->gc)
> +			__set_bit(NFSD_FILE_GC, &nf->nf_flags);
>  		nf->nf_inode = key->inode;
>  		/* nf_ref is pre-incremented for hash table */
>  		refcount_set(&nf->nf_ref, 2);
> @@ -428,16 +433,27 @@ nfsd_file_put_noref(struct nfsd_file *nf)
>  	}
>  }
>  
> +static void
> +nfsd_file_unhash_and_put(struct nfsd_file *nf)
> +{
> +	if (nfsd_file_unhash(nf))
> +		nfsd_file_put_noref(nf);
> +}
> +
>  void
>  nfsd_file_put(struct nfsd_file *nf)
>  {
>  	might_sleep();
>  
> -	nfsd_file_lru_add(nf);
> +	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)

Clearly this is a style choice on which sensible people might disagree,
but I much prefer to leave out the "== 1" That is what most callers in
fs/nfsd/ do - only exceptions are here in filecache.c.
Even "!= 0" would be better than "== 1".
I think test_bit() is declared as a bool, but it is hard to be certain.

> +		nfsd_file_lru_add(nf);
> +	else if (refcount_read(&nf->nf_ref) == 2)
> +		nfsd_file_unhash_and_put(nf);

Tests on the value of a refcount are almost always racy.
I suspect there is an implication that as NFSD_FILE_GC is not set, this
*must* be hashed which implies there is guaranteed to be a refcount from
the hashtable.  So this is really a test to see if the pre-biased
refcount is one.  The safe way to test if a refcount is 1 is dec_and_test.

i.e. linkage from the hash table should not count as a reference (in the
not-GC case).  Lookup in the hash table should fail if the found entry
cannot achieve an inc_not_zero.  When dec_and_test says the refcount is
zero, we remove from the hash table (certain that no new references will
be taken).


> +
>  	if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
>  		nfsd_file_flush(nf);
>  		nfsd_file_put_noref(nf);

This seems weird.  If the file was unhashed above (because nf_ref was
2), it would now not be flushed.  Why don't we want it to be flushed in
that case?

> -	} else if (nf->nf_file) {
> +	} else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1) {
>  		nfsd_file_put_noref(nf);
>  		nfsd_file_schedule_laundrette();
>  	} else
> @@ -1017,12 +1033,14 @@ nfsd_file_is_cached(struct inode *inode)
>  
>  static __be32
>  nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> -		     unsigned int may_flags, struct nfsd_file **pnf, bool open)
> +		     unsigned int may_flags, struct nfsd_file **pnf,
> +		     bool open, int want_gc)

I too would prefer "bool" for all intstance of gc and want_gc.

NeilBrown
Jeff Layton Oct. 24, 2022, 4:57 p.m. UTC | #4
On Mon, 2022-10-24 at 13:33 +1100, NeilBrown wrote:
> On Wed, 19 Oct 2022, Chuck Lever wrote:
> > NFSv4 operations manage the lifetime of nfsd_file items they use by
> > means of NFSv4 OPEN and CLOSE. Hence there's no need for them to be
> > garbage collected.
> > 
> > Introduce a mechanism to enable garbage collection for nfsd_file
> > items used only by NFSv2/3 callers.
> > 
> > Note that the change in nfsd_file_put() ensures that both CLOSE and
> > DELEGRETURN will actually close out and free an nfsd_file on last
> > reference of a non-garbage-collected file.
> > 
> > Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=394
> > Suggested-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > Tested-by: Jeff Layton <jlayton@kernel.org>
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > ---
> >  fs/nfsd/filecache.c |   61 +++++++++++++++++++++++++++++++++++++++++++++------
> >  fs/nfsd/filecache.h |    3 +++
> >  fs/nfsd/nfs3proc.c  |    4 ++-
> >  fs/nfsd/trace.h     |    3 ++-
> >  fs/nfsd/vfs.c       |    4 ++-
> >  5 files changed, 63 insertions(+), 12 deletions(-)
> > 
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index b7aa523c2010..87fce5c95726 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -63,6 +63,7 @@ struct nfsd_file_lookup_key {
> >  	struct net			*net;
> >  	const struct cred		*cred;
> >  	unsigned char			need;
> > +	unsigned char			gc:1;
> >  	enum nfsd_file_lookup_type	type;
> >  };
> >  
> > @@ -162,6 +163,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
> >  			return 1;
> >  		if (!nfsd_match_cred(nf->nf_cred, key->cred))
> >  			return 1;
> > +		if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
> > +			return 1;
> >  		if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
> >  			return 1;
> >  		break;
> > @@ -297,6 +300,8 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> >  		nf->nf_flags = 0;
> >  		__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
> >  		__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
> > +		if (key->gc)
> > +			__set_bit(NFSD_FILE_GC, &nf->nf_flags);
> >  		nf->nf_inode = key->inode;
> >  		/* nf_ref is pre-incremented for hash table */
> >  		refcount_set(&nf->nf_ref, 2);
> > @@ -428,16 +433,27 @@ nfsd_file_put_noref(struct nfsd_file *nf)
> >  	}
> >  }
> >  
> > +static void
> > +nfsd_file_unhash_and_put(struct nfsd_file *nf)
> > +{
> > +	if (nfsd_file_unhash(nf))
> > +		nfsd_file_put_noref(nf);
> > +}
> > +
> >  void
> >  nfsd_file_put(struct nfsd_file *nf)
> >  {
> >  	might_sleep();
> >  
> > -	nfsd_file_lru_add(nf);
> > +	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
> 
> Clearly this is a style choice on which sensible people might disagree,
> but I much prefer to leave out the "== 1" That is what most callers in
> fs/nfsd/ do - only exceptions are here in filecache.c.
> Even "!= 0" would be better than "== 1".
> I think test_bit() is declared as a bool, but it is hard to be certain.
> 
> > +		nfsd_file_lru_add(nf);
> > +	else if (refcount_read(&nf->nf_ref) == 2)
> > +		nfsd_file_unhash_and_put(nf);
> 
> Tests on the value of a refcount are almost always racy.

Agreed, and there's a clear race above, now that I look more closely. If
nf_ref is 3 and two puts are racing then neither of them will call
nfsd_file_unhash_and_put. We really should be letting the outcome of the
decrement drive things (like you say below).

> I suspect there is an implication that as NFSD_FILE_GC is not set, this
> *must* be hashed which implies there is guaranteed to be a refcount from
> the hashtable.  So this is really a test to see if the pre-biased
> refcount is one.  The safe way to test if a refcount is 1 is dec_and_test.
> 
> i.e. linkage from the hash table should not count as a reference (in the
> not-GC case).  Lookup in the hash table should fail if the found entry
> cannot achieve an inc_not_zero.  When dec_and_test says the refcount is
> zero, we remove from the hash table (certain that no new references will
> be taken).
> 

This does seem a more sensible approach. That would go a long way toward
simplifying nfsd_file_put.

> 
> > +
> >  	if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
> >  		nfsd_file_flush(nf);
> >  		nfsd_file_put_noref(nf);
> 
> This seems weird.  If the file was unhashed above (because nf_ref was
> 2), it would now not be flushed.  Why don't we want it to be flushed in
> that case?
>
> > -	} else if (nf->nf_file) {
> > +	} else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1) {
> >  		nfsd_file_put_noref(nf);
> >  		nfsd_file_schedule_laundrette();
> >  	} else
> > @@ -1017,12 +1033,14 @@ nfsd_file_is_cached(struct inode *inode)
> >  
> >  static __be32
> >  nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > -		     unsigned int may_flags, struct nfsd_file **pnf, bool open)
> > +		     unsigned int may_flags, struct nfsd_file **pnf,
> > +		     bool open, int want_gc)
> 
> I too would prefer "bool" for all intstance of gc and want_gc.
> 
> NeilBrown
Chuck Lever Oct. 24, 2022, 5:29 p.m. UTC | #5
> On Oct 24, 2022, at 12:57 PM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> On Mon, 2022-10-24 at 13:33 +1100, NeilBrown wrote:
>> On Wed, 19 Oct 2022, Chuck Lever wrote:
>>> NFSv4 operations manage the lifetime of nfsd_file items they use by
>>> means of NFSv4 OPEN and CLOSE. Hence there's no need for them to be
>>> garbage collected.
>>> 
>>> Introduce a mechanism to enable garbage collection for nfsd_file
>>> items used only by NFSv2/3 callers.
>>> 
>>> Note that the change in nfsd_file_put() ensures that both CLOSE and
>>> DELEGRETURN will actually close out and free an nfsd_file on last
>>> reference of a non-garbage-collected file.
>>> 
>>> Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=394
>>> Suggested-by: Trond Myklebust <trond.myklebust@hammerspace.com>
>>> Tested-by: Jeff Layton <jlayton@kernel.org>
>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>> ---
>>> fs/nfsd/filecache.c |   61 +++++++++++++++++++++++++++++++++++++++++++++------
>>> fs/nfsd/filecache.h |    3 +++
>>> fs/nfsd/nfs3proc.c  |    4 ++-
>>> fs/nfsd/trace.h     |    3 ++-
>>> fs/nfsd/vfs.c       |    4 ++-
>>> 5 files changed, 63 insertions(+), 12 deletions(-)
>>> 
>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
>>> index b7aa523c2010..87fce5c95726 100644
>>> --- a/fs/nfsd/filecache.c
>>> +++ b/fs/nfsd/filecache.c
>>> @@ -63,6 +63,7 @@ struct nfsd_file_lookup_key {
>>> 	struct net			*net;
>>> 	const struct cred		*cred;
>>> 	unsigned char			need;
>>> +	unsigned char			gc:1;
>>> 	enum nfsd_file_lookup_type	type;
>>> };
>>> 
>>> @@ -162,6 +163,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
>>> 			return 1;
>>> 		if (!nfsd_match_cred(nf->nf_cred, key->cred))
>>> 			return 1;
>>> +		if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
>>> +			return 1;
>>> 		if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
>>> 			return 1;
>>> 		break;
>>> @@ -297,6 +300,8 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
>>> 		nf->nf_flags = 0;
>>> 		__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
>>> 		__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
>>> +		if (key->gc)
>>> +			__set_bit(NFSD_FILE_GC, &nf->nf_flags);
>>> 		nf->nf_inode = key->inode;
>>> 		/* nf_ref is pre-incremented for hash table */
>>> 		refcount_set(&nf->nf_ref, 2);
>>> @@ -428,16 +433,27 @@ nfsd_file_put_noref(struct nfsd_file *nf)
>>> 	}
>>> }
>>> 
>>> +static void
>>> +nfsd_file_unhash_and_put(struct nfsd_file *nf)
>>> +{
>>> +	if (nfsd_file_unhash(nf))
>>> +		nfsd_file_put_noref(nf);
>>> +}
>>> +
>>> void
>>> nfsd_file_put(struct nfsd_file *nf)
>>> {
>>> 	might_sleep();
>>> 
>>> -	nfsd_file_lru_add(nf);
>>> +	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
>> 
>> Clearly this is a style choice on which sensible people might disagree,
>> but I much prefer to leave out the "== 1" That is what most callers in
>> fs/nfsd/ do - only exceptions are here in filecache.c.
>> Even "!= 0" would be better than "== 1".
>> I think test_bit() is declared as a bool, but it is hard to be certain.
>> 
>>> +		nfsd_file_lru_add(nf);
>>> +	else if (refcount_read(&nf->nf_ref) == 2)
>>> +		nfsd_file_unhash_and_put(nf);
>> 
>> Tests on the value of a refcount are almost always racy.
> 
> Agreed, and there's a clear race above, now that I look more closely. If
> nf_ref is 3 and two puts are racing then neither of them will call
> nfsd_file_unhash_and_put. We really should be letting the outcome of the
> decrement drive things (like you say below).
> 
>> I suspect there is an implication that as NFSD_FILE_GC is not set, this
>> *must* be hashed which implies there is guaranteed to be a refcount from
>> the hashtable.  So this is really a test to see if the pre-biased
>> refcount is one.  The safe way to test if a refcount is 1 is dec_and_test.
>> 
>> i.e. linkage from the hash table should not count as a reference (in the
>> not-GC case).  Lookup in the hash table should fail if the found entry
>> cannot achieve an inc_not_zero.  When dec_and_test says the refcount is
>> zero, we remove from the hash table (certain that no new references will
>> be taken).
>> 
> 
> This does seem a more sensible approach. That would go a long way toward
> simplifying nfsd_file_put.

So I cut-and-pasted the approach you used in the patch you sent a few
weeks ago. I don't object to replacing that... but I don't see exactly
where you guys are going with this.


>>> +
>>> 	if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
>>> 		nfsd_file_flush(nf);
>>> 		nfsd_file_put_noref(nf);
>> 
>> This seems weird.  If the file was unhashed above (because nf_ref was
>> 2), it would now not be flushed.  Why don't we want it to be flushed in
>> that case?
>> 
>>> -	} else if (nf->nf_file) {
>>> +	} else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1) {
>>> 		nfsd_file_put_noref(nf);
>>> 		nfsd_file_schedule_laundrette();
>>> 	} else
>>> @@ -1017,12 +1033,14 @@ nfsd_file_is_cached(struct inode *inode)
>>> 
>>> static __be32
>>> nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>> -		     unsigned int may_flags, struct nfsd_file **pnf, bool open)
>>> +		     unsigned int may_flags, struct nfsd_file **pnf,
>>> +		     bool open, int want_gc)
>> 
>> I too would prefer "bool" for all intstance of gc and want_gc.
>> 
>> NeilBrown
> 
> -- 
> Jeff Layton <jlayton@kernel.org>

--
Chuck Lever
Chuck Lever Oct. 24, 2022, 6:43 p.m. UTC | #6
> On Oct 23, 2022, at 10:33 PM, NeilBrown <neilb@suse.de> wrote:
> 
> On Wed, 19 Oct 2022, Chuck Lever wrote:
>> NFSv4 operations manage the lifetime of nfsd_file items they use by
>> means of NFSv4 OPEN and CLOSE. Hence there's no need for them to be
>> garbage collected.
>> 
>> Introduce a mechanism to enable garbage collection for nfsd_file
>> items used only by NFSv2/3 callers.
>> 
>> Note that the change in nfsd_file_put() ensures that both CLOSE and
>> DELEGRETURN will actually close out and free an nfsd_file on last
>> reference of a non-garbage-collected file.
>> 
>> Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=394
>> Suggested-by: Trond Myklebust <trond.myklebust@hammerspace.com>
>> Tested-by: Jeff Layton <jlayton@kernel.org>
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> fs/nfsd/filecache.c |   61 +++++++++++++++++++++++++++++++++++++++++++++------
>> fs/nfsd/filecache.h |    3 +++
>> fs/nfsd/nfs3proc.c  |    4 ++-
>> fs/nfsd/trace.h     |    3 ++-
>> fs/nfsd/vfs.c       |    4 ++-
>> 5 files changed, 63 insertions(+), 12 deletions(-)
>> 
>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
>> index b7aa523c2010..87fce5c95726 100644
>> --- a/fs/nfsd/filecache.c
>> +++ b/fs/nfsd/filecache.c
>> @@ -63,6 +63,7 @@ struct nfsd_file_lookup_key {
>> 	struct net			*net;
>> 	const struct cred		*cred;
>> 	unsigned char			need;
>> +	unsigned char			gc:1;
>> 	enum nfsd_file_lookup_type	type;
>> };
>> 
>> @@ -162,6 +163,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
>> 			return 1;
>> 		if (!nfsd_match_cred(nf->nf_cred, key->cred))
>> 			return 1;
>> +		if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
>> +			return 1;
>> 		if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
>> 			return 1;
>> 		break;
>> @@ -297,6 +300,8 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
>> 		nf->nf_flags = 0;
>> 		__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
>> 		__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
>> +		if (key->gc)
>> +			__set_bit(NFSD_FILE_GC, &nf->nf_flags);
>> 		nf->nf_inode = key->inode;
>> 		/* nf_ref is pre-incremented for hash table */
>> 		refcount_set(&nf->nf_ref, 2);
>> @@ -428,16 +433,27 @@ nfsd_file_put_noref(struct nfsd_file *nf)
>> 	}
>> }
>> 
>> +static void
>> +nfsd_file_unhash_and_put(struct nfsd_file *nf)
>> +{
>> +	if (nfsd_file_unhash(nf))
>> +		nfsd_file_put_noref(nf);
>> +}
>> +
>> void
>> nfsd_file_put(struct nfsd_file *nf)
>> {
>> 	might_sleep();
>> 
>> -	nfsd_file_lru_add(nf);
>> +	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
> 
> Clearly this is a style choice on which sensible people might disagree,
> but I much prefer to leave out the "== 1" That is what most callers in
> fs/nfsd/ do - only exceptions are here in filecache.c.
> Even "!= 0" would be better than "== 1".
> I think test_bit() is declared as a bool, but it is hard to be certain.

The definitions of test_bit() I've seen return "int" which is why
I wrote it this way.


>> @@ -1017,12 +1033,14 @@ nfsd_file_is_cached(struct inode *inode)
>> 
>> static __be32
>> nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
>> -		     unsigned int may_flags, struct nfsd_file **pnf, bool open)
>> +		     unsigned int may_flags, struct nfsd_file **pnf,
>> +		     bool open, int want_gc)
> 
> I too would prefer "bool" for all intstance of gc and want_gc.

OK. I think I've figured out a way to do that and still deal
safely with the test_bit() return type silliness.

v5 coming soon.


--
Chuck Lever
NeilBrown Oct. 24, 2022, 9:27 p.m. UTC | #7
On Tue, 25 Oct 2022, Chuck Lever III wrote:
> 
> > On Oct 23, 2022, at 10:33 PM, NeilBrown <neilb@suse.de> wrote:
> > 
> > On Wed, 19 Oct 2022, Chuck Lever wrote:
> >> NFSv4 operations manage the lifetime of nfsd_file items they use by
> >> means of NFSv4 OPEN and CLOSE. Hence there's no need for them to be
> >> garbage collected.
> >> 
> >> Introduce a mechanism to enable garbage collection for nfsd_file
> >> items used only by NFSv2/3 callers.
> >> 
> >> Note that the change in nfsd_file_put() ensures that both CLOSE and
> >> DELEGRETURN will actually close out and free an nfsd_file on last
> >> reference of a non-garbage-collected file.
> >> 
> >> Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=394
> >> Suggested-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> >> Tested-by: Jeff Layton <jlayton@kernel.org>
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >> ---
> >> fs/nfsd/filecache.c |   61 +++++++++++++++++++++++++++++++++++++++++++++------
> >> fs/nfsd/filecache.h |    3 +++
> >> fs/nfsd/nfs3proc.c  |    4 ++-
> >> fs/nfsd/trace.h     |    3 ++-
> >> fs/nfsd/vfs.c       |    4 ++-
> >> 5 files changed, 63 insertions(+), 12 deletions(-)
> >> 
> >> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> >> index b7aa523c2010..87fce5c95726 100644
> >> --- a/fs/nfsd/filecache.c
> >> +++ b/fs/nfsd/filecache.c
> >> @@ -63,6 +63,7 @@ struct nfsd_file_lookup_key {
> >> 	struct net			*net;
> >> 	const struct cred		*cred;
> >> 	unsigned char			need;
> >> +	unsigned char			gc:1;
> >> 	enum nfsd_file_lookup_type	type;
> >> };
> >> 
> >> @@ -162,6 +163,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
> >> 			return 1;
> >> 		if (!nfsd_match_cred(nf->nf_cred, key->cred))
> >> 			return 1;
> >> +		if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
> >> +			return 1;
> >> 		if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
> >> 			return 1;
> >> 		break;
> >> @@ -297,6 +300,8 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> >> 		nf->nf_flags = 0;
> >> 		__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
> >> 		__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
> >> +		if (key->gc)
> >> +			__set_bit(NFSD_FILE_GC, &nf->nf_flags);
> >> 		nf->nf_inode = key->inode;
> >> 		/* nf_ref is pre-incremented for hash table */
> >> 		refcount_set(&nf->nf_ref, 2);
> >> @@ -428,16 +433,27 @@ nfsd_file_put_noref(struct nfsd_file *nf)
> >> 	}
> >> }
> >> 
> >> +static void
> >> +nfsd_file_unhash_and_put(struct nfsd_file *nf)
> >> +{
> >> +	if (nfsd_file_unhash(nf))
> >> +		nfsd_file_put_noref(nf);
> >> +}
> >> +
> >> void
> >> nfsd_file_put(struct nfsd_file *nf)
> >> {
> >> 	might_sleep();
> >> 
> >> -	nfsd_file_lru_add(nf);
> >> +	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
> > 
> > Clearly this is a style choice on which sensible people might disagree,
> > but I much prefer to leave out the "== 1" That is what most callers in
> > fs/nfsd/ do - only exceptions are here in filecache.c.
> > Even "!= 0" would be better than "== 1".
> > I think test_bit() is declared as a bool, but it is hard to be certain.
> 
> The definitions of test_bit() I've seen return "int" which is why
> I wrote it this way.

Just for completeness, I found

#define test_bit(nr, addr)              bitop(_test_bit, nr, addr)

in linux/bitops.h.

bitop() is defined just above and (I think) in the case where nr is
constant and addr is not NULL, this becomes a call to const_test_bit().
In include/asm-generic/bitops/generic-non-atomic.h I found

static __always_inline bool
const_test_bit(unsigned long nr, const volatile unsigned long *addr)

In the non-constant case it calls _test_bit() which is

static __always_inline bool
_test_bit(unsigned long nr, const volatile unsigned long *addr)

But as an int that is known to be 0 or 1 is type-compatible with a bool,
ht shouldn't really matter from a type perspective, only for a human
readability perspective.

Thanks,
NeilBrown
NeilBrown Oct. 25, 2022, 12:15 a.m. UTC | #8
On Tue, 25 Oct 2022, Chuck Lever III wrote:
> 
> > On Oct 24, 2022, at 12:57 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > On Mon, 2022-10-24 at 13:33 +1100, NeilBrown wrote:
> >> On Wed, 19 Oct 2022, Chuck Lever wrote:
> >>> +		nfsd_file_lru_add(nf);
> >>> +	else if (refcount_read(&nf->nf_ref) == 2)
> >>> +		nfsd_file_unhash_and_put(nf);
> >> 
> >> Tests on the value of a refcount are almost always racy.
> > 
> > Agreed, and there's a clear race above, now that I look more closely. If
> > nf_ref is 3 and two puts are racing then neither of them will call
> > nfsd_file_unhash_and_put. We really should be letting the outcome of the
> > decrement drive things (like you say below).
> > 
> >> I suspect there is an implication that as NFSD_FILE_GC is not set, this
> >> *must* be hashed which implies there is guaranteed to be a refcount from
> >> the hashtable.  So this is really a test to see if the pre-biased
> >> refcount is one.  The safe way to test if a refcount is 1 is dec_and_test.
> >> 
> >> i.e. linkage from the hash table should not count as a reference (in the
> >> not-GC case).  Lookup in the hash table should fail if the found entry
> >> cannot achieve an inc_not_zero.  When dec_and_test says the refcount is
> >> zero, we remove from the hash table (certain that no new references will
> >> be taken).
> >> 
> > 
> > This does seem a more sensible approach. That would go a long way toward
> > simplifying nfsd_file_put.
> 
> So I cut-and-pasted the approach you used in the patch you sent a few
> weeks ago. I don't object to replacing that... but I don't see exactly
> where you guys are going with this.
> 

Where I'm going with this is to suggest that this ref-counting oddity is
possibly the cause of the occasional refcounting problems that you
have had with nfsd.

I think that the hash table should never own a reference to the
nfsd_file.  If the refcount ever reaches zero, then it gets removed from
the hash table.

  if (refcount_dec_and_test())
      if (test_and_clear_bit(HASHED,...))
         delete from hash table

Transient references are counted.
For NFSv2,3 existence in the LRU is counted (I don't think they are at present).
For NFSv4, references from nfs4_file are counted.
But presence in the hash table is only indicated by the HASHED flag.

I think this would make the ref counting more robust.

NeilBrown
diff mbox series

Patch

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index b7aa523c2010..87fce5c95726 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -63,6 +63,7 @@  struct nfsd_file_lookup_key {
 	struct net			*net;
 	const struct cred		*cred;
 	unsigned char			need;
+	unsigned char			gc:1;
 	enum nfsd_file_lookup_type	type;
 };
 
@@ -162,6 +163,8 @@  static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
 			return 1;
 		if (!nfsd_match_cred(nf->nf_cred, key->cred))
 			return 1;
+		if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
+			return 1;
 		if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
 			return 1;
 		break;
@@ -297,6 +300,8 @@  nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
 		nf->nf_flags = 0;
 		__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
 		__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+		if (key->gc)
+			__set_bit(NFSD_FILE_GC, &nf->nf_flags);
 		nf->nf_inode = key->inode;
 		/* nf_ref is pre-incremented for hash table */
 		refcount_set(&nf->nf_ref, 2);
@@ -428,16 +433,27 @@  nfsd_file_put_noref(struct nfsd_file *nf)
 	}
 }
 
+static void
+nfsd_file_unhash_and_put(struct nfsd_file *nf)
+{
+	if (nfsd_file_unhash(nf))
+		nfsd_file_put_noref(nf);
+}
+
 void
 nfsd_file_put(struct nfsd_file *nf)
 {
 	might_sleep();
 
-	nfsd_file_lru_add(nf);
+	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1)
+		nfsd_file_lru_add(nf);
+	else if (refcount_read(&nf->nf_ref) == 2)
+		nfsd_file_unhash_and_put(nf);
+
 	if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
 		nfsd_file_flush(nf);
 		nfsd_file_put_noref(nf);
-	} else if (nf->nf_file) {
+	} else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags) == 1) {
 		nfsd_file_put_noref(nf);
 		nfsd_file_schedule_laundrette();
 	} else
@@ -1017,12 +1033,14 @@  nfsd_file_is_cached(struct inode *inode)
 
 static __be32
 nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
-		     unsigned int may_flags, struct nfsd_file **pnf, bool open)
+		     unsigned int may_flags, struct nfsd_file **pnf,
+		     bool open, int want_gc)
 {
 	struct nfsd_file_lookup_key key = {
 		.type	= NFSD_FILE_KEY_FULL,
 		.need	= may_flags & NFSD_FILE_MAY_MASK,
 		.net	= SVC_NET(rqstp),
+		.gc	= want_gc,
 	};
 	bool open_retry = true;
 	struct nfsd_file *nf;
@@ -1117,14 +1135,35 @@  nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 * then unhash.
 	 */
 	if (status != nfs_ok || key.inode->i_nlink == 0)
-		if (nfsd_file_unhash(nf))
-			nfsd_file_put_noref(nf);
+		nfsd_file_unhash_and_put(nf);
 	clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
 	smp_mb__after_atomic();
 	wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
 	goto out;
 }
 
+/**
+ * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file to be opened
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * The nfsd_file object returned by this API is reference-counted
+ * and garbage-collected. The object is retained for a few
+ * seconds after the final nfsd_file_put() in case the caller
+ * wants to re-use it.
+ *
+ * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
+ * network byte order is returned.
+ */
+__be32
+nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		     unsigned int may_flags, struct nfsd_file **pnf)
+{
+	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, 1);
+}
+
 /**
  * nfsd_file_acquire - Get a struct nfsd_file with an open file
  * @rqstp: the RPC transaction being executed
@@ -1132,6 +1171,10 @@  nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
  * @may_flags: NFSD_MAY_ settings for the file
  * @pnf: OUT: new or found "struct nfsd_file" object
  *
+ * The nfsd_file_object returned by this API is reference-counted
+ * but not garbage-collected. The object is unhashed after the
+ * final nfsd_file_put().
+ *
  * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
  * network byte order is returned.
  */
@@ -1139,7 +1182,7 @@  __be32
 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		  unsigned int may_flags, struct nfsd_file **pnf)
 {
-	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true);
+	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, 0);
 }
 
 /**
@@ -1149,6 +1192,10 @@  nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
  * @may_flags: NFSD_MAY_ settings for the file
  * @pnf: OUT: new or found "struct nfsd_file" object
  *
+ * The nfsd_file_object returned by this API is reference-counted
+ * but not garbage-collected. The object is released immediately
+ * one RCU grace period after the final nfsd_file_put().
+ *
  * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
  * network byte order is returned.
  */
@@ -1156,7 +1203,7 @@  __be32
 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		 unsigned int may_flags, struct nfsd_file **pnf)
 {
-	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false);
+	return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false, 0);
 }
 
 /*
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index f81c198f4ed6..0f6546bcd3e0 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -38,6 +38,7 @@  struct nfsd_file {
 #define NFSD_FILE_HASHED	(0)
 #define NFSD_FILE_PENDING	(1)
 #define NFSD_FILE_REFERENCED	(2)
+#define NFSD_FILE_GC		(3)
 	unsigned long		nf_flags;
 	struct inode		*nf_inode;	/* don't deref */
 	refcount_t		nf_ref;
@@ -55,6 +56,8 @@  void nfsd_file_put(struct nfsd_file *nf);
 struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
 void nfsd_file_close_inode_sync(struct inode *inode);
 bool nfsd_file_is_cached(struct inode *inode);
+__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		  unsigned int may_flags, struct nfsd_file **nfp);
 __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		  unsigned int may_flags, struct nfsd_file **nfp);
 __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index d12823371857..6a5ad6c91d8e 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -779,8 +779,8 @@  nfsd3_proc_commit(struct svc_rqst *rqstp)
 				(unsigned long long) argp->offset);
 
 	fh_copy(&resp->fh, &argp->fh);
-	resp->status = nfsd_file_acquire(rqstp, &resp->fh, NFSD_MAY_WRITE |
-					 NFSD_MAY_NOT_BREAK_LEASE, &nf);
+	resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE |
+					    NFSD_MAY_NOT_BREAK_LEASE, &nf);
 	if (resp->status)
 		goto out;
 	resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset,
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 9ebd67d461f9..4921144880d3 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -742,7 +742,8 @@  DEFINE_CLID_EVENT(confirmed_r);
 	__print_flags(val, "|",						\
 		{ 1 << NFSD_FILE_HASHED,	"HASHED" },		\
 		{ 1 << NFSD_FILE_PENDING,	"PENDING" },		\
-		{ 1 << NFSD_FILE_REFERENCED,	"REFERENCED"})
+		{ 1 << NFSD_FILE_REFERENCED,	"REFERENCED"},		\
+		{ 1 << NFSD_FILE_GC,		"GC"})
 
 DECLARE_EVENT_CLASS(nfsd_file_class,
 	TP_PROTO(struct nfsd_file *nf),
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 44f210ba17cc..89d682a56fc4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1060,7 +1060,7 @@  __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	__be32 err;
 
 	trace_nfsd_read_start(rqstp, fhp, offset, *count);
-	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
+	err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf);
 	if (err)
 		return err;
 
@@ -1092,7 +1092,7 @@  nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
 
 	trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
 
-	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
+	err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf);
 	if (err)
 		goto out;