
[v4,5/7] NFSD: Use rhashtable for managing nfs4_file objects

Message ID 166612313084.1291.5764156173845222109.stgit@manet.1015granger.net (mailing list archive)
State New, archived
Series A course adjustment, for sure

Commit Message

Chuck Lever Oct. 18, 2022, 7:58 p.m. UTC
fh_match() is costly, especially when filehandles are large (as is
the case for NFSv4). It needs to be used sparingly when searching
data structures. Unfortunately, with common workloads, I see
multiple thousands of objects stored in file_hashtbl[], which always
has only 256 buckets, which makes the hash chains quite lengthy.

Walking long hash chains with the state_lock held blocks other
activity that needs that lock.

To help mitigate the cost of searching with fh_match(), replace the
nfs4_file hash table with an rhashtable, which can dynamically
resize its bucket array to minimize hash chain length. The ideal for
this use case is one bucket per inode.

The result is an improvement in the latency of NFSv4 operations
and the reduction of nfsd CPU utilization due to the cost of
fh_match() and the CPU cache misses incurred while walking long
hash chains in the nfs4_file hash table.

Note that hash removal is no longer protected by the state_lock.
This is because insert_nfs4_file() takes the RCU read lock then the
state_lock. rhltable_remove() takes the RCU read lock internally;
if remove_nfs4_file() took the state_lock before calling
rhltable_remove(), that would result in a lock inversion.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c |  215 +++++++++++++++++++++++++++++++++++----------------
 fs/nfsd/state.h     |    5 -
 2 files changed, 147 insertions(+), 73 deletions(-)

Comments

Jeff Layton Oct. 19, 2022, 11:39 a.m. UTC | #1
On Tue, 2022-10-18 at 15:58 -0400, Chuck Lever wrote:
> [ ... ]
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 2b850de288cf..a63334ad61f6 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -44,7 +44,9 @@
>  #include <linux/jhash.h>
>  #include <linux/string_helpers.h>
>  #include <linux/fsnotify.h>
> +#include <linux/rhashtable.h>
>  #include <linux/nfs_ssc.h>
> +
>  #include "xdr4.h"
>  #include "xdr4cb.h"
>  #include "vfs.h"
> @@ -84,6 +86,7 @@ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
>  static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
>  void nfsd4_end_grace(struct nfsd_net *nn);
>  static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps);
> +static void remove_nfs4_file(struct nfs4_file *fi);
>  
>  /* Locking: */
>  
> @@ -577,11 +580,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
>  void
>  put_nfs4_file(struct nfs4_file *fi)
>  {
> -	might_lock(&state_lock);
> -
> -	if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
> -		hlist_del_rcu(&fi->fi_hash);
> -		spin_unlock(&state_lock);
> +	if (refcount_dec_and_test(&fi->fi_ref)) {
> +		remove_nfs4_file(fi);
>  		WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
>  		WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
>  		call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
> @@ -695,19 +695,82 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
>  	return ret & OWNER_HASH_MASK;
>  }
>  
> -/* hash table for nfs4_file */
> -#define FILE_HASH_BITS                   8
> -#define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
> +static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
> +
> +/*
> + * The returned hash value is based solely on the address of an in-core
> + * inode, a pointer to a slab-allocated object. The entropy in such a
> + * pointer is concentrated in its middle bits.
> + */
> +static u32 nfs4_file_inode_hash(const struct inode *inode, u32 seed)
> +{
> +	u32 k;
> +
> +	k = ((unsigned long)inode) >> L1_CACHE_SHIFT;
> +	return jhash2(&k, 1, seed);
> +}
>  
> -static unsigned int file_hashval(struct svc_fh *fh)
> +/**
> + * nfs4_file_key_hashfn - Compute the hash value of a lookup key
> + * @data: key on which to compute the hash value
> + * @len: rhash table's key_len parameter (unused)
> + * @seed: rhash table's random seed of the day
> + *
> + * Return value:
> + *   Computed 32-bit hash value
> + */
> +static u32 nfs4_file_key_hashfn(const void *data, u32 len, u32 seed)
>  {
> -	struct inode *inode = d_inode(fh->fh_dentry);
> +	const struct svc_fh *fhp = data;
>  
> -	/* XXX: why not (here & in file cache) use inode? */
> -	return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
> +	return nfs4_file_inode_hash(d_inode(fhp->fh_dentry), seed);
>  }
>  
> -static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
> +/**
> + * nfs4_file_obj_hashfn - Compute the hash value of an nfs4_file object
> + * @data: object on which to compute the hash value
> + * @len: rhash table's key_len parameter (unused)
> + * @seed: rhash table's random seed of the day
> + *
> + * Return value:
> + *   Computed 32-bit hash value
> + */
> +static u32 nfs4_file_obj_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	const struct nfs4_file *fi = data;
> +
> +	return nfs4_file_inode_hash(fi->fi_inode, seed);
> +}
> +
> +/**
> + * nfs4_file_obj_cmpfn - Match a cache item against search criteria
> + * @arg: search criteria
> + * @ptr: cache item to check
> + *
> + * Return values:
> + *   %0 - Success; item matches search criteria
> + */
> +static int nfs4_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
> +			       const void *ptr)
> +{
> +	const struct svc_fh *fhp = arg->key;
> +	const struct nfs4_file *fi = ptr;
> +
> +	return fh_match(&fi->fi_fhandle, &fhp->fh_handle) ? 0 : 1;
> +}
> +
> +static const struct rhashtable_params nfs4_file_rhash_params = {
> +	.key_len		= sizeof_field(struct nfs4_file, fi_inode),
> +	.key_offset		= offsetof(struct nfs4_file, fi_inode),
> +	.head_offset		= offsetof(struct nfs4_file, fi_rhash),
> +	.hashfn			= nfs4_file_key_hashfn,
> +	.obj_hashfn		= nfs4_file_obj_hashfn,
> +	.obj_cmpfn		= nfs4_file_obj_cmpfn,
> +
> +	/* Reduce resizing churn on light workloads */
> +	.min_size		= 512,		/* buckets */
> +	.automatic_shrinking	= true,
> +};
>  
>  /*
>   * Check if courtesy clients have conflicting access and resolve it if possible
> @@ -4251,11 +4314,8 @@ static struct nfs4_file *nfsd4_alloc_file(void)
>  }
>  
>  /* OPEN Share state helper functions */
> -static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
> -				struct nfs4_file *fp)
> +static void init_nfs4_file(const struct svc_fh *fh, struct nfs4_file *fp)
>  {
> -	lockdep_assert_held(&state_lock);
> -
>  	refcount_set(&fp->fi_ref, 1);
>  	spin_lock_init(&fp->fi_lock);
>  	INIT_LIST_HEAD(&fp->fi_stateids);
> @@ -4273,7 +4333,6 @@ static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
>  	INIT_LIST_HEAD(&fp->fi_lo_states);
>  	atomic_set(&fp->fi_lo_recalls, 0);
>  #endif
> -	hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
>  }
>  
>  void
> @@ -4626,71 +4685,75 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
>  		nfs4_put_stid(&last->st_stid);
>  }
>  
> -/* search file_hashtbl[] for file */
> -static struct nfs4_file *
> -find_file_locked(struct svc_fh *fh, unsigned int hashval)
> +static struct nfs4_file *find_nfs4_file(const struct svc_fh *fhp)
>  {
> -	struct nfs4_file *fp;
> +	struct rhlist_head *tmp, *list;
> +	struct nfs4_file *fi = NULL;
>  
> -	hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
> -				lockdep_is_held(&state_lock)) {
> -		if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
> -			if (refcount_inc_not_zero(&fp->fi_ref))
> -				return fp;
> +	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
> +			       nfs4_file_rhash_params);
> +	rhl_for_each_entry_rcu(fi, tmp, list, fi_rhash)
> +		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
> +			if (!refcount_inc_not_zero(&fi->fi_ref))
> +				fi = NULL;
> +			break;
>  		}
> -	}
> -	return NULL;
> +	return fi;
>  }
>  
> -static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh,
> -				     unsigned int hashval)
> +/*
> + * On hash insertion, identify entries with the same inode but
> + * distinct filehandles. They will all be in the same hash bucket
> + * because we hash by the address of the nfs4_file's struct inode.
> + */
> +static struct nfs4_file *insert_file(struct nfs4_file *new,
> +				     const struct svc_fh *fhp)
>  {
> -	struct nfs4_file *fp;
> -	struct nfs4_file *ret = NULL;
> -	bool alias_found = false;
> +	struct rhlist_head *tmp, *list;
> +	struct nfs4_file *fi;
> +	int err;
>  
>  	spin_lock(&state_lock);
> -	hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
> -				 lockdep_is_held(&state_lock)) {
> -		if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
> -			if (refcount_inc_not_zero(&fp->fi_ref))
> -				ret = fp;
> -		} else if (d_inode(fh->fh_dentry) == fp->fi_inode)
> -			fp->fi_aliased = alias_found = true;
> -	}
> -	if (likely(ret == NULL)) {
> -		nfsd4_init_file(fh, hashval, new);
> -		new->fi_aliased = alias_found;
> -		ret = new;
> +
> +	err = rhltable_insert_key(&nfs4_file_rhltable, fhp,
> +				  &new->fi_rhash,
> +				  nfs4_file_rhash_params);
> +	if (err) {
> +		spin_unlock(&state_lock);
> +		return NULL;
>  	}
> +
> +	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
> +			       nfs4_file_rhash_params);
> +	rhl_for_each_entry_rcu(fi, tmp, list, fi_rhash)
> +		if (fi->fi_inode == d_inode(fhp->fh_dentry) &&
> +		    !fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
> +			fi->fi_aliased = true;
> +			new->fi_aliased = true;
> +		}
> +
>  	spin_unlock(&state_lock);
> -	return ret;
> +	return new;
>  }
>  
> -static struct nfs4_file * find_file(struct svc_fh *fh)
> +static noinline struct nfs4_file *
> +insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
>  {
> -	struct nfs4_file *fp;
> -	unsigned int hashval = file_hashval(fh);
> +	struct nfs4_file *fi;
>  
> -	rcu_read_lock();
> -	fp = find_file_locked(fh, hashval);
> -	rcu_read_unlock();
> -	return fp;
> +	/* Do not hash the same filehandle more than once */
> +	fi = find_nfs4_file(fhp);
> +	if (unlikely(fi))
> +		return fi;
> +
> +	init_nfs4_file(fhp, new);
> +	return insert_file(new, fhp);
>  }
>  
> -static struct nfs4_file *
> -find_or_add_file(struct nfs4_file *new, struct svc_fh *fh)
> +static noinline void remove_nfs4_file(struct nfs4_file *fi)
>  {
> -	struct nfs4_file *fp;
> -	unsigned int hashval = file_hashval(fh);
> -
> -	rcu_read_lock();
> -	fp = find_file_locked(fh, hashval);
> -	rcu_read_unlock();
> -	if (fp)
> -		return fp;
> -
> -	return insert_file(new, fh, hashval);
> +	rhltable_remove(&nfs4_file_rhltable, &fi->fi_rhash,
> +			nfs4_file_rhash_params);
>  }
>  
>  /*
> @@ -4703,9 +4766,12 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
>  	struct nfs4_file *fp;
>  	__be32 ret = nfs_ok;
>  
> -	fp = find_file(current_fh);
> +	rcu_read_lock();
> +	fp = find_nfs4_file(current_fh);
> +	rcu_read_unlock();
>  	if (!fp)
>  		return ret;
> +
>  	/* Check for conflicting share reservations */
>  	spin_lock(&fp->fi_lock);
>  	if (fp->fi_share_deny & deny_type)
> @@ -5548,7 +5614,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
>  	 * and check for delegations in the process of being recalled.
>  	 * If not found, create the nfs4_file struct
>  	 */
> -	fp = find_or_add_file(open->op_file, current_fh);
> +	rcu_read_lock();
> +	fp = insert_nfs4_file(open->op_file, current_fh);
> +	rcu_read_unlock();

It'd probably be better to push this rcu_read_lock down into
insert_nfs4_file. You don't need to hold it over the actual insertion,
since that requires the state_lock.
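
A sketch of that rearrangement, for illustration only (find_nfs4_file()
and insert_file() are the helpers this patch introduces; insert_file()
would wrap its own rhltable_lookup() call in rcu_read_lock()):

	static noinline struct nfs4_file *
	insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
	{
		struct nfs4_file *fi;

		/* Do not hash the same filehandle more than once */
		rcu_read_lock();
		fi = find_nfs4_file(fhp);
		rcu_read_unlock();
		if (unlikely(fi))
			return fi;

		init_nfs4_file(fhp, new);
		return insert_file(new, fhp);
	}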


> +	if (unlikely(!fp))
> +		return nfserr_jukebox;
>  	if (fp != open->op_file) {
>  		status = nfs4_check_deleg(cl, open, &dp);
>  		if (status)
> @@ -7905,10 +7975,16 @@ nfs4_state_start(void)
>  {
>  	int ret;
>  
> -	ret = nfsd4_create_callback_queue();
> +	ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params);
>  	if (ret)
>  		return ret;
>  
> +	ret = nfsd4_create_callback_queue();
> +	if (ret) {
> +		rhltable_destroy(&nfs4_file_rhltable);
> +		return ret;
> +	}
> +
>  	set_max_delegations();
>  	return 0;
>  }
> @@ -7939,6 +8015,7 @@ nfs4_state_shutdown_net(struct net *net)
>  
>  	nfsd4_client_tracking_exit(net);
>  	nfs4_state_destroy_net(net);
> +	rhltable_destroy(&nfs4_file_rhltable);
>  #ifdef CONFIG_NFSD_V4_2_INTER_SSC
>  	nfsd4_ssc_shutdown_umount(nn);
>  #endif
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index ae596dbf8667..ad20467c9dee 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -536,16 +536,13 @@ struct nfs4_clnt_odstate {
>   * inode can have multiple filehandles associated with it, so there is
>   * (potentially) a many to one relationship between this struct and struct
>   * inode.
> - *
> - * These are hashed by filehandle in the file_hashtbl, which is protected by
> - * the global state_lock spinlock.
>   */
>  struct nfs4_file {
>  	refcount_t		fi_ref;
>  	struct inode *		fi_inode;
>  	bool			fi_aliased;
>  	spinlock_t		fi_lock;
> -	struct hlist_node       fi_hash;	/* hash on fi_fhandle */
> +	struct rhlist_head	fi_rhash;
>  	struct list_head        fi_stateids;
>  	union {
>  		struct list_head	fi_delegations;
> 
>
Chuck Lever Oct. 19, 2022, 2:25 p.m. UTC | #2
> On Oct 19, 2022, at 7:39 AM, Jeff Layton <jlayton@poochiereds.net> wrote:
> 
> On Tue, 2022-10-18 at 15:58 -0400, Chuck Lever wrote:
>> [ ... ]
>> @@ -5548,7 +5614,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
>> 	 * and check for delegations in the process of being recalled.
>> 	 * If not found, create the nfs4_file struct
>> 	 */
>> -	fp = find_or_add_file(open->op_file, current_fh);
>> +	rcu_read_lock();
>> +	fp = insert_nfs4_file(open->op_file, current_fh);
>> +	rcu_read_unlock();
> 
> It'd probably be better to push this rcu_read_lock down into
> insert_nfs4_file. You don't need to hold it over the actual insertion,
> since that requires the state_lock.

I used this arrangement because:

insert_nfs4_file() invokes only find_nfs4_file() and the
insert_file() helper. Both find_nfs4_file() and the
insert_file() helper invoke rhltable_lookup(), which
must be called with the RCU read lock held.

And this is the reason why put_nfs4_file() no longer takes
the state_lock: it would take the state_lock first and
then the RCU read lock (which is implicitly taken in
rhltable_remove()), which results in a lock inversion
relative to insert_nfs4_file(), which takes the RCU read
lock first, then the state_lock.


I'm certainly not an expert, so I'm willing to listen to
alternative approaches. Can we rely on only the RCU read
lock for exclusion on hash insertion?



--
Chuck Lever
NeilBrown Oct. 24, 2022, 2:07 a.m. UTC | #3
On Thu, 20 Oct 2022, Chuck Lever III wrote:
> 
> > On Oct 19, 2022, at 7:39 AM, Jeff Layton <jlayton@poochiereds.net> wrote:
> > 
> >> -	fp = find_or_add_file(open->op_file, current_fh);
> >> +	rcu_read_lock();
> >> +	fp = insert_nfs4_file(open->op_file, current_fh);
> >> +	rcu_read_unlock();
> > 
> > It'd probably be better to push this rcu_read_lock down into
> > insert_nfs4_file. You don't need to hold it over the actual insertion,
> > since that requires the state_lock.
> 
> I used this arrangement because:
> 
> insert_nfs4_file() invokes only find_nfs4_file() and the
> insert_file() helper. Both find_nfs4_file() and the
> insert_file() helper invoke rhltable_lookup(), which
> must be called with the RCU read lock held.
> 
> And this is the reason why put_nfs4_file() no longer takes
> the state_lock: it would take the state_lock first and
> then the RCU read lock (which is implicitly taken in
> rhltable_remove()), which results in a lock inversion
> relative to insert_nfs4_file(), which takes the RCU read
> lock first, then the state_lock.

It doesn't make any sense to talk about lock inversion with
rcu_read_lock().  It isn't really a lock in any traditional sense in
that it can never block (which is what causes lock-inversion problems).
I prefer to think of rcu_read_lock() as taking a reference on some
global state.

> 
> 
> I'm certainly not an expert, so I'm willing to listen to
> alternative approaches. Can we rely on only the RCU read
> lock for exclusion on hash insertion?

Probably we can.  I'll read through all the patches now and provide some
review.

NeilBrown
NeilBrown Oct. 24, 2022, 3:26 a.m. UTC | #4
On Wed, 19 Oct 2022, Chuck Lever wrote:
> fh_match() is costly, especially when filehandles are large (as is
> the case for NFSv4). It needs to be used sparingly when searching
> data structures. Unfortunately, with common workloads, I see
> multiple thousands of objects stored in file_hashtbl[], which always
> has only 256 buckets, which makes the hash chains quite lengthy.
> 
> Walking long hash chains with the state_lock held blocks other
> activity that needs that lock.

Using a bit-spin-lock for each hash chain would be a simple fix if this
was the only concern.  See for example fscache_hash_cookie().
I'm not at all against using rhltables, but thought I would mention that
there are other options.

> 
> To help mitigate the cost of searching with fh_match(), replace the
> nfs4_file hash table with an rhashtable, which can dynamically
> resize its bucket array to minimize hash chain length. The ideal for
> this use case is one bucket per inode.
> 
> The result is an improvement in the latency of NFSv4 operations
> and the reduction of nfsd CPU utilization due to the cost of
> fh_match() and the CPU cache misses incurred while walking long
> hash chains in the nfs4_file hash table.
> 
> Note that hash removal is no longer protected by the state_lock.
> This is because insert_nfs4_file() takes the RCU read lock then the
> state_lock. rhltable_remove() takes the RCU read lock internally;
> if remove_nfs4_file() took the state_lock before calling
> rhltable_remove(), that would result in a lock inversion.

As mentioned separately, lock inversion is not relevant for
rcu_read_lock().
Also, I cannot see any need for state_lock to be used for protecting
additions to, or removals from, the rhash table.

> 
> [ ... ]
> +static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
> +
> +/*
> + * The returned hash value is based solely on the address of an in-core
> + * inode, a pointer to a slab-allocated object. The entropy in such a
> + * pointer is concentrated in its middle bits.
> + */
> +static u32 nfs4_file_inode_hash(const struct inode *inode, u32 seed)
> +{
> +	u32 k;
> +
> +	k = ((unsigned long)inode) >> L1_CACHE_SHIFT;
> +	return jhash2(&k, 1, seed);

I still don't think this makes any sense at all.

        return jhash2(&inode, sizeof(inode)/4, seed);

uses all of the entropy, which is best for rhashtables.

> +}
>  
> [ ... ]
> +/**
> + * nfs4_file_obj_cmpfn - Match a cache item against search criteria
> + * @arg: search criteria
> + * @ptr: cache item to check
> + *
> + * Return values:
> + *   %0 - Success; item matches search criteria
> + */
> +static int nfs4_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
> +			       const void *ptr)
> +{
> +	const struct svc_fh *fhp = arg->key;
> +	const struct nfs4_file *fi = ptr;
> +
> +	return fh_match(&fi->fi_fhandle, &fhp->fh_handle) ? 0 : 1;
> +}

This doesn't make sense.  Maybe the subtleties of rhl-tables are a bit
obscure.

An rhltable is like an rhashtable except that insertion can never fail. 
Multiple entries with the same key can co-exist in a linked list.
Lookup does not return an entry but returns a linked list of entries.

For the nfs4_file table this isn't really a good match (though I
previously thought it was).  Insertion must be able to fail if an entry
with the same filehandle exists.

One approach that might work would be to take the inode->i_lock after a
lookup fails then repeat the lookup and if it fails again, do the
insert.  This keeps the locking fine-grained but means we don't have to
depend on insert failing.

So the hash and cmp functions would only look at the inode pointer.
If lookup returns something, we would walk the list calling fh_match()
to see if we have the right entry - all under RCU and using
refcount_inc_not_zero() when we find the matching filehandle.
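
For concreteness, a minimal sketch of that scheme (illustrative names;
it assumes the table is keyed only on fi_inode and that find_nfs4_file()
walks the rhlist calling fh_match() as just described):

	static struct nfs4_file *
	find_or_insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
	{
		struct inode *inode = d_inode(fhp->fh_dentry);
		struct nfs4_file *fi;

		rcu_read_lock();
		fi = find_nfs4_file(fhp);		/* lockless fast path */
		if (!fi) {
			spin_lock(&inode->i_lock);
			fi = find_nfs4_file(fhp);	/* repeat under i_lock */
			if (!fi) {
				init_nfs4_file(fhp, new);
				/* no duplicate can be inserted now */
				if (rhltable_insert(&nfs4_file_rhltable,
						    &new->fi_rhash,
						    nfs4_file_rhash_params) == 0)
					fi = new;
			}
			spin_unlock(&inode->i_lock);
		}
		rcu_read_unlock();
		return fi;
	}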

NeilBrown
Jeff Layton Oct. 24, 2022, 10:57 a.m. UTC | #5
On Mon, 2022-10-24 at 13:07 +1100, NeilBrown wrote:
> On Thu, 20 Oct 2022, Chuck Lever III wrote:
> > [ ... ]
> > And this is the reason why put_nfs4_file() no longer takes
> > the state_lock: it would take the state_lock first and
> > then the RCU read lock (which is implicitly taken in
> > rhltable_remove()), which results in a lock inversion
> > relative to insert_nfs4_file(), which takes the RCU read
> > lock first, then the state_lock.
> 
> It doesn't make any sense to talk about lock inversion with
> rcu_read_lock().  It isn't really a lock in any traditional sense in
> that it can never block (which is what causes lock-inversion problems).
> I prefer to think of rcu_read_lock() as taking a reference on some
> global state.
> 

Right. To be clear, you can deadlock with synchronize_rcu if you use it
improperly, but the rcu_read_lock itself never blocks.

> > 
> > 
> > I'm certainly not an expert, so I'm willing to listen to
> > alternative approaches. Can we rely on only the RCU read
> > lock for exclusion on hash insertion?
> 
> Probably we can.  I'll read through all the patches now and provide some
> review.
> 

The rcu_read_lock provides _no_ exclusion whatsoever, so it's not
usually suitable for things that need exclusive access (like a hash
insertion). If each rhl hash chain has its own lock though, then we may
not require other locking to serialize insertions.
Chuck Lever Oct. 24, 2022, 5:06 p.m. UTC | #6
> On Oct 23, 2022, at 10:07 PM, NeilBrown <neilb@suse.de> wrote:
> 
> On Thu, 20 Oct 2022, Chuck Lever III wrote:
>> [ ... ]
>> 
>> And this is the reason why put_nfs4_file() no longer takes
>> the state_lock: it would take the state_lock first and
>> then the RCU read lock (which is implicitly taken in
>> rhltable_remove()), which results in a lock inversion
>> relative to insert_nfs4_file(), which takes the RCU read
>> lock first, then the state_lock.
> 
> It doesn't make any sense to talk about lock inversion with
> rcu_read_lock().  It isn't really a lock in any traditional sense in
> that it can never block (which is what causes lock-inversion problems).
> I prefer to think of rcu_read_lock() as taking a reference on some
> global state.
> 
>> 
>> 
>> I'm certainly not an expert, so I'm willing to listen to
>> alternative approaches. Can we rely on only the RCU read
>> lock for exclusion on hash insertion?
> 
> Probably we can.  I'll read through all the patches now and provide some
> review.

I've been testing a version all weekend that removes the use of
state_lock. I haven't found any issues with it. I'll post what
I have in a moment.


--
Chuck Lever
Chuck Lever Oct. 24, 2022, 5:26 p.m. UTC | #7
> On Oct 23, 2022, at 11:26 PM, NeilBrown <neilb@suse.de> wrote:
> 
> On Wed, 19 Oct 2022, Chuck Lever wrote:
>> fh_match() is costly, especially when filehandles are large (as is
>> the case for NFSv4). It needs to be used sparingly when searching
>> data structures. Unfortunately, with common workloads, I see
>> multiple thousands of objects stored in file_hashtbl[], which always
>> has only 256 buckets, which makes the hash chains quite lengthy.
>> 
>> Walking long hash chains with the state_lock held blocks other
>> activity that needs that lock.
> 
> Using a bit-spin-lock for each hash chain would be a simple fix if this
> was the only concern.  See for example fscache_hash_cookie().
> I'm not at all against using rhltables, but thought I would mention that
> there are other options.

rhashtable uses bit-spin-locks, and as I said in the other thread,
without the state_lock this all seems to work: no crashes, no lockdep
or KASAN splats, and no test workload failures.


>> [ ... ]
>> 
>> Note that hash removal is no longer protected by the state_lock.
>> This is because insert_nfs4_file() takes the RCU read lock then the
>> state_lock. rhltable_remove() takes the RCU read lock internally;
>> if remove_nfs4_file() took the state_lock before calling
>> rhltable_remove(), that would result in a lock inversion.
> 
> As mentioned separately, lock inversion is not relevant for
> rcu_read_lock().

Understood.


> Also, I cannot see any need for state_lock to be used for protecting
> additions to, or removals from, the rhash table.

Agreed.


>> [ ... ]
>> +/*
>> + * The returned hash value is based solely on the address of an in-core
>> + * inode, a pointer to a slab-allocated object. The entropy in such a
>> + * pointer is concentrated in its middle bits.
>> + */
>> +static u32 nfs4_file_inode_hash(const struct inode *inode, u32 seed)
>> +{
>> +	u32 k;
>> +
>> +	k = ((unsigned long)inode) >> L1_CACHE_SHIFT;
>> +	return jhash2(&k, 1, seed);
> 
> I still don't think this makes any sense at all.
> 
>        return jhash2(&inode, sizeof(inode)/4, seed);
> 
> uses all of the entropy, which is best for rhashtables.

I don't really disagree, but the L1_CACHE_SHIFT was added at
Al's behest. OK, I'll replace it.


>> +}
>> 
>> [ ... ]
>> +/**
>> + * nfs4_file_obj_cmpfn - Match a cache item against search criteria
>> + * @arg: search criteria
>> + * @ptr: cache item to check
>> + *
>> + * Return values:
>> + *   %0 - Success; item matches search criteria
>> + */
>> +static int nfs4_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
>> +			       const void *ptr)
>> +{
>> +	const struct svc_fh *fhp = arg->key;
>> +	const struct nfs4_file *fi = ptr;
>> +
>> +	return fh_match(&fi->fi_fhandle, &fhp->fh_handle) ? 0 : 1;
>> +}
> 
> This doesn't make sense.  Maybe the subtleties of rhl-tables are a bit
> obscure.
> 
> An rhltable is like an rhashtable except that insertion can never fail. 
> Multiple entries with the same key can co-exist in a linked list.
> Lookup does not return an entry but returns a linked list of entries.
> 
> For the nfs4_file table this isn't really a good match (though I
> previously thought it was).  Insertion must be able to fail if an entry
> with the same filehandle exists.

I think I've arrived at the same conclusion (about nfs4_file insertion
needing to fail for duplicate filehandles). That's more or less open-coded
in my current revision of this work rather than relying on the behavior
of rhltable_insert().


> One approach that might work would be to take the inode->i_lock after a
> lookup fails then repeat the lookup and if it fails again, do the
> insert.  This keeps the locking fine-grained but means we don't have to
> depend on insert failing.
> 
> So the hash and cmp functions would only look at the inode pointer.
> If lookup returns something, we would walk the list calling fh_match()
> to see if we have the right entry - all under RCU and using
> refcount_inc_not_zero() when we find the matching filehandle.

That's basically what's going on now. But I tried removing obj_cmpfn()
and the rhltable_init() failed outright, so it's still there like a
vestigial organ.

If you now believe rhltable is not a good fit, I can revert all of the
rhltable changes and go back to rhashtable quite easily.

Let me do that and post what I have.


--
Chuck Lever
Chuck Lever Oct. 24, 2022, 5:28 p.m. UTC | #8
> On Oct 24, 2022, at 6:57 AM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> On Mon, 2022-10-24 at 13:07 +1100, NeilBrown wrote:
> [ ... ]
> 
> The rcu_read_lock provides _no_ exclusion whatsoever, so it's not
> usually suitable for things that need exclusive access (like a hash
> insertion). If each rhl hash chain has its own lock though, then we may
> not require other locking to serialize insertions.

rhash does have per-bucket serialization using bit-spin-locks,
as far as I can tell.


--
Chuck Lever
NeilBrown Oct. 24, 2022, 10:02 p.m. UTC | #9
On Tue, 25 Oct 2022, Chuck Lever III wrote:
> 
> > On Oct 23, 2022, at 11:26 PM, NeilBrown <neilb@suse.de> wrote:
> > 
> > On Wed, 19 Oct 2022, Chuck Lever wrote:
> >> [ ... ]
> > 
> > I still don't think this makes any sense at all.
> > 
> >        return jhash2(&inode, sizeof(inode)/4, seed);
> > 
> > uses all of the entropy, which is best for rhashtables.
> 
> I don't really disagree, but the L1_CACHE_SHIFT was added at
> Al's behest. OK, I'll replace it.

I think that was in a different context though.

If you are using a simple fixed array of bucket-chains for a hash
table, then shifting down L1_CACHE_SHIFT and masking off the number of
bits to match the array size is a perfectly sensible hash function.  It
will occasionally produce longer chains, but not often.

But rhashtables does something a bit different.  It mixes a seed into
the key as part of producing the hash, and assumes that if an
unfortunate distribution of keys results in a substantially non-uniform
distribution into buckets, then choosing a new random seed will
re-distribute the keys into buckets and will probably produce a more
uniform distribution.

The purpose of this seed is (as I understand it) primarily focused on
network-facing hash tables where an attacker might be able to choose keys
that all hash to the same value.  The random seed will defeat the
attacker.

When hashing inodes there is no opportunity for a deliberate attack, and
we would probably be happy to not use a seed and accept the small
possibility of occasional long chains.  However the rhashtable code
doesn't offer us that option.  It insists on rehashing if any chain
exceeds 16.  So we need to provide as much entropy as we can, and mix the
seed in as effectively as we can, to ensure that rehashing really works.
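
For illustration, the earlier one-liner in compile-ready form (the cast
is needed because jhash2() takes an array of u32):

	static u32 nfs4_file_inode_hash(const struct inode *inode, u32 seed)
	{
		/* hash every bit of the pointer so that rehashing with
		 * a fresh seed can really redistribute the keys */
		return jhash2((const u32 *)&inode, sizeof(inode) / 4, seed);
	}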

> >> +static int nfs4_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
> >> +			       const void *ptr)
> >> +{
> >> +	const struct svc_fh *fhp = arg->key;
> >> +	const struct nfs4_file *fi = ptr;
> >> +
> >> +	return fh_match(&fi->fi_fhandle, &fhp->fh_handle) ? 0 : 1;
> >> +}
> > 
> > This doesn't make sense.  Maybe the subtleties of rhl-tables are a bit
> > obscure.
> > 
> > An rhltable is like an rhashtable except that insertion can never fail. 
> > Multiple entries with the same key can co-exist in a linked list.
> > Lookup does not return an entry but returns a linked list of entries.
> > 
> > For the nfs4_file table this isn't really a good match (though I
> > previously thought it was).  Insertion must be able to fail if an entry
> > with the same filehandle exists.
> 
> I think I've arrived at the same conclusion (about nfs4_file insertion
> needing to fail for duplicate filehandles). That's more or less open-coded
> in my current revision of this work rather than relying on the behavior
> of rhltable_insert().
> 
> 
> > One approach that might work would be to take the inode->i_lock after a
> > lookup fails then repeat the lookup and if it fails again, do the
> > insert.  This keeps the locking fine-grained but means we don't have to
> > depend on insert failing.
> > 
> > So the hash and cmp functions would only look at the inode pointer.
> > If lookup returns something, we would walk the list calling fh_match()
> > to see if we have the right entry - all under RCU and using
> > refcount_inc_not_zero() when we find the matching filehandle.
> 
> That's basically what's going on now. But I tried removing obj_cmpfn()
> and the rhltable_init() failed outright, so it's still there like a
> vestigial organ.

You need an obj_cmpfn if you have an obj_hashfn.  If you don't have
obj_hashfn and just provide hashfn and key_len, then you don't need the
cmpfn.

If you have a cmpfn, just compare the inode (the same thing that you
hash) - don't compare the file handle.

> 
> If you now believe rhltable is not a good fit, I can revert all of the
> rhltable changes and go back to rhashtable quite easily.

I wasn't very clear, as I was still working out what I thought.

I think an rhashtable cannot work as you need two different keys, the
inode and the filehandle.

I think rhltable can work but it needs help, particularly as it will
never fail an insertion.
The help we can provide is to take a lock, perform a lookup, then only
try to insert if the lookup failed (and we are still holding an
appropriate lock to stop another thread inserting).  Thus any time we
try an insert, we don't want it to fail.

The lock I suggest is ->i_lock in the inode.

The rhltable should only consider the inode as a key, and should provide
a linked list of nfs4_files with the same inode.
We then code a linear search of this linked list (which is expected to
have one entry) to find if there is an entry with the required file
handle.
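
Putting those pieces together, the open path would look roughly like
this sketch (reusing find_nfs4_file() from your current revision as
the fh_match() walk; error handling elided):

static struct nfs4_file *insert_nfs4_file(struct nfs4_file *new,
					  const struct svc_fh *fhp)
{
	struct inode *inode = d_inode(fhp->fh_dentry);
	struct nfs4_file *fi;

	rcu_read_lock();
	fi = find_nfs4_file(fhp);
	rcu_read_unlock();
	if (fi)
		return fi;

	spin_lock(&inode->i_lock);
	rcu_read_lock();
	fi = find_nfs4_file(fhp);	/* repeat the lookup under ->i_lock */
	rcu_read_unlock();
	if (!fi &&
	    rhltable_insert(&nfs4_file_rhltable, &new->fi_rhash,
			    nfs4_file_rhash_params) == 0)
		fi = new;
	spin_unlock(&inode->i_lock);
	return fi;
}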


An alternative is to not use a resizable table at all.  We could stick
with the table we currently have and make small changes.

1/ change the compare function to test the inode pointer first and only
   call fh_match when the inodes match.  This should make it much
   faster.
2/ Instead of using state_lock for locking, use a bit-spin-lock on the
   lsb of the bucket pointers in the array to lock each bucket.  This
   will substantially reduce locking conflicts.
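
The kernel's list_bl primitives provide exactly the locking in 2/; a
minimal sketch, assuming a hypothetical hlist_bl_node fi_bl_hash
replaces fi_hash:

#include <linux/list_bl.h>

static struct hlist_bl_head file_hashtbl[FILE_HASH_SIZE];

static void hash_nfs4_file(struct nfs4_file *fp, unsigned int hashval)
{
	struct hlist_bl_head *head = &file_hashtbl[hashval];

	/* bit-spin-lock on bit 0 of the bucket head pointer */
	hlist_bl_lock(head);
	hlist_bl_add_head_rcu(&fp->fi_bl_hash, head);
	hlist_bl_unlock(head);
}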

I don't see a big performance difference between this approach and the
rhltable approach.  It really depends which you would feel more
comfortable working with.  Would you rather work with common code which
isn't quite a perfect fit, or with private code that you have complete
control over?

Thanks,
NeilBrown
Chuck Lever Oct. 24, 2022, 10:30 p.m. UTC | #10
> On Oct 24, 2022, at 6:02 PM, NeilBrown <neilb@suse.de> wrote:
> 
> On Tue, 25 Oct 2022, Chuck Lever III wrote:
>> 
>>> On Oct 23, 2022, at 11:26 PM, NeilBrown <neilb@suse.de> wrote:
>>> 
>>> On Wed, 19 Oct 2022, Chuck Lever wrote:
>>>> +/*
>>>> + * The returned hash value is based solely on the address of an in-core
>>>> + * inode, a pointer to a slab-allocated object. The entropy in such a
>>>> + * pointer is concentrated in its middle bits.
>>>> + */
>>>> +static u32 nfs4_file_inode_hash(const struct inode *inode, u32 seed)
>>>> +{
>>>> +	u32 k;
>>>> +
>>>> +	k = ((unsigned long)inode) >> L1_CACHE_SHIFT;
>>>> +	return jhash2(&k, 1, seed);
>>> 
>>> I still don't think this makes any sense at all.
>>> 
>>>       return jhash2(&inode, sizeof(inode)/4, seed);
>>> 
>>> uses all of the entropy, which is best for rhashtables.
>> 
>> I don't really disagree, but the L1_CACHE_SHIFT was added at
>> Al's behest. OK, I'll replace it.
> 
> I think that was in a different context though.
> 
> If you are using a simple fixed array of bucket-chains for a hash
> table, then shifting down L1_CACHE_SHIFT and masking off the number of
> bits to match the array size is a perfectly sensible hash function.  It
> will occasionally produce longer chains, but not often.
> 
> But rhashtables does something a bit different.  It mixes a seed into
> the key as part of producing the hash, and assumes that if an
> unfortunate distribution of keys results in a substantially non-uniform
> distribution into buckets, then choosing a new random seed will
> re-distribute the keys into buckets and will probably produce a more
> uniform distribution.
> 
> The purpose of this seed is (as I understand it) primarily focused on
> network-facing hash tables where an attacker might be able to choose keys
> that all hash to the same value.  The random seed will defeat the
> attacker.
> 
> When hashing inodes there is no opportunity for a deliberate attack, and
> we would probably be happy to not use a seed and accept the small
> possibility of occasional long chains.  However the rhashtable code
> doesn't offer us that option.  It insists on rehashing if any chain
> exceeds 16.  So we need to provide as much entropy as we can, and mix the
> seed in as effectively as we can, to ensure that rehashing really works.
> 
>>>> +static int nfs4_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
>>>> +			       const void *ptr)
>>>> +{
>>>> +	const struct svc_fh *fhp = arg->key;
>>>> +	const struct nfs4_file *fi = ptr;
>>>> +
>>>> +	return fh_match(&fi->fi_fhandle, &fhp->fh_handle) ? 0 : 1;
>>>> +}
>>> 
>>> This doesn't make sense.  Maybe the subtleties of rhl-tables are a bit
>>> obscure.
>>> 
>>> An rhltable is like an rhashtable except that insertion can never fail. 
>>> Multiple entries with the same key can co-exist in a linked list.
>>> Lookup does not return an entry but returns a linked list of entries.
>>> 
>>> For the nfs4_file table this isn't really a good match (though I
>>> previously thought it was).  Insertion must be able to fail if an entry
>>> with the same filehandle exists.
>> 
>> I think I've arrived at the same conclusion (about nfs4_file insertion
>> needing to fail for duplicate filehandles). That's more or less open-coded
>> in my current revision of this work rather than relying on the behavior
>> of rhltable_insert().
>> 
>> 
>>> One approach that might work would be to take the inode->i_lock after a
>>> lookup fails then repeat the lookup and if it fails again, do the
>>> insert.  This keeps the locking fine-grained but means we don't have to
>>> depend on insert failing.
>>> 
>>> So the hash and cmp functions would only look at the inode pointer.
>>> If lookup returns something, we would walk the list calling fh_match()
>>> to see if we have the right entry - all under RCU and using
>>> refcount_inc_not_zero() when we find the matching filehandle.
>> 
>> That's basically what's going on now. But I tried removing obj_cmpfn()
>> and the rhltable_init() failed outright, so it's still there like a
>> vestigial organ.
> 
> You need an obj_cmpfn if you have an obj_hashfn.  If you don't have
> obj_hashfn and just provide hashfn and key_len, then you don't need the
> cmpfn.
> 
> If you have a cmpfn, just compare the inode (the same thing that you
> hash) - don't compare the file handle.
> 
>> 
>> If you now believe rhltable is not a good fit, I can revert all of the
>> rhltable changes and go back to rhashtable quite easily.
> 
> I wasn't very clear, as I was still working out what I thought.
> 
> I think an rhashtable cannot work as you need two different keys, the
> inode and the filehandle.
> 
> I think rhltable can work but it needs help, particularly as it will
> never fail an insertion.
> The help we can provide is to take a lock, perform a lookup, then only
> try to insert if the lookup failed (and we are still holding an
> appropriate lock to stop another thread inserting).  Thus any time we
> try an insert, we don't want it to fail.
> 
> The lock I suggest is ->i_lock in the inode.

I will give that a try next.


> The rhltable should only consider the inode as a key, and should provide
> a linked list of nfs4_files with the same inode.

The implementation I've arrived at is rhltable keyed on inode only.
The bucket chains are searched with fh_match().

I've gotten rid of all of the special obj_hash/cmp functions as a
result of this simplification. If I've set up the rhashtable
parameters correctly, the rhashtable code should use jhash/jhash2
on the whole inode pointer by default.


> We then code a linear search of this linked list (which is expected to
> have one entry) to find if there is an entry with the required file
> handle.

I'm not sure about that expectation: multiple inode pointers could
hash to the same bucket. Also, there are cases where multiple file
handles refer to the same inode (the aliasing that a0ce48375a36
("nfsd: track filehandle aliasing in nfs4_files") tries to deal
with).

I will post what I have right now. It's not passing all tests due
to very recent changes (and probably because of lack of insert
serialization). We can pick up from there; I know I've likely missed
some review comments still.


> An alternative is to not use a resizable table at all.  We could stick
> with the table we currently have and make small changes.
> 
> 1/ change the compare function to test the inode pointer first and only
>   call fh_match when the inodes match.  This should make it much
>   faster.

I had the same thought, but this doesn't reduce the number of CPU
cache misses I observed from long bucket chain walks. I'd prefer a
solution that maintains small buckets.


> 2/ Instead of using state_lock for locking, use a bit-spin-lock on the
>   lsb of the bucket pointers in the array to lock each bucket.  This
>   will substantially reduce locking conflicts.
> 
> I don't see a big performance difference between this approach and the
> rhltable approach.  It really depends which you would feel more
> comfortable working with.  Would you rather work with common code which
> isn't quite a perfect fit, or with private code that you have complete
> control over?

I think rhltable still has some legs, will keep chipping away at that.

--
Chuck Lever
Jeff Layton Oct. 25, 2022, 5:18 p.m. UTC | #11
On Mon, 2022-10-24 at 22:30 +0000, Chuck Lever III wrote:
> 
> > On Oct 24, 2022, at 6:02 PM, NeilBrown <neilb@suse.de> wrote:
> > 
> > On Tue, 25 Oct 2022, Chuck Lever III wrote:
> > > 
> > > > On Oct 23, 2022, at 11:26 PM, NeilBrown <neilb@suse.de> wrote:
> > > > 
> > > > On Wed, 19 Oct 2022, Chuck Lever wrote:
> > > > > +/*
> > > > > + * The returned hash value is based solely on the address of an in-core
> > > > > + * inode, a pointer to a slab-allocated object. The entropy in such a
> > > > > + * pointer is concentrated in its middle bits.
> > > > > + */
> > > > > +static u32 nfs4_file_inode_hash(const struct inode *inode, u32 seed)
> > > > > +{
> > > > > +	u32 k;
> > > > > +
> > > > > +	k = ((unsigned long)inode) >> L1_CACHE_SHIFT;
> > > > > +	return jhash2(&k, 1, seed);
> > > > 
> > > > I still don't think this makes any sense at all.
> > > > 
> > > >       return jhash2(&inode, sizeof(inode)/4, seed);
> > > > 
> > > > uses all of the entropy, which is best for rhashtables.
> > > 
> > > I don't really disagree, but the L1_CACHE_SHIFT was added at
> > > Al's behest. OK, I'll replace it.
> > 
> > I think that was in a different context though.
> > 
> > If you are using a simple fixed array of bucket-chains for a hash
> > table, then shifting down L1_CACHE_SHIFT and masking off the number of
> > bits to match the array size is a perfectly sensible hash function.  It
> > will occasionally produce longer chains, but not often.
> > 
> > But rhashtables does something a bit different.  It mixes a seed into
> > the key as part of producing the hash, and assumes that if an
> > unfortunate distribution of keys results in a substantially non-uniform
> > distribution into buckets, then choosing a new random seed will
> > re-distribute the keys into buckets and will probably produce a more
> > uniform distribution.
> > 
> > The purpose of this seed is (as I understand it) primarily focused on
> > network-facing hash tables where an attacker might be able to choose keys
> > that all hash to the same value.  The random seed will defeat the
> > attacker.
> > 
> > When hashing inodes there is no opportunity for a deliberate attack, and
> > we would probably be happy to not use a seed and accept the small
> > possibility of occasional long chains.  However the rhashtable code
> > doesn't offer us that option.  It insists on rehashing if any chain
> > exceeds 16.  So we need to provide as much entropy as we can, and mix the
> > seed in as effectively as we can, to ensure that rehashing really works.
> > 
> > > > > +static int nfs4_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
> > > > > +			       const void *ptr)
> > > > > +{
> > > > > +	const struct svc_fh *fhp = arg->key;
> > > > > +	const struct nfs4_file *fi = ptr;
> > > > > +
> > > > > +	return fh_match(&fi->fi_fhandle, &fhp->fh_handle) ? 0 : 1;
> > > > > +}
> > > > 
> > > > This doesn't make sense.  Maybe the subtleties of rhl-tables are a bit
> > > > obscure.
> > > > 
> > > > An rhltable is like an rhashtable except that insertion can never fail. 
> > > > Multiple entries with the same key can co-exist in a linked list.
> > > > Lookup does not return an entry but returns a linked list of entries.
> > > > 
> > > > For the nfs4_file table this isn't really a good match (though I
> > > > previously thought it was).  Insertion must be able to fail if an entry
> > > > with the same filehandle exists.
> > > 
> > > I think I've arrived at the same conclusion (about nfs4_file insertion
> > > needing to fail for duplicate filehandles). That's more or less open-coded
> > > in my current revision of this work rather than relying on the behavior
> > > of rhltable_insert().
> > > 
> > > 
> > > > One approach that might work would be to take the inode->i_lock after a
> > > > lookup fails then repeat the lookup and if it fails again, do the
> > > > insert.  This keeps the locking fine-grained but means we don't have to
> > > > depend on insert failing.
> > > > 
> > > > So the hash and cmp functions would only look at the inode pointer.
> > > > If lookup returns something, we would walk the list calling fh_match()
> > > > to see if we have the right entry - all under RCU and using
> > > > refcount_inc_not_zero() when we find the matching filehandle.
> > > 
> > > That's basically what's going on now. But I tried removing obj_cmpfn()
> > > and the rhltable_init() failed outright, so it's still there like a
> > > vestigial organ.
> > 
> > You need an obj_cmpfn if you have an obj_hashfn.  If you don't have
> > obj_hashfn and just provide hashfn and key_len, then you don't need the
> > cmpfn.
> > 
> > If you have a cmpfn, just compare the inode (the same thing that you
> > hash) - don't compare the file handle.
> > 
> > > 
> > > If you now believe rhltable is not a good fit, I can revert all of the
> > > rhltable changes and go back to rhashtable quite easily.
> > 
> > I wasn't very clear, as I was still working out what I thought.
> > 
> > I think an rhashtable cannot work as you need two different keys, the
> > inode and the filehandle.
> > 
> > I think rhltable can work but it needs help, particularly as it will
> > never fail an insertion.
> > The help we can provide is to take a lock, perform a lookup, then only
> > try to insert if the lookup failed (and we are still holding an
> > appropriate lock to stop another thread inserting).  Thus any time we
> > try an insert, we don't want it to fail.
> > 
> > The lock I suggest is ->i_lock in the inode.
> 
> I will give that a try next.
> 
> 
> > The rhltable should only consider the inode as a key, and should provide
> > a linked list of nfs4_files with the same inode.
> 
> The implementation I've arrived at is rhltable keyed on inode only.
> The bucket chains are searched with fh_match().
> 
> I've gotten rid of all of the special obj_hash/cmp functions as a
> result of this simplification. If I've set up the rhashtable
> parameters correctly, the rhashtable code should use jhash/jhash2
> on the whole inode pointer by default.
> 
> 
> > We then code a linear search of this linked list (which is expected to
> > have one entry) to find if there is an entry with the required file
> > handle.
> 
> I'm not sure about that expectation: multiple inode pointers could
> hash to the same bucket. Also, there are cases where multiple file
> handles refer to the same inode (the aliasing that a0ce48375a36
> ("nfsd: track filehandle aliasing in nfs4_files") tries to deal
> with).
> 
> I will post what I have right now. It's not passing all tests due
> to very recent changes (and probably because of lack of insert
> serialization). We can pick up from there; I know I've likely missed
> some review comments still.
> 
> 

Thanks. I've started looking over this and some other changes, and found
at least a couple of problems with the existing code (not necessarily
due to your patches, fwiw):

1/ the nf_lru is the linkage to the LRU, but it's also used when we move
an object to a dispose list. I don't see anything that prevents you from
trying to add an entry back onto the LRU while it's still on a dispose
list (which could cause corruption). I think we need some clear rules
around how that field gets used.

2/ nfsd_file_close_inode_sync can end up putting an nf onto the dispose
list without ensuring that nf_ref has gone to zero. I think there are
some situations where the file can end up being freed out from under a
valid reference due to that.
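
One way to close that hole is to queue the nf for disposal only when
we hold the last reference. A rough sketch, ignoring the list_lru
bookkeeping (dispose is the caller's private list_head):

	if (refcount_dec_and_test(&nf->nf_ref)) {
		/* no other references remain; safe to dispose */
		list_del_init(&nf->nf_lru);
		list_add(&nf->nf_lru, dispose);
	}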

I'm working on some patches to clean this up along the lines of the
scheme Neil is advocating for. For now, I'm basing this on top of your
series.

> > An alternative is to not use a resizable table at all.  We could stick
> > with the table we currently have and make small changes.
> > 
> > 1/ change the compare function to test the inode pointer first and only
> >   call fh_match when the inodes match.  This should make it much
> >   faster.
> 
> I had the same thought, but this doesn't reduce the number of CPU
> cache misses I observed from long bucket chain walks. I'd prefer a
> solution that maintains small buckets.
> 
> 
> > 2/ Instead of using state_lock for locking, use a bit-spin-lock on the
> >   lsb of the bucket pointers in the array to lock each bucket.  This
> >   will substantially reduce locking conflicts.
> > 
> > I don't see a big performance difference between this approach and the
> > rhltable approach.  It really depends which you would feel more
> > comfortable working with.  Would you rather work with common code which
> > isn't quite a perfect fit, or with private code that you have complete
> > control over?
> 
> I think rhltable still has some legs, will keep chipping away at that.
> 
> --
> Chuck Lever
> 
> 
>
Chuck Lever Oct. 25, 2022, 5:23 p.m. UTC | #12
> On Oct 25, 2022, at 1:18 PM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> On Mon, 2022-10-24 at 22:30 +0000, Chuck Lever III wrote:
>> 
>>> On Oct 24, 2022, at 6:02 PM, NeilBrown <neilb@suse.de> wrote:
>>> 
>>> On Tue, 25 Oct 2022, Chuck Lever III wrote:
>>>> 
>>>>> On Oct 23, 2022, at 11:26 PM, NeilBrown <neilb@suse.de> wrote:
>>>>> 
>>>>> On Wed, 19 Oct 2022, Chuck Lever wrote:
>>>>>> +/*
>>>>>> + * The returned hash value is based solely on the address of an in-core
>>>>>> + * inode, a pointer to a slab-allocated object. The entropy in such a
>>>>>> + * pointer is concentrated in its middle bits.
>>>>>> + */
>>>>>> +static u32 nfs4_file_inode_hash(const struct inode *inode, u32 seed)
>>>>>> +{
>>>>>> +	u32 k;
>>>>>> +
>>>>>> +	k = ((unsigned long)inode) >> L1_CACHE_SHIFT;
>>>>>> +	return jhash2(&k, 1, seed);
>>>>> 
>>>>> I still don't think this makes any sense at all.
>>>>> 
>>>>>      return jhash2(&inode, sizeof(inode)/4, seed);
>>>>> 
>>>>> uses all of the entropy, which is best for rhashtables.
>>>> 
>>>> I don't really disagree, but the L1_CACHE_SHIFT was added at
>>>> Al's behest. OK, I'll replace it.
>>> 
>>> I think that was in a different context though.
>>> 
>>> If you are using a simple fixed array of bucket-chains for a hash
>>> table, then shifting down L1_CACHE_SHIFT and masking off the number of
>>> bits to match the array size is a perfectly sensible hash function.  It
>>> will occasionally produce longer chains, but not often.
>>> 
>>> But rhashtables does something a bit different.  It mixes a seed into
>>> the key as part of producing the hash, and assumes that if an
>>> unfortunate distribution of keys results in a substantially non-uniform
>>> distribution into buckets, then choosing a new random seed will
>>> re-distribute the keys into buckets and will probably produce a more
>>> uniform distribution.
>>> 
>>> The purpose of this seed is (as I understand it) primarily focused on
>>> network-facing hash tables where an attacker might be able to choose keys
>>> that all hash to the same value.  The random seed will defeat the
>>> attacker.
>>> 
>>> When hashing inodes there is no opportunity for a deliberate attack, and
>>> we would probably be happy to not use a seed and accept the small
>>> possibility of occasional long chains.  However the rhashtable code
>>> doesn't offer us that option.  It insists on rehashing if any chain
>>> exceeds 16.  So we need to provide as much entropy as we can, and mix the
>>> seed in as effectively as we can, to ensure that rehashing really works.
>>> 
>>>>>> +static int nfs4_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
>>>>>> +			       const void *ptr)
>>>>>> +{
>>>>>> +	const struct svc_fh *fhp = arg->key;
>>>>>> +	const struct nfs4_file *fi = ptr;
>>>>>> +
>>>>>> +	return fh_match(&fi->fi_fhandle, &fhp->fh_handle) ? 0 : 1;
>>>>>> +}
>>>>> 
>>>>> This doesn't make sense.  Maybe the subtleties of rhl-tables are a bit
>>>>> obscure.
>>>>> 
>>>>> An rhltable is like an rhashtable except that insertion can never fail. 
>>>>> Multiple entries with the same key can co-exist in a linked list.
>>>>> Lookup does not return an entry but returns a linked list of entries.
>>>>> 
>>>>> For the nfs4_file table this isn't really a good match (though I
>>>>> previously thought it was).  Insertion must be able to fail if an entry
>>>>> with the same filehandle exists.
>>>> 
>>>> I think I've arrived at the same conclusion (about nfs4_file insertion
>>>> needing to fail for duplicate filehandles). That's more or less open-coded
>>>> in my current revision of this work rather than relying on the behavior
>>>> of rhltable_insert().
>>>> 
>>>> 
>>>>> One approach that might work would be to take the inode->i_lock after a
>>>>> lookup fails then repeat the lookup and if it fails again, do the
>>>>> insert.  This keeps the locking fine-grained but means we don't have to
>>>>> depend on insert failing.
>>>>> 
>>>>> So the hash and cmp functions would only look at the inode pointer.
>>>>> If lookup returns something, we would walk the list calling fh_match()
>>>>> to see if we have the right entry - all under RCU and using
>>>>> refcount_inc_not_zero() when we find the matching filehandle.
>>>> 
>>>> That's basically what's going on now. But I tried removing obj_cmpfn()
>>>> and the rhltable_init() failed outright, so it's still there like a
>>>> vestigial organ.
>>> 
>>> You need an obj_cmpfn if you have an obj_hashfn.  If you don't have
>>> obj_hashfn and just provide hashfn and key_len, then you don't need the
>>> cmpfn.
>>> 
>>> If you have a cmpfn, just compare the inode (the same thing that you
>>> hash) - don't compare the file handle.
>>> 
>>>> 
>>>> If you now believe rhltable is not a good fit, I can revert all of the
>>>> rhltable changes and go back to rhashtable quite easily.
>>> 
>>> I wasn't very clear, as I was still working out what I thought.
>>> 
>>> I think an rhashtable cannot work as you need two different keys, the
>>> inode and the filehandle.
>>> 
>>> I think rhltable can work but it needs help, particularly as it will
>>> never fail an insertion.
>>> The help we can provide is to take a lock, perform a lookup, then only
>>> try to insert if the lookup failed (and we are still holding an
>>> appropriate lock to stop another thread inserting).  Thus any time we
>>> try an insert, we don't want it to fail.
>>> 
>>> The lock I suggest is ->i_lock in the inode.
>> 
>> I will give that a try next.
>> 
>> 
>>> The rhltable should only consider the inode as a key, and should provide
>>> a linked list of nfs4_files with the same inode.
>> 
>> The implementation I've arrived at is rhltable keyed on inode only.
>> The bucket chains are searched with fh_match().
>> 
>> I've gotten rid of all of the special obj_hash/cmp functions as a
>> result of this simplification. If I've set up the rhashtable
>> parameters correctly, the rhashtable code should use jhash/jhash2
>> on the whole inode pointer by default.
>> 
>> 
>>> We then code a linear search of this linked list (which is expected to
>>> have one entry) to find if there is an entry with the required file
>>> handle.
>> 
>> I'm not sure about that expectation: multiple inode pointers could
>> hash to the same bucket. Also, there are cases where multiple file
>> handles refer to the same inode (the aliasing that a0ce48375a36
>> ("nfsd: track filehandle aliasing in nfs4_files") tries to deal
>> with).
>> 
>> I will post what I have right now. It's not passing all tests due
>> to very recent changes (and probably because of lack of insert
>> serialization). We can pick up from there; I know I've likely missed
>> some review comments still.
>> 
>> 
> 
> Thanks. I've started looking over this and some other changes, and found
> at least a couple of problems with the existing code (not necessarily
> due to your patches, fwiw):
> 
> 1/ the nf_lru is the linkage to the LRU, but it's also used when we move
> an object to a dispose list. I don't see anything that prevents you from
> trying to add an entry back onto the LRU while it's still on a dispose
> list (which could cause corruption). I think we need some clear rules
> around how that field gets used.

I chased this when converting filecache to rhashtable. There don't seem
to be any logic errors in the current code, and it seems to be a common
trope in other consumers of list_lru, so I left it alone.

Yeah, it makes me nervous too.


> 2/ nfsd_file_close_inode_sync can end up putting an nf onto the dispose
> list without ensuring that nf_ref has gone to zero. I think there are
> some situations where the file can end up being freed out from under a
> valid reference due to that.
> 
> I'm working on some patches to clean this up along the lines of the
> scheme Neil is advocating for. For now, I'm basing this on top of your
> series.

I'll try to refresh kernel.org:cel for-next later today.


>>> An alternative is to not use a resizable table at all.  We could stick
>>> with the table we currently have and make small changes.
>>> 
>>> 1/ change the compare function to test the inode pointer first and only
>>>  call fh_match when the inodes match.  This should make it much
>>>  faster.
>> 
>> I had the same thought, but this doesn't reduce the number of CPU
>> cache misses I observed from long bucket chain walks. I'd prefer a
>> solution that maintains small buckets.
>> 
>> 
>>> 2/ Instead of using state_lock for locking, use a bit-spin-lock on the
>>>  lsb of the bucket pointers in the array to lock each bucket.  This
>>>  will substantially reduce locking conflicts.
>>> 
>>> I don't see a big performance difference between this approach and the
>>> rhltable approach.  It really depends which you would feel more
>>> comfortable working with.  Would you rather work with common code which
>>> isn't quite a perfect fit, or with private code that you have complete
>>> control over?
>> 
>> I think rhltable still has some legs, will keep chipping away at that.
>> 
>> --
>> Chuck Lever
>> 
>> 
>> 
> 
> -- 
> Jeff Layton <jlayton@kernel.org>

--
Chuck Lever
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2b850de288cf..a63334ad61f6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -44,7 +44,9 @@ 
 #include <linux/jhash.h>
 #include <linux/string_helpers.h>
 #include <linux/fsnotify.h>
+#include <linux/rhashtable.h>
 #include <linux/nfs_ssc.h>
+
 #include "xdr4.h"
 #include "xdr4cb.h"
 #include "vfs.h"
@@ -84,6 +86,7 @@  static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
 static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
 void nfsd4_end_grace(struct nfsd_net *nn);
 static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps);
+static void remove_nfs4_file(struct nfs4_file *fi);
 
 /* Locking: */
 
@@ -577,11 +580,8 @@  static void nfsd4_free_file_rcu(struct rcu_head *rcu)
 void
 put_nfs4_file(struct nfs4_file *fi)
 {
-	might_lock(&state_lock);
-
-	if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
-		hlist_del_rcu(&fi->fi_hash);
-		spin_unlock(&state_lock);
+	if (refcount_dec_and_test(&fi->fi_ref)) {
+		remove_nfs4_file(fi);
 		WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
 		WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
 		call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
@@ -695,19 +695,82 @@  static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
 	return ret & OWNER_HASH_MASK;
 }
 
-/* hash table for nfs4_file */
-#define FILE_HASH_BITS                   8
-#define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
+static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
+
+/*
+ * The returned hash value is based solely on the address of an in-core
+ * inode, a pointer to a slab-allocated object. The entropy in such a
+ * pointer is concentrated in its middle bits.
+ */
+static u32 nfs4_file_inode_hash(const struct inode *inode, u32 seed)
+{
+	u32 k;
+
+	k = ((unsigned long)inode) >> L1_CACHE_SHIFT;
+	return jhash2(&k, 1, seed);
+}
 
-static unsigned int file_hashval(struct svc_fh *fh)
+/**
+ * nfs4_file_key_hashfn - Compute the hash value of a lookup key
+ * @data: key on which to compute the hash value
+ * @len: rhash table's key_len parameter (unused)
+ * @seed: rhash table's random seed of the day
+ *
+ * Return value:
+ *   Computed 32-bit hash value
+ */
+static u32 nfs4_file_key_hashfn(const void *data, u32 len, u32 seed)
 {
-	struct inode *inode = d_inode(fh->fh_dentry);
+	const struct svc_fh *fhp = data;
 
-	/* XXX: why not (here & in file cache) use inode? */
-	return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
+	return nfs4_file_inode_hash(d_inode(fhp->fh_dentry), seed);
 }
 
-static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
+/**
+ * nfs4_file_obj_hashfn - Compute the hash value of an nfs4_file object
+ * @data: object on which to compute the hash value
+ * @len: rhash table's key_len parameter (unused)
+ * @seed: rhash table's random seed of the day
+ *
+ * Return value:
+ *   Computed 32-bit hash value
+ */
+static u32 nfs4_file_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct nfs4_file *fi = data;
+
+	return nfs4_file_inode_hash(fi->fi_inode, seed);
+}
+
+/**
+ * nfs4_file_obj_cmpfn - Match a cache item against search criteria
+ * @arg: search criteria
+ * @ptr: cache item to check
+ *
+ * Return values:
+ *   %0 - Success; item matches search criteria
+ */
+static int nfs4_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
+			       const void *ptr)
+{
+	const struct svc_fh *fhp = arg->key;
+	const struct nfs4_file *fi = ptr;
+
+	return fh_match(&fi->fi_fhandle, &fhp->fh_handle) ? 0 : 1;
+}
+
+static const struct rhashtable_params nfs4_file_rhash_params = {
+	.key_len		= sizeof_field(struct nfs4_file, fi_inode),
+	.key_offset		= offsetof(struct nfs4_file, fi_inode),
+	.head_offset		= offsetof(struct nfs4_file, fi_rhash),
+	.hashfn			= nfs4_file_key_hashfn,
+	.obj_hashfn		= nfs4_file_obj_hashfn,
+	.obj_cmpfn		= nfs4_file_obj_cmpfn,
+
+	/* Reduce resizing churn on light workloads */
+	.min_size		= 512,		/* buckets */
+	.automatic_shrinking	= true,
+};
 
 /*
  * Check if courtesy clients have conflicting access and resolve it if possible
@@ -4251,11 +4314,8 @@  static struct nfs4_file *nfsd4_alloc_file(void)
 }
 
 /* OPEN Share state helper functions */
-static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
-				struct nfs4_file *fp)
+static void init_nfs4_file(const struct svc_fh *fh, struct nfs4_file *fp)
 {
-	lockdep_assert_held(&state_lock);
-
 	refcount_set(&fp->fi_ref, 1);
 	spin_lock_init(&fp->fi_lock);
 	INIT_LIST_HEAD(&fp->fi_stateids);
@@ -4273,7 +4333,6 @@  static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
 	INIT_LIST_HEAD(&fp->fi_lo_states);
 	atomic_set(&fp->fi_lo_recalls, 0);
 #endif
-	hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
 }
 
 void
@@ -4626,71 +4685,75 @@  move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
 		nfs4_put_stid(&last->st_stid);
 }
 
-/* search file_hashtbl[] for file */
-static struct nfs4_file *
-find_file_locked(struct svc_fh *fh, unsigned int hashval)
+static struct nfs4_file *find_nfs4_file(const struct svc_fh *fhp)
 {
-	struct nfs4_file *fp;
+	struct rhlist_head *tmp, *list;
+	struct nfs4_file *fi = NULL;
 
-	hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
-				lockdep_is_held(&state_lock)) {
-		if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
-			if (refcount_inc_not_zero(&fp->fi_ref))
-				return fp;
+	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
+			       nfs4_file_rhash_params);
+	rhl_for_each_entry_rcu(fi, tmp, list, fi_rhash)
+		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
+			if (!refcount_inc_not_zero(&fi->fi_ref))
+				fi = NULL;
+			break;
 		}
-	}
-	return NULL;
+	return fi;
 }
 
-static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh,
-				     unsigned int hashval)
+/*
+ * On hash insertion, identify entries with the same inode but
+ * distinct filehandles. They will all be in the same hash bucket
+ * because we hash by the address of the nfs4_file's struct inode.
+ */
+static struct nfs4_file *insert_file(struct nfs4_file *new,
+				     const struct svc_fh *fhp)
 {
-	struct nfs4_file *fp;
-	struct nfs4_file *ret = NULL;
-	bool alias_found = false;
+	struct rhlist_head *tmp, *list;
+	struct nfs4_file *fi;
+	int err;
 
 	spin_lock(&state_lock);
-	hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
-				 lockdep_is_held(&state_lock)) {
-		if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
-			if (refcount_inc_not_zero(&fp->fi_ref))
-				ret = fp;
-		} else if (d_inode(fh->fh_dentry) == fp->fi_inode)
-			fp->fi_aliased = alias_found = true;
-	}
-	if (likely(ret == NULL)) {
-		nfsd4_init_file(fh, hashval, new);
-		new->fi_aliased = alias_found;
-		ret = new;
+
+	err = rhltable_insert_key(&nfs4_file_rhltable, fhp,
+				  &new->fi_rhash,
+				  nfs4_file_rhash_params);
+	if (err) {
+		spin_unlock(&state_lock);
+		return NULL;
 	}
+
+	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
+			       nfs4_file_rhash_params);
+	rhl_for_each_entry_rcu(fi, tmp, list, fi_rhash)
+		if (fi->fi_inode == d_inode(fhp->fh_dentry) &&
+		    !fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
+			fi->fi_aliased = true;
+			new->fi_aliased = true;
+		}
+
 	spin_unlock(&state_lock);
-	return ret;
+	return new;
 }
 
-static struct nfs4_file * find_file(struct svc_fh *fh)
+static noinline struct nfs4_file *
+insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
 {
-	struct nfs4_file *fp;
-	unsigned int hashval = file_hashval(fh);
+	struct nfs4_file *fi;
 
-	rcu_read_lock();
-	fp = find_file_locked(fh, hashval);
-	rcu_read_unlock();
-	return fp;
+	/* Do not hash the same filehandle more than once */
+	fi = find_nfs4_file(fhp);
+	if (unlikely(fi))
+		return fi;
+
+	init_nfs4_file(fhp, new);
+	return insert_file(new, fhp);
 }
 
-static struct nfs4_file *
-find_or_add_file(struct nfs4_file *new, struct svc_fh *fh)
+static noinline void remove_nfs4_file(struct nfs4_file *fi)
 {
-	struct nfs4_file *fp;
-	unsigned int hashval = file_hashval(fh);
-
-	rcu_read_lock();
-	fp = find_file_locked(fh, hashval);
-	rcu_read_unlock();
-	if (fp)
-		return fp;
-
-	return insert_file(new, fh, hashval);
+	rhltable_remove(&nfs4_file_rhltable, &fi->fi_rhash,
+			nfs4_file_rhash_params);
 }
 
 /*
@@ -4703,9 +4766,12 @@  nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 	struct nfs4_file *fp;
 	__be32 ret = nfs_ok;
 
-	fp = find_file(current_fh);
+	rcu_read_lock();
+	fp = find_nfs4_file(current_fh);
+	rcu_read_unlock();
 	if (!fp)
 		return ret;
+
 	/* Check for conflicting share reservations */
 	spin_lock(&fp->fi_lock);
 	if (fp->fi_share_deny & deny_type)
@@ -5548,7 +5614,11 @@  nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	 * and check for delegations in the process of being recalled.
 	 * If not found, create the nfs4_file struct
 	 */
-	fp = find_or_add_file(open->op_file, current_fh);
+	rcu_read_lock();
+	fp = insert_nfs4_file(open->op_file, current_fh);
+	rcu_read_unlock();
+	if (unlikely(!fp))
+		return nfserr_jukebox;
 	if (fp != open->op_file) {
 		status = nfs4_check_deleg(cl, open, &dp);
 		if (status)
@@ -7905,10 +7975,16 @@  nfs4_state_start(void)
 {
 	int ret;
 
-	ret = nfsd4_create_callback_queue();
+	ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params);
 	if (ret)
 		return ret;
 
+	ret = nfsd4_create_callback_queue();
+	if (ret) {
+		rhltable_destroy(&nfs4_file_rhltable);
+		return ret;
+	}
+
 	set_max_delegations();
 	return 0;
 }
@@ -7939,6 +8015,7 @@  nfs4_state_shutdown_net(struct net *net)
 
 	nfsd4_client_tracking_exit(net);
 	nfs4_state_destroy_net(net);
+	rhltable_destroy(&nfs4_file_rhltable);
 #ifdef CONFIG_NFSD_V4_2_INTER_SSC
 	nfsd4_ssc_shutdown_umount(nn);
 #endif
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index ae596dbf8667..ad20467c9dee 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -536,16 +536,13 @@  struct nfs4_clnt_odstate {
  * inode can have multiple filehandles associated with it, so there is
  * (potentially) a many to one relationship between this struct and struct
  * inode.
- *
- * These are hashed by filehandle in the file_hashtbl, which is protected by
- * the global state_lock spinlock.
  */
 struct nfs4_file {
 	refcount_t		fi_ref;
 	struct inode *		fi_inode;
 	bool			fi_aliased;
 	spinlock_t		fi_lock;
-	struct hlist_node       fi_hash;	/* hash on fi_fhandle */
+	struct rhlist_head	fi_rhash;
 	struct list_head        fi_stateids;
 	union {
 		struct list_head	fi_delegations;