diff mbox series

[v4] nfsd: allow for up to 32 callback session slots

Message ID 20241105-bcwide-v4-1-48f52ee0fb0c@kernel.org (mailing list archive)
State New
Headers show
Series [v4] nfsd: allow for up to 32 callback session slots | expand

Commit Message

Jeff Layton Nov. 6, 2024, 12:31 a.m. UTC
nfsd currently only uses a single slot in the callback channel, which is
proving to be a bottleneck in some cases. Widen the callback channel to
a max of 32 slots (subject to the client's target_maxreqs value).

Change the cb_holds_slot boolean to an integer that tracks the current
slot number (with -1 meaning "unassigned").  Move the callback slot
tracking info into the session. Add a new u32 that acts as a bitmap to
track which slots are in use, and a u32 to track the latest callback
target_slotid that the client reports. To protect the new fields, add
a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
search for the lowest slotid (using ffs()).

Finally, convert the session->se_cb_seq_nr field into an array of
counters and add the necessary handling to ensure that the seqids get
reset at the appropriate times.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
v3 has a bug that Olga hit in testing. This version should fix the wait
when the slot table is full. Olga, if you're able to test this one, it
would be much appreciated.
---
Changes in v4:
- Fix the wait for a slot in nfsd41_cb_get_slot()
- Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org

Changes in v3:
- add patch to convert se_flags to single se_dead bool
- fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
- don't reject target highest slot value of 0
- Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org

Changes in v2:
- take cl_lock when fetching fields from session to be encoded
- use fls() instead of bespoke highest_unset_index()
- rename variables in several functions with more descriptive names
- clamp limit of for loop in update_cb_slot_table()
- re-add missing rpc_wake_up_queued_task() call
- fix slotid check in decode_cb_sequence4resok()
- add new per-session spinlock
---
 fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
 fs/nfsd/nfs4state.c    |  11 +++--
 fs/nfsd/state.h        |  15 ++++---
 fs/nfsd/trace.h        |   2 +-
 4 files changed, 101 insertions(+), 40 deletions(-)


---
base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
change-id: 20241025-bcwide-6bd7e4b63db2

Best regards,

Comments

Chuck Lever III Nov. 6, 2024, 3:40 p.m. UTC | #1
On Tue, Nov 05, 2024 at 07:31:06PM -0500, Jeff Layton wrote:
> nfsd currently only uses a single slot in the callback channel, which is
> proving to be a bottleneck in some cases. Widen the callback channel to
> a max of 32 slots (subject to the client's target_maxreqs value).
> 
> Change the cb_holds_slot boolean to an integer that tracks the current
> slot number (with -1 meaning "unassigned").  Move the callback slot
> tracking info into the session. Add a new u32 that acts as a bitmap to
> track which slots are in use, and a u32 to track the latest callback
> target_slotid that the client reports. To protect the new fields, add
> a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> search for the lowest slotid (using ffs()).
> 
> Finally, convert the session->se_cb_seq_nr field into an array of
> counters and add the necessary handling to ensure that the seqids get
> reset at the appropriate times.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> v3 has a bug that Olga hit in testing. This version should fix the wait
> when the slot table is full. Olga, if you're able to test this one, it
> would be much appreciated.

Note: I've replaced v3 in nfsd-next with this version. Thanks for
the update!


> ---
> Changes in v4:
> - Fix the wait for a slot in nfsd41_cb_get_slot()
> - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> 
> Changes in v3:
> - add patch to convert se_flags to single se_dead bool
> - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> - don't reject target highest slot value of 0
> - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> 
> Changes in v2:
> - take cl_lock when fetching fields from session to be encoded
> - use fls() instead of bespoke highest_unset_index()
> - rename variables in several functions with more descriptive names
> - clamp limit of for loop in update_cb_slot_table()
> - re-add missing rpc_wake_up_queued_task() call
> - fix slotid check in decode_cb_sequence4resok()
> - add new per-session spinlock
> ---
>  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
>  fs/nfsd/nfs4state.c    |  11 +++--
>  fs/nfsd/state.h        |  15 ++++---
>  fs/nfsd/trace.h        |   2 +-
>  4 files changed, 101 insertions(+), 40 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
>  	hdr->nops++;
>  }
>  
> +static u32 highest_slotid(struct nfsd4_session *ses)
> +{
> +	u32 idx;
> +
> +	spin_lock(&ses->se_lock);
> +	idx = fls(~ses->se_cb_slot_avail);
> +	if (idx > 0)
> +		--idx;
> +	idx = max(idx, ses->se_cb_highest_slot);
> +	spin_unlock(&ses->se_lock);
> +	return idx;
> +}
> +
>  /*
>   * CB_SEQUENCE4args
>   *
> @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
>  	encode_sessionid4(xdr, session);
>  
>  	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> -	*p++ = cpu_to_be32(session->se_cb_seq_nr);	/* csa_sequenceid */
> -	*p++ = xdr_zero;			/* csa_slotid */
> -	*p++ = xdr_zero;			/* csa_highest_slotid */
> +	*p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);	/* csa_sequenceid */
> +	*p++ = cpu_to_be32(cb->cb_held_slot);		/* csa_slotid */
> +	*p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
>  	*p++ = xdr_zero;			/* csa_cachethis */
>  	xdr_encode_empty_array(p);		/* csa_referring_call_lists */
>  
>  	hdr->nops++;
>  }
>  
> +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> +{
> +	/* No need to do anything if nothing changed */
> +	if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> +		return;
> +
> +	spin_lock(&ses->se_lock);
> +	if (target > ses->se_cb_highest_slot) {
> +		int i;
> +
> +		target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> +
> +		/* Growing the slot table. Reset any new sequences to 1 */
> +		for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> +			ses->se_cb_seq_nr[i] = 1;
> +	}
> +	ses->se_cb_highest_slot = target;
> +	spin_unlock(&ses->se_lock);
> +}
> +
>  /*
>   * CB_SEQUENCE4resok
>   *
> @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
>  	struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
>  	int status = -ESERVERFAULT;
>  	__be32 *p;
> -	u32 dummy;
> +	u32 seqid, slotid, target;
>  
>  	/*
>  	 * If the server returns different values for sessionID, slotID or
> @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
>  	}
>  	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
>  
> -	dummy = be32_to_cpup(p++);
> -	if (dummy != session->se_cb_seq_nr) {
> +	seqid = be32_to_cpup(p++);
> +	if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
>  		dprintk("NFS: %s Invalid sequence number\n", __func__);
>  		goto out;
>  	}
>  
> -	dummy = be32_to_cpup(p++);
> -	if (dummy != 0) {
> +	slotid = be32_to_cpup(p++);
> +	if (slotid != cb->cb_held_slot) {
>  		dprintk("NFS: %s Invalid slotid\n", __func__);
>  		goto out;
>  	}
>  
> -	/*
> -	 * FIXME: process highest slotid and target highest slotid
> -	 */
> +	p++; // ignore current highest slot value
> +
> +	target = be32_to_cpup(p++);
> +	update_cb_slot_table(session, target);
>  	status = 0;
>  out:
>  	cb->cb_seq_status = status;
> @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
>  	spin_unlock(&clp->cl_lock);
>  }
>  
> +static int grab_slot(struct nfsd4_session *ses)
> +{
> +	int idx;
> +
> +	spin_lock(&ses->se_lock);
> +	idx = ffs(ses->se_cb_slot_avail) - 1;
> +	if (idx < 0 || idx > ses->se_cb_highest_slot) {
> +		spin_unlock(&ses->se_lock);
> +		return -1;
> +	}
> +	/* clear the bit for the slot */
> +	ses->se_cb_slot_avail &= ~BIT(idx);
> +	spin_unlock(&ses->se_lock);
> +	return idx;
> +}
> +
>  /*
>   * There's currently a single callback channel slot.
>   * If the slot is available, then mark it busy.  Otherwise, set the
> @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
>  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
>  {
>  	struct nfs4_client *clp = cb->cb_clp;
> +	struct nfsd4_session *ses = clp->cl_cb_session;
>  
> -	if (!cb->cb_holds_slot &&
> -	    test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> +	if (cb->cb_held_slot >= 0)
> +		return true;
> +	cb->cb_held_slot = grab_slot(ses);
> +	if (cb->cb_held_slot < 0) {
>  		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
>  		/* Race breaker */
> -		if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> -			dprintk("%s slot is busy\n", __func__);
> +		cb->cb_held_slot = grab_slot(ses);
> +		if (cb->cb_held_slot < 0)
>  			return false;
> -		}
>  		rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
>  	}
> -	cb->cb_holds_slot = true;
>  	return true;
>  }
>  
>  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
>  {
>  	struct nfs4_client *clp = cb->cb_clp;
> +	struct nfsd4_session *ses = clp->cl_cb_session;
>  
> -	if (cb->cb_holds_slot) {
> -		cb->cb_holds_slot = false;
> -		clear_bit(0, &clp->cl_cb_slot_busy);
> +	if (cb->cb_held_slot >= 0) {
> +		spin_lock(&ses->se_lock);
> +		ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> +		spin_unlock(&ses->se_lock);
> +		cb->cb_held_slot = -1;
>  		rpc_wake_up_next(&clp->cl_cb_waitq);
>  	}
>  }
> @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
>  }
>  
>  /*
> - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> - * slots, and mark callback channel down on communication errors.
> + * TODO: cb_sequence should support referring call lists, cachethis,
> + * and mark callback channel down on communication errors.
>   */
>  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
>  {
> @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>  		return true;
>  	}
>  
> -	if (!cb->cb_holds_slot)
> +	if (cb->cb_held_slot < 0)
>  		goto need_restart;
>  
>  	/* This is the operation status code for CB_SEQUENCE */
> @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>  		 * If CB_SEQUENCE returns an error, then the state of the slot
>  		 * (sequence ID, cached reply) MUST NOT change.
>  		 */
> -		++session->se_cb_seq_nr;
> +		++session->se_cb_seq_nr[cb->cb_held_slot];
>  		break;
>  	case -ESERVERFAULT:
> -		++session->se_cb_seq_nr;
> +		++session->se_cb_seq_nr[cb->cb_held_slot];
>  		nfsd4_mark_cb_fault(cb->cb_clp);
>  		ret = false;
>  		break;
> @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>  	case -NFS4ERR_BADSLOT:
>  		goto retry_nowait;
>  	case -NFS4ERR_SEQ_MISORDERED:
> -		if (session->se_cb_seq_nr != 1) {
> -			session->se_cb_seq_nr = 1;
> +		if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> +			session->se_cb_seq_nr[cb->cb_held_slot] = 1;
>  			goto retry_nowait;
>  		}
>  		break;
>  	default:
>  		nfsd4_mark_cb_fault(cb->cb_clp);
>  	}
> -	nfsd41_cb_release_slot(cb);
> -
>  	trace_nfsd_cb_free_slot(task, cb);
> +	nfsd41_cb_release_slot(cb);
>  
>  	if (RPC_SIGNALLED(task))
>  		goto need_restart;
> @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
>  	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
>  	cb->cb_status = 0;
>  	cb->cb_need_restart = false;
> -	cb->cb_holds_slot = false;
> +	cb->cb_held_slot = -1;
>  }
>  
>  /**
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
>  	}
>  
>  	memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> +	new->se_cb_slot_avail = ~0U;
> +	new->se_cb_highest_slot = battrs->maxreqs - 1;
> +	spin_lock_init(&new->se_lock);
>  	return new;
>  out_free:
>  	while (i--)
> @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
>  
>  	INIT_LIST_HEAD(&new->se_conns);
>  
> -	new->se_cb_seq_nr = 1;
> +	atomic_set(&new->se_ref, 0);
>  	new->se_dead = false;
>  	new->se_cb_prog = cses->callback_prog;
>  	new->se_cb_sec = cses->cb_sec;
> -	atomic_set(&new->se_ref, 0);
> +
> +	for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> +		new->se_cb_seq_nr[idx] = 1;
> +
>  	idx = hash_sessionid(&new->se_sessionid);
>  	list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
>  	spin_lock(&clp->cl_lock);
> @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
>  	kref_init(&clp->cl_nfsdfs.cl_ref);
>  	nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
>  	clp->cl_time = ktime_get_boottime_seconds();
> -	clear_bit(0, &clp->cl_cb_slot_busy);
>  	copy_verf(clp, verf);
>  	memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
>  	clp->cl_cb_session = NULL;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -71,8 +71,8 @@ struct nfsd4_callback {
>  	struct work_struct cb_work;
>  	int cb_seq_status;
>  	int cb_status;
> +	int cb_held_slot;
>  	bool cb_need_restart;
> -	bool cb_holds_slot;
>  };
>  
>  struct nfsd4_callback_ops {
> @@ -307,6 +307,9 @@ struct nfsd4_conn {
>  	unsigned char cn_flags;
>  };
>  
> +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> +#define NFSD_BC_SLOT_TABLE_MAX	(sizeof(u32) * 8 - 1)
> +
>  /*
>   * Representation of a v4.1+ session. These are refcounted in a similar fashion
>   * to the nfs4_client. References are only taken when the server is actively
> @@ -314,6 +317,10 @@ struct nfsd4_conn {
>   */
>  struct nfsd4_session {
>  	atomic_t		se_ref;
> +	spinlock_t		se_lock;
> +	u32			se_cb_slot_avail; /* bitmap of available slots */
> +	u32			se_cb_highest_slot;	/* highest slot client wants */
> +	u32			se_cb_prog;
>  	bool			se_dead;
>  	struct list_head	se_hash;	/* hash by sessionid */
>  	struct list_head	se_perclnt;
> @@ -322,8 +329,7 @@ struct nfsd4_session {
>  	struct nfsd4_channel_attrs se_fchannel;
>  	struct nfsd4_cb_sec	se_cb_sec;
>  	struct list_head	se_conns;
> -	u32			se_cb_prog;
> -	u32			se_cb_seq_nr;
> +	u32			se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
>  	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
>  };
>  
> @@ -457,9 +463,6 @@ struct nfs4_client {
>  	 */
>  	struct dentry		*cl_nfsd_info_dentry;
>  
> -	/* for nfs41 callbacks */
> -	/* We currently support a single back channel with a single slot */
> -	unsigned long		cl_cb_slot_busy;
>  	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
>  						/* wait here for slots */
>  	struct net		*net;
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
>  		__entry->cl_id = sid->clientid.cl_id;
>  		__entry->seqno = sid->sequence;
>  		__entry->reserved = sid->reserved;
> -		__entry->slot_seqno = session->se_cb_seq_nr;
> +		__entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
>  	),
>  	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
>  		" sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
> 
> ---
> base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> change-id: 20241025-bcwide-6bd7e4b63db2
> 
> Best regards,
> -- 
> Jeff Layton <jlayton@kernel.org>
>
Olga Kornievskaia Nov. 6, 2024, 4:44 p.m. UTC | #2
On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> nfsd currently only uses a single slot in the callback channel, which is
> proving to be a bottleneck in some cases. Widen the callback channel to
> a max of 32 slots (subject to the client's target_maxreqs value).
>
> Change the cb_holds_slot boolean to an integer that tracks the current
> slot number (with -1 meaning "unassigned").  Move the callback slot
> tracking info into the session. Add a new u32 that acts as a bitmap to
> track which slots are in use, and a u32 to track the latest callback
> target_slotid that the client reports. To protect the new fields, add
> a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> search for the lowest slotid (using ffs()).
>
> Finally, convert the session->se_cb_seq_nr field into an array of
> counters and add the necessary handling to ensure that the seqids get
> reset at the appropriate times.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> v3 has a bug that Olga hit in testing. This version should fix the wait
> when the slot table is full. Olga, if you're able to test this one, it
> would be much appreciated.

I have tested this version. I can confirm that I'm not seeing the
softlockup. But the server still does not use the lowest available
slot. It is hard for me to describe the algorithm it uses to pick the
slot number (in general it still seems to pick the next slot value,
even though earlier slots have already been replied to). I have seen
slot 0 reused eventually, but that seemed to happen only once the
server had gotten as far as using slot=13.

The other unfortunate thing that's happening when I use these patches
is that my test case, which recalls delegations and makes sure that the
state management gets handled properly (i.e., the patch that I've
submitted to fix a race between the laundromat thread and free_state),
is not working. After all the recalls, the server still thinks it has
revoked state. I have to debug more to figure out what's going on.

> ---
> Changes in v4:
> - Fix the wait for a slot in nfsd41_cb_get_slot()
> - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
>
> Changes in v3:
> - add patch to convert se_flags to single se_dead bool
> - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> - don't reject target highest slot value of 0
> - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
>
> Changes in v2:
> - take cl_lock when fetching fields from session to be encoded
> - use fls() instead of bespoke highest_unset_index()
> - rename variables in several functions with more descriptive names
> - clamp limit of for loop in update_cb_slot_table()
> - re-add missing rpc_wake_up_queued_task() call
> - fix slotid check in decode_cb_sequence4resok()
> - add new per-session spinlock
> ---
>  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
>  fs/nfsd/nfs4state.c    |  11 +++--
>  fs/nfsd/state.h        |  15 ++++---
>  fs/nfsd/trace.h        |   2 +-
>  4 files changed, 101 insertions(+), 40 deletions(-)
>
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
>         hdr->nops++;
>  }
>
> +static u32 highest_slotid(struct nfsd4_session *ses)
> +{
> +       u32 idx;
> +
> +       spin_lock(&ses->se_lock);
> +       idx = fls(~ses->se_cb_slot_avail);
> +       if (idx > 0)
> +               --idx;
> +       idx = max(idx, ses->se_cb_highest_slot);
> +       spin_unlock(&ses->se_lock);
> +       return idx;
> +}
> +
>  /*
>   * CB_SEQUENCE4args
>   *
> @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
>         encode_sessionid4(xdr, session);
>
>         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> -       *p++ = xdr_zero;                        /* csa_slotid */
> -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
>         *p++ = xdr_zero;                        /* csa_cachethis */
>         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
>
>         hdr->nops++;
>  }
>
> +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> +{
> +       /* No need to do anything if nothing changed */
> +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> +               return;
> +
> +       spin_lock(&ses->se_lock);
> +       if (target > ses->se_cb_highest_slot) {
> +               int i;
> +
> +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> +
> +               /* Growing the slot table. Reset any new sequences to 1 */
> +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> +                       ses->se_cb_seq_nr[i] = 1;
> +       }
> +       ses->se_cb_highest_slot = target;
> +       spin_unlock(&ses->se_lock);
> +}
> +
>  /*
>   * CB_SEQUENCE4resok
>   *
> @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
>         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
>         int status = -ESERVERFAULT;
>         __be32 *p;
> -       u32 dummy;
> +       u32 seqid, slotid, target;
>
>         /*
>          * If the server returns different values for sessionID, slotID or
> @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
>         }
>         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
>
> -       dummy = be32_to_cpup(p++);
> -       if (dummy != session->se_cb_seq_nr) {
> +       seqid = be32_to_cpup(p++);
> +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
>                 dprintk("NFS: %s Invalid sequence number\n", __func__);
>                 goto out;
>         }
>
> -       dummy = be32_to_cpup(p++);
> -       if (dummy != 0) {
> +       slotid = be32_to_cpup(p++);
> +       if (slotid != cb->cb_held_slot) {
>                 dprintk("NFS: %s Invalid slotid\n", __func__);
>                 goto out;
>         }
>
> -       /*
> -        * FIXME: process highest slotid and target highest slotid
> -        */
> +       p++; // ignore current highest slot value
> +
> +       target = be32_to_cpup(p++);
> +       update_cb_slot_table(session, target);
>         status = 0;
>  out:
>         cb->cb_seq_status = status;
> @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
>         spin_unlock(&clp->cl_lock);
>  }
>
> +static int grab_slot(struct nfsd4_session *ses)
> +{
> +       int idx;
> +
> +       spin_lock(&ses->se_lock);
> +       idx = ffs(ses->se_cb_slot_avail) - 1;
> +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> +               spin_unlock(&ses->se_lock);
> +               return -1;
> +       }
> +       /* clear the bit for the slot */
> +       ses->se_cb_slot_avail &= ~BIT(idx);
> +       spin_unlock(&ses->se_lock);
> +       return idx;
> +}
> +
>  /*
>   * There's currently a single callback channel slot.
>   * If the slot is available, then mark it busy.  Otherwise, set the
> @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
>  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
>  {
>         struct nfs4_client *clp = cb->cb_clp;
> +       struct nfsd4_session *ses = clp->cl_cb_session;
>
> -       if (!cb->cb_holds_slot &&
> -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> +       if (cb->cb_held_slot >= 0)
> +               return true;
> +       cb->cb_held_slot = grab_slot(ses);
> +       if (cb->cb_held_slot < 0) {
>                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
>                 /* Race breaker */
> -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> -                       dprintk("%s slot is busy\n", __func__);
> +               cb->cb_held_slot = grab_slot(ses);
> +               if (cb->cb_held_slot < 0)
>                         return false;
> -               }
>                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
>         }
> -       cb->cb_holds_slot = true;
>         return true;
>  }
>
>  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
>  {
>         struct nfs4_client *clp = cb->cb_clp;
> +       struct nfsd4_session *ses = clp->cl_cb_session;
>
> -       if (cb->cb_holds_slot) {
> -               cb->cb_holds_slot = false;
> -               clear_bit(0, &clp->cl_cb_slot_busy);
> +       if (cb->cb_held_slot >= 0) {
> +               spin_lock(&ses->se_lock);
> +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> +               spin_unlock(&ses->se_lock);
> +               cb->cb_held_slot = -1;
>                 rpc_wake_up_next(&clp->cl_cb_waitq);
>         }
>  }
> @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
>  }
>
>  /*
> - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> - * slots, and mark callback channel down on communication errors.
> + * TODO: cb_sequence should support referring call lists, cachethis,
> + * and mark callback channel down on communication errors.
>   */
>  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
>  {
> @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>                 return true;
>         }
>
> -       if (!cb->cb_holds_slot)
> +       if (cb->cb_held_slot < 0)
>                 goto need_restart;
>
>         /* This is the operation status code for CB_SEQUENCE */
> @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>                  * If CB_SEQUENCE returns an error, then the state of the slot
>                  * (sequence ID, cached reply) MUST NOT change.
>                  */
> -               ++session->se_cb_seq_nr;
> +               ++session->se_cb_seq_nr[cb->cb_held_slot];
>                 break;
>         case -ESERVERFAULT:
> -               ++session->se_cb_seq_nr;
> +               ++session->se_cb_seq_nr[cb->cb_held_slot];
>                 nfsd4_mark_cb_fault(cb->cb_clp);
>                 ret = false;
>                 break;
> @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>         case -NFS4ERR_BADSLOT:
>                 goto retry_nowait;
>         case -NFS4ERR_SEQ_MISORDERED:
> -               if (session->se_cb_seq_nr != 1) {
> -                       session->se_cb_seq_nr = 1;
> +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
>                         goto retry_nowait;
>                 }
>                 break;
>         default:
>                 nfsd4_mark_cb_fault(cb->cb_clp);
>         }
> -       nfsd41_cb_release_slot(cb);
> -
>         trace_nfsd_cb_free_slot(task, cb);
> +       nfsd41_cb_release_slot(cb);
>
>         if (RPC_SIGNALLED(task))
>                 goto need_restart;
> @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
>         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
>         cb->cb_status = 0;
>         cb->cb_need_restart = false;
> -       cb->cb_holds_slot = false;
> +       cb->cb_held_slot = -1;
>  }
>
>  /**
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
>         }
>
>         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> +       new->se_cb_slot_avail = ~0U;
> +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> +       spin_lock_init(&new->se_lock);
>         return new;
>  out_free:
>         while (i--)
> @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
>
>         INIT_LIST_HEAD(&new->se_conns);
>
> -       new->se_cb_seq_nr = 1;
> +       atomic_set(&new->se_ref, 0);
>         new->se_dead = false;
>         new->se_cb_prog = cses->callback_prog;
>         new->se_cb_sec = cses->cb_sec;
> -       atomic_set(&new->se_ref, 0);
> +
> +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> +               new->se_cb_seq_nr[idx] = 1;
> +
>         idx = hash_sessionid(&new->se_sessionid);
>         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
>         spin_lock(&clp->cl_lock);
> @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
>         kref_init(&clp->cl_nfsdfs.cl_ref);
>         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
>         clp->cl_time = ktime_get_boottime_seconds();
> -       clear_bit(0, &clp->cl_cb_slot_busy);
>         copy_verf(clp, verf);
>         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
>         clp->cl_cb_session = NULL;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -71,8 +71,8 @@ struct nfsd4_callback {
>         struct work_struct cb_work;
>         int cb_seq_status;
>         int cb_status;
> +       int cb_held_slot;
>         bool cb_need_restart;
> -       bool cb_holds_slot;
>  };
>
>  struct nfsd4_callback_ops {
> @@ -307,6 +307,9 @@ struct nfsd4_conn {
>         unsigned char cn_flags;
>  };
>
> +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> +
>  /*
>   * Representation of a v4.1+ session. These are refcounted in a similar fashion
>   * to the nfs4_client. References are only taken when the server is actively
> @@ -314,6 +317,10 @@ struct nfsd4_conn {
>   */
>  struct nfsd4_session {
>         atomic_t                se_ref;
> +       spinlock_t              se_lock;
> +       u32                     se_cb_slot_avail; /* bitmap of available slots */
> +       u32                     se_cb_highest_slot;     /* highest slot client wants */
> +       u32                     se_cb_prog;
>         bool                    se_dead;
>         struct list_head        se_hash;        /* hash by sessionid */
>         struct list_head        se_perclnt;
> @@ -322,8 +329,7 @@ struct nfsd4_session {
>         struct nfsd4_channel_attrs se_fchannel;
>         struct nfsd4_cb_sec     se_cb_sec;
>         struct list_head        se_conns;
> -       u32                     se_cb_prog;
> -       u32                     se_cb_seq_nr;
> +       u32                     se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
>         struct nfsd4_slot       *se_slots[];    /* forward channel slots */
>  };
>
> @@ -457,9 +463,6 @@ struct nfs4_client {
>          */
>         struct dentry           *cl_nfsd_info_dentry;
>
> -       /* for nfs41 callbacks */
> -       /* We currently support a single back channel with a single slot */
> -       unsigned long           cl_cb_slot_busy;
>         struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
>                                                 /* wait here for slots */
>         struct net              *net;
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
>                 __entry->cl_id = sid->clientid.cl_id;
>                 __entry->seqno = sid->sequence;
>                 __entry->reserved = sid->reserved;
> -               __entry->slot_seqno = session->se_cb_seq_nr;
> +               __entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
>         ),
>         TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
>                 " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
>
> ---
> base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> change-id: 20241025-bcwide-6bd7e4b63db2
>
> Best regards,
> --
> Jeff Layton <jlayton@kernel.org>
>
>
Jeff Layton Nov. 6, 2024, 5:12 p.m. UTC | #3
On Wed, 2024-11-06 at 11:44 -0500, Olga Kornievskaia wrote:
> On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > nfsd currently only uses a single slot in the callback channel, which is
> > proving to be a bottleneck in some cases. Widen the callback channel to
> > a max of 32 slots (subject to the client's target_maxreqs value).
> > 
> > Change the cb_holds_slot boolean to an integer that tracks the current
> > slot number (with -1 meaning "unassigned").  Move the callback slot
> > tracking info into the session. Add a new u32 that acts as a bitmap to
> > track which slots are in use, and a u32 to track the latest callback
> > target_slotid that the client reports. To protect the new fields, add
> > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > search for the lowest slotid (using ffs()).
> > 
> > Finally, convert the session->se_cb_seq_nr field into an array of
> > counters and add the necessary handling to ensure that the seqids get
> > reset at the appropriate times.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > ---
> > v3 has a bug that Olga hit in testing. This version should fix the wait
> > when the slot table is full. Olga, if you're able to test this one, it
> > would be much appreciated.
> 
> I have tested this version. I can confirm that I'm not seeing the
> soft lockup. But the server still does not use the lowest available
> slot. It is hard for me to describe the algorithm for picking the slot
> number (in general it still seems to be picking the next slot value,
> even though slots have been replied to). I have seen slot 0 re-used
> eventually, but that seemed to happen only once the server had gotten
> as far as using slot=13.
> 

Could this just be a sign that there is a significant amount of
concurrency going on? There is some delay between receiving the call on
the wire and the slot being released. With some workloads I can drive
up the number of concurrent requests pretty hard too. We release the
slot in nfsd41_cb_release_slot(), which is called from rpc_call_done.
That's run in a workqueue (rpciod), so maybe there are just queuing
delays.

The algorithm for picking a slot is very simple. It just uses ffs()
to find the first set bit in a 32-bit word, clears that bit, and
returns its index. It should always prefer the lowest available slot.

> The other unfortunate thing that's happening when I use these patches
> is that my test case, which recalls delegations and makes sure that
> the state management gets handled properly (i.e., the patch that I've
> submitted to fix a race between the laundromat thread and free_state),
> is not working. After all the recalls, the server still thinks it has
> revoked state. I have to debug more to figure out what's going on.
> 

Thanks, keep us posted.

> > ---
> > Changes in v4:
> > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > 
> > Changes in v3:
> > - add patch to convert se_flags to single se_dead bool
> > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > - don't reject target highest slot value of 0
> > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > 
> > Changes in v2:
> > - take cl_lock when fetching fields from session to be encoded
> > - use fls() instead of bespoke highest_unset_index()
> > - rename variables in several functions with more descriptive names
> > - clamp limit of for loop in update_cb_slot_table()
> > - re-add missing rpc_wake_up_queued_task() call
> > - fix slotid check in decode_cb_sequence4resok()
> > - add new per-session spinlock
> > ---
> >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> >  fs/nfsd/nfs4state.c    |  11 +++--
> >  fs/nfsd/state.h        |  15 ++++---
> >  fs/nfsd/trace.h        |   2 +-
> >  4 files changed, 101 insertions(+), 40 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > --- a/fs/nfsd/nfs4callback.c
> > +++ b/fs/nfsd/nfs4callback.c
> > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> >         hdr->nops++;
> >  }
> > 
> > +static u32 highest_slotid(struct nfsd4_session *ses)
> > +{
> > +       u32 idx;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       idx = fls(~ses->se_cb_slot_avail);
> > +       if (idx > 0)
> > +               --idx;
> > +       idx = max(idx, ses->se_cb_highest_slot);
> > +       spin_unlock(&ses->se_lock);
> > +       return idx;
> > +}
> > +
> >  /*
> >   * CB_SEQUENCE4args
> >   *
> > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> >         encode_sessionid4(xdr, session);
> > 
> >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > -       *p++ = xdr_zero;                        /* csa_slotid */
> > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> >         *p++ = xdr_zero;                        /* csa_cachethis */
> >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > 
> >         hdr->nops++;
> >  }
> > 
> > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > +{
> > +       /* No need to do anything if nothing changed */
> > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > +               return;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       if (target > ses->se_cb_highest_slot) {
> > +               int i;
> > +
> > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > +
> > +               /* Growing the slot table. Reset any new sequences to 1 */
> > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > +                       ses->se_cb_seq_nr[i] = 1;
> > +       }
> > +       ses->se_cb_highest_slot = target;
> > +       spin_unlock(&ses->se_lock);
> > +}
> > +
> >  /*
> >   * CB_SEQUENCE4resok
> >   *
> > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> >         int status = -ESERVERFAULT;
> >         __be32 *p;
> > -       u32 dummy;
> > +       u32 seqid, slotid, target;
> > 
> >         /*
> >          * If the server returns different values for sessionID, slotID or
> > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> >         }
> >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > 
> > -       dummy = be32_to_cpup(p++);
> > -       if (dummy != session->se_cb_seq_nr) {
> > +       seqid = be32_to_cpup(p++);
> > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> >                 goto out;
> >         }
> > 
> > -       dummy = be32_to_cpup(p++);
> > -       if (dummy != 0) {
> > +       slotid = be32_to_cpup(p++);
> > +       if (slotid != cb->cb_held_slot) {
> >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> >                 goto out;
> >         }
> > 
> > -       /*
> > -        * FIXME: process highest slotid and target highest slotid
> > -        */
> > +       p++; // ignore current highest slot value
> > +
> > +       target = be32_to_cpup(p++);
> > +       update_cb_slot_table(session, target);
> >         status = 0;
> >  out:
> >         cb->cb_seq_status = status;
> > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> >         spin_unlock(&clp->cl_lock);
> >  }
> > 
> > +static int grab_slot(struct nfsd4_session *ses)
> > +{
> > +       int idx;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > +               spin_unlock(&ses->se_lock);
> > +               return -1;
> > +       }
> > +       /* clear the bit for the slot */
> > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > +       spin_unlock(&ses->se_lock);
> > +       return idx;
> > +}
> > +
> >  /*
> >   * There's currently a single callback channel slot.
> >   * If the slot is available, then mark it busy.  Otherwise, set the
> > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> >  {
> >         struct nfs4_client *clp = cb->cb_clp;
> > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > 
> > -       if (!cb->cb_holds_slot &&
> > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > +       if (cb->cb_held_slot >= 0)
> > +               return true;
> > +       cb->cb_held_slot = grab_slot(ses);
> > +       if (cb->cb_held_slot < 0) {
> >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> >                 /* Race breaker */
> > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > -                       dprintk("%s slot is busy\n", __func__);
> > +               cb->cb_held_slot = grab_slot(ses);
> > +               if (cb->cb_held_slot < 0)
> >                         return false;
> > -               }
> >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> >         }
> > -       cb->cb_holds_slot = true;
> >         return true;
> >  }
> > 
> >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> >  {
> >         struct nfs4_client *clp = cb->cb_clp;
> > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > 
> > -       if (cb->cb_holds_slot) {
> > -               cb->cb_holds_slot = false;
> > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > +       if (cb->cb_held_slot >= 0) {
> > +               spin_lock(&ses->se_lock);
> > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > +               spin_unlock(&ses->se_lock);
> > +               cb->cb_held_slot = -1;
> >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> >         }
> >  }
> > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> >  }
> > 
> >  /*
> > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > - * slots, and mark callback channel down on communication errors.
> > + * TODO: cb_sequence should support referring call lists, cachethis,
> > + * and mark callback channel down on communication errors.
> >   */
> >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> >  {
> > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >                 return true;
> >         }
> > 
> > -       if (!cb->cb_holds_slot)
> > +       if (cb->cb_held_slot < 0)
> >                 goto need_restart;
> > 
> >         /* This is the operation status code for CB_SEQUENCE */
> > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >                  * If CB_SEQUENCE returns an error, then the state of the slot
> >                  * (sequence ID, cached reply) MUST NOT change.
> >                  */
> > -               ++session->se_cb_seq_nr;
> > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> >                 break;
> >         case -ESERVERFAULT:
> > -               ++session->se_cb_seq_nr;
> > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> >                 nfsd4_mark_cb_fault(cb->cb_clp);
> >                 ret = false;
> >                 break;
> > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >         case -NFS4ERR_BADSLOT:
> >                 goto retry_nowait;
> >         case -NFS4ERR_SEQ_MISORDERED:
> > -               if (session->se_cb_seq_nr != 1) {
> > -                       session->se_cb_seq_nr = 1;
> > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> >                         goto retry_nowait;
> >                 }
> >                 break;
> >         default:
> >                 nfsd4_mark_cb_fault(cb->cb_clp);
> >         }
> > -       nfsd41_cb_release_slot(cb);
> > -
> >         trace_nfsd_cb_free_slot(task, cb);
> > +       nfsd41_cb_release_slot(cb);
> > 
> >         if (RPC_SIGNALLED(task))
> >                 goto need_restart;
> > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> >         cb->cb_status = 0;
> >         cb->cb_need_restart = false;
> > -       cb->cb_holds_slot = false;
> > +       cb->cb_held_slot = -1;
> >  }
> > 
> >  /**
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> >         }
> > 
> >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > +       new->se_cb_slot_avail = ~0U;
> > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > +       spin_lock_init(&new->se_lock);
> >         return new;
> >  out_free:
> >         while (i--)
> > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > 
> >         INIT_LIST_HEAD(&new->se_conns);
> > 
> > -       new->se_cb_seq_nr = 1;
> > +       atomic_set(&new->se_ref, 0);
> >         new->se_dead = false;
> >         new->se_cb_prog = cses->callback_prog;
> >         new->se_cb_sec = cses->cb_sec;
> > -       atomic_set(&new->se_ref, 0);
> > +
> > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > +               new->se_cb_seq_nr[idx] = 1;
> > +
> >         idx = hash_sessionid(&new->se_sessionid);
> >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> >         spin_lock(&clp->cl_lock);
> > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> >         kref_init(&clp->cl_nfsdfs.cl_ref);
> >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> >         clp->cl_time = ktime_get_boottime_seconds();
> > -       clear_bit(0, &clp->cl_cb_slot_busy);
> >         copy_verf(clp, verf);
> >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> >         clp->cl_cb_session = NULL;
> > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > --- a/fs/nfsd/state.h
> > +++ b/fs/nfsd/state.h
> > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> >         struct work_struct cb_work;
> >         int cb_seq_status;
> >         int cb_status;
> > +       int cb_held_slot;
> >         bool cb_need_restart;
> > -       bool cb_holds_slot;
> >  };
> > 
> >  struct nfsd4_callback_ops {
> > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> >         unsigned char cn_flags;
> >  };
> > 
> > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > +
> >  /*
> >   * Representation of a v4.1+ session. These are refcounted in a similar fashion
> >   * to the nfs4_client. References are only taken when the server is actively
> > @@ -314,6 +317,10 @@ struct nfsd4_conn {
> >   */
> >  struct nfsd4_session {
> >         atomic_t                se_ref;
> > +       spinlock_t              se_lock;
> > +       u32                     se_cb_slot_avail; /* bitmap of available slots */
> > +       u32                     se_cb_highest_slot;     /* highest slot client wants */
> > +       u32                     se_cb_prog;
> >         bool                    se_dead;
> >         struct list_head        se_hash;        /* hash by sessionid */
> >         struct list_head        se_perclnt;
> > @@ -322,8 +329,7 @@ struct nfsd4_session {
> >         struct nfsd4_channel_attrs se_fchannel;
> >         struct nfsd4_cb_sec     se_cb_sec;
> >         struct list_head        se_conns;
> > -       u32                     se_cb_prog;
> > -       u32                     se_cb_seq_nr;
> > +       u32                     se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
> >         struct nfsd4_slot       *se_slots[];    /* forward channel slots */
> >  };
> > 
> > @@ -457,9 +463,6 @@ struct nfs4_client {
> >          */
> >         struct dentry           *cl_nfsd_info_dentry;
> > 
> > -       /* for nfs41 callbacks */
> > -       /* We currently support a single back channel with a single slot */
> > -       unsigned long           cl_cb_slot_busy;
> >         struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
> >                                                 /* wait here for slots */
> >         struct net              *net;
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
> >                 __entry->cl_id = sid->clientid.cl_id;
> >                 __entry->seqno = sid->sequence;
> >                 __entry->reserved = sid->reserved;
> > -               __entry->slot_seqno = session->se_cb_seq_nr;
> > +               __entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
> >         ),
> >         TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
> >                 " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
> > 
> > ---
> > base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> > change-id: 20241025-bcwide-6bd7e4b63db2
> > 
> > Best regards,
> > --
> > Jeff Layton <jlayton@kernel.org>
> > 
> >
Olga Kornievskaia Nov. 9, 2024, 6:50 p.m. UTC | #4
On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> nfsd currently only uses a single slot in the callback channel, which is
> proving to be a bottleneck in some cases. Widen the callback channel to
> a max of 32 slots (subject to the client's target_maxreqs value).
>
> Change the cb_holds_slot boolean to an integer that tracks the current
> slot number (with -1 meaning "unassigned").  Move the callback slot
> tracking info into the session. Add a new u32 that acts as a bitmap to
> track which slots are in use, and a u32 to track the latest callback
> target_slotid that the client reports. To protect the new fields, add
> a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> search for the lowest slotid (using ffs()).
>
> Finally, convert the session->se_cb_seq_nr field into an array of
> counters and add the necessary handling to ensure that the seqids get
> reset at the appropriate times.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> v3 has a bug that Olga hit in testing. This version should fix the wait
> when the slot table is full. Olga, if you're able to test this one, it
> would be much appreciated.
> ---
> Changes in v4:
> - Fix the wait for a slot in nfsd41_cb_get_slot()
> - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
>
> Changes in v3:
> - add patch to convert se_flags to single se_dead bool
> - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> - don't reject target highest slot value of 0
> - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
>
> Changes in v2:
> - take cl_lock when fetching fields from session to be encoded
> - use fls() instead of bespoke highest_unset_index()
> - rename variables in several functions with more descriptive names
> - clamp limit of for loop in update_cb_slot_table()
> - re-add missing rpc_wake_up_queued_task() call
> - fix slotid check in decode_cb_sequence4resok()
> - add new per-session spinlock
> ---
>  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
>  fs/nfsd/nfs4state.c    |  11 +++--
>  fs/nfsd/state.h        |  15 ++++---
>  fs/nfsd/trace.h        |   2 +-
>  4 files changed, 101 insertions(+), 40 deletions(-)
>
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
>         hdr->nops++;
>  }
>
> +static u32 highest_slotid(struct nfsd4_session *ses)
> +{
> +       u32 idx;
> +
> +       spin_lock(&ses->se_lock);
> +       idx = fls(~ses->se_cb_slot_avail);
> +       if (idx > 0)
> +               --idx;
> +       idx = max(idx, ses->se_cb_highest_slot);
> +       spin_unlock(&ses->se_lock);
> +       return idx;
> +}
> +
>  /*
>   * CB_SEQUENCE4args
>   *
> @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
>         encode_sessionid4(xdr, session);
>
>         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> -       *p++ = xdr_zero;                        /* csa_slotid */
> -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
>         *p++ = xdr_zero;                        /* csa_cachethis */
>         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
>
>         hdr->nops++;
>  }
>
> +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> +{
> +       /* No need to do anything if nothing changed */
> +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> +               return;
> +
> +       spin_lock(&ses->se_lock);
> +       if (target > ses->se_cb_highest_slot) {
> +               int i;
> +
> +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> +
> +               /* Growing the slot table. Reset any new sequences to 1 */
> +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> +                       ses->se_cb_seq_nr[i] = 1;
> +       }
> +       ses->se_cb_highest_slot = target;
> +       spin_unlock(&ses->se_lock);
> +}
> +
>  /*
>   * CB_SEQUENCE4resok
>   *
> @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
>         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
>         int status = -ESERVERFAULT;
>         __be32 *p;
> -       u32 dummy;
> +       u32 seqid, slotid, target;
>
>         /*
>          * If the server returns different values for sessionID, slotID or
> @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
>         }
>         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
>
> -       dummy = be32_to_cpup(p++);
> -       if (dummy != session->se_cb_seq_nr) {
> +       seqid = be32_to_cpup(p++);
> +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
>                 dprintk("NFS: %s Invalid sequence number\n", __func__);
>                 goto out;
>         }
>
> -       dummy = be32_to_cpup(p++);
> -       if (dummy != 0) {
> +       slotid = be32_to_cpup(p++);
> +       if (slotid != cb->cb_held_slot) {
>                 dprintk("NFS: %s Invalid slotid\n", __func__);
>                 goto out;
>         }
>
> -       /*
> -        * FIXME: process highest slotid and target highest slotid
> -        */
> +       p++; // ignore current highest slot value
> +
> +       target = be32_to_cpup(p++);
> +       update_cb_slot_table(session, target);
>         status = 0;
>  out:
>         cb->cb_seq_status = status;
> @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
>         spin_unlock(&clp->cl_lock);
>  }
>
> +static int grab_slot(struct nfsd4_session *ses)
> +{
> +       int idx;
> +
> +       spin_lock(&ses->se_lock);
> +       idx = ffs(ses->se_cb_slot_avail) - 1;
> +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> +               spin_unlock(&ses->se_lock);
> +               return -1;
> +       }
> +       /* clear the bit for the slot */
> +       ses->se_cb_slot_avail &= ~BIT(idx);
> +       spin_unlock(&ses->se_lock);
> +       return idx;
> +}
> +
>  /*
>   * There's currently a single callback channel slot.
>   * If the slot is available, then mark it busy.  Otherwise, set the
> @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
>  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
>  {
>         struct nfs4_client *clp = cb->cb_clp;
> +       struct nfsd4_session *ses = clp->cl_cb_session;
>
> -       if (!cb->cb_holds_slot &&
> -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> +       if (cb->cb_held_slot >= 0)
> +               return true;
> +       cb->cb_held_slot = grab_slot(ses);
> +       if (cb->cb_held_slot < 0) {
>                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
>                 /* Race breaker */
> -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> -                       dprintk("%s slot is busy\n", __func__);
> +               cb->cb_held_slot = grab_slot(ses);
> +               if (cb->cb_held_slot < 0)
>                         return false;
> -               }
>                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
>         }
> -       cb->cb_holds_slot = true;
>         return true;
>  }
>
>  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
>  {
>         struct nfs4_client *clp = cb->cb_clp;
> +       struct nfsd4_session *ses = clp->cl_cb_session;
>
> -       if (cb->cb_holds_slot) {
> -               cb->cb_holds_slot = false;
> -               clear_bit(0, &clp->cl_cb_slot_busy);
> +       if (cb->cb_held_slot >= 0) {
> +               spin_lock(&ses->se_lock);
> +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> +               spin_unlock(&ses->se_lock);
> +               cb->cb_held_slot = -1;
>                 rpc_wake_up_next(&clp->cl_cb_waitq);
>         }
>  }
> @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
>  }
>
>  /*
> - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> - * slots, and mark callback channel down on communication errors.
> + * TODO: cb_sequence should support referring call lists, cachethis,
> + * and mark callback channel down on communication errors.
>   */
>  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
>  {
> @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>                 return true;
>         }
>
> -       if (!cb->cb_holds_slot)
> +       if (cb->cb_held_slot < 0)
>                 goto need_restart;
>
>         /* This is the operation status code for CB_SEQUENCE */
> @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>                  * If CB_SEQUENCE returns an error, then the state of the slot
>                  * (sequence ID, cached reply) MUST NOT change.
>                  */
> -               ++session->se_cb_seq_nr;
> +               ++session->se_cb_seq_nr[cb->cb_held_slot];
>                 break;
>         case -ESERVERFAULT:
> -               ++session->se_cb_seq_nr;
> +               ++session->se_cb_seq_nr[cb->cb_held_slot];
>                 nfsd4_mark_cb_fault(cb->cb_clp);
>                 ret = false;
>                 break;
> @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>         case -NFS4ERR_BADSLOT:
>                 goto retry_nowait;
>         case -NFS4ERR_SEQ_MISORDERED:
> -               if (session->se_cb_seq_nr != 1) {
> -                       session->se_cb_seq_nr = 1;
> +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
>                         goto retry_nowait;
>                 }
>                 break;
>         default:
>                 nfsd4_mark_cb_fault(cb->cb_clp);
>         }
> -       nfsd41_cb_release_slot(cb);
> -
>         trace_nfsd_cb_free_slot(task, cb);
> +       nfsd41_cb_release_slot(cb);
>
>         if (RPC_SIGNALLED(task))
>                 goto need_restart;
> @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
>         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
>         cb->cb_status = 0;
>         cb->cb_need_restart = false;
> -       cb->cb_holds_slot = false;
> +       cb->cb_held_slot = -1;
>  }
>
>  /**
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
>         }
>
>         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> +       new->se_cb_slot_avail = ~0U;
> +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> +       spin_lock_init(&new->se_lock);
>         return new;
>  out_free:
>         while (i--)
> @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
>
>         INIT_LIST_HEAD(&new->se_conns);
>
> -       new->se_cb_seq_nr = 1;
> +       atomic_set(&new->se_ref, 0);
>         new->se_dead = false;
>         new->se_cb_prog = cses->callback_prog;
>         new->se_cb_sec = cses->cb_sec;
> -       atomic_set(&new->se_ref, 0);
> +
> +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> +               new->se_cb_seq_nr[idx] = 1;
> +
>         idx = hash_sessionid(&new->se_sessionid);
>         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
>         spin_lock(&clp->cl_lock);
> @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
>         kref_init(&clp->cl_nfsdfs.cl_ref);
>         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
>         clp->cl_time = ktime_get_boottime_seconds();
> -       clear_bit(0, &clp->cl_cb_slot_busy);
>         copy_verf(clp, verf);
>         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
>         clp->cl_cb_session = NULL;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -71,8 +71,8 @@ struct nfsd4_callback {
>         struct work_struct cb_work;
>         int cb_seq_status;
>         int cb_status;
> +       int cb_held_slot;
>         bool cb_need_restart;
> -       bool cb_holds_slot;
>  };
>
>  struct nfsd4_callback_ops {
> @@ -307,6 +307,9 @@ struct nfsd4_conn {
>         unsigned char cn_flags;
>  };
>
> +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)

Are there some values that are known not to work? I was experimenting
with values and set it to 2 and 4 and the kernel oopsed. I understand
it's not a configurable value but it would still be good to know the
expectations...

[  198.625021] Unable to handle kernel paging request at virtual
address dfff800020000000
[  198.625870] KASAN: probably user-memory-access in range
[0x0000000100000000-0x0000000100000007]
[  198.626444] Mem abort info:
[  198.626630]   ESR = 0x0000000096000005
[  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
[  198.627234]   SET = 0, FnV = 0
[  198.627441]   EA = 0, S1PTW = 0
[  198.627627]   FSC = 0x05: level 1 translation fault
[  198.627859] Data abort info:
[  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
[  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[  198.628967] [dfff800020000000] address between user and kernel address ranges
[  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
[  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
nvme_auth sr_mod cdrom e1000e sg fuse
[  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
tainted 6.12.0-rc6+ #47
[  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
VMW201.00V.21805430.BA64.2305221830 05/22/2023
[  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
[  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
[  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
[  198.636065] sp : ffff8000884977e0
[  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
[  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
[  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
[  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
[  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
[  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
[  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
[  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
[  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
[  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
[  198.640332] Call trace:
[  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
[  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
[  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
[  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
[  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
[  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
[  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
[  198.642346]  nfsd+0x270/0x400 [nfsd]
[  198.642562]  kthread+0x288/0x310
[  198.642745]  ret_from_fork+0x10/0x20
[  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
[  198.643267] SMP: stopping secondary CPUs


> +
>  /*
>   * Representation of a v4.1+ session. These are refcounted in a similar fashion
>   * to the nfs4_client. References are only taken when the server is actively
> @@ -314,6 +317,10 @@ struct nfsd4_conn {
>   */
>  struct nfsd4_session {
>         atomic_t                se_ref;
> +       spinlock_t              se_lock;
> +       u32                     se_cb_slot_avail; /* bitmap of available slots */
> +       u32                     se_cb_highest_slot;     /* highest slot client wants */
> +       u32                     se_cb_prog;
>         bool                    se_dead;
>         struct list_head        se_hash;        /* hash by sessionid */
>         struct list_head        se_perclnt;
> @@ -322,8 +329,7 @@ struct nfsd4_session {
>         struct nfsd4_channel_attrs se_fchannel;
>         struct nfsd4_cb_sec     se_cb_sec;
>         struct list_head        se_conns;
> -       u32                     se_cb_prog;
> -       u32                     se_cb_seq_nr;
> +       u32                     se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
>         struct nfsd4_slot       *se_slots[];    /* forward channel slots */
>  };
>
> @@ -457,9 +463,6 @@ struct nfs4_client {
>          */
>         struct dentry           *cl_nfsd_info_dentry;
>
> -       /* for nfs41 callbacks */
> -       /* We currently support a single back channel with a single slot */
> -       unsigned long           cl_cb_slot_busy;
>         struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
>                                                 /* wait here for slots */
>         struct net              *net;
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
>                 __entry->cl_id = sid->clientid.cl_id;
>                 __entry->seqno = sid->sequence;
>                 __entry->reserved = sid->reserved;
> -               __entry->slot_seqno = session->se_cb_seq_nr;
> +               __entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
>         ),
>         TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
>                 " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
>
> ---
> base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> change-id: 20241025-bcwide-6bd7e4b63db2
>
> Best regards,
> --
> Jeff Layton <jlayton@kernel.org>
>
>
Olga Kornievskaia Nov. 9, 2024, 7:24 p.m. UTC | #5
On Wed, Nov 6, 2024 at 11:44 AM Olga Kornievskaia <aglo@umich.edu> wrote:
>
> On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > nfsd currently only uses a single slot in the callback channel, which is
> > proving to be a bottleneck in some cases. Widen the callback channel to
> > a max of 32 slots (subject to the client's target_maxreqs value).
> >
> > Change the cb_holds_slot boolean to an integer that tracks the current
> > slot number (with -1 meaning "unassigned").  Move the callback slot
> > tracking info into the session. Add a new u32 that acts as a bitmap to
> > track which slots are in use, and a u32 to track the latest callback
> > target_slotid that the client reports. To protect the new fields, add
> > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > search for the lowest slotid (using ffs()).
> >
> > Finally, convert the session->se_cb_seq_nr field into an array of
> > counters and add the necessary handling to ensure that the seqids get
> > reset at the appropriate times.
> >
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > ---
> > v3 has a bug that Olga hit in testing. This version should fix the wait
> > when the slot table is full. Olga, if you're able to test this one, it
> > would be much appreciated.
>
> I have tested this version. I can confirm that I'm not seeing the
> softlockup. But the server still does not use the lowest available
> slot. It is hard for me to describe the algorithm of picking the slot
> number (in general it still seems to be picking the next slot value,
> even though slots have been replied to). I have seen slot 0 re-used
> eventually but it seemed to be when the server came to using slot=13.
>
> The other unfortunate thing that's happening when I use these patches
> is my test case that recalling delegations and making sure that the
> state management gets handled properly (ie., the patch that I've
> submitted to fix a race between the laundromat thread and free_state)
> is not working. After all the recalls, the server still thinks it has
> revoked state. I have to debug more to figure out what's going on.
>

I haven't been able to reliably reproduce the cl_revoked list ending up
non-empty, but I have hit it, let's say 2-3 times in the 4 days that
I've been trying various things to reproduce it. Hence my attempt at
changing the number of callback session slots (and hitting a kernel
oops). Still trying.

Also another comment is that I don't see having multiple slots help
with the issue of having numerous recalls that end up resulting in 6
RPC exchanges I've described earlier.

Instead, what I see is that when the server starts setting the SEQUENCE
flag for revocable state, the CB_RECALLs start getting ERR_DELAY errors
(note there aren't multiple callbacks in flight, perhaps at most 2). So
it seems like things are "slowing down" even further. There are about
2-3 CB_RECALLs, with the 3rd getting the reply, then an OPEN which gets
BAD_STATEID, then TEST_STATEID, FREE_STATEID, and then OPEN.

> > ---
> > Changes in v4:
> > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> >
> > Changes in v3:
> > - add patch to convert se_flags to single se_dead bool
> > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > - don't reject target highest slot value of 0
> > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> >
> > Changes in v2:
> > - take cl_lock when fetching fields from session to be encoded
> > - use fls() instead of bespoke highest_unset_index()
> > - rename variables in several functions with more descriptive names
> > - clamp limit of for loop in update_cb_slot_table()
> > - re-add missing rpc_wake_up_queued_task() call
> > - fix slotid check in decode_cb_sequence4resok()
> > - add new per-session spinlock
> > ---
> >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> >  fs/nfsd/nfs4state.c    |  11 +++--
> >  fs/nfsd/state.h        |  15 ++++---
> >  fs/nfsd/trace.h        |   2 +-
> >  4 files changed, 101 insertions(+), 40 deletions(-)
> >
> > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > --- a/fs/nfsd/nfs4callback.c
> > +++ b/fs/nfsd/nfs4callback.c
> > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> >         hdr->nops++;
> >  }
> >
> > +static u32 highest_slotid(struct nfsd4_session *ses)
> > +{
> > +       u32 idx;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       idx = fls(~ses->se_cb_slot_avail);
> > +       if (idx > 0)
> > +               --idx;
> > +       idx = max(idx, ses->se_cb_highest_slot);
> > +       spin_unlock(&ses->se_lock);
> > +       return idx;
> > +}
> > +
> >  /*
> >   * CB_SEQUENCE4args
> >   *
> > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> >         encode_sessionid4(xdr, session);
> >
> >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > -       *p++ = xdr_zero;                        /* csa_slotid */
> > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> >         *p++ = xdr_zero;                        /* csa_cachethis */
> >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> >
> >         hdr->nops++;
> >  }
> >
> > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > +{
> > +       /* No need to do anything if nothing changed */
> > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > +               return;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       if (target > ses->se_cb_highest_slot) {
> > +               int i;
> > +
> > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > +
> > +               /* Growing the slot table. Reset any new sequences to 1 */
> > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > +                       ses->se_cb_seq_nr[i] = 1;
> > +       }
> > +       ses->se_cb_highest_slot = target;
> > +       spin_unlock(&ses->se_lock);
> > +}
> > +
> >  /*
> >   * CB_SEQUENCE4resok
> >   *
> > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> >         int status = -ESERVERFAULT;
> >         __be32 *p;
> > -       u32 dummy;
> > +       u32 seqid, slotid, target;
> >
> >         /*
> >          * If the server returns different values for sessionID, slotID or
> > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> >         }
> >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> >
> > -       dummy = be32_to_cpup(p++);
> > -       if (dummy != session->se_cb_seq_nr) {
> > +       seqid = be32_to_cpup(p++);
> > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> >                 goto out;
> >         }
> >
> > -       dummy = be32_to_cpup(p++);
> > -       if (dummy != 0) {
> > +       slotid = be32_to_cpup(p++);
> > +       if (slotid != cb->cb_held_slot) {
> >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> >                 goto out;
> >         }
> >
> > -       /*
> > -        * FIXME: process highest slotid and target highest slotid
> > -        */
> > +       p++; // ignore current highest slot value
> > +
> > +       target = be32_to_cpup(p++);
> > +       update_cb_slot_table(session, target);
> >         status = 0;
> >  out:
> >         cb->cb_seq_status = status;
> > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> >         spin_unlock(&clp->cl_lock);
> >  }
> >
> > +static int grab_slot(struct nfsd4_session *ses)
> > +{
> > +       int idx;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > +               spin_unlock(&ses->se_lock);
> > +               return -1;
> > +       }
> > +       /* clear the bit for the slot */
> > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > +       spin_unlock(&ses->se_lock);
> > +       return idx;
> > +}
> > +
> >  /*
> >   * There's currently a single callback channel slot.
> >   * If the slot is available, then mark it busy.  Otherwise, set the
> > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> >  {
> >         struct nfs4_client *clp = cb->cb_clp;
> > +       struct nfsd4_session *ses = clp->cl_cb_session;
> >
> > -       if (!cb->cb_holds_slot &&
> > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > +       if (cb->cb_held_slot >= 0)
> > +               return true;
> > +       cb->cb_held_slot = grab_slot(ses);
> > +       if (cb->cb_held_slot < 0) {
> >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> >                 /* Race breaker */
> > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > -                       dprintk("%s slot is busy\n", __func__);
> > +               cb->cb_held_slot = grab_slot(ses);
> > +               if (cb->cb_held_slot < 0)
> >                         return false;
> > -               }
> >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> >         }
> > -       cb->cb_holds_slot = true;
> >         return true;
> >  }
> >
> >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> >  {
> >         struct nfs4_client *clp = cb->cb_clp;
> > +       struct nfsd4_session *ses = clp->cl_cb_session;
> >
> > -       if (cb->cb_holds_slot) {
> > -               cb->cb_holds_slot = false;
> > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > +       if (cb->cb_held_slot >= 0) {
> > +               spin_lock(&ses->se_lock);
> > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > +               spin_unlock(&ses->se_lock);
> > +               cb->cb_held_slot = -1;
> >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> >         }
> >  }
> > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> >  }
> >
> >  /*
> > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > - * slots, and mark callback channel down on communication errors.
> > + * TODO: cb_sequence should support referring call lists, cachethis,
> > + * and mark callback channel down on communication errors.
> >   */
> >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> >  {
> > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >                 return true;
> >         }
> >
> > -       if (!cb->cb_holds_slot)
> > +       if (cb->cb_held_slot < 0)
> >                 goto need_restart;
> >
> >         /* This is the operation status code for CB_SEQUENCE */
> > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >                  * If CB_SEQUENCE returns an error, then the state of the slot
> >                  * (sequence ID, cached reply) MUST NOT change.
> >                  */
> > -               ++session->se_cb_seq_nr;
> > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> >                 break;
> >         case -ESERVERFAULT:
> > -               ++session->se_cb_seq_nr;
> > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> >                 nfsd4_mark_cb_fault(cb->cb_clp);
> >                 ret = false;
> >                 break;
> > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >         case -NFS4ERR_BADSLOT:
> >                 goto retry_nowait;
> >         case -NFS4ERR_SEQ_MISORDERED:
> > -               if (session->se_cb_seq_nr != 1) {
> > -                       session->se_cb_seq_nr = 1;
> > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> >                         goto retry_nowait;
> >                 }
> >                 break;
> >         default:
> >                 nfsd4_mark_cb_fault(cb->cb_clp);
> >         }
> > -       nfsd41_cb_release_slot(cb);
> > -
> >         trace_nfsd_cb_free_slot(task, cb);
> > +       nfsd41_cb_release_slot(cb);
> >
> >         if (RPC_SIGNALLED(task))
> >                 goto need_restart;
> > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> >         cb->cb_status = 0;
> >         cb->cb_need_restart = false;
> > -       cb->cb_holds_slot = false;
> > +       cb->cb_held_slot = -1;
> >  }
> >
> >  /**
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> >         }
> >
> >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > +       new->se_cb_slot_avail = ~0U;
> > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > +       spin_lock_init(&new->se_lock);
> >         return new;
> >  out_free:
> >         while (i--)
> > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> >
> >         INIT_LIST_HEAD(&new->se_conns);
> >
> > -       new->se_cb_seq_nr = 1;
> > +       atomic_set(&new->se_ref, 0);
> >         new->se_dead = false;
> >         new->se_cb_prog = cses->callback_prog;
> >         new->se_cb_sec = cses->cb_sec;
> > -       atomic_set(&new->se_ref, 0);
> > +
> > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > +               new->se_cb_seq_nr[idx] = 1;
> > +
> >         idx = hash_sessionid(&new->se_sessionid);
> >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> >         spin_lock(&clp->cl_lock);
> > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> >         kref_init(&clp->cl_nfsdfs.cl_ref);
> >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> >         clp->cl_time = ktime_get_boottime_seconds();
> > -       clear_bit(0, &clp->cl_cb_slot_busy);
> >         copy_verf(clp, verf);
> >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> >         clp->cl_cb_session = NULL;
> > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > --- a/fs/nfsd/state.h
> > +++ b/fs/nfsd/state.h
> > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> >         struct work_struct cb_work;
> >         int cb_seq_status;
> >         int cb_status;
> > +       int cb_held_slot;
> >         bool cb_need_restart;
> > -       bool cb_holds_slot;
> >  };
> >
> >  struct nfsd4_callback_ops {
> > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> >         unsigned char cn_flags;
> >  };
> >
> > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > +
> >  /*
> >   * Representation of a v4.1+ session. These are refcounted in a similar fashion
> >   * to the nfs4_client. References are only taken when the server is actively
> > @@ -314,6 +317,10 @@ struct nfsd4_conn {
> >   */
> >  struct nfsd4_session {
> >         atomic_t                se_ref;
> > +       spinlock_t              se_lock;
> > +       u32                     se_cb_slot_avail; /* bitmap of available slots */
> > +       u32                     se_cb_highest_slot;     /* highest slot client wants */
> > +       u32                     se_cb_prog;
> >         bool                    se_dead;
> >         struct list_head        se_hash;        /* hash by sessionid */
> >         struct list_head        se_perclnt;
> > @@ -322,8 +329,7 @@ struct nfsd4_session {
> >         struct nfsd4_channel_attrs se_fchannel;
> >         struct nfsd4_cb_sec     se_cb_sec;
> >         struct list_head        se_conns;
> > -       u32                     se_cb_prog;
> > -       u32                     se_cb_seq_nr;
> > +       u32                     se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
> >         struct nfsd4_slot       *se_slots[];    /* forward channel slots */
> >  };
> >
> > @@ -457,9 +463,6 @@ struct nfs4_client {
> >          */
> >         struct dentry           *cl_nfsd_info_dentry;
> >
> > -       /* for nfs41 callbacks */
> > -       /* We currently support a single back channel with a single slot */
> > -       unsigned long           cl_cb_slot_busy;
> >         struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
> >                                                 /* wait here for slots */
> >         struct net              *net;
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
> >                 __entry->cl_id = sid->clientid.cl_id;
> >                 __entry->seqno = sid->sequence;
> >                 __entry->reserved = sid->reserved;
> > -               __entry->slot_seqno = session->se_cb_seq_nr;
> > +               __entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
> >         ),
> >         TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
> >                 " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
> >
> > ---
> > base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> > change-id: 20241025-bcwide-6bd7e4b63db2
> >
> > Best regards,
> > --
> > Jeff Layton <jlayton@kernel.org>
> >
> >
Jeff Layton Nov. 9, 2024, 7:26 p.m. UTC | #6
On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > nfsd currently only uses a single slot in the callback channel, which is
> > proving to be a bottleneck in some cases. Widen the callback channel to
> > a max of 32 slots (subject to the client's target_maxreqs value).
> > 
> > Change the cb_holds_slot boolean to an integer that tracks the current
> > slot number (with -1 meaning "unassigned").  Move the callback slot
> > tracking info into the session. Add a new u32 that acts as a bitmap to
> > track which slots are in use, and a u32 to track the latest callback
> > target_slotid that the client reports. To protect the new fields, add
> > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > search for the lowest slotid (using ffs()).
> > 
> > Finally, convert the session->se_cb_seq_nr field into an array of
> > counters and add the necessary handling to ensure that the seqids get
> > reset at the appropriate times.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > ---
> > v3 has a bug that Olga hit in testing. This version should fix the wait
> > when the slot table is full. Olga, if you're able to test this one, it
> > would be much appreciated.
> > ---
> > Changes in v4:
> > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > 
> > Changes in v3:
> > - add patch to convert se_flags to single se_dead bool
> > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > - don't reject target highest slot value of 0
> > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > 
> > Changes in v2:
> > - take cl_lock when fetching fields from session to be encoded
> > - use fls() instead of bespoke highest_unset_index()
> > - rename variables in several functions with more descriptive names
> > - clamp limit of for loop in update_cb_slot_table()
> > - re-add missing rpc_wake_up_queued_task() call
> > - fix slotid check in decode_cb_sequence4resok()
> > - add new per-session spinlock
> > ---
> >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> >  fs/nfsd/nfs4state.c    |  11 +++--
> >  fs/nfsd/state.h        |  15 ++++---
> >  fs/nfsd/trace.h        |   2 +-
> >  4 files changed, 101 insertions(+), 40 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > --- a/fs/nfsd/nfs4callback.c
> > +++ b/fs/nfsd/nfs4callback.c
> > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> >         hdr->nops++;
> >  }
> > 
> > +static u32 highest_slotid(struct nfsd4_session *ses)
> > +{
> > +       u32 idx;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       idx = fls(~ses->se_cb_slot_avail);
> > +       if (idx > 0)
> > +               --idx;
> > +       idx = max(idx, ses->se_cb_highest_slot);
> > +       spin_unlock(&ses->se_lock);
> > +       return idx;
> > +}
> > +
> >  /*
> >   * CB_SEQUENCE4args
> >   *
> > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> >         encode_sessionid4(xdr, session);
> > 
> >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > -       *p++ = xdr_zero;                        /* csa_slotid */
> > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> >         *p++ = xdr_zero;                        /* csa_cachethis */
> >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > 
> >         hdr->nops++;
> >  }
> > 
> > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > +{
> > +       /* No need to do anything if nothing changed */
> > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > +               return;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       if (target > ses->se_cb_highest_slot) {
> > +               int i;
> > +
> > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > +
> > +               /* Growing the slot table. Reset any new sequences to 1 */
> > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > +                       ses->se_cb_seq_nr[i] = 1;
> > +       }
> > +       ses->se_cb_highest_slot = target;
> > +       spin_unlock(&ses->se_lock);
> > +}
> > +
> >  /*
> >   * CB_SEQUENCE4resok
> >   *
> > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> >         int status = -ESERVERFAULT;
> >         __be32 *p;
> > -       u32 dummy;
> > +       u32 seqid, slotid, target;
> > 
> >         /*
> >          * If the server returns different values for sessionID, slotID or
> > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> >         }
> >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > 
> > -       dummy = be32_to_cpup(p++);
> > -       if (dummy != session->se_cb_seq_nr) {
> > +       seqid = be32_to_cpup(p++);
> > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> >                 goto out;
> >         }
> > 
> > -       dummy = be32_to_cpup(p++);
> > -       if (dummy != 0) {
> > +       slotid = be32_to_cpup(p++);
> > +       if (slotid != cb->cb_held_slot) {
> >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> >                 goto out;
> >         }
> > 
> > -       /*
> > -        * FIXME: process highest slotid and target highest slotid
> > -        */
> > +       p++; // ignore current highest slot value
> > +
> > +       target = be32_to_cpup(p++);
> > +       update_cb_slot_table(session, target);
> >         status = 0;
> >  out:
> >         cb->cb_seq_status = status;
> > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> >         spin_unlock(&clp->cl_lock);
> >  }
> > 
> > +static int grab_slot(struct nfsd4_session *ses)
> > +{
> > +       int idx;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > +               spin_unlock(&ses->se_lock);
> > +               return -1;
> > +       }
> > +       /* clear the bit for the slot */
> > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > +       spin_unlock(&ses->se_lock);
> > +       return idx;
> > +}
> > +
> >  /*
> >   * There's currently a single callback channel slot.
> >   * If the slot is available, then mark it busy.  Otherwise, set the
> > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> >  {
> >         struct nfs4_client *clp = cb->cb_clp;
> > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > 
> > -       if (!cb->cb_holds_slot &&
> > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > +       if (cb->cb_held_slot >= 0)
> > +               return true;
> > +       cb->cb_held_slot = grab_slot(ses);
> > +       if (cb->cb_held_slot < 0) {
> >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> >                 /* Race breaker */
> > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > -                       dprintk("%s slot is busy\n", __func__);
> > +               cb->cb_held_slot = grab_slot(ses);
> > +               if (cb->cb_held_slot < 0)
> >                         return false;
> > -               }
> >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> >         }
> > -       cb->cb_holds_slot = true;
> >         return true;
> >  }
> > 
> >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> >  {
> >         struct nfs4_client *clp = cb->cb_clp;
> > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > 
> > -       if (cb->cb_holds_slot) {
> > -               cb->cb_holds_slot = false;
> > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > +       if (cb->cb_held_slot >= 0) {
> > +               spin_lock(&ses->se_lock);
> > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > +               spin_unlock(&ses->se_lock);
> > +               cb->cb_held_slot = -1;
> >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> >         }
> >  }
> > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> >  }
> > 
> >  /*
> > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > - * slots, and mark callback channel down on communication errors.
> > + * TODO: cb_sequence should support referring call lists, cachethis,
> > + * and mark callback channel down on communication errors.
> >   */
> >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> >  {
> > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >                 return true;
> >         }
> > 
> > -       if (!cb->cb_holds_slot)
> > +       if (cb->cb_held_slot < 0)
> >                 goto need_restart;
> > 
> >         /* This is the operation status code for CB_SEQUENCE */
> > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >                  * If CB_SEQUENCE returns an error, then the state of the slot
> >                  * (sequence ID, cached reply) MUST NOT change.
> >                  */
> > -               ++session->se_cb_seq_nr;
> > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> >                 break;
> >         case -ESERVERFAULT:
> > -               ++session->se_cb_seq_nr;
> > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> >                 nfsd4_mark_cb_fault(cb->cb_clp);
> >                 ret = false;
> >                 break;
> > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >         case -NFS4ERR_BADSLOT:
> >                 goto retry_nowait;
> >         case -NFS4ERR_SEQ_MISORDERED:
> > -               if (session->se_cb_seq_nr != 1) {
> > -                       session->se_cb_seq_nr = 1;
> > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> >                         goto retry_nowait;
> >                 }
> >                 break;
> >         default:
> >                 nfsd4_mark_cb_fault(cb->cb_clp);
> >         }
> > -       nfsd41_cb_release_slot(cb);
> > -
> >         trace_nfsd_cb_free_slot(task, cb);
> > +       nfsd41_cb_release_slot(cb);
> > 
> >         if (RPC_SIGNALLED(task))
> >                 goto need_restart;
> > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> >         cb->cb_status = 0;
> >         cb->cb_need_restart = false;
> > -       cb->cb_holds_slot = false;
> > +       cb->cb_held_slot = -1;
> >  }
> > 
> >  /**
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> >         }
> > 
> >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > +       new->se_cb_slot_avail = ~0U;
> > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > +       spin_lock_init(&new->se_lock);
> >         return new;
> >  out_free:
> >         while (i--)
> > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > 
> >         INIT_LIST_HEAD(&new->se_conns);
> > 
> > -       new->se_cb_seq_nr = 1;
> > +       atomic_set(&new->se_ref, 0);
> >         new->se_dead = false;
> >         new->se_cb_prog = cses->callback_prog;
> >         new->se_cb_sec = cses->cb_sec;
> > -       atomic_set(&new->se_ref, 0);
> > +
> > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > +               new->se_cb_seq_nr[idx] = 1;
> > +
> >         idx = hash_sessionid(&new->se_sessionid);
> >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> >         spin_lock(&clp->cl_lock);
> > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> >         kref_init(&clp->cl_nfsdfs.cl_ref);
> >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> >         clp->cl_time = ktime_get_boottime_seconds();
> > -       clear_bit(0, &clp->cl_cb_slot_busy);
> >         copy_verf(clp, verf);
> >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> >         clp->cl_cb_session = NULL;
> > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > --- a/fs/nfsd/state.h
> > +++ b/fs/nfsd/state.h
> > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> >         struct work_struct cb_work;
> >         int cb_seq_status;
> >         int cb_status;
> > +       int cb_held_slot;
> >         bool cb_need_restart;
> > -       bool cb_holds_slot;
> >  };
> > 
> >  struct nfsd4_callback_ops {
> > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> >         unsigned char cn_flags;
> >  };
> > 
> > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> 
> Are there some values that are known not to work? I was experimenting
> with values and set it to 2 and 4 and the kernel oopsed. I understand
> it's not a configurable value but it would still be good to know the
> expectations...
>
> [  198.625021] Unable to handle kernel paging request at virtual
> address dfff800020000000
> [  198.625870] KASAN: probably user-memory-access in range
> [0x0000000100000000-0x0000000100000007]
> [  198.626444] Mem abort info:
> [  198.626630]   ESR = 0x0000000096000005
> [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> [  198.627234]   SET = 0, FnV = 0
> [  198.627441]   EA = 0, S1PTW = 0
> [  198.627627]   FSC = 0x05: level 1 translation fault
> [  198.627859] Data abort info:
> [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> [  198.628967] [dfff800020000000] address between user and kernel address ranges
> [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> nvme_auth sr_mod cdrom e1000e sg fuse
> [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> tainted 6.12.0-rc6+ #47
> [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> VMW201.00V.21805430.BA64.2305221830 05/22/2023
> [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> [  198.636065] sp : ffff8000884977e0
> [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> [  198.640332] Call trace:
> [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> [  198.642346]  nfsd+0x270/0x400 [nfsd]
> [  198.642562]  kthread+0x288/0x310
> [  198.642745]  ret_from_fork+0x10/0x20
> [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> [  198.643267] SMP: stopping secondary CPUs
> 
> 
> 


Good catch. I think the problem here is that we don't currently cap the
initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
this patch prevent the panic?

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3afe56ab9e0a..839be4ba765a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
 
 	memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
 	new->se_cb_slot_avail = ~0U;
-	new->se_cb_highest_slot = battrs->maxreqs - 1;
+	new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
 	spin_lock_init(&new->se_lock);
 	return new;
 out_free:
Jeff Layton Nov. 9, 2024, 9:10 p.m. UTC | #7
On Sat, 2024-11-09 at 14:24 -0500, Olga Kornievskaia wrote:
> On Wed, Nov 6, 2024 at 11:44 AM Olga Kornievskaia <aglo@umich.edu> wrote:
> > 
> > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > 
> > > nfsd currently only uses a single slot in the callback channel, which is
> > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > 
> > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > track which slots are in use, and a u32 to track the latest callback
> > > target_slotid that the client reports. To protect the new fields, add
> > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > search for the lowest slotid (using ffs()).
> > > 
> > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > counters and add the necessary handling to ensure that the seqids get
> > > reset at the appropriate times.
> > > 
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > ---
> > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > when the slot table is full. Olga, if you're able to test this one, it
> > > would be much appreciated.
> > 
> > I have tested this version. I can confirm that I'm not seeing the
> > softlockup. But the server still does not use the lowest available
> > slot. It is hard for me to describe the algorithm of picking the slot
> > number (in general it still seems to be picking the next slot value,
> > even though slots have been replied to). I have seen slot 0 re-used
> > eventually but it seemed to be when the server came to using slot=13.
> > 
> > The other unfortunate thing that's happening when I use these patches
> > is that my test case, which recalls delegations and makes sure that the
> > state management gets handled properly (i.e., the patch that I've
> > submitted to fix a race between the laundromat thread and free_state),
> > is not working. After all the recalls, the server still thinks it has
> > revoked state. I have to debug more to figure out what's going on.
> > 
> 
> I haven't been able to reliably reproduce the cl_revoked list ending up
> non-empty, but I have hit it, let's say 2-3 times in the 4 days that I've
> been trying various things to reproduce it. Hence my attempt at
> changing the number of callback session slots (and hitting a kernel
> oops). Still trying.
> 
> Also another comment is that I don't see having multiple slots help
> with the issue of having numerous recalls that end up resulting in 6
> RPC exchanges I've described earlier.
> 
> Instead what I see is when the server starts setting the SEQUENCE flag
> of revocable state
> 

Which flag?

> , then the CB_RECALLs are getting ERR_DELAY errors
> (note there aren't multiple callbacks in flight, perhaps at most 2). So
> it seems like things are "slowing down" even further. There are about
> 2-3 CB_RECALLs, the 3rd getting the reply, then an OPEN which gets
> BAD_STATEID, then TEST_STATEID, FREE_STATEID, and then OPEN.
> 

Sounds like a client-side capacity issue? nfs4_callback_recall()
returns NFS4ERR_DELAY when nfs_delegation_find_inode() returns -EAGAIN.
Maybe there is something weird going on there? Eventually the server
has no choice but to revoke an unreturned delegation.


> > > Changes in v4:
> > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > 
> > > Changes in v3:
> > > - add patch to convert se_flags to single se_dead bool
> > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > - don't reject target highest slot value of 0
> > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > 
> > > Changes in v2:
> > > - take cl_lock when fetching fields from session to be encoded
> > > - use fls() instead of bespoke highest_unset_index()
> > > - rename variables in several functions with more descriptive names
> > > - clamp limit of for loop in update_cb_slot_table()
> > > - re-add missing rpc_wake_up_queued_task() call
> > > - fix slotid check in decode_cb_sequence4resok()
> > > - add new per-session spinlock
> > > ---
> > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > >  fs/nfsd/nfs4state.c    |  11 +++--
> > >  fs/nfsd/state.h        |  15 ++++---
> > >  fs/nfsd/trace.h        |   2 +-
> > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > 
> > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > --- a/fs/nfsd/nfs4callback.c
> > > +++ b/fs/nfsd/nfs4callback.c
> > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > >         hdr->nops++;
> > >  }
> > > 
> > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > +{
> > > +       u32 idx;
> > > +
> > > +       spin_lock(&ses->se_lock);
> > > +       idx = fls(~ses->se_cb_slot_avail);
> > > +       if (idx > 0)
> > > +               --idx;
> > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > +       spin_unlock(&ses->se_lock);
> > > +       return idx;
> > > +}
> > > +
> > >  /*
> > >   * CB_SEQUENCE4args
> > >   *
> > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > >         encode_sessionid4(xdr, session);
> > > 
> > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > 
> > >         hdr->nops++;
> > >  }
> > > 
> > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > +{
> > > +       /* No need to do anything if nothing changed */
> > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > +               return;
> > > +
> > > +       spin_lock(&ses->se_lock);
> > > +       if (target > ses->se_cb_highest_slot) {
> > > +               int i;
> > > +
> > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > +
> > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > +                       ses->se_cb_seq_nr[i] = 1;
> > > +       }
> > > +       ses->se_cb_highest_slot = target;
> > > +       spin_unlock(&ses->se_lock);
> > > +}
> > > +
> > >  /*
> > >   * CB_SEQUENCE4resok
> > >   *
> > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > >         int status = -ESERVERFAULT;
> > >         __be32 *p;
> > > -       u32 dummy;
> > > +       u32 seqid, slotid, target;
> > > 
> > >         /*
> > >          * If the server returns different values for sessionID, slotID or
> > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > >         }
> > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > 
> > > -       dummy = be32_to_cpup(p++);
> > > -       if (dummy != session->se_cb_seq_nr) {
> > > +       seqid = be32_to_cpup(p++);
> > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > >                 goto out;
> > >         }
> > > 
> > > -       dummy = be32_to_cpup(p++);
> > > -       if (dummy != 0) {
> > > +       slotid = be32_to_cpup(p++);
> > > +       if (slotid != cb->cb_held_slot) {
> > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > >                 goto out;
> > >         }
> > > 
> > > -       /*
> > > -        * FIXME: process highest slotid and target highest slotid
> > > -        */
> > > +       p++; // ignore current highest slot value
> > > +
> > > +       target = be32_to_cpup(p++);
> > > +       update_cb_slot_table(session, target);
> > >         status = 0;
> > >  out:
> > >         cb->cb_seq_status = status;
> > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > >         spin_unlock(&clp->cl_lock);
> > >  }
> > > 
> > > +static int grab_slot(struct nfsd4_session *ses)
> > > +{
> > > +       int idx;
> > > +
> > > +       spin_lock(&ses->se_lock);
> > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > +               spin_unlock(&ses->se_lock);
> > > +               return -1;
> > > +       }
> > > +       /* clear the bit for the slot */
> > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > +       spin_unlock(&ses->se_lock);
> > > +       return idx;
> > > +}
> > > +
> > >  /*
> > >   * There's currently a single callback channel slot.
> > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > >  {
> > >         struct nfs4_client *clp = cb->cb_clp;
> > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > 
> > > -       if (!cb->cb_holds_slot &&
> > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > +       if (cb->cb_held_slot >= 0)
> > > +               return true;
> > > +       cb->cb_held_slot = grab_slot(ses);
> > > +       if (cb->cb_held_slot < 0) {
> > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > >                 /* Race breaker */
> > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > -                       dprintk("%s slot is busy\n", __func__);
> > > +               cb->cb_held_slot = grab_slot(ses);
> > > +               if (cb->cb_held_slot < 0)
> > >                         return false;
> > > -               }
> > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > >         }
> > > -       cb->cb_holds_slot = true;
> > >         return true;
> > >  }
> > > 
> > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > >  {
> > >         struct nfs4_client *clp = cb->cb_clp;
> > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > 
> > > -       if (cb->cb_holds_slot) {
> > > -               cb->cb_holds_slot = false;
> > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > +       if (cb->cb_held_slot >= 0) {
> > > +               spin_lock(&ses->se_lock);
> > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > +               spin_unlock(&ses->se_lock);
> > > +               cb->cb_held_slot = -1;
> > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > >         }
> > >  }
> > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > >  }
> > > 
> > >  /*
> > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > - * slots, and mark callback channel down on communication errors.
> > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > + * and mark callback channel down on communication errors.
> > >   */
> > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > >  {
> > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > >                 return true;
> > >         }
> > > 
> > > -       if (!cb->cb_holds_slot)
> > > +       if (cb->cb_held_slot < 0)
> > >                 goto need_restart;
> > > 
> > >         /* This is the operation status code for CB_SEQUENCE */
> > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > >                  * (sequence ID, cached reply) MUST NOT change.
> > >                  */
> > > -               ++session->se_cb_seq_nr;
> > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > >                 break;
> > >         case -ESERVERFAULT:
> > > -               ++session->se_cb_seq_nr;
> > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > >                 ret = false;
> > >                 break;
> > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > >         case -NFS4ERR_BADSLOT:
> > >                 goto retry_nowait;
> > >         case -NFS4ERR_SEQ_MISORDERED:
> > > -               if (session->se_cb_seq_nr != 1) {
> > > -                       session->se_cb_seq_nr = 1;
> > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > >                         goto retry_nowait;
> > >                 }
> > >                 break;
> > >         default:
> > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > >         }
> > > -       nfsd41_cb_release_slot(cb);
> > > -
> > >         trace_nfsd_cb_free_slot(task, cb);
> > > +       nfsd41_cb_release_slot(cb);
> > > 
> > >         if (RPC_SIGNALLED(task))
> > >                 goto need_restart;
> > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > >         cb->cb_status = 0;
> > >         cb->cb_need_restart = false;
> > > -       cb->cb_holds_slot = false;
> > > +       cb->cb_held_slot = -1;
> > >  }
> > > 
> > >  /**
> > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > --- a/fs/nfsd/nfs4state.c
> > > +++ b/fs/nfsd/nfs4state.c
> > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > >         }
> > > 
> > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > +       new->se_cb_slot_avail = ~0U;
> > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > +       spin_lock_init(&new->se_lock);
> > >         return new;
> > >  out_free:
> > >         while (i--)
> > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > 
> > >         INIT_LIST_HEAD(&new->se_conns);
> > > 
> > > -       new->se_cb_seq_nr = 1;
> > > +       atomic_set(&new->se_ref, 0);
> > >         new->se_dead = false;
> > >         new->se_cb_prog = cses->callback_prog;
> > >         new->se_cb_sec = cses->cb_sec;
> > > -       atomic_set(&new->se_ref, 0);
> > > +
> > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > +               new->se_cb_seq_nr[idx] = 1;
> > > +
> > >         idx = hash_sessionid(&new->se_sessionid);
> > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > >         spin_lock(&clp->cl_lock);
> > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > >         clp->cl_time = ktime_get_boottime_seconds();
> > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > >         copy_verf(clp, verf);
> > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > >         clp->cl_cb_session = NULL;
> > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > --- a/fs/nfsd/state.h
> > > +++ b/fs/nfsd/state.h
> > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > >         struct work_struct cb_work;
> > >         int cb_seq_status;
> > >         int cb_status;
> > > +       int cb_held_slot;
> > >         bool cb_need_restart;
> > > -       bool cb_holds_slot;
> > >  };
> > > 
> > >  struct nfsd4_callback_ops {
> > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > >         unsigned char cn_flags;
> > >  };
> > > 
> > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > +
> > >  /*
> > >   * Representation of a v4.1+ session. These are refcounted in a similar fashion
> > >   * to the nfs4_client. References are only taken when the server is actively
> > > @@ -314,6 +317,10 @@ struct nfsd4_conn {
> > >   */
> > >  struct nfsd4_session {
> > >         atomic_t                se_ref;
> > > +       spinlock_t              se_lock;
> > > +       u32                     se_cb_slot_avail; /* bitmap of available slots */
> > > +       u32                     se_cb_highest_slot;     /* highest slot client wants */
> > > +       u32                     se_cb_prog;
> > >         bool                    se_dead;
> > >         struct list_head        se_hash;        /* hash by sessionid */
> > >         struct list_head        se_perclnt;
> > > @@ -322,8 +329,7 @@ struct nfsd4_session {
> > >         struct nfsd4_channel_attrs se_fchannel;
> > >         struct nfsd4_cb_sec     se_cb_sec;
> > >         struct list_head        se_conns;
> > > -       u32                     se_cb_prog;
> > > -       u32                     se_cb_seq_nr;
> > > +       u32                     se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
> > >         struct nfsd4_slot       *se_slots[];    /* forward channel slots */
> > >  };
> > > 
> > > @@ -457,9 +463,6 @@ struct nfs4_client {
> > >          */
> > >         struct dentry           *cl_nfsd_info_dentry;
> > > 
> > > -       /* for nfs41 callbacks */
> > > -       /* We currently support a single back channel with a single slot */
> > > -       unsigned long           cl_cb_slot_busy;
> > >         struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
> > >                                                 /* wait here for slots */
> > >         struct net              *net;
> > > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> > > --- a/fs/nfsd/trace.h
> > > +++ b/fs/nfsd/trace.h
> > > @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
> > >                 __entry->cl_id = sid->clientid.cl_id;
> > >                 __entry->seqno = sid->sequence;
> > >                 __entry->reserved = sid->reserved;
> > > -               __entry->slot_seqno = session->se_cb_seq_nr;
> > > +               __entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
> > >         ),
> > >         TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
> > >                 " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
> > > 
> > > ---
> > > base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> > > change-id: 20241025-bcwide-6bd7e4b63db2
> > > 
> > > Best regards,
> > > --
> > > Jeff Layton <jlayton@kernel.org>
> > > 
> > >
Olga Kornievskaia Nov. 11, 2024, 2:19 a.m. UTC | #8
On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > >
> > > nfsd currently only uses a single slot in the callback channel, which is
> > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > a max of 32 slots (subject to the client's target_maxreqs value).
> > >
> > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > track which slots are in use, and a u32 to track the latest callback
> > > target_slotid that the client reports. To protect the new fields, add
> > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > search for the lowest slotid (using ffs()).
> > >
> > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > counters and add the necessary handling to ensure that the seqids get
> > > reset at the appropriate times.
> > >
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > ---
> > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > when the slot table is full. Olga, if you're able to test this one, it
> > > would be much appreciated.
> > > ---
> > > Changes in v4:
> > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > >
> > > Changes in v3:
> > > - add patch to convert se_flags to single se_dead bool
> > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > - don't reject target highest slot value of 0
> > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > >
> > > Changes in v2:
> > > - take cl_lock when fetching fields from session to be encoded
> > > - use fls() instead of bespoke highest_unset_index()
> > > - rename variables in several functions with more descriptive names
> > > - clamp limit of for loop in update_cb_slot_table()
> > > - re-add missing rpc_wake_up_queued_task() call
> > > - fix slotid check in decode_cb_sequence4resok()
> > > - add new per-session spinlock
> > > ---
> > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > >  fs/nfsd/nfs4state.c    |  11 +++--
> > >  fs/nfsd/state.h        |  15 ++++---
> > >  fs/nfsd/trace.h        |   2 +-
> > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > >
> > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > --- a/fs/nfsd/nfs4callback.c
> > > +++ b/fs/nfsd/nfs4callback.c
> > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > >         hdr->nops++;
> > >  }
> > >
> > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > +{
> > > +       u32 idx;
> > > +
> > > +       spin_lock(&ses->se_lock);
> > > +       idx = fls(~ses->se_cb_slot_avail);
> > > +       if (idx > 0)
> > > +               --idx;
> > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > +       spin_unlock(&ses->se_lock);
> > > +       return idx;
> > > +}
> > > +
> > >  /*
> > >   * CB_SEQUENCE4args
> > >   *
> > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > >         encode_sessionid4(xdr, session);
> > >
> > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > >
> > >         hdr->nops++;
> > >  }
> > >
> > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > +{
> > > +       /* No need to do anything if nothing changed */
> > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > +               return;
> > > +
> > > +       spin_lock(&ses->se_lock);
> > > +       if (target > ses->se_cb_highest_slot) {
> > > +               int i;
> > > +
> > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > +
> > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > +                       ses->se_cb_seq_nr[i] = 1;
> > > +       }
> > > +       ses->se_cb_highest_slot = target;
> > > +       spin_unlock(&ses->se_lock);
> > > +}
> > > +
> > >  /*
> > >   * CB_SEQUENCE4resok
> > >   *
> > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > >         int status = -ESERVERFAULT;
> > >         __be32 *p;
> > > -       u32 dummy;
> > > +       u32 seqid, slotid, target;
> > >
> > >         /*
> > >          * If the server returns different values for sessionID, slotID or
> > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > >         }
> > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > >
> > > -       dummy = be32_to_cpup(p++);
> > > -       if (dummy != session->se_cb_seq_nr) {
> > > +       seqid = be32_to_cpup(p++);
> > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > >                 goto out;
> > >         }
> > >
> > > -       dummy = be32_to_cpup(p++);
> > > -       if (dummy != 0) {
> > > +       slotid = be32_to_cpup(p++);
> > > +       if (slotid != cb->cb_held_slot) {
> > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > >                 goto out;
> > >         }
> > >
> > > -       /*
> > > -        * FIXME: process highest slotid and target highest slotid
> > > -        */
> > > +       p++; // ignore current highest slot value
> > > +
> > > +       target = be32_to_cpup(p++);
> > > +       update_cb_slot_table(session, target);
> > >         status = 0;
> > >  out:
> > >         cb->cb_seq_status = status;
> > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > >         spin_unlock(&clp->cl_lock);
> > >  }
> > >
> > > +static int grab_slot(struct nfsd4_session *ses)
> > > +{
> > > +       int idx;
> > > +
> > > +       spin_lock(&ses->se_lock);
> > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > +               spin_unlock(&ses->se_lock);
> > > +               return -1;
> > > +       }
> > > +       /* clear the bit for the slot */
> > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > +       spin_unlock(&ses->se_lock);
> > > +       return idx;
> > > +}
> > > +
> > >  /*
> > >   * There's currently a single callback channel slot.
> > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > >  {
> > >         struct nfs4_client *clp = cb->cb_clp;
> > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > >
> > > -       if (!cb->cb_holds_slot &&
> > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > +       if (cb->cb_held_slot >= 0)
> > > +               return true;
> > > +       cb->cb_held_slot = grab_slot(ses);
> > > +       if (cb->cb_held_slot < 0) {
> > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > >                 /* Race breaker */
> > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > -                       dprintk("%s slot is busy\n", __func__);
> > > +               cb->cb_held_slot = grab_slot(ses);
> > > +               if (cb->cb_held_slot < 0)
> > >                         return false;
> > > -               }
> > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > >         }
> > > -       cb->cb_holds_slot = true;
> > >         return true;
> > >  }
> > >
> > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > >  {
> > >         struct nfs4_client *clp = cb->cb_clp;
> > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > >
> > > -       if (cb->cb_holds_slot) {
> > > -               cb->cb_holds_slot = false;
> > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > +       if (cb->cb_held_slot >= 0) {
> > > +               spin_lock(&ses->se_lock);
> > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > +               spin_unlock(&ses->se_lock);
> > > +               cb->cb_held_slot = -1;
> > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > >         }
> > >  }
> > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > >  }
> > >
> > >  /*
> > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > - * slots, and mark callback channel down on communication errors.
> > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > + * and mark callback channel down on communication errors.
> > >   */
> > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > >  {
> > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > >                 return true;
> > >         }
> > >
> > > -       if (!cb->cb_holds_slot)
> > > +       if (cb->cb_held_slot < 0)
> > >                 goto need_restart;
> > >
> > >         /* This is the operation status code for CB_SEQUENCE */
> > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > >                  * (sequence ID, cached reply) MUST NOT change.
> > >                  */
> > > -               ++session->se_cb_seq_nr;
> > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > >                 break;
> > >         case -ESERVERFAULT:
> > > -               ++session->se_cb_seq_nr;
> > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > >                 ret = false;
> > >                 break;
> > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > >         case -NFS4ERR_BADSLOT:
> > >                 goto retry_nowait;
> > >         case -NFS4ERR_SEQ_MISORDERED:
> > > -               if (session->se_cb_seq_nr != 1) {
> > > -                       session->se_cb_seq_nr = 1;
> > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > >                         goto retry_nowait;
> > >                 }
> > >                 break;
> > >         default:
> > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > >         }
> > > -       nfsd41_cb_release_slot(cb);
> > > -
> > >         trace_nfsd_cb_free_slot(task, cb);
> > > +       nfsd41_cb_release_slot(cb);
> > >
> > >         if (RPC_SIGNALLED(task))
> > >                 goto need_restart;
> > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > >         cb->cb_status = 0;
> > >         cb->cb_need_restart = false;
> > > -       cb->cb_holds_slot = false;
> > > +       cb->cb_held_slot = -1;
> > >  }
> > >
> > >  /**
> > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > --- a/fs/nfsd/nfs4state.c
> > > +++ b/fs/nfsd/nfs4state.c
> > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > >         }
> > >
> > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > +       new->se_cb_slot_avail = ~0U;
> > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > +       spin_lock_init(&new->se_lock);
> > >         return new;
> > >  out_free:
> > >         while (i--)
> > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > >
> > >         INIT_LIST_HEAD(&new->se_conns);
> > >
> > > -       new->se_cb_seq_nr = 1;
> > > +       atomic_set(&new->se_ref, 0);
> > >         new->se_dead = false;
> > >         new->se_cb_prog = cses->callback_prog;
> > >         new->se_cb_sec = cses->cb_sec;
> > > -       atomic_set(&new->se_ref, 0);
> > > +
> > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > +               new->se_cb_seq_nr[idx] = 1;
> > > +
> > >         idx = hash_sessionid(&new->se_sessionid);
> > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > >         spin_lock(&clp->cl_lock);
> > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > >         clp->cl_time = ktime_get_boottime_seconds();
> > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > >         copy_verf(clp, verf);
> > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > >         clp->cl_cb_session = NULL;
> > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > --- a/fs/nfsd/state.h
> > > +++ b/fs/nfsd/state.h
> > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > >         struct work_struct cb_work;
> > >         int cb_seq_status;
> > >         int cb_status;
> > > +       int cb_held_slot;
> > >         bool cb_need_restart;
> > > -       bool cb_holds_slot;
> > >  };
> > >
> > >  struct nfsd4_callback_ops {
> > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > >         unsigned char cn_flags;
> > >  };
> > >
> > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> >
> > Are there some values that are known not to work? I was experimenting
> > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > it's not a configurable value but it would still be good to know the
> > expectations...
> >
> > [  198.625021] Unable to handle kernel paging request at virtual
> > address dfff800020000000
> > [  198.625870] KASAN: probably user-memory-access in range
> > [0x0000000100000000-0x0000000100000007]
> > [  198.626444] Mem abort info:
> > [  198.626630]   ESR = 0x0000000096000005
> > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > [  198.627234]   SET = 0, FnV = 0
> > [  198.627441]   EA = 0, S1PTW = 0
> > [  198.627627]   FSC = 0x05: level 1 translation fault
> > [  198.627859] Data abort info:
> > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > nvme_auth sr_mod cdrom e1000e sg fuse
> > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > tainted 6.12.0-rc6+ #47
> > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > [  198.636065] sp : ffff8000884977e0
> > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > [  198.640332] Call trace:
> > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > [  198.642562]  kthread+0x288/0x310
> > [  198.642745]  ret_from_fork+0x10/0x20
> > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > [  198.643267] SMP: stopping secondary CPUs
> >
> >
> >
>
>
> Good catch. I think the problem here is that we don't currently cap the
> initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> this patch prevent the panic?
>
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 3afe56ab9e0a..839be4ba765a 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
>
>         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
>         new->se_cb_slot_avail = ~0U;
> -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
>         spin_lock_init(&new->se_lock);
>         return new;
>  out_free:

It does help. I thought that the CREATE_SESSION reply for the
backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value, but
it seems that it is not. But yes, I can see that the highest
slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.
Olga Kornievskaia Nov. 11, 2024, 2:24 a.m. UTC | #9
On Sat, Nov 9, 2024 at 4:10 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Sat, 2024-11-09 at 14:24 -0500, Olga Kornievskaia wrote:
> > On Wed, Nov 6, 2024 at 11:44 AM Olga Kornievskaia <aglo@umich.edu> wrote:
> > >
> > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > >
> > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > >
> > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > track which slots are in use, and a u32 to track the latest callback
> > > > target_slotid that the client reports. To protect the new fields, add
> > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > search for the lowest slotid (using ffs()).
> > > >
> > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > counters and add the necessary handling to ensure that the seqids get
> > > > reset at the appropriate times.
> > > >
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > ---
> > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > would be much appreciated.
> > >
> > > I have tested this version. I can confirm that I'm not seeing the
> > > softlockup. But the server still does not use the lowest available
> > > slot. It is hard for me to describe the algorithm of picking the slot
> > > number (in general it still seems to be picking the next slot value,
> > > even though slots have been replied to). I have seen slot 0 re-used
> > > eventually but it seemed to be when the server came to using slot=13.
> > >
> > > > The other unfortunate thing that's happening when I use these patches
> > > > is that my test case that recalls delegations and makes sure that the
> > > > state management gets handled properly (i.e., the patch that I've
> > > > submitted to fix a race between the laundromat thread and free_state)
> > > > is not working. After all the recalls, the server still thinks it has
> > > > revoked state. I have to debug more to figure out what's going on.
> > >
> >
> > > I haven't been able to reliably reproduce the cl_revoked list ending up
> > > non-empty, but I have hit it, let's say 2-3 times in the 4 days that I've
> > > been trying various things to reproduce it. And thus my attempt at
> > > changing the number of callback session slots (and hitting a kernel
> > > oops). Still trying.
> >
> > Also another comment is that I don't see having multiple slots help
> > with the issue of having numerous recalls that end up resulting in 6
> > RPC exchanges I've described earlier.
> >
> > Instead what I see is when the server starts setting the SEQUENCE flag
> > of revocable state
> >
>
> Which flag?

When the server has state that it flags for revocation, it sets the
SEQ4_STATUS_RECALLABLE_STATE_REVOKED flag. This is expected behaviour.

> > > , then the CB_RECALLs are getting ERR_DELAY error
> > > (note that there aren't multiple callbacks in flight, perhaps at most 2). So
> > it seems like things are "slowing down" even further. There are about
> > 2-3 CB_RECALLs 3rd getting the reply then OPEN which gets BAD_STATEID,
> > then TEST_STATEID, FREE_STATEID, and then OPEN.
> >
>
> Sounds like a client-side capacity issue? nfs4_callback_recall()
> returns NFS4ERR_DELAY when nfs_delegation_find_inode() returns -EAGAIN.
> Maybe there is something weird going on there? Eventually the server
> has no choice but to revoke an unreturned delegation.

I'm not trying to imply in any manner that there is a problem with
either the server or the client side. I'm simply trying to state that there
was a theory that having multiple cb_table slots would help in the
case of having a lot of recalled/revoked state.  What I'm finding is
that it doesn't seem so (and it possibly makes things just slightly
slower, but no measurements were taken, so my focus is elsewhere).

> > > > Changes in v4:
> > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > >
> > > > Changes in v3:
> > > > - add patch to convert se_flags to single se_dead bool
> > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > - don't reject target highest slot value of 0
> > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > >
> > > > Changes in v2:
> > > > - take cl_lock when fetching fields from session to be encoded
> > > > - use fls() instead of bespoke highest_unset_index()
> > > > - rename variables in several functions with more descriptive names
> > > > - clamp limit of for loop in update_cb_slot_table()
> > > > - re-add missing rpc_wake_up_queued_task() call
> > > > - fix slotid check in decode_cb_sequence4resok()
> > > > - add new per-session spinlock
> > > > ---
> > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > >  fs/nfsd/state.h        |  15 ++++---
> > > >  fs/nfsd/trace.h        |   2 +-
> > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > >
> > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > --- a/fs/nfsd/nfs4callback.c
> > > > +++ b/fs/nfsd/nfs4callback.c
> > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > >         hdr->nops++;
> > > >  }
> > > >
> > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > +{
> > > > +       u32 idx;
> > > > +
> > > > +       spin_lock(&ses->se_lock);
> > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > +       if (idx > 0)
> > > > +               --idx;
> > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > +       spin_unlock(&ses->se_lock);
> > > > +       return idx;
> > > > +}
> > > > +
> > > >  /*
> > > >   * CB_SEQUENCE4args
> > > >   *
> > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > >         encode_sessionid4(xdr, session);
> > > >
> > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > >
> > > >         hdr->nops++;
> > > >  }
> > > >
> > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > +{
> > > > +       /* No need to do anything if nothing changed */
> > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > +               return;
> > > > +
> > > > +       spin_lock(&ses->se_lock);
> > > > +       if (target > ses->se_cb_highest_slot) {
> > > > +               int i;
> > > > +
> > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > +
> > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > +       }
> > > > +       ses->se_cb_highest_slot = target;
> > > > +       spin_unlock(&ses->se_lock);
> > > > +}
> > > > +
> > > >  /*
> > > >   * CB_SEQUENCE4resok
> > > >   *
> > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > >         int status = -ESERVERFAULT;
> > > >         __be32 *p;
> > > > -       u32 dummy;
> > > > +       u32 seqid, slotid, target;
> > > >
> > > >         /*
> > > >          * If the server returns different values for sessionID, slotID or
> > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > >         }
> > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > >
> > > > -       dummy = be32_to_cpup(p++);
> > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > +       seqid = be32_to_cpup(p++);
> > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > >                 goto out;
> > > >         }
> > > >
> > > > -       dummy = be32_to_cpup(p++);
> > > > -       if (dummy != 0) {
> > > > +       slotid = be32_to_cpup(p++);
> > > > +       if (slotid != cb->cb_held_slot) {
> > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > >                 goto out;
> > > >         }
> > > >
> > > > -       /*
> > > > -        * FIXME: process highest slotid and target highest slotid
> > > > -        */
> > > > +       p++; // ignore current highest slot value
> > > > +
> > > > +       target = be32_to_cpup(p++);
> > > > +       update_cb_slot_table(session, target);
> > > >         status = 0;
> > > >  out:
> > > >         cb->cb_seq_status = status;
> > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > >         spin_unlock(&clp->cl_lock);
> > > >  }
> > > >
> > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > +{
> > > > +       int idx;
> > > > +
> > > > +       spin_lock(&ses->se_lock);
> > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > +               spin_unlock(&ses->se_lock);
> > > > +               return -1;
> > > > +       }
> > > > +       /* clear the bit for the slot */
> > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > +       spin_unlock(&ses->se_lock);
> > > > +       return idx;
> > > > +}
> > > > +
> > > >  /*
> > > >   * There's currently a single callback channel slot.
> > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > >  {
> > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > >
> > > > -       if (!cb->cb_holds_slot &&
> > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > +       if (cb->cb_held_slot >= 0)
> > > > +               return true;
> > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > +       if (cb->cb_held_slot < 0) {
> > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > >                 /* Race breaker */
> > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > +               if (cb->cb_held_slot < 0)
> > > >                         return false;
> > > > -               }
> > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > >         }
> > > > -       cb->cb_holds_slot = true;
> > > >         return true;
> > > >  }
> > > >
> > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > >  {
> > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > >
> > > > -       if (cb->cb_holds_slot) {
> > > > -               cb->cb_holds_slot = false;
> > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > +       if (cb->cb_held_slot >= 0) {
> > > > +               spin_lock(&ses->se_lock);
> > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > +               spin_unlock(&ses->se_lock);
> > > > +               cb->cb_held_slot = -1;
> > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > >         }
> > > >  }
> > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > >  }
> > > >
> > > >  /*
> > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > - * slots, and mark callback channel down on communication errors.
> > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > + * and mark callback channel down on communication errors.
> > > >   */
> > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > >  {
> > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > >                 return true;
> > > >         }
> > > >
> > > > -       if (!cb->cb_holds_slot)
> > > > +       if (cb->cb_held_slot < 0)
> > > >                 goto need_restart;
> > > >
> > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > >                  */
> > > > -               ++session->se_cb_seq_nr;
> > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > >                 break;
> > > >         case -ESERVERFAULT:
> > > > -               ++session->se_cb_seq_nr;
> > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > >                 ret = false;
> > > >                 break;
> > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > >         case -NFS4ERR_BADSLOT:
> > > >                 goto retry_nowait;
> > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > -               if (session->se_cb_seq_nr != 1) {
> > > > -                       session->se_cb_seq_nr = 1;
> > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > >                         goto retry_nowait;
> > > >                 }
> > > >                 break;
> > > >         default:
> > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > >         }
> > > > -       nfsd41_cb_release_slot(cb);
> > > > -
> > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > +       nfsd41_cb_release_slot(cb);
> > > >
> > > >         if (RPC_SIGNALLED(task))
> > > >                 goto need_restart;
> > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > >         cb->cb_status = 0;
> > > >         cb->cb_need_restart = false;
> > > > -       cb->cb_holds_slot = false;
> > > > +       cb->cb_held_slot = -1;
> > > >  }
> > > >
> > > >  /**
> > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > --- a/fs/nfsd/nfs4state.c
> > > > +++ b/fs/nfsd/nfs4state.c
> > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > >         }
> > > >
> > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > +       new->se_cb_slot_avail = ~0U;
> > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > +       spin_lock_init(&new->se_lock);
> > > >         return new;
> > > >  out_free:
> > > >         while (i--)
> > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > >
> > > >         INIT_LIST_HEAD(&new->se_conns);
> > > >
> > > > -       new->se_cb_seq_nr = 1;
> > > > +       atomic_set(&new->se_ref, 0);
> > > >         new->se_dead = false;
> > > >         new->se_cb_prog = cses->callback_prog;
> > > >         new->se_cb_sec = cses->cb_sec;
> > > > -       atomic_set(&new->se_ref, 0);
> > > > +
> > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > +
> > > >         idx = hash_sessionid(&new->se_sessionid);
> > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > >         spin_lock(&clp->cl_lock);
> > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > >         copy_verf(clp, verf);
> > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > >         clp->cl_cb_session = NULL;
> > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > --- a/fs/nfsd/state.h
> > > > +++ b/fs/nfsd/state.h
> > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > >         struct work_struct cb_work;
> > > >         int cb_seq_status;
> > > >         int cb_status;
> > > > +       int cb_held_slot;
> > > >         bool cb_need_restart;
> > > > -       bool cb_holds_slot;
> > > >  };
> > > >
> > > >  struct nfsd4_callback_ops {
> > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > >         unsigned char cn_flags;
> > > >  };
> > > >
> > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > +
> > > >  /*
> > > >   * Representation of a v4.1+ session. These are refcounted in a similar fashion
> > > >   * to the nfs4_client. References are only taken when the server is actively
> > > > @@ -314,6 +317,10 @@ struct nfsd4_conn {
> > > >   */
> > > >  struct nfsd4_session {
> > > >         atomic_t                se_ref;
> > > > +       spinlock_t              se_lock;
> > > > +       u32                     se_cb_slot_avail; /* bitmap of available slots */
> > > > +       u32                     se_cb_highest_slot;     /* highest slot client wants */
> > > > +       u32                     se_cb_prog;
> > > >         bool                    se_dead;
> > > >         struct list_head        se_hash;        /* hash by sessionid */
> > > >         struct list_head        se_perclnt;
> > > > @@ -322,8 +329,7 @@ struct nfsd4_session {
> > > >         struct nfsd4_channel_attrs se_fchannel;
> > > >         struct nfsd4_cb_sec     se_cb_sec;
> > > >         struct list_head        se_conns;
> > > > -       u32                     se_cb_prog;
> > > > -       u32                     se_cb_seq_nr;
> > > > +       u32                     se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
> > > >         struct nfsd4_slot       *se_slots[];    /* forward channel slots */
> > > >  };
> > > >
> > > > @@ -457,9 +463,6 @@ struct nfs4_client {
> > > >          */
> > > >         struct dentry           *cl_nfsd_info_dentry;
> > > >
> > > > -       /* for nfs41 callbacks */
> > > > -       /* We currently support a single back channel with a single slot */
> > > > -       unsigned long           cl_cb_slot_busy;
> > > >         struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
> > > >                                                 /* wait here for slots */
> > > >         struct net              *net;
> > > > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > > index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> > > > --- a/fs/nfsd/trace.h
> > > > +++ b/fs/nfsd/trace.h
> > > > @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
> > > >                 __entry->cl_id = sid->clientid.cl_id;
> > > >                 __entry->seqno = sid->sequence;
> > > >                 __entry->reserved = sid->reserved;
> > > > -               __entry->slot_seqno = session->se_cb_seq_nr;
> > > > +               __entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
> > > >         ),
> > > >         TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
> > > >                 " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
> > > >
> > > > ---
> > > > base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> > > > change-id: 20241025-bcwide-6bd7e4b63db2
> > > >
> > > > Best regards,
> > > > --
> > > > Jeff Layton <jlayton@kernel.org>
> > > >
> > > >
>
> --
> Jeff Layton <jlayton@kernel.org>
Tom Talpey Nov. 11, 2024, 3:18 a.m. UTC | #10
On 11/10/2024 9:24 PM, Olga Kornievskaia wrote:
> On Sat, Nov 9, 2024 at 4:10 PM Jeff Layton <jlayton@kernel.org> wrote:
>> Sounds like a client-side capacity issue? nfs4_callback_recall()
>> returns NFS4ERR_DELAY when nfs_delegation_find_inode() returns -EAGAIN.
>> Maybe there is something weird going on there? Eventually the server
>> has no choice but to revoke an unreturned delegation.
> 
> I'm not trying to imply in any manner that there is a problem with
> either server or client side. I'm simply trying to state that there
> was a theory that having multiple cb_table slots would help in the
> case of having a lot of recalled/revoked state.  What I'm finding is
> that it doesn't seem so (and it possibly makes things just slightly
> slower, but no measurements were taken, so my focus is elsewhere).

We should definitely understand this! I'd say that supporting multiple 
slots is worthwhile and should be merged if stable.

But if they're no better performance-wise, we're missing something. 
There's got to be something else serializing them.

Tom.
Jeff Layton Nov. 11, 2024, 1:22 p.m. UTC | #11
On Sun, 2024-11-10 at 21:19 -0500, Olga Kornievskaia wrote:
> On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > 
> > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > > 
> > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > track which slots are in use, and a u32 to track the latest callback
> > > > target_slotid that the client reports. To protect the new fields, add
> > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > search for the lowest slotid (using ffs()).
> > > > 
> > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > counters and add the necessary handling to ensure that the seqids get
> > > > reset at the appropriate times.
> > > > 
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > ---
> > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > would be much appreciated.
> > > > ---
> > > > Changes in v4:
> > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > > 
> > > > Changes in v3:
> > > > - add patch to convert se_flags to single se_dead bool
> > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > - don't reject target highest slot value of 0
> > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > > 
> > > > Changes in v2:
> > > > - take cl_lock when fetching fields from session to be encoded
> > > > - use fls() instead of bespoke highest_unset_index()
> > > > - rename variables in several functions with more descriptive names
> > > > - clamp limit of for loop in update_cb_slot_table()
> > > > - re-add missing rpc_wake_up_queued_task() call
> > > > - fix slotid check in decode_cb_sequence4resok()
> > > > - add new per-session spinlock
> > > > ---
> > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > >  fs/nfsd/state.h        |  15 ++++---
> > > >  fs/nfsd/trace.h        |   2 +-
> > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > > 
> > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > --- a/fs/nfsd/nfs4callback.c
> > > > +++ b/fs/nfsd/nfs4callback.c
> > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > >         hdr->nops++;
> > > >  }
> > > > 
> > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > +{
> > > > +       u32 idx;
> > > > +
> > > > +       spin_lock(&ses->se_lock);
> > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > +       if (idx > 0)
> > > > +               --idx;
> > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > +       spin_unlock(&ses->se_lock);
> > > > +       return idx;
> > > > +}
> > > > +
> > > >  /*
> > > >   * CB_SEQUENCE4args
> > > >   *
> > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > >         encode_sessionid4(xdr, session);
> > > > 
> > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > > 
> > > >         hdr->nops++;
> > > >  }
> > > > 
> > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > +{
> > > > +       /* No need to do anything if nothing changed */
> > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > +               return;
> > > > +
> > > > +       spin_lock(&ses->se_lock);
> > > > +       if (target > ses->se_cb_highest_slot) {
> > > > +               int i;
> > > > +
> > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > +
> > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > +       }
> > > > +       ses->se_cb_highest_slot = target;
> > > > +       spin_unlock(&ses->se_lock);
> > > > +}
> > > > +
> > > >  /*
> > > >   * CB_SEQUENCE4resok
> > > >   *
> > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > >         int status = -ESERVERFAULT;
> > > >         __be32 *p;
> > > > -       u32 dummy;
> > > > +       u32 seqid, slotid, target;
> > > > 
> > > >         /*
> > > >          * If the server returns different values for sessionID, slotID or
> > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > >         }
> > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > > 
> > > > -       dummy = be32_to_cpup(p++);
> > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > +       seqid = be32_to_cpup(p++);
> > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > >                 goto out;
> > > >         }
> > > > 
> > > > -       dummy = be32_to_cpup(p++);
> > > > -       if (dummy != 0) {
> > > > +       slotid = be32_to_cpup(p++);
> > > > +       if (slotid != cb->cb_held_slot) {
> > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > >                 goto out;
> > > >         }
> > > > 
> > > > -       /*
> > > > -        * FIXME: process highest slotid and target highest slotid
> > > > -        */
> > > > +       p++; // ignore current highest slot value
> > > > +
> > > > +       target = be32_to_cpup(p++);
> > > > +       update_cb_slot_table(session, target);
> > > >         status = 0;
> > > >  out:
> > > >         cb->cb_seq_status = status;
> > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > >         spin_unlock(&clp->cl_lock);
> > > >  }
> > > > 
> > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > +{
> > > > +       int idx;
> > > > +
> > > > +       spin_lock(&ses->se_lock);
> > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > +               spin_unlock(&ses->se_lock);
> > > > +               return -1;
> > > > +       }
> > > > +       /* clear the bit for the slot */
> > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > +       spin_unlock(&ses->se_lock);
> > > > +       return idx;
> > > > +}
> > > > +
> > > >  /*
> > > >   * There's currently a single callback channel slot.
> > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > >  {
> > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > 
> > > > -       if (!cb->cb_holds_slot &&
> > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > +       if (cb->cb_held_slot >= 0)
> > > > +               return true;
> > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > +       if (cb->cb_held_slot < 0) {
> > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > >                 /* Race breaker */
> > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > +               if (cb->cb_held_slot < 0)
> > > >                         return false;
> > > > -               }
> > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > >         }
> > > > -       cb->cb_holds_slot = true;
> > > >         return true;
> > > >  }
> > > > 
> > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > >  {
> > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > 
> > > > -       if (cb->cb_holds_slot) {
> > > > -               cb->cb_holds_slot = false;
> > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > +       if (cb->cb_held_slot >= 0) {
> > > > +               spin_lock(&ses->se_lock);
> > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > +               spin_unlock(&ses->se_lock);
> > > > +               cb->cb_held_slot = -1;
> > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > >         }
> > > >  }
> > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > >  }
> > > > 
> > > >  /*
> > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > - * slots, and mark callback channel down on communication errors.
> > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > + * and mark callback channel down on communication errors.
> > > >   */
> > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > >  {
> > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > >                 return true;
> > > >         }
> > > > 
> > > > -       if (!cb->cb_holds_slot)
> > > > +       if (cb->cb_held_slot < 0)
> > > >                 goto need_restart;
> > > > 
> > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > >                  */
> > > > -               ++session->se_cb_seq_nr;
> > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > >                 break;
> > > >         case -ESERVERFAULT:
> > > > -               ++session->se_cb_seq_nr;
> > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > >                 ret = false;
> > > >                 break;
> > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > >         case -NFS4ERR_BADSLOT:
> > > >                 goto retry_nowait;
> > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > -               if (session->se_cb_seq_nr != 1) {
> > > > -                       session->se_cb_seq_nr = 1;
> > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > >                         goto retry_nowait;
> > > >                 }
> > > >                 break;
> > > >         default:
> > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > >         }
> > > > -       nfsd41_cb_release_slot(cb);
> > > > -
> > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > +       nfsd41_cb_release_slot(cb);
> > > > 
> > > >         if (RPC_SIGNALLED(task))
> > > >                 goto need_restart;
> > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > >         cb->cb_status = 0;
> > > >         cb->cb_need_restart = false;
> > > > -       cb->cb_holds_slot = false;
> > > > +       cb->cb_held_slot = -1;
> > > >  }
> > > > 
> > > >  /**
> > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > --- a/fs/nfsd/nfs4state.c
> > > > +++ b/fs/nfsd/nfs4state.c
> > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > >         }
> > > > 
> > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > +       new->se_cb_slot_avail = ~0U;
> > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > +       spin_lock_init(&new->se_lock);
> > > >         return new;
> > > >  out_free:
> > > >         while (i--)
> > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > > 
> > > >         INIT_LIST_HEAD(&new->se_conns);
> > > > 
> > > > -       new->se_cb_seq_nr = 1;
> > > > +       atomic_set(&new->se_ref, 0);
> > > >         new->se_dead = false;
> > > >         new->se_cb_prog = cses->callback_prog;
> > > >         new->se_cb_sec = cses->cb_sec;
> > > > -       atomic_set(&new->se_ref, 0);
> > > > +
> > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > +
> > > >         idx = hash_sessionid(&new->se_sessionid);
> > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > >         spin_lock(&clp->cl_lock);
> > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > >         copy_verf(clp, verf);
> > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > >         clp->cl_cb_session = NULL;
> > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > --- a/fs/nfsd/state.h
> > > > +++ b/fs/nfsd/state.h
> > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > >         struct work_struct cb_work;
> > > >         int cb_seq_status;
> > > >         int cb_status;
> > > > +       int cb_held_slot;
> > > >         bool cb_need_restart;
> > > > -       bool cb_holds_slot;
> > > >  };
> > > > 
> > > >  struct nfsd4_callback_ops {
> > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > >         unsigned char cn_flags;
> > > >  };
> > > > 
> > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > 
> > > Are there some values that are known not to work? I was experimenting
> > > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > > it's not a configurable value but it would still be good to know the
> > > expectations...
> > > 
> > > [  198.625021] Unable to handle kernel paging request at virtual
> > > address dfff800020000000
> > > [  198.625870] KASAN: probably user-memory-access in range
> > > [0x0000000100000000-0x0000000100000007]
> > > [  198.626444] Mem abort info:
> > > [  198.626630]   ESR = 0x0000000096000005
> > > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > > [  198.627234]   SET = 0, FnV = 0
> > > [  198.627441]   EA = 0, S1PTW = 0
> > > [  198.627627]   FSC = 0x05: level 1 translation fault
> > > [  198.627859] Data abort info:
> > > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > > nvme_auth sr_mod cdrom e1000e sg fuse
> > > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > > tainted 6.12.0-rc6+ #47
> > > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > > [  198.636065] sp : ffff8000884977e0
> > > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > > [  198.640332] Call trace:
> > > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > > [  198.642562]  kthread+0x288/0x310
> > > [  198.642745]  ret_from_fork+0x10/0x20
> > > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > > [  198.643267] SMP: stopping secondary CPUs
> > > 
> > > 
> > > 
> > 
> > 
> > Good catch. I think the problem here is that we don't currently cap the
> > initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> > this patch prevent the panic?
> > 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 3afe56ab9e0a..839be4ba765a 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > 
> >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> >         new->se_cb_slot_avail = ~0U;
> > -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
> >         spin_lock_init(&new->se_lock);
> >         return new;
> >  out_free:
> 
> It does help. I thought that the CREATE_SESSION reply for the
> backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value but
> instead it seems like it's not. But yes I can see that the highest
> slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.

Thanks for testing it, Olga.

Chuck, would you be OK with folding the above delta into 9ab4c4077de9,
or would you rather I resend the patch?
Chuck Lever III Nov. 11, 2024, 2:55 p.m. UTC | #12
On Mon, Nov 11, 2024 at 08:22:07AM -0500, Jeff Layton wrote:
> On Sun, 2024-11-10 at 21:19 -0500, Olga Kornievskaia wrote:
> > On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > 
> > > On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > 
> > > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > > > 
> > > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > > track which slots are in use, and a u32 to track the latest callback
> > > > > target_slotid that the client reports. To protect the new fields, add
> > > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > > search for the lowest slotid (using ffs()).
> > > > > 
> > > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > > counters and add the necessary handling to ensure that the seqids get
> > > > > reset at the appropriate times.
> > > > > 
> > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > > ---
> > > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > > would be much appreciated.
> > > > > ---
> > > > > Changes in v4:
> > > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > > > 
> > > > > Changes in v3:
> > > > > - add patch to convert se_flags to single se_dead bool
> > > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > > - don't reject target highest slot value of 0
> > > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > > > 
> > > > > Changes in v2:
> > > > > - take cl_lock when fetching fields from session to be encoded
> > > > > - use fls() instead of bespoke highest_unset_index()
> > > > > - rename variables in several functions with more descriptive names
> > > > > - clamp limit of for loop in update_cb_slot_table()
> > > > > - re-add missing rpc_wake_up_queued_task() call
> > > > > - fix slotid check in decode_cb_sequence4resok()
> > > > > - add new per-session spinlock
> > > > > ---
> > > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > > >  fs/nfsd/state.h        |  15 ++++---
> > > > >  fs/nfsd/trace.h        |   2 +-
> > > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > > > 
> > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > > --- a/fs/nfsd/nfs4callback.c
> > > > > +++ b/fs/nfsd/nfs4callback.c
> > > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > > >         hdr->nops++;
> > > > >  }
> > > > > 
> > > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > > +{
> > > > > +       u32 idx;
> > > > > +
> > > > > +       spin_lock(&ses->se_lock);
> > > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > > +       if (idx > 0)
> > > > > +               --idx;
> > > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > > +       spin_unlock(&ses->se_lock);
> > > > > +       return idx;
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * CB_SEQUENCE4args
> > > > >   *
> > > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > > >         encode_sessionid4(xdr, session);
> > > > > 
> > > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > > > 
> > > > >         hdr->nops++;
> > > > >  }
> > > > > 
> > > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > > +{
> > > > > +       /* No need to do anything if nothing changed */
> > > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > > +               return;
> > > > > +
> > > > > +       spin_lock(&ses->se_lock);
> > > > > +       if (target > ses->se_cb_highest_slot) {
> > > > > +               int i;
> > > > > +
> > > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > > +
> > > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > > +       }
> > > > > +       ses->se_cb_highest_slot = target;
> > > > > +       spin_unlock(&ses->se_lock);
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * CB_SEQUENCE4resok
> > > > >   *
> > > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > > >         int status = -ESERVERFAULT;
> > > > >         __be32 *p;
> > > > > -       u32 dummy;
> > > > > +       u32 seqid, slotid, target;
> > > > > 
> > > > >         /*
> > > > >          * If the server returns different values for sessionID, slotID or
> > > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > >         }
> > > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > > > 
> > > > > -       dummy = be32_to_cpup(p++);
> > > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > > +       seqid = be32_to_cpup(p++);
> > > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > > >                 goto out;
> > > > >         }
> > > > > 
> > > > > -       dummy = be32_to_cpup(p++);
> > > > > -       if (dummy != 0) {
> > > > > +       slotid = be32_to_cpup(p++);
> > > > > +       if (slotid != cb->cb_held_slot) {
> > > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > > >                 goto out;
> > > > >         }
> > > > > 
> > > > > -       /*
> > > > > -        * FIXME: process highest slotid and target highest slotid
> > > > > -        */
> > > > > +       p++; // ignore current highest slot value
> > > > > +
> > > > > +       target = be32_to_cpup(p++);
> > > > > +       update_cb_slot_table(session, target);
> > > > >         status = 0;
> > > > >  out:
> > > > >         cb->cb_seq_status = status;
> > > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > >         spin_unlock(&clp->cl_lock);
> > > > >  }
> > > > > 
> > > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > > +{
> > > > > +       int idx;
> > > > > +
> > > > > +       spin_lock(&ses->se_lock);
> > > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > > +               spin_unlock(&ses->se_lock);
> > > > > +               return -1;
> > > > > +       }
> > > > > +       /* clear the bit for the slot */
> > > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > > +       spin_unlock(&ses->se_lock);
> > > > > +       return idx;
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * There's currently a single callback channel slot.
> > > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > > >  {
> > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > 
> > > > > -       if (!cb->cb_holds_slot &&
> > > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > +       if (cb->cb_held_slot >= 0)
> > > > > +               return true;
> > > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > > +       if (cb->cb_held_slot < 0) {
> > > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > > >                 /* Race breaker */
> > > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > > +               if (cb->cb_held_slot < 0)
> > > > >                         return false;
> > > > > -               }
> > > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > > >         }
> > > > > -       cb->cb_holds_slot = true;
> > > > >         return true;
> > > > >  }
> > > > > 
> > > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > > >  {
> > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > 
> > > > > -       if (cb->cb_holds_slot) {
> > > > > -               cb->cb_holds_slot = false;
> > > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > +       if (cb->cb_held_slot >= 0) {
> > > > > +               spin_lock(&ses->se_lock);
> > > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > > +               spin_unlock(&ses->se_lock);
> > > > > +               cb->cb_held_slot = -1;
> > > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > > >         }
> > > > >  }
> > > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > > >  }
> > > > > 
> > > > >  /*
> > > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > > - * slots, and mark callback channel down on communication errors.
> > > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > > + * and mark callback channel down on communication errors.
> > > > >   */
> > > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > > >  {
> > > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > >                 return true;
> > > > >         }
> > > > > 
> > > > > -       if (!cb->cb_holds_slot)
> > > > > +       if (cb->cb_held_slot < 0)
> > > > >                 goto need_restart;
> > > > > 
> > > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > > >                  */
> > > > > -               ++session->se_cb_seq_nr;
> > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > >                 break;
> > > > >         case -ESERVERFAULT:
> > > > > -               ++session->se_cb_seq_nr;
> > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > >                 ret = false;
> > > > >                 break;
> > > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > >         case -NFS4ERR_BADSLOT:
> > > > >                 goto retry_nowait;
> > > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > > -               if (session->se_cb_seq_nr != 1) {
> > > > > -                       session->se_cb_seq_nr = 1;
> > > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > > >                         goto retry_nowait;
> > > > >                 }
> > > > >                 break;
> > > > >         default:
> > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > >         }
> > > > > -       nfsd41_cb_release_slot(cb);
> > > > > -
> > > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > > +       nfsd41_cb_release_slot(cb);
> > > > > 
> > > > >         if (RPC_SIGNALLED(task))
> > > > >                 goto need_restart;
> > > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > > >         cb->cb_status = 0;
> > > > >         cb->cb_need_restart = false;
> > > > > -       cb->cb_holds_slot = false;
> > > > > +       cb->cb_held_slot = -1;
> > > > >  }
> > > > > 
> > > > >  /**
> > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > > --- a/fs/nfsd/nfs4state.c
> > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > >         }
> > > > > 
> > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > +       new->se_cb_slot_avail = ~0U;
> > > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > +       spin_lock_init(&new->se_lock);
> > > > >         return new;
> > > > >  out_free:
> > > > >         while (i--)
> > > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > > > 
> > > > >         INIT_LIST_HEAD(&new->se_conns);
> > > > > 
> > > > > -       new->se_cb_seq_nr = 1;
> > > > > +       atomic_set(&new->se_ref, 0);
> > > > >         new->se_dead = false;
> > > > >         new->se_cb_prog = cses->callback_prog;
> > > > >         new->se_cb_sec = cses->cb_sec;
> > > > > -       atomic_set(&new->se_ref, 0);
> > > > > +
> > > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > > +
> > > > >         idx = hash_sessionid(&new->se_sessionid);
> > > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > > >         spin_lock(&clp->cl_lock);
> > > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > > >         copy_verf(clp, verf);
> > > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > > >         clp->cl_cb_session = NULL;
> > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > > --- a/fs/nfsd/state.h
> > > > > +++ b/fs/nfsd/state.h
> > > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > > >         struct work_struct cb_work;
> > > > >         int cb_seq_status;
> > > > >         int cb_status;
> > > > > +       int cb_held_slot;
> > > > >         bool cb_need_restart;
> > > > > -       bool cb_holds_slot;
> > > > >  };
> > > > > 
> > > > >  struct nfsd4_callback_ops {
> > > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > > >         unsigned char cn_flags;
> > > > >  };
> > > > > 
> > > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > 
> > > > Are there some values that are known not to work? I was experimenting
> > > > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > > > it's not a configurable value but it would still be good to know the
> > > > expectations...
> > > > 
> > > > [  198.625021] Unable to handle kernel paging request at virtual
> > > > address dfff800020000000
> > > > [  198.625870] KASAN: probably user-memory-access in range
> > > > [0x0000000100000000-0x0000000100000007]
> > > > [  198.626444] Mem abort info:
> > > > [  198.626630]   ESR = 0x0000000096000005
> > > > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > > > [  198.627234]   SET = 0, FnV = 0
> > > > [  198.627441]   EA = 0, S1PTW = 0
> > > > [  198.627627]   FSC = 0x05: level 1 translation fault
> > > > [  198.627859] Data abort info:
> > > > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > > > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > > > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > > > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > > > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > > > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > > > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > > > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > > > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > > > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > > > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > > > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > > > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > > > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > > > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > > > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > > > nvme_auth sr_mod cdrom e1000e sg fuse
> > > > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > > > tainted 6.12.0-rc6+ #47
> > > > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > > > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > > > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > > > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > > > [  198.636065] sp : ffff8000884977e0
> > > > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > > > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > > > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > > > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > > > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > > > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > > > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > > > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > > > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > > > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > > > [  198.640332] Call trace:
> > > > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > > > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > > > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > > > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > > > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > > > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > > > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > > > [  198.642562]  kthread+0x288/0x310
> > > > [  198.642745]  ret_from_fork+0x10/0x20
> > > > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > > > [  198.643267] SMP: stopping secondary CPUs
> > > > 
> > > > 
> > > > 
> > > 
> > > 
> > > Good catch. I think the problem here is that we don't currently cap the
> > > initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> > > this patch prevent the panic?
> > > 
> > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > index 3afe56ab9e0a..839be4ba765a 100644
> > > --- a/fs/nfsd/nfs4state.c
> > > +++ b/fs/nfsd/nfs4state.c
> > > @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > 
> > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > >         new->se_cb_slot_avail = ~0U;
> > > -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
> > >         spin_lock_init(&new->se_lock);
> > >         return new;
> > >  out_free:
> > 
> > It does help. I thought that the CREATE_SESSION reply for the
> > backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value but
> > instead it seems like it's not. But yes I can see that the highest
> > slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.
> 
> Thanks for testing it, Olga.
> 
> Chuck, would you be OK with folding the above delta into 9ab4c4077de9,
> or would you rather I resend the patch?

I've folded the above one-liner into the applied patch.

I agree with Tom: I think there's probably a (surprising)
explanation lurking for why we are not seeing the expected performance
improvement. I can delay sending the NFSD v6.13 merge window pull
request for a bit to see if you can get it teased out.
Olga Kornievskaia Nov. 11, 2024, 5:17 p.m. UTC | #13
On Mon, Nov 11, 2024 at 9:56 AM Chuck Lever <chuck.lever@oracle.com> wrote:
>
> On Mon, Nov 11, 2024 at 08:22:07AM -0500, Jeff Layton wrote:
> > On Sun, 2024-11-10 at 21:19 -0500, Olga Kornievskaia wrote:
> > > On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > >
> > > > On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > > > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > >
> > > > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > > > >
> > > > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > > > track which slots are in use, and a u32 to track the latest callback
> > > > > > target_slotid that the client reports. To protect the new fields, add
> > > > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > > > search for the lowest slotid (using ffs()).
> > > > > >
> > > > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > > > counters and add the necessary handling to ensure that the seqids get
> > > > > > reset at the appropriate times.
> > > > > >
> > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > > > ---
> > > > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > > > would be much appreciated.
> > > > > > ---
> > > > > > Changes in v4:
> > > > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > > > >
> > > > > > Changes in v3:
> > > > > > - add patch to convert se_flags to single se_dead bool
> > > > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > > > - don't reject target highest slot value of 0
> > > > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > > > >
> > > > > > Changes in v2:
> > > > > > - take cl_lock when fetching fields from session to be encoded
> > > > > > - use fls() instead of bespoke highest_unset_index()
> > > > > > - rename variables in several functions with more descriptive names
> > > > > > - clamp limit of for loop in update_cb_slot_table()
> > > > > > - re-add missing rpc_wake_up_queued_task() call
> > > > > > - fix slotid check in decode_cb_sequence4resok()
> > > > > > - add new per-session spinlock
> > > > > > ---
> > > > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > > > >  fs/nfsd/state.h        |  15 ++++---
> > > > > >  fs/nfsd/trace.h        |   2 +-
> > > > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > > > >
> > > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > > > --- a/fs/nfsd/nfs4callback.c
> > > > > > +++ b/fs/nfsd/nfs4callback.c
> > > > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > > > >         hdr->nops++;
> > > > > >  }
> > > > > >
> > > > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > > > +{
> > > > > > +       u32 idx;
> > > > > > +
> > > > > > +       spin_lock(&ses->se_lock);
> > > > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > > > +       if (idx > 0)
> > > > > > +               --idx;
> > > > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > +       return idx;
> > > > > > +}
> > > > > > +
> > > > > >  /*
> > > > > >   * CB_SEQUENCE4args
> > > > > >   *
> > > > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > > > >         encode_sessionid4(xdr, session);
> > > > > >
> > > > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > > > >
> > > > > >         hdr->nops++;
> > > > > >  }
> > > > > >
> > > > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > > > +{
> > > > > > +       /* No need to do anything if nothing changed */
> > > > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > > > +               return;
> > > > > > +
> > > > > > +       spin_lock(&ses->se_lock);
> > > > > > +       if (target > ses->se_cb_highest_slot) {
> > > > > > +               int i;
> > > > > > +
> > > > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > +
> > > > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > > > +       }
> > > > > > +       ses->se_cb_highest_slot = target;
> > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > +}
> > > > > > +
> > > > > >  /*
> > > > > >   * CB_SEQUENCE4resok
> > > > > >   *
> > > > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > > > >         int status = -ESERVERFAULT;
> > > > > >         __be32 *p;
> > > > > > -       u32 dummy;
> > > > > > +       u32 seqid, slotid, target;
> > > > > >
> > > > > >         /*
> > > > > >          * If the server returns different values for sessionID, slotID or
> > > > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > >         }
> > > > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > > > >
> > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > > > +       seqid = be32_to_cpup(p++);
> > > > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > > > >                 goto out;
> > > > > >         }
> > > > > >
> > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > -       if (dummy != 0) {
> > > > > > +       slotid = be32_to_cpup(p++);
> > > > > > +       if (slotid != cb->cb_held_slot) {
> > > > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > > > >                 goto out;
> > > > > >         }
> > > > > >
> > > > > > -       /*
> > > > > > -        * FIXME: process highest slotid and target highest slotid
> > > > > > -        */
> > > > > > +       p++; // ignore current highest slot value
> > > > > > +
> > > > > > +       target = be32_to_cpup(p++);
> > > > > > +       update_cb_slot_table(session, target);
> > > > > >         status = 0;
> > > > > >  out:
> > > > > >         cb->cb_seq_status = status;
> > > > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > >         spin_unlock(&clp->cl_lock);
> > > > > >  }
> > > > > >
> > > > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > > > +{
> > > > > > +       int idx;
> > > > > > +
> > > > > > +       spin_lock(&ses->se_lock);
> > > > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > +               return -1;
> > > > > > +       }
> > > > > > +       /* clear the bit for the slot */
> > > > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > +       return idx;
> > > > > > +}
> > > > > > +
> > > > > >  /*
> > > > > >   * There's currently a single callback channel slot.
> > > > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > > > >  {
> > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > >
> > > > > > -       if (!cb->cb_holds_slot &&
> > > > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > +       if (cb->cb_held_slot >= 0)
> > > > > > +               return true;
> > > > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > > > +       if (cb->cb_held_slot < 0) {
> > > > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > > > >                 /* Race breaker */
> > > > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > > > +               if (cb->cb_held_slot < 0)
> > > > > >                         return false;
> > > > > > -               }
> > > > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > > > >         }
> > > > > > -       cb->cb_holds_slot = true;
> > > > > >         return true;
> > > > > >  }
> > > > > >
> > > > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > > > >  {
> > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > >
> > > > > > -       if (cb->cb_holds_slot) {
> > > > > > -               cb->cb_holds_slot = false;
> > > > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > +       if (cb->cb_held_slot >= 0) {
> > > > > > +               spin_lock(&ses->se_lock);
> > > > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > +               cb->cb_held_slot = -1;
> > > > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > > > >         }
> > > > > >  }
> > > > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > > > >  }
> > > > > >
> > > > > >  /*
> > > > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > > > - * slots, and mark callback channel down on communication errors.
> > > > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > > > + * and mark callback channel down on communication errors.
> > > > > >   */
> > > > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > > > >  {
> > > > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > >                 return true;
> > > > > >         }
> > > > > >
> > > > > > -       if (!cb->cb_holds_slot)
> > > > > > +       if (cb->cb_held_slot < 0)
> > > > > >                 goto need_restart;
> > > > > >
> > > > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > > > >                  */
> > > > > > -               ++session->se_cb_seq_nr;
> > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > >                 break;
> > > > > >         case -ESERVERFAULT:
> > > > > > -               ++session->se_cb_seq_nr;
> > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > >                 ret = false;
> > > > > >                 break;
> > > > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > >         case -NFS4ERR_BADSLOT:
> > > > > >                 goto retry_nowait;
> > > > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > > > -               if (session->se_cb_seq_nr != 1) {
> > > > > > -                       session->se_cb_seq_nr = 1;
> > > > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > > > >                         goto retry_nowait;
> > > > > >                 }
> > > > > >                 break;
> > > > > >         default:
> > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > >         }
> > > > > > -       nfsd41_cb_release_slot(cb);
> > > > > > -
> > > > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > > > +       nfsd41_cb_release_slot(cb);
> > > > > >
> > > > > >         if (RPC_SIGNALLED(task))
> > > > > >                 goto need_restart;
> > > > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > > > >         cb->cb_status = 0;
> > > > > >         cb->cb_need_restart = false;
> > > > > > -       cb->cb_holds_slot = false;
> > > > > > +       cb->cb_held_slot = -1;
> > > > > >  }
> > > > > >
> > > > > >  /**
> > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > >         }
> > > > > >
> > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > +       new->se_cb_slot_avail = ~0U;
> > > > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > +       spin_lock_init(&new->se_lock);
> > > > > >         return new;
> > > > > >  out_free:
> > > > > >         while (i--)
> > > > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > > > >
> > > > > >         INIT_LIST_HEAD(&new->se_conns);
> > > > > >
> > > > > > -       new->se_cb_seq_nr = 1;
> > > > > > +       atomic_set(&new->se_ref, 0);
> > > > > >         new->se_dead = false;
> > > > > >         new->se_cb_prog = cses->callback_prog;
> > > > > >         new->se_cb_sec = cses->cb_sec;
> > > > > > -       atomic_set(&new->se_ref, 0);
> > > > > > +
> > > > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > > > +
> > > > > >         idx = hash_sessionid(&new->se_sessionid);
> > > > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > > > >         spin_lock(&clp->cl_lock);
> > > > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > >         copy_verf(clp, verf);
> > > > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > > > >         clp->cl_cb_session = NULL;
> > > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > > > --- a/fs/nfsd/state.h
> > > > > > +++ b/fs/nfsd/state.h
> > > > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > > > >         struct work_struct cb_work;
> > > > > >         int cb_seq_status;
> > > > > >         int cb_status;
> > > > > > +       int cb_held_slot;
> > > > > >         bool cb_need_restart;
> > > > > > -       bool cb_holds_slot;
> > > > > >  };
> > > > > >
> > > > > >  struct nfsd4_callback_ops {
> > > > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > > > >         unsigned char cn_flags;
> > > > > >  };
> > > > > >
> > > > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > >
> > > > > Are there some values that are known not to work? I was experimenting
> > > > > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > > > > it's not a configurable value but it would still be good to know the
> > > > > expectations...
> > > > >
> > > > > [  198.625021] Unable to handle kernel paging request at virtual
> > > > > address dfff800020000000
> > > > > [  198.625870] KASAN: probably user-memory-access in range
> > > > > [0x0000000100000000-0x0000000100000007]
> > > > > [  198.626444] Mem abort info:
> > > > > [  198.626630]   ESR = 0x0000000096000005
> > > > > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > > > > [  198.627234]   SET = 0, FnV = 0
> > > > > [  198.627441]   EA = 0, S1PTW = 0
> > > > > [  198.627627]   FSC = 0x05: level 1 translation fault
> > > > > [  198.627859] Data abort info:
> > > > > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > > > > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > > > > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > > > > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > > > > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > > > > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > > > > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > > > > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > > > > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > > > > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > > > > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > > > > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > > > > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > > > > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > > > > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > > > > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > > > > nvme_auth sr_mod cdrom e1000e sg fuse
> > > > > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > > > > tainted 6.12.0-rc6+ #47
> > > > > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > > > > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > > > > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > > > > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > > > > [  198.636065] sp : ffff8000884977e0
> > > > > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > > > > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > > > > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > > > > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > > > > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > > > > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > > > > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > > > > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > > > > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > > > > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > > > > [  198.640332] Call trace:
> > > > > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > > > > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > > > > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > > > > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > > > > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > > > > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > > > > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > > > > [  198.642562]  kthread+0x288/0x310
> > > > > [  198.642745]  ret_from_fork+0x10/0x20
> > > > > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > > > > [  198.643267] SMP: stopping secondary CPUs
> > > > >
> > > > >
> > > > >
> > > >
> > > >
> > > > Good catch. I think the problem here is that we don't currently cap the
> > > > initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> > > > this patch prevent the panic?
> > > >
> > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > index 3afe56ab9e0a..839be4ba765a 100644
> > > > --- a/fs/nfsd/nfs4state.c
> > > > +++ b/fs/nfsd/nfs4state.c
> > > > @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > >
> > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > >         new->se_cb_slot_avail = ~0U;
> > > > -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
> > > >         spin_lock_init(&new->se_lock);
> > > >         return new;
> > > >  out_free:
> > >
> > > It does help. I thought that the CREATE_SESSION reply for the
> > > backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value but
> > > instead it seems like it's not. But yes I can see that the highest
> > > slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.
> >
> > Thanks for testing it, Olga.
> >
> > Chuck, would you be OK with folding the above delta into 9ab4c4077de9,
> > or would you rather I resend the patch?
>
> I've folded the above one-liner into the applied patch.
>
> I agree with Tom, I think there's probably a (surprising)
> explanation lurking for not seeing the expected performance
> improvement. I can delay sending the NFSD v6.13 merge window pull
> request for a bit to see if you can get it teased out.

I would like to raise a couple of issues:
(1) I believe the server should be reporting back an accurate value
for the backchannel session table size. If NFSD_BC_SLOT_TABLE_MAX
were much lower than the client's value, wouldn't the client be
wasting resources on its bc session table? The
->back_channel->maxreqs value gets decoded in
nfsd4_decode_create_session() and is never adjusted so that the reply
is based on NFSD_BC_SLOT_TABLE_MAX. The problem is currently invisible
because the Linux client's bc slot table size is 16 and nfsd's is higher.

Maybe something like this (at least it sets the maxreqs value in the reply to the capped value):
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9120363d58f5..9a0da585b61d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3825,7 +3825,7 @@ static __be32 check_backchannel_attrs(struct
nfsd4_channel_attrs *ca)
        ca->maxresp_cached = 0;
        if (ca->maxops < 2)
                return nfserr_toosmall;
-
+       ca->maxreqs = min(ca->maxreqs, NFSD_BC_SLOT_TABLE_MAX);
        return nfs_ok;
 }


(2) The server is not using the lowest available slotid value. I
thought it was a MUST but it's a SHOULD in the spec so I guess
technically the existing way is still spec compliant. I don't have
suggestions/explanations here as of now.

>


>
> --
> Chuck Lever
Jeff Layton Nov. 11, 2024, 5:40 p.m. UTC | #14
On Mon, 2024-11-11 at 12:17 -0500, Olga Kornievskaia wrote:
> On Mon, Nov 11, 2024 at 9:56 AM Chuck Lever <chuck.lever@oracle.com> wrote:
> > 
> > On Mon, Nov 11, 2024 at 08:22:07AM -0500, Jeff Layton wrote:
> > > On Sun, 2024-11-10 at 21:19 -0500, Olga Kornievskaia wrote:
> > > > On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > 
> > > > > On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > > > > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > 
> > > > > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > > > > > 
> > > > > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > > > > track which slots are in use, and a u32 to track the latest callback
> > > > > > > target_slotid that the client reports. To protect the new fields, add
> > > > > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > > > > search for the lowest slotid (using ffs()).
> > > > > > > 
> > > > > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > > > > counters and add the necessary handling to ensure that the seqids get
> > > > > > > reset at the appropriate times.
> > > > > > > 
> > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > > > > ---
> > > > > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > > > > would be much appreciated.
> > > > > > > ---
> > > > > > > Changes in v4:
> > > > > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > > > > > 
> > > > > > > Changes in v3:
> > > > > > > - add patch to convert se_flags to single se_dead bool
> > > > > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > > > > - don't reject target highest slot value of 0
> > > > > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > > > > > 
> > > > > > > Changes in v2:
> > > > > > > - take cl_lock when fetching fields from session to be encoded
> > > > > > > - use fls() instead of bespoke highest_unset_index()
> > > > > > > - rename variables in several functions with more descriptive names
> > > > > > > - clamp limit of for loop in update_cb_slot_table()
> > > > > > > - re-add missing rpc_wake_up_queued_task() call
> > > > > > > - fix slotid check in decode_cb_sequence4resok()
> > > > > > > - add new per-session spinlock
> > > > > > > ---
> > > > > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > > > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > > > > >  fs/nfsd/state.h        |  15 ++++---
> > > > > > >  fs/nfsd/trace.h        |   2 +-
> > > > > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > > > > --- a/fs/nfsd/nfs4callback.c
> > > > > > > +++ b/fs/nfsd/nfs4callback.c
> > > > > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > > > > >         hdr->nops++;
> > > > > > >  }
> > > > > > > 
> > > > > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > > > > +{
> > > > > > > +       u32 idx;
> > > > > > > +
> > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > > > > +       if (idx > 0)
> > > > > > > +               --idx;
> > > > > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > +       return idx;
> > > > > > > +}
> > > > > > > +
> > > > > > >  /*
> > > > > > >   * CB_SEQUENCE4args
> > > > > > >   *
> > > > > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > > > > >         encode_sessionid4(xdr, session);
> > > > > > > 
> > > > > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > > > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > > > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > > > > > 
> > > > > > >         hdr->nops++;
> > > > > > >  }
> > > > > > > 
> > > > > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > > > > +{
> > > > > > > +       /* No need to do anything if nothing changed */
> > > > > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > > > > +               return;
> > > > > > > +
> > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > +       if (target > ses->se_cb_highest_slot) {
> > > > > > > +               int i;
> > > > > > > +
> > > > > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > +
> > > > > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > > > > +       }
> > > > > > > +       ses->se_cb_highest_slot = target;
> > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > +}
> > > > > > > +
> > > > > > >  /*
> > > > > > >   * CB_SEQUENCE4resok
> > > > > > >   *
> > > > > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > > > > >         int status = -ESERVERFAULT;
> > > > > > >         __be32 *p;
> > > > > > > -       u32 dummy;
> > > > > > > +       u32 seqid, slotid, target;
> > > > > > > 
> > > > > > >         /*
> > > > > > >          * If the server returns different values for sessionID, slotID or
> > > > > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > >         }
> > > > > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > > > > > 
> > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > > > > +       seqid = be32_to_cpup(p++);
> > > > > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > > > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > > > > >                 goto out;
> > > > > > >         }
> > > > > > > 
> > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > -       if (dummy != 0) {
> > > > > > > +       slotid = be32_to_cpup(p++);
> > > > > > > +       if (slotid != cb->cb_held_slot) {
> > > > > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > > > > >                 goto out;
> > > > > > >         }
> > > > > > > 
> > > > > > > -       /*
> > > > > > > -        * FIXME: process highest slotid and target highest slotid
> > > > > > > -        */
> > > > > > > +       p++; // ignore current highest slot value
> > > > > > > +
> > > > > > > +       target = be32_to_cpup(p++);
> > > > > > > +       update_cb_slot_table(session, target);
> > > > > > >         status = 0;
> > > > > > >  out:
> > > > > > >         cb->cb_seq_status = status;
> > > > > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > >         spin_unlock(&clp->cl_lock);
> > > > > > >  }
> > > > > > > 
> > > > > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > > > > +{
> > > > > > > +       int idx;
> > > > > > > +
> > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > +               return -1;
> > > > > > > +       }
> > > > > > > +       /* clear the bit for the slot */
> > > > > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > +       return idx;
> > > > > > > +}
> > > > > > > +
> > > > > > >  /*
> > > > > > >   * There's currently a single callback channel slot.
> > > > > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > > > > >  {
> > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > 
> > > > > > > -       if (!cb->cb_holds_slot &&
> > > > > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > +       if (cb->cb_held_slot >= 0)
> > > > > > > +               return true;
> > > > > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > > > > +       if (cb->cb_held_slot < 0) {
> > > > > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > > > > >                 /* Race breaker */
> > > > > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > > > > +               if (cb->cb_held_slot < 0)
> > > > > > >                         return false;
> > > > > > > -               }
> > > > > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > > > > >         }
> > > > > > > -       cb->cb_holds_slot = true;
> > > > > > >         return true;
> > > > > > >  }
> > > > > > > 
> > > > > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > > > > >  {
> > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > 
> > > > > > > -       if (cb->cb_holds_slot) {
> > > > > > > -               cb->cb_holds_slot = false;
> > > > > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > +       if (cb->cb_held_slot >= 0) {
> > > > > > > +               spin_lock(&ses->se_lock);
> > > > > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > +               cb->cb_held_slot = -1;
> > > > > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > > > > >         }
> > > > > > >  }
> > > > > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > > > > >  }
> > > > > > > 
> > > > > > >  /*
> > > > > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > > > > - * slots, and mark callback channel down on communication errors.
> > > > > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > > > > + * and mark callback channel down on communication errors.
> > > > > > >   */
> > > > > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > > > > >  {
> > > > > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > >                 return true;
> > > > > > >         }
> > > > > > > 
> > > > > > > -       if (!cb->cb_holds_slot)
> > > > > > > +       if (cb->cb_held_slot < 0)
> > > > > > >                 goto need_restart;
> > > > > > > 
> > > > > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > > > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > > > > >                  */
> > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > >                 break;
> > > > > > >         case -ESERVERFAULT:
> > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > >                 ret = false;
> > > > > > >                 break;
> > > > > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > >         case -NFS4ERR_BADSLOT:
> > > > > > >                 goto retry_nowait;
> > > > > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > > > > -               if (session->se_cb_seq_nr != 1) {
> > > > > > > -                       session->se_cb_seq_nr = 1;
> > > > > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > > > > >                         goto retry_nowait;
> > > > > > >                 }
> > > > > > >                 break;
> > > > > > >         default:
> > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > >         }
> > > > > > > -       nfsd41_cb_release_slot(cb);
> > > > > > > -
> > > > > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > > > > +       nfsd41_cb_release_slot(cb);
> > > > > > > 
> > > > > > >         if (RPC_SIGNALLED(task))
> > > > > > >                 goto need_restart;
> > > > > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > > > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > > > > >         cb->cb_status = 0;
> > > > > > >         cb->cb_need_restart = false;
> > > > > > > -       cb->cb_holds_slot = false;
> > > > > > > +       cb->cb_held_slot = -1;
> > > > > > >  }
> > > > > > > 
> > > > > > >  /**
> > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > >         }
> > > > > > > 
> > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > +       new->se_cb_slot_avail = ~0U;
> > > > > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > +       spin_lock_init(&new->se_lock);
> > > > > > >         return new;
> > > > > > >  out_free:
> > > > > > >         while (i--)
> > > > > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > > > > > 
> > > > > > >         INIT_LIST_HEAD(&new->se_conns);
> > > > > > > 
> > > > > > > -       new->se_cb_seq_nr = 1;
> > > > > > > +       atomic_set(&new->se_ref, 0);
> > > > > > >         new->se_dead = false;
> > > > > > >         new->se_cb_prog = cses->callback_prog;
> > > > > > >         new->se_cb_sec = cses->cb_sec;
> > > > > > > -       atomic_set(&new->se_ref, 0);
> > > > > > > +
> > > > > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > > > > +
> > > > > > >         idx = hash_sessionid(&new->se_sessionid);
> > > > > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > > > > >         spin_lock(&clp->cl_lock);
> > > > > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > > > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > > > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > >         copy_verf(clp, verf);
> > > > > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > > > > >         clp->cl_cb_session = NULL;
> > > > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > > > > --- a/fs/nfsd/state.h
> > > > > > > +++ b/fs/nfsd/state.h
> > > > > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > > > > >         struct work_struct cb_work;
> > > > > > >         int cb_seq_status;
> > > > > > >         int cb_status;
> > > > > > > +       int cb_held_slot;
> > > > > > >         bool cb_need_restart;
> > > > > > > -       bool cb_holds_slot;
> > > > > > >  };
> > > > > > > 
> > > > > > >  struct nfsd4_callback_ops {
> > > > > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > > > > >         unsigned char cn_flags;
> > > > > > >  };
> > > > > > > 
> > > > > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > > > 
> > > > > > Are there some values that are known not to work? I was experimenting
> > > > > > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > > > > > it's not a configurable value but it would still be good to know the
> > > > > > expectations...
> > > > > > 
> > > > > > [  198.625021] Unable to handle kernel paging request at virtual
> > > > > > address dfff800020000000
> > > > > > [  198.625870] KASAN: probably user-memory-access in range
> > > > > > [0x0000000100000000-0x0000000100000007]
> > > > > > [  198.626444] Mem abort info:
> > > > > > [  198.626630]   ESR = 0x0000000096000005
> > > > > > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > > > > > [  198.627234]   SET = 0, FnV = 0
> > > > > > [  198.627441]   EA = 0, S1PTW = 0
> > > > > > [  198.627627]   FSC = 0x05: level 1 translation fault
> > > > > > [  198.627859] Data abort info:
> > > > > > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > > > > > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > > > > > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > > > > > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > > > > > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > > > > > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > > > > > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > > > > > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > > > > > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > > > > > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > > > > > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > > > > > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > > > > > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > > > > > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > > > > > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > > > > > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > > > > > nvme_auth sr_mod cdrom e1000e sg fuse
> > > > > > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > > > > > tainted 6.12.0-rc6+ #47
> > > > > > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > > > > > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > > > > > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > > > > > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > > > > > [  198.636065] sp : ffff8000884977e0
> > > > > > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > > > > > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > > > > > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > > > > > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > > > > > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > > > > > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > > > > > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > > > > > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > > > > > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > > > > > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > > > > > [  198.640332] Call trace:
> > > > > > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > > > > > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > > > > > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > > > > > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > > > > > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > > > > > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > > > > > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > > > > > [  198.642562]  kthread+0x288/0x310
> > > > > > [  198.642745]  ret_from_fork+0x10/0x20
> > > > > > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > > > > > [  198.643267] SMP: stopping secondary CPUs
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > 
> > > > > 
> > > > > Good catch. I think the problem here is that we don't currently cap the
> > > > > initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> > > > > this patch prevent the panic?
> > > > > 
> > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > index 3afe56ab9e0a..839be4ba765a 100644
> > > > > --- a/fs/nfsd/nfs4state.c
> > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > 
> > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > >         new->se_cb_slot_avail = ~0U;
> > > > > -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
> > > > >         spin_lock_init(&new->se_lock);
> > > > >         return new;
> > > > >  out_free:
> > > > 
> > > > It does help. I thought that the CREATE_SESSION reply for the
> > > > backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value but
> > > > instead it seems like it's not. But yes I can see that the highest
> > > > slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.
> > > 
> > > Thanks for testing it, Olga.
> > > 
> > > Chuck, would you be OK with folding the above delta into 9ab4c4077de9,
> > > or would you rather I resend the patch?
> > 
> > I've folded the above one-liner into the applied patch.
> > 
> > I agree with Tom, I think there's probably a (surprising)
> > explanation lurking for not seeing the expected performance
> > improvement. I can delay sending the NFSD v6.13 merge window pull
> > request for a bit to see if you can get it teased out.
> 
> I would like to raise a couple of issues:
> (1) I believe the server should be reporting back an accurate value
> for the backchannel session table size. I think if the
> NFSD_BC_SLOT_TABLE_MAX was way lower than the client's value then the
> client would be wasting resources for its bc session table?

Yes, but those resources are a 32-bit integer per wasted slot. The Linux
client allows for up to 16 slots, so we're wasting 64 bytes per session
with this scheme with the Linux client. I didn't think it was worth
doing a separate allocation for that.

We could make NFSD_BC_SLOT_TABLE_MAX smaller though. Maybe we should
match the client's size and make it 15?

> ->back_channel->maxreqs gets decoded in nfsd4_decode_create_session()
> and is never adjusted for the reply to be based on the
> NFSD_BC_SLOT_TABLE_MAX. The problem is currently invisible because
> linux client's bc slot table size is 16 and nfsd's is higher.
> 

I'm not sure I understand the problem here. We don't care about most of
the backchannel attributes. maxreqs is the only one that matters, and
we track that in se_cb_highest_slot.
Olga Kornievskaia Nov. 11, 2024, 5:56 p.m. UTC | #15
On Mon, Nov 11, 2024 at 12:40 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Mon, 2024-11-11 at 12:17 -0500, Olga Kornievskaia wrote:
> > On Mon, Nov 11, 2024 at 9:56 AM Chuck Lever <chuck.lever@oracle.com> wrote:
> > >
> > > On Mon, Nov 11, 2024 at 08:22:07AM -0500, Jeff Layton wrote:
> > > > On Sun, 2024-11-10 at 21:19 -0500, Olga Kornievskaia wrote:
> > > > > On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > >
> > > > > > On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > > > > > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > >
> > > > > > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > > > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > > > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > > > > > >
> > > > > > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > > > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > > > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > > > > > track which slots are in use, and a u32 to track the latest callback
> > > > > > > > target_slotid that the client reports. To protect the new fields, add
> > > > > > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > > > > > search for the lowest slotid (using ffs()).
> > > > > > > >
> > > > > > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > > > > > counters and add the necessary handling to ensure that the seqids get
> > > > > > > > reset at the appropriate times.
> > > > > > > >
> > > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > > > > > ---
> > > > > > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > > > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > > > > > would be much appreciated.
> > > > > > > > ---
> > > > > > > > Changes in v4:
> > > > > > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > > > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > > > > > >
> > > > > > > > Changes in v3:
> > > > > > > > - add patch to convert se_flags to single se_dead bool
> > > > > > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > > > > > - don't reject target highest slot value of 0
> > > > > > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > > > > > >
> > > > > > > > Changes in v2:
> > > > > > > > - take cl_lock when fetching fields from session to be encoded
> > > > > > > > - use fls() instead of bespoke highest_unset_index()
> > > > > > > > - rename variables in several functions with more descriptive names
> > > > > > > > - clamp limit of for loop in update_cb_slot_table()
> > > > > > > > - re-add missing rpc_wake_up_queued_task() call
> > > > > > > > - fix slotid check in decode_cb_sequence4resok()
> > > > > > > > - add new per-session spinlock
> > > > > > > > ---
> > > > > > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > > > > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > > > > > >  fs/nfsd/state.h        |  15 ++++---
> > > > > > > >  fs/nfsd/trace.h        |   2 +-
> > > > > > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > > > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > > > > > --- a/fs/nfsd/nfs4callback.c
> > > > > > > > +++ b/fs/nfsd/nfs4callback.c
> > > > > > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > > > > > >         hdr->nops++;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > > > > > +{
> > > > > > > > +       u32 idx;
> > > > > > > > +
> > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > > > > > +       if (idx > 0)
> > > > > > > > +               --idx;
> > > > > > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > +       return idx;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /*
> > > > > > > >   * CB_SEQUENCE4args
> > > > > > > >   *
> > > > > > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > > > > > >         encode_sessionid4(xdr, session);
> > > > > > > >
> > > > > > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > > > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > > > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > > > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > > > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > > > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > > > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > > > > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > > > > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > > > > > >
> > > > > > > >         hdr->nops++;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > > > > > +{
> > > > > > > > +       /* No need to do anything if nothing changed */
> > > > > > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > > > > > +               return;
> > > > > > > > +
> > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > +       if (target > ses->se_cb_highest_slot) {
> > > > > > > > +               int i;
> > > > > > > > +
> > > > > > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > > +
> > > > > > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > > > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > > > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > > > > > +       }
> > > > > > > > +       ses->se_cb_highest_slot = target;
> > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /*
> > > > > > > >   * CB_SEQUENCE4resok
> > > > > > > >   *
> > > > > > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > > > > > >         int status = -ESERVERFAULT;
> > > > > > > >         __be32 *p;
> > > > > > > > -       u32 dummy;
> > > > > > > > +       u32 seqid, slotid, target;
> > > > > > > >
> > > > > > > >         /*
> > > > > > > >          * If the server returns different values for sessionID, slotID or
> > > > > > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > >         }
> > > > > > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > > > > > >
> > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > > > > > +       seqid = be32_to_cpup(p++);
> > > > > > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > > > > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > > > > > >                 goto out;
> > > > > > > >         }
> > > > > > > >
> > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > -       if (dummy != 0) {
> > > > > > > > +       slotid = be32_to_cpup(p++);
> > > > > > > > +       if (slotid != cb->cb_held_slot) {
> > > > > > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > > > > > >                 goto out;
> > > > > > > >         }
> > > > > > > >
> > > > > > > > -       /*
> > > > > > > > -        * FIXME: process highest slotid and target highest slotid
> > > > > > > > -        */
> > > > > > > > +       p++; // ignore current highest slot value
> > > > > > > > +
> > > > > > > > +       target = be32_to_cpup(p++);
> > > > > > > > +       update_cb_slot_table(session, target);
> > > > > > > >         status = 0;
> > > > > > > >  out:
> > > > > > > >         cb->cb_seq_status = status;
> > > > > > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > >         spin_unlock(&clp->cl_lock);
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > > > > > +{
> > > > > > > > +       int idx;
> > > > > > > > +
> > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > > > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > +               return -1;
> > > > > > > > +       }
> > > > > > > > +       /* clear the bit for the slot */
> > > > > > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > +       return idx;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /*
> > > > > > > >   * There's currently a single callback channel slot.
> > > > > > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > > > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > > > > > >  {
> > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > >
> > > > > > > > -       if (!cb->cb_holds_slot &&
> > > > > > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > +       if (cb->cb_held_slot >= 0)
> > > > > > > > +               return true;
> > > > > > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > > > > > +       if (cb->cb_held_slot < 0) {
> > > > > > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > > > > > >                 /* Race breaker */
> > > > > > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > > > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > > > > > +               if (cb->cb_held_slot < 0)
> > > > > > > >                         return false;
> > > > > > > > -               }
> > > > > > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > > > > > >         }
> > > > > > > > -       cb->cb_holds_slot = true;
> > > > > > > >         return true;
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > > > > > >  {
> > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > >
> > > > > > > > -       if (cb->cb_holds_slot) {
> > > > > > > > -               cb->cb_holds_slot = false;
> > > > > > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > +       if (cb->cb_held_slot >= 0) {
> > > > > > > > +               spin_lock(&ses->se_lock);
> > > > > > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > +               cb->cb_held_slot = -1;
> > > > > > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > > > > > >         }
> > > > > > > >  }
> > > > > > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  /*
> > > > > > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > > > > > - * slots, and mark callback channel down on communication errors.
> > > > > > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > > > > > + * and mark callback channel down on communication errors.
> > > > > > > >   */
> > > > > > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > > > > > >  {
> > > > > > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > >                 return true;
> > > > > > > >         }
> > > > > > > >
> > > > > > > > -       if (!cb->cb_holds_slot)
> > > > > > > > +       if (cb->cb_held_slot < 0)
> > > > > > > >                 goto need_restart;
> > > > > > > >
> > > > > > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > > > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > > > > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > > > > > >                  */
> > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > >                 break;
> > > > > > > >         case -ESERVERFAULT:
> > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > >                 ret = false;
> > > > > > > >                 break;
> > > > > > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > >         case -NFS4ERR_BADSLOT:
> > > > > > > >                 goto retry_nowait;
> > > > > > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > > > > > -               if (session->se_cb_seq_nr != 1) {
> > > > > > > > -                       session->se_cb_seq_nr = 1;
> > > > > > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > > > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > > > > > >                         goto retry_nowait;
> > > > > > > >                 }
> > > > > > > >                 break;
> > > > > > > >         default:
> > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > >         }
> > > > > > > > -       nfsd41_cb_release_slot(cb);
> > > > > > > > -
> > > > > > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > > > > > +       nfsd41_cb_release_slot(cb);
> > > > > > > >
> > > > > > > >         if (RPC_SIGNALLED(task))
> > > > > > > >                 goto need_restart;
> > > > > > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > > > > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > > > > > >         cb->cb_status = 0;
> > > > > > > >         cb->cb_need_restart = false;
> > > > > > > > -       cb->cb_holds_slot = false;
> > > > > > > > +       cb->cb_held_slot = -1;
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  /**
> > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > >         }
> > > > > > > >
> > > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > > +       new->se_cb_slot_avail = ~0U;
> > > > > > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > > +       spin_lock_init(&new->se_lock);
> > > > > > > >         return new;
> > > > > > > >  out_free:
> > > > > > > >         while (i--)
> > > > > > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > > > > > >
> > > > > > > >         INIT_LIST_HEAD(&new->se_conns);
> > > > > > > >
> > > > > > > > -       new->se_cb_seq_nr = 1;
> > > > > > > > +       atomic_set(&new->se_ref, 0);
> > > > > > > >         new->se_dead = false;
> > > > > > > >         new->se_cb_prog = cses->callback_prog;
> > > > > > > >         new->se_cb_sec = cses->cb_sec;
> > > > > > > > -       atomic_set(&new->se_ref, 0);
> > > > > > > > +
> > > > > > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > > > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > > > > > +
> > > > > > > >         idx = hash_sessionid(&new->se_sessionid);
> > > > > > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > > > > > >         spin_lock(&clp->cl_lock);
> > > > > > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > > > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > > > > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > > > > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > > > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > >         copy_verf(clp, verf);
> > > > > > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > > > > > >         clp->cl_cb_session = NULL;
> > > > > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > > > > > --- a/fs/nfsd/state.h
> > > > > > > > +++ b/fs/nfsd/state.h
> > > > > > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > > > > > >         struct work_struct cb_work;
> > > > > > > >         int cb_seq_status;
> > > > > > > >         int cb_status;
> > > > > > > > +       int cb_held_slot;
> > > > > > > >         bool cb_need_restart;
> > > > > > > > -       bool cb_holds_slot;
> > > > > > > >  };
> > > > > > > >
> > > > > > > >  struct nfsd4_callback_ops {
> > > > > > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > > > > > >         unsigned char cn_flags;
> > > > > > > >  };
> > > > > > > >
> > > > > > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > > > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > > > >
> > > > > > > Are there some values that are known not to work? I was experimenting
> > > > > > > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > > > > > > it's not a configurable value but it would still be good to know the
> > > > > > > expectations...
> > > > > > >
> > > > > > > [  198.625021] Unable to handle kernel paging request at virtual
> > > > > > > address dfff800020000000
> > > > > > > [  198.625870] KASAN: probably user-memory-access in range
> > > > > > > [0x0000000100000000-0x0000000100000007]
> > > > > > > [  198.626444] Mem abort info:
> > > > > > > [  198.626630]   ESR = 0x0000000096000005
> > > > > > > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > > > > > > [  198.627234]   SET = 0, FnV = 0
> > > > > > > [  198.627441]   EA = 0, S1PTW = 0
> > > > > > > [  198.627627]   FSC = 0x05: level 1 translation fault
> > > > > > > [  198.627859] Data abort info:
> > > > > > > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > > > > > > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > > > > > > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > > > > > > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > > > > > > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > > > > > > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > > > > > > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > > > > > > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > > > > > > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > > > > > > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > > > > > > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > > > > > > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > > > > > > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > > > > > > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > > > > > > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > > > > > > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > > > > > > nvme_auth sr_mod cdrom e1000e sg fuse
> > > > > > > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > > > > > > tainted 6.12.0-rc6+ #47
> > > > > > > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > > > > > > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > > > > > > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > > > > > > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > > > > > > [  198.636065] sp : ffff8000884977e0
> > > > > > > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > > > > > > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > > > > > > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > > > > > > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > > > > > > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > > > > > > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > > > > > > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > > > > > > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > > > > > > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > > > > > > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > > > > > > [  198.640332] Call trace:
> > > > > > > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > > > > > > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > > > > > > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > > > > > > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > > > > > > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > > > > > > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > > > > > > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > > > > > > [  198.642562]  kthread+0x288/0x310
> > > > > > > [  198.642745]  ret_from_fork+0x10/0x20
> > > > > > > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > > > > > > [  198.643267] SMP: stopping secondary CPUs
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > >
> > > > > >
> > > > > > Good catch. I think the problem here is that we don't currently cap the
> > > > > > initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> > > > > > this patch prevent the panic?
> > > > > >
> > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > index 3afe56ab9e0a..839be4ba765a 100644
> > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > >
> > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > >         new->se_cb_slot_avail = ~0U;
> > > > > > -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
> > > > > >         spin_lock_init(&new->se_lock);
> > > > > >         return new;
> > > > > >  out_free:
> > > > >
> > > > > It does help. I thought that the CREATE_SESSION reply for the
> > > > > backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value but
> > > > > instead it seems like it's not. But yes I can see that the highest
> > > > > slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.
> > > >
> > > > Thanks for testing it, Olga.
> > > >
> > > > Chuck, would you be OK with folding the above delta into 9ab4c4077de9,
> > > > or would you rather I resend the patch?
> > >
> > > I've folded the above one-liner into the applied patch.
> > >
> > > I agree with Tom, I think there's probably a (surprising)
> > > explanation lurking for not seeing the expected performance
> > > improvement. I can delay sending the NFSD v6.13 merge window pull
> > > request for a bit to see if you can get it teased out.
> >
> > I would like to raise a couple of issues:
> > (1) I believe the server should be reporting back an accurate value
> > for the backchannel session table size. I think if the
> > NFSD_BC_SLOT_TABLE_MAX was way lower than the client's value then the
> > client would be wasting resources for its bc session table?
>
> Yes, but those resources are 32-bit integer per wasted slot. The Linux
> client allows for up to 16 slots, so we're wasting 64 bytes per session
> with this scheme with the Linux client. I didn't think it was worth
> doing a separate allocation for that.
>
> We could make NFSD_BC_SLOT_TABLE_MAX smaller though. Maybe we should
> match the client's size and make it 15?
>
> > ->back_channel->maxreqs gets decoded in nfsd4_decode_create_session()
> > and is never adjusted for the reply to be based on the
> > NFSD_BC_SLOT_TABLE_MAX. The problem is currently invisible because
> > linux client's bc slot table size is 16 and nfsd's is higher.
> >
>
> I'm not sure I understand the problem here. We don't care about most of
> the backchannel attributes. maxreqs is the only one that matters, and
> track that in se_cb_highest_slot.

Client sends a create_session with cba_back_chan_attrs with max_reqs
of 16 -- stating that the client can handle 16 slots in its slot
table. Server currently doesn't do anything about reflecting back to
the client its session slot table. It blindly returns what the client
sent. Say NFSD_BC_SLOT_TABLE_MAX was 4. Server would never use more
than 4 slots and yet the client would have to create a reply cache
table for 16 slots. Isn't that poor sportsmanship on behalf of the
Linux server?

>
> --
> Jeff Layton <jlayton@kernel.org>
Jeff Layton Nov. 11, 2024, 6:17 p.m. UTC | #16
On Mon, 2024-11-11 at 12:56 -0500, Olga Kornievskaia wrote:
> On Mon, Nov 11, 2024 at 12:40 PM Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > On Mon, 2024-11-11 at 12:17 -0500, Olga Kornievskaia wrote:
> > > On Mon, Nov 11, 2024 at 9:56 AM Chuck Lever <chuck.lever@oracle.com> wrote:
> > > > 
> > > > On Mon, Nov 11, 2024 at 08:22:07AM -0500, Jeff Layton wrote:
> > > > > On Sun, 2024-11-10 at 21:19 -0500, Olga Kornievskaia wrote:
> > > > > > On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > 
> > > > > > > On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > > > > > > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > > 
> > > > > > > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > > > > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > > > > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > > > > > > > 
> > > > > > > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > > > > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > > > > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > > > > > > track which slots are in use, and a u32 to track the latest callback
> > > > > > > > > target_slotid that the client reports. To protect the new fields, add
> > > > > > > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > > > > > > search for the lowest slotid (using ffs()).
> > > > > > > > > 
> > > > > > > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > > > > > > counters and add the necessary handling to ensure that the seqids get
> > > > > > > > > reset at the appropriate times.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > > > > > > ---
> > > > > > > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > > > > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > > > > > > would be much appreciated.
> > > > > > > > > ---
> > > > > > > > > Changes in v4:
> > > > > > > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > > > > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > > > > > > > 
> > > > > > > > > Changes in v3:
> > > > > > > > > - add patch to convert se_flags to single se_dead bool
> > > > > > > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > > > > > > - don't reject target highest slot value of 0
> > > > > > > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > > > > > > > 
> > > > > > > > > Changes in v2:
> > > > > > > > > - take cl_lock when fetching fields from session to be encoded
> > > > > > > > > - use fls() instead of bespoke highest_unset_index()
> > > > > > > > > - rename variables in several functions with more descriptive names
> > > > > > > > > - clamp limit of for loop in update_cb_slot_table()
> > > > > > > > > - re-add missing rpc_wake_up_queued_task() call
> > > > > > > > > - fix slotid check in decode_cb_sequence4resok()
> > > > > > > > > - add new per-session spinlock
> > > > > > > > > ---
> > > > > > > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > > > > > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > > > > > > >  fs/nfsd/state.h        |  15 ++++---
> > > > > > > > >  fs/nfsd/trace.h        |   2 +-
> > > > > > > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > > > > > > > 
> > > > > > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > > > > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > > > > > > --- a/fs/nfsd/nfs4callback.c
> > > > > > > > > +++ b/fs/nfsd/nfs4callback.c
> > > > > > > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > > > > > > >         hdr->nops++;
> > > > > > > > >  }
> > > > > > > > > 
> > > > > > > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > > > > > > +{
> > > > > > > > > +       u32 idx;
> > > > > > > > > +
> > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > > > > > > +       if (idx > 0)
> > > > > > > > > +               --idx;
> > > > > > > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > +       return idx;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  /*
> > > > > > > > >   * CB_SEQUENCE4args
> > > > > > > > >   *
> > > > > > > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > > > > > > >         encode_sessionid4(xdr, session);
> > > > > > > > > 
> > > > > > > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > > > > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > > > > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > > > > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > > > > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > > > > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > > > > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > > > > > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > > > > > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > > > > > > > 
> > > > > > > > >         hdr->nops++;
> > > > > > > > >  }
> > > > > > > > > 
> > > > > > > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > > > > > > +{
> > > > > > > > > +       /* No need to do anything if nothing changed */
> > > > > > > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > > > > > > +               return;
> > > > > > > > > +
> > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > +       if (target > ses->se_cb_highest_slot) {
> > > > > > > > > +               int i;
> > > > > > > > > +
> > > > > > > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > > > +
> > > > > > > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > > > > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > > > > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > > > > > > +       }
> > > > > > > > > +       ses->se_cb_highest_slot = target;
> > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  /*
> > > > > > > > >   * CB_SEQUENCE4resok
> > > > > > > > >   *
> > > > > > > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > > > > > > >         int status = -ESERVERFAULT;
> > > > > > > > >         __be32 *p;
> > > > > > > > > -       u32 dummy;
> > > > > > > > > +       u32 seqid, slotid, target;
> > > > > > > > > 
> > > > > > > > >         /*
> > > > > > > > >          * If the server returns different values for sessionID, slotID or
> > > > > > > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > >         }
> > > > > > > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > > > > > > > 
> > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > > > > > > +       seqid = be32_to_cpup(p++);
> > > > > > > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > > > > > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > > > > > > >                 goto out;
> > > > > > > > >         }
> > > > > > > > > 
> > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > -       if (dummy != 0) {
> > > > > > > > > +       slotid = be32_to_cpup(p++);
> > > > > > > > > +       if (slotid != cb->cb_held_slot) {
> > > > > > > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > > > > > > >                 goto out;
> > > > > > > > >         }
> > > > > > > > > 
> > > > > > > > > -       /*
> > > > > > > > > -        * FIXME: process highest slotid and target highest slotid
> > > > > > > > > -        */
> > > > > > > > > +       p++; // ignore current highest slot value
> > > > > > > > > +
> > > > > > > > > +       target = be32_to_cpup(p++);
> > > > > > > > > +       update_cb_slot_table(session, target);
> > > > > > > > >         status = 0;
> > > > > > > > >  out:
> > > > > > > > >         cb->cb_seq_status = status;
> > > > > > > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > >         spin_unlock(&clp->cl_lock);
> > > > > > > > >  }
> > > > > > > > > 
> > > > > > > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > > > > > > +{
> > > > > > > > > +       int idx;
> > > > > > > > > +
> > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > > > > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > +               return -1;
> > > > > > > > > +       }
> > > > > > > > > +       /* clear the bit for the slot */
> > > > > > > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > +       return idx;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  /*
> > > > > > > > >   * There's currently a single callback channel slot.
> > > > > > > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > > > > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > > > > > > >  {
> > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > > 
> > > > > > > > > -       if (!cb->cb_holds_slot &&
> > > > > > > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > +       if (cb->cb_held_slot >= 0)
> > > > > > > > > +               return true;
> > > > > > > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > +       if (cb->cb_held_slot < 0) {
> > > > > > > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > > > > > > >                 /* Race breaker */
> > > > > > > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > > > > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > +               if (cb->cb_held_slot < 0)
> > > > > > > > >                         return false;
> > > > > > > > > -               }
> > > > > > > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > > > > > > >         }
> > > > > > > > > -       cb->cb_holds_slot = true;
> > > > > > > > >         return true;
> > > > > > > > >  }
> > > > > > > > > 
> > > > > > > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > > > > > > >  {
> > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > > 
> > > > > > > > > -       if (cb->cb_holds_slot) {
> > > > > > > > > -               cb->cb_holds_slot = false;
> > > > > > > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > > +       if (cb->cb_held_slot >= 0) {
> > > > > > > > > +               spin_lock(&ses->se_lock);
> > > > > > > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > +               cb->cb_held_slot = -1;
> > > > > > > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > > > > > > >         }
> > > > > > > > >  }
> > > > > > > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > > > > > > >  }
> > > > > > > > > 
> > > > > > > > >  /*
> > > > > > > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > > > > > > - * slots, and mark callback channel down on communication errors.
> > > > > > > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > > > > > > + * and mark callback channel down on communication errors.
> > > > > > > > >   */
> > > > > > > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > > > > > > >  {
> > > > > > > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > >                 return true;
> > > > > > > > >         }
> > > > > > > > > 
> > > > > > > > > -       if (!cb->cb_holds_slot)
> > > > > > > > > +       if (cb->cb_held_slot < 0)
> > > > > > > > >                 goto need_restart;
> > > > > > > > > 
> > > > > > > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > > > > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > > > > > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > > > > > > >                  */
> > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > >                 break;
> > > > > > > > >         case -ESERVERFAULT:
> > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > >                 ret = false;
> > > > > > > > >                 break;
> > > > > > > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > >         case -NFS4ERR_BADSLOT:
> > > > > > > > >                 goto retry_nowait;
> > > > > > > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > > > > > > -               if (session->se_cb_seq_nr != 1) {
> > > > > > > > > -                       session->se_cb_seq_nr = 1;
> > > > > > > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > > > > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > > > > > > >                         goto retry_nowait;
> > > > > > > > >                 }
> > > > > > > > >                 break;
> > > > > > > > >         default:
> > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > >         }
> > > > > > > > > -       nfsd41_cb_release_slot(cb);
> > > > > > > > > -
> > > > > > > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > > > > > > +       nfsd41_cb_release_slot(cb);
> > > > > > > > > 
> > > > > > > > >         if (RPC_SIGNALLED(task))
> > > > > > > > >                 goto need_restart;
> > > > > > > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > > > > > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > > > > > > >         cb->cb_status = 0;
> > > > > > > > >         cb->cb_need_restart = false;
> > > > > > > > > -       cb->cb_holds_slot = false;
> > > > > > > > > +       cb->cb_held_slot = -1;
> > > > > > > > >  }
> > > > > > > > > 
> > > > > > > > >  /**
> > > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > > >         }
> > > > > > > > > 
> > > > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > > > +       new->se_cb_slot_avail = ~0U;
> > > > > > > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > > > +       spin_lock_init(&new->se_lock);
> > > > > > > > >         return new;
> > > > > > > > >  out_free:
> > > > > > > > >         while (i--)
> > > > > > > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > > > > > > > 
> > > > > > > > >         INIT_LIST_HEAD(&new->se_conns);
> > > > > > > > > 
> > > > > > > > > -       new->se_cb_seq_nr = 1;
> > > > > > > > > +       atomic_set(&new->se_ref, 0);
> > > > > > > > >         new->se_dead = false;
> > > > > > > > >         new->se_cb_prog = cses->callback_prog;
> > > > > > > > >         new->se_cb_sec = cses->cb_sec;
> > > > > > > > > -       atomic_set(&new->se_ref, 0);
> > > > > > > > > +
> > > > > > > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > > > > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > > > > > > +
> > > > > > > > >         idx = hash_sessionid(&new->se_sessionid);
> > > > > > > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > > > > > > >         spin_lock(&clp->cl_lock);
> > > > > > > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > > > > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > > > > > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > > > > > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > > > > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > >         copy_verf(clp, verf);
> > > > > > > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > > > > > > >         clp->cl_cb_session = NULL;
> > > > > > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > > > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > > > > > > --- a/fs/nfsd/state.h
> > > > > > > > > +++ b/fs/nfsd/state.h
> > > > > > > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > > > > > > >         struct work_struct cb_work;
> > > > > > > > >         int cb_seq_status;
> > > > > > > > >         int cb_status;
> > > > > > > > > +       int cb_held_slot;
> > > > > > > > >         bool cb_need_restart;
> > > > > > > > > -       bool cb_holds_slot;
> > > > > > > > >  };
> > > > > > > > > 
> > > > > > > > >  struct nfsd4_callback_ops {
> > > > > > > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > > > > > > >         unsigned char cn_flags;
> > > > > > > > >  };
> > > > > > > > > 
> > > > > > > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > > > > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > > > > > 
> > > > > > > > Are there some values that are known not to work? I was experimenting
> > > > > > > > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > > > > > > > it's not a configurable value but it would still be good to know the
> > > > > > > > expectations...
> > > > > > > > 
> > > > > > > > [  198.625021] Unable to handle kernel paging request at virtual
> > > > > > > > address dfff800020000000
> > > > > > > > [  198.625870] KASAN: probably user-memory-access in range
> > > > > > > > [0x0000000100000000-0x0000000100000007]
> > > > > > > > [  198.626444] Mem abort info:
> > > > > > > > [  198.626630]   ESR = 0x0000000096000005
> > > > > > > > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > > > > > > > [  198.627234]   SET = 0, FnV = 0
> > > > > > > > [  198.627441]   EA = 0, S1PTW = 0
> > > > > > > > [  198.627627]   FSC = 0x05: level 1 translation fault
> > > > > > > > [  198.627859] Data abort info:
> > > > > > > > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > > > > > > > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > > > > > > > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > > > > > > > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > > > > > > > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > > > > > > > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > > > > > > > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > > > > > > > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > > > > > > > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > > > > > > > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > > > > > > > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > > > > > > > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > > > > > > > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > > > > > > > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > > > > > > > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > > > > > > > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > > > > > > > nvme_auth sr_mod cdrom e1000e sg fuse
> > > > > > > > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > > > > > > > tainted 6.12.0-rc6+ #47
> > > > > > > > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > > > > > > > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > > > > > > > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > > > > > > > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > > > > > > > [  198.636065] sp : ffff8000884977e0
> > > > > > > > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > > > > > > > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > > > > > > > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > > > > > > > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > > > > > > > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > > > > > > > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > > > > > > > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > > > > > > > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > > > > > > > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > > > > > > > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > > > > > > > [  198.640332] Call trace:
> > > > > > > > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > > > > > > > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > > > > > > > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > > > > > > > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > > > > > > > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > > > > > > > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > > > > > > > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > > > > > > > [  198.642562]  kthread+0x288/0x310
> > > > > > > > [  198.642745]  ret_from_fork+0x10/0x20
> > > > > > > > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > > > > > > > [  198.643267] SMP: stopping secondary CPUs
> > > > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > Good catch. I think the problem here is that we don't currently cap the
> > > > > > > initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> > > > > > > this patch prevent the panic?
> > > > > > > 
> > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > index 3afe56ab9e0a..839be4ba765a 100644
> > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > 
> > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > >         new->se_cb_slot_avail = ~0U;
> > > > > > > -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > >         spin_lock_init(&new->se_lock);
> > > > > > >         return new;
> > > > > > >  out_free:
> > > > > > 
> > > > > > It does help. I thought that the CREATE_SESSION reply for the
> > > > > > backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value but
> > > > > > instead it seems like it's not. But yes I can see that the highest
> > > > > > slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.
> > > > > 
> > > > > Thanks for testing it, Olga.
> > > > > 
> > > > > Chuck, would you be OK with folding the above delta into 9ab4c4077de9,
> > > > > or would you rather I resend the patch?
> > > > 
> > > > I've folded the above one-liner into the applied patch.
> > > > 
> > > > I agree with Tom, I think there's probably a (surprising)
> > > > explanation lurking for not seeing the expected performance
> > > > improvement. I can delay sending the NFSD v6.13 merge window pull
> > > > request for a bit to see if you can get it teased out.
> > > 
> > > I would like to raise a couple of issues:
> > > (1) I believe the server should be reporting back an accurate value
> > > for the backchannel session table size. I think if the
> > > NFSD_BC_SLOT_TABLE_MAX was way lower than the client's value then the
> > > client would be wasting resources for its bc session table?
> > 
> > Yes, but those resources are 32-bit integer per wasted slot. The Linux
> > client allows for up to 16 slots, so we're wasting 64 bytes per session
> > with this scheme with the Linux client. I didn't think it was worth
> > doing a separate allocation for that.
> > 
> > We could make NFSD_BC_SLOT_TABLE_MAX smaller though. Maybe we should
> > match the client's size and make it 15?
> > 
> > > ->back_channel->maxreqs gets decoded in nfsd4_decode_create_session()
> > > and is never adjusted for the reply to be based on the
> > > NFSD_BC_SLOT_TABLE_MAX. The problem is currently invisible because
> > > linux client's bc slot table size is 16 and nfsd's is higher.
> > > 
> > 
> > I'm not sure I understand the problem here. We don't care about most of
> > the backchannel attributes. maxreqs is the only one that matters, and
> > track that in se_cb_highest_slot.
> 
> Client sends a create_session with cba_back_chan_attrs with max_reqs
> of 16 -- stating that the client can handle 16 slots in it's slot
> table. Server currently doesn't do anything about reflecting back to
> the client its session slot table. It blindly returns what the client
> sent. Say NFSD_BC_SLOT_TABLE_MAX was 4. Server would never use more
> than 4 slots and yet the client would have to create a reply cache
> table for 16 slots. Isn't that poor sportsmanship on behalf of the
> linux server?
> 
> 

Thanks, that does sound like a bug. I think we can fix that with
another one-liner.  When we allocate the new session, update the
back_channel attrs in the request with the correct maxreqs. Thoughts?

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 15438826ed5b..c35d8fc2f693 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3885,6 +3885,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel);
 	if (!new)
 		goto out_release_drc_mem;
+	cr_ses->back_channel.maxreqs = new->se_cb_highest_slot;
 	conn = alloc_conn_from_crses(rqstp, cr_ses);
 	if (!conn)
 		goto out_free_session;
Jeff Layton Nov. 11, 2024, 6:27 p.m. UTC | #17
On Mon, 2024-11-11 at 13:17 -0500, Jeff Layton wrote:
> On Mon, 2024-11-11 at 12:56 -0500, Olga Kornievskaia wrote:
> > On Mon, Nov 11, 2024 at 12:40 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > 
> > > On Mon, 2024-11-11 at 12:17 -0500, Olga Kornievskaia wrote:
> > > > On Mon, Nov 11, 2024 at 9:56 AM Chuck Lever <chuck.lever@oracle.com> wrote:
> > > > > 
> > > > > On Mon, Nov 11, 2024 at 08:22:07AM -0500, Jeff Layton wrote:
> > > > > > On Sun, 2024-11-10 at 21:19 -0500, Olga Kornievskaia wrote:
> > > > > > > On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > 
> > > > > > > > On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > > > > > > > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > > > 
> > > > > > > > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > > > > > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > > > > > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > > > > > > > > 
> > > > > > > > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > > > > > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > > > > > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > > > > > > > track which slots are in use, and a u32 to track the latest callback
> > > > > > > > > > target_slotid that the client reports. To protect the new fields, add
> > > > > > > > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > > > > > > > search for the lowest slotid (using ffs()).
> > > > > > > > > > 
> > > > > > > > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > > > > > > > counters and add the necessary handling to ensure that the seqids get
> > > > > > > > > > reset at the appropriate times.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > > > > > > > ---
> > > > > > > > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > > > > > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > > > > > > > would be much appreciated.
> > > > > > > > > > ---
> > > > > > > > > > Changes in v4:
> > > > > > > > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > > > > > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > > > > > > > > 
> > > > > > > > > > Changes in v3:
> > > > > > > > > > - add patch to convert se_flags to single se_dead bool
> > > > > > > > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > > > > > > > - don't reject target highest slot value of 0
> > > > > > > > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > > > > > > > > 
> > > > > > > > > > Changes in v2:
> > > > > > > > > > - take cl_lock when fetching fields from session to be encoded
> > > > > > > > > > - use fls() instead of bespoke highest_unset_index()
> > > > > > > > > > - rename variables in several functions with more descriptive names
> > > > > > > > > > - clamp limit of for loop in update_cb_slot_table()
> > > > > > > > > > - re-add missing rpc_wake_up_queued_task() call
> > > > > > > > > > - fix slotid check in decode_cb_sequence4resok()
> > > > > > > > > > - add new per-session spinlock
> > > > > > > > > > ---
> > > > > > > > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > > > > > > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > > > > > > > >  fs/nfsd/state.h        |  15 ++++---
> > > > > > > > > >  fs/nfsd/trace.h        |   2 +-
> > > > > > > > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > > > > > > > > 
> > > > > > > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > > > > > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > > > > > > > --- a/fs/nfsd/nfs4callback.c
> > > > > > > > > > +++ b/fs/nfsd/nfs4callback.c
> > > > > > > > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > > > > > > > >         hdr->nops++;
> > > > > > > > > >  }
> > > > > > > > > > 
> > > > > > > > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > > > > > > > +{
> > > > > > > > > > +       u32 idx;
> > > > > > > > > > +
> > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > > > > > > > +       if (idx > 0)
> > > > > > > > > > +               --idx;
> > > > > > > > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > +       return idx;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >  /*
> > > > > > > > > >   * CB_SEQUENCE4args
> > > > > > > > > >   *
> > > > > > > > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > > > > > > > >         encode_sessionid4(xdr, session);
> > > > > > > > > > 
> > > > > > > > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > > > > > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > > > > > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > > > > > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > > > > > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > > > > > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > > > > > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > > > > > > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > > > > > > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > > > > > > > > 
> > > > > > > > > >         hdr->nops++;
> > > > > > > > > >  }
> > > > > > > > > > 
> > > > > > > > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > > > > > > > +{
> > > > > > > > > > +       /* No need to do anything if nothing changed */
> > > > > > > > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > > > > > > > +               return;
> > > > > > > > > > +
> > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > +       if (target > ses->se_cb_highest_slot) {
> > > > > > > > > > +               int i;
> > > > > > > > > > +
> > > > > > > > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > > > > +
> > > > > > > > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > > > > > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > > > > > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > > > > > > > +       }
> > > > > > > > > > +       ses->se_cb_highest_slot = target;
> > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >  /*
> > > > > > > > > >   * CB_SEQUENCE4resok
> > > > > > > > > >   *
> > > > > > > > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > > > > > > > >         int status = -ESERVERFAULT;
> > > > > > > > > >         __be32 *p;
> > > > > > > > > > -       u32 dummy;
> > > > > > > > > > +       u32 seqid, slotid, target;
> > > > > > > > > > 
> > > > > > > > > >         /*
> > > > > > > > > >          * If the server returns different values for sessionID, slotID or
> > > > > > > > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > > >         }
> > > > > > > > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > > > > > > > > 
> > > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > > > > > > > +       seqid = be32_to_cpup(p++);
> > > > > > > > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > > > > > > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > > > > > > > >                 goto out;
> > > > > > > > > >         }
> > > > > > > > > > 
> > > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > > -       if (dummy != 0) {
> > > > > > > > > > +       slotid = be32_to_cpup(p++);
> > > > > > > > > > +       if (slotid != cb->cb_held_slot) {
> > > > > > > > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > > > > > > > >                 goto out;
> > > > > > > > > >         }
> > > > > > > > > > 
> > > > > > > > > > -       /*
> > > > > > > > > > -        * FIXME: process highest slotid and target highest slotid
> > > > > > > > > > -        */
> > > > > > > > > > +       p++; // ignore current highest slot value
> > > > > > > > > > +
> > > > > > > > > > +       target = be32_to_cpup(p++);
> > > > > > > > > > +       update_cb_slot_table(session, target);
> > > > > > > > > >         status = 0;
> > > > > > > > > >  out:
> > > > > > > > > >         cb->cb_seq_status = status;
> > > > > > > > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > > >         spin_unlock(&clp->cl_lock);
> > > > > > > > > >  }
> > > > > > > > > > 
> > > > > > > > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > > > > > > > +{
> > > > > > > > > > +       int idx;
> > > > > > > > > > +
> > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > > > > > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > > +               return -1;
> > > > > > > > > > +       }
> > > > > > > > > > +       /* clear the bit for the slot */
> > > > > > > > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > +       return idx;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >  /*
> > > > > > > > > >   * There's currently a single callback channel slot.
> > > > > > > > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > > > > > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > > > > > > > >  {
> > > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > > > 
> > > > > > > > > > -       if (!cb->cb_holds_slot &&
> > > > > > > > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > > +       if (cb->cb_held_slot >= 0)
> > > > > > > > > > +               return true;
> > > > > > > > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > > +       if (cb->cb_held_slot < 0) {
> > > > > > > > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > > > > > > > >                 /* Race breaker */
> > > > > > > > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > > > > > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > > +               if (cb->cb_held_slot < 0)
> > > > > > > > > >                         return false;
> > > > > > > > > > -               }
> > > > > > > > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > > > > > > > >         }
> > > > > > > > > > -       cb->cb_holds_slot = true;
> > > > > > > > > >         return true;
> > > > > > > > > >  }
> > > > > > > > > > 
> > > > > > > > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > > > > > > > >  {
> > > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > > > 
> > > > > > > > > > -       if (cb->cb_holds_slot) {
> > > > > > > > > > -               cb->cb_holds_slot = false;
> > > > > > > > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > > > +       if (cb->cb_held_slot >= 0) {
> > > > > > > > > > +               spin_lock(&ses->se_lock);
> > > > > > > > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > > +               cb->cb_held_slot = -1;
> > > > > > > > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > > > > > > > >         }
> > > > > > > > > >  }
> > > > > > > > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > > > > > > > >  }
> > > > > > > > > > 
> > > > > > > > > >  /*
> > > > > > > > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > > > > > > > - * slots, and mark callback channel down on communication errors.
> > > > > > > > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > > > > > > > + * and mark callback channel down on communication errors.
> > > > > > > > > >   */
> > > > > > > > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > > > > > > > >  {
> > > > > > > > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > >                 return true;
> > > > > > > > > >         }
> > > > > > > > > > 
> > > > > > > > > > -       if (!cb->cb_holds_slot)
> > > > > > > > > > +       if (cb->cb_held_slot < 0)
> > > > > > > > > >                 goto need_restart;
> > > > > > > > > > 
> > > > > > > > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > > > > > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > > > > > > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > > > > > > > >                  */
> > > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > > >                 break;
> > > > > > > > > >         case -ESERVERFAULT:
> > > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > > >                 ret = false;
> > > > > > > > > >                 break;
> > > > > > > > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > >         case -NFS4ERR_BADSLOT:
> > > > > > > > > >                 goto retry_nowait;
> > > > > > > > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > > > > > > > -               if (session->se_cb_seq_nr != 1) {
> > > > > > > > > > -                       session->se_cb_seq_nr = 1;
> > > > > > > > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > > > > > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > > > > > > > >                         goto retry_nowait;
> > > > > > > > > >                 }
> > > > > > > > > >                 break;
> > > > > > > > > >         default:
> > > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > > >         }
> > > > > > > > > > -       nfsd41_cb_release_slot(cb);
> > > > > > > > > > -
> > > > > > > > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > > > > > > > +       nfsd41_cb_release_slot(cb);
> > > > > > > > > > 
> > > > > > > > > >         if (RPC_SIGNALLED(task))
> > > > > > > > > >                 goto need_restart;
> > > > > > > > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > > > > > > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > > > > > > > >         cb->cb_status = 0;
> > > > > > > > > >         cb->cb_need_restart = false;
> > > > > > > > > > -       cb->cb_holds_slot = false;
> > > > > > > > > > +       cb->cb_held_slot = -1;
> > > > > > > > > >  }
> > > > > > > > > > 
> > > > > > > > > >  /**
> > > > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > > > >         }
> > > > > > > > > > 
> > > > > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > > > > +       new->se_cb_slot_avail = ~0U;
> > > > > > > > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > > > > +       spin_lock_init(&new->se_lock);
> > > > > > > > > >         return new;
> > > > > > > > > >  out_free:
> > > > > > > > > >         while (i--)
> > > > > > > > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > > > > > > > > 
> > > > > > > > > >         INIT_LIST_HEAD(&new->se_conns);
> > > > > > > > > > 
> > > > > > > > > > -       new->se_cb_seq_nr = 1;
> > > > > > > > > > +       atomic_set(&new->se_ref, 0);
> > > > > > > > > >         new->se_dead = false;
> > > > > > > > > >         new->se_cb_prog = cses->callback_prog;
> > > > > > > > > >         new->se_cb_sec = cses->cb_sec;
> > > > > > > > > > -       atomic_set(&new->se_ref, 0);
> > > > > > > > > > +
> > > > > > > > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > > > > > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > > > > > > > +
> > > > > > > > > >         idx = hash_sessionid(&new->se_sessionid);
> > > > > > > > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > > > > > > > >         spin_lock(&clp->cl_lock);
> > > > > > > > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > > > > > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > > > > > > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > > > > > > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > > > > > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > > >         copy_verf(clp, verf);
> > > > > > > > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > > > > > > > >         clp->cl_cb_session = NULL;
> > > > > > > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > > > > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > > > > > > > --- a/fs/nfsd/state.h
> > > > > > > > > > +++ b/fs/nfsd/state.h
> > > > > > > > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > > > > > > > >         struct work_struct cb_work;
> > > > > > > > > >         int cb_seq_status;
> > > > > > > > > >         int cb_status;
> > > > > > > > > > +       int cb_held_slot;
> > > > > > > > > >         bool cb_need_restart;
> > > > > > > > > > -       bool cb_holds_slot;
> > > > > > > > > >  };
> > > > > > > > > > 
> > > > > > > > > >  struct nfsd4_callback_ops {
> > > > > > > > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > > > > > > > >         unsigned char cn_flags;
> > > > > > > > > >  };
> > > > > > > > > > 
> > > > > > > > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > > > > > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > > > > > > 
> > > > > > > > > Are there some values that are known not to work? I was experimenting
> > > > > > > > > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > > > > > > > > it's not a configurable value but it would still be good to know the
> > > > > > > > > expectations...
> > > > > > > > > 
> > > > > > > > > [  198.625021] Unable to handle kernel paging request at virtual
> > > > > > > > > address dfff800020000000
> > > > > > > > > [  198.625870] KASAN: probably user-memory-access in range
> > > > > > > > > [0x0000000100000000-0x0000000100000007]
> > > > > > > > > [  198.626444] Mem abort info:
> > > > > > > > > [  198.626630]   ESR = 0x0000000096000005
> > > > > > > > > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > > > > > > > > [  198.627234]   SET = 0, FnV = 0
> > > > > > > > > [  198.627441]   EA = 0, S1PTW = 0
> > > > > > > > > [  198.627627]   FSC = 0x05: level 1 translation fault
> > > > > > > > > [  198.627859] Data abort info:
> > > > > > > > > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > > > > > > > > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > > > > > > > > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > > > > > > > > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > > > > > > > > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > > > > > > > > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > > > > > > > > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > > > > > > > > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > > > > > > > > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > > > > > > > > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > > > > > > > > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > > > > > > > > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > > > > > > > > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > > > > > > > > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > > > > > > > > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > > > > > > > > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > > > > > > > > nvme_auth sr_mod cdrom e1000e sg fuse
> > > > > > > > > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > > > > > > > > tainted 6.12.0-rc6+ #47
> > > > > > > > > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > > > > > > > > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > > > > > > > > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > > > > > > > > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > > > > > > > > [  198.636065] sp : ffff8000884977e0
> > > > > > > > > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > > > > > > > > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > > > > > > > > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > > > > > > > > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > > > > > > > > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > > > > > > > > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > > > > > > > > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > > > > > > > > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > > > > > > > > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > > > > > > > > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > > > > > > > > [  198.640332] Call trace:
> > > > > > > > > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > > > > > > > > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > > > > > > > > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > > > > > > > > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > > > > > > > > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > > > > > > > > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > > > > > > > > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > > > > > > > > [  198.642562]  kthread+0x288/0x310
> > > > > > > > > [  198.642745]  ret_from_fork+0x10/0x20
> > > > > > > > > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > > > > > > > > [  198.643267] SMP: stopping secondary CPUs
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Good catch. I think the problem here is that we don't currently cap the
> > > > > > > > initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> > > > > > > > this patch prevent the panic?
> > > > > > > > 
> > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > index 3afe56ab9e0a..839be4ba765a 100644
> > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > > 
> > > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > >         new->se_cb_slot_avail = ~0U;
> > > > > > > > -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > > +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > >         spin_lock_init(&new->se_lock);
> > > > > > > >         return new;
> > > > > > > >  out_free:
> > > > > > > 
> > > > > > > It does help. I thought that the CREATE_SESSION reply for the
> > > > > > > backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value but
> > > > > > > instead it seems like it's not. But yes I can see that the highest
> > > > > > > slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.
> > > > > > 
> > > > > > Thanks for testing it, Olga.
> > > > > > 
> > > > > > Chuck, would you be OK with folding the above delta into 9ab4c4077de9,
> > > > > > or would you rather I resend the patch?
> > > > > 
> > > > > I've folded the above one-liner into the applied patch.
> > > > > 
> > > > > I agree with Tom, I think there's probably a (surprising)
> > > > > explanation lurking for not seeing the expected performance
> > > > > improvement. I can delay sending the NFSD v6.13 merge window pull
> > > > > request for a bit to see if you can get it teased out.
> > > > 
> > > > I would like to raise a couple of issues:
> > > > (1) I believe the server should be reporting back an accurate value
> > > > for the backchannel session table size. I think if the
> > > > NFSD_BC_SLOT_TABLE_MAX was way lower than the client's value then the
> > > > client would be wasting resources for its bc session table?
> > > 
> > > > Yes, but those resources are one 32-bit integer per wasted slot. The Linux
> > > client allows for up to 16 slots, so we're wasting 64 bytes per session
> > > with this scheme with the Linux client. I didn't think it was worth
> > > doing a separate allocation for that.
> > > 
> > > We could make NFSD_BC_SLOT_TABLE_MAX smaller though. Maybe we should
> > > match the client's size and make it 15?
> > > 
> > > > ->back_channel->maxreqs gets decoded in nfsd4_decode_create_session()
> > > > and is never adjusted for the reply to be based on the
> > > > NFSD_BC_SLOT_TABLE_MAX. The problem is currently invisible because
> > > > linux client's bc slot table size is 16 and nfsd's is higher.
> > > > 
> > > 
> > > I'm not sure I understand the problem here. We don't care about most of
> > > > the backchannel attributes. maxreqs is the only one that matters, and
> > > > we track that in se_cb_highest_slot.
> > 
> > Client sends a create_session with cba_back_chan_attrs with max_reqs
> > of 16 -- stating that the client can handle 16 slots in its slot
> > table. Server currently doesn't do anything about reflecting back to
> > the client its session slot table. It blindly returns what the client
> > sent. Say NFSD_BC_SLOT_TABLE_MAX was 4. Server would never use more
> > than 4 slots and yet the client would have to create a reply cache
> > table for 16 slots. Isn't that poor sportsmanship on behalf of the
> > linux server?
> > 
> > 
> 
> Thanks, that does sound like a bug. I think we can fix that with
> another one-liner.  When we allocate the new session, update the
> back_channel attrs in the request with the correct maxreqs. Thoughts?
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 15438826ed5b..c35d8fc2f693 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -3885,6 +3885,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
>  	new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel);
>  	if (!new)
>  		goto out_release_drc_mem;
> +	cr_ses->back_channel.maxreqs = new->se_cb_highest_slot;
>  	conn = alloc_conn_from_crses(rqstp, cr_ses);
>  	if (!conn)
>  		goto out_free_session;


Actually, I think this is better, since we're already modifying things
in this section of the code. Also the earlier patch was off-by-one:

------------------------8<----------------------

[PATCH] SQUASH: report the correct number of backchannel slots to client

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 15438826ed5b..cfc2190ffce5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3955,6 +3955,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	cr_ses->flags &= ~SESSION4_PERSIST;
 	/* Upshifting from TCP to RDMA is not supported */
 	cr_ses->flags &= ~SESSION4_RDMA;
+	/* Report the correct number of backchannel slots */
+	cr_ses->back_channel.maxreqs = new->se_cb_highest_slot + 1;
 
 	init_session(rqstp, new, conf, cr_ses);
 	nfsd4_get_session_locked(new);
Chuck Lever III Nov. 11, 2024, 6:46 p.m. UTC | #18
On Mon, Nov 11, 2024 at 01:27:39PM -0500, Jeff Layton wrote:
> On Mon, 2024-11-11 at 13:17 -0500, Jeff Layton wrote:
> > On Mon, 2024-11-11 at 12:56 -0500, Olga Kornievskaia wrote:
> > > On Mon, Nov 11, 2024 at 12:40 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > 
> > > > On Mon, 2024-11-11 at 12:17 -0500, Olga Kornievskaia wrote:
> > > > > On Mon, Nov 11, 2024 at 9:56 AM Chuck Lever <chuck.lever@oracle.com> wrote:
> > > > > > 
> > > > > > On Mon, Nov 11, 2024 at 08:22:07AM -0500, Jeff Layton wrote:
> > > > > > > On Sun, 2024-11-10 at 21:19 -0500, Olga Kornievskaia wrote:
> > > > > > > > On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > > 
> > > > > > > > > On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > > > > > > > > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > > > > 
> > > > > > > > > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > > > > > > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > > > > > > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > > > > > > > > > 
> > > > > > > > > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > > > > > > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > > > > > > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > > > > > > > > track which slots are in use, and a u32 to track the latest callback
> > > > > > > > > > > target_slotid that the client reports. To protect the new fields, add
> > > > > > > > > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > > > > > > > > search for the lowest slotid (using ffs()).
> > > > > > > > > > > 
> > > > > > > > > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > > > > > > > > counters and add the necessary handling to ensure that the seqids get
> > > > > > > > > > > reset at the appropriate times.
> > > > > > > > > > > 
> > > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > > > > > > > > ---
> > > > > > > > > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > > > > > > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > > > > > > > > would be much appreciated.
> > > > > > > > > > > ---
> > > > > > > > > > > Changes in v4:
> > > > > > > > > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > > > > > > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > > > > > > > > > 
> > > > > > > > > > > Changes in v3:
> > > > > > > > > > > - add patch to convert se_flags to single se_dead bool
> > > > > > > > > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > > > > > > > > - don't reject target highest slot value of 0
> > > > > > > > > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > > > > > > > > > 
> > > > > > > > > > > Changes in v2:
> > > > > > > > > > > - take cl_lock when fetching fields from session to be encoded
> > > > > > > > > > > - use fls() instead of bespoke highest_unset_index()
> > > > > > > > > > > - rename variables in several functions with more descriptive names
> > > > > > > > > > > - clamp limit of for loop in update_cb_slot_table()
> > > > > > > > > > > - re-add missing rpc_wake_up_queued_task() call
> > > > > > > > > > > - fix slotid check in decode_cb_sequence4resok()
> > > > > > > > > > > - add new per-session spinlock
> > > > > > > > > > > ---
> > > > > > > > > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > > > > > > > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > > > > > > > > >  fs/nfsd/state.h        |  15 ++++---
> > > > > > > > > > >  fs/nfsd/trace.h        |   2 +-
> > > > > > > > > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > > > > > > > > > 
> > > > > > > > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > > > > > > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > > > > > > > > --- a/fs/nfsd/nfs4callback.c
> > > > > > > > > > > +++ b/fs/nfsd/nfs4callback.c
> > > > > > > > > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > > > > > > > > >         hdr->nops++;
> > > > > > > > > > >  }
> > > > > > > > > > > 
> > > > > > > > > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > > > > > > > > +{
> > > > > > > > > > > +       u32 idx;
> > > > > > > > > > > +
> > > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > > > > > > > > +       if (idx > 0)
> > > > > > > > > > > +               --idx;
> > > > > > > > > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > > +       return idx;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  /*
> > > > > > > > > > >   * CB_SEQUENCE4args
> > > > > > > > > > >   *
> > > > > > > > > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > > > > > > > > >         encode_sessionid4(xdr, session);
> > > > > > > > > > > 
> > > > > > > > > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > > > > > > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > > > > > > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > > > > > > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > > > > > > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > > > > > > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > > > > > > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > > > > > > > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > > > > > > > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > > > > > > > > > 
> > > > > > > > > > >         hdr->nops++;
> > > > > > > > > > >  }
> > > > > > > > > > > 
> > > > > > > > > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > > > > > > > > +{
> > > > > > > > > > > +       /* No need to do anything if nothing changed */
> > > > > > > > > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > > > > > > > > +               return;
> > > > > > > > > > > +
> > > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > > +       if (target > ses->se_cb_highest_slot) {
> > > > > > > > > > > +               int i;
> > > > > > > > > > > +
> > > > > > > > > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > > > > > +
> > > > > > > > > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > > > > > > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > > > > > > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > > > > > > > > +       }
> > > > > > > > > > > +       ses->se_cb_highest_slot = target;
> > > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  /*
> > > > > > > > > > >   * CB_SEQUENCE4resok
> > > > > > > > > > >   *
> > > > > > > > > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > > > > > > > > >         int status = -ESERVERFAULT;
> > > > > > > > > > >         __be32 *p;
> > > > > > > > > > > -       u32 dummy;
> > > > > > > > > > > +       u32 seqid, slotid, target;
> > > > > > > > > > > 
> > > > > > > > > > >         /*
> > > > > > > > > > >          * If the server returns different values for sessionID, slotID or
> > > > > > > > > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > > > >         }
> > > > > > > > > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > > > > > > > > > 
> > > > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > > > > > > > > +       seqid = be32_to_cpup(p++);
> > > > > > > > > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > > > > > > > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > > > > > > > > >                 goto out;
> > > > > > > > > > >         }
> > > > > > > > > > > 
> > > > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > > > -       if (dummy != 0) {
> > > > > > > > > > > +       slotid = be32_to_cpup(p++);
> > > > > > > > > > > +       if (slotid != cb->cb_held_slot) {
> > > > > > > > > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > > > > > > > > >                 goto out;
> > > > > > > > > > >         }
> > > > > > > > > > > 
> > > > > > > > > > > -       /*
> > > > > > > > > > > -        * FIXME: process highest slotid and target highest slotid
> > > > > > > > > > > -        */
> > > > > > > > > > > +       p++; // ignore current highest slot value
> > > > > > > > > > > +
> > > > > > > > > > > +       target = be32_to_cpup(p++);
> > > > > > > > > > > +       update_cb_slot_table(session, target);
> > > > > > > > > > >         status = 0;
> > > > > > > > > > >  out:
> > > > > > > > > > >         cb->cb_seq_status = status;
> > > > > > > > > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > > > >         spin_unlock(&clp->cl_lock);
> > > > > > > > > > >  }
> > > > > > > > > > > 
> > > > > > > > > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > > > > > > > > +{
> > > > > > > > > > > +       int idx;
> > > > > > > > > > > +
> > > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > > > > > > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > > > +               return -1;
> > > > > > > > > > > +       }
> > > > > > > > > > > +       /* clear the bit for the slot */
> > > > > > > > > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > > +       return idx;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  /*
> > > > > > > > > > >   * There's currently a single callback channel slot.
> > > > > > > > > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > > > > > > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > > > > > > > > >  {
> > > > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > > > > 
> > > > > > > > > > > -       if (!cb->cb_holds_slot &&
> > > > > > > > > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > > > +       if (cb->cb_held_slot >= 0)
> > > > > > > > > > > +               return true;
> > > > > > > > > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > > > +       if (cb->cb_held_slot < 0) {
> > > > > > > > > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > > > > > > > > >                 /* Race breaker */
> > > > > > > > > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > > > > > > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > > > +               if (cb->cb_held_slot < 0)
> > > > > > > > > > >                         return false;
> > > > > > > > > > > -               }
> > > > > > > > > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > > > > > > > > >         }
> > > > > > > > > > > -       cb->cb_holds_slot = true;
> > > > > > > > > > >         return true;
> > > > > > > > > > >  }
> > > > > > > > > > > 
> > > > > > > > > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > > > > > > > > >  {
> > > > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > > > > 
> > > > > > > > > > > -       if (cb->cb_holds_slot) {
> > > > > > > > > > > -               cb->cb_holds_slot = false;
> > > > > > > > > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > > > > +       if (cb->cb_held_slot >= 0) {
> > > > > > > > > > > +               spin_lock(&ses->se_lock);
> > > > > > > > > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > > > +               cb->cb_held_slot = -1;
> > > > > > > > > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > > > > > > > > >         }
> > > > > > > > > > >  }
> > > > > > > > > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > > > > > > > > >  }
> > > > > > > > > > > 
> > > > > > > > > > >  /*
> > > > > > > > > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > > > > > > > > - * slots, and mark callback channel down on communication errors.
> > > > > > > > > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > > > > > > > > + * and mark callback channel down on communication errors.
> > > > > > > > > > >   */
> > > > > > > > > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > > > > > > > > >  {
> > > > > > > > > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > > >                 return true;
> > > > > > > > > > >         }
> > > > > > > > > > > 
> > > > > > > > > > > -       if (!cb->cb_holds_slot)
> > > > > > > > > > > +       if (cb->cb_held_slot < 0)
> > > > > > > > > > >                 goto need_restart;
> > > > > > > > > > > 
> > > > > > > > > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > > > > > > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > > > > > > > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > > > > > > > > >                  */
> > > > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > > > >                 break;
> > > > > > > > > > >         case -ESERVERFAULT:
> > > > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > > > >                 ret = false;
> > > > > > > > > > >                 break;
> > > > > > > > > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > > >         case -NFS4ERR_BADSLOT:
> > > > > > > > > > >                 goto retry_nowait;
> > > > > > > > > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > > > > > > > > -               if (session->se_cb_seq_nr != 1) {
> > > > > > > > > > > -                       session->se_cb_seq_nr = 1;
> > > > > > > > > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > > > > > > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > > > > > > > > >                         goto retry_nowait;
> > > > > > > > > > >                 }
> > > > > > > > > > >                 break;
> > > > > > > > > > >         default:
> > > > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > > > >         }
> > > > > > > > > > > -       nfsd41_cb_release_slot(cb);
> > > > > > > > > > > -
> > > > > > > > > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > > > > > > > > +       nfsd41_cb_release_slot(cb);
> > > > > > > > > > > 
> > > > > > > > > > >         if (RPC_SIGNALLED(task))
> > > > > > > > > > >                 goto need_restart;
> > > > > > > > > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > > > > > > > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > > > > > > > > >         cb->cb_status = 0;
> > > > > > > > > > >         cb->cb_need_restart = false;
> > > > > > > > > > > -       cb->cb_holds_slot = false;
> > > > > > > > > > > +       cb->cb_held_slot = -1;
> > > > > > > > > > >  }
> > > > > > > > > > > 
> > > > > > > > > > >  /**
> > > > > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > > > > >         }
> > > > > > > > > > > 
> > > > > > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > > > > > +       new->se_cb_slot_avail = ~0U;
> > > > > > > > > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > > > > > +       spin_lock_init(&new->se_lock);
> > > > > > > > > > >         return new;
> > > > > > > > > > >  out_free:
> > > > > > > > > > >         while (i--)
> > > > > > > > > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > > > > > > > > > 
> > > > > > > > > > >         INIT_LIST_HEAD(&new->se_conns);
> > > > > > > > > > > 
> > > > > > > > > > > -       new->se_cb_seq_nr = 1;
> > > > > > > > > > > +       atomic_set(&new->se_ref, 0);
> > > > > > > > > > >         new->se_dead = false;
> > > > > > > > > > >         new->se_cb_prog = cses->callback_prog;
> > > > > > > > > > >         new->se_cb_sec = cses->cb_sec;
> > > > > > > > > > > -       atomic_set(&new->se_ref, 0);
> > > > > > > > > > > +
> > > > > > > > > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > > > > > > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > > > > > > > > +
> > > > > > > > > > >         idx = hash_sessionid(&new->se_sessionid);
> > > > > > > > > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > > > > > > > > >         spin_lock(&clp->cl_lock);
> > > > > > > > > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > > > > > > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > > > > > > > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > > > > > > > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > > > > > > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > > > >         copy_verf(clp, verf);
> > > > > > > > > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > > > > > > > > >         clp->cl_cb_session = NULL;
> > > > > > > > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > > > > > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > > > > > > > > --- a/fs/nfsd/state.h
> > > > > > > > > > > +++ b/fs/nfsd/state.h
> > > > > > > > > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > > > > > > > > >         struct work_struct cb_work;
> > > > > > > > > > >         int cb_seq_status;
> > > > > > > > > > >         int cb_status;
> > > > > > > > > > > +       int cb_held_slot;
> > > > > > > > > > >         bool cb_need_restart;
> > > > > > > > > > > -       bool cb_holds_slot;
> > > > > > > > > > >  };
> > > > > > > > > > > 
> > > > > > > > > > >  struct nfsd4_callback_ops {
> > > > > > > > > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > > > > > > > > >         unsigned char cn_flags;
> > > > > > > > > > >  };
> > > > > > > > > > > 
> > > > > > > > > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > > > > > > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > > > > > > > 
> > > > > > > > > > Are there some values that are known not to work? I was experimenting
> > > > > > > > > > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > > > > > > > > > it's not a configurable value but it would still be good to know the
> > > > > > > > > > expectations...
> > > > > > > > > > 
> > > > > > > > > > [  198.625021] Unable to handle kernel paging request at virtual
> > > > > > > > > > address dfff800020000000
> > > > > > > > > > [  198.625870] KASAN: probably user-memory-access in range
> > > > > > > > > > [0x0000000100000000-0x0000000100000007]
> > > > > > > > > > [  198.626444] Mem abort info:
> > > > > > > > > > [  198.626630]   ESR = 0x0000000096000005
> > > > > > > > > > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > > > > > > > > > [  198.627234]   SET = 0, FnV = 0
> > > > > > > > > > [  198.627441]   EA = 0, S1PTW = 0
> > > > > > > > > > [  198.627627]   FSC = 0x05: level 1 translation fault
> > > > > > > > > > [  198.627859] Data abort info:
> > > > > > > > > > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > > > > > > > > > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > > > > > > > > > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > > > > > > > > > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > > > > > > > > > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > > > > > > > > > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > > > > > > > > > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > > > > > > > > > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > > > > > > > > > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > > > > > > > > > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > > > > > > > > > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > > > > > > > > > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > > > > > > > > > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > > > > > > > > > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > > > > > > > > > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > > > > > > > > > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > > > > > > > > > nvme_auth sr_mod cdrom e1000e sg fuse
> > > > > > > > > > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > > > > > > > > > tainted 6.12.0-rc6+ #47
> > > > > > > > > > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > > > > > > > > > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > > > > > > > > > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > > > > > > > > > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > > > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > > > > > > > > > [  198.636065] sp : ffff8000884977e0
> > > > > > > > > > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > > > > > > > > > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > > > > > > > > > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > > > > > > > > > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > > > > > > > > > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > > > > > > > > > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > > > > > > > > > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > > > > > > > > > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > > > > > > > > > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > > > > > > > > > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > > > > > > > > > [  198.640332] Call trace:
> > > > > > > > > > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > > > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > > > > > > > > > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > > > > > > > > > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > > > > > > > > > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > > > > > > > > > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > > > > > > > > > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > > > > > > > > > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > > > > > > > > > [  198.642562]  kthread+0x288/0x310
> > > > > > > > > > [  198.642745]  ret_from_fork+0x10/0x20
> > > > > > > > > > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > > > > > > > > > [  198.643267] SMP: stopping secondary CPUs
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Good catch. I think the problem here is that we don't currently cap the
> > > > > > > > > initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> > > > > > > > > this patch prevent the panic?
> > > > > > > > > 
> > > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > > index 3afe56ab9e0a..839be4ba765a 100644
> > > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > > @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > > > 
> > > > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > > >         new->se_cb_slot_avail = ~0U;
> > > > > > > > > -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > > > +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > > >         spin_lock_init(&new->se_lock);
> > > > > > > > >         return new;
> > > > > > > > >  out_free:
> > > > > > > > 
> > > > > > > > It does help. I thought that the CREATE_SESSION reply for the
> > > > > > > > backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value but
> > > > > > > > instead it seems like it's not. But yes I can see that the highest
> > > > > > > > slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.
> > > > > > > 
> > > > > > > Thanks for testing it, Olga.
> > > > > > > 
> > > > > > > Chuck, would you be OK with folding the above delta into 9ab4c4077de9,
> > > > > > > or would you rather I resend the patch?
> > > > > > 
> > > > > > I've folded the above one-liner into the applied patch.
> > > > > > 
> > > > > > I agree with Tom, I think there's probably a (surprising)
> > > > > > explanation lurking for not seeing the expected performance
> > > > > > improvement. I can delay sending the NFSD v6.13 merge window pull
> > > > > > request for a bit to see if you can get it teased out.
> > > > > 
> > > > > I would like to raise a couple of issues:
> > > > > (1) I believe the server should be reporting back an accurate value
> > > > > for the backchannel session table size. I think if the
> > > > > NFSD_BC_SLOT_TABLE_MAX was way lower than the client's value then the
> > > > > client would be wasting resources for its bc session table?
> > > > 
> > > > Yes, but those resources are a 32-bit integer per wasted slot. The Linux
> > > > client allows for up to 16 slots, so we're wasting 64 bytes per session
> > > > with this scheme with the Linux client. I didn't think it was worth
> > > > doing a separate allocation for that.
> > > > 
> > > > We could make NFSD_BC_SLOT_TABLE_MAX smaller though. Maybe we should
> > > > match the client's size and make it 15?
> > > > 
> > > > > ->back_channel->maxreqs gets decoded in nfsd4_decode_create_session()
> > > > > and is never adjusted for the reply to be based on the
> > > > > NFSD_BC_SLOT_TABLE_MAX. The problem is currently invisible because
> > > > > linux client's bc slot table size is 16 and nfsd's is higher.
> > > > > 
> > > > 
> > > > I'm not sure I understand the problem here. We don't care about most of
> > > > the backchannel attributes. maxreqs is the only one that matters, and
> > > > we track that in se_cb_highest_slot.
> > > 
> > > Client sends a create_session with cba_back_chan_attrs with max_reqs
> > > of 16 -- stating that the client can handle 16 slots in its slot
> > > table. Server currently doesn't do anything about reflecting back to
> > > the client its session slot table. It blindly returns what the client
> > > sent. Say NFSD_BC_SLOT_TABLE_MAX was 4. Server would never use more
> > > than 4 slots and yet the client would have to create a reply cache
> > > table for 16 slots. Isn't that poor sportsmanship on behalf of the
> > > linux server?
> > > 
> > > 
> > 
> > Thanks, that does sound like a bug. I think we can fix that with
> > another one-liner.  When we allocate the new session, update the
> > back_channel attrs in the request with the correct maxreqs. Thoughts?
> > 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 15438826ed5b..c35d8fc2f693 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -3885,6 +3885,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
> >  	new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel);
> >  	if (!new)
> >  		goto out_release_drc_mem;
> > +	cr_ses->back_channel.maxreqs = new->se_cb_highest_slot;
> >  	conn = alloc_conn_from_crses(rqstp, cr_ses);
> >  	if (!conn)
> >  		goto out_free_session;
> 
> 
> Actually, I think this is better, since we're already modifying things
> in this section of the code. Also the earlier patch was off-by-one:
> 
> ------------------------8<----------------------
> 
> [PATCH] SQUASH: report the correct number of backchannel slots to client
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/nfs4state.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 15438826ed5b..cfc2190ffce5 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -3955,6 +3955,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
>  	cr_ses->flags &= ~SESSION4_PERSIST;
>  	/* Upshifting from TCP to RDMA is not supported */
>  	cr_ses->flags &= ~SESSION4_RDMA;
> +	/* Report the correct number of backchannel slots */
> +	cr_ses->back_channel.maxreqs = new->se_cb_highest_slot + 1;
>  
>  	init_session(rqstp, new, conf, cr_ses);
>  	nfsd4_get_session_locked(new);
> -- 
> 2.47.0

Applied to nfsd-next, squashed, and pushed.

I've moved "nfsd: allow for up to 32 callback session slots" to the
end of the series to make it easier to update.
Olga Kornievskaia Nov. 11, 2024, 6:55 p.m. UTC | #19
On Mon, Nov 11, 2024 at 1:27 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Mon, 2024-11-11 at 13:17 -0500, Jeff Layton wrote:
> > On Mon, 2024-11-11 at 12:56 -0500, Olga Kornievskaia wrote:
> > > On Mon, Nov 11, 2024 at 12:40 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > >
> > > > On Mon, 2024-11-11 at 12:17 -0500, Olga Kornievskaia wrote:
> > > > > On Mon, Nov 11, 2024 at 9:56 AM Chuck Lever <chuck.lever@oracle.com> wrote:
> > > > > >
> > > > > > On Mon, Nov 11, 2024 at 08:22:07AM -0500, Jeff Layton wrote:
> > > > > > > On Sun, 2024-11-10 at 21:19 -0500, Olga Kornievskaia wrote:
> > > > > > > > On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > >
> > > > > > > > > On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > > > > > > > > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > > > >
> > > > > > > > > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > > > > > > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > > > > > > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > > > > > > > > >
> > > > > > > > > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > > > > > > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > > > > > > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > > > > > > > > track which slots are in use, and a u32 to track the latest callback
> > > > > > > > > > > target_slotid that the client reports. To protect the new fields, add
> > > > > > > > > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > > > > > > > > search for the lowest slotid (using ffs()).
> > > > > > > > > > >
> > > > > > > > > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > > > > > > > > counters and add the necessary handling to ensure that the seqids get
> > > > > > > > > > > reset at the appropriate times.
> > > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > > > > > > > > ---
> > > > > > > > > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > > > > > > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > > > > > > > > would be much appreciated.
> > > > > > > > > > > ---
> > > > > > > > > > > Changes in v4:
> > > > > > > > > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > > > > > > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > > > > > > > > >
> > > > > > > > > > > Changes in v3:
> > > > > > > > > > > - add patch to convert se_flags to single se_dead bool
> > > > > > > > > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > > > > > > > > - don't reject target highest slot value of 0
> > > > > > > > > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > > > > > > > > >
> > > > > > > > > > > Changes in v2:
> > > > > > > > > > > - take cl_lock when fetching fields from session to be encoded
> > > > > > > > > > > - use fls() instead of bespoke highest_unset_index()
> > > > > > > > > > > - rename variables in several functions with more descriptive names
> > > > > > > > > > > - clamp limit of for loop in update_cb_slot_table()
> > > > > > > > > > > - re-add missing rpc_wake_up_queued_task() call
> > > > > > > > > > > - fix slotid check in decode_cb_sequence4resok()
> > > > > > > > > > > - add new per-session spinlock
> > > > > > > > > > > ---
> > > > > > > > > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > > > > > > > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > > > > > > > > >  fs/nfsd/state.h        |  15 ++++---
> > > > > > > > > > >  fs/nfsd/trace.h        |   2 +-
> > > > > > > > > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > > > > > > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > > > > > > > > --- a/fs/nfsd/nfs4callback.c
> > > > > > > > > > > +++ b/fs/nfsd/nfs4callback.c
> > > > > > > > > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > > > > > > > > >         hdr->nops++;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > > > > > > > > +{
> > > > > > > > > > > +       u32 idx;
> > > > > > > > > > > +
> > > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > > > > > > > > +       if (idx > 0)
> > > > > > > > > > > +               --idx;
> > > > > > > > > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > > +       return idx;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  /*
> > > > > > > > > > >   * CB_SEQUENCE4args
> > > > > > > > > > >   *
> > > > > > > > > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > > > > > > > > >         encode_sessionid4(xdr, session);
> > > > > > > > > > >
> > > > > > > > > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > > > > > > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > > > > > > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > > > > > > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > > > > > > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > > > > > > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > > > > > > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > > > > > > > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > > > > > > > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > > > > > > > > >
> > > > > > > > > > >         hdr->nops++;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > > > > > > > > +{
> > > > > > > > > > > +       /* No need to do anything if nothing changed */
> > > > > > > > > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > > > > > > > > +               return;
> > > > > > > > > > > +
> > > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > > +       if (target > ses->se_cb_highest_slot) {
> > > > > > > > > > > +               int i;
> > > > > > > > > > > +
> > > > > > > > > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > > > > > +
> > > > > > > > > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > > > > > > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > > > > > > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > > > > > > > > +       }
> > > > > > > > > > > +       ses->se_cb_highest_slot = target;
> > > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  /*
> > > > > > > > > > >   * CB_SEQUENCE4resok
> > > > > > > > > > >   *
> > > > > > > > > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > > > > > > > > >         int status = -ESERVERFAULT;
> > > > > > > > > > >         __be32 *p;
> > > > > > > > > > > -       u32 dummy;
> > > > > > > > > > > +       u32 seqid, slotid, target;
> > > > > > > > > > >
> > > > > > > > > > >         /*
> > > > > > > > > > >          * If the server returns different values for sessionID, slotID or
> > > > > > > > > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > > > >         }
> > > > > > > > > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > > > > > > > > >
> > > > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > > > > > > > > +       seqid = be32_to_cpup(p++);
> > > > > > > > > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > > > > > > > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > > > > > > > > >                 goto out;
> > > > > > > > > > >         }
> > > > > > > > > > >
> > > > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > > > -       if (dummy != 0) {
> > > > > > > > > > > +       slotid = be32_to_cpup(p++);
> > > > > > > > > > > +       if (slotid != cb->cb_held_slot) {
> > > > > > > > > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > > > > > > > > >                 goto out;
> > > > > > > > > > >         }
> > > > > > > > > > >
> > > > > > > > > > > -       /*
> > > > > > > > > > > -        * FIXME: process highest slotid and target highest slotid
> > > > > > > > > > > -        */
> > > > > > > > > > > +       p++; // ignore current highest slot value
> > > > > > > > > > > +
> > > > > > > > > > > +       target = be32_to_cpup(p++);
> > > > > > > > > > > +       update_cb_slot_table(session, target);
> > > > > > > > > > >         status = 0;
> > > > > > > > > > >  out:
> > > > > > > > > > >         cb->cb_seq_status = status;
> > > > > > > > > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > > > >         spin_unlock(&clp->cl_lock);
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > > > > > > > > +{
> > > > > > > > > > > +       int idx;
> > > > > > > > > > > +
> > > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > > > > > > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > > > +               return -1;
> > > > > > > > > > > +       }
> > > > > > > > > > > +       /* clear the bit for the slot */
> > > > > > > > > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > > +       return idx;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  /*
> > > > > > > > > > >   * There's currently a single callback channel slot.
> > > > > > > > > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > > > > > > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > > > > > > > > >  {
> > > > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > > > >
> > > > > > > > > > > -       if (!cb->cb_holds_slot &&
> > > > > > > > > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > > > +       if (cb->cb_held_slot >= 0)
> > > > > > > > > > > +               return true;
> > > > > > > > > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > > > +       if (cb->cb_held_slot < 0) {
> > > > > > > > > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > > > > > > > > >                 /* Race breaker */
> > > > > > > > > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > > > > > > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > > > +               if (cb->cb_held_slot < 0)
> > > > > > > > > > >                         return false;
> > > > > > > > > > > -               }
> > > > > > > > > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > > > > > > > > >         }
> > > > > > > > > > > -       cb->cb_holds_slot = true;
> > > > > > > > > > >         return true;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > > > > > > > > >  {
> > > > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > > > >
> > > > > > > > > > > -       if (cb->cb_holds_slot) {
> > > > > > > > > > > -               cb->cb_holds_slot = false;
> > > > > > > > > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > > > > +       if (cb->cb_held_slot >= 0) {
> > > > > > > > > > > +               spin_lock(&ses->se_lock);
> > > > > > > > > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > > > +               cb->cb_held_slot = -1;
> > > > > > > > > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > > > > > > > > >         }
> > > > > > > > > > >  }
> > > > > > > > > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > >  /*
> > > > > > > > > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > > > > > > > > - * slots, and mark callback channel down on communication errors.
> > > > > > > > > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > > > > > > > > + * and mark callback channel down on communication errors.
> > > > > > > > > > >   */
> > > > > > > > > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > > > > > > > > >  {
> > > > > > > > > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > > >                 return true;
> > > > > > > > > > >         }
> > > > > > > > > > >
> > > > > > > > > > > -       if (!cb->cb_holds_slot)
> > > > > > > > > > > +       if (cb->cb_held_slot < 0)
> > > > > > > > > > >                 goto need_restart;
> > > > > > > > > > >
> > > > > > > > > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > > > > > > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > > > > > > > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > > > > > > > > >                  */
> > > > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > > > >                 break;
> > > > > > > > > > >         case -ESERVERFAULT:
> > > > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > > > >                 ret = false;
> > > > > > > > > > >                 break;
> > > > > > > > > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > > >         case -NFS4ERR_BADSLOT:
> > > > > > > > > > >                 goto retry_nowait;
> > > > > > > > > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > > > > > > > > -               if (session->se_cb_seq_nr != 1) {
> > > > > > > > > > > -                       session->se_cb_seq_nr = 1;
> > > > > > > > > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > > > > > > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > > > > > > > > >                         goto retry_nowait;
> > > > > > > > > > >                 }
> > > > > > > > > > >                 break;
> > > > > > > > > > >         default:
> > > > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > > > >         }
> > > > > > > > > > > -       nfsd41_cb_release_slot(cb);
> > > > > > > > > > > -
> > > > > > > > > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > > > > > > > > +       nfsd41_cb_release_slot(cb);
> > > > > > > > > > >
> > > > > > > > > > >         if (RPC_SIGNALLED(task))
> > > > > > > > > > >                 goto need_restart;
> > > > > > > > > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > > > > > > > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > > > > > > > > >         cb->cb_status = 0;
> > > > > > > > > > >         cb->cb_need_restart = false;
> > > > > > > > > > > -       cb->cb_holds_slot = false;
> > > > > > > > > > > +       cb->cb_held_slot = -1;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > >  /**
> > > > > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > > > > >         }
> > > > > > > > > > >
> > > > > > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > > > > > +       new->se_cb_slot_avail = ~0U;
> > > > > > > > > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > > > > > +       spin_lock_init(&new->se_lock);
> > > > > > > > > > >         return new;
> > > > > > > > > > >  out_free:
> > > > > > > > > > >         while (i--)
> > > > > > > > > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > > > > > > > > >
> > > > > > > > > > >         INIT_LIST_HEAD(&new->se_conns);
> > > > > > > > > > >
> > > > > > > > > > > -       new->se_cb_seq_nr = 1;
> > > > > > > > > > > +       atomic_set(&new->se_ref, 0);
> > > > > > > > > > >         new->se_dead = false;
> > > > > > > > > > >         new->se_cb_prog = cses->callback_prog;
> > > > > > > > > > >         new->se_cb_sec = cses->cb_sec;
> > > > > > > > > > > -       atomic_set(&new->se_ref, 0);
> > > > > > > > > > > +
> > > > > > > > > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > > > > > > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > > > > > > > > +
> > > > > > > > > > >         idx = hash_sessionid(&new->se_sessionid);
> > > > > > > > > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > > > > > > > > >         spin_lock(&clp->cl_lock);
> > > > > > > > > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > > > > > > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > > > > > > > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > > > > > > > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > > > > > > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > > > >         copy_verf(clp, verf);
> > > > > > > > > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > > > > > > > > >         clp->cl_cb_session = NULL;
> > > > > > > > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > > > > > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > > > > > > > > --- a/fs/nfsd/state.h
> > > > > > > > > > > +++ b/fs/nfsd/state.h
> > > > > > > > > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > > > > > > > > >         struct work_struct cb_work;
> > > > > > > > > > >         int cb_seq_status;
> > > > > > > > > > >         int cb_status;
> > > > > > > > > > > +       int cb_held_slot;
> > > > > > > > > > >         bool cb_need_restart;
> > > > > > > > > > > -       bool cb_holds_slot;
> > > > > > > > > > >  };
> > > > > > > > > > >
> > > > > > > > > > >  struct nfsd4_callback_ops {
> > > > > > > > > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > > > > > > > > >         unsigned char cn_flags;
> > > > > > > > > > >  };
> > > > > > > > > > >
> > > > > > > > > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > > > > > > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > > > > > > >
> > > > > > > > > > Are there some values that are known not to work? I was experimenting
> > > > > > > > > > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > > > > > > > > > it's not a configurable value but it would still be good to know the
> > > > > > > > > > expectations...
> > > > > > > > > >
> > > > > > > > > > [  198.625021] Unable to handle kernel paging request at virtual
> > > > > > > > > > address dfff800020000000
> > > > > > > > > > [  198.625870] KASAN: probably user-memory-access in range
> > > > > > > > > > [0x0000000100000000-0x0000000100000007]
> > > > > > > > > > [  198.626444] Mem abort info:
> > > > > > > > > > [  198.626630]   ESR = 0x0000000096000005
> > > > > > > > > > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > > > > > > > > > [  198.627234]   SET = 0, FnV = 0
> > > > > > > > > > [  198.627441]   EA = 0, S1PTW = 0
> > > > > > > > > > [  198.627627]   FSC = 0x05: level 1 translation fault
> > > > > > > > > > [  198.627859] Data abort info:
> > > > > > > > > > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > > > > > > > > > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > > > > > > > > > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > > > > > > > > > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > > > > > > > > > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > > > > > > > > > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > > > > > > > > > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > > > > > > > > > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > > > > > > > > > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > > > > > > > > > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > > > > > > > > > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > > > > > > > > > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > > > > > > > > > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > > > > > > > > > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > > > > > > > > > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > > > > > > > > > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > > > > > > > > > nvme_auth sr_mod cdrom e1000e sg fuse
> > > > > > > > > > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > > > > > > > > > tainted 6.12.0-rc6+ #47
> > > > > > > > > > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > > > > > > > > > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > > > > > > > > > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > > > > > > > > > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > > > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > > > > > > > > > [  198.636065] sp : ffff8000884977e0
> > > > > > > > > > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > > > > > > > > > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > > > > > > > > > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > > > > > > > > > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > > > > > > > > > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > > > > > > > > > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > > > > > > > > > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > > > > > > > > > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > > > > > > > > > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > > > > > > > > > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > > > > > > > > > [  198.640332] Call trace:
> > > > > > > > > > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > > > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > > > > > > > > > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > > > > > > > > > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > > > > > > > > > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > > > > > > > > > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > > > > > > > > > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > > > > > > > > > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > > > > > > > > > [  198.642562]  kthread+0x288/0x310
> > > > > > > > > > [  198.642745]  ret_from_fork+0x10/0x20
> > > > > > > > > > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > > > > > > > > > [  198.643267] SMP: stopping secondary CPUs
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Good catch. I think the problem here is that we don't currently cap the
> > > > > > > > > initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> > > > > > > > > this patch prevent the panic?
> > > > > > > > >
> > > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > > index 3afe56ab9e0a..839be4ba765a 100644
> > > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > > @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > > >
> > > > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > > >         new->se_cb_slot_avail = ~0U;
> > > > > > > > > -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > > > +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > > >         spin_lock_init(&new->se_lock);
> > > > > > > > >         return new;
> > > > > > > > >  out_free:
> > > > > > > >
> > > > > > > > It does help. I thought that the CREATE_SESSION reply for the
> > > > > > > > backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value but
> > > > > > > > instead it seems like it's not. But yes I can see that the highest
> > > > > > > > slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.
> > > > > > >
> > > > > > > Thanks for testing it, Olga.
> > > > > > >
> > > > > > > Chuck, would you be OK with folding the above delta into 9ab4c4077de9,
> > > > > > > or would you rather I resend the patch?
> > > > > >
> > > > > > I've folded the above one-liner into the applied patch.
> > > > > >
> > > > > > I agree with Tom, I think there's probably a (surprising)
> > > > > > explanation lurking for not seeing the expected performance
> > > > > > improvement. I can delay sending the NFSD v6.13 merge window pull
> > > > > > request for a bit to see if you can get it teased out.
> > > > >
> > > > > I would like to raise a couple of issues:
> > > > > (1) I believe the server should be reporting back an accurate value
> > > > > for the backchannel session table size. I think if the
> > > > > NFSD_BC_SLOT_TABLE_MAX was way lower than the client's value then the
> > > > > client would be wasting resources for its bc session table?
> > > >
> > > > > Yes, but those resources are a 32-bit integer per wasted slot. The Linux
> > > > client allows for up to 16 slots, so we're wasting 64 bytes per session
> > > > with this scheme with the Linux client. I didn't think it was worth
> > > > doing a separate allocation for that.
> > > >
> > > > We could make NFSD_BC_SLOT_TABLE_MAX smaller though. Maybe we should
> > > > match the client's size and make it 15?
> > > >
> > > > > ->back_channel->maxreqs gets decoded in nfsd4_decode_create_session()
> > > > > and is never adjusted for the reply to be based on the
> > > > > NFSD_BC_SLOT_TABLE_MAX. The problem is currently invisible because
> > > > > linux client's bc slot table size is 16 and nfsd's is higher.
> > > > >
> > > >
> > > > I'm not sure I understand the problem here. We don't care about most of
> > > > the backchannel attributes. maxreqs is the only one that matters, and
> > > > we track that in se_cb_highest_slot.
> > >
> > > Client sends a create_session with cba_back_chan_attrs with max_reqs
> > > of 16 -- stating that the client can handle 16 slots in its slot
> > > table. Server currently doesn't do anything about reflecting back to
> > > the client its session slot table. It blindly returns what the client
> > > sent. Say NFSD_BC_SLOT_TABLE_MAX was 4. Server would never use more
> > > than 4 slots and yet the client would have to create a reply cache
> > > table for 16 slots. Isn't that poor sportsmanship on behalf of the
> > > linux server?
> > >
> > >
> >
> > Thanks, that does sound like a bug. I think we can fix that with
> > another one-liner.  When we allocate the new session, update the
> > back_channel attrs in the request with the correct maxreqs. Thoughts?
> >
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 15438826ed5b..c35d8fc2f693 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -3885,6 +3885,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
> >       new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel);
> >       if (!new)
> >               goto out_release_drc_mem;
> > +     cr_ses->back_channel.maxreqs = new->se_cb_highest_slot;
> >       conn = alloc_conn_from_crses(rqstp, cr_ses);
> >       if (!conn)
> >               goto out_free_session;
>
>
> Actually, I think this is better, since we're already modifying things
> in this section of the code. Also the earlier patch was off-by-one:
>
> ------------------------8<----------------------
>
> [PATCH] SQUASH: report the correct number of backchannel slots to client
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/nfs4state.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 15438826ed5b..cfc2190ffce5 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -3955,6 +3955,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
>         cr_ses->flags &= ~SESSION4_PERSIST;
>         /* Upshifting from TCP to RDMA is not supported */
>         cr_ses->flags &= ~SESSION4_RDMA;
> +       /* Report the correct number of backchannel slots */
> +       cr_ses->back_channel.maxreqs = new->se_cb_highest_slot + 1;

Is the intent that the NFSD_BC_SLOT_TABLE_MAX value be off by one, i.e.
the highest slot index rather than the number of slots? With this patch,
if NFSD_BC_SLOT_TABLE_MAX=1 the server would send back 2 in its
CREATE_SESSION reply for the bc table size. Other than the wrong value,
this patch would indeed reflect the server's cb table size back to the
client.

>
>         init_session(rqstp, new, conf, cr_ses);
>         nfsd4_get_session_locked(new);
> --
> 2.47.0
>
>
>
Jeff Layton Nov. 11, 2024, 7:02 p.m. UTC | #20
On Mon, 2024-11-11 at 13:55 -0500, Olga Kornievskaia wrote:
> On Mon, Nov 11, 2024 at 1:27 PM Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > On Mon, 2024-11-11 at 13:17 -0500, Jeff Layton wrote:
> > > On Mon, 2024-11-11 at 12:56 -0500, Olga Kornievskaia wrote:
> > > > On Mon, Nov 11, 2024 at 12:40 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > 
> > > > > On Mon, 2024-11-11 at 12:17 -0500, Olga Kornievskaia wrote:
> > > > > > On Mon, Nov 11, 2024 at 9:56 AM Chuck Lever <chuck.lever@oracle.com> wrote:
> > > > > > > 
> > > > > > > On Mon, Nov 11, 2024 at 08:22:07AM -0500, Jeff Layton wrote:
> > > > > > > > On Sun, 2024-11-10 at 21:19 -0500, Olga Kornievskaia wrote:
> > > > > > > > > On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > > > 
> > > > > > > > > > On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > > > > > > > > > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > > > > > 
> > > > > > > > > > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > > > > > > > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > > > > > > > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > > > > > > > > > > 
> > > > > > > > > > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > > > > > > > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > > > > > > > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > > > > > > > > > track which slots are in use, and a u32 to track the latest callback
> > > > > > > > > > > > target_slotid that the client reports. To protect the new fields, add
> > > > > > > > > > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > > > > > > > > > search for the lowest slotid (using ffs()).
> > > > > > > > > > > > 
> > > > > > > > > > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > > > > > > > > > counters and add the necessary handling to ensure that the seqids get
> > > > > > > > > > > > reset at the appropriate times.
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > > > > > > > > > ---
> > > > > > > > > > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > > > > > > > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > > > > > > > > > would be much appreciated.
> > > > > > > > > > > > ---
> > > > > > > > > > > > Changes in v4:
> > > > > > > > > > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > > > > > > > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > > > > > > > > > > 
> > > > > > > > > > > > Changes in v3:
> > > > > > > > > > > > - add patch to convert se_flags to single se_dead bool
> > > > > > > > > > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > > > > > > > > > - don't reject target highest slot value of 0
> > > > > > > > > > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > > > > > > > > > > 
> > > > > > > > > > > > Changes in v2:
> > > > > > > > > > > > - take cl_lock when fetching fields from session to be encoded
> > > > > > > > > > > > - use fls() instead of bespoke highest_unset_index()
> > > > > > > > > > > > - rename variables in several functions with more descriptive names
> > > > > > > > > > > > - clamp limit of for loop in update_cb_slot_table()
> > > > > > > > > > > > - re-add missing rpc_wake_up_queued_task() call
> > > > > > > > > > > > - fix slotid check in decode_cb_sequence4resok()
> > > > > > > > > > > > - add new per-session spinlock
> > > > > > > > > > > > ---
> > > > > > > > > > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > > > > > > > > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > > > > > > > > > >  fs/nfsd/state.h        |  15 ++++---
> > > > > > > > > > > >  fs/nfsd/trace.h        |   2 +-
> > > > > > > > > > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > > > > > > > > > > 
> > > > > > > > > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > > > > > > > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > > > > > > > > > --- a/fs/nfsd/nfs4callback.c
> > > > > > > > > > > > +++ b/fs/nfsd/nfs4callback.c
> > > > > > > > > > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > > > > > > > > > >         hdr->nops++;
> > > > > > > > > > > >  }
> > > > > > > > > > > > 
> > > > > > > > > > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +       u32 idx;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > > > > > > > > > +       if (idx > 0)
> > > > > > > > > > > > +               --idx;
> > > > > > > > > > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > > > +       return idx;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > >  /*
> > > > > > > > > > > >   * CB_SEQUENCE4args
> > > > > > > > > > > >   *
> > > > > > > > > > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > > > > > > > > > >         encode_sessionid4(xdr, session);
> > > > > > > > > > > > 
> > > > > > > > > > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > > > > > > > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > > > > > > > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > > > > > > > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > > > > > > > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > > > > > > > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > > > > > > > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > > > > > > > > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > > > > > > > > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > > > > > > > > > > 
> > > > > > > > > > > >         hdr->nops++;
> > > > > > > > > > > >  }
> > > > > > > > > > > > 
> > > > > > > > > > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +       /* No need to do anything if nothing changed */
> > > > > > > > > > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > > > > > > > > > +               return;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > > > +       if (target > ses->se_cb_highest_slot) {
> > > > > > > > > > > > +               int i;
> > > > > > > > > > > > +
> > > > > > > > > > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > > > > > > +
> > > > > > > > > > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > > > > > > > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > > > > > > > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > > > > > > > > > +       }
> > > > > > > > > > > > +       ses->se_cb_highest_slot = target;
> > > > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > >  /*
> > > > > > > > > > > >   * CB_SEQUENCE4resok
> > > > > > > > > > > >   *
> > > > > > > > > > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > > > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > > > > > > > > > >         int status = -ESERVERFAULT;
> > > > > > > > > > > >         __be32 *p;
> > > > > > > > > > > > -       u32 dummy;
> > > > > > > > > > > > +       u32 seqid, slotid, target;
> > > > > > > > > > > > 
> > > > > > > > > > > >         /*
> > > > > > > > > > > >          * If the server returns different values for sessionID, slotID or
> > > > > > > > > > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > > > > >         }
> > > > > > > > > > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > > > > > > > > > > 
> > > > > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > > > > > > > > > +       seqid = be32_to_cpup(p++);
> > > > > > > > > > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > > > > > > > > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > > > > > > > > > >                 goto out;
> > > > > > > > > > > >         }
> > > > > > > > > > > > 
> > > > > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > > > > -       if (dummy != 0) {
> > > > > > > > > > > > +       slotid = be32_to_cpup(p++);
> > > > > > > > > > > > +       if (slotid != cb->cb_held_slot) {
> > > > > > > > > > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > > > > > > > > > >                 goto out;
> > > > > > > > > > > >         }
> > > > > > > > > > > > 
> > > > > > > > > > > > -       /*
> > > > > > > > > > > > -        * FIXME: process highest slotid and target highest slotid
> > > > > > > > > > > > -        */
> > > > > > > > > > > > +       p++; // ignore current highest slot value
> > > > > > > > > > > > +
> > > > > > > > > > > > +       target = be32_to_cpup(p++);
> > > > > > > > > > > > +       update_cb_slot_table(session, target);
> > > > > > > > > > > >         status = 0;
> > > > > > > > > > > >  out:
> > > > > > > > > > > >         cb->cb_seq_status = status;
> > > > > > > > > > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > > > > >         spin_unlock(&clp->cl_lock);
> > > > > > > > > > > >  }
> > > > > > > > > > > > 
> > > > > > > > > > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +       int idx;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > > > > > > > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > > > > +               return -1;
> > > > > > > > > > > > +       }
> > > > > > > > > > > > +       /* clear the bit for the slot */
> > > > > > > > > > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > > > > +       return idx;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > >  /*
> > > > > > > > > > > >   * There's currently a single callback channel slot.
> > > > > > > > > > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > > > > > > > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > > > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > > > > > > > > > >  {
> > > > > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > > > > > 
> > > > > > > > > > > > -       if (!cb->cb_holds_slot &&
> > > > > > > > > > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > > > > +       if (cb->cb_held_slot >= 0)
> > > > > > > > > > > > +               return true;
> > > > > > > > > > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > > > > +       if (cb->cb_held_slot < 0) {
> > > > > > > > > > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > > > > > > > > > >                 /* Race breaker */
> > > > > > > > > > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > > > > > > > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > > > > +               if (cb->cb_held_slot < 0)
> > > > > > > > > > > >                         return false;
> > > > > > > > > > > > -               }
> > > > > > > > > > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > > > > > > > > > >         }
> > > > > > > > > > > > -       cb->cb_holds_slot = true;
> > > > > > > > > > > >         return true;
> > > > > > > > > > > >  }
> > > > > > > > > > > > 
> > > > > > > > > > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > > > > > > > > > >  {
> > > > > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > > > > > 
> > > > > > > > > > > > -       if (cb->cb_holds_slot) {
> > > > > > > > > > > > -               cb->cb_holds_slot = false;
> > > > > > > > > > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > > > > > +       if (cb->cb_held_slot >= 0) {
> > > > > > > > > > > > +               spin_lock(&ses->se_lock);
> > > > > > > > > > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > > > > +               cb->cb_held_slot = -1;
> > > > > > > > > > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > > > > > > > > > >         }
> > > > > > > > > > > >  }
> > > > > > > > > > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > > > > > > > > > >  }
> > > > > > > > > > > > 
> > > > > > > > > > > >  /*
> > > > > > > > > > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > > > > > > > > > - * slots, and mark callback channel down on communication errors.
> > > > > > > > > > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > > > > > > > > > + * and mark callback channel down on communication errors.
> > > > > > > > > > > >   */
> > > > > > > > > > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > > > > > > > > > >  {
> > > > > > > > > > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > > > >                 return true;
> > > > > > > > > > > >         }
> > > > > > > > > > > > 
> > > > > > > > > > > > -       if (!cb->cb_holds_slot)
> > > > > > > > > > > > +       if (cb->cb_held_slot < 0)
> > > > > > > > > > > >                 goto need_restart;
> > > > > > > > > > > > 
> > > > > > > > > > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > > > > > > > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > > > > > > > > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > > > > > > > > > >                  */
> > > > > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > > > > >                 break;
> > > > > > > > > > > >         case -ESERVERFAULT:
> > > > > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > > > > >                 ret = false;
> > > > > > > > > > > >                 break;
> > > > > > > > > > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > > > > >         case -NFS4ERR_BADSLOT:
> > > > > > > > > > > >                 goto retry_nowait;
> > > > > > > > > > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > > > > > > > > > -               if (session->se_cb_seq_nr != 1) {
> > > > > > > > > > > > -                       session->se_cb_seq_nr = 1;
> > > > > > > > > > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > > > > > > > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > > > > > > > > > >                         goto retry_nowait;
> > > > > > > > > > > >                 }
> > > > > > > > > > > >                 break;
> > > > > > > > > > > >         default:
> > > > > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > > > > >         }
> > > > > > > > > > > > -       nfsd41_cb_release_slot(cb);
> > > > > > > > > > > > -
> > > > > > > > > > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > > > > > > > > > +       nfsd41_cb_release_slot(cb);
> > > > > > > > > > > > 
> > > > > > > > > > > >         if (RPC_SIGNALLED(task))
> > > > > > > > > > > >                 goto need_restart;
> > > > > > > > > > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > > > > > > > > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > > > > > > > > > >         cb->cb_status = 0;
> > > > > > > > > > > >         cb->cb_need_restart = false;
> > > > > > > > > > > > -       cb->cb_holds_slot = false;
> > > > > > > > > > > > +       cb->cb_held_slot = -1;
> > > > > > > > > > > >  }
> > > > > > > > > > > > 
> > > > > > > > > > > >  /**
> > > > > > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > > > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > > > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > > > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > > > > > >         }
> > > > > > > > > > > > 
> > > > > > > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > > > > > > +       new->se_cb_slot_avail = ~0U;
> > > > > > > > > > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > > > > > > +       spin_lock_init(&new->se_lock);
> > > > > > > > > > > >         return new;
> > > > > > > > > > > >  out_free:
> > > > > > > > > > > >         while (i--)
> > > > > > > > > > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > > > > > > > > > > 
> > > > > > > > > > > >         INIT_LIST_HEAD(&new->se_conns);
> > > > > > > > > > > > 
> > > > > > > > > > > > -       new->se_cb_seq_nr = 1;
> > > > > > > > > > > > +       atomic_set(&new->se_ref, 0);
> > > > > > > > > > > >         new->se_dead = false;
> > > > > > > > > > > >         new->se_cb_prog = cses->callback_prog;
> > > > > > > > > > > >         new->se_cb_sec = cses->cb_sec;
> > > > > > > > > > > > -       atomic_set(&new->se_ref, 0);
> > > > > > > > > > > > +
> > > > > > > > > > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > > > > > > > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > > > > > > > > > +
> > > > > > > > > > > >         idx = hash_sessionid(&new->se_sessionid);
> > > > > > > > > > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > > > > > > > > > >         spin_lock(&clp->cl_lock);
> > > > > > > > > > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > > > > > > > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > > > > > > > > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > > > > > > > > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > > > > > > > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > > > > >         copy_verf(clp, verf);
> > > > > > > > > > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > > > > > > > > > >         clp->cl_cb_session = NULL;
> > > > > > > > > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > > > > > > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > > > > > > > > > --- a/fs/nfsd/state.h
> > > > > > > > > > > > +++ b/fs/nfsd/state.h
> > > > > > > > > > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > > > > > > > > > >         struct work_struct cb_work;
> > > > > > > > > > > >         int cb_seq_status;
> > > > > > > > > > > >         int cb_status;
> > > > > > > > > > > > +       int cb_held_slot;
> > > > > > > > > > > >         bool cb_need_restart;
> > > > > > > > > > > > -       bool cb_holds_slot;
> > > > > > > > > > > >  };
> > > > > > > > > > > > 
> > > > > > > > > > > >  struct nfsd4_callback_ops {
> > > > > > > > > > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > > > > > > > > > >         unsigned char cn_flags;
> > > > > > > > > > > >  };
> > > > > > > > > > > > 
> > > > > > > > > > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > > > > > > > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > > > > > > > > 
> > > > > > > > > > > Are there some values that are known not to work? I was experimenting
> > > > > > > > > > > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > > > > > > > > > > it's not a configurable value but it would still be good to know the
> > > > > > > > > > > expectations...
> > > > > > > > > > > 
> > > > > > > > > > > [  198.625021] Unable to handle kernel paging request at virtual
> > > > > > > > > > > address dfff800020000000
> > > > > > > > > > > [  198.625870] KASAN: probably user-memory-access in range
> > > > > > > > > > > [0x0000000100000000-0x0000000100000007]
> > > > > > > > > > > [  198.626444] Mem abort info:
> > > > > > > > > > > [  198.626630]   ESR = 0x0000000096000005
> > > > > > > > > > > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > > > > > > > > > > [  198.627234]   SET = 0, FnV = 0
> > > > > > > > > > > [  198.627441]   EA = 0, S1PTW = 0
> > > > > > > > > > > [  198.627627]   FSC = 0x05: level 1 translation fault
> > > > > > > > > > > [  198.627859] Data abort info:
> > > > > > > > > > > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > > > > > > > > > > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > > > > > > > > > > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > > > > > > > > > > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > > > > > > > > > > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > > > > > > > > > > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > > > > > > > > > > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > > > > > > > > > > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > > > > > > > > > > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > > > > > > > > > > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > > > > > > > > > > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > > > > > > > > > > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > > > > > > > > > > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > > > > > > > > > > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > > > > > > > > > > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > > > > > > > > > > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > > > > > > > > > > nvme_auth sr_mod cdrom e1000e sg fuse
> > > > > > > > > > > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > > > > > > > > > > tainted 6.12.0-rc6+ #47
> > > > > > > > > > > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > > > > > > > > > > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > > > > > > > > > > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > > > > > > > > > > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > > > > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > > > > > > > > > > [  198.636065] sp : ffff8000884977e0
> > > > > > > > > > > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > > > > > > > > > > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > > > > > > > > > > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > > > > > > > > > > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > > > > > > > > > > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > > > > > > > > > > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > > > > > > > > > > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > > > > > > > > > > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > > > > > > > > > > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > > > > > > > > > > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > > > > > > > > > > [  198.640332] Call trace:
> > > > > > > > > > > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > > > > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > > > > > > > > > > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > > > > > > > > > > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > > > > > > > > > > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > > > > > > > > > > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > > > > > > > > > > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > > > > > > > > > > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > > > > > > > > > > [  198.642562]  kthread+0x288/0x310
> > > > > > > > > > > [  198.642745]  ret_from_fork+0x10/0x20
> > > > > > > > > > > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > > > > > > > > > > [  198.643267] SMP: stopping secondary CPUs
> > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Good catch. I think the problem here is that we don't currently cap the
> > > > > > > > > > initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> > > > > > > > > > this patch prevent the panic?
> > > > > > > > > > 
> > > > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > > > index 3afe56ab9e0a..839be4ba765a 100644
> > > > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > > > @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > > > > 
> > > > > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > > > >         new->se_cb_slot_avail = ~0U;
> > > > > > > > > > -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > > > > +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > > > >         spin_lock_init(&new->se_lock);
> > > > > > > > > >         return new;
> > > > > > > > > >  out_free:
> > > > > > > > > 
> > > > > > > > > It does help. I thought that the CREATE_SESSION reply for the
> > > > > > > > > backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value but
> > > > > > > > > instead it seems like it's not. But yes I can see that the highest
> > > > > > > > > slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.
> > > > > > > > 
> > > > > > > > Thanks for testing it, Olga.
> > > > > > > > 
> > > > > > > > Chuck, would you be OK with folding the above delta into 9ab4c4077de9,
> > > > > > > > or would you rather I resend the patch?
> > > > > > > 
> > > > > > > I've folded the above one-liner into the applied patch.
> > > > > > > 
> > > > > > > I agree with Tom, I think there's probably a (surprising)
> > > > > > > explanation lurking for not seeing the expected performance
> > > > > > > improvement. I can delay sending the NFSD v6.13 merge window pull
> > > > > > > request for a bit to see if you can get it teased out.
> > > > > > 
> > > > > > I would like to raise a couple of issues:
> > > > > > (1) I believe the server should be reporting back an accurate value
> > > > > > for the backchannel session table size. I think if the
> > > > > > NFSD_BC_SLOT_TABLE_MAX was way lower than the client's value then the
> > > > > > client would be wasting resources for its bc session table?
> > > > > 
> > > > > Yes, but those resources are one 32-bit integer per wasted slot. The Linux
> > > > > client allows for up to 16 slots, so we're wasting 64 bytes per session
> > > > > with this scheme with the Linux client. I didn't think it was worth
> > > > > doing a separate allocation for that.
> > > > > 
> > > > > We could make NFSD_BC_SLOT_TABLE_MAX smaller though. Maybe we should
> > > > > match the client's size and make it 15?
> > > > > 
> > > > > > ->back_channel->maxreqs gets decoded in nfsd4_decode_create_session()
> > > > > > and is never adjusted for the reply to be based on the
> > > > > > NFSD_BC_SLOT_TABLE_MAX. The problem is currently invisible because
> > > > > > linux client's bc slot table size is 16 and nfsd's is higher.
> > > > > > 
> > > > > 
> > > > > I'm not sure I understand the problem here. We don't care about most of
> > > > > the backchannel attributes. maxreqs is the only one that matters, and
> > > > > we track that in se_cb_highest_slot.
> > > > 
> > > > Client sends a create_session with cba_back_chan_attrs with max_reqs
> > > > of 16 -- stating that the client can handle 16 slots in its slot
> > > > table. Server currently doesn't do anything about reflecting back to
> > > > the client its session slot table. It blindly returns what the client
> > > > sent. Say NFSD_BC_SLOT_TABLE_MAX was 4. Server would never use more
> > > > than 4 slots and yet the client would have to create a reply cache
> > > > table for 16 slots. Isn't that poor sportsmanship on behalf of the
> > > > linux server?
> > > > 
> > > > 
> > > 
> > > Thanks, that does sound like a bug. I think we can fix that with
> > > another one-liner.  When we allocate the new session, update the
> > > back_channel attrs in the request with the correct maxreqs. Thoughts?
> > > 
> > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > index 15438826ed5b..c35d8fc2f693 100644
> > > --- a/fs/nfsd/nfs4state.c
> > > +++ b/fs/nfsd/nfs4state.c
> > > @@ -3885,6 +3885,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
> > >       new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel);
> > >       if (!new)
> > >               goto out_release_drc_mem;
> > > +     cr_ses->back_channel.maxreqs = new->se_cb_highest_slot;
> > >       conn = alloc_conn_from_crses(rqstp, cr_ses);
> > >       if (!conn)
> > >               goto out_free_session;
> > 
> > 
> > Actually, I think this is better, since we're already modifying things
> > in this section of the code. Also the earlier patch was off-by-one:
> > 
> > ------------------------8<----------------------
> > 
> > [PATCH] SQUASH: report the correct number of backchannel slots to client
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/nfsd/nfs4state.c | 2 ++
> >  1 file changed, 2 insertions(+)
> > 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 15438826ed5b..cfc2190ffce5 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -3955,6 +3955,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
> >         cr_ses->flags &= ~SESSION4_PERSIST;
> >         /* Upshifting from TCP to RDMA is not supported */
> >         cr_ses->flags &= ~SESSION4_RDMA;
> > +       /* Report the correct number of backchannel slots */
> > +       cr_ses->back_channel.maxreqs = new->se_cb_highest_slot + 1;
> 
> Is the intent that NFSD_BC_SLOT_TABLE_MAX value represents a one off
> value? With this patch if NFSD_BC_SLOT_TABLE_MAX=1 the server would
> send back 2 in its CREATE_SESSION reply for the bc table size. Other
> than the wrong value, this patch would indeed reflect back server's cb
> table size.
> 
> 

I think the above conversion is correct.

The se_cb_highest_slot represents the highest slotid that the server
will use. IOW, it's indexed starting at 0. "maxreqs" on the other hand
represents the maximum number of requests that can be in flight, so
it's indexed starting at 1.
Olga Kornievskaia Nov. 11, 2024, 9:56 p.m. UTC | #21
On Wed, Nov 6, 2024 at 11:44 AM Olga Kornievskaia <aglo@umich.edu> wrote:
>
> On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > nfsd currently only uses a single slot in the callback channel, which is
> > proving to be a bottleneck in some cases. Widen the callback channel to
> > a max of 32 slots (subject to the client's target_maxreqs value).
> >
> > Change the cb_holds_slot boolean to an integer that tracks the current
> > slot number (with -1 meaning "unassigned").  Move the callback slot
> > tracking info into the session. Add a new u32 that acts as a bitmap to
> > track which slots are in use, and a u32 to track the latest callback
> > target_slotid that the client reports. To protect the new fields, add
> > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > search for the lowest slotid (using ffs()).
> >
> > Finally, convert the session->se_cb_seq_nr field into an array of
> > counters and add the necessary handling to ensure that the seqids get
> > reset at the appropriate times.
> >
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > ---
> > v3 has a bug that Olga hit in testing. This version should fix the wait
> > when the slot table is full. Olga, if you're able to test this one, it
> > would be much appreciated.
>
> I have tested this version. I can confirm that I'm not seeing the
> softlockup. But the server still does not use the lowest available
> slot. It is hard for me to describe the algorithm of picking the slot
> number (in general it still seems to be picking the next slot value,
> even though slots have been replied to). I have seen slot 0 re-used
> eventually but it seemed to be when the server came to using slot=13.
>
> The other unfortunate thing that's happening when I use these patches
> is that my test case which recalls delegations and makes sure that the
> state management gets handled properly (ie., the patch that I've
> submitted to fix a race between the laundromat thread and free_state)
> is not working. After all the recalls, the server still thinks it has
> revoked state. I have to debug more to figure out what's going on.

I have finally been able to consistently hit the problem and it's not
a server issue but I can't decide who's at fault here -- client or
server. While handling the fact that state is revoked the client sends
what now is a SETATTR (for deleg attributes)+DELEGRETURN (previously
just a DELEGRETURN). SETATTR fails with BAD_STATEID. Client doesn't do
anything. Previously (before the deleg attr support) client would send
DELEGRETURN and server would fail with DELEG_REVOKED or BAD_STATEID
and the client would follow up with FREE_STATEID. But now the client
doesn't send a FREE_STATEID and thus the server is left with "revoked
state which never was freed".

Now, if the server returned DELEG_REVOKED instead of BAD_STATEID for
the SETATTR then the problem doesn't happen.

Question: is the server incorrect here or is the client incorrect and
should have (1) either also resent the delegreturn on its own which
was not processed before and that should have still triggered the
free_stateid or (2) should have treated bad_stateid error of setattr
in the delegreturn compound such that it freed the state there.





>
> > ---
> > Changes in v4:
> > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> >
> > Changes in v3:
> > - add patch to convert se_flags to single se_dead bool
> > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > - don't reject target highest slot value of 0
> > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> >
> > Changes in v2:
> > - take cl_lock when fetching fields from session to be encoded
> > - use fls() instead of bespoke highest_unset_index()
> > - rename variables in several functions with more descriptive names
> > - clamp limit of for loop in update_cb_slot_table()
> > - re-add missing rpc_wake_up_queued_task() call
> > - fix slotid check in decode_cb_sequence4resok()
> > - add new per-session spinlock
> > ---
> >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> >  fs/nfsd/nfs4state.c    |  11 +++--
> >  fs/nfsd/state.h        |  15 ++++---
> >  fs/nfsd/trace.h        |   2 +-
> >  4 files changed, 101 insertions(+), 40 deletions(-)
> >
> > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > --- a/fs/nfsd/nfs4callback.c
> > +++ b/fs/nfsd/nfs4callback.c
> > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> >         hdr->nops++;
> >  }
> >
> > +static u32 highest_slotid(struct nfsd4_session *ses)
> > +{
> > +       u32 idx;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       idx = fls(~ses->se_cb_slot_avail);
> > +       if (idx > 0)
> > +               --idx;
> > +       idx = max(idx, ses->se_cb_highest_slot);
> > +       spin_unlock(&ses->se_lock);
> > +       return idx;
> > +}
> > +
> >  /*
> >   * CB_SEQUENCE4args
> >   *
> > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> >         encode_sessionid4(xdr, session);
> >
> >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > -       *p++ = xdr_zero;                        /* csa_slotid */
> > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> >         *p++ = xdr_zero;                        /* csa_cachethis */
> >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> >
> >         hdr->nops++;
> >  }
> >
> > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > +{
> > +       /* No need to do anything if nothing changed */
> > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > +               return;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       if (target > ses->se_cb_highest_slot) {
> > +               int i;
> > +
> > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > +
> > +               /* Growing the slot table. Reset any new sequences to 1 */
> > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > +                       ses->se_cb_seq_nr[i] = 1;
> > +       }
> > +       ses->se_cb_highest_slot = target;
> > +       spin_unlock(&ses->se_lock);
> > +}
> > +
> >  /*
> >   * CB_SEQUENCE4resok
> >   *
> > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> >         int status = -ESERVERFAULT;
> >         __be32 *p;
> > -       u32 dummy;
> > +       u32 seqid, slotid, target;
> >
> >         /*
> >          * If the server returns different values for sessionID, slotID or
> > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> >         }
> >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> >
> > -       dummy = be32_to_cpup(p++);
> > -       if (dummy != session->se_cb_seq_nr) {
> > +       seqid = be32_to_cpup(p++);
> > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> >                 goto out;
> >         }
> >
> > -       dummy = be32_to_cpup(p++);
> > -       if (dummy != 0) {
> > +       slotid = be32_to_cpup(p++);
> > +       if (slotid != cb->cb_held_slot) {
> >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> >                 goto out;
> >         }
> >
> > -       /*
> > -        * FIXME: process highest slotid and target highest slotid
> > -        */
> > +       p++; // ignore current highest slot value
> > +
> > +       target = be32_to_cpup(p++);
> > +       update_cb_slot_table(session, target);
> >         status = 0;
> >  out:
> >         cb->cb_seq_status = status;
> > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> >         spin_unlock(&clp->cl_lock);
> >  }
> >
> > +static int grab_slot(struct nfsd4_session *ses)
> > +{
> > +       int idx;
> > +
> > +       spin_lock(&ses->se_lock);
> > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > +               spin_unlock(&ses->se_lock);
> > +               return -1;
> > +       }
> > +       /* clear the bit for the slot */
> > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > +       spin_unlock(&ses->se_lock);
> > +       return idx;
> > +}
> > +
> >  /*
> >   * There's currently a single callback channel slot.
> >   * If the slot is available, then mark it busy.  Otherwise, set the
> > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> >  {
> >         struct nfs4_client *clp = cb->cb_clp;
> > +       struct nfsd4_session *ses = clp->cl_cb_session;
> >
> > -       if (!cb->cb_holds_slot &&
> > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > +       if (cb->cb_held_slot >= 0)
> > +               return true;
> > +       cb->cb_held_slot = grab_slot(ses);
> > +       if (cb->cb_held_slot < 0) {
> >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> >                 /* Race breaker */
> > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > -                       dprintk("%s slot is busy\n", __func__);
> > +               cb->cb_held_slot = grab_slot(ses);
> > +               if (cb->cb_held_slot < 0)
> >                         return false;
> > -               }
> >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> >         }
> > -       cb->cb_holds_slot = true;
> >         return true;
> >  }
> >
> >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> >  {
> >         struct nfs4_client *clp = cb->cb_clp;
> > +       struct nfsd4_session *ses = clp->cl_cb_session;
> >
> > -       if (cb->cb_holds_slot) {
> > -               cb->cb_holds_slot = false;
> > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > +       if (cb->cb_held_slot >= 0) {
> > +               spin_lock(&ses->se_lock);
> > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > +               spin_unlock(&ses->se_lock);
> > +               cb->cb_held_slot = -1;
> >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> >         }
> >  }
> > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> >  }
> >
> >  /*
> > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > - * slots, and mark callback channel down on communication errors.
> > + * TODO: cb_sequence should support referring call lists, cachethis,
> > + * and mark callback channel down on communication errors.
> >   */
> >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> >  {
> > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >                 return true;
> >         }
> >
> > -       if (!cb->cb_holds_slot)
> > +       if (cb->cb_held_slot < 0)
> >                 goto need_restart;
> >
> >         /* This is the operation status code for CB_SEQUENCE */
> > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >                  * If CB_SEQUENCE returns an error, then the state of the slot
> >                  * (sequence ID, cached reply) MUST NOT change.
> >                  */
> > -               ++session->se_cb_seq_nr;
> > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> >                 break;
> >         case -ESERVERFAULT:
> > -               ++session->se_cb_seq_nr;
> > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> >                 nfsd4_mark_cb_fault(cb->cb_clp);
> >                 ret = false;
> >                 break;
> > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >         case -NFS4ERR_BADSLOT:
> >                 goto retry_nowait;
> >         case -NFS4ERR_SEQ_MISORDERED:
> > -               if (session->se_cb_seq_nr != 1) {
> > -                       session->se_cb_seq_nr = 1;
> > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> >                         goto retry_nowait;
> >                 }
> >                 break;
> >         default:
> >                 nfsd4_mark_cb_fault(cb->cb_clp);
> >         }
> > -       nfsd41_cb_release_slot(cb);
> > -
> >         trace_nfsd_cb_free_slot(task, cb);
> > +       nfsd41_cb_release_slot(cb);
> >
> >         if (RPC_SIGNALLED(task))
> >                 goto need_restart;
> > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> >         cb->cb_status = 0;
> >         cb->cb_need_restart = false;
> > -       cb->cb_holds_slot = false;
> > +       cb->cb_held_slot = -1;
> >  }
> >
> >  /**
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> >         }
> >
> >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > +       new->se_cb_slot_avail = ~0U;
> > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > +       spin_lock_init(&new->se_lock);
> >         return new;
> >  out_free:
> >         while (i--)
> > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> >
> >         INIT_LIST_HEAD(&new->se_conns);
> >
> > -       new->se_cb_seq_nr = 1;
> > +       atomic_set(&new->se_ref, 0);
> >         new->se_dead = false;
> >         new->se_cb_prog = cses->callback_prog;
> >         new->se_cb_sec = cses->cb_sec;
> > -       atomic_set(&new->se_ref, 0);
> > +
> > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > +               new->se_cb_seq_nr[idx] = 1;
> > +
> >         idx = hash_sessionid(&new->se_sessionid);
> >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> >         spin_lock(&clp->cl_lock);
> > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> >         kref_init(&clp->cl_nfsdfs.cl_ref);
> >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> >         clp->cl_time = ktime_get_boottime_seconds();
> > -       clear_bit(0, &clp->cl_cb_slot_busy);
> >         copy_verf(clp, verf);
> >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> >         clp->cl_cb_session = NULL;
> > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > --- a/fs/nfsd/state.h
> > +++ b/fs/nfsd/state.h
> > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> >         struct work_struct cb_work;
> >         int cb_seq_status;
> >         int cb_status;
> > +       int cb_held_slot;
> >         bool cb_need_restart;
> > -       bool cb_holds_slot;
> >  };
> >
> >  struct nfsd4_callback_ops {
> > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> >         unsigned char cn_flags;
> >  };
> >
> > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > +
> >  /*
> >   * Representation of a v4.1+ session. These are refcounted in a similar fashion
> >   * to the nfs4_client. References are only taken when the server is actively
> > @@ -314,6 +317,10 @@ struct nfsd4_conn {
> >   */
> >  struct nfsd4_session {
> >         atomic_t                se_ref;
> > +       spinlock_t              se_lock;
> > +       u32                     se_cb_slot_avail; /* bitmap of available slots */
> > +       u32                     se_cb_highest_slot;     /* highest slot client wants */
> > +       u32                     se_cb_prog;
> >         bool                    se_dead;
> >         struct list_head        se_hash;        /* hash by sessionid */
> >         struct list_head        se_perclnt;
> > @@ -322,8 +329,7 @@ struct nfsd4_session {
> >         struct nfsd4_channel_attrs se_fchannel;
> >         struct nfsd4_cb_sec     se_cb_sec;
> >         struct list_head        se_conns;
> > -       u32                     se_cb_prog;
> > -       u32                     se_cb_seq_nr;
> > +       u32                     se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
> >         struct nfsd4_slot       *se_slots[];    /* forward channel slots */
> >  };
> >
> > @@ -457,9 +463,6 @@ struct nfs4_client {
> >          */
> >         struct dentry           *cl_nfsd_info_dentry;
> >
> > -       /* for nfs41 callbacks */
> > -       /* We currently support a single back channel with a single slot */
> > -       unsigned long           cl_cb_slot_busy;
> >         struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
> >                                                 /* wait here for slots */
> >         struct net              *net;
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
> >                 __entry->cl_id = sid->clientid.cl_id;
> >                 __entry->seqno = sid->sequence;
> >                 __entry->reserved = sid->reserved;
> > -               __entry->slot_seqno = session->se_cb_seq_nr;
> > +               __entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
> >         ),
> >         TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
> >                 " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
> >
> > ---
> > base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> > change-id: 20241025-bcwide-6bd7e4b63db2
> >
> > Best regards,
> > --
> > Jeff Layton <jlayton@kernel.org>
> >
> >
Jeff Layton Nov. 11, 2024, 10:26 p.m. UTC | #22
On Mon, 2024-11-11 at 16:56 -0500, Olga Kornievskaia wrote:
> On Wed, Nov 6, 2024 at 11:44 AM Olga Kornievskaia <aglo@umich.edu> wrote:
> > 
> > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > 
> > > nfsd currently only uses a single slot in the callback channel, which is
> > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > 
> > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > track which slots are in use, and a u32 to track the latest callback
> > > target_slotid that the client reports. To protect the new fields, add
> > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > search for the lowest slotid (using ffs()).
> > > 
> > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > counters and add the necessary handling to ensure that the seqids get
> > > reset at the appropriate times.
> > > 
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > ---
> > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > when the slot table is full. Olga, if you're able to test this one, it
> > > would be much appreciated.
> > 
> > I have tested this version. I can confirm that I'm not seeing the
> > softlockup. But the server still does not use the lowest available
> > slot. It is hard for me to describe the algorithm of picking the slot
> > number (in general it still seems to be picking the next slot value,
> > even though slots have been replied to). I have seen slot 0 re-used
> > eventually but it seemed to be when the server came to using slot=13.
> > 
> > The other unfortunate thing that's happening when I use these patches
> > is that my test case which recalls delegations and makes sure that the
> > state management gets handled properly (ie., the patch that I've
> > submitted to fix a race between the laundromat thread and free_state)
> > is not working. After all the recalls, the server still thinks it has
> > revoked state. I have to debug more to figure out what's going on.
> 
> I have finally been able to consistently hit the problem and it's not
> a server issue but I can't decide who's at fault here -- client or
> server. While handling the fact that state is revoked the client sends
> what now is a SETATTR (for deleg attributes)+DELEGRETURN (previously
> just a DELEGRETURN). SETATTR fails with BAD_STATEID. Client doesn't do
> anything. Previously (before the deleg attr support) client would send
> DELEGRETURN and server would fail with DELEG_REVOKED or BAD_STATEID
> and the client would follow up with FREE_STATEID. But now the client
> doesn't send a FREE_STATEID and thus the server is left with "revoked
> state which never was freed". 
> Now, if the server returned DELEG_REVOKED instead of BAD_STATEID for
> the SETATTR then the problem doesn't happen.
> 
> Question: is the server incorrect here or is the client incorrect and
> should have (1) either also resent the delegreturn on its own which
> was not processed before and that should have still triggered the
> free_stateid or (2) should have treated bad_stateid error of setattr
> in the delegreturn compound such that it freed the state there.
> 
> 

That bit does sound like a server bug. DELEG_REVOKED is a valid return
code for SETATTR. It looks like nfsd4_lookup_stateid() should be
returning DELEG_REVOKED in this situation, so I'm not sure why that's
not working right.

That said, I'm also interested in why delegations are ending up revoked
in the first place. 

> > 
> > > ---
> > > Changes in v4:
> > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > 
> > > Changes in v3:
> > > - add patch to convert se_flags to single se_dead bool
> > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > - don't reject target highest slot value of 0
> > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > 
> > > Changes in v2:
> > > - take cl_lock when fetching fields from session to be encoded
> > > - use fls() instead of bespoke highest_unset_index()
> > > - rename variables in several functions with more descriptive names
> > > - clamp limit of for loop in update_cb_slot_table()
> > > - re-add missing rpc_wake_up_queued_task() call
> > > - fix slotid check in decode_cb_sequence4resok()
> > > - add new per-session spinlock
> > > ---
> > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > >  fs/nfsd/nfs4state.c    |  11 +++--
> > >  fs/nfsd/state.h        |  15 ++++---
> > >  fs/nfsd/trace.h        |   2 +-
> > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > 
> > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > --- a/fs/nfsd/nfs4callback.c
> > > +++ b/fs/nfsd/nfs4callback.c
> > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > >         hdr->nops++;
> > >  }
> > > 
> > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > +{
> > > +       u32 idx;
> > > +
> > > +       spin_lock(&ses->se_lock);
> > > +       idx = fls(~ses->se_cb_slot_avail);
> > > +       if (idx > 0)
> > > +               --idx;
> > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > +       spin_unlock(&ses->se_lock);
> > > +       return idx;
> > > +}
> > > +
> > >  /*
> > >   * CB_SEQUENCE4args
> > >   *
> > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > >         encode_sessionid4(xdr, session);
> > > 
> > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > 
> > >         hdr->nops++;
> > >  }
> > > 
> > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > +{
> > > +       /* No need to do anything if nothing changed */
> > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > +               return;
> > > +
> > > +       spin_lock(&ses->se_lock);
> > > +       if (target > ses->se_cb_highest_slot) {
> > > +               int i;
> > > +
> > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > +
> > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > +                       ses->se_cb_seq_nr[i] = 1;
> > > +       }
> > > +       ses->se_cb_highest_slot = target;
> > > +       spin_unlock(&ses->se_lock);
> > > +}
> > > +
> > >  /*
> > >   * CB_SEQUENCE4resok
> > >   *
> > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > >         int status = -ESERVERFAULT;
> > >         __be32 *p;
> > > -       u32 dummy;
> > > +       u32 seqid, slotid, target;
> > > 
> > >         /*
> > >          * If the server returns different values for sessionID, slotID or
> > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > >         }
> > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > 
> > > -       dummy = be32_to_cpup(p++);
> > > -       if (dummy != session->se_cb_seq_nr) {
> > > +       seqid = be32_to_cpup(p++);
> > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > >                 goto out;
> > >         }
> > > 
> > > -       dummy = be32_to_cpup(p++);
> > > -       if (dummy != 0) {
> > > +       slotid = be32_to_cpup(p++);
> > > +       if (slotid != cb->cb_held_slot) {
> > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > >                 goto out;
> > >         }
> > > 
> > > -       /*
> > > -        * FIXME: process highest slotid and target highest slotid
> > > -        */
> > > +       p++; // ignore current highest slot value
> > > +
> > > +       target = be32_to_cpup(p++);
> > > +       update_cb_slot_table(session, target);
> > >         status = 0;
> > >  out:
> > >         cb->cb_seq_status = status;
> > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > >         spin_unlock(&clp->cl_lock);
> > >  }
> > > 
> > > +static int grab_slot(struct nfsd4_session *ses)
> > > +{
> > > +       int idx;
> > > +
> > > +       spin_lock(&ses->se_lock);
> > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > +               spin_unlock(&ses->se_lock);
> > > +               return -1;
> > > +       }
> > > +       /* clear the bit for the slot */
> > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > +       spin_unlock(&ses->se_lock);
> > > +       return idx;
> > > +}
> > > +
> > >  /*
> > >   * There's currently a single callback channel slot.
> > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > >  {
> > >         struct nfs4_client *clp = cb->cb_clp;
> > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > 
> > > -       if (!cb->cb_holds_slot &&
> > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > +       if (cb->cb_held_slot >= 0)
> > > +               return true;
> > > +       cb->cb_held_slot = grab_slot(ses);
> > > +       if (cb->cb_held_slot < 0) {
> > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > >                 /* Race breaker */
> > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > -                       dprintk("%s slot is busy\n", __func__);
> > > +               cb->cb_held_slot = grab_slot(ses);
> > > +               if (cb->cb_held_slot < 0)
> > >                         return false;
> > > -               }
> > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > >         }
> > > -       cb->cb_holds_slot = true;
> > >         return true;
> > >  }
> > > 
> > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > >  {
> > >         struct nfs4_client *clp = cb->cb_clp;
> > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > 
> > > -       if (cb->cb_holds_slot) {
> > > -               cb->cb_holds_slot = false;
> > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > +       if (cb->cb_held_slot >= 0) {
> > > +               spin_lock(&ses->se_lock);
> > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > +               spin_unlock(&ses->se_lock);
> > > +               cb->cb_held_slot = -1;
> > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > >         }
> > >  }
> > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > >  }
> > > 
> > >  /*
> > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > - * slots, and mark callback channel down on communication errors.
> > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > + * and mark callback channel down on communication errors.
> > >   */
> > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > >  {
> > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > >                 return true;
> > >         }
> > > 
> > > -       if (!cb->cb_holds_slot)
> > > +       if (cb->cb_held_slot < 0)
> > >                 goto need_restart;
> > > 
> > >         /* This is the operation status code for CB_SEQUENCE */
> > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > >                  * (sequence ID, cached reply) MUST NOT change.
> > >                  */
> > > -               ++session->se_cb_seq_nr;
> > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > >                 break;
> > >         case -ESERVERFAULT:
> > > -               ++session->se_cb_seq_nr;
> > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > >                 ret = false;
> > >                 break;
> > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > >         case -NFS4ERR_BADSLOT:
> > >                 goto retry_nowait;
> > >         case -NFS4ERR_SEQ_MISORDERED:
> > > -               if (session->se_cb_seq_nr != 1) {
> > > -                       session->se_cb_seq_nr = 1;
> > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > >                         goto retry_nowait;
> > >                 }
> > >                 break;
> > >         default:
> > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > >         }
> > > -       nfsd41_cb_release_slot(cb);
> > > -
> > >         trace_nfsd_cb_free_slot(task, cb);
> > > +       nfsd41_cb_release_slot(cb);
> > > 
> > >         if (RPC_SIGNALLED(task))
> > >                 goto need_restart;
> > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > >         cb->cb_status = 0;
> > >         cb->cb_need_restart = false;
> > > -       cb->cb_holds_slot = false;
> > > +       cb->cb_held_slot = -1;
> > >  }
> > > 
> > >  /**
> > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > --- a/fs/nfsd/nfs4state.c
> > > +++ b/fs/nfsd/nfs4state.c
> > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > >         }
> > > 
> > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > +       new->se_cb_slot_avail = ~0U;
> > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > +       spin_lock_init(&new->se_lock);
> > >         return new;
> > >  out_free:
> > >         while (i--)
> > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > 
> > >         INIT_LIST_HEAD(&new->se_conns);
> > > 
> > > -       new->se_cb_seq_nr = 1;
> > > +       atomic_set(&new->se_ref, 0);
> > >         new->se_dead = false;
> > >         new->se_cb_prog = cses->callback_prog;
> > >         new->se_cb_sec = cses->cb_sec;
> > > -       atomic_set(&new->se_ref, 0);
> > > +
> > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > +               new->se_cb_seq_nr[idx] = 1;
> > > +
> > >         idx = hash_sessionid(&new->se_sessionid);
> > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > >         spin_lock(&clp->cl_lock);
> > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > >         clp->cl_time = ktime_get_boottime_seconds();
> > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > >         copy_verf(clp, verf);
> > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > >         clp->cl_cb_session = NULL;
> > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > --- a/fs/nfsd/state.h
> > > +++ b/fs/nfsd/state.h
> > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > >         struct work_struct cb_work;
> > >         int cb_seq_status;
> > >         int cb_status;
> > > +       int cb_held_slot;
> > >         bool cb_need_restart;
> > > -       bool cb_holds_slot;
> > >  };
> > > 
> > >  struct nfsd4_callback_ops {
> > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > >         unsigned char cn_flags;
> > >  };
> > > 
> > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > +
> > >  /*
> > >   * Representation of a v4.1+ session. These are refcounted in a similar fashion
> > >   * to the nfs4_client. References are only taken when the server is actively
> > > @@ -314,6 +317,10 @@ struct nfsd4_conn {
> > >   */
> > >  struct nfsd4_session {
> > >         atomic_t                se_ref;
> > > +       spinlock_t              se_lock;
> > > +       u32                     se_cb_slot_avail; /* bitmap of available slots */
> > > +       u32                     se_cb_highest_slot;     /* highest slot client wants */
> > > +       u32                     se_cb_prog;
> > >         bool                    se_dead;
> > >         struct list_head        se_hash;        /* hash by sessionid */
> > >         struct list_head        se_perclnt;
> > > @@ -322,8 +329,7 @@ struct nfsd4_session {
> > >         struct nfsd4_channel_attrs se_fchannel;
> > >         struct nfsd4_cb_sec     se_cb_sec;
> > >         struct list_head        se_conns;
> > > -       u32                     se_cb_prog;
> > > -       u32                     se_cb_seq_nr;
> > > +       u32                     se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
> > >         struct nfsd4_slot       *se_slots[];    /* forward channel slots */
> > >  };
> > > 
> > > @@ -457,9 +463,6 @@ struct nfs4_client {
> > >          */
> > >         struct dentry           *cl_nfsd_info_dentry;
> > > 
> > > -       /* for nfs41 callbacks */
> > > -       /* We currently support a single back channel with a single slot */
> > > -       unsigned long           cl_cb_slot_busy;
> > >         struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
> > >                                                 /* wait here for slots */
> > >         struct net              *net;
> > > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> > > --- a/fs/nfsd/trace.h
> > > +++ b/fs/nfsd/trace.h
> > > @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
> > >                 __entry->cl_id = sid->clientid.cl_id;
> > >                 __entry->seqno = sid->sequence;
> > >                 __entry->reserved = sid->reserved;
> > > -               __entry->slot_seqno = session->se_cb_seq_nr;
> > > +               __entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
> > >         ),
> > >         TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
> > >                 " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
> > > 
> > > ---
> > > base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> > > change-id: 20241025-bcwide-6bd7e4b63db2
> > > 
> > > Best regards,
> > > --
> > > Jeff Layton <jlayton@kernel.org>
> > > 
> > >
Olga Kornievskaia Nov. 11, 2024, 11:32 p.m. UTC | #23
On Mon, Nov 11, 2024 at 5:26 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Mon, 2024-11-11 at 16:56 -0500, Olga Kornievskaia wrote:
> > On Wed, Nov 6, 2024 at 11:44 AM Olga Kornievskaia <aglo@umich.edu> wrote:
> > >
> > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > >
> > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > >
> > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > track which slots are in use, and a u32 to track the latest callback
> > > > target_slotid that the client reports. To protect the new fields, add
> > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > search for the lowest slotid (using ffs()).
> > > >
> > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > counters and add the necessary handling to ensure that the seqids get
> > > > reset at the appropriate times.
> > > >
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > ---
> > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > would be much appreciated.
> > >
> > > I have tested this version. I can confirm that I'm not seeing the
> > > softlockup. But the server still does not use the lowest available
> > > slot. It is hard for me to describe the algorithm of picking the slot
> > > number (in general it still seems to be picking the next slot value,
> > > even though slots have been replied to). I have seen slot 0 re-used
> > > eventually but it seemed to be when the server came to using slot=13.
> > >
> > > The other unfortunate thing that's happening when I use these patches
> > > is my test case that recalling delegations and making sure that the
> > > state management gets handled properly (ie., the patch that I've
> > > submitted to fix a race between the laundromat thread and free_state)
> > > is not working. After all the recalls, the server still thinks it has
> > > revoked state. I have to debug more to figure out what's going on.
> >
> > I have finally been able to consistently hit the problem and it's not
> > a server issue but I can't decide who's at fault here -- client or
> > server. While handling the fact that state is revoked the client sends
> > what now is a SETATTR (for deleg attributes)+DELEGRETURN (previously
> > just a DELEGRETURN). SETATTR fails with BAD_STATEID. Client doesn't do
> > anything. Previously (before the deleg attr support) client would sent
> > DELEGRETURN and server would fail with DELEG_REVOKED or BAD_STATEID
> > and the client would follow up with FREE_STATEID. But now the client
> > doesn't send a FREE_STATEID and thus the server is left with "revoked
> > state which never was freed".
> > Now, if the server returned DELEG_REVOKED instead of BAD_STATEID for
> > the SETATTR then the problem doesn't happen.
> >
> > Question: is the server incorrect here or is the client incorrect and
> > should have (1) either also resent the delegreturn on its own which
> > was not processed before and that should have still triggered the
> > free_stateid or (2) should have treated bad_stateid error of setattr
> > in the delegreturn compound such that it freed the state there.
> >
> >
>
> That bit does sound like a server bug. DELEG_REVOKED is a valid return
> code for SETATTR. It looks like nfsd4_lookup_stateid() should be
> returning DELEG_REVOKED in this situation, so I'm not sure why that's
> not working right.

nfsd4_lookup_stateid() only returns DELEG_REVOKED for delegreturn,
doesn't it? It needs statusmask to set the acceptable states, and
nfsd4_setattr() calls the generic nfs4_preprocess_stateid_op() function,
which calls nfsd4_lookup_stateid() passing 0 for okstates. In setattr's
case it's the call into find_stateid_by_type() with a 0 statusmask which
doesn't allow stateids flagged as revoked to be "found", generating a
bad_stateid error instead. If we wanted to make sure SETATTR returns
revoked then we would need to re-write nfsd4_setattr not to call
nfs4_preprocess_stateid_op()... but if we are going down that rabbit
hole, the OPEN (with DELEG_CUR_FH) on a stateid which was revoked ends
with a BAD_STATEID error (not a DELEG_REVOKED error). It doesn't hurt the
client there. The client proceeds to test the stateid and free it. But
technically, OPEN should have returned a DELEG_REVOKED error. So should
we be changing that too?

I'm not so sure the server is (totally) at fault. I'm questioning why
receiving BAD_STATEID on the client for the SETATTR of a DELEGRETURN
compound doesn't lead to either a FREE_STATEID or a retry of the
DELEGRETURN (which treats both types of errors as a reason to trigger
FREE_STATEID).

> That said, I'm also interested in why delegations are ending up revoked
> in the first place.

Again, that's normal. The test case is that we have a large number of
opened files that are being recalled. As the server keeps sending
cb_recalls, the laundromat thread kicks in, determines that state has
been marked recalled for longer than the lease period, and so revokes it.

>
> > >
> > > > ---
> > > > Changes in v4:
> > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > >
> > > > Changes in v3:
> > > > - add patch to convert se_flags to single se_dead bool
> > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > - don't reject target highest slot value of 0
> > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > >
> > > > Changes in v2:
> > > > - take cl_lock when fetching fields from session to be encoded
> > > > - use fls() instead of bespoke highest_unset_index()
> > > > - rename variables in several functions with more descriptive names
> > > > - clamp limit of for loop in update_cb_slot_table()
> > > > - re-add missing rpc_wake_up_queued_task() call
> > > > - fix slotid check in decode_cb_sequence4resok()
> > > > - add new per-session spinlock
> > > > ---
> > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > >  fs/nfsd/state.h        |  15 ++++---
> > > >  fs/nfsd/trace.h        |   2 +-
> > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > >
> > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > --- a/fs/nfsd/nfs4callback.c
> > > > +++ b/fs/nfsd/nfs4callback.c
> > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > >         hdr->nops++;
> > > >  }
> > > >
> > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > +{
> > > > +       u32 idx;
> > > > +
> > > > +       spin_lock(&ses->se_lock);
> > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > +       if (idx > 0)
> > > > +               --idx;
> > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > +       spin_unlock(&ses->se_lock);
> > > > +       return idx;
> > > > +}
> > > > +
> > > >  /*
> > > >   * CB_SEQUENCE4args
> > > >   *
> > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > >         encode_sessionid4(xdr, session);
> > > >
> > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > >
> > > >         hdr->nops++;
> > > >  }
> > > >
> > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > +{
> > > > +       /* No need to do anything if nothing changed */
> > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > +               return;
> > > > +
> > > > +       spin_lock(&ses->se_lock);
> > > > +       if (target > ses->se_cb_highest_slot) {
> > > > +               int i;
> > > > +
> > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > +
> > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > +       }
> > > > +       ses->se_cb_highest_slot = target;
> > > > +       spin_unlock(&ses->se_lock);
> > > > +}
> > > > +
> > > >  /*
> > > >   * CB_SEQUENCE4resok
> > > >   *
> > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > >         int status = -ESERVERFAULT;
> > > >         __be32 *p;
> > > > -       u32 dummy;
> > > > +       u32 seqid, slotid, target;
> > > >
> > > >         /*
> > > >          * If the server returns different values for sessionID, slotID or
> > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > >         }
> > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > >
> > > > -       dummy = be32_to_cpup(p++);
> > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > +       seqid = be32_to_cpup(p++);
> > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > >                 goto out;
> > > >         }
> > > >
> > > > -       dummy = be32_to_cpup(p++);
> > > > -       if (dummy != 0) {
> > > > +       slotid = be32_to_cpup(p++);
> > > > +       if (slotid != cb->cb_held_slot) {
> > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > >                 goto out;
> > > >         }
> > > >
> > > > -       /*
> > > > -        * FIXME: process highest slotid and target highest slotid
> > > > -        */
> > > > +       p++; // ignore current highest slot value
> > > > +
> > > > +       target = be32_to_cpup(p++);
> > > > +       update_cb_slot_table(session, target);
> > > >         status = 0;
> > > >  out:
> > > >         cb->cb_seq_status = status;
> > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > >         spin_unlock(&clp->cl_lock);
> > > >  }
> > > >
> > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > +{
> > > > +       int idx;
> > > > +
> > > > +       spin_lock(&ses->se_lock);
> > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > +               spin_unlock(&ses->se_lock);
> > > > +               return -1;
> > > > +       }
> > > > +       /* clear the bit for the slot */
> > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > +       spin_unlock(&ses->se_lock);
> > > > +       return idx;
> > > > +}
> > > > +
> > > >  /*
> > > >   * There's currently a single callback channel slot.
> > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > >  {
> > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > >
> > > > -       if (!cb->cb_holds_slot &&
> > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > +       if (cb->cb_held_slot >= 0)
> > > > +               return true;
> > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > +       if (cb->cb_held_slot < 0) {
> > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > >                 /* Race breaker */
> > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > +               if (cb->cb_held_slot < 0)
> > > >                         return false;
> > > > -               }
> > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > >         }
> > > > -       cb->cb_holds_slot = true;
> > > >         return true;
> > > >  }
> > > >
> > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > >  {
> > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > >
> > > > -       if (cb->cb_holds_slot) {
> > > > -               cb->cb_holds_slot = false;
> > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > +       if (cb->cb_held_slot >= 0) {
> > > > +               spin_lock(&ses->se_lock);
> > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > +               spin_unlock(&ses->se_lock);
> > > > +               cb->cb_held_slot = -1;
> > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > >         }
> > > >  }
> > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > >  }
> > > >
> > > >  /*
> > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > - * slots, and mark callback channel down on communication errors.
> > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > + * and mark callback channel down on communication errors.
> > > >   */
> > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > >  {
> > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > >                 return true;
> > > >         }
> > > >
> > > > -       if (!cb->cb_holds_slot)
> > > > +       if (cb->cb_held_slot < 0)
> > > >                 goto need_restart;
> > > >
> > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > >                  */
> > > > -               ++session->se_cb_seq_nr;
> > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > >                 break;
> > > >         case -ESERVERFAULT:
> > > > -               ++session->se_cb_seq_nr;
> > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > >                 ret = false;
> > > >                 break;
> > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > >         case -NFS4ERR_BADSLOT:
> > > >                 goto retry_nowait;
> > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > -               if (session->se_cb_seq_nr != 1) {
> > > > -                       session->se_cb_seq_nr = 1;
> > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > >                         goto retry_nowait;
> > > >                 }
> > > >                 break;
> > > >         default:
> > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > >         }
> > > > -       nfsd41_cb_release_slot(cb);
> > > > -
> > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > +       nfsd41_cb_release_slot(cb);
> > > >
> > > >         if (RPC_SIGNALLED(task))
> > > >                 goto need_restart;
> > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > >         cb->cb_status = 0;
> > > >         cb->cb_need_restart = false;
> > > > -       cb->cb_holds_slot = false;
> > > > +       cb->cb_held_slot = -1;
> > > >  }
> > > >
> > > >  /**
> > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > --- a/fs/nfsd/nfs4state.c
> > > > +++ b/fs/nfsd/nfs4state.c
> > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > >         }
> > > >
> > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > +       new->se_cb_slot_avail = ~0U;
> > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > +       spin_lock_init(&new->se_lock);
> > > >         return new;
> > > >  out_free:
> > > >         while (i--)
> > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > >
> > > >         INIT_LIST_HEAD(&new->se_conns);
> > > >
> > > > -       new->se_cb_seq_nr = 1;
> > > > +       atomic_set(&new->se_ref, 0);
> > > >         new->se_dead = false;
> > > >         new->se_cb_prog = cses->callback_prog;
> > > >         new->se_cb_sec = cses->cb_sec;
> > > > -       atomic_set(&new->se_ref, 0);
> > > > +
> > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > +
> > > >         idx = hash_sessionid(&new->se_sessionid);
> > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > >         spin_lock(&clp->cl_lock);
> > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > >         copy_verf(clp, verf);
> > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > >         clp->cl_cb_session = NULL;
> > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > --- a/fs/nfsd/state.h
> > > > +++ b/fs/nfsd/state.h
> > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > >         struct work_struct cb_work;
> > > >         int cb_seq_status;
> > > >         int cb_status;
> > > > +       int cb_held_slot;
> > > >         bool cb_need_restart;
> > > > -       bool cb_holds_slot;
> > > >  };
> > > >
> > > >  struct nfsd4_callback_ops {
> > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > >         unsigned char cn_flags;
> > > >  };
> > > >
> > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > +
> > > >  /*
> > > >   * Representation of a v4.1+ session. These are refcounted in a similar fashion
> > > >   * to the nfs4_client. References are only taken when the server is actively
> > > > @@ -314,6 +317,10 @@ struct nfsd4_conn {
> > > >   */
> > > >  struct nfsd4_session {
> > > >         atomic_t                se_ref;
> > > > +       spinlock_t              se_lock;
> > > > +       u32                     se_cb_slot_avail; /* bitmap of available slots */
> > > > +       u32                     se_cb_highest_slot;     /* highest slot client wants */
> > > > +       u32                     se_cb_prog;
> > > >         bool                    se_dead;
> > > >         struct list_head        se_hash;        /* hash by sessionid */
> > > >         struct list_head        se_perclnt;
> > > > @@ -322,8 +329,7 @@ struct nfsd4_session {
> > > >         struct nfsd4_channel_attrs se_fchannel;
> > > >         struct nfsd4_cb_sec     se_cb_sec;
> > > >         struct list_head        se_conns;
> > > > -       u32                     se_cb_prog;
> > > > -       u32                     se_cb_seq_nr;
> > > > +       u32                     se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
> > > >         struct nfsd4_slot       *se_slots[];    /* forward channel slots */
> > > >  };
> > > >
> > > > @@ -457,9 +463,6 @@ struct nfs4_client {
> > > >          */
> > > >         struct dentry           *cl_nfsd_info_dentry;
> > > >
> > > > -       /* for nfs41 callbacks */
> > > > -       /* We currently support a single back channel with a single slot */
> > > > -       unsigned long           cl_cb_slot_busy;
> > > >         struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
> > > >                                                 /* wait here for slots */
> > > >         struct net              *net;
> > > > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > > index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> > > > --- a/fs/nfsd/trace.h
> > > > +++ b/fs/nfsd/trace.h
> > > > @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
> > > >                 __entry->cl_id = sid->clientid.cl_id;
> > > >                 __entry->seqno = sid->sequence;
> > > >                 __entry->reserved = sid->reserved;
> > > > -               __entry->slot_seqno = session->se_cb_seq_nr;
> > > > +               __entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
> > > >         ),
> > > >         TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
> > > >                 " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
> > > >
> > > > ---
> > > > base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> > > > change-id: 20241025-bcwide-6bd7e4b63db2
> > > >
> > > > Best regards,
> > > > --
> > > > Jeff Layton <jlayton@kernel.org>
> > > >
> > > >
>
> --
> Jeff Layton <jlayton@kernel.org>
NeilBrown Nov. 12, 2024, 11:23 p.m. UTC | #24
On Tue, 12 Nov 2024, Olga Kornievskaia wrote:
> On Mon, Nov 11, 2024 at 12:40 PM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > On Mon, 2024-11-11 at 12:17 -0500, Olga Kornievskaia wrote:
> > > On Mon, Nov 11, 2024 at 9:56 AM Chuck Lever <chuck.lever@oracle.com> wrote:
> > > >
> > > > On Mon, Nov 11, 2024 at 08:22:07AM -0500, Jeff Layton wrote:
> > > > > On Sun, 2024-11-10 at 21:19 -0500, Olga Kornievskaia wrote:
> > > > > > On Sat, Nov 9, 2024 at 2:26 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > >
> > > > > > > On Sat, 2024-11-09 at 13:50 -0500, Olga Kornievskaia wrote:
> > > > > > > > On Tue, Nov 5, 2024 at 7:31 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > >
> > > > > > > > > nfsd currently only uses a single slot in the callback channel, which is
> > > > > > > > > proving to be a bottleneck in some cases. Widen the callback channel to
> > > > > > > > > a max of 32 slots (subject to the client's target_maxreqs value).
> > > > > > > > >
> > > > > > > > > Change the cb_holds_slot boolean to an integer that tracks the current
> > > > > > > > > slot number (with -1 meaning "unassigned").  Move the callback slot
> > > > > > > > > tracking info into the session. Add a new u32 that acts as a bitmap to
> > > > > > > > > track which slots are in use, and a u32 to track the latest callback
> > > > > > > > > target_slotid that the client reports. To protect the new fields, add
> > > > > > > > > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > > > > > > > > search for the lowest slotid (using ffs()).
> > > > > > > > >
> > > > > > > > > Finally, convert the session->se_cb_seq_nr field into an array of
> > > > > > > > > counters and add the necessary handling to ensure that the seqids get
> > > > > > > > > reset at the appropriate times.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > > > > > > > ---
> > > > > > > > > v3 has a bug that Olga hit in testing. This version should fix the wait
> > > > > > > > > when the slot table is full. Olga, if you're able to test this one, it
> > > > > > > > > would be much appreciated.
> > > > > > > > > ---
> > > > > > > > > Changes in v4:
> > > > > > > > > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > > > > > > > > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > > > > > > > >
> > > > > > > > > Changes in v3:
> > > > > > > > > - add patch to convert se_flags to single se_dead bool
> > > > > > > > > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > > > > > > > > - don't reject target highest slot value of 0
> > > > > > > > > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > > > > > > > >
> > > > > > > > > Changes in v2:
> > > > > > > > > - take cl_lock when fetching fields from session to be encoded
> > > > > > > > > - use fls() instead of bespoke highest_unset_index()
> > > > > > > > > - rename variables in several functions with more descriptive names
> > > > > > > > > - clamp limit of for loop in update_cb_slot_table()
> > > > > > > > > - re-add missing rpc_wake_up_queued_task() call
> > > > > > > > > - fix slotid check in decode_cb_sequence4resok()
> > > > > > > > > - add new per-session spinlock
> > > > > > > > > ---
> > > > > > > > >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> > > > > > > > >  fs/nfsd/nfs4state.c    |  11 +++--
> > > > > > > > >  fs/nfsd/state.h        |  15 ++++---
> > > > > > > > >  fs/nfsd/trace.h        |   2 +-
> > > > > > > > >  4 files changed, 101 insertions(+), 40 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > > > > > > > > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > > > > > > > > --- a/fs/nfsd/nfs4callback.c
> > > > > > > > > +++ b/fs/nfsd/nfs4callback.c
> > > > > > > > > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> > > > > > > > >         hdr->nops++;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static u32 highest_slotid(struct nfsd4_session *ses)
> > > > > > > > > +{
> > > > > > > > > +       u32 idx;
> > > > > > > > > +
> > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > +       idx = fls(~ses->se_cb_slot_avail);
> > > > > > > > > +       if (idx > 0)
> > > > > > > > > +               --idx;
> > > > > > > > > +       idx = max(idx, ses->se_cb_highest_slot);
> > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > +       return idx;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  /*
> > > > > > > > >   * CB_SEQUENCE4args
> > > > > > > > >   *
> > > > > > > > > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> > > > > > > > >         encode_sessionid4(xdr, session);
> > > > > > > > >
> > > > > > > > >         p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > > > > > > > > -       *p++ = cpu_to_be32(session->se_cb_seq_nr);      /* csa_sequenceid */
> > > > > > > > > -       *p++ = xdr_zero;                        /* csa_slotid */
> > > > > > > > > -       *p++ = xdr_zero;                        /* csa_highest_slotid */
> > > > > > > > > +       *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);    /* csa_sequenceid */
> > > > > > > > > +       *p++ = cpu_to_be32(cb->cb_held_slot);           /* csa_slotid */
> > > > > > > > > +       *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> > > > > > > > >         *p++ = xdr_zero;                        /* csa_cachethis */
> > > > > > > > >         xdr_encode_empty_array(p);              /* csa_referring_call_lists */
> > > > > > > > >
> > > > > > > > >         hdr->nops++;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > > > > > > > > +{
> > > > > > > > > +       /* No need to do anything if nothing changed */
> > > > > > > > > +       if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > > > > > > > > +               return;
> > > > > > > > > +
> > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > +       if (target > ses->se_cb_highest_slot) {
> > > > > > > > > +               int i;
> > > > > > > > > +
> > > > > > > > > +               target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > > > > +
> > > > > > > > > +               /* Growing the slot table. Reset any new sequences to 1 */
> > > > > > > > > +               for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > > > > > > +                       ses->se_cb_seq_nr[i] = 1;
> > > > > > > > > +       }
> > > > > > > > > +       ses->se_cb_highest_slot = target;
> > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  /*
> > > > > > > > >   * CB_SEQUENCE4resok
> > > > > > > > >   *
> > > > > > > > > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > >         struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> > > > > > > > >         int status = -ESERVERFAULT;
> > > > > > > > >         __be32 *p;
> > > > > > > > > -       u32 dummy;
> > > > > > > > > +       u32 seqid, slotid, target;
> > > > > > > > >
> > > > > > > > >         /*
> > > > > > > > >          * If the server returns different values for sessionID, slotID or
> > > > > > > > > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> > > > > > > > >         }
> > > > > > > > >         p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> > > > > > > > >
> > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > -       if (dummy != session->se_cb_seq_nr) {
> > > > > > > > > +       seqid = be32_to_cpup(p++);
> > > > > > > > > +       if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> > > > > > > > >                 dprintk("NFS: %s Invalid sequence number\n", __func__);
> > > > > > > > >                 goto out;
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > > -       dummy = be32_to_cpup(p++);
> > > > > > > > > -       if (dummy != 0) {
> > > > > > > > > +       slotid = be32_to_cpup(p++);
> > > > > > > > > +       if (slotid != cb->cb_held_slot) {
> > > > > > > > >                 dprintk("NFS: %s Invalid slotid\n", __func__);
> > > > > > > > >                 goto out;
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > > -       /*
> > > > > > > > > -        * FIXME: process highest slotid and target highest slotid
> > > > > > > > > -        */
> > > > > > > > > +       p++; // ignore current highest slot value
> > > > > > > > > +
> > > > > > > > > +       target = be32_to_cpup(p++);
> > > > > > > > > +       update_cb_slot_table(session, target);
> > > > > > > > >         status = 0;
> > > > > > > > >  out:
> > > > > > > > >         cb->cb_seq_status = status;
> > > > > > > > > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > >         spin_unlock(&clp->cl_lock);
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static int grab_slot(struct nfsd4_session *ses)
> > > > > > > > > +{
> > > > > > > > > +       int idx;
> > > > > > > > > +
> > > > > > > > > +       spin_lock(&ses->se_lock);
> > > > > > > > > +       idx = ffs(ses->se_cb_slot_avail) - 1;
> > > > > > > > > +       if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > +               return -1;
> > > > > > > > > +       }
> > > > > > > > > +       /* clear the bit for the slot */
> > > > > > > > > +       ses->se_cb_slot_avail &= ~BIT(idx);
> > > > > > > > > +       spin_unlock(&ses->se_lock);
> > > > > > > > > +       return idx;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  /*
> > > > > > > > >   * There's currently a single callback channel slot.
> > > > > > > > >   * If the slot is available, then mark it busy.  Otherwise, set the
> > > > > > > > > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> > > > > > > > >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> > > > > > > > >  {
> > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > >
> > > > > > > > > -       if (!cb->cb_holds_slot &&
> > > > > > > > > -           test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > +       if (cb->cb_held_slot >= 0)
> > > > > > > > > +               return true;
> > > > > > > > > +       cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > +       if (cb->cb_held_slot < 0) {
> > > > > > > > >                 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> > > > > > > > >                 /* Race breaker */
> > > > > > > > > -               if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > > > > > > > > -                       dprintk("%s slot is busy\n", __func__);
> > > > > > > > > +               cb->cb_held_slot = grab_slot(ses);
> > > > > > > > > +               if (cb->cb_held_slot < 0)
> > > > > > > > >                         return false;
> > > > > > > > > -               }
> > > > > > > > >                 rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> > > > > > > > >         }
> > > > > > > > > -       cb->cb_holds_slot = true;
> > > > > > > > >         return true;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> > > > > > > > >  {
> > > > > > > > >         struct nfs4_client *clp = cb->cb_clp;
> > > > > > > > > +       struct nfsd4_session *ses = clp->cl_cb_session;
> > > > > > > > >
> > > > > > > > > -       if (cb->cb_holds_slot) {
> > > > > > > > > -               cb->cb_holds_slot = false;
> > > > > > > > > -               clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > > +       if (cb->cb_held_slot >= 0) {
> > > > > > > > > +               spin_lock(&ses->se_lock);
> > > > > > > > > +               ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > > > > > > > > +               spin_unlock(&ses->se_lock);
> > > > > > > > > +               cb->cb_held_slot = -1;
> > > > > > > > >                 rpc_wake_up_next(&clp->cl_cb_waitq);
> > > > > > > > >         }
> > > > > > > > >  }
> > > > > > > > > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  /*
> > > > > > > > > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > > > > > > > > - * slots, and mark callback channel down on communication errors.
> > > > > > > > > + * TODO: cb_sequence should support referring call lists, cachethis,
> > > > > > > > > + * and mark callback channel down on communication errors.
> > > > > > > > >   */
> > > > > > > > >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> > > > > > > > >  {
> > > > > > > > > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > >                 return true;
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > > -       if (!cb->cb_holds_slot)
> > > > > > > > > +       if (cb->cb_held_slot < 0)
> > > > > > > > >                 goto need_restart;
> > > > > > > > >
> > > > > > > > >         /* This is the operation status code for CB_SEQUENCE */
> > > > > > > > > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > >                  * If CB_SEQUENCE returns an error, then the state of the slot
> > > > > > > > >                  * (sequence ID, cached reply) MUST NOT change.
> > > > > > > > >                  */
> > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > >                 break;
> > > > > > > > >         case -ESERVERFAULT:
> > > > > > > > > -               ++session->se_cb_seq_nr;
> > > > > > > > > +               ++session->se_cb_seq_nr[cb->cb_held_slot];
> > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > >                 ret = false;
> > > > > > > > >                 break;
> > > > > > > > > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> > > > > > > > >         case -NFS4ERR_BADSLOT:
> > > > > > > > >                 goto retry_nowait;
> > > > > > > > >         case -NFS4ERR_SEQ_MISORDERED:
> > > > > > > > > -               if (session->se_cb_seq_nr != 1) {
> > > > > > > > > -                       session->se_cb_seq_nr = 1;
> > > > > > > > > +               if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > > > > > > +                       session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > > > > > > >                         goto retry_nowait;
> > > > > > > > >                 }
> > > > > > > > >                 break;
> > > > > > > > >         default:
> > > > > > > > >                 nfsd4_mark_cb_fault(cb->cb_clp);
> > > > > > > > >         }
> > > > > > > > > -       nfsd41_cb_release_slot(cb);
> > > > > > > > > -
> > > > > > > > >         trace_nfsd_cb_free_slot(task, cb);
> > > > > > > > > +       nfsd41_cb_release_slot(cb);
> > > > > > > > >
> > > > > > > > >         if (RPC_SIGNALLED(task))
> > > > > > > > >                 goto need_restart;
> > > > > > > > > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> > > > > > > > >         INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> > > > > > > > >         cb->cb_status = 0;
> > > > > > > > >         cb->cb_need_restart = false;
> > > > > > > > > -       cb->cb_holds_slot = false;
> > > > > > > > > +       cb->cb_held_slot = -1;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  /**
> > > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > > > > +       new->se_cb_slot_avail = ~0U;
> > > > > > > > > +       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > > > +       spin_lock_init(&new->se_lock);
> > > > > > > > >         return new;
> > > > > > > > >  out_free:
> > > > > > > > >         while (i--)
> > > > > > > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > > > > > > >
> > > > > > > > >         INIT_LIST_HEAD(&new->se_conns);
> > > > > > > > >
> > > > > > > > > -       new->se_cb_seq_nr = 1;
> > > > > > > > > +       atomic_set(&new->se_ref, 0);
> > > > > > > > >         new->se_dead = false;
> > > > > > > > >         new->se_cb_prog = cses->callback_prog;
> > > > > > > > >         new->se_cb_sec = cses->cb_sec;
> > > > > > > > > -       atomic_set(&new->se_ref, 0);
> > > > > > > > > +
> > > > > > > > > +       for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > > > > > > +               new->se_cb_seq_nr[idx] = 1;
> > > > > > > > > +
> > > > > > > > >         idx = hash_sessionid(&new->se_sessionid);
> > > > > > > > >         list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> > > > > > > > >         spin_lock(&clp->cl_lock);
> > > > > > > > > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> > > > > > > > >         kref_init(&clp->cl_nfsdfs.cl_ref);
> > > > > > > > >         nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> > > > > > > > >         clp->cl_time = ktime_get_boottime_seconds();
> > > > > > > > > -       clear_bit(0, &clp->cl_cb_slot_busy);
> > > > > > > > >         copy_verf(clp, verf);
> > > > > > > > >         memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> > > > > > > > >         clp->cl_cb_session = NULL;
> > > > > > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > > > > > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > > > > > > > > --- a/fs/nfsd/state.h
> > > > > > > > > +++ b/fs/nfsd/state.h
> > > > > > > > > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> > > > > > > > >         struct work_struct cb_work;
> > > > > > > > >         int cb_seq_status;
> > > > > > > > >         int cb_status;
> > > > > > > > > +       int cb_held_slot;
> > > > > > > > >         bool cb_need_restart;
> > > > > > > > > -       bool cb_holds_slot;
> > > > > > > > >  };
> > > > > > > > >
> > > > > > > > >  struct nfsd4_callback_ops {
> > > > > > > > > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> > > > > > > > >         unsigned char cn_flags;
> > > > > > > > >  };
> > > > > > > > >
> > > > > > > > > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > > > > > > > > +#define NFSD_BC_SLOT_TABLE_MAX (sizeof(u32) * 8 - 1)
> > > > > > > >
> > > > > > > > Are there some values that are known not to work? I was experimenting
> > > > > > > > with values and set it to 2 and 4 and the kernel oopsed. I understand
> > > > > > > > it's not a configurable value but it would still be good to know the
> > > > > > > > expectations...
> > > > > > > >
> > > > > > > > [  198.625021] Unable to handle kernel paging request at virtual
> > > > > > > > address dfff800020000000
> > > > > > > > [  198.625870] KASAN: probably user-memory-access in range
> > > > > > > > [0x0000000100000000-0x0000000100000007]
> > > > > > > > [  198.626444] Mem abort info:
> > > > > > > > [  198.626630]   ESR = 0x0000000096000005
> > > > > > > > [  198.626882]   EC = 0x25: DABT (current EL), IL = 32 bits
> > > > > > > > [  198.627234]   SET = 0, FnV = 0
> > > > > > > > [  198.627441]   EA = 0, S1PTW = 0
> > > > > > > > [  198.627627]   FSC = 0x05: level 1 translation fault
> > > > > > > > [  198.627859] Data abort info:
> > > > > > > > [  198.628000]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
> > > > > > > > [  198.628272]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > > > > > > > [  198.628619]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > > > > > > > [  198.628967] [dfff800020000000] address between user and kernel address ranges
> > > > > > > > [  198.629438] Internal error: Oops: 0000000096000005 [#1] SMP
> > > > > > > > [  198.629806] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver
> > > > > > > > nfs netfs nfnetlink_queue nfnetlink_log nfnetlink bluetooth cfg80211
> > > > > > > > rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd auth_rpcgss nfs_acl lockd
> > > > > > > > grace isofs uinput snd_seq_dummy snd_hrtimer vsock_loopback
> > > > > > > > vmw_vsock_virtio_transport_common qrtr rfkill vmw_vsock_vmci_transport
> > > > > > > > vsock sunrpc vfat fat snd_hda_codec_generic snd_hda_intel
> > > > > > > > snd_intel_dspcfg snd_hda_codec snd_hda_core snd_hwdep snd_seq uvcvideo
> > > > > > > > videobuf2_vmalloc snd_seq_device videobuf2_memops uvc videobuf2_v4l2
> > > > > > > > videodev snd_pcm videobuf2_common mc snd_timer snd vmw_vmci soundcore
> > > > > > > > xfs libcrc32c vmwgfx drm_ttm_helper ttm nvme drm_kms_helper
> > > > > > > > crct10dif_ce nvme_core ghash_ce sha2_ce sha256_arm64 sha1_ce drm
> > > > > > > > nvme_auth sr_mod cdrom e1000e sg fuse
> > > > > > > > [  198.633799] CPU: 5 UID: 0 PID: 6081 Comm: nfsd Kdump: loaded Not
> > > > > > > > tainted 6.12.0-rc6+ #47
> > > > > > > > [  198.634345] Hardware name: VMware, Inc. VMware20,1/VBSA, BIOS
> > > > > > > > VMW201.00V.21805430.BA64.2305221830 05/22/2023
> > > > > > > > [  198.635014] pstate: 11400005 (nzcV daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> > > > > > > > [  198.635492] pc : nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > [  198.635798] lr : nfsd4_sequence+0x340/0x1f60 [nfsd]
> > > > > > > > [  198.636065] sp : ffff8000884977e0
> > > > > > > > [  198.636234] x29: ffff800088497910 x28: ffff0000b1b39280 x27: ffff0000ab508128
> > > > > > > > [  198.636624] x26: ffff0000b1b39298 x25: ffff0000b1b39290 x24: ffff0000a65e1c64
> > > > > > > > [  198.637049] x23: 1fffe000212e6804 x22: ffff000109734024 x21: 1ffff00011092f16
> > > > > > > > [  198.637472] x20: ffff00010aed8000 x19: ffff000109734000 x18: 1fffe0002de20c8b
> > > > > > > > [  198.637883] x17: 0100000000000000 x16: 1ffff0000fcef234 x15: 1fffe000212e600f
> > > > > > > > [  198.638286] x14: ffff80007e779000 x13: ffff80007e7791a0 x12: 0000000000000000
> > > > > > > > [  198.638697] x11: ffff0000a65e1c38 x10: ffff00010aedaca0 x9 : 1fffe000215db594
> > > > > > > > [  198.639110] x8 : 1fffe00014cbc387 x7 : ffff0000a65e1c03 x6 : ffff0000a65e1c00
> > > > > > > > [  198.639541] x5 : ffff0000a65e1c00 x4 : 0000000020000000 x3 : 0000000100000001
> > > > > > > > [  198.639962] x2 : ffff000109730060 x1 : 0000000000000003 x0 : dfff800000000000
> > > > > > > > [  198.640332] Call trace:
> > > > > > > > [  198.640460]  nfsd4_sequence+0x5a0/0x1f60 [nfsd]
> > > > > > > > [  198.640715]  nfsd4_proc_compound+0xb94/0x23b0 [nfsd]
> > > > > > > > [  198.640997]  nfsd_dispatch+0x22c/0x718 [nfsd]
> > > > > > > > [  198.641260]  svc_process_common+0x8e8/0x1968 [sunrpc]
> > > > > > > > [  198.641566]  svc_process+0x3d4/0x7e0 [sunrpc]
> > > > > > > > [  198.641827]  svc_handle_xprt+0x828/0xe10 [sunrpc]
> > > > > > > > [  198.642108]  svc_recv+0x2cc/0x6a8 [sunrpc]
> > > > > > > > [  198.642346]  nfsd+0x270/0x400 [nfsd]
> > > > > > > > [  198.642562]  kthread+0x288/0x310
> > > > > > > > [  198.642745]  ret_from_fork+0x10/0x20
> > > > > > > > [  198.642937] Code: f2fbffe0 f9003be4 f94007e2 52800061 (38e06880)
> > > > > > > > [  198.643267] SMP: stopping secondary CPUs
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > Good catch. I think the problem here is that we don't currently cap the
> > > > > > > initial value of se_cb_highest_slot at NFSD_BC_SLOT_TABLE_MAX. Does
> > > > > > > this patch prevent the panic?
> > > > > > >
> > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > index 3afe56ab9e0a..839be4ba765a 100644
> > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > @@ -2011,7 +2011,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> > > > > > >
> > > > > > >         memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > > > > > >         new->se_cb_slot_avail = ~0U;
> > > > > > > -       new->se_cb_highest_slot = battrs->maxreqs - 1;
> > > > > > > +       new->se_cb_highest_slot = min(battrs->maxreqs - 1, NFSD_BC_SLOT_TABLE_MAX);
> > > > > > >         spin_lock_init(&new->se_lock);
> > > > > > >         return new;
> > > > > > >  out_free:
> > > > > >
> > > > > > It does help. I thought that the CREATE_SESSION reply for the
> > > > > > backchannel would be guided by the NFSD_BC_SLOT_TABLE_MAX value but
> > > > > > instead it seems like it's not. But yes I can see that the highest
> > > > > > slot used by the server is capped by the NFSD_BC_SLOT_TABLE_MAX value.
> > > > >
> > > > > Thanks for testing it, Olga.
> > > > >
> > > > > Chuck, would you be OK with folding the above delta into 9ab4c4077de9,
> > > > > or would you rather I resend the patch?
> > > >
> > > > I've folded the above one-liner into the applied patch.
> > > >
> > > > I agree with Tom, I think there's probably a (surprising)
> > > > explanation lurking for not seeing the expected performance
> > > > improvement. I can delay sending the NFSD v6.13 merge window pull
> > > > request for a bit to see if you can get it teased out.
> > >
> > > I would like to raise a couple of issues:
> > > (1) I believe the server should be reporting back an accurate value
> > > for the backchannel session table size. I think if the
> > > NFSD_BC_SLOT_TABLE_MAX was way lower than the client's value then the
> > > client would be wasting resources for its bc session table?
> >
> > Yes, but those resources are a 32-bit integer per wasted slot. The Linux
> > client allows for up to 16 slots, so we're wasting 64 bytes per session
> > with this scheme with the Linux client. I didn't think it was worth
> > doing a separate allocation for that.
> >
> > We could make NFSD_BC_SLOT_TABLE_MAX smaller though. Maybe we should
> > match the client's size and make it 15?
> >
> > > ->back_channel->maxreqs gets decoded in nfsd4_decode_create_session()
> > > and is never adjusted for the reply to be based on the
> > > NFSD_BC_SLOT_TABLE_MAX. The problem is currently invisible because
> > > linux client's bc slot table size is 16 and nfsd's is higher.
> > >
> >
> > I'm not sure I understand the problem here. We don't care about most of
> > the backchannel attributes. maxreqs is the only one that matters, and
> > we track that in se_cb_highest_slot.
> 
> Client sends a create_session with cba_back_chan_attrs with max_reqs
> of 16 -- stating that the client can handle 16 slots in its slot
> table. Server currently doesn't do anything about reflecting back to
> the client its session slot table. It blindly returns what the client
> sent. Say NFSD_BC_SLOT_TABLE_MAX was 4. Server would never use more
> than 4 slots and yet the client would have to create a reply cache
> table for 16 slots. Isn't that poor sportsmanship on behalf of the
> linux server?

RFC8881 section 18.36.2 - Description for CREATE_SESSION

ca_maxrequests:

 The maximum number of concurrent COMPOUND or CB_COMPOUND requests the
 requester will send on the session.  Subsequent requests will each be
 assigned a slot identifier by the requester within the range zero to
 ca_maxrequests - 1 inclusive.  For the backchannel, the server MUST NOT
 change the value the client offers.  For the fore channel, the server
 MAY change the requested value. 

The "MUST NOT" doesn't seem ambiguous.

NeilBrown
NeilBrown Nov. 13, 2024, 12:07 a.m. UTC | #25
On Wed, 06 Nov 2024, Jeff Layton wrote:
> nfsd currently only uses a single slot in the callback channel, which is
> proving to be a bottleneck in some cases. Widen the callback channel to
> a max of 32 slots (subject to the client's target_maxreqs value).
> 
> Change the cb_holds_slot boolean to an integer that tracks the current
> slot number (with -1 meaning "unassigned").  Move the callback slot
> tracking info into the session. Add a new u32 that acts as a bitmap to
> track which slots are in use, and a u32 to track the latest callback
> target_slotid that the client reports. To protect the new fields, add
> a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> search for the lowest slotid (using ffs()).
> 
> Finally, convert the session->se_cb_seq_nr field into an array of
> counters and add the necessary handling to ensure that the seqids get
> reset at the appropriate times.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> v3 has a bug that Olga hit in testing. This version should fix the wait
> when the slot table is full. Olga, if you're able to test this one, it
> would be much appreciated.
> ---
> Changes in v4:
> - Fix the wait for a slot in nfsd41_cb_get_slot()
> - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> 
> Changes in v3:
> - add patch to convert se_flags to single se_dead bool
> - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> - don't reject target highest slot value of 0
> - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> 
> Changes in v2:
> - take cl_lock when fetching fields from session to be encoded
> - use fls() instead of bespoke highest_unset_index()
> - rename variables in several functions with more descriptive names
> - clamp limit of for loop in update_cb_slot_table()
> - re-add missing rpc_wake_up_queued_task() call
> - fix slotid check in decode_cb_sequence4resok()
> - add new per-session spinlock
> ---
>  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
>  fs/nfsd/nfs4state.c    |  11 +++--
>  fs/nfsd/state.h        |  15 ++++---
>  fs/nfsd/trace.h        |   2 +-
>  4 files changed, 101 insertions(+), 40 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
>  	hdr->nops++;
>  }
>  
> +static u32 highest_slotid(struct nfsd4_session *ses)
> +{
> +	u32 idx;
> +
> +	spin_lock(&ses->se_lock);
> +	idx = fls(~ses->se_cb_slot_avail);
> +	if (idx > 0)
> +		--idx;
> +	idx = max(idx, ses->se_cb_highest_slot);
> +	spin_unlock(&ses->se_lock);
> +	return idx;
> +}
> +
>  /*
>   * CB_SEQUENCE4args
>   *
> @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
>  	encode_sessionid4(xdr, session);
>  
>  	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> -	*p++ = cpu_to_be32(session->se_cb_seq_nr);	/* csa_sequenceid */
> -	*p++ = xdr_zero;			/* csa_slotid */
> -	*p++ = xdr_zero;			/* csa_highest_slotid */
> +	*p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);	/* csa_sequenceid */
> +	*p++ = cpu_to_be32(cb->cb_held_slot);		/* csa_slotid */
> +	*p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
>  	*p++ = xdr_zero;			/* csa_cachethis */
>  	xdr_encode_empty_array(p);		/* csa_referring_call_lists */
>  
>  	hdr->nops++;
>  }
>  
> +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> +{
> +	/* No need to do anything if nothing changed */
> +	if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> +		return;
> +
> +	spin_lock(&ses->se_lock);
> +	if (target > ses->se_cb_highest_slot) {
> +		int i;
> +
> +		target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> +
> +		/* Growing the slot table. Reset any new sequences to 1 */
> +		for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> +			ses->se_cb_seq_nr[i] = 1;

Where is the justification in the RFC for resetting the sequence
numbers?

The csr_target_highest_slotid from the client - which is the value passed as
'target' is defined as:

   the highest slot ID the client would prefer the server use on a
   future CB_SEQUENCE operation. 

This is not "the highest slot ID for which the client is remembering
sequence numbers".

If we can get rid of this, then I think the need for se_lock evaporates.
Allocating a new slow would be

do {
 idx = ffs(ses->se_cb_slot_avail) - 1;
} while (is_valid(idx) && test_and_set_bit(idx, &ses->se_sb_slot_avail));
 
where is_valid(idx) is idx >= 0 && idx <= ses->se_cb_highest_slot


> +	}
> +	ses->se_cb_highest_slot = target;
> +	spin_unlock(&ses->se_lock);
> +}
> +
>  /*
>   * CB_SEQUENCE4resok
>   *
> @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
>  	struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
>  	int status = -ESERVERFAULT;
>  	__be32 *p;
> -	u32 dummy;
> +	u32 seqid, slotid, target;
>  
>  	/*
>  	 * If the server returns different values for sessionID, slotID or
> @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
>  	}
>  	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
>  
> -	dummy = be32_to_cpup(p++);
> -	if (dummy != session->se_cb_seq_nr) {
> +	seqid = be32_to_cpup(p++);
> +	if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
>  		dprintk("NFS: %s Invalid sequence number\n", __func__);
>  		goto out;
>  	}
>  
> -	dummy = be32_to_cpup(p++);
> -	if (dummy != 0) {
> +	slotid = be32_to_cpup(p++);
> +	if (slotid != cb->cb_held_slot) {
>  		dprintk("NFS: %s Invalid slotid\n", __func__);
>  		goto out;
>  	}
>  
> -	/*
> -	 * FIXME: process highest slotid and target highest slotid
> -	 */
> +	p++; // ignore current highest slot value
> +
> +	target = be32_to_cpup(p++);
> +	update_cb_slot_table(session, target);
>  	status = 0;
>  out:
>  	cb->cb_seq_status = status;
> @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
>  	spin_unlock(&clp->cl_lock);
>  }
>  
> +static int grab_slot(struct nfsd4_session *ses)
> +{
> +	int idx;
> +
> +	spin_lock(&ses->se_lock);
> +	idx = ffs(ses->se_cb_slot_avail) - 1;
> +	if (idx < 0 || idx > ses->se_cb_highest_slot) {
> +		spin_unlock(&ses->se_lock);
> +		return -1;
> +	}
> +	/* clear the bit for the slot */
> +	ses->se_cb_slot_avail &= ~BIT(idx);
> +	spin_unlock(&ses->se_lock);
> +	return idx;
> +}
> +
>  /*
>   * There's currently a single callback channel slot.
>   * If the slot is available, then mark it busy.  Otherwise, set the
> @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
>  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
>  {
>  	struct nfs4_client *clp = cb->cb_clp;
> +	struct nfsd4_session *ses = clp->cl_cb_session;
>  
> -	if (!cb->cb_holds_slot &&
> -	    test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> +	if (cb->cb_held_slot >= 0)
> +		return true;
> +	cb->cb_held_slot = grab_slot(ses);
> +	if (cb->cb_held_slot < 0) {
>  		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
>  		/* Race breaker */
> -		if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> -			dprintk("%s slot is busy\n", __func__);
> +		cb->cb_held_slot = grab_slot(ses);
> +		if (cb->cb_held_slot < 0)
>  			return false;
> -		}
>  		rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
>  	}
> -	cb->cb_holds_slot = true;
>  	return true;
>  }
>  
>  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
>  {
>  	struct nfs4_client *clp = cb->cb_clp;
> +	struct nfsd4_session *ses = clp->cl_cb_session;
>  
> -	if (cb->cb_holds_slot) {
> -		cb->cb_holds_slot = false;
> -		clear_bit(0, &clp->cl_cb_slot_busy);
> +	if (cb->cb_held_slot >= 0) {
> +		spin_lock(&ses->se_lock);
> +		ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> +		spin_unlock(&ses->se_lock);
> +		cb->cb_held_slot = -1;
>  		rpc_wake_up_next(&clp->cl_cb_waitq);
>  	}
>  }
> @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
>  }
>  
>  /*
> - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> - * slots, and mark callback channel down on communication errors.
> + * TODO: cb_sequence should support referring call lists, cachethis,
> + * and mark callback channel down on communication errors.
>   */
>  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
>  {
> @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>  		return true;
>  	}
>  
> -	if (!cb->cb_holds_slot)
> +	if (cb->cb_held_slot < 0)
>  		goto need_restart;
>  
>  	/* This is the operation status code for CB_SEQUENCE */
> @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>  		 * If CB_SEQUENCE returns an error, then the state of the slot
>  		 * (sequence ID, cached reply) MUST NOT change.
>  		 */
> -		++session->se_cb_seq_nr;
> +		++session->se_cb_seq_nr[cb->cb_held_slot];
>  		break;
>  	case -ESERVERFAULT:
> -		++session->se_cb_seq_nr;
> +		++session->se_cb_seq_nr[cb->cb_held_slot];
>  		nfsd4_mark_cb_fault(cb->cb_clp);
>  		ret = false;
>  		break;
> @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
>  	case -NFS4ERR_BADSLOT:
>  		goto retry_nowait;
>  	case -NFS4ERR_SEQ_MISORDERED:
> -		if (session->se_cb_seq_nr != 1) {
> -			session->se_cb_seq_nr = 1;
> +		if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> +			session->se_cb_seq_nr[cb->cb_held_slot] = 1;

This is weird ...  why do we reset the seq_nr to 1 when we get
SEQ_MISORDERED??  Git logs don't shed any light :-(


>  			goto retry_nowait;
>  		}
>  		break;
>  	default:
>  		nfsd4_mark_cb_fault(cb->cb_clp);
>  	}
> -	nfsd41_cb_release_slot(cb);
> -
>  	trace_nfsd_cb_free_slot(task, cb);
> +	nfsd41_cb_release_slot(cb);
>  
>  	if (RPC_SIGNALLED(task))
>  		goto need_restart;
> @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
>  	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
>  	cb->cb_status = 0;
>  	cb->cb_need_restart = false;
> -	cb->cb_holds_slot = false;
> +	cb->cb_held_slot = -1;
>  }
>  
>  /**
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
>  	}
>  
>  	memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> +	new->se_cb_slot_avail = ~0U;
> +	new->se_cb_highest_slot = battrs->maxreqs - 1;
> +	spin_lock_init(&new->se_lock);
>  	return new;
>  out_free:
>  	while (i--)
> @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
>  
>  	INIT_LIST_HEAD(&new->se_conns);
>  
> -	new->se_cb_seq_nr = 1;
> +	atomic_set(&new->se_ref, 0);
>  	new->se_dead = false;
>  	new->se_cb_prog = cses->callback_prog;
>  	new->se_cb_sec = cses->cb_sec;
> -	atomic_set(&new->se_ref, 0);
> +
> +	for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> +		new->se_cb_seq_nr[idx] = 1;

That should be "<= NFSD_BC_SLOT_TABLE_MAX"

I don't think *_MAX is a good choice of name.  Is it the maximum number
of slots (no) or the maximum slot number (yes)?
I think *_SIZE would be a better name - the size of the table that we
allocate. 32.
Looking at where the const is used in current nfsd-next:

		target = min(target, NFSD_BC_SLOT_TABLE_SIZE - 1);

	new->se_cb_highest_slot = min(battrs->maxreqs,
				      NFSD_BC_SLOT_TABLE_SIZE) - 1;

	for (idx = 0; idx < NFSD_BC_SLOT_TABLE_SIZE; ++idx)

#define NFSD_BC_SLOT_TABLE_SIZE	(sizeof(u32) * 8)

	u32			se_cb_seq_nr[NFSD_BC_SLOT_TABLE_SIZE];

which is a slight reduction in the number of "+/-1" adjustments.




> +
>  	idx = hash_sessionid(&new->se_sessionid);
>  	list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
>  	spin_lock(&clp->cl_lock);
> @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
>  	kref_init(&clp->cl_nfsdfs.cl_ref);
>  	nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
>  	clp->cl_time = ktime_get_boottime_seconds();
> -	clear_bit(0, &clp->cl_cb_slot_busy);
>  	copy_verf(clp, verf);
>  	memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
>  	clp->cl_cb_session = NULL;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -71,8 +71,8 @@ struct nfsd4_callback {
>  	struct work_struct cb_work;
>  	int cb_seq_status;
>  	int cb_status;
> +	int cb_held_slot;
>  	bool cb_need_restart;
> -	bool cb_holds_slot;
>  };
>  
>  struct nfsd4_callback_ops {
> @@ -307,6 +307,9 @@ struct nfsd4_conn {
>  	unsigned char cn_flags;
>  };
>  
> +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> +#define NFSD_BC_SLOT_TABLE_MAX	(sizeof(u32) * 8 - 1)
> +
>  /*
>   * Representation of a v4.1+ session. These are refcounted in a similar fashion
>   * to the nfs4_client. References are only taken when the server is actively
> @@ -314,6 +317,10 @@ struct nfsd4_conn {
>   */
>  struct nfsd4_session {
>  	atomic_t		se_ref;
> +	spinlock_t		se_lock;
> +	u32			se_cb_slot_avail; /* bitmap of available slots */
> +	u32			se_cb_highest_slot;	/* highest slot client wants */
> +	u32			se_cb_prog;
>  	bool			se_dead;
>  	struct list_head	se_hash;	/* hash by sessionid */
>  	struct list_head	se_perclnt;
> @@ -322,8 +329,7 @@ struct nfsd4_session {
>  	struct nfsd4_channel_attrs se_fchannel;
>  	struct nfsd4_cb_sec	se_cb_sec;
>  	struct list_head	se_conns;
> -	u32			se_cb_prog;
> -	u32			se_cb_seq_nr;
> +	u32			se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
>  	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
>  };
>  
> @@ -457,9 +463,6 @@ struct nfs4_client {
>  	 */
>  	struct dentry		*cl_nfsd_info_dentry;
>  
> -	/* for nfs41 callbacks */
> -	/* We currently support a single back channel with a single slot */
> -	unsigned long		cl_cb_slot_busy;
>  	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
>  						/* wait here for slots */
>  	struct net		*net;
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
>  		__entry->cl_id = sid->clientid.cl_id;
>  		__entry->seqno = sid->sequence;
>  		__entry->reserved = sid->reserved;
> -		__entry->slot_seqno = session->se_cb_seq_nr;
> +		__entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
>  	),
>  	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
>  		" sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
> 
> ---
> base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> change-id: 20241025-bcwide-6bd7e4b63db2
> 
> Best regards,
> -- 
> Jeff Layton <jlayton@kernel.org>
> 
> 

NeilBrown
Jeff Layton Nov. 13, 2024, 1:03 a.m. UTC | #26
On Wed, 2024-11-13 at 11:07 +1100, NeilBrown wrote:
> On Wed, 06 Nov 2024, Jeff Layton wrote:
> > nfsd currently only uses a single slot in the callback channel, which is
> > proving to be a bottleneck in some cases. Widen the callback channel to
> > a max of 32 slots (subject to the client's target_maxreqs value).
> > 
> > Change the cb_holds_slot boolean to an integer that tracks the current
> > slot number (with -1 meaning "unassigned").  Move the callback slot
> > tracking info into the session. Add a new u32 that acts as a bitmap to
> > track which slots are in use, and a u32 to track the latest callback
> > target_slotid that the client reports. To protect the new fields, add
> > a new per-session spinlock (the se_lock). Fix nfsd41_cb_get_slot to always
> > search for the lowest slotid (using ffs()).
> > 
> > Finally, convert the session->se_cb_seq_nr field into an array of
> > counters and add the necessary handling to ensure that the seqids get
> > reset at the appropriate times.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > ---
> > v3 has a bug that Olga hit in testing. This version should fix the wait
> > when the slot table is full. Olga, if you're able to test this one, it
> > would be much appreciated.
> > ---
> > Changes in v4:
> > - Fix the wait for a slot in nfsd41_cb_get_slot()
> > - Link to v3: https://lore.kernel.org/r/20241030-bcwide-v3-0-c2df49a26c45@kernel.org
> > 
> > Changes in v3:
> > - add patch to convert se_flags to single se_dead bool
> > - fix off-by-one bug in handling of NFSD_BC_SLOT_TABLE_MAX
> > - don't reject target highest slot value of 0
> > - Link to v2: https://lore.kernel.org/r/20241029-bcwide-v2-1-e9010b6ef55d@kernel.org
> > 
> > Changes in v2:
> > - take cl_lock when fetching fields from session to be encoded
> > - use fls() instead of bespoke highest_unset_index()
> > - rename variables in several functions with more descriptive names
> > - clamp limit of for loop in update_cb_slot_table()
> > - re-add missing rpc_wake_up_queued_task() call
> > - fix slotid check in decode_cb_sequence4resok()
> > - add new per-session spinlock
> > ---
> >  fs/nfsd/nfs4callback.c | 113 ++++++++++++++++++++++++++++++++++++-------------
> >  fs/nfsd/nfs4state.c    |  11 +++--
> >  fs/nfsd/state.h        |  15 ++++---
> >  fs/nfsd/trace.h        |   2 +-
> >  4 files changed, 101 insertions(+), 40 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
> > --- a/fs/nfsd/nfs4callback.c
> > +++ b/fs/nfsd/nfs4callback.c
> > @@ -406,6 +406,19 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
> >  	hdr->nops++;
> >  }
> >  
> > +static u32 highest_slotid(struct nfsd4_session *ses)
> > +{
> > +	u32 idx;
> > +
> > +	spin_lock(&ses->se_lock);
> > +	idx = fls(~ses->se_cb_slot_avail);
> > +	if (idx > 0)
> > +		--idx;
> > +	idx = max(idx, ses->se_cb_highest_slot);
> > +	spin_unlock(&ses->se_lock);
> > +	return idx;
> > +}
> > +
> >  /*
> >   * CB_SEQUENCE4args
> >   *
> > @@ -432,15 +445,35 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
> >  	encode_sessionid4(xdr, session);
> >  
> >  	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
> > -	*p++ = cpu_to_be32(session->se_cb_seq_nr);	/* csa_sequenceid */
> > -	*p++ = xdr_zero;			/* csa_slotid */
> > -	*p++ = xdr_zero;			/* csa_highest_slotid */
> > +	*p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);	/* csa_sequenceid */
> > +	*p++ = cpu_to_be32(cb->cb_held_slot);		/* csa_slotid */
> > +	*p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
> >  	*p++ = xdr_zero;			/* csa_cachethis */
> >  	xdr_encode_empty_array(p);		/* csa_referring_call_lists */
> >  
> >  	hdr->nops++;
> >  }
> >  
> > +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
> > +{
> > +	/* No need to do anything if nothing changed */
> > +	if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
> > +		return;
> > +
> > +	spin_lock(&ses->se_lock);
> > +	if (target > ses->se_cb_highest_slot) {
> > +		int i;
> > +
> > +		target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > +
> > +		/* Growing the slot table. Reset any new sequences to 1 */
> > +		for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > +			ses->se_cb_seq_nr[i] = 1;
> 
> Where is the justification in the RFC for resetting the sequence
> numbers?
> 

RFC 8881, 18.36:



[...]

Once the session is created, the first SEQUENCE or CB_SEQUENCE received
on a slot MUST have a sequence ID equal to 1; if not, the replier MUST
return NFS4ERR_SEQ_MISORDERED.

There is also some verbiage in 20.10.6.1.

> The csr_target_highest_slotid from the client - which is the value passed as
> 'target' is defined as:
> 
>    the highest slot ID the client would prefer the server use on a
>    future CB_SEQUENCE operation. 
> 
> This is not "the highest slot ID for which the client is remembering
> sequence numbers".
> 
> If we can get rid of this, then I think the need for se_lock evaporates.
> Allocating a new slow would be
> 
> do {
>  idx = ffs(ses->se_cb_slot_avail) - 1;
> } while (is_valid(idx) && test_and_set_bit(idx, &ses->se_sb_slot_avail));
>  
> where is_valid(idX) is idx >= 0 && idx <= ses->se_sb_highest_slot
> 

That certainly would be better.

Maybe it's not required to start the seqid for a new slot at 1? If a
new slot can start its sequence counter at an arbitrary value then we
should be able to do this.

> 
> > +	}
> > +	ses->se_cb_highest_slot = target;
> > +	spin_unlock(&ses->se_lock);
> > +}
> > +
> >  /*
> >   * CB_SEQUENCE4resok
> >   *
> > @@ -468,7 +501,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> >  	struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
> >  	int status = -ESERVERFAULT;
> >  	__be32 *p;
> > -	u32 dummy;
> > +	u32 seqid, slotid, target;
> >  
> >  	/*
> >  	 * If the server returns different values for sessionID, slotID or
> > @@ -484,21 +517,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
> >  	}
> >  	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
> >  
> > -	dummy = be32_to_cpup(p++);
> > -	if (dummy != session->se_cb_seq_nr) {
> > +	seqid = be32_to_cpup(p++);
> > +	if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
> >  		dprintk("NFS: %s Invalid sequence number\n", __func__);
> >  		goto out;
> >  	}
> >  
> > -	dummy = be32_to_cpup(p++);
> > -	if (dummy != 0) {
> > +	slotid = be32_to_cpup(p++);
> > +	if (slotid != cb->cb_held_slot) {
> >  		dprintk("NFS: %s Invalid slotid\n", __func__);
> >  		goto out;
> >  	}
> >  
> > -	/*
> > -	 * FIXME: process highest slotid and target highest slotid
> > -	 */
> > +	p++; // ignore current highest slot value
> > +
> > +	target = be32_to_cpup(p++);
> > +	update_cb_slot_table(session, target);
> >  	status = 0;
> >  out:
> >  	cb->cb_seq_status = status;
> > @@ -1203,6 +1237,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> >  	spin_unlock(&clp->cl_lock);
> >  }
> >  
> > +static int grab_slot(struct nfsd4_session *ses)
> > +{
> > +	int idx;
> > +
> > +	spin_lock(&ses->se_lock);
> > +	idx = ffs(ses->se_cb_slot_avail) - 1;
> > +	if (idx < 0 || idx > ses->se_cb_highest_slot) {
> > +		spin_unlock(&ses->se_lock);
> > +		return -1;
> > +	}
> > +	/* clear the bit for the slot */
> > +	ses->se_cb_slot_avail &= ~BIT(idx);
> > +	spin_unlock(&ses->se_lock);
> > +	return idx;
> > +}
> > +
> >  /*
> >   * There's currently a single callback channel slot.
> >   * If the slot is available, then mark it busy.  Otherwise, set the
> > @@ -1211,28 +1261,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
> >  static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
> >  {
> >  	struct nfs4_client *clp = cb->cb_clp;
> > +	struct nfsd4_session *ses = clp->cl_cb_session;
> >  
> > -	if (!cb->cb_holds_slot &&
> > -	    test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > +	if (cb->cb_held_slot >= 0)
> > +		return true;
> > +	cb->cb_held_slot = grab_slot(ses);
> > +	if (cb->cb_held_slot < 0) {
> >  		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
> >  		/* Race breaker */
> > -		if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
> > -			dprintk("%s slot is busy\n", __func__);
> > +		cb->cb_held_slot = grab_slot(ses);
> > +		if (cb->cb_held_slot < 0)
> >  			return false;
> > -		}
> >  		rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
> >  	}
> > -	cb->cb_holds_slot = true;
> >  	return true;
> >  }
> >  
> >  static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
> >  {
> >  	struct nfs4_client *clp = cb->cb_clp;
> > +	struct nfsd4_session *ses = clp->cl_cb_session;
> >  
> > -	if (cb->cb_holds_slot) {
> > -		cb->cb_holds_slot = false;
> > -		clear_bit(0, &clp->cl_cb_slot_busy);
> > +	if (cb->cb_held_slot >= 0) {
> > +		spin_lock(&ses->se_lock);
> > +		ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
> > +		spin_unlock(&ses->se_lock);
> > +		cb->cb_held_slot = -1;
> >  		rpc_wake_up_next(&clp->cl_cb_waitq);
> >  	}
> >  }
> > @@ -1249,8 +1303,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
> >  }
> >  
> >  /*
> > - * TODO: cb_sequence should support referring call lists, cachethis, multiple
> > - * slots, and mark callback channel down on communication errors.
> > + * TODO: cb_sequence should support referring call lists, cachethis,
> > + * and mark callback channel down on communication errors.
> >   */
> >  static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
> >  {
> > @@ -1292,7 +1346,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >  		return true;
> >  	}
> >  
> > -	if (!cb->cb_holds_slot)
> > +	if (cb->cb_held_slot < 0)
> >  		goto need_restart;
> >  
> >  	/* This is the operation status code for CB_SEQUENCE */
> > @@ -1306,10 +1360,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >  		 * If CB_SEQUENCE returns an error, then the state of the slot
> >  		 * (sequence ID, cached reply) MUST NOT change.
> >  		 */
> > -		++session->se_cb_seq_nr;
> > +		++session->se_cb_seq_nr[cb->cb_held_slot];
> >  		break;
> >  	case -ESERVERFAULT:
> > -		++session->se_cb_seq_nr;
> > +		++session->se_cb_seq_nr[cb->cb_held_slot];
> >  		nfsd4_mark_cb_fault(cb->cb_clp);
> >  		ret = false;
> >  		break;
> > @@ -1335,17 +1389,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
> >  	case -NFS4ERR_BADSLOT:
> >  		goto retry_nowait;
> >  	case -NFS4ERR_SEQ_MISORDERED:
> > -		if (session->se_cb_seq_nr != 1) {
> > -			session->se_cb_seq_nr = 1;
> > +		if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > +			session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> 
> This is weird ...  why do we reset the seq_nr to 1 when we get
> SEQ_MISORDERED??  Git logs don't shed any light :-(
> 


The above verbiage from 18.36 might hint that this is the right thing
to do, but it's a little vague.

> >  			goto retry_nowait;
> >  		}
> >  		break;
> >  	default:
> >  		nfsd4_mark_cb_fault(cb->cb_clp);
> >  	}
> > -	nfsd41_cb_release_slot(cb);
> > -
> >  	trace_nfsd_cb_free_slot(task, cb);
> > +	nfsd41_cb_release_slot(cb);
> >  
> >  	if (RPC_SIGNALLED(task))
> >  		goto need_restart;
> > @@ -1565,7 +1618,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
> >  	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
> >  	cb->cb_status = 0;
> >  	cb->cb_need_restart = false;
> > -	cb->cb_holds_slot = false;
> > +	cb->cb_held_slot = -1;
> >  }
> >  
> >  /**
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -2002,6 +2002,9 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
> >  	}
> >  
> >  	memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
> > +	new->se_cb_slot_avail = ~0U;
> > +	new->se_cb_highest_slot = battrs->maxreqs - 1;
> > +	spin_lock_init(&new->se_lock);
> >  	return new;
> >  out_free:
> >  	while (i--)
> > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> >  
> >  	INIT_LIST_HEAD(&new->se_conns);
> >  
> > -	new->se_cb_seq_nr = 1;
> > +	atomic_set(&new->se_ref, 0);
> >  	new->se_dead = false;
> >  	new->se_cb_prog = cses->callback_prog;
> >  	new->se_cb_sec = cses->cb_sec;
> > -	atomic_set(&new->se_ref, 0);
> > +
> > +	for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > +		new->se_cb_seq_nr[idx] = 1;
> 
> That should be "<= NFSD_BC_SLOT_TABLE_MAX"

MAX in this case is the maximum slot index, so this is correct for the
code as it stands today. I'm fine with redefining the constant to track
the size of the slot table instead. We could also make the existing
code more clear by just renaming the existing constant to
NFSD_BC_SLOT_INDEX_MAX.

> 
> I don't think *_MAX is a good choice of name.  It is the maximum number
> of slots (no) or the maximum slot number (yes).
> I think *_SIZE would be a better name - the size of the table that we
> allocate. 32.
> Looking at where the const is used in current nfsd-next:
> 
> 		target = min(target, NFSD_BC_SLOT_TABLE_SIZE - 1
> 
> 	new->se_cb_highest_slot = min(battrs->maxreqs,
> 				      NFSD_BC_SLOT_TABLE_SIZE) - 1;
> 
> 	for (idx = 0; idx < NFSD_BC_SLOT_TABLE_SIZE; ++idx)
> 
> #define NFSD_BC_SLOT_TABLE_SIZE	(sizeof(u32) * 8)
> 
> 	u32			se_cb_seq_nr[NFSD_BC_SLOT_TABLE_SIZE];
> 
> which is a slight reduction in the number of "+/-1" adjustments.
> 
> 
> 
> 
> > +
> >  	idx = hash_sessionid(&new->se_sessionid);
> >  	list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
> >  	spin_lock(&clp->cl_lock);
> > @@ -3159,7 +3165,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
> >  	kref_init(&clp->cl_nfsdfs.cl_ref);
> >  	nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
> >  	clp->cl_time = ktime_get_boottime_seconds();
> > -	clear_bit(0, &clp->cl_cb_slot_busy);
> >  	copy_verf(clp, verf);
> >  	memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
> >  	clp->cl_cb_session = NULL;
> > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
> > --- a/fs/nfsd/state.h
> > +++ b/fs/nfsd/state.h
> > @@ -71,8 +71,8 @@ struct nfsd4_callback {
> >  	struct work_struct cb_work;
> >  	int cb_seq_status;
> >  	int cb_status;
> > +	int cb_held_slot;
> >  	bool cb_need_restart;
> > -	bool cb_holds_slot;
> >  };
> >  
> >  struct nfsd4_callback_ops {
> > @@ -307,6 +307,9 @@ struct nfsd4_conn {
> >  	unsigned char cn_flags;
> >  };
> >  
> > +/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
> > +#define NFSD_BC_SLOT_TABLE_MAX	(sizeof(u32) * 8 - 1)
> > +
> >  /*
> >   * Representation of a v4.1+ session. These are refcounted in a similar fashion
> >   * to the nfs4_client. References are only taken when the server is actively
> > @@ -314,6 +317,10 @@ struct nfsd4_conn {
> >   */
> >  struct nfsd4_session {
> >  	atomic_t		se_ref;
> > +	spinlock_t		se_lock;
> > +	u32			se_cb_slot_avail; /* bitmap of available slots */
> > +	u32			se_cb_highest_slot;	/* highest slot client wants */
> > +	u32			se_cb_prog;
> >  	bool			se_dead;
> >  	struct list_head	se_hash;	/* hash by sessionid */
> >  	struct list_head	se_perclnt;
> > @@ -322,8 +329,7 @@ struct nfsd4_session {
> >  	struct nfsd4_channel_attrs se_fchannel;
> >  	struct nfsd4_cb_sec	se_cb_sec;
> >  	struct list_head	se_conns;
> > -	u32			se_cb_prog;
> > -	u32			se_cb_seq_nr;
> > +	u32			se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
> >  	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
> >  };
> >  
> > @@ -457,9 +463,6 @@ struct nfs4_client {
> >  	 */
> >  	struct dentry		*cl_nfsd_info_dentry;
> >  
> > -	/* for nfs41 callbacks */
> > -	/* We currently support a single back channel with a single slot */
> > -	unsigned long		cl_cb_slot_busy;
> >  	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
> >  						/* wait here for slots */
> >  	struct net		*net;
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -1697,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot,
> >  		__entry->cl_id = sid->clientid.cl_id;
> >  		__entry->seqno = sid->sequence;
> >  		__entry->reserved = sid->reserved;
> > -		__entry->slot_seqno = session->se_cb_seq_nr;
> > +		__entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
> >  	),
> >  	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
> >  		" sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",
> > 
> > ---
> > base-commit: 3c16aac09d20f9005fbb0e737b3ec520bbb5badd
> > change-id: 20241025-bcwide-6bd7e4b63db2
> > 
> > Best regards,
> > -- 
> > Jeff Layton <jlayton@kernel.org>
> > 
> > 
> 
> NeilBrown
NeilBrown Nov. 13, 2024, 1:31 a.m. UTC | #27
On Wed, 13 Nov 2024, Jeff Layton wrote:
> On Wed, 2024-11-13 at 11:07 +1100, NeilBrown wrote:
> > On Wed, 06 Nov 2024, Jeff Layton wrote:
> > > +	spin_lock(&ses->se_lock);
> > > +	if (target > ses->se_cb_highest_slot) {
> > > +		int i;
> > > +
> > > +		target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > +
> > > +		/* Growing the slot table. Reset any new sequences to 1 */
> > > +		for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > +			ses->se_cb_seq_nr[i] = 1;
> > 
> > Where is the justification in the RFC for resetting the sequence
> > numbers?
> > 
> 
> RFC 8881, 18.36:
> 
> 
> 
> [...]
> 
> Once the session is created, the first SEQUENCE or CB_SEQUENCE received
> on a slot MUST have a sequence ID equal to 1; if not, the replier MUST
> return NFS4ERR_SEQ_MISORDERED.

So initialising them all to 1 when the session is created, as you do in
init_session(), is clearly correct.  Reinitialising them after
target_highest_slot_id has been reduced and then increased is not
justified by the above.

> 
> There is also some verbiage in 20.10.6.1.

2.10.6.1 ??

I cannot find anything in there that justifies discarding seq ids from
slots that have been used.  Discarding cached data and allocated memory
to cache future data is certainly justified, but there is no clear
protocol by which the client and server can agree that it is time to
reset the seqid for a particular slot (or range of slots).

Can you point me to what you can find?

> 
> > The csr_target_highest_slotid from the client - which is the value passed as
> > 'target' is defined as:
> > 
> >    the highest slot ID the client would prefer the server use on a
> >    future CB_SEQUENCE operation. 
> > 
> > This is not "the highest slot ID for which the client is remembering
> > sequence numbers".
> > 
> > If we can get rid of this, then I think the need for se_lock evaporates.
> > Allocating a new slow would be
> > 
> > do {
> >  idx = ffs(ses->se_cb_slot_avail) - 1;
> > } while (is_valid(idx) && test_and_set_bit(idx, &ses->se_sb_slot_avail));
> >  
> > where is_valid(idX) is idx >= 0 && idx <= ses->se_sb_highest_slot
> > 
> 
> That certainly would be better.
> 
> Maybe it's not required to start the seqid for a new slot at 1? If a
> new slot can start its sequence counter at an arbitrary value then we
> should be able to do this.

A new slot MUST start with a seqid of 1 when the session is created.  So
the first time a slot is used in a session the seqid must be 1.  The
second time it must be 2.  etc.  But I don't see how that relates to the
code for managing se_cb_slot_avail ....

> > >  	case -NFS4ERR_SEQ_MISORDERED:
> > > -		if (session->se_cb_seq_nr != 1) {
> > > -			session->se_cb_seq_nr = 1;
> > > +		if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > +			session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > 
> > This is weird ...  why do we reset the seq_nr to 1 when we get
> > SEQ_MISORDERED??  Git logs don't shed any light :-(
> > 
> 
> 
> The above verbiage from 18.36 might hint that this is the right thing
> to do, but it's a little vague.

Maybe this code is useful for buggy clients that choose to reset the
seqid for slots that have been unused for a while...  It looks like the
Linux NFS client will reset seqids.  nfs41_set_client_slotid_locked()
records a new target bumping ->generation and
nfs41_set_server_slotid_locked() may then call nfs4_shrink_slot_table()
which discards seqid information.

I still cannot see how it is justified. 

> > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > >  
> > >  	INIT_LIST_HEAD(&new->se_conns);
> > >  
> > > -	new->se_cb_seq_nr = 1;
> > > +	atomic_set(&new->se_ref, 0);
> > >  	new->se_dead = false;
> > >  	new->se_cb_prog = cses->callback_prog;
> > >  	new->se_cb_sec = cses->cb_sec;
> > > -	atomic_set(&new->se_ref, 0);
> > > +
> > > +	for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > +		new->se_cb_seq_nr[idx] = 1;
> > 
> > That should be "<= NFSD_BC_SLOT_TABLE_MAX"
> 
> MAX in this case is the maximum slot index, so this is correct for the
> code as it stands today. I'm fine with redefining the constant to track
> the size of the slot table instead. We could also make the existing
> code more clear by just renaming the existing constant to
> NFSD_BC_SLOT_INDEX_MAX.

What do you mean by "this" in "this is correct for.."??  The code as it
stands today is incorrect as it initialises the se_cb_seq_nr for slots
0..30 but not for slot 31.

> 
> > 
> > I don't think *_MAX is a good choice of name.  Is it the maximum number
> > of slots (no) or the maximum slot number (yes)?
> > I think *_SIZE would be a better name - the size of the table that we
> > allocate. 32.
> > Looking at where the const is used in current nfsd-next:
> > 
> > 		target = min(target, NFSD_BC_SLOT_TABLE_SIZE - 1
> > 
> > 	new->se_cb_highest_slot = min(battrs->maxreqs,
> > 				      NFSD_BC_SLOT_TABLE_SIZE) - 1;
> > 
> > 	for (idx = 0; idx < NFSD_BC_SLOT_TABLE_SIZE; ++idx)
> > 
> > #define NFSD_BC_SLOT_TABLE_SIZE	(sizeof(u32) * 8)
> > 
> > 	u32			se_cb_seq_nr[NFSD_BC_SLOT_TABLE_SIZE];
> > 
> > which is a slight reduction in the number of "+/-1" adjustments.
> > 
> > 

Thanks,
NeilBrown
Jeff Layton Nov. 13, 2024, 3:15 p.m. UTC | #28
On Wed, 2024-11-13 at 12:31 +1100, NeilBrown wrote:
> On Wed, 13 Nov 2024, Jeff Layton wrote:
> > On Wed, 2024-11-13 at 11:07 +1100, NeilBrown wrote:
> > > On Wed, 06 Nov 2024, Jeff Layton wrote:
> > > > +	spin_lock(&ses->se_lock);
> > > > +	if (target > ses->se_cb_highest_slot) {
> > > > +		int i;
> > > > +
> > > > +		target = min(target, NFSD_BC_SLOT_TABLE_MAX);
> > > > +
> > > > +		/* Growing the slot table. Reset any new sequences to 1 */
> > > > +		for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
> > > > +			ses->se_cb_seq_nr[i] = 1;
> > > 
> > > Where is the justification in the RFC for resetting the sequence
> > > numbers?
> > > 
> > 
> > RFC 8881, 18.36:
> > 
> > 
> > 
> > [...]
> > 
> > Once the session is created, the first SEQUENCE or CB_SEQUENCE received
> > on a slot MUST have a sequence ID equal to 1; if not, the replier MUST
> > return NFS4ERR_SEQ_MISORDERED.
> 
> So initialising them all to 1 when the session is created, as you do in
> init_session(), is clearly correct.  Reinitialising them after
> target_highest_slot_id has been reduced and then increased is not
> justified by the above.
> 

But, once the client and server have forgotten about those slots after
shrinking the slot table, aren't they effectively new? IOW, once you've
shrunk the slot table, the slots are effectively "freed". Growing it
means that you have to allocate new ones. The fact that this patch just
keeps them around is an implementation detail.

 
> > 
> > There is also some verbiage in 20.10.6.1.
> 
> 2.10.6.1 ??
> 
> I cannot find anything in there that justifies discarding seq ids from
> slots that have been used.  Discarding cached data and allocated memory
> to cache future data is certainly justified, but there is no clear
> protocol by which the client and server can agree that it is time to
> reset the seqid for a particular slot (or range of slots).
> 
> Can you point me to what you can find?
>

I guess I'm stuck on this idea that shrinking the slot table
effectively frees those slots, so if you grow it again later, you have
to consider those slots to be new.

> > 
> > > The csr_target_highest_slotid from the client - which is the value passed as
> > > 'target' is defined as:
> > > 
> > >    the highest slot ID the client would prefer the server use on a
> > >    future CB_SEQUENCE operation. 
> > > 
> > > This is not "the highest slot ID for which the client is remembering
> > > sequence numbers".
> > > 
> > > If we can get rid of this, then I think the need for se_lock evaporates.
> > > Allocating a new slot would be
> > > 
> > > do {
> > >  idx = ffs(ses->se_cb_slot_avail) - 1;
> > > } while (is_valid(idx) && test_and_set_bit(idx, &ses->se_cb_slot_avail));
> > >  
> > > where is_valid(idx) is idx >= 0 && idx <= ses->se_cb_highest_slot
> > > 
> > 
> > That certainly would be better.
> > 
> > Maybe it's not required to start the seqid for a new slot at 1? If a
> > new slot can start its sequence counter at an arbitrary value then we
> > should be able to do this.
> 
> A new slot MUST start with a seqid of 1 when the session is created.  So
> the first time a slot is used in a session the seqid must be 1.  The
> second time it must be 2.  etc.  But I don't see how that relates to the
> code for managing se_cb_slot_avail ....
> 

It doesn't. The se_lock was just a simple way to deal with the table
resizing.

> > > >  	case -NFS4ERR_SEQ_MISORDERED:
> > > > -		if (session->se_cb_seq_nr != 1) {
> > > > -			session->se_cb_seq_nr = 1;
> > > > +		if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
> > > > +			session->se_cb_seq_nr[cb->cb_held_slot] = 1;
> > > 
> > > This is weird ...  why do we reset the seq_nr to 1 when we get
> > > SEQ_MISORDERED??  Git logs don't shed any light :-(
> > > 
> > 
> > 
> > The above verbiage from 18.36 might hint that this is the right thing
> > to do, but it's a little vague.
> 
> Maybe this code is useful for buggy clients that choose to reset the
> seqid for slots that have been unused for a while...  It looks like the
> Linux NFS client will reset seqids.  nfs41_set_client_slotid_locked()
> records a new target bumping ->generation and
> nfs41_set_server_slotid_locked() may then call nfs4_shrink_slot_table()
> which discards seqid information.
> 
> I still cannot see how it is justified. 
> 

Fair enough. I'm fine with doing this locklessly if the starting seqid
values truly don't matter. I fear they do though.


> > > > @@ -2132,11 +2135,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
> > > >  
> > > >  	INIT_LIST_HEAD(&new->se_conns);
> > > >  
> > > > -	new->se_cb_seq_nr = 1;
> > > > +	atomic_set(&new->se_ref, 0);
> > > >  	new->se_dead = false;
> > > >  	new->se_cb_prog = cses->callback_prog;
> > > >  	new->se_cb_sec = cses->cb_sec;
> > > > -	atomic_set(&new->se_ref, 0);
> > > > +
> > > > +	for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
> > > > +		new->se_cb_seq_nr[idx] = 1;
> > > 
> > > That should be "<= NFSD_BC_SLOT_TABLE_MAX"
> > 
> > MAX in this case is the maximum slot index, so this is correct for the
> > code as it stands today. I'm fine with redefining the constant to track
> > the size of the slot table instead. We could also make the existing
> > code more clear by just renaming the existing constant to
> > NFSD_BC_SLOT_INDEX_MAX.
> 
> What do you mean by "this" in "this is correct for.."??  The code as it
> stands today is incorrect as it initialises the se_cb_seq_nr for slots
> 0..30 but not for slot 31.
> 

My bad, you're correct. I'll plan to fix that up once I'm a little
clearer on what the next iteration needs to look like.

> > 
> > > 
> > > I don't think *_MAX is a good choice of name.  Is it the maximum number
> > > of slots (no) or the maximum slot number (yes)?
> > > I think *_SIZE would be a better name - the size of the table that we
> > > allocate. 32.
> > > Looking at where the const is used in current nfsd-next:
> > > 
> > > 		target = min(target, NFSD_BC_SLOT_TABLE_SIZE - 1
> > > 
> > > 	new->se_cb_highest_slot = min(battrs->maxreqs,
> > > 				      NFSD_BC_SLOT_TABLE_SIZE) - 1;
> > > 
> > > 	for (idx = 0; idx < NFSD_BC_SLOT_TABLE_SIZE; ++idx)
> > > 
> > > #define NFSD_BC_SLOT_TABLE_SIZE	(sizeof(u32) * 8)
> > > 
> > > 	u32			se_cb_seq_nr[NFSD_BC_SLOT_TABLE_SIZE];
> > > 
> > > which is a slight reduction in the number of "+/-1" adjustments.
> > > 
> > > 
> 
> Thanks,
> NeilBrown
NeilBrown Nov. 14, 2024, 3:19 a.m. UTC | #29
On Thu, 14 Nov 2024, Jeff Layton wrote:
> On Wed, 2024-11-13 at 12:31 +1100, NeilBrown wrote:
> > 
> > So initialising them all to 1 when the session is created, as you do in
> > init_session(), is clearly correct.  Reinitialising them after
> > target_highest_slot_id has been reduced and then increased is not
> > justified by the above.
> > 
> 
> But, once the client and server have forgotten about those slots after
> shrinking the slot table, aren't they effectively new? IOW, once you've
> shrunk the slot table, the slots are effectively "freed". Growing it
> means that you have to allocate new ones. The fact that this patch just
> keeps them around is an implementation detail.


There is no text in the RFC about shrinking or growing or forgetting.
The only meaning given to numbers like ca_maxreqs is that the client
shouldn't use a larger slot number than the given one.

I think the slot table is conceptually infinite and exists in its
entirety from the moment CREATE_SESSION completes to the moment
DESTROY_SESSION completes (or a lease expires or similar).  The client
can limit how much of that infinitude it will choose to use, and
the server can limit how much of it it will allow to be used so neither
need to store the full infinity.  But it never changes size.
Implementations can choose how much to store in real memory and can
discard everything except (I think) the last sequence number seen on any slot
for which a request was sent (client) or accepted (server).

I agree that this seems less than ideal and it would be good if the
protocol had a mechanism for the client and server to agree to reset
the seqid for some slots.  But I cannot find any such mechanism.

Thanks,
NeilBrown
Chuck Lever III Nov. 14, 2024, 2:19 p.m. UTC | #30
> On Nov 13, 2024, at 10:19 PM, NeilBrown <neilb@suse.de> wrote:
> 
> On Thu, 14 Nov 2024, Jeff Layton wrote:
>> On Wed, 2024-11-13 at 12:31 +1100, NeilBrown wrote:
>>> 
>>> So initialising them all to 1 when the session is created, as you do in
>>> init_session(), is clearly correct.  Reinitialising them after
>>> target_highest_slot_id has been reduced and then increased is not
>>> justified by the above.
>>> 
>> 
>> But, once the client and server have forgotten about those slots after
>> shrinking the slot table, aren't they effectively new? IOW, once you've
>> shrunk the slot table, the slots are effectively "freed". Growing it
>> means that you have to allocate new ones. The fact that this patch just
>> keeps them around is an implementation detail.
> 
> 
> There is no text in the RFC about shrinking or growing or forgetting.
> The only meaning given to numbers like ca_maxreqs is that the client
> shouldn't use a larger slot number than the given one.
> 
> I think the slot table is conceptually infinite and exists in its
> entirety from the moment CREATE_SESSION completes to the moment
> DESTROY_SESSION completes (or a lease expires or similar).  The client
> can limit how much of that infinitude it will choose to use, and
> the server can limit how much of it it will allow to be used so neither
> need to store the full infinity.  But it never changes size.
> Implementations can choose how much to store in real memory and can
> discard everything except (I think) the last sequence number seen on any slot
> for which a request was sent (client) or accepted (server).

This is, IMO, one possible implementation of a slot table.

As you say, the spec doesn't provide a lot of guidance
about it. Therefore I believe other implementations are
possible.

It would be prudent to survey some of them.


> I agree that this seems less than ideal and it would be good if the
> protocol had a mechanism for the client and server to agree to reset
> the seqid for some slots.  But I cannot find any such mechanism.


--
Chuck Lever
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index e38fa834b3d91333acf1425eb14c644e5d5f2601..47a678333907eaa92db305dada503704c34c15b2 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -406,6 +406,19 @@  encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
 	hdr->nops++;
 }
 
+static u32 highest_slotid(struct nfsd4_session *ses)
+{
+	u32 idx;
+
+	spin_lock(&ses->se_lock);
+	idx = fls(~ses->se_cb_slot_avail);
+	if (idx > 0)
+		--idx;
+	idx = max(idx, ses->se_cb_highest_slot);
+	spin_unlock(&ses->se_lock);
+	return idx;
+}
+
 /*
  * CB_SEQUENCE4args
  *
@@ -432,15 +445,35 @@  static void encode_cb_sequence4args(struct xdr_stream *xdr,
 	encode_sessionid4(xdr, session);
 
 	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
-	*p++ = cpu_to_be32(session->se_cb_seq_nr);	/* csa_sequenceid */
-	*p++ = xdr_zero;			/* csa_slotid */
-	*p++ = xdr_zero;			/* csa_highest_slotid */
+	*p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]);	/* csa_sequenceid */
+	*p++ = cpu_to_be32(cb->cb_held_slot);		/* csa_slotid */
+	*p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
 	*p++ = xdr_zero;			/* csa_cachethis */
 	xdr_encode_empty_array(p);		/* csa_referring_call_lists */
 
 	hdr->nops++;
 }
 
+static void update_cb_slot_table(struct nfsd4_session *ses, u32 target)
+{
+	/* No need to do anything if nothing changed */
+	if (likely(target == READ_ONCE(ses->se_cb_highest_slot)))
+		return;
+
+	spin_lock(&ses->se_lock);
+	if (target > ses->se_cb_highest_slot) {
+		int i;
+
+		target = min(target, NFSD_BC_SLOT_TABLE_MAX);
+
+		/* Growing the slot table. Reset any new sequences to 1 */
+		for (i = ses->se_cb_highest_slot + 1; i <= target; ++i)
+			ses->se_cb_seq_nr[i] = 1;
+	}
+	ses->se_cb_highest_slot = target;
+	spin_unlock(&ses->se_lock);
+}
+
 /*
  * CB_SEQUENCE4resok
  *
@@ -468,7 +501,7 @@  static int decode_cb_sequence4resok(struct xdr_stream *xdr,
 	struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
 	int status = -ESERVERFAULT;
 	__be32 *p;
-	u32 dummy;
+	u32 seqid, slotid, target;
 
 	/*
 	 * If the server returns different values for sessionID, slotID or
@@ -484,21 +517,22 @@  static int decode_cb_sequence4resok(struct xdr_stream *xdr,
 	}
 	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
 
-	dummy = be32_to_cpup(p++);
-	if (dummy != session->se_cb_seq_nr) {
+	seqid = be32_to_cpup(p++);
+	if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) {
 		dprintk("NFS: %s Invalid sequence number\n", __func__);
 		goto out;
 	}
 
-	dummy = be32_to_cpup(p++);
-	if (dummy != 0) {
+	slotid = be32_to_cpup(p++);
+	if (slotid != cb->cb_held_slot) {
 		dprintk("NFS: %s Invalid slotid\n", __func__);
 		goto out;
 	}
 
-	/*
-	 * FIXME: process highest slotid and target highest slotid
-	 */
+	p++; // ignore current highest slot value
+
+	target = be32_to_cpup(p++);
+	update_cb_slot_table(session, target);
 	status = 0;
 out:
 	cb->cb_seq_status = status;
@@ -1203,6 +1237,22 @@  void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 	spin_unlock(&clp->cl_lock);
 }
 
+static int grab_slot(struct nfsd4_session *ses)
+{
+	int idx;
+
+	spin_lock(&ses->se_lock);
+	idx = ffs(ses->se_cb_slot_avail) - 1;
+	if (idx < 0 || idx > ses->se_cb_highest_slot) {
+		spin_unlock(&ses->se_lock);
+		return -1;
+	}
+	/* clear the bit for the slot */
+	ses->se_cb_slot_avail &= ~BIT(idx);
+	spin_unlock(&ses->se_lock);
+	return idx;
+}
+
 /*
  * There's currently a single callback channel slot.
  * If the slot is available, then mark it busy.  Otherwise, set the
@@ -1211,28 +1261,32 @@  void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
 {
 	struct nfs4_client *clp = cb->cb_clp;
+	struct nfsd4_session *ses = clp->cl_cb_session;
 
-	if (!cb->cb_holds_slot &&
-	    test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
+	if (cb->cb_held_slot >= 0)
+		return true;
+	cb->cb_held_slot = grab_slot(ses);
+	if (cb->cb_held_slot < 0) {
 		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
 		/* Race breaker */
-		if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
-			dprintk("%s slot is busy\n", __func__);
+		cb->cb_held_slot = grab_slot(ses);
+		if (cb->cb_held_slot < 0)
 			return false;
-		}
 		rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
 	}
-	cb->cb_holds_slot = true;
 	return true;
 }
 
 static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
 {
 	struct nfs4_client *clp = cb->cb_clp;
+	struct nfsd4_session *ses = clp->cl_cb_session;
 
-	if (cb->cb_holds_slot) {
-		cb->cb_holds_slot = false;
-		clear_bit(0, &clp->cl_cb_slot_busy);
+	if (cb->cb_held_slot >= 0) {
+		spin_lock(&ses->se_lock);
+		ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
+		spin_unlock(&ses->se_lock);
+		cb->cb_held_slot = -1;
 		rpc_wake_up_next(&clp->cl_cb_waitq);
 	}
 }
@@ -1249,8 +1303,8 @@  static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
 }
 
 /*
- * TODO: cb_sequence should support referring call lists, cachethis, multiple
- * slots, and mark callback channel down on communication errors.
+ * TODO: cb_sequence should support referring call lists, cachethis,
+ * and mark callback channel down on communication errors.
  */
 static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 {
@@ -1292,7 +1346,7 @@  static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		return true;
 	}
 
-	if (!cb->cb_holds_slot)
+	if (cb->cb_held_slot < 0)
 		goto need_restart;
 
 	/* This is the operation status code for CB_SEQUENCE */
@@ -1306,10 +1360,10 @@  static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		 * If CB_SEQUENCE returns an error, then the state of the slot
 		 * (sequence ID, cached reply) MUST NOT change.
 		 */
-		++session->se_cb_seq_nr;
+		++session->se_cb_seq_nr[cb->cb_held_slot];
 		break;
 	case -ESERVERFAULT:
-		++session->se_cb_seq_nr;
+		++session->se_cb_seq_nr[cb->cb_held_slot];
 		nfsd4_mark_cb_fault(cb->cb_clp);
 		ret = false;
 		break;
@@ -1335,17 +1389,16 @@  static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 	case -NFS4ERR_BADSLOT:
 		goto retry_nowait;
 	case -NFS4ERR_SEQ_MISORDERED:
-		if (session->se_cb_seq_nr != 1) {
-			session->se_cb_seq_nr = 1;
+		if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
+			session->se_cb_seq_nr[cb->cb_held_slot] = 1;
 			goto retry_nowait;
 		}
 		break;
 	default:
 		nfsd4_mark_cb_fault(cb->cb_clp);
 	}
-	nfsd41_cb_release_slot(cb);
-
 	trace_nfsd_cb_free_slot(task, cb);
+	nfsd41_cb_release_slot(cb);
 
 	if (RPC_SIGNALLED(task))
 		goto need_restart;
@@ -1565,7 +1618,7 @@  void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
 	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
 	cb->cb_status = 0;
 	cb->cb_need_restart = false;
-	cb->cb_holds_slot = false;
+	cb->cb_held_slot = -1;
 }
 
 /**
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index baf7994131fe1b0a4715174ba943fd2a9882aa12..75557e7cc9265517f51952563beaa4cfe8adcc3f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2002,6 +2002,9 @@  static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
 	}
 
 	memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
+	new->se_cb_slot_avail = ~0U;
+	new->se_cb_highest_slot = battrs->maxreqs - 1;
+	spin_lock_init(&new->se_lock);
 	return new;
 out_free:
 	while (i--)
@@ -2132,11 +2135,14 @@  static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
 
 	INIT_LIST_HEAD(&new->se_conns);
 
-	new->se_cb_seq_nr = 1;
+	atomic_set(&new->se_ref, 0);
 	new->se_dead = false;
 	new->se_cb_prog = cses->callback_prog;
 	new->se_cb_sec = cses->cb_sec;
-	atomic_set(&new->se_ref, 0);
+
+	for (idx = 0; idx < NFSD_BC_SLOT_TABLE_MAX; ++idx)
+		new->se_cb_seq_nr[idx] = 1;
+
 	idx = hash_sessionid(&new->se_sessionid);
 	list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
 	spin_lock(&clp->cl_lock);
@@ -3159,7 +3165,6 @@  static struct nfs4_client *create_client(struct xdr_netobj name,
 	kref_init(&clp->cl_nfsdfs.cl_ref);
 	nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
 	clp->cl_time = ktime_get_boottime_seconds();
-	clear_bit(0, &clp->cl_cb_slot_busy);
 	copy_verf(clp, verf);
 	memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
 	clp->cl_cb_session = NULL;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index d22e4f2c9039324a0953a9e15a3c255fb8ee1a44..848d023cb308f0b69916c4ee34b09075708f0de3 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -71,8 +71,8 @@  struct nfsd4_callback {
 	struct work_struct cb_work;
 	int cb_seq_status;
 	int cb_status;
+	int cb_held_slot;
 	bool cb_need_restart;
-	bool cb_holds_slot;
 };
 
 struct nfsd4_callback_ops {
@@ -307,6 +307,9 @@  struct nfsd4_conn {
 	unsigned char cn_flags;
 };
 
+/* Highest slot index that nfsd implements in NFSv4.1+ backchannel */
+#define NFSD_BC_SLOT_TABLE_MAX	(sizeof(u32) * 8 - 1)
+
 /*
  * Representation of a v4.1+ session. These are refcounted in a similar fashion
  * to the nfs4_client. References are only taken when the server is actively
@@ -314,6 +317,10 @@  struct nfsd4_conn {
  */
 struct nfsd4_session {
 	atomic_t		se_ref;
+	spinlock_t		se_lock;
+	u32			se_cb_slot_avail; /* bitmap of available slots */
+	u32			se_cb_highest_slot;	/* highest slot client wants */
+	u32			se_cb_prog;
 	bool			se_dead;
 	struct list_head	se_hash;	/* hash by sessionid */
 	struct list_head	se_perclnt;
@@ -322,8 +329,7 @@  struct nfsd4_session {
 	struct nfsd4_channel_attrs se_fchannel;
 	struct nfsd4_cb_sec	se_cb_sec;
 	struct list_head	se_conns;
-	u32			se_cb_prog;
-	u32			se_cb_seq_nr;
+	u32			se_cb_seq_nr[NFSD_BC_SLOT_TABLE_MAX + 1];
 	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
 };
 
@@ -457,9 +463,6 @@  struct nfs4_client {
 	 */
 	struct dentry		*cl_nfsd_info_dentry;
 
-	/* for nfs41 callbacks */
-	/* We currently support a single back channel with a single slot */
-	unsigned long		cl_cb_slot_busy;
 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
 						/* wait here for slots */
 	struct net		*net;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index f318898cfc31614b5a84a4867e18c2b3a07122c9..a9c17186b6892f1df8d7f7b90e250c2913ab23fe 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1697,7 +1697,7 @@  TRACE_EVENT(nfsd_cb_free_slot,
 		__entry->cl_id = sid->clientid.cl_id;
 		__entry->seqno = sid->sequence;
 		__entry->reserved = sid->reserved;
-		__entry->slot_seqno = session->se_cb_seq_nr;
+		__entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot];
 	),
 	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
 		" sessionid=%08x:%08x:%08x:%08x new slot seqno=%u",