
[v1,02/11] svcrdma: Use all allocated Send Queue entries

Message ID 170653984365.24162.652127313173673494.stgit@manet.1015granger.net
State Superseded
Series NFSD RDMA transport improvements

Commit Message

Chuck Lever Jan. 29, 2024, 2:50 p.m. UTC
From: Chuck Lever <chuck.lever@oracle.com>

For upper layer protocols that request rw_ctxs, ib_create_qp()
adjusts ib_qp_init_attr::max_send_wr to accommodate the WQEs those
rw_ctxs will consume. See rdma_rw_init_qp() for details.

To actually use those additional WQEs, svc_rdma_accept() needs to
retrieve the corrected SQ depth after calling rdma_create_qp() and
set newxprt->sc_sq_depth and newxprt->sc_sq_avail so that
svc_rdma_send() and svc_rdma_post_chunk_ctxt() can utilize those
WQEs.

The NVMe target driver, for example, already does this properly.

Fixes: 26fb2254dd33 ("svcrdma: Estimate Send Queue depth properly")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c |   36 ++++++++++++++++++------------
 1 file changed, 22 insertions(+), 14 deletions(-)
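
For illustration, here is a minimal sketch of the convention the patch relies
on. It is not the svcrdma code itself; cm_id, pd, the CQs, and the depth
variables are placeholders assumed to have been set up earlier.

/*
 * Sketch only: identifiers other than the IB core API are illustrative.
 * When cap.max_rdma_ctxs is non-zero, rdma_rw_init_qp() grows
 * cap.max_send_wr in place to cover the WQEs the rw_ctxs will consume,
 * so after rdma_create_qp() returns, qp_attr holds the Send Queue size
 * that was actually provisioned.
 */
struct ib_qp_init_attr qp_attr = {
	.qp_type	   = IB_QPT_RC,
	.send_cq	   = send_cq,
	.recv_cq	   = recv_cq,
	.cap.max_send_wr   = sq_depth,	/* what the ULP asks for */
	.cap.max_recv_wr   = rq_depth,
	.cap.max_rdma_ctxs = ctxts,	/* rw_ctxs the ULP will post */
	.cap.max_send_sge  = send_sges,
	.cap.max_recv_sge  = 1,
};
int ret;

ret = rdma_create_qp(cm_id, pd, &qp_attr);
if (ret)
	return ret;

/* Account against the depth the core allocated, not the one requested. */
ulp_sq_depth = qp_attr.cap.max_send_wr;
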

Comments

Chuck Lever III Jan. 29, 2024, 5:26 p.m. UTC | #1
On Mon, Jan 29, 2024 at 09:50:43AM -0500, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
> 
> For upper layer protocols that request rw_ctxs, ib_create_qp()
> adjusts ib_qp_init_attr::max_send_wr to accommodate the WQEs those
> rw_ctxs will consume. See rdma_rw_init_qp() for details.
> 
> To actually use those additional WQEs, svc_rdma_accept() needs to
> retrieve the corrected SQ depth after calling rdma_create_qp() and
> set newxprt->sc_sq_depth and  newxprt->sc_sq_avail so that
> svc_rdma_send() and svc_rdma_post_chunk_ctxt() can utilize those
> WQEs.
> 
> The NVMe target driver, for example, already does this properly.
> 
> Fixes: 26fb2254dd33 ("svcrdma: Estimate Send Queue depth properly")
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  net/sunrpc/xprtrdma/svc_rdma_transport.c |   36 ++++++++++++++++++------------
>  1 file changed, 22 insertions(+), 14 deletions(-)

After applying this one, workload tests trigger this set of errors
on my test NFS server (CX-5 in RoCE mode):

mlx5_core 0000:02:00.0: cq_err_event_notifier:517:(pid 4374): CQ error on CQN 0x407, syndrome 0x1
rocep2s0/1: QP 137 error: unrecognized status (0x41 0x0 0x0)

I think syndrome 0x1 is IB_EVENT_QP_FATAL?

But the "unrecognized status" comes from mlx5_ib_qp_err_syndrome(),
which does this:

	pr_err("%s/%d: QP %d error: %s (0x%x 0x%x 0x%x)\n",
	       ibqp->device->name, ibqp->port, ibqp->qp_num,
	       ib_wc_status_msg(
		       MLX5_GET(cqe_error_syndrome, err_syn, syndrome)),
	       MLX5_GET(cqe_error_syndrome, err_syn, vendor_error_syndrome),
	       MLX5_GET(cqe_error_syndrome, err_syn, hw_syndrome_type),
	       MLX5_GET(cqe_error_syndrome, err_syn, hw_error_syndrome));

I don't think the "syndrome" field contains a WC status code, so
invoking ib_wc_status_msg() to get a symbolic string seems wrong.
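
If that reading is right, a hypothetical adjustment (sketch only, reusing the
fields from the code quoted above) would report the raw syndrome value rather
than mapping it through ib_wc_status_msg():

	pr_err("%s/%d: QP %d error: syndrome 0x%x (0x%x 0x%x 0x%x)\n",
	       ibqp->device->name, ibqp->port, ibqp->qp_num,
	       MLX5_GET(cqe_error_syndrome, err_syn, syndrome),
	       MLX5_GET(cqe_error_syndrome, err_syn, vendor_error_syndrome),
	       MLX5_GET(cqe_error_syndrome, err_syn, hw_syndrome_type),
	       MLX5_GET(cqe_error_syndrome, err_syn, hw_error_syndrome));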

Anyway,

 - Can someone with an mlx5 decoder ring tell me what 0x41 is?
 - If someone sees an obvious error with how this patch has
   set up the SQ and Send CQ, please hit me with a clue bat.


> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> index 4a038c7e86f9..75f1481fbca0 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> @@ -370,12 +370,12 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
>   */
>  static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>  {
> +	unsigned int ctxts, rq_depth, sq_depth;
>  	struct svcxprt_rdma *listen_rdma;
>  	struct svcxprt_rdma *newxprt = NULL;
>  	struct rdma_conn_param conn_param;
>  	struct rpcrdma_connect_private pmsg;
>  	struct ib_qp_init_attr qp_attr;
> -	unsigned int ctxts, rq_depth;
>  	struct ib_device *dev;
>  	int ret = 0;
>  	RPC_IFDEBUG(struct sockaddr *sap);
> @@ -422,24 +422,29 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>  		newxprt->sc_max_requests = rq_depth - 2;
>  		newxprt->sc_max_bc_requests = 2;
>  	}
> +	sq_depth = rq_depth;
> +
>  	ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES);
>  	ctxts *= newxprt->sc_max_requests;
> -	newxprt->sc_sq_depth = rq_depth + ctxts;
> -	if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr)
> -		newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
> -	atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
>  
>  	newxprt->sc_pd = ib_alloc_pd(dev, 0);
>  	if (IS_ERR(newxprt->sc_pd)) {
>  		trace_svcrdma_pd_err(newxprt, PTR_ERR(newxprt->sc_pd));
>  		goto errout;
>  	}
> -	newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
> +
> +	/* The Completion Queue depth is the maximum number of signaled
> +	 * WRs expected to be in flight. Every Send WR is signaled, and
> +	 * each rw_ctx has a chain of WRs, but only one WR in each chain
> +	 * is signaled.
> +	 */
> +	newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, sq_depth + ctxts,
>  					    IB_POLL_WORKQUEUE);
>  	if (IS_ERR(newxprt->sc_sq_cq))
>  		goto errout;
> -	newxprt->sc_rq_cq =
> -		ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE);
> +	/* Every Receive WR is signaled. */
> +	newxprt->sc_rq_cq = ib_alloc_cq_any(dev, newxprt, rq_depth,
> +					    IB_POLL_WORKQUEUE);
>  	if (IS_ERR(newxprt->sc_rq_cq))
>  		goto errout;
>  
> @@ -448,7 +453,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>  	qp_attr.qp_context = &newxprt->sc_xprt;
>  	qp_attr.port_num = newxprt->sc_port_num;
>  	qp_attr.cap.max_rdma_ctxs = ctxts;
> -	qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts;
> +	qp_attr.cap.max_send_wr = sq_depth;
>  	qp_attr.cap.max_recv_wr = rq_depth;
>  	qp_attr.cap.max_send_sge = newxprt->sc_max_send_sges;
>  	qp_attr.cap.max_recv_sge = 1;
> @@ -456,17 +461,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>  	qp_attr.qp_type = IB_QPT_RC;
>  	qp_attr.send_cq = newxprt->sc_sq_cq;
>  	qp_attr.recv_cq = newxprt->sc_rq_cq;
> -	dprintk("    cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
> -		qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
> -	dprintk("    cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
> -		qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
> -
>  	ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
>  	if (ret) {
>  		trace_svcrdma_qp_err(newxprt, ret);
>  		goto errout;
>  	}
> +	dprintk("svcrdma: cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
> +		qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
> +	dprintk("    cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
> +		qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
> +	dprintk("    send CQ depth = %d, recv CQ depth = %d\n",
> +		sq_depth, rq_depth);
>  	newxprt->sc_qp = newxprt->sc_cm_id->qp;
> +	newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
> +	atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
>  
>  	if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
>  		newxprt->sc_snd_w_inv = false;
> 
> 
>

Patch

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4a038c7e86f9..75f1481fbca0 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -370,12 +370,12 @@  static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
  */
 static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 {
+	unsigned int ctxts, rq_depth, sq_depth;
 	struct svcxprt_rdma *listen_rdma;
 	struct svcxprt_rdma *newxprt = NULL;
 	struct rdma_conn_param conn_param;
 	struct rpcrdma_connect_private pmsg;
 	struct ib_qp_init_attr qp_attr;
-	unsigned int ctxts, rq_depth;
 	struct ib_device *dev;
 	int ret = 0;
 	RPC_IFDEBUG(struct sockaddr *sap);
@@ -422,24 +422,29 @@  static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		newxprt->sc_max_requests = rq_depth - 2;
 		newxprt->sc_max_bc_requests = 2;
 	}
+	sq_depth = rq_depth;
+
 	ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES);
 	ctxts *= newxprt->sc_max_requests;
-	newxprt->sc_sq_depth = rq_depth + ctxts;
-	if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr)
-		newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
-	atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
 
 	newxprt->sc_pd = ib_alloc_pd(dev, 0);
 	if (IS_ERR(newxprt->sc_pd)) {
 		trace_svcrdma_pd_err(newxprt, PTR_ERR(newxprt->sc_pd));
 		goto errout;
 	}
-	newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
+
+	/* The Completion Queue depth is the maximum number of signaled
+	 * WRs expected to be in flight. Every Send WR is signaled, and
+	 * each rw_ctx has a chain of WRs, but only one WR in each chain
+	 * is signaled.
+	 */
+	newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, sq_depth + ctxts,
 					    IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_sq_cq))
 		goto errout;
-	newxprt->sc_rq_cq =
-		ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE);
+	/* Every Receive WR is signaled. */
+	newxprt->sc_rq_cq = ib_alloc_cq_any(dev, newxprt, rq_depth,
+					    IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_rq_cq))
 		goto errout;
 
@@ -448,7 +453,7 @@  static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	qp_attr.qp_context = &newxprt->sc_xprt;
 	qp_attr.port_num = newxprt->sc_port_num;
 	qp_attr.cap.max_rdma_ctxs = ctxts;
-	qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts;
+	qp_attr.cap.max_send_wr = sq_depth;
 	qp_attr.cap.max_recv_wr = rq_depth;
 	qp_attr.cap.max_send_sge = newxprt->sc_max_send_sges;
 	qp_attr.cap.max_recv_sge = 1;
@@ -456,17 +461,20 @@  static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	qp_attr.qp_type = IB_QPT_RC;
 	qp_attr.send_cq = newxprt->sc_sq_cq;
 	qp_attr.recv_cq = newxprt->sc_rq_cq;
-	dprintk("    cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
-		qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
-	dprintk("    cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
-		qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
-
 	ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
 	if (ret) {
 		trace_svcrdma_qp_err(newxprt, ret);
 		goto errout;
 	}
+	dprintk("svcrdma: cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
+		qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
+	dprintk("    cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
+		qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
+	dprintk("    send CQ depth = %d, recv CQ depth = %d\n",
+		sq_depth, rq_depth);
 	newxprt->sc_qp = newxprt->sc_cm_id->qp;
+	newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
+	atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
 
 	if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
 		newxprt->sc_snd_w_inv = false;