
[3/4] nvme-rdma: use new shared CQ mechanism

Message ID 1589122557-88996-4-git-send-email-yaminf@mellanox.com (mailing list archive)
State Superseded
Series Introducing RDMA shared CQ pool

Commit Message

Yamin Friedman May 10, 2020, 2:55 p.m. UTC
Have the driver use shared CQs, providing the ~10%-50% improvement seen in the
patch introducing shared CQs. Instead of opening a CQ on each core per NS
connected, the core driver provides a CQ for each core that is shared between
the QPs on that core, reducing interrupt overhead.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/nvme/host/rdma.c | 65 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 22 deletions(-)
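
For readers landing on this patch without the rest of the series, here is a
minimal sketch of the shared CQ pool interface the driver switches to below.
The ib_cq_pool_get()/ib_cq_pool_put() calls are the ones added earlier in this
series; the wrapper helpers and their names are purely illustrative.

#include <rdma/ib_verbs.h>

/*
 * Illustrative only, not part of the patch: a ULP asks the pool for a CQ
 * sized for its QP on a given completion vector. QPs scheduled on the same
 * core end up sharing one CQ instead of each allocating a private one,
 * which is where the interrupt savings come from.
 */
static struct ib_cq *example_get_shared_cq(struct ib_device *ibdev,
					   unsigned int nr_cqe,
					   int comp_vector)
{
	/* Returns an ERR_PTR() on failure, like ib_alloc_cq(). */
	return ib_cq_pool_get(ibdev, nr_cqe, comp_vector, IB_POLL_SOFTIRQ);
}

static void example_put_shared_cq(struct ib_cq *cq, unsigned int nr_cqe)
{
	/* nr_cqe must match what was requested from the pool. */
	ib_cq_pool_put(cq, nr_cqe);
}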

Comments

Sagi Grimberg May 11, 2020, 8:50 a.m. UTC | #1
Looks good,

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Max Gurtovoy May 11, 2020, 4:29 p.m. UTC | #2
On 5/10/2020 5:55 PM, Yamin Friedman wrote:
> Have the driver use shared CQs, providing the ~10%-50% improvement seen in the
> patch introducing shared CQs. Instead of opening a CQ on each core per NS
> connected, the core driver provides a CQ for each core that is shared between
> the QPs on that core, reducing interrupt overhead.
>
> Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
> Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
> ---
>   drivers/nvme/host/rdma.c | 65 ++++++++++++++++++++++++++++++++----------------
>   1 file changed, 43 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index cac8a93..23eefd7 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -85,6 +85,7 @@ struct nvme_rdma_queue {
>   	struct rdma_cm_id	*cm_id;
>   	int			cm_error;
>   	struct completion	cm_done;
> +	int			cq_size;
>   };
>   
>   struct nvme_rdma_ctrl {
> @@ -261,6 +262,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
>   	init_attr.qp_type = IB_QPT_RC;
>   	init_attr.send_cq = queue->ib_cq;
>   	init_attr.recv_cq = queue->ib_cq;
> +	init_attr.qp_context = queue;
>   
>   	ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
>   
> @@ -389,6 +391,15 @@ static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
>   	return NULL;
>   }
>   
> +static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
> +{
> +	if (nvme_rdma_poll_queue(queue))
> +		ib_free_cq(queue->ib_cq);
> +	else {
> +		ib_cq_pool_put(queue->ib_cq, queue->cq_size);
> +	}
> +}

Somehow I missed this in the internal review: the single-statement else branch
doesn't need braces, so this should read:

	if (nvme_rdma_poll_queue(queue))
		ib_free_cq(queue->ib_cq);
	else
		ib_cq_pool_put(queue->ib_cq, queue->cq_size);


> +
>   static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
>   {
>   	struct nvme_rdma_device *dev;
> @@ -408,7 +419,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
>   	 * the destruction of the QP shouldn't use rdma_cm API.
>   	 */
>   	ib_destroy_qp(queue->qp);
> -	ib_free_cq(queue->ib_cq);
> +	nvme_rdma_free_cq(queue);
>   
>   	nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
>   			sizeof(struct nvme_completion), DMA_FROM_DEVICE);
> @@ -422,13 +433,35 @@ static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
>   		     ibdev->attrs.max_fast_reg_page_list_len - 1);
>   }
>   
> +static void nvme_rdma_create_cq(struct ib_device *ibdev,
> +				struct nvme_rdma_queue *queue)
> +{
> +	int comp_vector, idx = nvme_rdma_queue_idx(queue);
> +	enum ib_poll_context poll_ctx;
> +
> +	/*
> +	 * Spread I/O queues completion vectors according their queue index.
> +	 * Admin queues can always go on completion vector 0.
> +	 */
> +	comp_vector = idx == 0 ? idx : idx - 1;
> +
> +	/* Polling queues need direct cq polling context */
> +	if (nvme_rdma_poll_queue(queue)) {
> +		poll_ctx = IB_POLL_DIRECT;
> +		queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
> +					   comp_vector, poll_ctx);
> +	} else {
> +		poll_ctx = IB_POLL_SOFTIRQ;
> +		queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
> +					      comp_vector, poll_ctx);
> +	}

Please make this function return an integer instead of void.

And check the return value in the caller.
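
A sketch of the shape being asked for, reusing the code from this patch (not
necessarily how the next version will spell it):

static int nvme_rdma_create_cq(struct ib_device *ibdev,
			       struct nvme_rdma_queue *queue)
{
	int comp_vector, idx = nvme_rdma_queue_idx(queue);

	/*
	 * Spread I/O queues completion vectors according to their queue
	 * index. Admin queues can always go on completion vector 0.
	 */
	comp_vector = idx == 0 ? idx : idx - 1;

	/* Polling queues need direct cq polling context */
	if (nvme_rdma_poll_queue(queue))
		queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
					   comp_vector, IB_POLL_DIRECT);
	else
		queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
					      comp_vector, IB_POLL_SOFTIRQ);

	if (IS_ERR(queue->ib_cq))
		return PTR_ERR(queue->ib_cq);
	return 0;
}

and in nvme_rdma_create_queue_ib():

	ret = nvme_rdma_create_cq(ibdev, queue);
	if (ret)
		goto out_put_dev;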


> +}
> +
>   static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
>   {
>   	struct ib_device *ibdev;
>   	const int send_wr_factor = 3;			/* MR, SEND, INV */
>   	const int cq_factor = send_wr_factor + 1;	/* + RECV */
> -	int comp_vector, idx = nvme_rdma_queue_idx(queue);
> -	enum ib_poll_context poll_ctx;
>   	int ret, pages_per_mr;
>   
>   	queue->device = nvme_rdma_find_get_device(queue->cm_id);
> @@ -439,22 +472,10 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
>   	}
>   	ibdev = queue->device->dev;
>   
> -	/*
> -	 * Spread I/O queues completion vectors according their queue index.
> -	 * Admin queues can always go on completion vector 0.
> -	 */
> -	comp_vector = idx == 0 ? idx : idx - 1;
> -
> -	/* Polling queues need direct cq polling context */
> -	if (nvme_rdma_poll_queue(queue))
> -		poll_ctx = IB_POLL_DIRECT;
> -	else
> -		poll_ctx = IB_POLL_SOFTIRQ;
> -
>   	/* +1 for ib_stop_cq */
> -	queue->ib_cq = ib_alloc_cq(ibdev, queue,
> -				cq_factor * queue->queue_size + 1,
> -				comp_vector, poll_ctx);
> +	queue->cq_size = cq_factor * queue->queue_size + 1;
> +
> +	nvme_rdma_create_cq(ibdev, queue);

see above.


>   	if (IS_ERR(queue->ib_cq)) {
>   		ret = PTR_ERR(queue->ib_cq);
>   		goto out_put_dev;
> @@ -484,7 +505,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
>   	if (ret) {
>   		dev_err(queue->ctrl->ctrl.device,
>   			"failed to initialize MR pool sized %d for QID %d\n",
> -			queue->queue_size, idx);
> +			queue->queue_size, nvme_rdma_queue_idx(queue));
>   		goto out_destroy_ring;
>   	}
>   
> @@ -498,7 +519,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
>   out_destroy_qp:
>   	rdma_destroy_qp(queue->cm_id);
>   out_destroy_ib_cq:
> -	ib_free_cq(queue->ib_cq);
> +	nvme_rdma_free_cq(queue);
>   out_put_dev:
>   	nvme_rdma_dev_put(queue->device);
>   	return ret;
> @@ -1093,7 +1114,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
>   static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
>   		const char *op)
>   {
> -	struct nvme_rdma_queue *queue = cq->cq_context;
> +	struct nvme_rdma_queue *queue = wc->qp->qp_context;
>   	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
>   
>   	if (ctrl->ctrl.state == NVME_CTRL_LIVE)
> @@ -1481,7 +1502,7 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
>   {
>   	struct nvme_rdma_qe *qe =
>   		container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
> -	struct nvme_rdma_queue *queue = cq->cq_context;
> +	struct nvme_rdma_queue *queue = wc->qp->qp_context;
>   	struct ib_device *ibdev = queue->device->dev;
>   	struct nvme_completion *cqe = qe->data;
>   	const size_t len = sizeof(struct nvme_completion);

with the above small fixes:

Reviewed-by: Max Gurtovoy <maxg@mellanox.com>

Patch

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index cac8a93..23eefd7 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -85,6 +85,7 @@  struct nvme_rdma_queue {
 	struct rdma_cm_id	*cm_id;
 	int			cm_error;
 	struct completion	cm_done;
+	int			cq_size;
 };
 
 struct nvme_rdma_ctrl {
@@ -261,6 +262,7 @@  static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
 	init_attr.qp_type = IB_QPT_RC;
 	init_attr.send_cq = queue->ib_cq;
 	init_attr.recv_cq = queue->ib_cq;
+	init_attr.qp_context = queue;
 
 	ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
 
@@ -389,6 +391,15 @@  static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
 	return NULL;
 }
 
+static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
+{
+	if (nvme_rdma_poll_queue(queue))
+		ib_free_cq(queue->ib_cq);
+	else {
+		ib_cq_pool_put(queue->ib_cq, queue->cq_size);
+	}
+}
+
 static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
 {
 	struct nvme_rdma_device *dev;
@@ -408,7 +419,7 @@  static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
 	 * the destruction of the QP shouldn't use rdma_cm API.
 	 */
 	ib_destroy_qp(queue->qp);
-	ib_free_cq(queue->ib_cq);
+	nvme_rdma_free_cq(queue);
 
 	nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
 			sizeof(struct nvme_completion), DMA_FROM_DEVICE);
@@ -422,13 +433,35 @@  static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
 		     ibdev->attrs.max_fast_reg_page_list_len - 1);
 }
 
+static void nvme_rdma_create_cq(struct ib_device *ibdev,
+				struct nvme_rdma_queue *queue)
+{
+	int comp_vector, idx = nvme_rdma_queue_idx(queue);
+	enum ib_poll_context poll_ctx;
+
+	/*
+	 * Spread I/O queues completion vectors according their queue index.
+	 * Admin queues can always go on completion vector 0.
+	 */
+	comp_vector = idx == 0 ? idx : idx - 1;
+
+	/* Polling queues need direct cq polling context */
+	if (nvme_rdma_poll_queue(queue)) {
+		poll_ctx = IB_POLL_DIRECT;
+		queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
+					   comp_vector, poll_ctx);
+	} else {
+		poll_ctx = IB_POLL_SOFTIRQ;
+		queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
+					      comp_vector, poll_ctx);
+	}
+}
+
 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 {
 	struct ib_device *ibdev;
 	const int send_wr_factor = 3;			/* MR, SEND, INV */
 	const int cq_factor = send_wr_factor + 1;	/* + RECV */
-	int comp_vector, idx = nvme_rdma_queue_idx(queue);
-	enum ib_poll_context poll_ctx;
 	int ret, pages_per_mr;
 
 	queue->device = nvme_rdma_find_get_device(queue->cm_id);
@@ -439,22 +472,10 @@  static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 	}
 	ibdev = queue->device->dev;
 
-	/*
-	 * Spread I/O queues completion vectors according their queue index.
-	 * Admin queues can always go on completion vector 0.
-	 */
-	comp_vector = idx == 0 ? idx : idx - 1;
-
-	/* Polling queues need direct cq polling context */
-	if (nvme_rdma_poll_queue(queue))
-		poll_ctx = IB_POLL_DIRECT;
-	else
-		poll_ctx = IB_POLL_SOFTIRQ;
-
 	/* +1 for ib_stop_cq */
-	queue->ib_cq = ib_alloc_cq(ibdev, queue,
-				cq_factor * queue->queue_size + 1,
-				comp_vector, poll_ctx);
+	queue->cq_size = cq_factor * queue->queue_size + 1;
+
+	nvme_rdma_create_cq(ibdev, queue);
 	if (IS_ERR(queue->ib_cq)) {
 		ret = PTR_ERR(queue->ib_cq);
 		goto out_put_dev;
@@ -484,7 +505,7 @@  static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 	if (ret) {
 		dev_err(queue->ctrl->ctrl.device,
 			"failed to initialize MR pool sized %d for QID %d\n",
-			queue->queue_size, idx);
+			queue->queue_size, nvme_rdma_queue_idx(queue));
 		goto out_destroy_ring;
 	}
 
@@ -498,7 +519,7 @@  static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 out_destroy_qp:
 	rdma_destroy_qp(queue->cm_id);
 out_destroy_ib_cq:
-	ib_free_cq(queue->ib_cq);
+	nvme_rdma_free_cq(queue);
 out_put_dev:
 	nvme_rdma_dev_put(queue->device);
 	return ret;
@@ -1093,7 +1114,7 @@  static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
 static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
 		const char *op)
 {
-	struct nvme_rdma_queue *queue = cq->cq_context;
+	struct nvme_rdma_queue *queue = wc->qp->qp_context;
 	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
 
 	if (ctrl->ctrl.state == NVME_CTRL_LIVE)
@@ -1481,7 +1502,7 @@  static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
 	struct nvme_rdma_qe *qe =
 		container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
-	struct nvme_rdma_queue *queue = cq->cq_context;
+	struct nvme_rdma_queue *queue = wc->qp->qp_context;
 	struct ib_device *ibdev = queue->device->dev;
 	struct nvme_completion *cqe = qe->data;
 	const size_t len = sizeof(struct nvme_completion);