Message ID | 1589122557-88996-4-git-send-email-yaminf@mellanox.com (mailing list archive)
---|---
State | Superseded
Series | Introducing RDMA shared CQ pool
Looks good,
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
On 5/10/2020 5:55 PM, Yamin Friedman wrote:
> Has the driver use shared CQs providing ~10%-50% improvement as seen in the
> patch introducing shared CQs. Instead of opening a CQ on each core per NS
> connected, a CQ for each core will be provided by the core driver that will
> be shared between the QPs on that core, reducing interrupt overhead.
>
> Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
> Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
> ---
>  drivers/nvme/host/rdma.c | 65 ++++++++++++++++++++++++++++++++----------------
>  1 file changed, 43 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index cac8a93..23eefd7 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -85,6 +85,7 @@ struct nvme_rdma_queue {
>  	struct rdma_cm_id	*cm_id;
>  	int			cm_error;
>  	struct completion	cm_done;
> +	int			cq_size;
>  };
>
>  struct nvme_rdma_ctrl {
> @@ -261,6 +262,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
>  	init_attr.qp_type = IB_QPT_RC;
>  	init_attr.send_cq = queue->ib_cq;
>  	init_attr.recv_cq = queue->ib_cq;
> +	init_attr.qp_context = queue;
>
>  	ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
>
> @@ -389,6 +391,15 @@ static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
>  	return NULL;
>  }
>
> +static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
> +{
> +	if (nvme_rdma_poll_queue(queue))
> +		ib_free_cq(queue->ib_cq);
> +	else {
> +		ib_cq_pool_put(queue->ib_cq, queue->cq_size);
> +	}
> +}

Somehow I missed it in the internal review: no braces are needed on a single-statement else branch, i.e.

	if (nvme_rdma_poll_queue(queue))
		ib_free_cq(queue->ib_cq);
	else
		ib_cq_pool_put(queue->ib_cq, queue->cq_size);

> +
>  static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
>  {
>  	struct nvme_rdma_device *dev;
> @@ -408,7 +419,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
>  	 * the destruction of the QP shouldn't use rdma_cm API.
>  	 */
>  	ib_destroy_qp(queue->qp);
> -	ib_free_cq(queue->ib_cq);
> +	nvme_rdma_free_cq(queue);
>
>  	nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
>  			sizeof(struct nvme_completion), DMA_FROM_DEVICE);
> @@ -422,13 +433,35 @@ static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
>  		ibdev->attrs.max_fast_reg_page_list_len - 1);
>  }
>
> +static void nvme_rdma_create_cq(struct ib_device *ibdev,
> +		struct nvme_rdma_queue *queue)
> +{
> +	int comp_vector, idx = nvme_rdma_queue_idx(queue);
> +	enum ib_poll_context poll_ctx;
> +
> +	/*
> +	 * Spread I/O queues completion vectors according their queue index.
> +	 * Admin queues can always go on completion vector 0.
> +	 */
> +	comp_vector = idx == 0 ? idx : idx - 1;
> +
> +	/* Polling queues need direct cq polling context */
> +	if (nvme_rdma_poll_queue(queue)) {
> +		poll_ctx = IB_POLL_DIRECT;
> +		queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
> +					   comp_vector, poll_ctx);
> +	} else {
> +		poll_ctx = IB_POLL_SOFTIRQ;
> +		queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
> +					      comp_vector, poll_ctx);
> +	}
> +}

Please make this return an int instead of void, and check the return value in the caller.
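For illustration, a rough sketch of the kind of change being requested (an assumption about how a respin could look, not the actual follow-up patch). Both ib_alloc_cq() and ib_cq_pool_get() return ERR_PTR() values on failure, so a single IS_ERR() check covers both branches:

	static int nvme_rdma_create_cq(struct ib_device *ibdev,
			struct nvme_rdma_queue *queue)
	{
		int comp_vector, idx = nvme_rdma_queue_idx(queue);

		/*
		 * Spread I/O queues completion vectors according to their
		 * queue index. Admin queues can always go on completion
		 * vector 0.
		 */
		comp_vector = idx == 0 ? idx : idx - 1;

		/* Polling queues need direct cq polling context */
		if (nvme_rdma_poll_queue(queue))
			queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
						   comp_vector, IB_POLL_DIRECT);
		else
			queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
						      comp_vector, IB_POLL_SOFTIRQ);

		if (IS_ERR(queue->ib_cq))
			return PTR_ERR(queue->ib_cq);
		return 0;
	}

The caller in nvme_rdma_create_queue_ib() would then propagate the error instead of peeking at queue->ib_cq:

	queue->cq_size = cq_factor * queue->queue_size + 1;

	ret = nvme_rdma_create_cq(ibdev, queue);
	if (ret)
		goto out_put_dev;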
> +
>  static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
>  {
>  	struct ib_device *ibdev;
>  	const int send_wr_factor = 3; /* MR, SEND, INV */
>  	const int cq_factor = send_wr_factor + 1; /* + RECV */
> -	int comp_vector, idx = nvme_rdma_queue_idx(queue);
> -	enum ib_poll_context poll_ctx;
>  	int ret, pages_per_mr;
>
>  	queue->device = nvme_rdma_find_get_device(queue->cm_id);
> @@ -439,22 +472,10 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
>  	}
>  	ibdev = queue->device->dev;
>
> -	/*
> -	 * Spread I/O queues completion vectors according their queue index.
> -	 * Admin queues can always go on completion vector 0.
> -	 */
> -	comp_vector = idx == 0 ? idx : idx - 1;
> -
> -	/* Polling queues need direct cq polling context */
> -	if (nvme_rdma_poll_queue(queue))
> -		poll_ctx = IB_POLL_DIRECT;
> -	else
> -		poll_ctx = IB_POLL_SOFTIRQ;
> -
>  	/* +1 for ib_stop_cq */
> -	queue->ib_cq = ib_alloc_cq(ibdev, queue,
> -				cq_factor * queue->queue_size + 1,
> -				comp_vector, poll_ctx);
> +	queue->cq_size = cq_factor * queue->queue_size + 1;
> +
> +	nvme_rdma_create_cq(ibdev, queue);

See above.

>  	if (IS_ERR(queue->ib_cq)) {
>  		ret = PTR_ERR(queue->ib_cq);
>  		goto out_put_dev;
> @@ -484,7 +505,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
>  	if (ret) {
>  		dev_err(queue->ctrl->ctrl.device,
>  			"failed to initialize MR pool sized %d for QID %d\n",
> -			queue->queue_size, idx);
> +			queue->queue_size, nvme_rdma_queue_idx(queue));
>  		goto out_destroy_ring;
>  	}
>
> @@ -498,7 +519,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
>  out_destroy_qp:
>  	rdma_destroy_qp(queue->cm_id);
>  out_destroy_ib_cq:
> -	ib_free_cq(queue->ib_cq);
> +	nvme_rdma_free_cq(queue);
>  out_put_dev:
>  	nvme_rdma_dev_put(queue->device);
>  	return ret;
> @@ -1093,7 +1114,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
>  static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
>  		const char *op)
>  {
> -	struct nvme_rdma_queue *queue = cq->cq_context;
> +	struct nvme_rdma_queue *queue = wc->qp->qp_context;
>  	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
>
>  	if (ctrl->ctrl.state == NVME_CTRL_LIVE)
> @@ -1481,7 +1502,7 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
>  {
>  	struct nvme_rdma_qe *qe =
>  		container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
> -	struct nvme_rdma_queue *queue = cq->cq_context;
> +	struct nvme_rdma_queue *queue = wc->qp->qp_context;
>  	struct ib_device *ibdev = queue->device->dev;
>  	struct nvme_completion *cqe = qe->data;
>  	const size_t len = sizeof(struct nvme_completion);

With the above small fixes:

Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
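A side note on the cq->cq_context to wc->qp->qp_context conversions in the last two hunks: once a CQ may be shared between several QPs, cq_context can no longer identify a single nvme_rdma_queue, so each completion handler has to recover the queue from the QP that generated the work completion, which the patch enables by setting init_attr.qp_context = queue at QP creation time. A minimal sketch of the pattern, using a hypothetical handler name:

	/* hypothetical completion handler, just to show the shared-CQ pattern */
	static void nvme_rdma_example_done(struct ib_cq *cq, struct ib_wc *wc)
	{
		/*
		 * On a shared CQ, cq->cq_context does not point at one
		 * queue; the owning queue comes from the QP context set in
		 * nvme_rdma_create_qp() (init_attr.qp_context = queue).
		 */
		struct nvme_rdma_queue *queue = wc->qp->qp_context;

		dev_dbg(queue->ctrl->ctrl.device, "completion on queue %d\n",
			nvme_rdma_queue_idx(queue));
	}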