From patchwork Tue Jul 17 07:38:18 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Potnuri Bharat Teja X-Patchwork-Id: 10528237 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork.web.codeaurora.org (Postfix) with ESMTP id 62E9B601D2 for ; Tue, 17 Jul 2018 07:38:50 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 46DCE28614 for ; Tue, 17 Jul 2018 07:38:50 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 3AE1E28623; Tue, 17 Jul 2018 07:38:50 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-7.9 required=2.0 tests=BAYES_00, MAILING_LIST_MULTI, RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 6F29728614 for ; Tue, 17 Jul 2018 07:38:49 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1728843AbeGQIKD (ORCPT ); Tue, 17 Jul 2018 04:10:03 -0400 Received: from stargate.chelsio.com ([12.32.117.8]:8810 "EHLO stargate.chelsio.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1727052AbeGQIKC (ORCPT ); Tue, 17 Jul 2018 04:10:02 -0400 Received: from localhost (chittorgarh.blr.asicdesigners.com [10.193.184.65]) by stargate.chelsio.com (8.13.8/8.13.8) with ESMTP id w6H7cgRc011955; Tue, 17 Jul 2018 00:38:42 -0700 From: Potnuri Bharat Teja To: jgg@ziepe.ca, dledford@redhat.com Cc: swise@opengridcomputing.com, rajur@chelsio.com, bharat@chelsio.com, linux-rdma@vger.kernel.org Subject: [PATCH for-next v2 2/2] iw_cxgb4: Support FW write completion WR Date: Tue, 17 Jul 2018 13:08:18 +0530 Message-Id: <20180717073818.13442-3-bharat@chelsio.com> X-Mailer: git-send-email 2.9.3 In-Reply-To: <20180717073818.13442-1-bharat@chelsio.com> References: <20180717073818.13442-1-bharat@chelsio.com> Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP To optimize NVME-oF READ IOPs, use a specialized WQE that combines the RDMA WRITE and SEND_INV WR chain submitted by the NVME-oF target driver. This reduces uP overhead per NVME-oF IO, and results in over 10% improvement in NVME-oF 4K READ IOPs. Signed-off-by: Potnuri Bharat Teja Signed-off-by: Steve Wise --- drivers/infiniband/hw/cxgb4/device.c | 1 + drivers/infiniband/hw/cxgb4/qp.c | 147 +++++++++++++++++++++++++++++- drivers/infiniband/hw/cxgb4/t4.h | 6 +- drivers/infiniband/hw/cxgb4/t4fw_ri_api.h | 31 +++++++ 4 files changed, 183 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 5ef082bfa95a..c13c0ba30f63 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -866,6 +866,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->status_page->qp_size = rdev->lldi.vr->qp.size; rdev->status_page->cq_start = rdev->lldi.vr->cq.start; rdev->status_page->cq_size = rdev->lldi.vr->cq.size; + rdev->status_page->write_cmpl_supported = rdev->lldi.write_cmpl_support; if (c4iw_wr_log) { rdev->wr_log = kcalloc(1 << c4iw_wr_log_size_order, diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 88bd67d7dc77..a41c598826ca 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -455,7 +455,12 @@ static int build_isgl(__be64 *queue_start, __be64 *queue_end, { int i; u32 plen = 0; - __be64 *flitp = (__be64 *)isglp->sge; + __be64 *flitp; + + if ((__be64 *)isglp == queue_end) + isglp = (struct fw_ri_isgl *)queue_start; + + flitp = (__be64 *)isglp->sge; for (i = 0; i < num_sge; i++) { if ((plen + sg_list[i].length) < plen) @@ -597,6 +602,56 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, return 0; } +static void build_immd_cmpl(struct t4_sq *sq, struct fw_ri_immd_cmpl *immdp, + struct ib_send_wr *wr) +{ + memcpy((u8 *)immdp->data, (u8 *)(uintptr_t)wr->sg_list->addr, 16); + memset(immdp->r1, 0, 6); + immdp->op = FW_RI_DATA_IMMD; + immdp->immdlen = 16; +} + +static void build_rdma_write_cmpl(struct t4_sq *sq, + struct fw_ri_rdma_write_cmpl_wr *wcwr, + struct ib_send_wr *wr, u8 *len16) +{ + u32 plen; + int size; + + /* + * This code assumes the struct fields preceding the write isgl + * fit in one 64B WR slot. This is because the WQE is built + * directly in the dma queue, and wrapping is only handled + * by the code buildling sgls. IE the "fixed part" of the wr + * structs must all fit in 64B. The WQE build code should probably be + * redesigned to avoid this restriction, but for now just add + * the BUILD_BUG_ON() to catch if this WQE struct gets too big. + */ + BUILD_BUG_ON(offsetof(struct fw_ri_rdma_write_cmpl_wr, u) > 64); + + wcwr->stag_sink = cpu_to_be32(rdma_wr(wr)->rkey); + wcwr->to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr); + wcwr->stag_inv = cpu_to_be32(wr->next->ex.invalidate_rkey); + wcwr->r2 = 0; + wcwr->r3 = 0; + + /* SEND_INV SGL */ + if (wr->next->send_flags & IB_SEND_INLINE) + build_immd_cmpl(sq, &wcwr->u_cmpl.immd_src, wr->next); + else + build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size], + &wcwr->u_cmpl.isgl_src, wr->next->sg_list, 1, NULL); + + /* WRITE SGL */ + build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size], + wcwr->u.isgl_src, wr->sg_list, wr->num_sge, &plen); + + size = sizeof(*wcwr) + sizeof(struct fw_ri_isgl) + + wr->num_sge * sizeof(struct fw_ri_sge); + wcwr->plen = cpu_to_be32(plen); + *len16 = DIV_ROUND_UP(size, 16); +} + static int build_rdma_read(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) { if (wr->num_sge > 1) @@ -626,6 +681,72 @@ static int build_rdma_read(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) return 0; } +static void post_write_cmpl(struct c4iw_qp *qhp, struct ib_send_wr *wr) +{ + bool send_signaled = (wr->next->send_flags & IB_SEND_SIGNALED) || + qhp->sq_sig_all; + bool write_signaled = (wr->send_flags & IB_SEND_SIGNALED) || + qhp->sq_sig_all; + struct t4_swsqe *swsqe; + union t4_wr *wqe; + u16 write_wrid; + u8 len16; + u16 idx; + + /* + * The sw_sq entries still look like a WRITE and a SEND and consume + * 2 slots. The FW WR, however, will be a single uber-WR. + */ + wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue + + qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE); + build_rdma_write_cmpl(&qhp->wq.sq, &wqe->write_cmpl, wr, &len16); + + /* WRITE swsqe */ + swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; + swsqe->opcode = FW_RI_RDMA_WRITE; + swsqe->idx = qhp->wq.sq.pidx; + swsqe->complete = 0; + swsqe->signaled = write_signaled; + swsqe->flushed = 0; + swsqe->wr_id = wr->wr_id; + if (c4iw_wr_log) { + swsqe->sge_ts = + cxgb4_read_sge_timestamp(qhp->rhp->rdev.lldi.ports[0]); + swsqe->host_time = ktime_get(); + } + + write_wrid = qhp->wq.sq.pidx; + + /* just bump the sw_sq */ + qhp->wq.sq.in_use++; + if (++qhp->wq.sq.pidx == qhp->wq.sq.size) + qhp->wq.sq.pidx = 0; + + /* SEND_WITH_INV swsqe */ + swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; + swsqe->opcode = FW_RI_SEND_WITH_INV; + swsqe->idx = qhp->wq.sq.pidx; + swsqe->complete = 0; + swsqe->signaled = send_signaled; + swsqe->flushed = 0; + swsqe->wr_id = wr->next->wr_id; + if (c4iw_wr_log) { + swsqe->sge_ts = + cxgb4_read_sge_timestamp(qhp->rhp->rdev.lldi.ports[0]); + swsqe->host_time = ktime_get(); + } + + wqe->write_cmpl.flags_send = send_signaled ? FW_RI_COMPLETION_FLAG : 0; + wqe->write_cmpl.wrid_send = qhp->wq.sq.pidx; + + init_wr_hdr(wqe, write_wrid, FW_RI_RDMA_WRITE_CMPL_WR, + write_signaled ? FW_RI_COMPLETION_FLAG : 0, len16); + t4_sq_produce(&qhp->wq, len16); + idx = DIV_ROUND_UP(len16 * 16, T4_EQ_ENTRY_SIZE); + + t4_ring_sq_db(&qhp->wq, idx, wqe); +} + static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, struct ib_recv_wr *wr, u8 *len16) { @@ -1001,6 +1122,30 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, *bad_wr = wr; return -ENOMEM; } + + /* + * Fastpath for NVMe-oF target WRITE + SEND_WITH_INV wr chain which is + * the response for small NVMEe-oF READ requests. If the chain is + * exactly a WRITE->SEND_WITH_INV and the sgl depths and lengths + * meet the requirements of the fw_ri_write_cmpl_wr work request, + * then build and post the write_cmpl WR. If any of the tests + * below are not true, then we continue on with the tradtional WRITE + * and SEND WRs. + */ + if (qhp->rhp->rdev.lldi.write_cmpl_support && + CHELSIO_CHIP_VERSION(qhp->rhp->rdev.lldi.adapter_type) >= + CHELSIO_T5 && + wr && wr->next && !wr->next->next && + wr->opcode == IB_WR_RDMA_WRITE && + wr->sg_list[0].length && wr->num_sge <= T4_WRITE_CMPL_MAX_SGL && + wr->next->opcode == IB_WR_SEND_WITH_INV && + wr->next->sg_list[0].length == T4_WRITE_CMPL_MAX_CQE && + wr->next->num_sge == 1 && num_wrs >= 2) { + post_write_cmpl(qhp, wr); + spin_unlock_irqrestore(&qhp->lock, flag); + return 0; + } + while (wr) { if (num_wrs == 0) { err = -ENOMEM; diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index 3fdcc445ff16..899ba8d785d0 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -91,6 +91,9 @@ static inline int t4_max_fr_depth(int use_dsgl) #define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS) #define T4_MAX_RECV_SGE 4 +#define T4_WRITE_CMPL_MAX_SGL 4 +#define T4_WRITE_CMPL_MAX_CQE 16 + union t4_wr { struct fw_ri_res_wr res; struct fw_ri_wr ri; @@ -101,6 +104,7 @@ union t4_wr { struct fw_ri_fr_nsmr_wr fr; struct fw_ri_fr_nsmr_tpte_wr fr_tpte; struct fw_ri_inv_lstag_wr inv; + struct fw_ri_rdma_write_cmpl_wr write_cmpl; struct t4_status_page status; __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS]; }; @@ -851,7 +855,7 @@ static inline void t4_set_cq_in_error(struct t4_cq *cq) struct t4_dev_status_page { u8 db_off; - u8 pad1; + u8 write_cmpl_supported; u16 pad2; u32 pad3; u64 qp_start; diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h index 45e7211e94ac..39d1ebd0ff0a 100644 --- a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h +++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h @@ -595,6 +595,37 @@ struct fw_ri_send_wr { #define FW_RI_SEND_WR_SENDOP_G(x) \ (((x) >> FW_RI_SEND_WR_SENDOP_S) & FW_RI_SEND_WR_SENDOP_M) +struct fw_ri_rdma_write_cmpl_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u8 r2; + __u8 flags_send; + __u16 wrid_send; + __be32 stag_inv; + __be32 plen; + __be32 stag_sink; + __be64 to_sink; + union fw_ri_cmpl { + struct fw_ri_immd_cmpl { + __u8 op; + __u8 r1[6]; + __u8 immdlen; + __u8 data[16]; + } immd_src; + struct fw_ri_isgl isgl_src; + } u_cmpl; + __be64 r3; +#ifndef C99_NOT_SUPPORTED + union fw_ri_write { + struct fw_ri_immd immd_src[0]; + struct fw_ri_isgl isgl_src[0]; + } u; +#endif +}; + struct fw_ri_rdma_read_wr { __u8 opcode; __u8 flags;