From patchwork Thu Sep 20 21:43:09 2012
X-Patchwork-Submitter: "Hefty, Sean"
X-Patchwork-Id: 1488091
X-Patchwork-Delegate: roland@digitalvampire.org
From: "Hefty, Sean"
To: "linux-rdma (linux-rdma@vger.kernel.org)"
Subject: [PATCH 2/2] libmlx4: Add support for XRC QPs
Date: Thu, 20 Sep 2012 21:43:09 +0000
Message-ID: <1828884A29C6694DAF28B7E6B8A8237346A8E834@ORSMSX101.amr.corp.intel.com>

Signed-off-by: Sean Hefty
---
Note that I have a hack in cq.c. Someone more familiar with the mlx4 HW
needs to look at the change.
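For reviewers who want to see the intended call flow end to end, here is a
rough consumer-side sketch (illustrative only, not part of the patch). It
assumes the extended-verbs API from the companion libibverbs series: an
ibv_open_xrcd() taking an fd plus open flags, and pd-based
ibv_create_srq_ex()/ibv_create_qp_ex() wrappers mirroring the driver entry
points wired up below, plus xrcd/cq fields in ibv_srq_init_attr_ex. Check
those headers before relying on any exact signature.

/*
 * Illustrative consumer-side flow only -- not part of the patch.
 * The wrapper names/signatures (ibv_open_xrcd, ibv_create_srq_ex,
 * ibv_create_qp_ex) and the xrcd/cq fields of ibv_srq_init_attr_ex are
 * assumed from the companion libibverbs extension series; they mirror the
 * mlx4_* entry points installed in mlx4_init_context() below.
 */
#include <fcntl.h>
#include <string.h>
#include <infiniband/verbs.h>

static struct ibv_srq *xrc_recv_setup(struct ibv_context *ctx,
				      struct ibv_pd *pd, struct ibv_cq *cq,
				      int xrcd_fd, struct ibv_qp **recv_qp)
{
	struct ibv_xrcd *xrcd;
	struct ibv_srq *srq;
	struct ibv_srq_init_attr_ex srq_attr;
	struct ibv_qp_init_attr_ex qp_attr;

	/* Shared XRC domain, keyed off an open file descriptor */
	xrcd = ibv_open_xrcd(ctx, xrcd_fd, O_CREAT);	/* -> mlx4_open_xrcd() */
	if (!xrcd)
		return NULL;

	memset(&srq_attr, 0, sizeof srq_attr);
	srq_attr.comp_mask = IBV_SRQ_INIT_ATTR_TYPE;	/* plus the XRCD/CQ bits defined by the core */
	srq_attr.srq_type = IBV_SRQT_XRC;		/* routed to mlx4_create_xrc_srq() */
	srq_attr.xrcd = xrcd;				/* assumed field name */
	srq_attr.cq = cq;				/* XRC completions land on the SRQ's CQ */
	srq_attr.attr.max_wr = 128;
	srq_attr.attr.max_sge = 1;
	srq = ibv_create_srq_ex(pd, &srq_attr);		/* -> mlx4_create_srq_ex() */
	if (!srq)
		return NULL;

	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.comp_mask = IBV_QP_INIT_ATTR_XRCD;	/* context comes from the XRCD, not the PD */
	qp_attr.qp_type = IBV_QPT_XRC_RECV;		/* no SQ/RQ buffers, see mlx4_create_qp_ex() */
	qp_attr.xrcd = xrcd;
	*recv_qp = ibv_create_qp_ex(pd, &qp_attr);	/* error unwinding omitted for brevity */
	return srq;
}

On the send side, the initiator creates an IBV_QPT_XRC_SEND QP and sets
wr.xrc.remote_srqn (the SRQ number advertised by the target, i.e.
ibv_srq.srq_num) in each ibv_send_wr; mlx4_post_send() below folds that
value into srcrb_flags via MLX4_REMOTE_SRQN_FLAGS().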
 src/buf.c      |    6 +-
 src/cq.c       |   40 ++++++++---
 src/mlx4-abi.h |    6 ++
 src/mlx4.c     |   19 +++--
 src/mlx4.h     |   59 ++++++++++++++++
 src/qp.c       |   35 ++++++----
 src/srq.c      |  151 +++++++++++++++++++++++++++++++++++++++++
 src/verbs.c    |  205 ++++++++++++++++++++++++++++++++++++++++++--------------
 8 files changed, 436 insertions(+), 85 deletions(-)

diff --git a/src/buf.c b/src/buf.c index a80bcb1..50957bb 100644 --- a/src/buf.c +++ b/src/buf.c @@ -78,6 +78,8 @@ int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size) void mlx4_free_buf(struct mlx4_buf *buf) { - ibv_dofork_range(buf->buf, buf->length); - munmap(buf->buf, buf->length); + if (buf->length) { + ibv_dofork_range(buf->buf, buf->length); + munmap(buf->buf, buf->length); + } }

diff --git a/src/cq.c b/src/cq.c index 8f7a8cc..5945270 100644 --- a/src/cq.c +++ b/src/cq.c @@ -220,33 +220,43 @@ static int mlx4_poll_one(struct mlx4_cq *cq, rmb(); qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; + wc->qp_num = qpn; is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_ERROR; - if (!*cur_qp || - (qpn != (*cur_qp)->ibv_qp.qp_num)) { + if ((qpn & MLX4_XRC_QPN_BIT) && !is_send) { /* - * We do not have to take the QP table lock here, - * because CQs will be locked while QPs are removed + * We do not have to take the XSRQ table lock here, + * because CQs will be locked while SRQs are removed * from the table. */ - *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn); - if (!*cur_qp) + srq = mlx4_find_xsrq(&to_mctx(cq->ibv_cq.context)->xsrq_table, + ntohl(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK); + if (!srq) return CQ_POLL_ERR; + } else { + if (!*cur_qp || (qpn != (*cur_qp)->ibv_qp.qp_num)) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn); + if (!*cur_qp) + return CQ_POLL_ERR; + } + srq = ((*cur_qp)->ibv_qp.srq) ? to_msrq((*cur_qp)->ibv_qp.srq) : NULL; } - wc->qp_num = (*cur_qp)->ibv_qp.qp_num; - if (is_send) { wq = &(*cur_qp)->sq; wqe_index = ntohs(cqe->wqe_index); wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail); wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; ++wq->tail; - } else if ((*cur_qp)->ibv_qp.srq) { - srq = to_msrq((*cur_qp)->ibv_qp.srq); + } else if (srq) { wqe_index = htons(cqe->wqe_index); wc->wr_id = srq->wrid[wqe_index]; mlx4_free_srq_wqe(srq, wqe_index); @@ -322,7 +332,8 @@ static int mlx4_poll_one(struct mlx4_cq *cq, wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ?
IBV_WC_GRH : 0; wc->pkey_index = ntohl(cqe->immed_rss_invalid) & 0x7f; - if ((*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET) + /* HACK */ + if ((*cur_qp) && (*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET) wc->sl = ntohs(cqe->sl_vid) >> 13; else wc->sl = ntohs(cqe->sl_vid) >> 12; @@ -411,7 +422,12 @@ void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) */ while ((int) --prod_index - (int) cq->cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe); - if ((ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { + if (srq && srq->ext_srq && + ntohl(cqe->g_mlpath_rqpn & MLX4_CQE_QPN_MASK) == MLX4_GET_SRQN(srq) && + !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) { + mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index)); + ++nfreed; + } else if ((ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index)); ++nfreed; diff --git a/src/mlx4-abi.h b/src/mlx4-abi.h index 20a40c9..40d0d9a 100644 --- a/src/mlx4-abi.h +++ b/src/mlx4-abi.h @@ -74,6 +74,12 @@ struct mlx4_create_srq { __u64 db_addr; }; +struct mlx4_create_xsrq { + struct ibv_create_xsrq ibv_cmd; + __u64 buf_addr; + __u64 db_addr; +}; + struct mlx4_create_srq_resp { struct ibv_create_srq_resp ibv_resp; __u32 srqn; diff --git a/src/mlx4.c b/src/mlx4.c index 1a4e8b0..5a6b353 100644 --- a/src/mlx4.c +++ b/src/mlx4.c @@ -135,13 +135,14 @@ static int mlx4_init_context(struct verbs_device *device, struct ibv_get_context cmd; struct mlx4_alloc_ucontext_resp resp; int i; - /* verbs_context should be used for new verbs - *struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx); - */ + struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx); /* memory footprint of mlx4_context and verbs_context share * struct ibv_context. 
*/ + if (sizeof(*verbs_ctx) > *(((size_t *) ibv_ctx) - 1)) + return ENOSYS; + context = to_mctx(ibv_ctx); ibv_ctx->cmd_fd = cmd_fd; @@ -160,6 +161,7 @@ static int mlx4_init_context(struct verbs_device *device, for (i = 0; i < MLX4_NUM_DB_TYPE; ++i) context->db_list[i] = NULL; + mlx4_init_xsrq_table(&context->xsrq_table, resp.qp_tab_size); pthread_mutex_init(&context->db_list_mutex, NULL); context->uar = mmap(NULL, to_mdev_ex(device)->page_size, PROT_WRITE, @@ -189,14 +191,15 @@ static int mlx4_init_context(struct verbs_device *device, pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); ibv_ctx->ops = mlx4_ctx_ops; - /* New verbs should be added as below - * verbs_ctx->drv_new_func1 = mlx4_new_func1; - */ - return 0; + verbs_ctx->close_xrcd = mlx4_close_xrcd; + verbs_ctx->open_xrcd = mlx4_open_xrcd; + verbs_ctx->create_srq_ex = mlx4_create_srq_ex; + verbs_ctx->create_qp_ex = mlx4_create_qp_ex; + verbs_ctx->open_qp = mlx4_open_qp; + return 0; } - static void mlx4_uninit_context(struct verbs_device *device, struct ibv_context *ibv_ctx) { diff --git a/src/mlx4.h b/src/mlx4.h index c06dbd5..f1ea788 100644 --- a/src/mlx4.h +++ b/src/mlx4.h @@ -38,6 +38,7 @@ #include #include +#include #ifdef HAVE_VALGRIND_MEMCHECK_H @@ -97,6 +98,37 @@ enum { MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1 }; +#define MLX4_REMOTE_SRQN_FLAGS(wr) htonl((wr)->wr.xrc.remote_srqn << 8) +#define MLX4_GET_SRQN(srq) (srq)->ibv_srq.srq_num + +enum { + MLX4_XSRQ_TABLE_BITS = 8, + MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS, + MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1 +}; + +struct mlx4_xsrq_table { + struct { + struct mlx4_srq **table; + int refcnt; + } xsrq_table[MLX4_XSRQ_TABLE_SIZE]; + + pthread_mutex_t mutex; + int num_xsrq; + int shift; + int mask; +}; + +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size); +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq); +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); + +enum { + MLX4_XRC_QPN_BIT = (1 << 23) +}; + enum mlx4_db_type { MLX4_DB_TYPE_CQ, MLX4_DB_TYPE_RQ, @@ -162,6 +194,8 @@ struct mlx4_context { int qp_table_shift; int qp_table_mask; + struct mlx4_xsrq_table xsrq_table; + struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE]; pthread_mutex_t db_list_mutex; }; @@ -201,6 +235,7 @@ struct mlx4_srq { int tail; uint32_t *db; uint16_t counter; + uint8_t ext_srq; }; struct mlx4_wq { @@ -256,6 +291,7 @@ static inline unsigned long align(unsigned long val, unsigned long align) { return (val + align - 1) & ~(align - 1); } +int align_queue_size(int req); #define to_mxxx(xxx, type) \ ((struct mlx4_##type *) \ @@ -307,6 +343,13 @@ static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah) return to_mxxx(ah, ah); } +static inline struct ibv_context * +mlx4_get_context_qp(struct ibv_pd *pd, struct ibv_qp_init_attr_ex *attr_ex) +{ + return (attr_ex->comp_mask & IBV_QP_INIT_ATTR_XRCD) ? 
+ attr_ex->xrcd->context : pd->context; +} + int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size); void mlx4_free_buf(struct mlx4_buf *buf); @@ -320,6 +363,8 @@ int mlx4_query_port(struct ibv_context *context, uint8_t port, struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context); int mlx4_free_pd(struct ibv_pd *pd); +struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, int fd, int oflags); +int mlx4_close_xrcd(struct ibv_xrcd *xrcd); struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); @@ -341,20 +386,32 @@ void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int new_cqe); struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *attr); +struct ibv_srq *mlx4_create_srq_ex(struct ibv_pd *pd, + struct ibv_srq_init_attr_ex *attr_ex); +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr_ex *attr_ex); int mlx4_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, int mask); int mlx4_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr); int mlx4_destroy_srq(struct ibv_srq *srq); +int mlx4_destroy_xrc_srq(struct ibv_srq *srq); int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, struct mlx4_srq *srq); +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size); +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq); +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind); int mlx4_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); +struct ibv_qp *mlx4_create_qp_ex(struct ibv_pd *pd, struct ibv_qp_init_attr_ex *attr); +struct ibv_qp *mlx4_open_qp(struct ibv_xrcd *xrcd, struct ibv_qp_open_attr *attr); int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); @@ -369,7 +426,7 @@ int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, struct mlx4_qp *qp); -int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, +int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap, enum ibv_qp_type type, struct mlx4_qp *qp); void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, enum ibv_qp_type type); diff --git a/src/qp.c b/src/qp.c index 40a6689..18cf263 100644 --- a/src/qp.c +++ b/src/qp.c @@ -246,6 +246,9 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, size = sizeof *ctrl / 16; switch (ibqp->qp_type) { + case IBV_QPT_XRC_SEND: + ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr); + /* fall through */ case IBV_QPT_RC: case IBV_QPT_UC: switch (wr->opcode) { @@ -546,6 +549,7 @@ void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, size += sizeof (struct mlx4_wqe_raddr_seg); break; + case IBV_QPT_XRC_SEND: case IBV_QPT_RC: size += sizeof (struct mlx4_wqe_raddr_seg); /* @@ -575,14 +579,16 @@ void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, ; /* nothing */ } -int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, +int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap, enum ibv_qp_type type, struct mlx4_qp *qp) { qp->rq.max_gs = cap->max_recv_sge; - 
qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t)); - if (!qp->sq.wrid) - return -1; + if (qp->sq.wqe_cnt) { + qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t)); + if (!qp->sq.wrid) + return -1; + } if (qp->rq.wqe_cnt) { qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t)); @@ -607,15 +613,19 @@ int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, qp->sq.offset = 0; } - if (mlx4_alloc_buf(&qp->buf, - align(qp->buf_size, to_mdev(pd->context->device)->page_size), - to_mdev(pd->context->device)->page_size)) { - free(qp->sq.wrid); - free(qp->rq.wrid); - return -1; - } + if (qp->buf_size) { + if (mlx4_alloc_buf(&qp->buf, + align(qp->buf_size, to_mdev(context->device)->page_size), + to_mdev(context->device)->page_size)) { + free(qp->sq.wrid); + free(qp->rq.wrid); + return -1; + } - memset(qp->buf.buf, 0, qp->buf_size); + memset(qp->buf.buf, 0, qp->buf_size); + } else { + qp->buf.buf = NULL; + } return 0; } @@ -631,6 +641,7 @@ void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, wqe_size -= sizeof (struct mlx4_wqe_datagram_seg); break; + case IBV_QPT_XRC_SEND: case IBV_QPT_UC: case IBV_QPT_RC: wqe_size -= sizeof (struct mlx4_wqe_raddr_seg); diff --git a/src/srq.c b/src/srq.c index f1d1240..d27572a 100644 --- a/src/srq.c +++ b/src/srq.c @@ -42,6 +42,7 @@ #include "mlx4.h" #include "doorbell.h" #include "wqe.h" +#include "mlx4-abi.h" static void *get_wqe(struct mlx4_srq *srq, int n) { @@ -173,3 +174,153 @@ int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, return 0; } + +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size) +{ + memset(xsrq_table, 0, sizeof *xsrq_table); + xsrq_table->num_xsrq = size; + xsrq_table->shift = ffs(size) - 1 - MLX4_XSRQ_TABLE_BITS; + xsrq_table->mask = (1 << xsrq_table->shift) - 1; + + pthread_mutex_init(&xsrq_table->mutex, NULL); +} + +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) +{ + int index; + + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + if (xsrq_table->xsrq_table[index].refcnt) + return xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask]; + + return NULL; +} + +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq) +{ + int index, ret = 0; + + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + pthread_mutex_lock(&xsrq_table->mutex); + if (!xsrq_table->xsrq_table[index].refcnt) { + xsrq_table->xsrq_table[index].table = calloc(xsrq_table->mask + 1, + sizeof(struct mlx4_srq *)); + if (!xsrq_table->xsrq_table[index].table) { + ret = -1; + goto out; + } + } + + xsrq_table->xsrq_table[index].refcnt++; + xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = srq; + +out: + pthread_mutex_unlock(&xsrq_table->mutex); + return ret; +} + +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) +{ + int index; + + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + pthread_mutex_lock(&xsrq_table->mutex); + + if (--xsrq_table->xsrq_table[index].refcnt) + xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = NULL; + else + free(xsrq_table->xsrq_table[index].table); + + pthread_mutex_unlock(&xsrq_table->mutex); +} + +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr_ex *attr_ex) +{ + struct mlx4_create_xsrq cmd; + struct mlx4_create_srq_resp resp; + struct mlx4_srq *srq; + int ret; + + /* Sanity check SRQ size before proceeding */ + if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 
64) + return NULL; + + srq = calloc(1, sizeof *srq); + if (!srq) + return NULL; + + if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + srq->max = align_queue_size(attr_ex->attr.max_wr + 1); + srq->max_gs = attr_ex->attr.max_sge; + srq->counter = 0; + srq->ext_srq = 1; + + if (mlx4_alloc_srq_buf(pd, &attr_ex->attr, srq)) + goto err; + + srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); + if (!srq->db) + goto err_free; + + *srq->db = 0; + + cmd.buf_addr = (uintptr_t) srq->buf.buf; + cmd.db_addr = (uintptr_t) srq->db; + + ret = ibv_cmd_create_srq_ex(pd, &srq->ibv_srq, attr_ex, + &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_db; + + ret = mlx4_store_xsrq(&to_mctx(pd->context)->xsrq_table, + srq->ibv_srq.srq_num, srq); + if (ret) + goto err_destroy; + + return &srq->ibv_srq; + +err_destroy: + ibv_cmd_destroy_srq(&srq->ibv_srq); +err_db: + mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db); +err_free: + free(srq->wrid); + mlx4_free_buf(&srq->buf); +err: + free(srq); + return NULL; +} + +int mlx4_destroy_xrc_srq(struct ibv_srq *srq) +{ + struct mlx4_context *mctx = to_mctx(srq->context); + struct mlx4_srq *msrq = to_msrq(srq); + struct mlx4_cq *mcq; + int ret; + + mcq = to_mcq(srq->cq); + mlx4_cq_clean(mcq, 0, msrq); + pthread_spin_lock(&mcq->lock); + mlx4_clear_xsrq(&mctx->xsrq_table, srq->srq_num); + pthread_spin_unlock(&mcq->lock); + + ret = ibv_cmd_destroy_srq(srq); + if (ret) { + pthread_spin_lock(&mcq->lock); + mlx4_store_xsrq(&mctx->xsrq_table, srq->srq_num, msrq); + pthread_spin_unlock(&mcq->lock); + return ret; + } + + mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db); + mlx4_free_buf(&msrq->buf); + free(msrq->wrid); + free(msrq); + + return 0; +} diff --git a/src/verbs.c b/src/verbs.c index 408fc6d..a9b11a3 100644 --- a/src/verbs.c +++ b/src/verbs.c @@ -107,6 +107,40 @@ int mlx4_free_pd(struct ibv_pd *pd) return 0; } +struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, int fd, int oflags) +{ + struct ibv_open_xrcd cmd; + struct ibv_open_xrcd_resp resp; + struct ibv_xrcd *xrcd; + int ret; + + xrcd = calloc(1, sizeof *xrcd); + if (!xrcd) + return NULL; + + ret = ibv_cmd_open_xrcd(context, xrcd, fd, oflags, + &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + goto err; + + return xrcd; + +err: + free(xrcd); + return NULL; +} + +int mlx4_close_xrcd(struct ibv_xrcd *xrcd) +{ + int ret; + + ret = ibv_cmd_close_xrcd(xrcd); + if (!ret) + free(xrcd); + + return ret; +} + struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { @@ -150,7 +184,7 @@ int mlx4_dereg_mr(struct ibv_mr *mr) return 0; } -static int align_queue_size(int req) +int align_queue_size(int req) { int nent; @@ -294,7 +328,7 @@ int mlx4_destroy_cq(struct ibv_cq *cq) } struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, - struct ibv_srq_init_attr *attr) + struct ibv_srq_init_attr *attr) { struct mlx4_create_srq cmd; struct mlx4_create_srq_resp resp; @@ -315,6 +349,7 @@ struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, srq->max = align_queue_size(attr->attr.max_wr + 1); srq->max_gs = attr->attr.max_sge; srq->counter = 0; + srq->ext_srq = 0; if (mlx4_alloc_srq_buf(pd, &attr->attr, srq)) goto err; @@ -334,8 +369,6 @@ struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, if (ret) goto err_db; - srq->srqn = resp.srqn; - return &srq->ibv_srq; err_db: @@ -351,6 +384,17 @@ err: return NULL; } +struct ibv_srq *mlx4_create_srq_ex(struct ibv_pd *pd, struct ibv_srq_init_attr_ex *attr_ex) +{ + if (!(attr_ex->comp_mask & 
IBV_SRQ_INIT_ATTR_TYPE) || + (attr_ex->srq_type == IBV_SRQT_BASIC)) + return mlx4_create_srq(pd, (struct ibv_srq_init_attr *) attr_ex); + else if (attr_ex->srq_type == IBV_SRQT_XRC) + return mlx4_create_xrc_srq(pd, attr_ex); + + return NULL; +} + int mlx4_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, int attr_mask) @@ -372,6 +416,9 @@ int mlx4_destroy_srq(struct ibv_srq *srq) { int ret; + if (to_msrq(srq)->ext_srq) + return mlx4_destroy_xrc_srq(srq); + ret = ibv_cmd_destroy_srq(srq); if (ret) return ret; @@ -384,8 +431,9 @@ int mlx4_destroy_srq(struct ibv_srq *srq) return 0; } -struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) +struct ibv_qp *mlx4_create_qp_ex(struct ibv_pd *pd, struct ibv_qp_init_attr_ex *attr) { + struct ibv_context *context; struct mlx4_create_qp cmd; struct ibv_create_qp_resp resp; struct mlx4_qp *qp; @@ -399,30 +447,35 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) attr->cap.max_inline_data > 1024) return NULL; - qp = malloc(sizeof *qp); + context = mlx4_get_context_qp(pd, attr); + qp = calloc(1, sizeof *qp); if (!qp) return NULL; - mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp); - - /* - * We need to leave 2 KB + 1 WQE of headroom in the SQ to - * allow HW to prefetch. - */ - qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; - qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes); - qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); + if (attr->qp_type == IBV_QPT_XRC_RECV) { + attr->cap.max_send_wr = qp->sq.wqe_cnt = 0; + } else { + mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp); + /* + * We need to leave 2 KB + 1 WQE of headroom in the SQ to + * allow HW to prefetch. + */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; + qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes); + } - if (attr->srq) - attr->cap.max_recv_wr = qp->rq.wqe_cnt = 0; - else { + if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND || + attr->qp_type == IBV_QPT_XRC_RECV) { + attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0; + } else { + qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); if (attr->cap.max_recv_sge < 1) attr->cap.max_recv_sge = 1; if (attr->cap.max_recv_wr < 1) attr->cap.max_recv_wr = 1; } - if (mlx4_alloc_qp_buf(pd, &attr->cap, attr->qp_type, qp)) + if (mlx4_alloc_qp_buf(context, &attr->cap, attr->qp_type, qp)) goto err; mlx4_init_qp_indices(qp); @@ -431,19 +484,18 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) goto err_free; - if (!attr->srq) { - qp->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); + if (attr->cap.max_recv_sge) { + qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); if (!qp->db) goto err_free; *qp->db = 0; + cmd.db_addr = (uintptr_t) qp->db; + } else { + cmd.db_addr = 0; } cmd.buf_addr = (uintptr_t) qp->buf.buf; - if (attr->srq) - cmd.db_addr = 0; - else - cmd.db_addr = (uintptr_t) qp->db; cmd.log_sq_stride = qp->sq.wqe_shift; for (cmd.log_sq_bb_count = 0; qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count; @@ -452,17 +504,19 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */ memset(cmd.reserved, 0, sizeof cmd.reserved); - pthread_mutex_lock(&to_mctx(pd->context)->qp_table_mutex); + pthread_mutex_lock(&to_mctx(context)->qp_table_mutex); - ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, 
&cmd.ibv_cmd, sizeof cmd, - &resp, sizeof resp); + ret = ibv_cmd_create_qp_ex(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd, sizeof cmd, + &resp, sizeof resp); if (ret) goto err_rq_db; - ret = mlx4_store_qp(to_mctx(pd->context), qp->ibv_qp.qp_num, qp); - if (ret) - goto err_destroy; - pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); + if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) { + ret = mlx4_store_qp(to_mctx(context), qp->ibv_qp.qp_num, qp); + if (ret) + goto err_destroy; + } + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr; qp->rq.max_gs = attr->cap.max_recv_sge; @@ -480,9 +534,9 @@ err_destroy: ibv_cmd_destroy_qp(&qp->ibv_qp); err_rq_db: - pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); - if (!attr->srq) - mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, qp->db); + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + if (attr->cap.max_recv_sge) + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db); err_free: free(qp->sq.wrid); @@ -496,6 +550,42 @@ err: return NULL; } +struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) +{ + struct ibv_qp_init_attr_ex attr_ex; + struct ibv_qp *qp; + + memcpy(&attr_ex, attr, sizeof *attr); + attr_ex.comp_mask = 0; + qp = mlx4_create_qp_ex(pd, &attr_ex); + if (qp) + memcpy(attr, &attr_ex, sizeof *attr); + return qp; +} + +struct ibv_qp *mlx4_open_qp(struct ibv_xrcd *xrcd, struct ibv_qp_open_attr *attr) +{ + struct ibv_open_qp cmd; + struct ibv_create_qp_resp resp; + struct mlx4_qp *qp; + int ret; + + qp = calloc(1, sizeof *qp); + if (!qp) + return NULL; + + ret = ibv_cmd_open_qp(xrcd, &qp->ibv_qp, attr, + &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + goto err; + + return &qp->ibv_qp; + +err: + free(qp); + return NULL; +} + int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) @@ -542,13 +632,14 @@ int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, if (!ret && (attr_mask & IBV_QP_STATE) && attr->qp_state == IBV_QPS_RESET) { - mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, - qp->srq ? to_msrq(qp->srq) : NULL); - if (qp->send_cq != qp->recv_cq) + if (qp->recv_cq) + mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, + qp->srq ? 
to_msrq(qp->srq) : NULL); + if (qp->send_cq && qp->send_cq != qp->recv_cq) mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL); mlx4_init_qp_indices(to_mqp(qp)); - if (!qp->srq) + if (to_mqp(qp)->rq.wqe_cnt) *to_mqp(qp)->db = 0; } @@ -560,9 +651,14 @@ static void mlx4_lock_cqs(struct ibv_qp *qp) struct mlx4_cq *send_cq = to_mcq(qp->send_cq); struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); - if (send_cq == recv_cq) + if (!qp->send_cq || !qp->recv_cq) { + if (qp->send_cq) + pthread_spin_lock(&send_cq->lock); + else if (qp->recv_cq) + pthread_spin_lock(&recv_cq->lock); + } else if (send_cq == recv_cq) { pthread_spin_lock(&send_cq->lock); - else if (send_cq->cqn < recv_cq->cqn) { + } else if (send_cq->cqn < recv_cq->cqn) { pthread_spin_lock(&send_cq->lock); pthread_spin_lock(&recv_cq->lock); } else { @@ -576,9 +672,15 @@ static void mlx4_unlock_cqs(struct ibv_qp *qp) struct mlx4_cq *send_cq = to_mcq(qp->send_cq); struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); - if (send_cq == recv_cq) + + if (!qp->send_cq || !qp->recv_cq) { + if (qp->send_cq) + pthread_spin_unlock(&send_cq->lock); + else if (qp->recv_cq) + pthread_spin_unlock(&recv_cq->lock); + } else if (send_cq == recv_cq) { pthread_spin_unlock(&send_cq->lock); - else if (send_cq->cqn < recv_cq->cqn) { + } else if (send_cq->cqn < recv_cq->cqn) { pthread_spin_unlock(&recv_cq->lock); pthread_spin_unlock(&send_cq->lock); } else { @@ -601,21 +703,24 @@ int mlx4_destroy_qp(struct ibv_qp *ibqp) mlx4_lock_cqs(ibqp); - __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num, - ibqp->srq ? to_msrq(ibqp->srq) : NULL); - if (ibqp->send_cq != ibqp->recv_cq) + if (ibqp->recv_cq) + __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq) __mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL); - mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num); + if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) + mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num); mlx4_unlock_cqs(ibqp); pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex); - if (!ibqp->srq) + if (qp->rq.wqe_cnt) { mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db); - free(qp->sq.wrid); - if (qp->rq.wqe_cnt) free(qp->rq.wrid); + } + if (qp->sq.wqe_cnt) + free(qp->sq.wrid); mlx4_free_buf(&qp->buf); free(qp);
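
As a review aid (not part of the patch): the srqn-to-slot arithmetic behind
the new xsrq_table in src/srq.c can be checked in isolation. A minimal
standalone sketch, assuming a qp_tab_size of 64K purely for the example
output:

/*
 * Standalone illustration of the two-level lookup behind mlx4_find_xsrq().
 * The constants mirror src/mlx4.h above; the qp_tab_size value is only an
 * assumption for the printout.
 */
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define MLX4_XSRQ_TABLE_BITS	8
#define MLX4_XSRQ_TABLE_SIZE	(1 << MLX4_XSRQ_TABLE_BITS)

int main(void)
{
	int num_xsrq = 1 << 16;		/* resp.qp_tab_size passed to mlx4_init_xsrq_table() */
	int shift = ffs(num_xsrq) - 1 - MLX4_XSRQ_TABLE_BITS;
	int mask = (1 << shift) - 1;
	unsigned int srqn = 0x1a2b3;	/* arbitrary SRQ number reported by the kernel */

	/* Same arithmetic as mlx4_find_xsrq()/mlx4_store_xsrq() */
	int bucket = (srqn & (num_xsrq - 1)) >> shift;
	int slot = srqn & mask;

	printf("srqn 0x%x -> bucket %d of %d, slot %d of %d\n",
	       srqn, bucket, MLX4_XSRQ_TABLE_SIZE, slot, mask + 1);
	return 0;
}

Each bucket's backing array is calloc()'d on the first mlx4_store_xsrq()
into it and freed again when its refcount drops to zero, so a process that
never creates an XRC SRQ only pays for the fixed 256-entry top level
embedded in mlx4_context.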