@@ -93,14 +93,46 @@ struct rxe_send_wr {
__u32 remote_qkey;
__u16 pkey_index;
} ud;
- /* reg is only used by the kernel and is not part of the uapi */
+ struct {
+ __aligned_u64 addr;
+ __aligned_u64 length;
+ union {
+ __u32 mr_index;
+ __aligned_u64 reserved1;
+ };
+ union {
+ __u32 mw_index;
+ __aligned_u64 reserved2;
+ };
+ __u32 rkey;
+ __u32 access;
+ __u32 flags;
+ } umw;
+ /* The following are only used by the kernel
+ * and are not part of the uapi
+ */
+ struct {
+ __aligned_u64 addr;
+ __aligned_u64 length;
+ union {
+ struct ib_mr *mr;
+ __aligned_u64 reserved1;
+ };
+ union {
+ struct ib_mw *mw;
+ __aligned_u64 reserved2;
+ };
+ __u32 rkey;
+ __u32 access;
+ __u32 flags;
+ } kmw;
struct {
union {
struct ib_mr *mr;
__aligned_u64 reserved;
};
- __u32 key;
- __u32 access;
+ __u32 key;
+ __u32 access;
} reg;
} wr;
};
@@ -112,7 +144,7 @@ struct rxe_sge {
};
struct mminfo {
- __aligned_u64 offset;
+ __aligned_u64 offset;
__u32 size;
__u32 pad;
};
@@ -175,4 +207,14 @@ struct rxe_modify_srq_cmd {
__aligned_u64 mmap_info_addr;
};
+struct rxe_reg_mr_resp {
+ __u32 index;
+ __u32 reserved;
+};
+
+struct rxe_alloc_mw_resp {
+ __u32 index;
+ __u32 reserved;
+};
+
#endif /* RDMA_USER_RXE_H */
@@ -1,3 +1,5 @@
rdma_provider(rxe
rxe.c
+ rxe_sq.c
+ rxe_mw.c
)
@@ -49,5 +49,9 @@ DECLARE_DRV_CMD(urxe_modify_srq, IB_USER_VERBS_CMD_MODIFY_SRQ,
rxe_modify_srq_cmd, empty);
DECLARE_DRV_CMD(urxe_resize_cq, IB_USER_VERBS_CMD_RESIZE_CQ,
empty, rxe_resize_cq_resp);
+DECLARE_DRV_CMD(urxe_reg_mr, IB_USER_VERBS_CMD_REG_MR,
+ empty, rxe_reg_mr_resp);
+DECLARE_DRV_CMD(urxe_alloc_mw, IB_USER_VERBS_CMD_ALLOC_MW,
+ empty, rxe_alloc_mw_resp);
#endif /* RXE_ABI_H */
@@ -69,11 +69,11 @@ static int rxe_query_device(struct ibv_context *context,
{
struct ibv_query_device cmd;
uint64_t raw_fw_ver;
- unsigned major, minor, sub_minor;
+ unsigned int major, minor, sub_minor;
int ret;
ret = ibv_cmd_query_device(context, attr, &raw_fw_ver,
- &cmd, sizeof cmd);
+ &cmd, sizeof(cmd));
if (ret)
return ret;
@@ -81,7 +81,7 @@ static int rxe_query_device(struct ibv_context *context,
minor = (raw_fw_ver >> 16) & 0xffff;
sub_minor = raw_fw_ver & 0xffff;
- snprintf(attr->fw_ver, sizeof attr->fw_ver,
+ snprintf(attr->fw_ver, sizeof(attr->fw_ver),
"%d.%d.%d", major, minor, sub_minor);
return 0;
@@ -92,7 +92,7 @@ static int rxe_query_port(struct ibv_context *context, uint8_t port,
{
struct ibv_query_port cmd;
- return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd);
+ return ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
}
static struct ibv_pd *rxe_alloc_pd(struct ibv_context *context)
@@ -101,11 +101,12 @@ static struct ibv_pd *rxe_alloc_pd(struct ibv_context *context)
struct ib_uverbs_alloc_pd_resp resp;
struct ibv_pd *pd;
- pd = malloc(sizeof *pd);
+ pd = malloc(sizeof(*pd));
if (!pd)
return NULL;
- if (ibv_cmd_alloc_pd(context, pd, &cmd, sizeof cmd, &resp, sizeof resp)) {
+ if (ibv_cmd_alloc_pd(context, pd, &cmd, sizeof(cmd), &resp,
+ sizeof(resp))) {
free(pd);
return NULL;
}
@@ -127,34 +128,38 @@ static int rxe_dealloc_pd(struct ibv_pd *pd)
static struct ibv_mr *rxe_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
uint64_t hca_va, int access)
{
- struct verbs_mr *vmr;
- struct ibv_reg_mr cmd;
- struct ib_uverbs_reg_mr_resp resp;
+ struct rxe_mr *mr;
+ struct ibv_reg_mr cmd = {};
+ struct urxe_reg_mr_resp resp = {};
int ret;
- vmr = malloc(sizeof(*vmr));
- if (!vmr)
+ mr = calloc(1, sizeof(*mr));
+ if (!mr)
return NULL;
- ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, &cmd,
- sizeof(cmd), &resp, sizeof(resp));
+ ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access,
+ &mr->verbs_mr, &cmd, sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp));
if (ret) {
- free(vmr);
+ free(mr);
return NULL;
}
- return &vmr->ibv_mr;
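+ /* save the kernel's MR index; MW bind WRs refer to MRs by index */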
+ mr->index = resp.index;
+
+ return &mr->verbs_mr.ibv_mr;
}
static int rxe_dereg_mr(struct verbs_mr *vmr)
{
int ret;
+ struct rxe_mr *mr = to_rmr(&vmr->ibv_mr);
- ret = ibv_cmd_dereg_mr(vmr);
+ ret = ibv_cmd_dereg_mr(&mr->verbs_mr);
if (ret)
return ret;
- free(vmr);
+ free(mr);
return 0;
}
@@ -166,14 +171,13 @@ static struct ibv_cq *rxe_create_cq(struct ibv_context *context, int cqe,
struct urxe_create_cq_resp resp;
int ret;
- cq = malloc(sizeof *cq);
- if (!cq) {
+ cq = malloc(sizeof(*cq));
+ if (!cq)
return NULL;
- }
ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector,
&cq->ibv_cq, NULL, 0,
- &resp.ibv_resp, sizeof resp);
+ &resp.ibv_resp, sizeof(resp));
if (ret) {
free(cq);
return NULL;
@@ -202,8 +206,8 @@ static int rxe_resize_cq(struct ibv_cq *ibcq, int cqe)
pthread_spin_lock(&cq->lock);
- ret = ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof cmd,
- &resp.ibv_resp, sizeof resp);
+ ret = ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp));
if (ret) {
pthread_spin_unlock(&cq->lock);
return ret;
@@ -277,13 +281,12 @@ static struct ibv_srq *rxe_create_srq(struct ibv_pd *pd,
struct urxe_create_srq_resp resp;
int ret;
- srq = malloc(sizeof *srq);
- if (srq == NULL) {
+ srq = malloc(sizeof(*srq));
+ if (srq == NULL)
return NULL;
- }
- ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, &cmd, sizeof cmd,
- &resp.ibv_resp, sizeof resp);
+ ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, &cmd, sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp));
if (ret) {
free(srq);
return NULL;
@@ -298,6 +301,7 @@ static struct ibv_srq *rxe_create_srq(struct ibv_pd *pd,
return NULL;
}
+ srq->srq_num = resp.srq_num;
srq->mmap_info = resp.mi;
srq->rq.max_sge = attr->attr.max_sge;
pthread_spin_init(&srq->rq.lock, PTHREAD_PROCESS_PRIVATE);
@@ -305,6 +309,15 @@ static struct ibv_srq *rxe_create_srq(struct ibv_pd *pd,
return &srq->ibv_srq;
}
+static int rxe_get_srq_num(struct ibv_srq *ibsrq, uint32_t *srq_num)
+{
+ struct rxe_srq *srq = to_rsrq(ibsrq);
+
+ *srq_num = srq->srq_num;
+
+ return 0;
+}
+
static int rxe_modify_srq(struct ibv_srq *ibsrq,
struct ibv_srq_attr *attr, int attr_mask)
{
@@ -319,9 +332,9 @@ static int rxe_modify_srq(struct ibv_srq *ibsrq,
if (attr_mask & IBV_SRQ_MAX_WR)
pthread_spin_lock(&srq->rq.lock);
- cmd.mmap_info_addr = (__u64)(uintptr_t) & mi;
+ cmd.mmap_info_addr = (__u64)(uintptr_t) &mi;
rc = ibv_cmd_modify_srq(ibsrq, attr, attr_mask,
- &cmd.ibv_cmd, sizeof cmd);
+ &cmd.ibv_cmd, sizeof(cmd));
if (rc)
goto out;
@@ -351,7 +364,7 @@ static int rxe_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr)
{
struct ibv_query_srq cmd;
- return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
+ return ibv_cmd_query_srq(srq, attr, &cmd, sizeof(cmd));
}
static int rxe_destroy_srq(struct ibv_srq *ibvsrq)
@@ -396,9 +409,8 @@ static int rxe_post_one_recv(struct rxe_wq *rq, struct ibv_recv_wr *recv_wr)
memcpy(wqe->dma.sge, recv_wr->sg_list,
wqe->num_sge*sizeof(*wqe->dma.sge));
- for (i = 0; i < wqe->num_sge; i++) {
+ for (i = 0; i < wqe->num_sge; i++)
length += wqe->dma.sge[i].length;
- }
wqe->dma.length = length;
wqe->dma.resid = length;
@@ -444,13 +456,12 @@ static struct ibv_qp *rxe_create_qp(struct ibv_pd *pd,
struct rxe_qp *qp;
int ret;
- qp = malloc(sizeof *qp);
- if (!qp) {
+ qp = malloc(sizeof(*qp));
+ if (!qp)
return NULL;
- }
- ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd, sizeof cmd,
- &resp.ibv_resp, sizeof resp);
+ ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd, sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp));
if (ret) {
free(qp);
return NULL;
@@ -501,7 +512,7 @@ static int rxe_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
struct ibv_query_qp cmd;
return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr,
- &cmd, sizeof cmd);
+ &cmd, sizeof(cmd));
}
static int rxe_modify_qp(struct ibv_qp *ibvqp,
@@ -510,7 +521,7 @@ static int rxe_modify_qp(struct ibv_qp *ibvqp,
{
struct ibv_modify_qp cmd = {};
- return ibv_cmd_modify_qp(ibvqp, attr, attr_mask, &cmd, sizeof cmd);
+ return ibv_cmd_modify_qp(ibvqp, attr, attr_mask, &cmd, sizeof(cmd));
}
static int rxe_destroy_qp(struct ibv_qp *ibv_qp)
@@ -531,199 +542,6 @@ static int rxe_destroy_qp(struct ibv_qp *ibv_qp)
return ret;
}
-/* basic sanity checks for send work request */
-static int validate_send_wr(struct rxe_wq *sq, struct ibv_send_wr *ibwr,
- unsigned int length)
-{
- enum ibv_wr_opcode opcode = ibwr->opcode;
-
- if (ibwr->num_sge > sq->max_sge)
- return -EINVAL;
-
- if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP)
- || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD))
- if (length < 8 || ibwr->wr.atomic.remote_addr & 0x7)
- return -EINVAL;
-
- if ((ibwr->send_flags & IBV_SEND_INLINE) && (length > sq->max_inline))
- return -EINVAL;
-
- return 0;
-}
-
-static void convert_send_wr(struct rxe_send_wr *kwr, struct ibv_send_wr *uwr)
-{
- memset(kwr, 0, sizeof(*kwr));
-
- kwr->wr_id = uwr->wr_id;
- kwr->num_sge = uwr->num_sge;
- kwr->opcode = uwr->opcode;
- kwr->send_flags = uwr->send_flags;
- kwr->ex.imm_data = uwr->imm_data;
-
- switch(uwr->opcode) {
- case IBV_WR_RDMA_WRITE:
- case IBV_WR_RDMA_WRITE_WITH_IMM:
- case IBV_WR_RDMA_READ:
- kwr->wr.rdma.remote_addr = uwr->wr.rdma.remote_addr;
- kwr->wr.rdma.rkey = uwr->wr.rdma.rkey;
- break;
-
- case IBV_WR_SEND:
- case IBV_WR_SEND_WITH_IMM:
- kwr->wr.ud.remote_qpn = uwr->wr.ud.remote_qpn;
- kwr->wr.ud.remote_qkey = uwr->wr.ud.remote_qkey;
- break;
-
- case IBV_WR_ATOMIC_CMP_AND_SWP:
- case IBV_WR_ATOMIC_FETCH_AND_ADD:
- kwr->wr.atomic.remote_addr = uwr->wr.atomic.remote_addr;
- kwr->wr.atomic.compare_add = uwr->wr.atomic.compare_add;
- kwr->wr.atomic.swap = uwr->wr.atomic.swap;
- kwr->wr.atomic.rkey = uwr->wr.atomic.rkey;
- break;
-
- case IBV_WR_LOCAL_INV:
- case IBV_WR_BIND_MW:
- case IBV_WR_SEND_WITH_INV:
- case IBV_WR_TSO:
- case IBV_WR_DRIVER1:
- break;
- }
-}
-
-static int init_send_wqe(struct rxe_qp *qp, struct rxe_wq *sq,
- struct ibv_send_wr *ibwr, unsigned int length,
- struct rxe_send_wqe *wqe)
-{
- int num_sge = ibwr->num_sge;
- int i;
- unsigned int opcode = ibwr->opcode;
-
- convert_send_wr(&wqe->wr, ibwr);
-
- if (qp_type(qp) == IBV_QPT_UD)
- memcpy(&wqe->av, &to_rah(ibwr->wr.ud.ah)->av,
- sizeof(struct rxe_av));
-
- if (ibwr->send_flags & IBV_SEND_INLINE) {
- uint8_t *inline_data = wqe->dma.inline_data;
-
- for (i = 0; i < num_sge; i++) {
- memcpy(inline_data,
- (uint8_t *)(long)ibwr->sg_list[i].addr,
- ibwr->sg_list[i].length);
- inline_data += ibwr->sg_list[i].length;
- }
- } else
- memcpy(wqe->dma.sge, ibwr->sg_list,
- num_sge*sizeof(struct ibv_sge));
-
- if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP)
- || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD))
- wqe->iova = ibwr->wr.atomic.remote_addr;
- else
- wqe->iova = ibwr->wr.rdma.remote_addr;
- wqe->dma.length = length;
- wqe->dma.resid = length;
- wqe->dma.num_sge = num_sge;
- wqe->dma.cur_sge = 0;
- wqe->dma.sge_offset = 0;
- wqe->state = 0;
- wqe->ssn = qp->ssn++;
-
- return 0;
-}
-
-static int post_one_send(struct rxe_qp *qp, struct rxe_wq *sq,
- struct ibv_send_wr *ibwr)
-{
- int err;
- struct rxe_send_wqe *wqe;
- unsigned int length = 0;
- int i;
-
- for (i = 0; i < ibwr->num_sge; i++)
- length += ibwr->sg_list[i].length;
-
- err = validate_send_wr(sq, ibwr, length);
- if (err) {
- printf("validate send failed\n");
- return err;
- }
-
- wqe = (struct rxe_send_wqe *)producer_addr(sq->queue);
-
- err = init_send_wqe(qp, sq, ibwr, length, wqe);
- if (err)
- return err;
-
- if (queue_full(sq->queue))
- return -ENOMEM;
-
- advance_producer(sq->queue);
-
- return 0;
-}
-
-/* send a null post send as a doorbell */
-static int post_send_db(struct ibv_qp *ibqp)
-{
- struct ibv_post_send cmd;
- struct ib_uverbs_post_send_resp resp;
-
- cmd.hdr.command = IB_USER_VERBS_CMD_POST_SEND;
- cmd.hdr.in_words = sizeof(cmd) / 4;
- cmd.hdr.out_words = sizeof(resp) / 4;
- cmd.response = (uintptr_t)&resp;
- cmd.qp_handle = ibqp->handle;
- cmd.wr_count = 0;
- cmd.sge_count = 0;
- cmd.wqe_size = sizeof(struct ibv_send_wr);
-
- if (write(ibqp->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd))
- return errno;
-
- return 0;
-}
-
-/* this API does not make a distinction between
- restartable and non-restartable errors */
-static int rxe_post_send(struct ibv_qp *ibqp,
- struct ibv_send_wr *wr_list,
- struct ibv_send_wr **bad_wr)
-{
- int rc = 0;
- int err;
- struct rxe_qp *qp = to_rqp(ibqp);
- struct rxe_wq *sq = &qp->sq;
-
- if (!bad_wr)
- return EINVAL;
-
- *bad_wr = NULL;
-
- if (!sq || !wr_list || !sq->queue)
- return EINVAL;
-
- pthread_spin_lock(&sq->lock);
-
- while (wr_list) {
- rc = post_one_send(qp, sq, wr_list);
- if (rc) {
- *bad_wr = wr_list;
- break;
- }
-
- wr_list = wr_list->next;
- }
-
- pthread_spin_unlock(&sq->lock);
-
- err = post_send_db(ibqp);
- return err ? err : rc;
-}
-
static int rxe_post_recv(struct ibv_qp *ibqp,
struct ibv_recv_wr *recv_wr,
struct ibv_recv_wr **bad_wr)
@@ -792,7 +610,7 @@ static struct ibv_ah *rxe_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
return NULL;
}
- ah = malloc(sizeof *ah);
+ ah = malloc(sizeof(*ah));
if (ah == NULL)
return NULL;
@@ -860,6 +678,10 @@ static const struct verbs_context_ops rxe_ctx_ops = {
.attach_mcast = ibv_cmd_attach_mcast,
.detach_mcast = ibv_cmd_detach_mcast,
.free_context = rxe_free_context,
+ .alloc_mw = rxe_alloc_mw,
+ .bind_mw = rxe_bind_mw,
+ .dealloc_mw = rxe_dealloc_mw,
+ .get_srq_num = rxe_get_srq_num,
};
static struct verbs_context *rxe_alloc_context(struct ibv_device *ibdev,
@@ -876,7 +698,7 @@ static struct verbs_context *rxe_alloc_context(struct ibv_device *ibdev,
return NULL;
if (ibv_cmd_get_context(&context->ibv_ctx, &cmd,
- sizeof cmd, &resp, sizeof resp))
+ sizeof(cmd), &resp, sizeof(resp)))
goto out;
verbs_set_ops(&context->ibv_ctx, &rxe_ctx_ops);
@@ -907,6 +729,7 @@ static void rxe_uninit_device(struct verbs_device *verbs_device)
static struct verbs_device *rxe_device_alloc(struct verbs_sysfs_dev *sysfs_dev)
{
struct rxe_device *dev;
+
dev = calloc(1, sizeof(*dev));
if (!dev)
return NULL;
@@ -94,6 +94,25 @@ struct rxe_srq {
uint32_t srq_num;
};
+struct rxe_mr {
+ struct verbs_mr verbs_mr;
+ uint32_t index;
+};
+
+/* private flags to rxe_post_one_send */
+enum rxe_send_flags {
+ /* marks bind WRs submitted through the
+ * verbs API, as opposed to user-posted
+ * send WRs
+ */
+ RXE_BIND_MW = (1 << 0),
+};
+
+struct rxe_mw {
+ struct ibv_mw ibv_mw;
+ uint32_t index;
+};
+
#define to_rxxx(xxx, type) container_of(ib##xxx, struct rxe_##type, ibv_##xxx)
static inline struct rxe_context *to_rctx(struct ibv_context *ibctx)
@@ -126,4 +145,29 @@ static inline struct rxe_ah *to_rah(struct ibv_ah *ibah)
return to_rxxx(ah, ah);
}
+static inline struct rxe_mr *to_rmr(struct ibv_mr *ibmr)
+{
+ return container_of(ibmr, struct rxe_mr, verbs_mr.ibv_mr);
+}
+
+static inline struct rxe_mw *to_rmw(struct ibv_mw *ibmw)
+{
+ return to_rxxx(mw, mw);
+}
+
+/* rxe_mw.c */
+struct ibv_mw *rxe_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type);
+int rxe_dealloc_mw(struct ibv_mw *mw);
+int rxe_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
+ struct ibv_mw_bind *mw_bind);
+
+/* rxe_sq.c */
+int rxe_post_one_send(struct rxe_qp *qp, struct ibv_send_wr *ibwr,
+ unsigned int length, enum rxe_send_flags flags);
+
+int rxe_post_send_db(struct ibv_qp *ibqp);
+
+int rxe_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr_list,
+ struct ibv_send_wr **bad_wr);
+
#endif /* RXE_H */
new file mode 100644
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2020 Hewlett Packard Enterprise, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <endian.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <pthread.h>
+#include <netinet/in.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <stddef.h>
+
+#include "rxe.h"
+#include "rxe_queue.h"
+#include <rdma/rdma_user_rxe.h>
+#include "rxe-abi.h"
+
+struct ibv_mw *rxe_alloc_mw(struct ibv_pd *ibpd, enum ibv_mw_type type)
+{
+ int ret;
+ struct rxe_mw *mw;
+ struct ibv_alloc_mw cmd = {};
+ struct urxe_alloc_mw_resp resp = {};
+
+ mw = calloc(1, sizeof(*mw));
+ if (!mw)
+ return NULL;
+
+ ret = ibv_cmd_alloc_mw(ibpd, type, &mw->ibv_mw, &cmd, sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp));
+ if (ret) {
+ free(mw);
+ return NULL;
+ }
+
+ mw->index = resp.index;
+
+ return &mw->ibv_mw;
+}
+
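+/* advance only the low 8-bit tag of the rkey; the upper bits,
+ * which identify the MW, are left unchanged
+ */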
+static int next_rkey(int rkey)
+{
+ return (rkey & 0xffffff00) | ((rkey + 1) & 0x000000ff);
+}
+
+/* private version of rxe_post_one_send to set flags field in wqe */
+int rxe_bind_mw(struct ibv_qp *ibqp, struct ibv_mw *ibmw,
+ struct ibv_mw_bind *mw_bind)
+{
+ int ret;
+ struct rxe_qp *qp = to_rqp(ibqp);
+ struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info;
+ struct ibv_send_wr ibwr;
+
+ if (!bind_info->mr && (bind_info->addr || bind_info->length)) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (bind_info->mr) {
+ /* more validation could be done here; see the mlx5 provider */
+ if (ibmw->pd != bind_info->mr->pd) {
+ ret = EPERM;
+ goto err;
+ }
+ }
+
+ memset(&ibwr, 0, sizeof(ibwr));
+
+ ibwr.opcode = IBV_WR_BIND_MW;
+ ibwr.next = NULL;
+ ibwr.wr_id = mw_bind->wr_id;
+ ibwr.send_flags = mw_bind->send_flags;
+ ibwr.bind_mw.bind_info = mw_bind->bind_info;
+ ibwr.bind_mw.mw = ibmw;
+ ibwr.bind_mw.rkey = next_rkey(ibmw->rkey);
+
+ pthread_spin_lock(&qp->sq.lock);
+ ret = rxe_post_one_send(qp, &ibwr, 0, RXE_BIND_MW);
+ pthread_spin_unlock(&qp->sq.lock);
+
+ if (!ret)
+ ret = rxe_post_send_db(ibqp);
+
+ if (ret)
+ goto err;
+
+ /* the caller must undo this if the bind completes with an error wc */
+ ibmw->rkey = ibwr.bind_mw.rkey;
+
+ return 0;
+err:
+ errno = ret;
+ return errno;
+}
+
+int rxe_dealloc_mw(struct ibv_mw *ibmw)
+{
+ struct rxe_mw *mw = to_rmw(ibmw);
+ int ret;
+
+ ret = ibv_cmd_dealloc_mw(ibmw);
+ if (ret)
+ return ret;
+
+ free(mw);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2020 Hewlett Packard Enterprise, Inc. All rights reserved.
+ * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved.
+ * Copyright (C) 2006-2007 QLogic Corporation, All rights reserved.
+ * Copyright (c) 2005. PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <endian.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <pthread.h>
+#include <netinet/in.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <stddef.h>
+
+#include "rxe.h"
+#include "rxe_queue.h"
+#include <rdma/rdma_user_rxe.h>
+#include "rxe-abi.h"
+
+/* basic sanity checks for send work request */
+static int validate_send_wr(struct rxe_qp *qp,
+ struct ibv_send_wr *ibwr,
+ unsigned int length)
+{
+ struct rxe_wq *sq = &qp->sq;
+ enum ibv_wr_opcode opcode = ibwr->opcode;
+
+ if (ibwr->num_sge > sq->max_sge)
+ goto err;
+
+ if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP)
+ || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD))
+ if (length < 8 || ibwr->wr.atomic.remote_addr & 0x7)
+ goto err;
+
+ if ((ibwr->send_flags & IBV_SEND_INLINE) &&
+ (length > sq->max_inline))
+ goto err;
+
+ if (opcode == IBV_WR_BIND_MW) {
+ if (length)
+ goto err;
+ if (ibwr->num_sge)
+ goto err;
+ if (ibwr->imm_data)
+ goto err;
+ if ((qp_type(qp) != IBV_QPT_RC) &&
+ (qp_type(qp) != IBV_QPT_UC) &&
+ (qp_type(qp) != IBV_QPT_XRC_SEND))
+ goto err;
+ /* only type 2 MWs may be bound by a posted
+ * send wr; ibv_bind_mw skips this check by
+ * calling rxe_post_one_send() directly
+ */
+ if (ibwr->bind_mw.mw->type == IBV_MW_TYPE_1)
+ goto err;
+ }
+
+ return 0;
+err:
+ errno = EINVAL;
+ return errno;
+}
+
+static int convert_send_wr(struct rxe_send_wr *kwr, struct ibv_send_wr *uwr)
+{
+ struct rxe_mr *mr;
+ struct rxe_mw *mw;
+
+ memset(kwr, 0, sizeof(*kwr));
+
+ kwr->wr_id = uwr->wr_id;
+ kwr->num_sge = uwr->num_sge;
+ kwr->opcode = uwr->opcode;
+ kwr->send_flags = uwr->send_flags;
+ kwr->ex.imm_data = uwr->imm_data;
+
+ switch (uwr->opcode) {
+ case IBV_WR_RDMA_WRITE:
+ case IBV_WR_RDMA_WRITE_WITH_IMM:
+ case IBV_WR_RDMA_READ:
+ kwr->wr.rdma.remote_addr = uwr->wr.rdma.remote_addr;
+ kwr->wr.rdma.rkey = uwr->wr.rdma.rkey;
+ break;
+
+ case IBV_WR_SEND:
+ case IBV_WR_SEND_WITH_IMM:
+ kwr->wr.ud.remote_qpn = uwr->wr.ud.remote_qpn;
+ kwr->wr.ud.remote_qkey = uwr->wr.ud.remote_qkey;
+ break;
+
+ case IBV_WR_ATOMIC_CMP_AND_SWP:
+ case IBV_WR_ATOMIC_FETCH_AND_ADD:
+ kwr->wr.atomic.remote_addr = uwr->wr.atomic.remote_addr;
+ kwr->wr.atomic.compare_add = uwr->wr.atomic.compare_add;
+ kwr->wr.atomic.swap = uwr->wr.atomic.swap;
+ kwr->wr.atomic.rkey = uwr->wr.atomic.rkey;
+ break;
+
+ case IBV_WR_BIND_MW:
+ mw = to_rmw(uwr->bind_mw.mw);
+ /* an unbind request (zero length) may carry a NULL mr */
+ mr = uwr->bind_mw.bind_info.mr ?
+ to_rmr(uwr->bind_mw.bind_info.mr) : NULL;
+
+ kwr->wr.umw.addr = uwr->bind_mw.bind_info.addr;
+ kwr->wr.umw.length = uwr->bind_mw.bind_info.length;
+ kwr->wr.umw.mr_index = mr ? mr->index : 0;
+ kwr->wr.umw.mw_index = mw->index;
+ kwr->wr.umw.rkey = uwr->bind_mw.rkey;
+ kwr->wr.umw.access = uwr->bind_mw.bind_info.mw_access_flags;
+ break;
+
+ case IBV_WR_LOCAL_INV:
+ case IBV_WR_SEND_WITH_INV:
+ case IBV_WR_TSO:
+ case IBV_WR_DRIVER1:
+ break;
+ default:
+ errno = EINVAL;
+ return errno;
+ }
+
+ return 0;
+}
+
+static int init_send_wqe(struct rxe_qp *qp, struct ibv_send_wr *ibwr,
+ unsigned int length, struct rxe_send_wqe *wqe,
+ enum rxe_send_flags flags)
+{
+ int i;
+ int ret;
+ int num_sge = ibwr->num_sge;
+ unsigned int opcode = ibwr->opcode;
+
+ ret = convert_send_wr(&wqe->wr, ibwr);
+ if (ret)
+ return ret;
+
+ wqe->dma.length = length;
+ wqe->dma.resid = length;
+ wqe->dma.num_sge = num_sge;
+ wqe->dma.cur_sge = 0;
+ wqe->dma.sge_offset = 0;
+ wqe->state = 0;
+ wqe->ssn = qp->ssn++;
+
+ if (qp_type(qp) == IBV_QPT_UD)
+ memcpy(&wqe->av, &to_rah(ibwr->wr.ud.ah)->av,
+ sizeof(struct rxe_av));
+
+ if (ibwr->send_flags & IBV_SEND_INLINE) {
+ uint8_t *inline_data = wqe->dma.inline_data;
+
+ wqe->dma.resid = 0;
+
+ for (i = 0; i < num_sge; i++) {
+ memcpy(inline_data,
+ (uint8_t *)(uintptr_t)ibwr->sg_list[i].addr,
+ ibwr->sg_list[i].length);
+ inline_data += ibwr->sg_list[i].length;
+ }
+ } else {
+ memcpy(wqe->dma.sge, ibwr->sg_list,
+ num_sge*sizeof(struct ibv_sge));
+ }
+
+ if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP) ||
+ (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD))
+ wqe->iova = ibwr->wr.atomic.remote_addr;
+ else
+ wqe->iova = ibwr->wr.rdma.remote_addr;
+
+ /* let the kernel know we came through the verbs API */
+ if (flags & RXE_BIND_MW)
+ wqe->wr.wr.umw.flags = RXE_BIND_MW;
+
+ return 0;
+}
+
+/* call with qp->sq.lock held */
+int rxe_post_one_send(struct rxe_qp *qp, struct ibv_send_wr *ibwr,
+ unsigned int length, enum rxe_send_flags flags)
+{
+ int err;
+ struct rxe_send_wqe *wqe;
+ struct rxe_wq *sq = &qp->sq;
+
+ if (queue_full(sq->queue)) {
+ err = ENOMEM;
+ goto err;
+ }
+
+ wqe = (struct rxe_send_wqe *)producer_addr(sq->queue);
+
+ err = init_send_wqe(qp, ibwr, length, wqe, flags);
+ if (err)
+ goto err;
+
+ advance_producer(sq->queue);
+
+ return 0;
+err:
+ errno = err;
+ return errno;
+}
+
+/* send a null post send as a doorbell */
+int rxe_post_send_db(struct ibv_qp *ibqp)
+{
+ int ret;
+
+ ret = ibv_cmd_post_send(ibqp, NULL, NULL);
+ if (ret)
+ goto err;
+ return 0;
+err:
+ errno = ret;
+ return errno;
+}
+
+/* this API does not make a distinction between
+ * restartable and non-restartable errors
+ */
+int rxe_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr_list,
+ struct ibv_send_wr **bad_wr)
+{
+ int i;
+ int ret = 0;
+ int err = 0;
+ unsigned int length = 0;
+ int work_to_do = 0;
+ struct rxe_qp *qp = to_rqp(ibqp);
+ struct ibv_send_wr *ibwr;
+
+ if (!wr_list || !bad_wr) {
+ err = EINVAL;
+ goto err;
+ }
+
+ *bad_wr = NULL;
+
+ pthread_spin_lock(&qp->sq.lock);
+
+ while (wr_list) {
+ ibwr = wr_list;
+
+ length = 0;
+ for (i = 0; i < ibwr->num_sge; i++)
+ length += ibwr->sg_list[i].length;
+
+ /* moved here from post_one_send so that
+ * rxe_bind_mw can call rxe_post_one_send
+ * without repeating these checks
+ */
+ ret = validate_send_wr(qp, ibwr, length);
+ if (ret) {
+ *bad_wr = ibwr;
+ break;
+ }
+
+ ret = rxe_post_one_send(qp, ibwr, length, 0);
+ if (ret) {
+ *bad_wr = ibwr;
+ break;
+ }
+
+ work_to_do++;
+ wr_list = wr_list->next;
+ }
+
+ pthread_spin_unlock(&qp->sq.lock);
+
+ if (work_to_do)
+ err = rxe_post_send_db(ibqp);
+
+ err = err ? err : ret;
+ if (err)
+ goto err;
+
+ return 0;
+err:
+ errno = err;
+ return errno;
+}
Implemented:
 - ibv_alloc_mw
 - ibv_dealloc_mw
 - ibv_bind_mw
 - post bind MW operations
 - post invalidate rkey commands
 - access MW and MR rkeys for RDMA write, read, and atomic

Depends on the matching kernel patch set.

Signed-off-by: Bob Pearson <rpearson@hpe.com>
---
 kernel-headers/rdma/rdma_user_rxe.h |  50 ++++-
 providers/rxe/CMakeLists.txt        |   2 +
 providers/rxe/rxe-abi.h             |   4 +
 providers/rxe/rxe.c                 | 293 +++++--------------------
 providers/rxe/rxe.h                 |  44 ++++
 providers/rxe/rxe_mw.c              | 149 +++++++++++++
 providers/rxe/rxe_sq.c              | 319 ++++++++++++++++++++++++++++
 7 files changed, 622 insertions(+), 239 deletions(-)
 create mode 100644 providers/rxe/rxe_mw.c
 create mode 100644 providers/rxe/rxe_sq.c
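For context, here is a minimal sketch (not part of the patch) of how an
application could exercise the new MW verbs over an established RC
connection. The pd, qp and mr arguments are assumed to be set up
elsewhere, with the mr registered with IBV_ACCESS_MW_BIND;
exchange_rkey_with_peer() is a hypothetical helper:

#include <errno.h>
#include <stdint.h>
#include <infiniband/verbs.h>

extern void exchange_rkey_with_peer(uint32_t rkey);	/* hypothetical */

int share_window(struct ibv_pd *pd, struct ibv_qp *qp, struct ibv_mr *mr)
{
	struct ibv_mw *mw;
	struct ibv_mw_bind mw_bind = {};
	int ret;

	/* type 2 windows are bound through the send queue */
	mw = ibv_alloc_mw(pd, IBV_MW_TYPE_2);
	if (!mw)
		return errno;

	mw_bind.wr_id = 1;
	mw_bind.send_flags = IBV_SEND_SIGNALED;	/* bind generates a CQE */
	mw_bind.bind_info.mr = mr;
	mw_bind.bind_info.addr = (uint64_t)(uintptr_t)mr->addr;
	mw_bind.bind_info.length = mr->length;
	mw_bind.bind_info.mw_access_flags = IBV_ACCESS_REMOTE_READ |
					    IBV_ACCESS_REMOTE_WRITE;

	/* on success ibv_bind_mw() updates mw->rkey */
	ret = ibv_bind_mw(qp, mw, &mw_bind);
	if (ret) {
		ibv_dealloc_mw(mw);
		return ret;
	}

	/* the peer may now use mw->rkey in RDMA read/write/atomic WRs */
	exchange_rkey_with_peer(mw->rkey);
	return 0;
}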