| Message ID | 20181122121402.13764-6-yuval.shaia@oracle.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | Add support for RDMA MAD |
On 11/22/18 2:13 PM, Yuval Shaia wrote: > MAD (Management Datagram) packets are widely used by various modules > both in kernel and in user space for example the rdma_* API which is > used to create and maintain "connection" layer on top of RDMA uses > several types of MAD packets. > > For more information please refer to chapter 13.4 in Volume 1 > Architecture Specification, Release 1.1 available here: > https://www.infinibandta.org/ibta-specifications-download/ > > To support MAD packets the device uses an external utility > (contrib/rdmacm-mux) to relay packets from and to the guest driver. > > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> > --- > hw/rdma/rdma_backend.c | 275 +++++++++++++++++++++++++++++++++++- > hw/rdma/rdma_backend.h | 4 +- > hw/rdma/rdma_backend_defs.h | 10 +- > hw/rdma/vmw/pvrdma.h | 2 + > hw/rdma/vmw/pvrdma_main.c | 4 +- > 5 files changed, 285 insertions(+), 10 deletions(-) > > diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c > index 1e148398a2..7c220a5798 100644 > --- a/hw/rdma/rdma_backend.c > +++ b/hw/rdma/rdma_backend.c > @@ -16,8 +16,13 @@ > #include "qemu/osdep.h" > #include "qemu/error-report.h" > #include "qapi/error.h" > +#include "qapi/qmp/qlist.h" > +#include "qapi/qmp/qnum.h" > > #include <infiniband/verbs.h> > +#include <infiniband/umad_types.h> > +#include <infiniband/umad.h> > +#include <rdma/rdma_user_cm.h> > > #include "trace.h" > #include "rdma_utils.h" > @@ -33,16 +38,25 @@ > #define VENDOR_ERR_MAD_SEND 0x206 > #define VENDOR_ERR_INVLKEY 0x207 > #define VENDOR_ERR_MR_SMALL 0x208 > +#define VENDOR_ERR_INV_MAD_BUFF 0x209 > +#define VENDOR_ERR_INV_NUM_SGE 0x210 > > #define THR_NAME_LEN 16 > #define THR_POLL_TO 5000 > > +#define MAD_HDR_SIZE sizeof(struct ibv_grh) > + > typedef struct BackendCtx { > - uint64_t req_id; > void *up_ctx; > bool is_tx_req; > + struct ibv_sge sge; /* Used to save MAD recv buffer */ > } BackendCtx; > > +struct backend_umad { > + struct ib_user_mad hdr; > + char mad[RDMA_MAX_PRIVATE_DATA]; > +}; > + > static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx); > > static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx) > @@ -286,6 +300,61 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res, > return 0; > } > > +static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge, > + uint32_t num_sge) > +{ > + struct backend_umad umad = {0}; > + char *hdr, *msg; > + int ret; > + > + pr_dbg("num_sge=%d\n", num_sge); > + > + if (num_sge != 2) { > + return -EINVAL; > + } > + > + umad.hdr.length = sge[0].length + sge[1].length; > + pr_dbg("msg_len=%d\n", umad.hdr.length); > + > + if (umad.hdr.length > sizeof(umad.mad)) { > + return -ENOMEM; > + } > + > + umad.hdr.addr.qpn = htobe32(1); > + umad.hdr.addr.grh_present = 1; > + umad.hdr.addr.gid_index = backend_dev->backend_gid_idx; > + memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid)); > + umad.hdr.addr.hop_limit = 1; > + > + hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length); > + if (!hdr) { > + pr_dbg("Fail to map to sge[0]\n"); > + return -ENOMEM; > + } > + msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length); > + if (!msg) { > + pr_dbg("Fail to map to sge[1]\n"); > + rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length); > + return -ENOMEM; > + } > + > + pr_dbg_buf("mad_hdr", hdr, sge[0].length); > + pr_dbg_buf("mad_data", data, sge[1].length); > + > + memcpy(&umad.mad[0], hdr, sge[0].length); > + memcpy(&umad.mad[sge[0].length], msg, 
sge[1].length); > + > + rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length); > + rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length); > + > + ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad, > + sizeof(umad)); > + > + pr_dbg("qemu_chr_fe_write=%d\n", ret); > + > + return (ret != sizeof(umad)); > +} > + > void rdma_backend_post_send(RdmaBackendDev *backend_dev, > RdmaBackendQP *qp, uint8_t qp_type, > struct ibv_sge *sge, uint32_t num_sge, > @@ -304,9 +373,13 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev, > comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx); > } else if (qp_type == IBV_QPT_GSI) { > pr_dbg("QP1\n"); > - comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); > + rc = mad_send(backend_dev, sge, num_sge); > + if (rc) { > + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); > + } else { > + comp_handler(IBV_WC_SUCCESS, 0, ctx); > + } > } > - pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type); > return; > } > > @@ -370,6 +443,48 @@ out_free_bctx: > g_free(bctx); > } > > +static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev, > + struct ibv_sge *sge, uint32_t num_sge, > + void *ctx) > +{ > + BackendCtx *bctx; > + int rc; > + uint32_t bctx_id; > + > + if (num_sge != 1) { > + pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge); Maybe not for this patch set, but please consider using QEMU's error-reporting facilities for errors instead of debug messages when necessary. The same goes for using traces on the normal flow instead of pr_dbg, but this has a lower priority than error reporting. > + return VENDOR_ERR_INV_NUM_SGE; > + } > + > + if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) { > + pr_dbg("Too small buffer for MAD\n"); > + return VENDOR_ERR_INV_MAD_BUFF; > + } > + > + pr_dbg("addr=0x%" PRIx64"\n", sge[0].addr); > + pr_dbg("length=%d\n", sge[0].length); > + pr_dbg("lkey=%d\n", sge[0].lkey); > + > + bctx = g_malloc0(sizeof(*bctx)); > + > + rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx); > + if (unlikely(rc)) { > + g_free(bctx); > + pr_dbg("Fail to allocate cqe_ctx\n"); > + return VENDOR_ERR_NOMEM; > + } > + > + pr_dbg("bctx_id %d, bctx %p, ctx %p\n", bctx_id, bctx, ctx); > + bctx->up_ctx = ctx; > + bctx->sge = *sge; > + > + qemu_mutex_lock(&backend_dev->recv_mads_list.lock); > + qlist_append_int(backend_dev->recv_mads_list.list, bctx_id); > + qemu_mutex_unlock(&backend_dev->recv_mads_list.lock); > + > + return 0; > +} > + > void rdma_backend_post_recv(RdmaBackendDev *backend_dev, > RdmaDeviceResources *rdma_dev_res, > RdmaBackendQP *qp, uint8_t qp_type, > @@ -388,7 +503,10 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev, > } > if (qp_type == IBV_QPT_GSI) { > pr_dbg("QP1\n"); > - comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); > + rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx); > + if (rc) { > + comp_handler(IBV_WC_GENERAL_ERR, rc, ctx); > + } > } > return; > } > @@ -517,7 +635,6 @@ int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type, > > switch (qp_type) { > case IBV_QPT_GSI: > - pr_dbg("QP1 unsupported\n"); > return 0; > > case IBV_QPT_RC: > @@ -748,11 +865,146 @@ static int init_device_caps(RdmaBackendDev *backend_dev, > return 0; > } > > +static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid, > + union ibv_gid *my_gid, int paylen) > +{ > + grh->paylen = htons(paylen); > + grh->sgid = *sgid; > + grh->dgid = *my_gid; > + > + pr_dbg("paylen=%d (net=0x%x)\n", paylen, grh->paylen); > +
pr_dbg("my_gid=0x%llx\n", my_gid->global.interface_id); > + pr_dbg("gid=0x%llx\n", sgid->global.interface_id); > +} > + > +static inline int mad_can_receieve(void *opaque) > +{ > + return sizeof(struct backend_umad); > +} > + > +static void mad_read(void *opaque, const uint8_t *buf, int size) > +{ > + RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque; > + QObject *o_ctx_id; > + unsigned long cqe_ctx_id; > + BackendCtx *bctx; > + char *mad; > + struct backend_umad *umad; > + > + assert(size != sizeof(umad)); > + umad = (struct backend_umad *)buf; > + > + pr_dbg("Got %d bytes\n", size); > + pr_dbg("umad->hdr.length=%d\n", umad->hdr.length); > + > +#ifdef PVRDMA_DEBUG > + struct umad_hdr *hdr = (struct umad_hdr *)&msg->umad.mad; > + pr_dbg("bv %x cls %x cv %x mtd %x st %d tid %" PRIx64 " at %x atm %x\n", > + hdr->base_version, hdr->mgmt_class, hdr->class_version, > + hdr->method, hdr->status, be64toh(hdr->tid), > + hdr->attr_id, hdr->attr_mod); > +#endif > + > + qemu_mutex_lock(&backend_dev->recv_mads_list.lock); > + o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list); > + qemu_mutex_unlock(&backend_dev->recv_mads_list.lock); > + if (!o_ctx_id) { > + pr_dbg("No more free MADs buffers, waiting for a while\n"); > + sleep(THR_POLL_TO); > + return; > + } > + > + cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id)); > + bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id); > + if (unlikely(!bctx)) { > + pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id); > + return; > + } > + > + pr_dbg("id %ld, bctx %p, ctx %p\n", cqe_ctx_id, bctx, bctx->up_ctx); > + > + mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr, > + bctx->sge.length); > + if (!mad || bctx->sge.length < umad->hdr.length + MAD_HDR_SIZE) { > + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF, > + bctx->up_ctx); > + } else { > + memset(mad, 0, bctx->sge.length); > + build_mad_hdr((struct ibv_grh *)mad, > + (union ibv_gid *)&umad->hdr.addr.gid, > + &backend_dev->gid, umad->hdr.length); > + memcpy(&mad[MAD_HDR_SIZE], umad->mad, umad->hdr.length); > + rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length); > + > + comp_handler(IBV_WC_SUCCESS, 0, bctx->up_ctx); > + } > + > + g_free(bctx); > + rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id); > +} > + > +static int mad_init(RdmaBackendDev *backend_dev) > +{ > + struct backend_umad umad = {0}; > + int ret; > + > + if (!qemu_chr_fe_backend_connected(backend_dev->mad_chr_be)) { > + pr_dbg("Missing chardev for MAD multiplexer\n"); > + return -EIO; > + } > + > + qemu_chr_fe_set_handlers(backend_dev->mad_chr_be, mad_can_receieve, > + mad_read, NULL, NULL, backend_dev, NULL, true); > + > + /* Register ourself */ > + memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid)); > + ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad, > + sizeof(umad.hdr)); > + if (ret != sizeof(umad.hdr)) { > + pr_dbg("Fail to register to rdma_umadmux (%d)\n", ret); > + } > + > + qemu_mutex_init(&backend_dev->recv_mads_list.lock); > + backend_dev->recv_mads_list.list = qlist_new(); > + > + return 0; > +} > + > +static void mad_stop(RdmaBackendDev *backend_dev) > +{ > + QObject *o_ctx_id; > + unsigned long cqe_ctx_id; > + BackendCtx *bctx; > + > + pr_dbg("Closing MAD\n"); > + > + /* Clear MAD buffers list */ > + qemu_mutex_lock(&backend_dev->recv_mads_list.lock); > + do { > + o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list); I suppose you cal lock only the above line, but maybe it doesn't matter at this point. 
> + if (o_ctx_id) { > + cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id)); > + bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id); > + if (bctx) { > + rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id); > + g_free(bctx); > + } > + } > + } while (o_ctx_id); > + qemu_mutex_unlock(&backend_dev->recv_mads_list.lock); > +} > + > +static void mad_fini(RdmaBackendDev *backend_dev) > +{ > + qlist_destroy_obj(QOBJECT(backend_dev->recv_mads_list.list)); > + qemu_mutex_destroy(&backend_dev->recv_mads_list.lock); > +} > + > int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, > RdmaDeviceResources *rdma_dev_res, > const char *backend_device_name, uint8_t port_num, > uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr, > - Error **errp) > + CharBackend *mad_chr_be, Error **errp) > { > int i; > int ret = 0; > @@ -763,7 +1015,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, > memset(backend_dev, 0, sizeof(*backend_dev)); > > backend_dev->dev = pdev; > - > + backend_dev->mad_chr_be = mad_chr_be; > backend_dev->backend_gid_idx = backend_gid_idx; > backend_dev->port_num = port_num; > backend_dev->rdma_dev_res = rdma_dev_res; > @@ -854,6 +1106,13 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, > pr_dbg("interface_id=0x%" PRIx64 "\n", > be64_to_cpu(backend_dev->gid.global.interface_id)); > > + ret = mad_init(backend_dev); > + if (ret) { > + error_setg(errp, "Fail to initialize mad"); > + ret = -EIO; > + goto out_destroy_comm_channel; > + } > + > backend_dev->comp_thread.run = false; > backend_dev->comp_thread.is_running = false; > > @@ -885,11 +1144,13 @@ void rdma_backend_stop(RdmaBackendDev *backend_dev) > { > pr_dbg("Stopping rdma_backend\n"); > stop_backend_thread(&backend_dev->comp_thread); > + mad_stop(backend_dev); > } > > void rdma_backend_fini(RdmaBackendDev *backend_dev) > { > rdma_backend_stop(backend_dev); > + mad_fini(backend_dev); > g_hash_table_destroy(ah_hash); > ibv_destroy_comp_channel(backend_dev->channel); > ibv_close_device(backend_dev->context); > diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h > index 3ccc9a2494..fc83330251 100644 > --- a/hw/rdma/rdma_backend.h > +++ b/hw/rdma/rdma_backend.h > @@ -17,6 +17,8 @@ > #define RDMA_BACKEND_H > > #include "qapi/error.h" > +#include "chardev/char-fe.h" > + > #include "rdma_rm_defs.h" > #include "rdma_backend_defs.h" > > @@ -50,7 +52,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, > RdmaDeviceResources *rdma_dev_res, > const char *backend_device_name, uint8_t port_num, > uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr, > - Error **errp); > + CharBackend *mad_chr_be, Error **errp); > void rdma_backend_fini(RdmaBackendDev *backend_dev); > void rdma_backend_start(RdmaBackendDev *backend_dev); > void rdma_backend_stop(RdmaBackendDev *backend_dev); > diff --git a/hw/rdma/rdma_backend_defs.h b/hw/rdma/rdma_backend_defs.h > index 7404f64002..2a7e667075 100644 > --- a/hw/rdma/rdma_backend_defs.h > +++ b/hw/rdma/rdma_backend_defs.h > @@ -16,8 +16,9 @@ > #ifndef RDMA_BACKEND_DEFS_H > #define RDMA_BACKEND_DEFS_H > > -#include <infiniband/verbs.h> > #include "qemu/thread.h" > +#include "chardev/char-fe.h" > +#include <infiniband/verbs.h> > > typedef struct RdmaDeviceResources RdmaDeviceResources; > > @@ -28,6 +29,11 @@ typedef struct RdmaBackendThread { > bool is_running; /* Set by the thread to report its status */ > } RdmaBackendThread; > > +typedef struct RecvMadList { > + QemuMutex lock; > + QList *list; > 
+} RecvMadList; > + > typedef struct RdmaBackendDev { > struct ibv_device_attr dev_attr; > RdmaBackendThread comp_thread; > @@ -39,6 +45,8 @@ typedef struct RdmaBackendDev { > struct ibv_comp_channel *channel; > uint8_t port_num; > uint8_t backend_gid_idx; > + RecvMadList recv_mads_list; > + CharBackend *mad_chr_be; > } RdmaBackendDev; > > typedef struct RdmaBackendPD { > diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h > index e2d9f93cdf..e3742d893a 100644 > --- a/hw/rdma/vmw/pvrdma.h > +++ b/hw/rdma/vmw/pvrdma.h > @@ -19,6 +19,7 @@ > #include "qemu/units.h" > #include "hw/pci/pci.h" > #include "hw/pci/msix.h" > +#include "chardev/char-fe.h" > > #include "../rdma_backend_defs.h" > #include "../rdma_rm_defs.h" > @@ -83,6 +84,7 @@ typedef struct PVRDMADev { > uint8_t backend_port_num; > RdmaBackendDev backend_dev; > RdmaDeviceResources rdma_dev_res; > + CharBackend mad_chr; > } PVRDMADev; > #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME) > > diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c > index ca5fa8d981..6c8c0154fa 100644 > --- a/hw/rdma/vmw/pvrdma_main.c > +++ b/hw/rdma/vmw/pvrdma_main.c > @@ -51,6 +51,7 @@ static Property pvrdma_dev_properties[] = { > DEFINE_PROP_INT32("dev-caps-max-qp-init-rd-atom", PVRDMADev, > dev_attr.max_qp_init_rd_atom, MAX_QP_INIT_RD_ATOM), > DEFINE_PROP_INT32("dev-caps-max-ah", PVRDMADev, dev_attr.max_ah, MAX_AH), > + DEFINE_PROP_CHR("mad-chardev", PVRDMADev, mad_chr), > DEFINE_PROP_END_OF_LIST(), > }; > > @@ -613,7 +614,8 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp) > > rc = rdma_backend_init(&dev->backend_dev, pdev, &dev->rdma_dev_res, > dev->backend_device_name, dev->backend_port_num, > - dev->backend_gid_idx, &dev->dev_attr, errp); > + dev->backend_gid_idx, &dev->dev_attr, &dev->mad_chr, > + errp); > if (rc) { > goto out; > } I only have a few minor comments, but it looks OK to me anyway. Reviewed-by: Marcel Apfelbaum<marcel.apfelbaum@gmail.com> Thanks, Marcel
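For readers following the send path under review: mad_send() flattens the guest's two scatter-gather entries (the MAD header and the MAD payload) into one fixed-size message for the multiplexer socket. Below is a minimal, self-contained sketch of just that marshaling step; `struct mux_msg` and `MUX_MAX_MAD` are simplified stand-ins for the patch's `backend_umad` and `RDMA_MAX_PRIVATE_DATA`, not the kernel ABI.

```c
/*
 * Sketch of the two-SGE concatenation done by mad_send().  The struct
 * below is illustrative only; the real message carries a full
 * struct ib_user_mad header.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MUX_MAX_MAD 256                /* RDMA_MAX_PRIVATE_DATA stand-in */

struct mux_msg {
    uint32_t length;                   /* total MAD length: hdr + payload */
    uint8_t  mad[MUX_MAX_MAD];
};

static int marshal_mad(struct mux_msg *msg,
                       const void *hdr, size_t hdr_len,
                       const void *payload, size_t pay_len)
{
    if (hdr_len + pay_len > sizeof(msg->mad)) {
        return -ENOMEM;                /* same overflow check as mad_send() */
    }
    msg->length = hdr_len + pay_len;
    memcpy(msg->mad, hdr, hdr_len);                 /* sge[0] */
    memcpy(msg->mad + hdr_len, payload, pay_len);   /* sge[1] */
    return 0;
}

int main(void)
{
    struct mux_msg msg = {0};
    uint8_t mad_hdr[24] = {0x01};      /* MAD base_version = 1 */
    uint8_t mad_data[40] = {0};

    if (!marshal_mad(&msg, mad_hdr, sizeof(mad_hdr),
                     mad_data, sizeof(mad_data))) {
        printf("marshaled %u bytes\n", msg.length);
    }
    return 0;
}
```

Anything larger than the 256-byte MAD area is rejected before any copy, which is what the patch's `umad.hdr.length > sizeof(umad.mad)` test guards against.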
On Sun, Nov 25, 2018 at 09:05:17AM +0200, Marcel Apfelbaum wrote: > > > On 11/22/18 2:13 PM, Yuval Shaia wrote: > > MAD (Management Datagram) packets are widely used by various modules > > both in kernel and in user space for example the rdma_* API which is > > used to create and maintain "connection" layer on top of RDMA uses > > several types of MAD packets. > > > > For more information please refer to chapter 13.4 in Volume 1 > > Architecture Specification, Release 1.1 available here: > > https://www.infinibandta.org/ibta-specifications-download/ > > > > To support MAD packets the device uses an external utility > > (contrib/rdmacm-mux) to relay packets from and to the guest driver. > > > > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> > > --- > > hw/rdma/rdma_backend.c | 275 +++++++++++++++++++++++++++++++++++- > > hw/rdma/rdma_backend.h | 4 +- > > hw/rdma/rdma_backend_defs.h | 10 +- > > hw/rdma/vmw/pvrdma.h | 2 + > > hw/rdma/vmw/pvrdma_main.c | 4 +- > > 5 files changed, 285 insertions(+), 10 deletions(-) > > > > diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c > > index 1e148398a2..7c220a5798 100644 > > --- a/hw/rdma/rdma_backend.c > > +++ b/hw/rdma/rdma_backend.c > > @@ -16,8 +16,13 @@ > > #include "qemu/osdep.h" > > #include "qemu/error-report.h" > > #include "qapi/error.h" > > +#include "qapi/qmp/qlist.h" > > +#include "qapi/qmp/qnum.h" > > #include <infiniband/verbs.h> > > +#include <infiniband/umad_types.h> > > +#include <infiniband/umad.h> > > +#include <rdma/rdma_user_cm.h> > > #include "trace.h" > > #include "rdma_utils.h" > > @@ -33,16 +38,25 @@ > > #define VENDOR_ERR_MAD_SEND 0x206 > > #define VENDOR_ERR_INVLKEY 0x207 > > #define VENDOR_ERR_MR_SMALL 0x208 > > +#define VENDOR_ERR_INV_MAD_BUFF 0x209 > > +#define VENDOR_ERR_INV_NUM_SGE 0x210 > > #define THR_NAME_LEN 16 > > #define THR_POLL_TO 5000 > > +#define MAD_HDR_SIZE sizeof(struct ibv_grh) > > + > > typedef struct BackendCtx { > > - uint64_t req_id; > > void *up_ctx; > > bool is_tx_req; > > + struct ibv_sge sge; /* Used to save MAD recv buffer */ > > } BackendCtx; > > +struct backend_umad { > > + struct ib_user_mad hdr; > > + char mad[RDMA_MAX_PRIVATE_DATA]; > > +}; > > + > > static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx); > > static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx) > > @@ -286,6 +300,61 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res, > > return 0; > > } > > +static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge, > > + uint32_t num_sge) > > +{ > > + struct backend_umad umad = {0}; > > + char *hdr, *msg; > > + int ret; > > + > > + pr_dbg("num_sge=%d\n", num_sge); > > + > > + if (num_sge != 2) { > > + return -EINVAL; > > + } > > + > > + umad.hdr.length = sge[0].length + sge[1].length; > > + pr_dbg("msg_len=%d\n", umad.hdr.length); > > + > > + if (umad.hdr.length > sizeof(umad.mad)) { > > + return -ENOMEM; > > + } > > + > > + umad.hdr.addr.qpn = htobe32(1); > > + umad.hdr.addr.grh_present = 1; > > + umad.hdr.addr.gid_index = backend_dev->backend_gid_idx; > > + memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid)); > > + umad.hdr.addr.hop_limit = 1; > > + > > + hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length); > > + if (!hdr) { > > + pr_dbg("Fail to map to sge[0]\n"); > > + return -ENOMEM; > > + } > > + msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length); > > + if (!msg) { > > + pr_dbg("Fail to map to sge[1]\n"); > > + 
rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length); > > + return -ENOMEM; > > + } > > + > > + pr_dbg_buf("mad_hdr", hdr, sge[0].length); > > + pr_dbg_buf("mad_data", data, sge[1].length); > > + > > + memcpy(&umad.mad[0], hdr, sge[0].length); > > + memcpy(&umad.mad[sge[0].length], msg, sge[1].length); > > + > > + rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length); > > + rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length); > > + > > + ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad, > > + sizeof(umad)); > > + > > + pr_dbg("qemu_chr_fe_write=%d\n", ret); > > + > > + return (ret != sizeof(umad)); > > +} > > + > > void rdma_backend_post_send(RdmaBackendDev *backend_dev, > > RdmaBackendQP *qp, uint8_t qp_type, > > struct ibv_sge *sge, uint32_t num_sge, > > @@ -304,9 +373,13 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev, > > comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx); > > } else if (qp_type == IBV_QPT_GSI) { > > pr_dbg("QP1\n"); > > - comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); > > + rc = mad_send(backend_dev, sge, num_sge); > > + if (rc) { > > + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); > > + } else { > > + comp_handler(IBV_WC_SUCCESS, 0, ctx); > > + } > > } > > - pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type); > > return; > > } > > @@ -370,6 +443,48 @@ out_free_bctx: > > g_free(bctx); > > } > > +static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev, > > + struct ibv_sge *sge, uint32_t num_sge, > > + void *ctx) > > +{ > > + BackendCtx *bctx; > > + int rc; > > + uint32_t bctx_id; > > + > > + if (num_sge != 1) { > > + pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge); > > Maybe not for this patch set, but please consider using > QEMU error report facilities for errors instead of debug messages > when necessary. > > The same about using traces on normal flow instead pr_dbg, > but this has a lower priority than error reporting. Sure, this is next on my plate. 
> > > + return VENDOR_ERR_INV_NUM_SGE; > > + } > > + > > + if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) { > > + pr_dbg("Too small buffer for MAD\n"); > > + return VENDOR_ERR_INV_MAD_BUFF; > > + } > > + > > + pr_dbg("addr=0x%" PRIx64"\n", sge[0].addr); > > + pr_dbg("length=%d\n", sge[0].length); > > + pr_dbg("lkey=%d\n", sge[0].lkey); > > + > > + bctx = g_malloc0(sizeof(*bctx)); > > + > > + rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx); > > + if (unlikely(rc)) { > > + g_free(bctx); > > + pr_dbg("Fail to allocate cqe_ctx\n"); > > + return VENDOR_ERR_NOMEM; > > + } > > + > > + pr_dbg("bctx_id %d, bctx %p, ctx %p\n", bctx_id, bctx, ctx); > > + bctx->up_ctx = ctx; > > + bctx->sge = *sge; > > + > > + qemu_mutex_lock(&backend_dev->recv_mads_list.lock); > > + qlist_append_int(backend_dev->recv_mads_list.list, bctx_id); > > + qemu_mutex_unlock(&backend_dev->recv_mads_list.lock); > > + > > + return 0; > > +} > > + > > void rdma_backend_post_recv(RdmaBackendDev *backend_dev, > > RdmaDeviceResources *rdma_dev_res, > > RdmaBackendQP *qp, uint8_t qp_type, > > @@ -388,7 +503,10 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev, > > } > > if (qp_type == IBV_QPT_GSI) { > > pr_dbg("QP1\n"); > > - comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); > > + rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx); > > + if (rc) { > > + comp_handler(IBV_WC_GENERAL_ERR, rc, ctx); > > + } > > } > > return; > > } > > @@ -517,7 +635,6 @@ int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type, > > switch (qp_type) { > > case IBV_QPT_GSI: > > - pr_dbg("QP1 unsupported\n"); > > return 0; > > case IBV_QPT_RC: > > @@ -748,11 +865,146 @@ static int init_device_caps(RdmaBackendDev *backend_dev, > > return 0; > > } > > +static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid, > > + union ibv_gid *my_gid, int paylen) > > +{ > > + grh->paylen = htons(paylen); > > + grh->sgid = *sgid; > > + grh->dgid = *my_gid; > > + > > + pr_dbg("paylen=%d (net=0x%x)\n", paylen, grh->paylen); > > + pr_dbg("my_gid=0x%llx\n", my_gid->global.interface_id); > > + pr_dbg("gid=0x%llx\n", sgid->global.interface_id); > > +} > > + > > +static inline int mad_can_receieve(void *opaque) > > +{ > > + return sizeof(struct backend_umad); > > +} > > + > > +static void mad_read(void *opaque, const uint8_t *buf, int size) > > +{ > > + RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque; > > + QObject *o_ctx_id; > > + unsigned long cqe_ctx_id; > > + BackendCtx *bctx; > > + char *mad; > > + struct backend_umad *umad; > > + > > + assert(size != sizeof(umad)); > > + umad = (struct backend_umad *)buf; > > + > > + pr_dbg("Got %d bytes\n", size); > > + pr_dbg("umad->hdr.length=%d\n", umad->hdr.length); > > + > > +#ifdef PVRDMA_DEBUG > > + struct umad_hdr *hdr = (struct umad_hdr *)&msg->umad.mad; > > + pr_dbg("bv %x cls %x cv %x mtd %x st %d tid %" PRIx64 " at %x atm %x\n", > > + hdr->base_version, hdr->mgmt_class, hdr->class_version, > > + hdr->method, hdr->status, be64toh(hdr->tid), > > + hdr->attr_id, hdr->attr_mod); > > +#endif > > + > > + qemu_mutex_lock(&backend_dev->recv_mads_list.lock); > > + o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list); > > + qemu_mutex_unlock(&backend_dev->recv_mads_list.lock); > > + if (!o_ctx_id) { > > + pr_dbg("No more free MADs buffers, waiting for a while\n"); > > + sleep(THR_POLL_TO); > > + return; > > + } > > + > > + cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id)); > > + bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, 
cqe_ctx_id); > > + if (unlikely(!bctx)) { > > + pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id); > > + return; > > + } > > + > > + pr_dbg("id %ld, bctx %p, ctx %p\n", cqe_ctx_id, bctx, bctx->up_ctx); > > + > > + mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr, > > + bctx->sge.length); > > + if (!mad || bctx->sge.length < umad->hdr.length + MAD_HDR_SIZE) { > > + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF, > > + bctx->up_ctx); > > + } else { > > + memset(mad, 0, bctx->sge.length); > > + build_mad_hdr((struct ibv_grh *)mad, > > + (union ibv_gid *)&umad->hdr.addr.gid, > > + &backend_dev->gid, umad->hdr.length); > > + memcpy(&mad[MAD_HDR_SIZE], umad->mad, umad->hdr.length); > > + rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length); > > + > > + comp_handler(IBV_WC_SUCCESS, 0, bctx->up_ctx); > > + } > > + > > + g_free(bctx); > > + rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id); > > +} > > + > > +static int mad_init(RdmaBackendDev *backend_dev) > > +{ > > + struct backend_umad umad = {0}; > > + int ret; > > + > > + if (!qemu_chr_fe_backend_connected(backend_dev->mad_chr_be)) { > > + pr_dbg("Missing chardev for MAD multiplexer\n"); > > + return -EIO; > > + } > > + > > + qemu_chr_fe_set_handlers(backend_dev->mad_chr_be, mad_can_receieve, > > + mad_read, NULL, NULL, backend_dev, NULL, true); > > + > > + /* Register ourself */ > > + memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid)); > > + ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad, > > + sizeof(umad.hdr)); > > + if (ret != sizeof(umad.hdr)) { > > + pr_dbg("Fail to register to rdma_umadmux (%d)\n", ret); > > + } > > + > > + qemu_mutex_init(&backend_dev->recv_mads_list.lock); > > + backend_dev->recv_mads_list.list = qlist_new(); > > + > > + return 0; > > +} > > + > > +static void mad_stop(RdmaBackendDev *backend_dev) > > +{ > > + QObject *o_ctx_id; > > + unsigned long cqe_ctx_id; > > + BackendCtx *bctx; > > + > > + pr_dbg("Closing MAD\n"); > > + > > + /* Clear MAD buffers list */ > > + qemu_mutex_lock(&backend_dev->recv_mads_list.lock); > > + do { > > + o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list); > > I suppose you cal lock only the above line, but maybe it doesn't > matter at this point. Sorry, my bad, there is no need for this cleanup, will squash patch #23 to this one. 
> > + if (o_ctx_id) { > > + cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id)); > > + bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id); > > + if (bctx) { > > + rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id); > > + g_free(bctx); > > + } > > + } > > + } while (o_ctx_id); > > + qemu_mutex_unlock(&backend_dev->recv_mads_list.lock); > > +} > > + > > +static void mad_fini(RdmaBackendDev *backend_dev) > > +{ > > + qlist_destroy_obj(QOBJECT(backend_dev->recv_mads_list.list)); > > + qemu_mutex_destroy(&backend_dev->recv_mads_list.lock); > > +} > > + > > int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, > > RdmaDeviceResources *rdma_dev_res, > > const char *backend_device_name, uint8_t port_num, > > uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr, > > - Error **errp) > > + CharBackend *mad_chr_be, Error **errp) > > { > > int i; > > int ret = 0; > > @@ -763,7 +1015,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, > > memset(backend_dev, 0, sizeof(*backend_dev)); > > backend_dev->dev = pdev; > > - > > + backend_dev->mad_chr_be = mad_chr_be; > > backend_dev->backend_gid_idx = backend_gid_idx; > > backend_dev->port_num = port_num; > > backend_dev->rdma_dev_res = rdma_dev_res; > > @@ -854,6 +1106,13 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, > > pr_dbg("interface_id=0x%" PRIx64 "\n", > > be64_to_cpu(backend_dev->gid.global.interface_id)); > > + ret = mad_init(backend_dev); > > + if (ret) { > > + error_setg(errp, "Fail to initialize mad"); > > + ret = -EIO; > > + goto out_destroy_comm_channel; > > + } > > + > > backend_dev->comp_thread.run = false; > > backend_dev->comp_thread.is_running = false; > > @@ -885,11 +1144,13 @@ void rdma_backend_stop(RdmaBackendDev *backend_dev) > > { > > pr_dbg("Stopping rdma_backend\n"); > > stop_backend_thread(&backend_dev->comp_thread); > > + mad_stop(backend_dev); > > } > > void rdma_backend_fini(RdmaBackendDev *backend_dev) > > { > > rdma_backend_stop(backend_dev); > > + mad_fini(backend_dev); > > g_hash_table_destroy(ah_hash); > > ibv_destroy_comp_channel(backend_dev->channel); > > ibv_close_device(backend_dev->context); > > diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h > > index 3ccc9a2494..fc83330251 100644 > > --- a/hw/rdma/rdma_backend.h > > +++ b/hw/rdma/rdma_backend.h > > @@ -17,6 +17,8 @@ > > #define RDMA_BACKEND_H > > #include "qapi/error.h" > > +#include "chardev/char-fe.h" > > + > > #include "rdma_rm_defs.h" > > #include "rdma_backend_defs.h" > > @@ -50,7 +52,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, > > RdmaDeviceResources *rdma_dev_res, > > const char *backend_device_name, uint8_t port_num, > > uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr, > > - Error **errp); > > + CharBackend *mad_chr_be, Error **errp); > > void rdma_backend_fini(RdmaBackendDev *backend_dev); > > void rdma_backend_start(RdmaBackendDev *backend_dev); > > void rdma_backend_stop(RdmaBackendDev *backend_dev); > > diff --git a/hw/rdma/rdma_backend_defs.h b/hw/rdma/rdma_backend_defs.h > > index 7404f64002..2a7e667075 100644 > > --- a/hw/rdma/rdma_backend_defs.h > > +++ b/hw/rdma/rdma_backend_defs.h > > @@ -16,8 +16,9 @@ > > #ifndef RDMA_BACKEND_DEFS_H > > #define RDMA_BACKEND_DEFS_H > > -#include <infiniband/verbs.h> > > #include "qemu/thread.h" > > +#include "chardev/char-fe.h" > > +#include <infiniband/verbs.h> > > typedef struct RdmaDeviceResources RdmaDeviceResources; > > @@ -28,6 +29,11 @@ typedef struct 
RdmaBackendThread { > > bool is_running; /* Set by the thread to report its status */ > > } RdmaBackendThread; > > +typedef struct RecvMadList { > > + QemuMutex lock; > > + QList *list; > > +} RecvMadList; > > + > > typedef struct RdmaBackendDev { > > struct ibv_device_attr dev_attr; > > RdmaBackendThread comp_thread; > > @@ -39,6 +45,8 @@ typedef struct RdmaBackendDev { > > struct ibv_comp_channel *channel; > > uint8_t port_num; > > uint8_t backend_gid_idx; > > + RecvMadList recv_mads_list; > > + CharBackend *mad_chr_be; > > } RdmaBackendDev; > > typedef struct RdmaBackendPD { > > diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h > > index e2d9f93cdf..e3742d893a 100644 > > --- a/hw/rdma/vmw/pvrdma.h > > +++ b/hw/rdma/vmw/pvrdma.h > > @@ -19,6 +19,7 @@ > > #include "qemu/units.h" > > #include "hw/pci/pci.h" > > #include "hw/pci/msix.h" > > +#include "chardev/char-fe.h" > > #include "../rdma_backend_defs.h" > > #include "../rdma_rm_defs.h" > > @@ -83,6 +84,7 @@ typedef struct PVRDMADev { > > uint8_t backend_port_num; > > RdmaBackendDev backend_dev; > > RdmaDeviceResources rdma_dev_res; > > + CharBackend mad_chr; > > } PVRDMADev; > > #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME) > > diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c > > index ca5fa8d981..6c8c0154fa 100644 > > --- a/hw/rdma/vmw/pvrdma_main.c > > +++ b/hw/rdma/vmw/pvrdma_main.c > > @@ -51,6 +51,7 @@ static Property pvrdma_dev_properties[] = { > > DEFINE_PROP_INT32("dev-caps-max-qp-init-rd-atom", PVRDMADev, > > dev_attr.max_qp_init_rd_atom, MAX_QP_INIT_RD_ATOM), > > DEFINE_PROP_INT32("dev-caps-max-ah", PVRDMADev, dev_attr.max_ah, MAX_AH), > > + DEFINE_PROP_CHR("mad-chardev", PVRDMADev, mad_chr), > > DEFINE_PROP_END_OF_LIST(), > > }; > > @@ -613,7 +614,8 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp) > > rc = rdma_backend_init(&dev->backend_dev, pdev, &dev->rdma_dev_res, > > dev->backend_device_name, dev->backend_port_num, > > - dev->backend_gid_idx, &dev->dev_attr, errp); > > + dev->backend_gid_idx, &dev->dev_attr, &dev->mad_chr, > > + errp); > > if (rc) { > > goto out; > > } > > I only have a few minor comments, but it looks OK to me anyway. > > Reviewed-by: Marcel Apfelbaum<marcel.apfelbaum@gmail.com> > Thanks, > Marcel Thanks! >
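On Marcel's locking aside: only the pop itself touches shared state, so the mutex needs to cover just that call; once an entry is popped, the draining thread owns it and can release it unlocked. The reply above drops the cleanup loop entirely, but the narrower-critical-section shape is worth showing. In this stand-alone sketch, pthread stands in for QemuMutex and a fixed array stands in for the QList of cqe_ctx ids; both are illustrative, not QEMU code.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long pending_ids[] = { 3, 7, 9 };
static int pending_cnt = 3;

/* Pop one pending ctx id; false when the list is empty. */
static bool pop_id(unsigned long *id)
{
    if (pending_cnt == 0) {
        return false;
    }
    *id = pending_ids[--pending_cnt];
    return true;
}

static void free_ctx(unsigned long id)
{
    /* rdma_rm_dealloc_cqe_ctx() + g_free() would go here */
    printf("released MAD buffer ctx %lu\n", id);
}

static void drain_mad_list(void)
{
    unsigned long id;
    bool got;

    for (;;) {
        pthread_mutex_lock(&list_lock);   /* lock covers only the pop */
        got = pop_id(&id);
        pthread_mutex_unlock(&list_lock);
        if (!got) {
            break;
        }
        free_ctx(id);                     /* per-entry teardown, unlocked */
    }
}

int main(void)
{
    drain_mad_list();
    return 0;
}
```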
diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c index 1e148398a2..7c220a5798 100644 --- a/hw/rdma/rdma_backend.c +++ b/hw/rdma/rdma_backend.c @@ -16,8 +16,13 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" #include "qapi/error.h" +#include "qapi/qmp/qlist.h" +#include "qapi/qmp/qnum.h" #include <infiniband/verbs.h> +#include <infiniband/umad_types.h> +#include <infiniband/umad.h> +#include <rdma/rdma_user_cm.h> #include "trace.h" #include "rdma_utils.h" @@ -33,16 +38,25 @@ #define VENDOR_ERR_MAD_SEND 0x206 #define VENDOR_ERR_INVLKEY 0x207 #define VENDOR_ERR_MR_SMALL 0x208 +#define VENDOR_ERR_INV_MAD_BUFF 0x209 +#define VENDOR_ERR_INV_NUM_SGE 0x210 #define THR_NAME_LEN 16 #define THR_POLL_TO 5000 +#define MAD_HDR_SIZE sizeof(struct ibv_grh) + typedef struct BackendCtx { - uint64_t req_id; void *up_ctx; bool is_tx_req; + struct ibv_sge sge; /* Used to save MAD recv buffer */ } BackendCtx; +struct backend_umad { + struct ib_user_mad hdr; + char mad[RDMA_MAX_PRIVATE_DATA]; +}; + static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx); static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx) @@ -286,6 +300,61 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res, return 0; } +static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge, + uint32_t num_sge) +{ + struct backend_umad umad = {0}; + char *hdr, *msg; + int ret; + + pr_dbg("num_sge=%d\n", num_sge); + + if (num_sge != 2) { + return -EINVAL; + } + + umad.hdr.length = sge[0].length + sge[1].length; + pr_dbg("msg_len=%d\n", umad.hdr.length); + + if (umad.hdr.length > sizeof(umad.mad)) { + return -ENOMEM; + } + + umad.hdr.addr.qpn = htobe32(1); + umad.hdr.addr.grh_present = 1; + umad.hdr.addr.gid_index = backend_dev->backend_gid_idx; + memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid)); + umad.hdr.addr.hop_limit = 1; + + hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length); + if (!hdr) { + pr_dbg("Fail to map to sge[0]\n"); + return -ENOMEM; + } + msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length); + if (!msg) { + pr_dbg("Fail to map to sge[1]\n"); + rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length); + return -ENOMEM; + } + + pr_dbg_buf("mad_hdr", hdr, sge[0].length); + pr_dbg_buf("mad_data", data, sge[1].length); + + memcpy(&umad.mad[0], hdr, sge[0].length); + memcpy(&umad.mad[sge[0].length], msg, sge[1].length); + + rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length); + rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length); + + ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad, + sizeof(umad)); + + pr_dbg("qemu_chr_fe_write=%d\n", ret); + + return (ret != sizeof(umad)); +} + void rdma_backend_post_send(RdmaBackendDev *backend_dev, RdmaBackendQP *qp, uint8_t qp_type, struct ibv_sge *sge, uint32_t num_sge, @@ -304,9 +373,13 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev, comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx); } else if (qp_type == IBV_QPT_GSI) { pr_dbg("QP1\n"); - comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); + rc = mad_send(backend_dev, sge, num_sge); + if (rc) { + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); + } else { + comp_handler(IBV_WC_SUCCESS, 0, ctx); + } } - pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type); return; } @@ -370,6 +443,48 @@ out_free_bctx: g_free(bctx); } +static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev, + struct ibv_sge *sge, uint32_t num_sge, + void 
*ctx) +{ + BackendCtx *bctx; + int rc; + uint32_t bctx_id; + + if (num_sge != 1) { + pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge); + return VENDOR_ERR_INV_NUM_SGE; + } + + if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) { + pr_dbg("Too small buffer for MAD\n"); + return VENDOR_ERR_INV_MAD_BUFF; + } + + pr_dbg("addr=0x%" PRIx64"\n", sge[0].addr); + pr_dbg("length=%d\n", sge[0].length); + pr_dbg("lkey=%d\n", sge[0].lkey); + + bctx = g_malloc0(sizeof(*bctx)); + + rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx); + if (unlikely(rc)) { + g_free(bctx); + pr_dbg("Fail to allocate cqe_ctx\n"); + return VENDOR_ERR_NOMEM; + } + + pr_dbg("bctx_id %d, bctx %p, ctx %p\n", bctx_id, bctx, ctx); + bctx->up_ctx = ctx; + bctx->sge = *sge; + + qemu_mutex_lock(&backend_dev->recv_mads_list.lock); + qlist_append_int(backend_dev->recv_mads_list.list, bctx_id); + qemu_mutex_unlock(&backend_dev->recv_mads_list.lock); + + return 0; +} + void rdma_backend_post_recv(RdmaBackendDev *backend_dev, RdmaDeviceResources *rdma_dev_res, RdmaBackendQP *qp, uint8_t qp_type, @@ -388,7 +503,10 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev, } if (qp_type == IBV_QPT_GSI) { pr_dbg("QP1\n"); - comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); + rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx); + if (rc) { + comp_handler(IBV_WC_GENERAL_ERR, rc, ctx); + } } return; } @@ -517,7 +635,6 @@ int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type, switch (qp_type) { case IBV_QPT_GSI: - pr_dbg("QP1 unsupported\n"); return 0; case IBV_QPT_RC: @@ -748,11 +865,146 @@ static int init_device_caps(RdmaBackendDev *backend_dev, return 0; } +static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid, + union ibv_gid *my_gid, int paylen) +{ + grh->paylen = htons(paylen); + grh->sgid = *sgid; + grh->dgid = *my_gid; + + pr_dbg("paylen=%d (net=0x%x)\n", paylen, grh->paylen); + pr_dbg("my_gid=0x%llx\n", my_gid->global.interface_id); + pr_dbg("gid=0x%llx\n", sgid->global.interface_id); +} + +static inline int mad_can_receieve(void *opaque) +{ + return sizeof(struct backend_umad); +} + +static void mad_read(void *opaque, const uint8_t *buf, int size) +{ + RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque; + QObject *o_ctx_id; + unsigned long cqe_ctx_id; + BackendCtx *bctx; + char *mad; + struct backend_umad *umad; + + assert(size != sizeof(umad)); + umad = (struct backend_umad *)buf; + + pr_dbg("Got %d bytes\n", size); + pr_dbg("umad->hdr.length=%d\n", umad->hdr.length); + +#ifdef PVRDMA_DEBUG + struct umad_hdr *hdr = (struct umad_hdr *)&msg->umad.mad; + pr_dbg("bv %x cls %x cv %x mtd %x st %d tid %" PRIx64 " at %x atm %x\n", + hdr->base_version, hdr->mgmt_class, hdr->class_version, + hdr->method, hdr->status, be64toh(hdr->tid), + hdr->attr_id, hdr->attr_mod); +#endif + + qemu_mutex_lock(&backend_dev->recv_mads_list.lock); + o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list); + qemu_mutex_unlock(&backend_dev->recv_mads_list.lock); + if (!o_ctx_id) { + pr_dbg("No more free MADs buffers, waiting for a while\n"); + sleep(THR_POLL_TO); + return; + } + + cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id)); + bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id); + if (unlikely(!bctx)) { + pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id); + return; + } + + pr_dbg("id %ld, bctx %p, ctx %p\n", cqe_ctx_id, bctx, bctx->up_ctx); + + mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr, + bctx->sge.length); + if (!mad || 
bctx->sge.length < umad->hdr.length + MAD_HDR_SIZE) { + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF, + bctx->up_ctx); + } else { + memset(mad, 0, bctx->sge.length); + build_mad_hdr((struct ibv_grh *)mad, + (union ibv_gid *)&umad->hdr.addr.gid, + &backend_dev->gid, umad->hdr.length); + memcpy(&mad[MAD_HDR_SIZE], umad->mad, umad->hdr.length); + rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length); + + comp_handler(IBV_WC_SUCCESS, 0, bctx->up_ctx); + } + + g_free(bctx); + rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id); +} + +static int mad_init(RdmaBackendDev *backend_dev) +{ + struct backend_umad umad = {0}; + int ret; + + if (!qemu_chr_fe_backend_connected(backend_dev->mad_chr_be)) { + pr_dbg("Missing chardev for MAD multiplexer\n"); + return -EIO; + } + + qemu_chr_fe_set_handlers(backend_dev->mad_chr_be, mad_can_receieve, + mad_read, NULL, NULL, backend_dev, NULL, true); + + /* Register ourself */ + memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid)); + ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad, + sizeof(umad.hdr)); + if (ret != sizeof(umad.hdr)) { + pr_dbg("Fail to register to rdma_umadmux (%d)\n", ret); + } + + qemu_mutex_init(&backend_dev->recv_mads_list.lock); + backend_dev->recv_mads_list.list = qlist_new(); + + return 0; +} + +static void mad_stop(RdmaBackendDev *backend_dev) +{ + QObject *o_ctx_id; + unsigned long cqe_ctx_id; + BackendCtx *bctx; + + pr_dbg("Closing MAD\n"); + + /* Clear MAD buffers list */ + qemu_mutex_lock(&backend_dev->recv_mads_list.lock); + do { + o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list); + if (o_ctx_id) { + cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id)); + bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id); + if (bctx) { + rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id); + g_free(bctx); + } + } + } while (o_ctx_id); + qemu_mutex_unlock(&backend_dev->recv_mads_list.lock); +} + +static void mad_fini(RdmaBackendDev *backend_dev) +{ + qlist_destroy_obj(QOBJECT(backend_dev->recv_mads_list.list)); + qemu_mutex_destroy(&backend_dev->recv_mads_list.lock); +} + int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, RdmaDeviceResources *rdma_dev_res, const char *backend_device_name, uint8_t port_num, uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr, - Error **errp) + CharBackend *mad_chr_be, Error **errp) { int i; int ret = 0; @@ -763,7 +1015,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, memset(backend_dev, 0, sizeof(*backend_dev)); backend_dev->dev = pdev; - + backend_dev->mad_chr_be = mad_chr_be; backend_dev->backend_gid_idx = backend_gid_idx; backend_dev->port_num = port_num; backend_dev->rdma_dev_res = rdma_dev_res; @@ -854,6 +1106,13 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, pr_dbg("interface_id=0x%" PRIx64 "\n", be64_to_cpu(backend_dev->gid.global.interface_id)); + ret = mad_init(backend_dev); + if (ret) { + error_setg(errp, "Fail to initialize mad"); + ret = -EIO; + goto out_destroy_comm_channel; + } + backend_dev->comp_thread.run = false; backend_dev->comp_thread.is_running = false; @@ -885,11 +1144,13 @@ void rdma_backend_stop(RdmaBackendDev *backend_dev) { pr_dbg("Stopping rdma_backend\n"); stop_backend_thread(&backend_dev->comp_thread); + mad_stop(backend_dev); } void rdma_backend_fini(RdmaBackendDev *backend_dev) { rdma_backend_stop(backend_dev); + mad_fini(backend_dev); g_hash_table_destroy(ah_hash); 
ibv_destroy_comp_channel(backend_dev->channel); ibv_close_device(backend_dev->context); diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h index 3ccc9a2494..fc83330251 100644 --- a/hw/rdma/rdma_backend.h +++ b/hw/rdma/rdma_backend.h @@ -17,6 +17,8 @@ #define RDMA_BACKEND_H #include "qapi/error.h" +#include "chardev/char-fe.h" + #include "rdma_rm_defs.h" #include "rdma_backend_defs.h" @@ -50,7 +52,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, RdmaDeviceResources *rdma_dev_res, const char *backend_device_name, uint8_t port_num, uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr, - Error **errp); + CharBackend *mad_chr_be, Error **errp); void rdma_backend_fini(RdmaBackendDev *backend_dev); void rdma_backend_start(RdmaBackendDev *backend_dev); void rdma_backend_stop(RdmaBackendDev *backend_dev); diff --git a/hw/rdma/rdma_backend_defs.h b/hw/rdma/rdma_backend_defs.h index 7404f64002..2a7e667075 100644 --- a/hw/rdma/rdma_backend_defs.h +++ b/hw/rdma/rdma_backend_defs.h @@ -16,8 +16,9 @@ #ifndef RDMA_BACKEND_DEFS_H #define RDMA_BACKEND_DEFS_H -#include <infiniband/verbs.h> #include "qemu/thread.h" +#include "chardev/char-fe.h" +#include <infiniband/verbs.h> typedef struct RdmaDeviceResources RdmaDeviceResources; @@ -28,6 +29,11 @@ typedef struct RdmaBackendThread { bool is_running; /* Set by the thread to report its status */ } RdmaBackendThread; +typedef struct RecvMadList { + QemuMutex lock; + QList *list; +} RecvMadList; + typedef struct RdmaBackendDev { struct ibv_device_attr dev_attr; RdmaBackendThread comp_thread; @@ -39,6 +45,8 @@ typedef struct RdmaBackendDev { struct ibv_comp_channel *channel; uint8_t port_num; uint8_t backend_gid_idx; + RecvMadList recv_mads_list; + CharBackend *mad_chr_be; } RdmaBackendDev; typedef struct RdmaBackendPD { diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h index e2d9f93cdf..e3742d893a 100644 --- a/hw/rdma/vmw/pvrdma.h +++ b/hw/rdma/vmw/pvrdma.h @@ -19,6 +19,7 @@ #include "qemu/units.h" #include "hw/pci/pci.h" #include "hw/pci/msix.h" +#include "chardev/char-fe.h" #include "../rdma_backend_defs.h" #include "../rdma_rm_defs.h" @@ -83,6 +84,7 @@ typedef struct PVRDMADev { uint8_t backend_port_num; RdmaBackendDev backend_dev; RdmaDeviceResources rdma_dev_res; + CharBackend mad_chr; } PVRDMADev; #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME) diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c index ca5fa8d981..6c8c0154fa 100644 --- a/hw/rdma/vmw/pvrdma_main.c +++ b/hw/rdma/vmw/pvrdma_main.c @@ -51,6 +51,7 @@ static Property pvrdma_dev_properties[] = { DEFINE_PROP_INT32("dev-caps-max-qp-init-rd-atom", PVRDMADev, dev_attr.max_qp_init_rd_atom, MAX_QP_INIT_RD_ATOM), DEFINE_PROP_INT32("dev-caps-max-ah", PVRDMADev, dev_attr.max_ah, MAX_AH), + DEFINE_PROP_CHR("mad-chardev", PVRDMADev, mad_chr), DEFINE_PROP_END_OF_LIST(), }; @@ -613,7 +614,8 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp) rc = rdma_backend_init(&dev->backend_dev, pdev, &dev->rdma_dev_res, dev->backend_device_name, dev->backend_port_num, - dev->backend_gid_idx, &dev->dev_attr, errp); + dev->backend_gid_idx, &dev->dev_attr, &dev->mad_chr, + errp); if (rc) { goto out; }
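On the receive side, mad_read() reconstructs what a UD GSI completion looks like to the guest: a global routing header followed by the MAD itself, which is why save_mad_recv_buffer() rejects posted buffers smaller than GRH plus RDMA_MAX_PRIVATE_DATA. A compact sketch of that buffer layout follows; `mini_grh` shortens the two 16-byte GIDs to 8-byte interface ids for brevity, so it is illustrative rather than the 40-byte on-wire `struct ibv_grh`.

```c
#include <arpa/inet.h>   /* htons */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct mini_grh {
    uint32_t version_tclass_flow;
    uint16_t paylen;     /* big-endian payload length, as on the wire */
    uint8_t  next_hdr;
    uint8_t  hop_limit;
    uint64_t sgid_ifid;  /* stand-ins for the 16-byte GIDs */
    uint64_t dgid_ifid;
};

/* Write GRH + MAD into the guest's receive buffer; 0 on overflow. */
static size_t deliver_mad(uint8_t *buf, size_t buf_len,
                          const uint8_t *mad, uint16_t mad_len,
                          uint64_t sgid, uint64_t dgid)
{
    struct mini_grh grh = {0};

    if (buf_len < sizeof(grh) + mad_len) {
        return 0;        /* mirrors the VENDOR_ERR_INV_MAD_BUFF path */
    }
    grh.paylen = htons(mad_len);   /* network order, as in build_mad_hdr() */
    grh.sgid_ifid = sgid;
    grh.dgid_ifid = dgid;

    memset(buf, 0, buf_len);
    memcpy(buf, &grh, sizeof(grh));
    memcpy(buf + sizeof(grh), mad, mad_len);
    return sizeof(grh) + mad_len;
}

int main(void)
{
    uint8_t buf[512], mad[64] = {0x01};

    printf("wrote %zu bytes\n",
           deliver_mad(buf, sizeof(buf), mad, sizeof(mad), 0x1111, 0x2222));
    return 0;
}
```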
MAD (Management Datagram) packets are widely used by various modules, both in kernel and in user space; for example, the rdma_* API, which is used to create and maintain a "connection" layer on top of RDMA, uses several types of MAD packets.

For more information please refer to chapter 13.4 in the Architecture Specification, Volume 1, Release 1.1, available here:
https://www.infinibandta.org/ibta-specifications-download/

To support MAD packets the device uses an external utility (contrib/rdmacm-mux) to relay packets from and to the guest driver.

Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
---
 hw/rdma/rdma_backend.c      | 275 +++++++++++++++++++++++++++++++++++-
 hw/rdma/rdma_backend.h      |   4 +-
 hw/rdma/rdma_backend_defs.h |  10 +-
 hw/rdma/vmw/pvrdma.h        |   2 +
 hw/rdma/vmw/pvrdma_main.c   |   4 +-
 5 files changed, 285 insertions(+), 10 deletions(-)
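The relay described above starts with the one-time handshake in mad_init(): before any MAD traffic, the device writes a header-only message whose GID field tells the multiplexer which guest this connection serves, so incoming MADs can be routed back to the right QEMU instance. Below is a toy model of that exchange over a socketpair; the `reg_hdr` layout is illustrative only, not the actual rdmacm-mux protocol definition.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

struct reg_hdr {
    uint32_t length;     /* 0: no MAD payload, header only */
    uint8_t  gid[16];    /* which GID this device answers for */
};

int main(void)
{
    int sv[2];
    struct reg_hdr reg = {0}, seen;
    uint8_t gid[16] = { [8] = 0xaa };   /* made-up interface id */

    if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0) {
        return 1;
    }
    memcpy(reg.gid, gid, sizeof(reg.gid));

    /* device side: announce our GID before sending any MAD */
    if (write(sv[0], &reg, sizeof(reg)) != sizeof(reg)) {
        return 1;
    }
    /* mux side: learn the GID -> connection mapping */
    if (read(sv[1], &seen, sizeof(seen)) == sizeof(seen)) {
        printf("mux: registered gid ..%02x, payload len %u\n",
               seen.gid[8], seen.length);
    }
    close(sv[0]);
    close(sv[1]);
    return 0;
}
```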