diff mbox

[01/13] hw/rdma: Make distinction between device init and start modes

Message ID 20180716074038.3364-2-yuval.shaia@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Yuval Shaia July 16, 2018, 7:40 a.m. UTC
There are certain operations that are well considered as part of device
configuration while others are needed only when "start" command is
triggered by the guest driver. An example of device initialization step
is msix_init and example of "device start" stage is the creation of a CQ
completion handler thread.

Driver expects such distinction - implement it.

Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
---
 hw/rdma/rdma_backend.c      |  96 +++++++++++++++++++++------
 hw/rdma/rdma_backend.h      |   2 +
 hw/rdma/rdma_backend_defs.h |   3 +-
 hw/rdma/vmw/pvrdma_main.c   | 129 +++++++++++++++++++++---------------
 4 files changed, 155 insertions(+), 75 deletions(-)

Comments

Marcel Apfelbaum July 24, 2018, 12:08 p.m. UTC | #1
Hi Yuval,

On 07/16/2018 10:40 AM, Yuval Shaia wrote:
> There are certain operations that are well considered as part of device
> configuration while others are needed only when "start" command is
> triggered by the guest driver. An example of device initialization step
> is msix_init and example of "device start" stage is the creation of a CQ
> completion handler thread.
>
> Driver expects such distinction - implement it.
>
> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> ---
>   hw/rdma/rdma_backend.c      |  96 +++++++++++++++++++++------
>   hw/rdma/rdma_backend.h      |   2 +
>   hw/rdma/rdma_backend_defs.h |   3 +-
>   hw/rdma/vmw/pvrdma_main.c   | 129 +++++++++++++++++++++---------------
>   4 files changed, 155 insertions(+), 75 deletions(-)
>
> diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
> index e9ced6f9ef..647e16399f 100644
> --- a/hw/rdma/rdma_backend.c
> +++ b/hw/rdma/rdma_backend.c
> @@ -35,6 +35,7 @@
>   #define VENDOR_ERR_MR_SMALL         0x208
>   
>   #define THR_NAME_LEN 16
> +#define THR_POLL_TO  5000
>   
>   typedef struct BackendCtx {
>       uint64_t req_id;
> @@ -91,35 +92,82 @@ static void *comp_handler_thread(void *arg)
>       int rc;
>       struct ibv_cq *ev_cq;
>       void *ev_ctx;
> +    int flags;
> +    GPollFD pfds[1];
> +
> +    /* Change to non-blocking mode */
> +    flags = fcntl(backend_dev->channel->fd, F_GETFL);
> +    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
> +    if (rc < 0) {
> +        pr_dbg("Fail to change to non-blocking mode\n");
> +        return NULL;
> +    }
>   
>       pr_dbg("Starting\n");
>   
> +    pfds[0].fd = backend_dev->channel->fd;
> +    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
> +
> +    backend_dev->comp_thread.is_running = true;
> +
>       while (backend_dev->comp_thread.run) {
> -        pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel);
> -        rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
> -        pr_dbg("ibv_get_cq_event=%d\n", rc);
> -        if (unlikely(rc)) {
> -            pr_dbg("---> ibv_get_cq_event (%d)\n", rc);
> -            continue;
> -        }
> +        do {
> +            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
> +        } while (!rc && backend_dev->comp_thread.run);
> +
> +        if (backend_dev->comp_thread.run) {
> +            pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel);
> +            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
> +            pr_dbg("ibv_get_cq_event=%d\n", rc);
> +            if (unlikely(rc)) {
> +                pr_dbg("---> ibv_get_cq_event (%d)\n", rc);
> +                continue;
> +            }
>   
> -        rc = ibv_req_notify_cq(ev_cq, 0);
> -        if (unlikely(rc)) {
> -            pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
> -        }
> +            rc = ibv_req_notify_cq(ev_cq, 0);
> +            if (unlikely(rc)) {
> +                pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
> +            }
>   
> -        poll_cq(backend_dev->rdma_dev_res, ev_cq);
> +            poll_cq(backend_dev->rdma_dev_res, ev_cq);
>   
> -        ibv_ack_cq_events(ev_cq, 1);
> +            ibv_ack_cq_events(ev_cq, 1);
> +        }
>       }
>   
>       pr_dbg("Going down\n");
>   
>       /* TODO: Post cqe for all remaining buffs that were posted */
>   
> +    backend_dev->comp_thread.is_running = false;
> +
> +    qemu_thread_exit(0);
> +
>       return NULL;
>   }
>   
> +static void stop_comp_thread(RdmaBackendDev *backend_dev)
> +{
> +    backend_dev->comp_thread.run = false;
> +    while (backend_dev->comp_thread.is_running) {
> +        pr_dbg("Waiting for thread to complete\n");
> +        sleep(THR_POLL_TO / SCALE_US / 2);
> +    }
> +}
> +
> +static void start_comp_thread(RdmaBackendDev *backend_dev)
> +{
> +    char thread_name[THR_NAME_LEN] = {0};
> +
> +    stop_comp_thread(backend_dev);
> +
> +    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
> +             ibv_get_device_name(backend_dev->ib_dev));
> +    backend_dev->comp_thread.run = true;
> +    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
> +                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
> +}
> +
>   void rdma_backend_register_comp_handler(void (*handler)(int status,
>                                           unsigned int vendor_err, void *ctx))
>   {
> @@ -706,7 +754,6 @@ int rdma_backend_init(RdmaBackendDev *backend_dev,
>       int i;
>       int ret = 0;
>       int num_ibv_devices;
> -    char thread_name[THR_NAME_LEN] = {0};
>       struct ibv_device **dev_list;
>       struct ibv_port_attr port_attr;
>   
> @@ -800,11 +847,8 @@ int rdma_backend_init(RdmaBackendDev *backend_dev,
>       pr_dbg("interface_id=0x%" PRIx64 "\n",
>              be64_to_cpu(backend_dev->gid.global.interface_id));
>   
> -    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
> -             ibv_get_device_name(backend_dev->ib_dev));
> -    backend_dev->comp_thread.run = true;
> -    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
> -                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
> +    backend_dev->comp_thread.run = false;
> +    backend_dev->comp_thread.is_running = false;
>   
>       ah_cache_init();
>   
> @@ -823,8 +867,22 @@ out:
>       return ret;
>   }
>   
> +
> +void rdma_backend_start(RdmaBackendDev *backend_dev)
> +{
> +    pr_dbg("Starting rdma_backend\n");
> +    start_comp_thread(backend_dev);
> +}
> +
> +void rdma_backend_stop(RdmaBackendDev *backend_dev)
> +{
> +    pr_dbg("Stopping rdma_backend\n");
> +    stop_comp_thread(backend_dev);
> +}
> +
>   void rdma_backend_fini(RdmaBackendDev *backend_dev)
>   {
> +    rdma_backend_stop(backend_dev);
>       g_hash_table_destroy(ah_hash);
>       ibv_destroy_comp_channel(backend_dev->channel);
>       ibv_close_device(backend_dev->context);
> diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
> index 3cd636dd88..3049a73962 100644
> --- a/hw/rdma/rdma_backend.h
> +++ b/hw/rdma/rdma_backend.h
> @@ -52,6 +52,8 @@ int rdma_backend_init(RdmaBackendDev *backend_dev,
>                         uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
>                         Error **errp);
>   void rdma_backend_fini(RdmaBackendDev *backend_dev);
> +void rdma_backend_start(RdmaBackendDev *backend_dev);
> +void rdma_backend_stop(RdmaBackendDev *backend_dev);
>   void rdma_backend_register_comp_handler(void (*handler)(int status,
>                                           unsigned int vendor_err, void *ctx));
>   void rdma_backend_unregister_comp_handler(void);
> diff --git a/hw/rdma/rdma_backend_defs.h b/hw/rdma/rdma_backend_defs.h
> index ff5cfc26eb..7404f64002 100644
> --- a/hw/rdma/rdma_backend_defs.h
> +++ b/hw/rdma/rdma_backend_defs.h
> @@ -24,7 +24,8 @@ typedef struct RdmaDeviceResources RdmaDeviceResources;
>   typedef struct RdmaBackendThread {
>       QemuThread thread;
>       QemuMutex mutex;
> -    bool run;
> +    bool run; /* Set by thread manager to let thread know it should exit */
> +    bool is_running; /* Set by the thread to report its status */
>   } RdmaBackendThread;
>   
>   typedef struct RdmaBackendDev {
> diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
> index 3ed7409763..6a5073974d 100644
> --- a/hw/rdma/vmw/pvrdma_main.c
> +++ b/hw/rdma/vmw/pvrdma_main.c
> @@ -286,8 +286,78 @@ static void init_ports(PVRDMADev *dev, Error **errp)
>       }
>   }
>   
> +static void uninit_msix(PCIDevice *pdev, int used_vectors)
> +{
> +    PVRDMADev *dev = PVRDMA_DEV(pdev);
> +    int i;
> +
> +    for (i = 0; i < used_vectors; i++) {
> +        msix_vector_unuse(pdev, i);
> +    }
> +
> +    msix_uninit(pdev, &dev->msix, &dev->msix);
> +}
> +
> +static int init_msix(PCIDevice *pdev, Error **errp)
> +{
> +    PVRDMADev *dev = PVRDMA_DEV(pdev);
> +    int i;
> +    int rc;
> +
> +    rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX,
> +                   RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX,
> +                   RDMA_MSIX_PBA, 0, NULL);
> +
> +    if (rc < 0) {
> +        error_setg(errp, "Failed to initialize MSI-X");
> +        return rc;
> +    }
> +
> +    for (i = 0; i < RDMA_MAX_INTRS; i++) {
> +        rc = msix_vector_use(PCI_DEVICE(dev), i);
> +        if (rc < 0) {
> +            error_setg(errp, "Fail mark MSI-X vector %d", i);
> +            uninit_msix(pdev, i);
> +            return rc;
> +        }
> +    }
> +
> +    return 0;
> +}
> +

The above functions were only moved around the same file. Right?
I suggest keeping them as they were for easier review.

> +static void pvrdma_fini(PCIDevice *pdev)
> +{
> +    PVRDMADev *dev = PVRDMA_DEV(pdev);
> +
> +    pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn),
> +           PCI_FUNC(pdev->devfn));
> +
> +    pvrdma_qp_ops_fini();
> +
> +    rdma_rm_fini(&dev->rdma_dev_res);
> +
> +    rdma_backend_fini(&dev->backend_dev);
> +
> +    free_dsr(dev);
> +
> +    if (msix_enabled(pdev)) {
> +        uninit_msix(pdev, RDMA_MAX_INTRS);
> +    }
> +}
> +
> +static void pvrdma_stop(PVRDMADev *dev)
> +{
> +    rdma_backend_stop(&dev->backend_dev);
> +}
> +
> +static void pvrdma_start(PVRDMADev *dev)
> +{
> +    rdma_backend_start(&dev->backend_dev);
> +}
> +

You might not need the above functions for now.

>   static void activate_device(PVRDMADev *dev)
>   {
> +    pvrdma_start(dev);
>       set_reg_val(dev, PVRDMA_REG_ERR, 0);
>       pr_dbg("Device activated\n");
>   }
> @@ -300,7 +370,10 @@ static int unquiesce_device(PVRDMADev *dev)
>   
>   static int reset_device(PVRDMADev *dev)
>   {
> +    pvrdma_stop(dev);
> +
>       pr_dbg("Device reset complete\n");
> +
>       return 0;
>   }
>   
> @@ -469,45 +542,6 @@ static void init_regs(PCIDevice *pdev)
>       set_reg_val(dev, PVRDMA_REG_ERR, 0xFFFF);
>   }
>   
> -static void uninit_msix(PCIDevice *pdev, int used_vectors)
> -{
> -    PVRDMADev *dev = PVRDMA_DEV(pdev);
> -    int i;
> -
> -    for (i = 0; i < used_vectors; i++) {
> -        msix_vector_unuse(pdev, i);
> -    }
> -
> -    msix_uninit(pdev, &dev->msix, &dev->msix);
> -}
> -
> -static int init_msix(PCIDevice *pdev, Error **errp)
> -{
> -    PVRDMADev *dev = PVRDMA_DEV(pdev);
> -    int i;
> -    int rc;
> -
> -    rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX,
> -                   RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX,
> -                   RDMA_MSIX_PBA, 0, NULL);
> -
> -    if (rc < 0) {
> -        error_setg(errp, "Failed to initialize MSI-X");
> -        return rc;
> -    }
> -
> -    for (i = 0; i < RDMA_MAX_INTRS; i++) {
> -        rc = msix_vector_use(PCI_DEVICE(dev), i);
> -        if (rc < 0) {
> -            error_setg(errp, "Fail mark MSI-X vercor %d", i);
> -            uninit_msix(pdev, i);
> -            return rc;
> -        }
> -    }
> -
> -    return 0;
> -}
> -
>   static void init_dev_caps(PVRDMADev *dev)
>   {
>       size_t pg_tbl_bytes = TARGET_PAGE_SIZE *
> @@ -602,22 +636,7 @@ out:
>   
>   static void pvrdma_exit(PCIDevice *pdev)
>   {
> -    PVRDMADev *dev = PVRDMA_DEV(pdev);
> -
> -    pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn),
> -           PCI_FUNC(pdev->devfn));
> -
> -    pvrdma_qp_ops_fini();
> -
> -    rdma_rm_fini(&dev->rdma_dev_res);
> -
> -    rdma_backend_fini(&dev->backend_dev);
> -
> -    free_dsr(dev);
> -
> -    if (msix_enabled(pdev)) {
> -        uninit_msix(pdev, RDMA_MAX_INTRS);
> -    }
> +    pvrdma_fini(pdev);
>   }
>   
>   static void pvrdma_class_init(ObjectClass *klass, void *data)



Reviewed-by: Marcel Apfelbaum<marcel.apfelbaum@gmail.com>

Thanks,
Marcel
Yuval Shaia July 24, 2018, 7:29 p.m. UTC | #2
On Tue, Jul 24, 2018 at 03:08:10PM +0300, Marcel Apfelbaum wrote:
> Hi Yuval,
> 
> On 07/16/2018 10:40 AM, Yuval Shaia wrote:
> > There are certain operations that are well considered as part of device
> > configuration while others are needed only when "start" command is
> > triggered by the guest driver. An example of device initialization step
> > is msix_init and example of "device start" stage is the creation of a CQ
> > completion handler thread.
> > 
> > Driver expects such distinction - implement it.
> > 
> > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> > ---
> >   hw/rdma/rdma_backend.c      |  96 +++++++++++++++++++++------
> >   hw/rdma/rdma_backend.h      |   2 +
> >   hw/rdma/rdma_backend_defs.h |   3 +-
> >   hw/rdma/vmw/pvrdma_main.c   | 129 +++++++++++++++++++++---------------
> >   4 files changed, 155 insertions(+), 75 deletions(-)
> > 
> > diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
> > index e9ced6f9ef..647e16399f 100644
> > --- a/hw/rdma/rdma_backend.c
> > +++ b/hw/rdma/rdma_backend.c
> > @@ -35,6 +35,7 @@
> >   #define VENDOR_ERR_MR_SMALL         0x208
> >   #define THR_NAME_LEN 16
> > +#define THR_POLL_TO  5000
> >   typedef struct BackendCtx {
> >       uint64_t req_id;
> > @@ -91,35 +92,82 @@ static void *comp_handler_thread(void *arg)
> >       int rc;
> >       struct ibv_cq *ev_cq;
> >       void *ev_ctx;
> > +    int flags;
> > +    GPollFD pfds[1];
> > +
> > +    /* Change to non-blocking mode */
> > +    flags = fcntl(backend_dev->channel->fd, F_GETFL);
> > +    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
> > +    if (rc < 0) {
> > +        pr_dbg("Fail to change to non-blocking mode\n");
> > +        return NULL;
> > +    }
> >       pr_dbg("Starting\n");
> > +    pfds[0].fd = backend_dev->channel->fd;
> > +    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
> > +
> > +    backend_dev->comp_thread.is_running = true;
> > +
> >       while (backend_dev->comp_thread.run) {
> > -        pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel);
> > -        rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
> > -        pr_dbg("ibv_get_cq_event=%d\n", rc);
> > -        if (unlikely(rc)) {
> > -            pr_dbg("---> ibv_get_cq_event (%d)\n", rc);
> > -            continue;
> > -        }
> > +        do {
> > +            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
> > +        } while (!rc && backend_dev->comp_thread.run);
> > +
> > +        if (backend_dev->comp_thread.run) {
> > +            pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel);
> > +            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
> > +            pr_dbg("ibv_get_cq_event=%d\n", rc);
> > +            if (unlikely(rc)) {
> > +                pr_dbg("---> ibv_get_cq_event (%d)\n", rc);
> > +                continue;
> > +            }
> > -        rc = ibv_req_notify_cq(ev_cq, 0);
> > -        if (unlikely(rc)) {
> > -            pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
> > -        }
> > +            rc = ibv_req_notify_cq(ev_cq, 0);
> > +            if (unlikely(rc)) {
> > +                pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
> > +            }
> > -        poll_cq(backend_dev->rdma_dev_res, ev_cq);
> > +            poll_cq(backend_dev->rdma_dev_res, ev_cq);
> > -        ibv_ack_cq_events(ev_cq, 1);
> > +            ibv_ack_cq_events(ev_cq, 1);
> > +        }
> >       }
> >       pr_dbg("Going down\n");
> >       /* TODO: Post cqe for all remaining buffs that were posted */
> > +    backend_dev->comp_thread.is_running = false;
> > +
> > +    qemu_thread_exit(0);
> > +
> >       return NULL;
> >   }
> > +static void stop_comp_thread(RdmaBackendDev *backend_dev)
> > +{
> > +    backend_dev->comp_thread.run = false;
> > +    while (backend_dev->comp_thread.is_running) {
> > +        pr_dbg("Waiting for thread to complete\n");
> > +        sleep(THR_POLL_TO / SCALE_US / 2);
> > +    }
> > +}
> > +
> > +static void start_comp_thread(RdmaBackendDev *backend_dev)
> > +{
> > +    char thread_name[THR_NAME_LEN] = {0};
> > +
> > +    stop_comp_thread(backend_dev);
> > +
> > +    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
> > +             ibv_get_device_name(backend_dev->ib_dev));
> > +    backend_dev->comp_thread.run = true;
> > +    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
> > +                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
> > +}
> > +
> >   void rdma_backend_register_comp_handler(void (*handler)(int status,
> >                                           unsigned int vendor_err, void *ctx))
> >   {
> > @@ -706,7 +754,6 @@ int rdma_backend_init(RdmaBackendDev *backend_dev,
> >       int i;
> >       int ret = 0;
> >       int num_ibv_devices;
> > -    char thread_name[THR_NAME_LEN] = {0};
> >       struct ibv_device **dev_list;
> >       struct ibv_port_attr port_attr;
> > @@ -800,11 +847,8 @@ int rdma_backend_init(RdmaBackendDev *backend_dev,
> >       pr_dbg("interface_id=0x%" PRIx64 "\n",
> >              be64_to_cpu(backend_dev->gid.global.interface_id));
> > -    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
> > -             ibv_get_device_name(backend_dev->ib_dev));
> > -    backend_dev->comp_thread.run = true;
> > -    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
> > -                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
> > +    backend_dev->comp_thread.run = false;
> > +    backend_dev->comp_thread.is_running = false;
> >       ah_cache_init();
> > @@ -823,8 +867,22 @@ out:
> >       return ret;
> >   }
> > +
> > +void rdma_backend_start(RdmaBackendDev *backend_dev)
> > +{
> > +    pr_dbg("Starting rdma_backend\n");
> > +    start_comp_thread(backend_dev);
> > +}
> > +
> > +void rdma_backend_stop(RdmaBackendDev *backend_dev)
> > +{
> > +    pr_dbg("Stopping rdma_backend\n");
> > +    stop_comp_thread(backend_dev);
> > +}
> > +
> >   void rdma_backend_fini(RdmaBackendDev *backend_dev)
> >   {
> > +    rdma_backend_stop(backend_dev);
> >       g_hash_table_destroy(ah_hash);
> >       ibv_destroy_comp_channel(backend_dev->channel);
> >       ibv_close_device(backend_dev->context);
> > diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
> > index 3cd636dd88..3049a73962 100644
> > --- a/hw/rdma/rdma_backend.h
> > +++ b/hw/rdma/rdma_backend.h
> > @@ -52,6 +52,8 @@ int rdma_backend_init(RdmaBackendDev *backend_dev,
> >                         uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
> >                         Error **errp);
> >   void rdma_backend_fini(RdmaBackendDev *backend_dev);
> > +void rdma_backend_start(RdmaBackendDev *backend_dev);
> > +void rdma_backend_stop(RdmaBackendDev *backend_dev);
> >   void rdma_backend_register_comp_handler(void (*handler)(int status,
> >                                           unsigned int vendor_err, void *ctx));
> >   void rdma_backend_unregister_comp_handler(void);
> > diff --git a/hw/rdma/rdma_backend_defs.h b/hw/rdma/rdma_backend_defs.h
> > index ff5cfc26eb..7404f64002 100644
> > --- a/hw/rdma/rdma_backend_defs.h
> > +++ b/hw/rdma/rdma_backend_defs.h
> > @@ -24,7 +24,8 @@ typedef struct RdmaDeviceResources RdmaDeviceResources;
> >   typedef struct RdmaBackendThread {
> >       QemuThread thread;
> >       QemuMutex mutex;
> > -    bool run;
> > +    bool run; /* Set by thread manager to let thread know it should exit */
> > +    bool is_running; /* Set by the thread to report its status */
> >   } RdmaBackendThread;
> >   typedef struct RdmaBackendDev {
> > diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
> > index 3ed7409763..6a5073974d 100644
> > --- a/hw/rdma/vmw/pvrdma_main.c
> > +++ b/hw/rdma/vmw/pvrdma_main.c
> > @@ -286,8 +286,78 @@ static void init_ports(PVRDMADev *dev, Error **errp)
> >       }
> >   }
> > +static void uninit_msix(PCIDevice *pdev, int used_vectors)
> > +{
> > +    PVRDMADev *dev = PVRDMA_DEV(pdev);
> > +    int i;
> > +
> > +    for (i = 0; i < used_vectors; i++) {
> > +        msix_vector_unuse(pdev, i);
> > +    }
> > +
> > +    msix_uninit(pdev, &dev->msix, &dev->msix);
> > +}
> > +
> > +static int init_msix(PCIDevice *pdev, Error **errp)
> > +{
> > +    PVRDMADev *dev = PVRDMA_DEV(pdev);
> > +    int i;
> > +    int rc;
> > +
> > +    rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX,
> > +                   RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX,
> > +                   RDMA_MSIX_PBA, 0, NULL);
> > +
> > +    if (rc < 0) {
> > +        error_setg(errp, "Failed to initialize MSI-X");
> > +        return rc;
> > +    }
> > +
> > +    for (i = 0; i < RDMA_MAX_INTRS; i++) {
> > +        rc = msix_vector_use(PCI_DEVICE(dev), i);
> > +        if (rc < 0) {
> > +            error_setg(errp, "Fail mark MSI-X vector %d", i);
> > +            uninit_msix(pdev, i);
> > +            return rc;
> > +        }
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> 
> The above functions were only moved around the same file. Right?

Correct.

> I suggest keeping them as they were for easier review.

I think that this is what i did, the rest is the work of diff :)

Anyways, functions should be gathered as groups and now looks like they are
so better have one-time-"hard"-review and have file looks organized.

> 
> > +static void pvrdma_fini(PCIDevice *pdev)
> > +{
> > +    PVRDMADev *dev = PVRDMA_DEV(pdev);
> > +
> > +    pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn),
> > +           PCI_FUNC(pdev->devfn));
> > +
> > +    pvrdma_qp_ops_fini();
> > +
> > +    rdma_rm_fini(&dev->rdma_dev_res);
> > +
> > +    rdma_backend_fini(&dev->backend_dev);
> > +
> > +    free_dsr(dev);
> > +
> > +    if (msix_enabled(pdev)) {
> > +        uninit_msix(pdev, RDMA_MAX_INTRS);
> > +    }
> > +}
> > +
> > +static void pvrdma_stop(PVRDMADev *dev)
> > +{
> > +    rdma_backend_stop(&dev->backend_dev);
> > +}
> > +
> > +static void pvrdma_start(PVRDMADev *dev)
> > +{
> > +    rdma_backend_start(&dev->backend_dev);
> > +}
> > +
> 
> You might not need the above functions for now.

Yeah, agree but how about leave it just as place holder for others that
will probably join and also from modularity perspectives better that caller
will not be aware of what exactly needs to be start/stop.

Thought?

> 
> >   static void activate_device(PVRDMADev *dev)
> >   {
> > +    pvrdma_start(dev);
> >       set_reg_val(dev, PVRDMA_REG_ERR, 0);
> >       pr_dbg("Device activated\n");
> >   }
> > @@ -300,7 +370,10 @@ static int unquiesce_device(PVRDMADev *dev)
> >   static int reset_device(PVRDMADev *dev)
> >   {
> > +    pvrdma_stop(dev);
> > +
> >       pr_dbg("Device reset complete\n");
> > +
> >       return 0;
> >   }
> > @@ -469,45 +542,6 @@ static void init_regs(PCIDevice *pdev)
> >       set_reg_val(dev, PVRDMA_REG_ERR, 0xFFFF);
> >   }
> > -static void uninit_msix(PCIDevice *pdev, int used_vectors)
> > -{
> > -    PVRDMADev *dev = PVRDMA_DEV(pdev);
> > -    int i;
> > -
> > -    for (i = 0; i < used_vectors; i++) {
> > -        msix_vector_unuse(pdev, i);
> > -    }
> > -
> > -    msix_uninit(pdev, &dev->msix, &dev->msix);
> > -}
> > -
> > -static int init_msix(PCIDevice *pdev, Error **errp)
> > -{
> > -    PVRDMADev *dev = PVRDMA_DEV(pdev);
> > -    int i;
> > -    int rc;
> > -
> > -    rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX,
> > -                   RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX,
> > -                   RDMA_MSIX_PBA, 0, NULL);
> > -
> > -    if (rc < 0) {
> > -        error_setg(errp, "Failed to initialize MSI-X");
> > -        return rc;
> > -    }
> > -
> > -    for (i = 0; i < RDMA_MAX_INTRS; i++) {
> > -        rc = msix_vector_use(PCI_DEVICE(dev), i);
> > -        if (rc < 0) {
> > -            error_setg(errp, "Fail mark MSI-X vercor %d", i);
> > -            uninit_msix(pdev, i);
> > -            return rc;
> > -        }
> > -    }
> > -
> > -    return 0;
> > -}
> > -
> >   static void init_dev_caps(PVRDMADev *dev)
> >   {
> >       size_t pg_tbl_bytes = TARGET_PAGE_SIZE *
> > @@ -602,22 +636,7 @@ out:
> >   static void pvrdma_exit(PCIDevice *pdev)
> >   {
> > -    PVRDMADev *dev = PVRDMA_DEV(pdev);
> > -
> > -    pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn),
> > -           PCI_FUNC(pdev->devfn));
> > -
> > -    pvrdma_qp_ops_fini();
> > -
> > -    rdma_rm_fini(&dev->rdma_dev_res);
> > -
> > -    rdma_backend_fini(&dev->backend_dev);
> > -
> > -    free_dsr(dev);
> > -
> > -    if (msix_enabled(pdev)) {
> > -        uninit_msix(pdev, RDMA_MAX_INTRS);
> > -    }
> > +    pvrdma_fini(pdev);
> >   }
> >   static void pvrdma_class_init(ObjectClass *klass, void *data)
> 
> 
> 
> Reviewed-by: Marcel Apfelbaum<marcel.apfelbaum@gmail.com>
> 
> Thanks,
> Marcel
Marcel Apfelbaum July 24, 2018, 7:53 p.m. UTC | #3
On 07/24/2018 10:29 PM, Yuval Shaia wrote:
> On Tue, Jul 24, 2018 at 03:08:10PM +0300, Marcel Apfelbaum wrote:
>> Hi Yuval,
>>
>> On 07/16/2018 10:40 AM, Yuval Shaia wrote:
>>> There are certain operations that are well considered as part of device
>>> configuration while others are needed only when "start" command is
>>> triggered by the guest driver. An example of device initialization step
>>> is msix_init and example of "device start" stage is the creation of a CQ
>>> completion handler thread.
>>>
>>> Driver expects such distinction - implement it.
>>>
>>> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
>>> ---
>>>    hw/rdma/rdma_backend.c      |  96 +++++++++++++++++++++------
>>>    hw/rdma/rdma_backend.h      |   2 +
>>>    hw/rdma/rdma_backend_defs.h |   3 +-
>>>    hw/rdma/vmw/pvrdma_main.c   | 129 +++++++++++++++++++++---------------
>>>    4 files changed, 155 insertions(+), 75 deletions(-)
>>>
>>> diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
>>> index e9ced6f9ef..647e16399f 100644
>>> --- a/hw/rdma/rdma_backend.c
>>> +++ b/hw/rdma/rdma_backend.c
>>> @@ -35,6 +35,7 @@
>>>    #define VENDOR_ERR_MR_SMALL         0x208
>>>    #define THR_NAME_LEN 16
>>> +#define THR_POLL_TO  5000
>>>    typedef struct BackendCtx {
>>>        uint64_t req_id;
>>> @@ -91,35 +92,82 @@ static void *comp_handler_thread(void *arg)
>>>        int rc;
>>>        struct ibv_cq *ev_cq;
>>>        void *ev_ctx;
>>> +    int flags;
>>> +    GPollFD pfds[1];
>>> +
>>> +    /* Change to non-blocking mode */
>>> +    flags = fcntl(backend_dev->channel->fd, F_GETFL);
>>> +    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
>>> +    if (rc < 0) {
>>> +        pr_dbg("Fail to change to non-blocking mode\n");
>>> +        return NULL;
>>> +    }
>>>        pr_dbg("Starting\n");
>>> +    pfds[0].fd = backend_dev->channel->fd;
>>> +    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
>>> +
>>> +    backend_dev->comp_thread.is_running = true;
>>> +
>>>        while (backend_dev->comp_thread.run) {
>>> -        pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel);
>>> -        rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
>>> -        pr_dbg("ibv_get_cq_event=%d\n", rc);
>>> -        if (unlikely(rc)) {
>>> -            pr_dbg("---> ibv_get_cq_event (%d)\n", rc);
>>> -            continue;
>>> -        }
>>> +        do {
>>> +            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
>>> +        } while (!rc && backend_dev->comp_thread.run);
>>> +
>>> +        if (backend_dev->comp_thread.run) {
>>> +            pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel);
>>> +            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
>>> +            pr_dbg("ibv_get_cq_event=%d\n", rc);
>>> +            if (unlikely(rc)) {
>>> +                pr_dbg("---> ibv_get_cq_event (%d)\n", rc);
>>> +                continue;
>>> +            }
>>> -        rc = ibv_req_notify_cq(ev_cq, 0);
>>> -        if (unlikely(rc)) {
>>> -            pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
>>> -        }
>>> +            rc = ibv_req_notify_cq(ev_cq, 0);
>>> +            if (unlikely(rc)) {
>>> +                pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
>>> +            }
>>> -        poll_cq(backend_dev->rdma_dev_res, ev_cq);
>>> +            poll_cq(backend_dev->rdma_dev_res, ev_cq);
>>> -        ibv_ack_cq_events(ev_cq, 1);
>>> +            ibv_ack_cq_events(ev_cq, 1);
>>> +        }
>>>        }
>>>        pr_dbg("Going down\n");
>>>        /* TODO: Post cqe for all remaining buffs that were posted */
>>> +    backend_dev->comp_thread.is_running = false;
>>> +
>>> +    qemu_thread_exit(0);
>>> +
>>>        return NULL;
>>>    }
>>> +static void stop_comp_thread(RdmaBackendDev *backend_dev)
>>> +{
>>> +    backend_dev->comp_thread.run = false;
>>> +    while (backend_dev->comp_thread.is_running) {
>>> +        pr_dbg("Waiting for thread to complete\n");
>>> +        sleep(THR_POLL_TO / SCALE_US / 2);
>>> +    }
>>> +}
>>> +
>>> +static void start_comp_thread(RdmaBackendDev *backend_dev)
>>> +{
>>> +    char thread_name[THR_NAME_LEN] = {0};
>>> +
>>> +    stop_comp_thread(backend_dev);
>>> +
>>> +    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
>>> +             ibv_get_device_name(backend_dev->ib_dev));
>>> +    backend_dev->comp_thread.run = true;
>>> +    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
>>> +                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
>>> +}
>>> +
>>>    void rdma_backend_register_comp_handler(void (*handler)(int status,
>>>                                            unsigned int vendor_err, void *ctx))
>>>    {
>>> @@ -706,7 +754,6 @@ int rdma_backend_init(RdmaBackendDev *backend_dev,
>>>        int i;
>>>        int ret = 0;
>>>        int num_ibv_devices;
>>> -    char thread_name[THR_NAME_LEN] = {0};
>>>        struct ibv_device **dev_list;
>>>        struct ibv_port_attr port_attr;
>>> @@ -800,11 +847,8 @@ int rdma_backend_init(RdmaBackendDev *backend_dev,
>>>        pr_dbg("interface_id=0x%" PRIx64 "\n",
>>>               be64_to_cpu(backend_dev->gid.global.interface_id));
>>> -    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
>>> -             ibv_get_device_name(backend_dev->ib_dev));
>>> -    backend_dev->comp_thread.run = true;
>>> -    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
>>> -                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
>>> +    backend_dev->comp_thread.run = false;
>>> +    backend_dev->comp_thread.is_running = false;
>>>        ah_cache_init();
>>> @@ -823,8 +867,22 @@ out:
>>>        return ret;
>>>    }
>>> +
>>> +void rdma_backend_start(RdmaBackendDev *backend_dev)
>>> +{
>>> +    pr_dbg("Starting rdma_backend\n");
>>> +    start_comp_thread(backend_dev);
>>> +}
>>> +
>>> +void rdma_backend_stop(RdmaBackendDev *backend_dev)
>>> +{
>>> +    pr_dbg("Stopping rdma_backend\n");
>>> +    stop_comp_thread(backend_dev);
>>> +}
>>> +
>>>    void rdma_backend_fini(RdmaBackendDev *backend_dev)
>>>    {
>>> +    rdma_backend_stop(backend_dev);
>>>        g_hash_table_destroy(ah_hash);
>>>        ibv_destroy_comp_channel(backend_dev->channel);
>>>        ibv_close_device(backend_dev->context);
>>> diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
>>> index 3cd636dd88..3049a73962 100644
>>> --- a/hw/rdma/rdma_backend.h
>>> +++ b/hw/rdma/rdma_backend.h
>>> @@ -52,6 +52,8 @@ int rdma_backend_init(RdmaBackendDev *backend_dev,
>>>                          uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
>>>                          Error **errp);
>>>    void rdma_backend_fini(RdmaBackendDev *backend_dev);
>>> +void rdma_backend_start(RdmaBackendDev *backend_dev);
>>> +void rdma_backend_stop(RdmaBackendDev *backend_dev);
>>>    void rdma_backend_register_comp_handler(void (*handler)(int status,
>>>                                            unsigned int vendor_err, void *ctx));
>>>    void rdma_backend_unregister_comp_handler(void);
>>> diff --git a/hw/rdma/rdma_backend_defs.h b/hw/rdma/rdma_backend_defs.h
>>> index ff5cfc26eb..7404f64002 100644
>>> --- a/hw/rdma/rdma_backend_defs.h
>>> +++ b/hw/rdma/rdma_backend_defs.h
>>> @@ -24,7 +24,8 @@ typedef struct RdmaDeviceResources RdmaDeviceResources;
>>>    typedef struct RdmaBackendThread {
>>>        QemuThread thread;
>>>        QemuMutex mutex;
>>> -    bool run;
>>> +    bool run; /* Set by thread manager to let thread know it should exit */
>>> +    bool is_running; /* Set by the thread to report its status */
>>>    } RdmaBackendThread;
>>>    typedef struct RdmaBackendDev {
>>> diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
>>> index 3ed7409763..6a5073974d 100644
>>> --- a/hw/rdma/vmw/pvrdma_main.c
>>> +++ b/hw/rdma/vmw/pvrdma_main.c
>>> @@ -286,8 +286,78 @@ static void init_ports(PVRDMADev *dev, Error **errp)
>>>        }
>>>    }
>>> +static void uninit_msix(PCIDevice *pdev, int used_vectors)
>>> +{
>>> +    PVRDMADev *dev = PVRDMA_DEV(pdev);
>>> +    int i;
>>> +
>>> +    for (i = 0; i < used_vectors; i++) {
>>> +        msix_vector_unuse(pdev, i);
>>> +    }
>>> +
>>> +    msix_uninit(pdev, &dev->msix, &dev->msix);
>>> +}
>>> +
>>> +static int init_msix(PCIDevice *pdev, Error **errp)
>>> +{
>>> +    PVRDMADev *dev = PVRDMA_DEV(pdev);
>>> +    int i;
>>> +    int rc;
>>> +
>>> +    rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX,
>>> +                   RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX,
>>> +                   RDMA_MSIX_PBA, 0, NULL);
>>> +
>>> +    if (rc < 0) {
>>> +        error_setg(errp, "Failed to initialize MSI-X");
>>> +        return rc;
>>> +    }
>>> +
>>> +    for (i = 0; i < RDMA_MAX_INTRS; i++) {
>>> +        rc = msix_vector_use(PCI_DEVICE(dev), i);
>>> +        if (rc < 0) {
>>> +            error_setg(errp, "Fail mark MSI-X vector %d", i);
>>> +            uninit_msix(pdev, i);
>>> +            return rc;
>>> +        }
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>> The above functions were only moved around the same file. Right?
> Correct.
>
>> I suggest keeping them as they were for easier review.
> I think that this is what i did, the rest is the work of diff :)
>
> Anyways, functions should be gathered as groups and now looks like they are
> so better have one-time-"hard"-review and have file looks organized.
>
>>> +static void pvrdma_fini(PCIDevice *pdev)
>>> +{
>>> +    PVRDMADev *dev = PVRDMA_DEV(pdev);
>>> +
>>> +    pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn),
>>> +           PCI_FUNC(pdev->devfn));
>>> +
>>> +    pvrdma_qp_ops_fini();
>>> +
>>> +    rdma_rm_fini(&dev->rdma_dev_res);
>>> +
>>> +    rdma_backend_fini(&dev->backend_dev);
>>> +
>>> +    free_dsr(dev);
>>> +
>>> +    if (msix_enabled(pdev)) {
>>> +        uninit_msix(pdev, RDMA_MAX_INTRS);
>>> +    }
>>> +}
>>> +
>>> +static void pvrdma_stop(PVRDMADev *dev)
>>> +{
>>> +    rdma_backend_stop(&dev->backend_dev);
>>> +}
>>> +
>>> +static void pvrdma_start(PVRDMADev *dev)
>>> +{
>>> +    rdma_backend_start(&dev->backend_dev);
>>> +}
>>> +
>> You might not need the above functions for now.
> Yeah, agree but how about leave it just as place holder for others that
> will probably join and also from modularity perspectives better that caller
> will not be aware of what exactly needs to be start/stop.

Sure.

Thanks,
Marcel

>
> Thought?
>>>    static void activate_device(PVRDMADev *dev)
>>>    {
>>> +    pvrdma_start(dev);
>>>        set_reg_val(dev, PVRDMA_REG_ERR, 0);
>>>        pr_dbg("Device activated\n");
>>>    }
>>> @@ -300,7 +370,10 @@ static int unquiesce_device(PVRDMADev *dev)
>>>    static int reset_device(PVRDMADev *dev)
>>>    {
>>> +    pvrdma_stop(dev);
>>> +
>>>        pr_dbg("Device reset complete\n");
>>> +
>>>        return 0;
>>>    }
>>> @@ -469,45 +542,6 @@ static void init_regs(PCIDevice *pdev)
>>>        set_reg_val(dev, PVRDMA_REG_ERR, 0xFFFF);
>>>    }
>>> -static void uninit_msix(PCIDevice *pdev, int used_vectors)
>>> -{
>>> -    PVRDMADev *dev = PVRDMA_DEV(pdev);
>>> -    int i;
>>> -
>>> -    for (i = 0; i < used_vectors; i++) {
>>> -        msix_vector_unuse(pdev, i);
>>> -    }
>>> -
>>> -    msix_uninit(pdev, &dev->msix, &dev->msix);
>>> -}
>>> -
>>> -static int init_msix(PCIDevice *pdev, Error **errp)
>>> -{
>>> -    PVRDMADev *dev = PVRDMA_DEV(pdev);
>>> -    int i;
>>> -    int rc;
>>> -
>>> -    rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX,
>>> -                   RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX,
>>> -                   RDMA_MSIX_PBA, 0, NULL);
>>> -
>>> -    if (rc < 0) {
>>> -        error_setg(errp, "Failed to initialize MSI-X");
>>> -        return rc;
>>> -    }
>>> -
>>> -    for (i = 0; i < RDMA_MAX_INTRS; i++) {
>>> -        rc = msix_vector_use(PCI_DEVICE(dev), i);
>>> -        if (rc < 0) {
>>> -            error_setg(errp, "Fail mark MSI-X vercor %d", i);
>>> -            uninit_msix(pdev, i);
>>> -            return rc;
>>> -        }
>>> -    }
>>> -
>>> -    return 0;
>>> -}
>>> -
>>>    static void init_dev_caps(PVRDMADev *dev)
>>>    {
>>>        size_t pg_tbl_bytes = TARGET_PAGE_SIZE *
>>> @@ -602,22 +636,7 @@ out:
>>>    static void pvrdma_exit(PCIDevice *pdev)
>>>    {
>>> -    PVRDMADev *dev = PVRDMA_DEV(pdev);
>>> -
>>> -    pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn),
>>> -           PCI_FUNC(pdev->devfn));
>>> -
>>> -    pvrdma_qp_ops_fini();
>>> -
>>> -    rdma_rm_fini(&dev->rdma_dev_res);
>>> -
>>> -    rdma_backend_fini(&dev->backend_dev);
>>> -
>>> -    free_dsr(dev);
>>> -
>>> -    if (msix_enabled(pdev)) {
>>> -        uninit_msix(pdev, RDMA_MAX_INTRS);
>>> -    }
>>> +    pvrdma_fini(pdev);
>>>    }
>>>    static void pvrdma_class_init(ObjectClass *klass, void *data)
>>
>>
>> Reviewed-by: Marcel Apfelbaum<marcel.apfelbaum@gmail.com>
>>
>> Thanks,
>> Marcel
diff mbox

Patch

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index e9ced6f9ef..647e16399f 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -35,6 +35,7 @@ 
 #define VENDOR_ERR_MR_SMALL         0x208
 
 #define THR_NAME_LEN 16
+#define THR_POLL_TO  5000
 
 typedef struct BackendCtx {
     uint64_t req_id;
@@ -91,35 +92,82 @@  static void *comp_handler_thread(void *arg)
     int rc;
     struct ibv_cq *ev_cq;
     void *ev_ctx;
+    int flags;
+    GPollFD pfds[1];
+
+    /* Change to non-blocking mode */
+    flags = fcntl(backend_dev->channel->fd, F_GETFL);
+    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
+    if (rc < 0) {
+        pr_dbg("Fail to change to non-blocking mode\n");
+        return NULL;
+    }
 
     pr_dbg("Starting\n");
 
+    pfds[0].fd = backend_dev->channel->fd;
+    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
+
+    backend_dev->comp_thread.is_running = true;
+
     while (backend_dev->comp_thread.run) {
-        pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel);
-        rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
-        pr_dbg("ibv_get_cq_event=%d\n", rc);
-        if (unlikely(rc)) {
-            pr_dbg("---> ibv_get_cq_event (%d)\n", rc);
-            continue;
-        }
+        do {
+            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
+        } while (!rc && backend_dev->comp_thread.run);
+
+        if (backend_dev->comp_thread.run) {
+            pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel);
+            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
+            pr_dbg("ibv_get_cq_event=%d\n", rc);
+            if (unlikely(rc)) {
+                pr_dbg("---> ibv_get_cq_event (%d)\n", rc);
+                continue;
+            }
 
-        rc = ibv_req_notify_cq(ev_cq, 0);
-        if (unlikely(rc)) {
-            pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
-        }
+            rc = ibv_req_notify_cq(ev_cq, 0);
+            if (unlikely(rc)) {
+                pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
+            }
 
-        poll_cq(backend_dev->rdma_dev_res, ev_cq);
+            poll_cq(backend_dev->rdma_dev_res, ev_cq);
 
-        ibv_ack_cq_events(ev_cq, 1);
+            ibv_ack_cq_events(ev_cq, 1);
+        }
     }
 
     pr_dbg("Going down\n");
 
     /* TODO: Post cqe for all remaining buffs that were posted */
 
+    backend_dev->comp_thread.is_running = false;
+
+    qemu_thread_exit(0);
+
     return NULL;
 }
 
+static void stop_comp_thread(RdmaBackendDev *backend_dev)
+{
+    backend_dev->comp_thread.run = false;
+    while (backend_dev->comp_thread.is_running) {
+        pr_dbg("Waiting for thread to complete\n");
+        sleep(THR_POLL_TO / SCALE_US / 2);
+    }
+}
+
+static void start_comp_thread(RdmaBackendDev *backend_dev)
+{
+    char thread_name[THR_NAME_LEN] = {0};
+
+    stop_comp_thread(backend_dev);
+
+    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
+             ibv_get_device_name(backend_dev->ib_dev));
+    backend_dev->comp_thread.run = true;
+    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
+                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
+}
+
 void rdma_backend_register_comp_handler(void (*handler)(int status,
                                         unsigned int vendor_err, void *ctx))
 {
@@ -706,7 +754,6 @@  int rdma_backend_init(RdmaBackendDev *backend_dev,
     int i;
     int ret = 0;
     int num_ibv_devices;
-    char thread_name[THR_NAME_LEN] = {0};
     struct ibv_device **dev_list;
     struct ibv_port_attr port_attr;
 
@@ -800,11 +847,8 @@  int rdma_backend_init(RdmaBackendDev *backend_dev,
     pr_dbg("interface_id=0x%" PRIx64 "\n",
            be64_to_cpu(backend_dev->gid.global.interface_id));
 
-    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
-             ibv_get_device_name(backend_dev->ib_dev));
-    backend_dev->comp_thread.run = true;
-    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
-                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
+    backend_dev->comp_thread.run = false;
+    backend_dev->comp_thread.is_running = false;
 
     ah_cache_init();
 
@@ -823,8 +867,22 @@  out:
     return ret;
 }
 
+
+void rdma_backend_start(RdmaBackendDev *backend_dev)
+{
+    pr_dbg("Starting rdma_backend\n");
+    start_comp_thread(backend_dev);
+}
+
+void rdma_backend_stop(RdmaBackendDev *backend_dev)
+{
+    pr_dbg("Stopping rdma_backend\n");
+    stop_comp_thread(backend_dev);
+}
+
 void rdma_backend_fini(RdmaBackendDev *backend_dev)
 {
+    rdma_backend_stop(backend_dev);
     g_hash_table_destroy(ah_hash);
     ibv_destroy_comp_channel(backend_dev->channel);
     ibv_close_device(backend_dev->context);
diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
index 3cd636dd88..3049a73962 100644
--- a/hw/rdma/rdma_backend.h
+++ b/hw/rdma/rdma_backend.h
@@ -52,6 +52,8 @@  int rdma_backend_init(RdmaBackendDev *backend_dev,
                       uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
                       Error **errp);
 void rdma_backend_fini(RdmaBackendDev *backend_dev);
+void rdma_backend_start(RdmaBackendDev *backend_dev);
+void rdma_backend_stop(RdmaBackendDev *backend_dev);
 void rdma_backend_register_comp_handler(void (*handler)(int status,
                                         unsigned int vendor_err, void *ctx));
 void rdma_backend_unregister_comp_handler(void);
diff --git a/hw/rdma/rdma_backend_defs.h b/hw/rdma/rdma_backend_defs.h
index ff5cfc26eb..7404f64002 100644
--- a/hw/rdma/rdma_backend_defs.h
+++ b/hw/rdma/rdma_backend_defs.h
@@ -24,7 +24,8 @@  typedef struct RdmaDeviceResources RdmaDeviceResources;
 typedef struct RdmaBackendThread {
     QemuThread thread;
     QemuMutex mutex;
-    bool run;
+    bool run; /* Set by thread manager to let thread know it should exit */
+    bool is_running; /* Set by the thread to report its status */
 } RdmaBackendThread;
 
 typedef struct RdmaBackendDev {
diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index 3ed7409763..6a5073974d 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -286,8 +286,78 @@  static void init_ports(PVRDMADev *dev, Error **errp)
     }
 }
 
+static void uninit_msix(PCIDevice *pdev, int used_vectors)
+{
+    PVRDMADev *dev = PVRDMA_DEV(pdev);
+    int i;
+
+    for (i = 0; i < used_vectors; i++) {
+        msix_vector_unuse(pdev, i);
+    }
+
+    msix_uninit(pdev, &dev->msix, &dev->msix);
+}
+
+static int init_msix(PCIDevice *pdev, Error **errp)
+{
+    PVRDMADev *dev = PVRDMA_DEV(pdev);
+    int i;
+    int rc;
+
+    rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX,
+                   RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX,
+                   RDMA_MSIX_PBA, 0, NULL);
+
+    if (rc < 0) {
+        error_setg(errp, "Failed to initialize MSI-X");
+        return rc;
+    }
+
+    for (i = 0; i < RDMA_MAX_INTRS; i++) {
+        rc = msix_vector_use(PCI_DEVICE(dev), i);
+        if (rc < 0) {
+            error_setg(errp, "Fail mark MSI-X vector %d", i);
+            uninit_msix(pdev, i);
+            return rc;
+        }
+    }
+
+    return 0;
+}
+
+static void pvrdma_fini(PCIDevice *pdev)
+{
+    PVRDMADev *dev = PVRDMA_DEV(pdev);
+
+    pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn),
+           PCI_FUNC(pdev->devfn));
+
+    pvrdma_qp_ops_fini();
+
+    rdma_rm_fini(&dev->rdma_dev_res);
+
+    rdma_backend_fini(&dev->backend_dev);
+
+    free_dsr(dev);
+
+    if (msix_enabled(pdev)) {
+        uninit_msix(pdev, RDMA_MAX_INTRS);
+    }
+}
+
+static void pvrdma_stop(PVRDMADev *dev)
+{
+    rdma_backend_stop(&dev->backend_dev);
+}
+
+static void pvrdma_start(PVRDMADev *dev)
+{
+    rdma_backend_start(&dev->backend_dev);
+}
+
 static void activate_device(PVRDMADev *dev)
 {
+    pvrdma_start(dev);
     set_reg_val(dev, PVRDMA_REG_ERR, 0);
     pr_dbg("Device activated\n");
 }
@@ -300,7 +370,10 @@  static int unquiesce_device(PVRDMADev *dev)
 
 static int reset_device(PVRDMADev *dev)
 {
+    pvrdma_stop(dev);
+
     pr_dbg("Device reset complete\n");
+
     return 0;
 }
 
@@ -469,45 +542,6 @@  static void init_regs(PCIDevice *pdev)
     set_reg_val(dev, PVRDMA_REG_ERR, 0xFFFF);
 }
 
-static void uninit_msix(PCIDevice *pdev, int used_vectors)
-{
-    PVRDMADev *dev = PVRDMA_DEV(pdev);
-    int i;
-
-    for (i = 0; i < used_vectors; i++) {
-        msix_vector_unuse(pdev, i);
-    }
-
-    msix_uninit(pdev, &dev->msix, &dev->msix);
-}
-
-static int init_msix(PCIDevice *pdev, Error **errp)
-{
-    PVRDMADev *dev = PVRDMA_DEV(pdev);
-    int i;
-    int rc;
-
-    rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX,
-                   RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX,
-                   RDMA_MSIX_PBA, 0, NULL);
-
-    if (rc < 0) {
-        error_setg(errp, "Failed to initialize MSI-X");
-        return rc;
-    }
-
-    for (i = 0; i < RDMA_MAX_INTRS; i++) {
-        rc = msix_vector_use(PCI_DEVICE(dev), i);
-        if (rc < 0) {
-            error_setg(errp, "Fail mark MSI-X vercor %d", i);
-            uninit_msix(pdev, i);
-            return rc;
-        }
-    }
-
-    return 0;
-}
-
 static void init_dev_caps(PVRDMADev *dev)
 {
     size_t pg_tbl_bytes = TARGET_PAGE_SIZE *
@@ -602,22 +636,7 @@  out:
 
 static void pvrdma_exit(PCIDevice *pdev)
 {
-    PVRDMADev *dev = PVRDMA_DEV(pdev);
-
-    pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn),
-           PCI_FUNC(pdev->devfn));
-
-    pvrdma_qp_ops_fini();
-
-    rdma_rm_fini(&dev->rdma_dev_res);
-
-    rdma_backend_fini(&dev->backend_dev);
-
-    free_dsr(dev);
-
-    if (msix_enabled(pdev)) {
-        uninit_msix(pdev, RDMA_MAX_INTRS);
-    }
+    pvrdma_fini(pdev);
 }
 
 static void pvrdma_class_init(ObjectClass *klass, void *data)