
[RFC,v2,04/16] vfio-user: connect vfio proxy to remote server

Message ID e9fa92081e0faf49089f4afb9a45187e49ea4bad.1629131628.git.elena.ufimtseva@oracle.com (mailing list archive)
State New, archived
Series vfio-user implementation

Commit Message

Elena Ufimtseva Aug. 16, 2021, 4:42 p.m. UTC
From: John Johnson <john.g.johnson@oracle.com>

Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 hw/vfio/user.h                |  66 ++++++++++++++
 include/hw/vfio/vfio-common.h |   2 +
 hw/vfio/pci.c                 |  29 ++++++
 hw/vfio/user.c                | 160 ++++++++++++++++++++++++++++++++++
 MAINTAINERS                   |   4 +
 hw/vfio/meson.build           |   1 +
 6 files changed, 262 insertions(+)
 create mode 100644 hw/vfio/user.h
 create mode 100644 hw/vfio/user.c

Comments

Alex Williamson Aug. 18, 2021, 6:47 p.m. UTC | #1
On Mon, 16 Aug 2021 09:42:37 -0700
Elena Ufimtseva <elena.ufimtseva@oracle.com> wrote:

> diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
> index da9af297a0..739b30be73 100644
> --- a/hw/vfio/meson.build
> +++ b/hw/vfio/meson.build
> @@ -8,6 +8,7 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
>    'display.c',
>    'pci-quirks.c',
>    'pci.c',
> +  'user.c',
>  ))
>  vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c'))
>  vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c'))

Wouldn't it make sense to be able to configure QEMU with any
combination of vfio-pci and/or vfio-user-pci support rather than
statically tying vfio-user-pci to vfio-pci?  Not to mention that doing
so would help to more formally define the interface operations between
kernel and user options, for example fewer tests of vbasedev->proxy and
perhaps more abstraction through ops structures.  Thanks,

Alex
John Johnson Aug. 19, 2021, 2:10 p.m. UTC | #2
> On Aug 18, 2021, at 2:47 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Mon, 16 Aug 2021 09:42:37 -0700
> Elena Ufimtseva <elena.ufimtseva@oracle.com> wrote:
> 
>> diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
>> index da9af297a0..739b30be73 100644
>> --- a/hw/vfio/meson.build
>> +++ b/hw/vfio/meson.build
>> @@ -8,6 +8,7 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
>>   'display.c',
>>   'pci-quirks.c',
>>   'pci.c',
>> +  'user.c',
>> ))
>> vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c'))
>> vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c'))
> 
> Wouldn't it make sense to be able to configure QEMU with any
> combination of vfio-pci and/or vfio-user-pci support rather than
> statically tying vfio-user-pci to vfio-pci?  Not to mention that doing
> so would help to more formally define the interface operations between
> kernel and user options, for example fewer tests of vbasedev->proxy and
> perhaps more abstraction through ops structures.  Thanks,
> 

	We can certainly add another config option for vfio-user.

	As far as an ops vector vs vbasedev->proxy tests, it's a
matter of personal preference.  I prefer the changes inline when they
are this small, but we can make a vector if that's what you want.
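
	For reference, one shape such a vector might take (purely
illustrative; none of these names exist in the series as posted):

  /*
   * Hypothetical per-device I/O ops, so the kernel vfio and vfio-user
   * paths could sit behind one interface instead of vbasedev->proxy tests.
   */
  typedef struct VFIODeviceIOOps {
      int (*get_region_info)(VFIODevice *vbasedev,
                             struct vfio_region_info *info);
      ssize_t (*region_read)(VFIODevice *vbasedev, int index, off_t off,
                             uint32_t size, void *data);
      ssize_t (*region_write)(VFIODevice *vbasedev, int index, off_t off,
                              uint32_t size, void *data);
  } VFIODeviceIOOps;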

							JJ
Stefan Hajnoczi Aug. 24, 2021, 2:15 p.m. UTC | #3
On Mon, Aug 16, 2021 at 09:42:37AM -0700, Elena Ufimtseva wrote:
> @@ -3361,13 +3362,35 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
>      VFIOUserPCIDevice *udev = VFIO_USER_PCI(pdev);
>      VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
>      VFIODevice *vbasedev = &vdev->vbasedev;
> +    SocketAddress addr;
> +    VFIOProxy *proxy;
> +    Error *err = NULL;
>  
> +    /*
> +     * TODO: make option parser understand SocketAddress
> +     * and use that instead of having scaler options

s/scaler/scalar/

> +VFIOProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp)
> +{
> +    VFIOProxy *proxy;
> +    QIOChannelSocket *sioc;
> +    QIOChannel *ioc;
> +    char *sockname;
> +
> +    if (addr->type != SOCKET_ADDRESS_TYPE_UNIX) {
> +        error_setg(errp, "vfio_user_connect - bad address family");
> +        return NULL;
> +    }
> +    sockname = addr->u.q_unix.path;
> +
> +    sioc = qio_channel_socket_new();
> +    ioc = QIO_CHANNEL(sioc);
> +    if (qio_channel_socket_connect_sync(sioc, addr, errp)) {
> +        object_unref(OBJECT(ioc));
> +        return NULL;
> +    }
> +    qio_channel_set_blocking(ioc, true, NULL);
> +
> +    proxy = g_malloc0(sizeof(VFIOProxy));
> +    proxy->sockname = sockname;

sockname is addr->u.q_unix.path, so there's an assumption that the
lifetime of the addr argument is at least as long as the proxy object's
lifetime. This doesn't seem to be the case in vfio_user_pci_realize()
since the SocketAddress variable is declared on the stack.

I suggest making SocketAddress *addr const so it's obvious that this
function just reads it (doesn't take ownership of the pointer) and
copying the UNIX domain socket path with g_strdup() to avoid the
dangling pointer.
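
A minimal sketch of what I mean, assuming the rest of
vfio_user_connect_dev() stays as posted (only the signature and the
sockname assignment change):

  VFIOProxy *vfio_user_connect_dev(const SocketAddress *addr, Error **errp)
  {
      VFIOProxy *proxy;

      /* ... connect the QIOChannelSocket exactly as in the patch ... */

      proxy = g_malloc0(sizeof(VFIOProxy));
      proxy->sockname = g_strdup(addr->u.q_unix.path); /* our own copy */

      /* ... rest unchanged ... */
      return proxy;
  }

vfio_user_disconnect() would then want a matching g_free(proxy->sockname).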

> +    proxy->ioc = ioc;
> +    proxy->flags = VFIO_PROXY_CLIENT;
> +    proxy->state = VFIO_PROXY_CONNECTED;
> +    qemu_cond_init(&proxy->close_cv);
> +
> +    if (vfio_user_iothread == NULL) {
> +        vfio_user_iothread = iothread_create("VFIO user", errp);
> +    }

Why is a dedicated IOThread needed for VFIO user?

> +
> +    qemu_mutex_init(&proxy->lock);
> +    QTAILQ_INIT(&proxy->free);
> +    QTAILQ_INIT(&proxy->pending);
> +    QLIST_INSERT_HEAD(&vfio_user_sockets, proxy, next);
> +
> +    return proxy;
> +}
> +

/* Called with the BQL */
> +void vfio_user_disconnect(VFIOProxy *proxy)
> +{
> +    VFIOUserReply *r1, *r2;
> +
> +    qemu_mutex_lock(&proxy->lock);
> +
> +    /* our side is quitting */
> +    if (proxy->state == VFIO_PROXY_CONNECTED) {
> +        vfio_user_shutdown(proxy);
> +        if (!QTAILQ_EMPTY(&proxy->pending)) {
> +            error_printf("vfio_user_disconnect: outstanding requests\n");
> +        }
> +    }
> +    object_unref(OBJECT(proxy->ioc));
> +    proxy->ioc = NULL;
> +
> +    proxy->state = VFIO_PROXY_CLOSING;
> +    QTAILQ_FOREACH_SAFE(r1, &proxy->pending, next, r2) {
> +        qemu_cond_destroy(&r1->cv);
> +        QTAILQ_REMOVE(&proxy->pending, r1, next);
> +        g_free(r1);
> +    }
> +    QTAILQ_FOREACH_SAFE(r1, &proxy->free, next, r2) {
> +        qemu_cond_destroy(&r1->cv);
> +        QTAILQ_REMOVE(&proxy->free, r1, next);
> +        g_free(r1);
> +    }
> +
> +    /*
> +     * Make sure the iothread isn't blocking anywhere
> +     * with a ref to this proxy by waiting for a BH
> +     * handler to run after the proxy fd handlers were
> +     * deleted above.
> +     */
> +    proxy->close_wait = 1;

Please use true. '1' is shorter but it's less obvious to the reader (I
had to jump to the definition to check whether this field was bool or
int).

> +    aio_bh_schedule_oneshot(iothread_get_aio_context(vfio_user_iothread),
> +                            vfio_user_cb, proxy);
> +
> +    /* drop locks so the iothread can make progress */
> +    qemu_mutex_unlock_iothread();

Why does the BQL needs to be dropped so vfio_user_iothread can make
progress?

> +    qemu_cond_wait(&proxy->close_cv, &proxy->lock);
John Johnson Aug. 30, 2021, 3 a.m. UTC | #4
> On Aug 24, 2021, at 7:15 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Mon, Aug 16, 2021 at 09:42:37AM -0700, Elena Ufimtseva wrote:
>> @@ -3361,13 +3362,35 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
>>     VFIOUserPCIDevice *udev = VFIO_USER_PCI(pdev);
>>     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
>>     VFIODevice *vbasedev = &vdev->vbasedev;
>> +    SocketAddress addr;
>> +    VFIOProxy *proxy;
>> +    Error *err = NULL;
>> 
>> +    /*
>> +     * TODO: make option parser understand SocketAddress
>> +     * and use that instead of having scaler options
> 
> s/scaler/scalar/
> 

	OK


>> +VFIOProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp)
>> +{
>> +    VFIOProxy *proxy;
>> +    QIOChannelSocket *sioc;
>> +    QIOChannel *ioc;
>> +    char *sockname;
>> +
>> +    if (addr->type != SOCKET_ADDRESS_TYPE_UNIX) {
>> +        error_setg(errp, "vfio_user_connect - bad address family");
>> +        return NULL;
>> +    }
>> +    sockname = addr->u.q_unix.path;
>> +
>> +    sioc = qio_channel_socket_new();
>> +    ioc = QIO_CHANNEL(sioc);
>> +    if (qio_channel_socket_connect_sync(sioc, addr, errp)) {
>> +        object_unref(OBJECT(ioc));
>> +        return NULL;
>> +    }
>> +    qio_channel_set_blocking(ioc, true, NULL);
>> +
>> +    proxy = g_malloc0(sizeof(VFIOProxy));
>> +    proxy->sockname = sockname;
> 
> sockname is addr->u.q_unix.path, so there's an assumption that the
> lifetime of the addr argument is at least as long as the proxy object's
> lifetime. This doesn't seem to be the case in vfio_user_pci_realize()
> since the SocketAddress variable is declared on the stack.
> 
> I suggest making SocketAddress *addr const so it's obvious that this
> function just reads it (doesn't take ownership of the pointer) and
> copying the UNIX domain socket path with g_strdup() to avoid the
> dangling pointer.
> 

	OK


>> +    proxy->ioc = ioc;
>> +    proxy->flags = VFIO_PROXY_CLIENT;
>> +    proxy->state = VFIO_PROXY_CONNECTED;
>> +    qemu_cond_init(&proxy->close_cv);
>> +
>> +    if (vfio_user_iothread == NULL) {
>> +        vfio_user_iothread = iothread_create("VFIO user", errp);
>> +    }
> 
> Why is a dedicated IOThread needed for VFIO user?
> 

	It seemed the best model for inbound message processing.  Most messages
are replies, so the receiver will either signal a thread waiting for the reply or
report any errors from the server if there is no waiter.  None of this requires
the BQL.

	If the message is a request - which currently only happens if device
DMA targets guest memory that wasn’t mmap()d by QEMU or if the ’secure-dma’
option is used - then the receiver can acquire BQL so it can call the
VFIO code to handle the request.


>> +
>> +    qemu_mutex_init(&proxy->lock);
>> +    QTAILQ_INIT(&proxy->free);
>> +    QTAILQ_INIT(&proxy->pending);
>> +    QLIST_INSERT_HEAD(&vfio_user_sockets, proxy, next);
>> +
>> +    return proxy;
>> +}
>> +
> 
> /* Called with the BQL */

	OK

>> +void vfio_user_disconnect(VFIOProxy *proxy)
>> +{
>> +    VFIOUserReply *r1, *r2;
>> +
>> +    qemu_mutex_lock(&proxy->lock);
>> +
>> +    /* our side is quitting */
>> +    if (proxy->state == VFIO_PROXY_CONNECTED) {
>> +        vfio_user_shutdown(proxy);
>> +        if (!QTAILQ_EMPTY(&proxy->pending)) {
>> +            error_printf("vfio_user_disconnect: outstanding requests\n");
>> +        }
>> +    }
>> +    object_unref(OBJECT(proxy->ioc));
>> +    proxy->ioc = NULL;
>> +
>> +    proxy->state = VFIO_PROXY_CLOSING;
>> +    QTAILQ_FOREACH_SAFE(r1, &proxy->pending, next, r2) {
>> +        qemu_cond_destroy(&r1->cv);
>> +        QTAILQ_REMOVE(&proxy->pending, r1, next);
>> +        g_free(r1);
>> +    }
>> +    QTAILQ_FOREACH_SAFE(r1, &proxy->free, next, r2) {
>> +        qemu_cond_destroy(&r1->cv);
>> +        QTAILQ_REMOVE(&proxy->free, r1, next);
>> +        g_free(r1);
>> +    }
>> +
>> +    /*
>> +     * Make sure the iothread isn't blocking anywhere
>> +     * with a ref to this proxy by waiting for a BH
>> +     * handler to run after the proxy fd handlers were
>> +     * deleted above.
>> +     */
>> +    proxy->close_wait = 1;
> 
> Please use true. '1' is shorter but it's less obvious to the reader (I
> had to jump to the definition to check whether this field was bool or
> int).
> 

	I assume this is also true for the other boolean struct members
I’ve added.


>> +    aio_bh_schedule_oneshot(iothread_get_aio_context(vfio_user_iothread),
>> +                            vfio_user_cb, proxy);
>> +
>> +    /* drop locks so the iothread can make progress */
>> +    qemu_mutex_unlock_iothread();
> 
> Why does the BQL needs to be dropped so vfio_user_iothread can make
> progress?
> 

	See above.  Acquiring BQL by the iothread is rare, but I have to
handle the case where a disconnect is concurrent with an incoming request
message that is waiting for the BQL.  See the proxy state check after BQL
is acquired in vfio_user_recv().


>> +    qemu_cond_wait(&proxy->close_cv, &proxy->lock);


								JJ
Stefan Hajnoczi Sept. 7, 2021, 1:21 p.m. UTC | #5
On Mon, Aug 30, 2021 at 03:00:37AM +0000, John Johnson wrote:
> > On Aug 24, 2021, at 7:15 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > On Mon, Aug 16, 2021 at 09:42:37AM -0700, Elena Ufimtseva wrote:
> >> +    proxy->ioc = ioc;
> >> +    proxy->flags = VFIO_PROXY_CLIENT;
> >> +    proxy->state = VFIO_PROXY_CONNECTED;
> >> +    qemu_cond_init(&proxy->close_cv);
> >> +
> >> +    if (vfio_user_iothread == NULL) {
> >> +        vfio_user_iothread = iothread_create("VFIO user", errp);
> >> +    }
> > 
> > Why is a dedicated IOThread needed for VFIO user?
> > 
> 
> 	It seemed the best model for inbound message processing.  Most messages
> are replies, so the receiver will either signal a thread waiting for the reply or
> report any errors from the server if there is no waiter.  None of this requires
> the BQL.
> 
> 	If the message is a request - which currently only happens if device
> DMA targets guest memory that wasn’t mmap()d by QEMU or if the ’secure-dma’
> option is used - then the receiver can acquire BQL so it can call the
> VFIO code to handle the request.

QEMU is generally event-driven and the APIs are designed for that style.
Threads in QEMU are there for parallelism or resource control,
everything else is event-driven.

It's not clear to me if there is a reason why the message processing
must be done in a separate thread or whether it is just done this way
because the code was written in multi-threaded style instead of
event-driven style.

You mentioned other threads waiting for replies. Which threads are they?

> > Please use true. '1' is shorter but it's less obvious to the reader (I
> > had to jump to the definition to check whether this field was bool or
> > int).
> > 
> 
> 	I assume this is also true for the other boolean struct members
> I’ve added.

Yes, please. QEMU uses bool and true/false.

> 
> 
> >> +    aio_bh_schedule_oneshot(iothread_get_aio_context(vfio_user_iothread),
> >> +                            vfio_user_cb, proxy);
> >> +
> >> +    /* drop locks so the iothread can make progress */
> >> +    qemu_mutex_unlock_iothread();
> > 
> > Why does the BQL needs to be dropped so vfio_user_iothread can make
> > progress?
> > 
> 
> 	See above.  Acquiring BQL by the iothread is rare, but I have to
> handle the case where a disconnect is concurrent with an incoming request
> message that is waiting for the BQL.  See the proxy state check after BQL
> is acquired in vfio_user_recv()

Here is how this code looks when written using coroutines (this is from
nbd/server.c):

  static coroutine_fn void nbd_trip(void *opaque)
  {
      ...
      req = nbd_request_get(client);
      ret = nbd_co_receive_request(req, &request, &local_err);
      client->recv_coroutine = NULL;
  
      if (client->closing) {
          /*
           * The client may be closed when we are blocked in
           * nbd_co_receive_request()
           */
          goto done;
      }

It's the same check. The code is inverted: the server reads the next
request using nbd_co_receive_request() (this coroutine function can
yield while waiting for data on the socket).

This way the network communication code doesn't need to know how
messages will be processed by the client or server. There is no need for
if (isreply) { qemu_cond_signal(&reply->cv); } else {
proxy->request(proxy->reqarg, buf, &reqfds); }. The callbacks and
threads aren't hardcoded into the network communication code.

This goes back to the question earlier about why a dedicated thread is
necessary here. I suggest writing the network communication code using
coroutines. That way the code is easier to read (no callbacks or
thread synchronization), there are fewer thread-safety issues to worry
about, and users or management tools don't need to know about additional
threads (e.g. CPU/NUMA affinity).

Stefan
John Johnson Sept. 9, 2021, 5:11 a.m. UTC | #6
> On Sep 7, 2021, at 6:21 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> 
> This way the network communication code doesn't need to know how
> messages will be processed by the client or server. There is no need for
> if (isreply) { qemu_cond_signal(&reply->cv); } else {
> proxy->request(proxy->reqarg, buf, &reqfds); }. The callbacks and
> threads aren't hardcoded into the network communication code.
> 

	I fear we are talking past each other.  The vfio-user protocol
is bi-directional.  e.g., the client both sends requests to the server
and receives requests from the server on the same socket.  No matter
what threading model we use, the receive algorithm will be:


read message header
if it’s a reply
   schedule the thread waiting for the reply
else
   run a callback to process the request


	The only way I can see changing this is to establish two
uni-directional sockets: one for requests outbound to the server,
and one for requests inbound from the server.

	This is the reason I chose the iothread model.  It can run
independently of any vCPU/main threads waiting for replies and of
the callback thread.  I did muddle this idea by having the iothread
become a callback thread by grabbing BQL and running the callback
inline when it receives a request from the server, but if you like a
pure event driven model, I can make incoming requests kick a BH from
the main loop.  e.g.,

if it’s a reply
   qemu_cond_signal(reply cv)
else
   qemu_bh_schedule(proxy bh)

	That would avoid disconnect having to handle the iothread
blocked on BQL.
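
	A rough sketch of that variant (VFIOUserMsg, the reply flag test,
find_reply(), proxy->incoming and proxy->req_bh are all illustrative
names, not code from this series):

  static void vfio_user_recv_one(VFIOProxy *proxy, VFIOUserMsg *msg)
  {
      if (msg->hdr.flags & VFIO_USER_REPLY) {
          VFIOUserReply *reply = find_reply(proxy, msg->hdr.id);

          reply->msg = msg;
          qemu_cond_signal(&reply->cv);     /* wake the thread waiting for it */
      } else {
          QTAILQ_INSERT_TAIL(&proxy->incoming, msg, next);
          qemu_bh_schedule(proxy->req_bh);  /* handled later in the main loop, under BQL */
      }
  }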


> This goes back to the question earlier about why a dedicated thread is
> necessary here. I suggest writing the network communication code using
> coroutines. That way the code is easier to read (no callbacks or
> thread synchronization), there are fewer thread-safety issues to worry
> about, and users or management tools don't need to know about additional
> threads (e.g. CPU/NUMA affinity).
> 


	I did look at coroutines, but they seemed to work when the sender
is triggering the coroutine on send, not when request packets are arriving
asynchronously to the sends.

								JJ
Stefan Hajnoczi Sept. 9, 2021, 6:29 a.m. UTC | #7
On Thu, Sep 09, 2021 at 05:11:49AM +0000, John Johnson wrote:
> 
> 
> > On Sep 7, 2021, at 6:21 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > 
> > This way the network communication code doesn't need to know how
> > messages will be processed by the client or server. There is no need for
> > if (isreply) { qemu_cond_signal(&reply->cv); } else {
> > proxy->request(proxy->reqarg, buf, &reqfds); }. The callbacks and
> > threads aren't hardcoded into the network communication code.
> > 
> 
> 	I fear we are talking past each other.  The vfio-user protocol
> is bi-directional.  e.g., the client both sends requests to the server
> and receives requests from the server on the same socket.  No matter
> what threading model we use, the receive algorithm will be:
> 
> 
> read message header
> if it’s a reply
>    schedule the thread waiting for the reply
> else
>    run a callback to process the request
> 
> 
> 	The only way I can see changing this is to establish two
> uni-directional sockets: one for requests outbound to the server,
> and one for requests inbound from the server.
> 
> 	This is the reason I chose the iothread model.  It can run
> independently of any vCPU/main threads waiting for replies and of
> the callback thread.  I did muddle this idea by having the iothread
> become a callback thread by grabbing BQL and running the callback
> inline when it receives a request from the server, but if you like a
> pure event driven model, I can make incoming requests kick a BH from
> the main loop.  e.g.,
> 
> if it’s a reply
>    qemu_cond_signal(reply cv)
> else
>    qemu_bh_schedule(proxy bh)
> 
> 	That would avoid disconnect having to handle the iothread
> blocked on BQL.
> 
> 
> > This goes back to the question earlier about why a dedicated thread is
> > necessary here. I suggest writing the network communication code using
> > coroutines. That way the code is easier to read (no callbacks or
> > thread synchronization), there are fewer thread-safety issues to worry
> > about, and users or management tools don't need to know about additional
> > threads (e.g. CPU/NUMA affinity).
> > 
> 
> 
> 	I did look at coroutines, but they seemed to work when the sender
> is triggering the coroutine on send, not when request packets are arriving
> asynchronously to the sends.

This can be done with a receiver coroutine. Its job is to be the only
thing that reads vfio-user messages from the socket. A receiver
coroutine reads messages from the socket and wakes up the waiting
coroutine that yielded from vfio_user_send_recv() or
vfio_user_pci_process_req().

(Although vfio_user_pci_process_req() could be called directly from the
receiver coroutine, it seems safer to have a separate coroutine that
processes requests so that the receiver isn't blocked in case
vfio_user_pci_process_req() yields while processing a request.)

Going back to what you mentioned above, the receiver coroutine does
something like this:

  if it's a reply
      reply = find_reply(...)
      qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
  else
      QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
      if (pending_reqs_was_empty) {
          qemu_coroutine_enter(process_request_co);
      }

The pending_reqs queue holds incoming requests that the
process_request_co coroutine processes.
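
For concreteness, a minimal sketch of such a receiver coroutine
(vfio_user_read_one(), find_reply() and the proxy fields are assumed
names, not existing APIs):

  static void coroutine_fn vfio_user_receiver_co(void *opaque)
  {
      VFIOProxy *proxy = opaque;
      VFIOUserMsg *msg;

      /* the only reader of the socket; vfio_user_read_one() yields
       * whenever the channel would block */
      while ((msg = vfio_user_read_one(proxy)) != NULL) {
          if (msg->hdr.flags & VFIO_USER_REPLY) {
              VFIOUserReply *reply = find_reply(proxy, msg->hdr.id);

              reply->msg = msg;
              qemu_coroutine_enter(reply->co); /* instead of signalling reply->cv */
          } else {
              bool was_empty = QSIMPLEQ_EMPTY(&proxy->pending_reqs);

              QSIMPLEQ_INSERT_TAIL(&proxy->pending_reqs, msg, next);
              if (was_empty) {
                  qemu_coroutine_enter(proxy->process_request_co);
              }
          }
      }
  }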

Stefan
John Johnson Sept. 10, 2021, 5:25 a.m. UTC | #8
> On Sep 8, 2021, at 11:29 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Thu, Sep 09, 2021 at 05:11:49AM +0000, John Johnson wrote:
>> 
>> 
>> 	I did look at coroutines, but they seemed to work when the sender
>> is triggering the coroutine on send, not when request packets are arriving
>> asynchronously to the sends.
> 
> This can be done with a receiver coroutine. Its job is to be the only
> thing that reads vfio-user messages from the socket. A receiver
> coroutine reads messages from the socket and wakes up the waiting
> coroutine that yielded from vfio_user_send_recv() or
> vfio_user_pci_process_req().
> 
> (Although vfio_user_pci_process_req() could be called directly from the
> receiver coroutine, it seems safer to have a separate coroutine that
> processes requests so that the receiver isn't blocked in case
> vfio_user_pci_process_req() yields while processing a request.)
> 
> Going back to what you mentioned above, the receiver coroutine does
> something like this:
> 
>  if it's a reply
>      reply = find_reply(...)
>      qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
>  else
>      QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
>      if (pending_reqs_was_empty) {
>          qemu_coroutine_enter(process_request_co);
>      }
> 
> The pending_reqs queue holds incoming requests that the
> process_request_co coroutine processes.
> 


	How do coroutines work across threads?  There can be multiple vCPU
threads waiting for replies, and I think the receiver coroutine will be
running in the main loop thread.  Where would a vCPU block waiting for
a reply?  I think coroutine_yield() returns to its coroutine_enter() caller.

							JJ
Stefan Hajnoczi Sept. 13, 2021, 12:35 p.m. UTC | #9
On Fri, Sep 10, 2021 at 05:25:13AM +0000, John Johnson wrote:
> 
> 
> > On Sep 8, 2021, at 11:29 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Thu, Sep 09, 2021 at 05:11:49AM +0000, John Johnson wrote:
> >> 
> >> 
> >> 	I did look at coroutines, but they seemed to work when the sender
> >> is triggering the coroutine on send, not when request packets are arriving
> >> asynchronously to the sends.
> > 
> > This can be done with a receiver coroutine. Its job is to be the only
> > thing that reads vfio-user messages from the socket. A receiver
> > coroutine reads messages from the socket and wakes up the waiting
> > coroutine that yielded from vfio_user_send_recv() or
> > vfio_user_pci_process_req().
> > 
> > (Although vfio_user_pci_process_req() could be called directly from the
> > receiver coroutine, it seems safer to have a separate coroutine that
> > processes requests so that the receiver isn't blocked in case
> > vfio_user_pci_process_req() yields while processing a request.)
> > 
> > Going back to what you mentioned above, the receiver coroutine does
> > something like this:
> > 
> >  if it's a reply
> >      reply = find_reply(...)
> >      qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
> >  else
> >      QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
> >      if (pending_reqs_was_empty) {
> >          qemu_coroutine_enter(process_request_co);
> >      }
> > 
> > The pending_reqs queue holds incoming requests that the
> > process_request_co coroutine processes.
> > 
> 
> 
> 	How do coroutines work across threads?  There can be multiple vCPU
> threads waiting for replies, and I think the receiver coroutine will be
> running in the main loop thread.  Where would a vCPU block waiting for
> a reply?  I think coroutine_yield() returns to its coroutine_enter() caller.

A vCPU thread holding the BQL can iterate the event loop if it has
reached a synchronous point that needs to wait for a reply before
returning. I think we have this situation when a MemoryRegion is
accessed on the proxy device.

For example, block/block-backend.c:blk_prw() kicks off a coroutine and
then runs the event loop until the coroutine finishes:

  Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
  bdrv_coroutine_enter(blk_bs(blk), co);
  BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);

BDRV_POLL_WHILE() boils down to a loop like this:

  while ((cond)) {
    aio_poll(ctx, true);
  }

I also want to check that I understand the scenarios in which the
vfio-user communication code is used:

1. vhost-user-server

The vfio-user communication code should run in a given AioContext (it
will be the main loop by default but maybe the user will be able to
configure a specific IOThread in the future).

2. vCPU thread vfio-user clients

The vfio-user communication code is called from the vCPU thread where
the proxy device executes. The MemoryRegion->read()/write() callbacks
are synchronous, so the thread needs to wait for a vfio-user reply
before it can return.

Is this what you had in mind?

Stefan
John Johnson Sept. 13, 2021, 5:23 p.m. UTC | #10
> 
>> On Sep 9, 2021, at 10:25 PM, John Johnson <john.g.johnson@oracle.com> wrote:
>> 
>> 
>> 
>>> On Sep 8, 2021, at 11:29 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> 
>>> On Thu, Sep 09, 2021 at 05:11:49AM +0000, John Johnson wrote:
>>>> 
>>>> 
>>>> 	I did look at coroutines, but they seemed to work when the sender
>>>> is triggering the coroutine on send, not when request packets are arriving
>>>> asynchronously to the sends.
>>> 
>>> This can be done with a receiver coroutine. Its job is to be the only
>>> thing that reads vfio-user messages from the socket. A receiver
>>> coroutine reads messages from the socket and wakes up the waiting
>>> coroutine that yielded from vfio_user_send_recv() or
>>> vfio_user_pci_process_req().
>>> 
>>> (Although vfio_user_pci_process_req() could be called directly from the
>>> receiver coroutine, it seems safer to have a separate coroutine that
>>> processes requests so that the receiver isn't blocked in case
>>> vfio_user_pci_process_req() yields while processing a request.)
>>> 
>>> Going back to what you mentioned above, the receiver coroutine does
>>> something like this:
>>> 
>>> if it's a reply
>>>     reply = find_reply(...)
>>>     qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
>>> else
>>>     QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
>>>     if (pending_reqs_was_empty) {
>>>         qemu_coroutine_enter(process_request_co);
>>>     }
>>> 
>>> The pending_reqs queue holds incoming requests that the
>>> process_request_co coroutine processes.
>>> 
>> 
>> 
>> 	How do coroutines work across threads?  There can be multiple vCPU
>> threads waiting for replies, and I think the receiver coroutine will be
>> running in the main loop thread.  Where would a vCPU block waiting for
>> a reply?  I think coroutine_yield() returns to its coroutine_enter() caller
> 
> 
> 
> A vCPU thread holding the BQL can iterate the event loop if it has
> reached a synchronous point that needs to wait for a reply before
> returning. I think we have this situation when a MemoryRegion is
> accessed on the proxy device.
> 
> For example, block/block-backend.c:blk_prw() kicks off a coroutine and
> then runs the event loop until the coroutine finishes:
> 
>   Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
>   bdrv_coroutine_enter(blk_bs(blk), co);
>   BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
> 
> BDRV_POLL_WHILE() boils down to a loop like this:
> 
>   while ((cond)) {
>     aio_poll(ctx, true);
>   }
> 

	I think that would make vCPUs sending requests and the
receiver coroutine all poll on the same socket.  If the “wrong”
routine reads the message, I’d need a second level of synchronization
to pass the message to the “right” one.  e.g., if the vCPU coroutine
reads a request, it needs to pass it to the receiver; if the receiver
coroutine reads a reply, it needs to pass it to a vCPU.

	Avoiding this complexity is one of the reasons I went with
a separate thread that only reads the socket over the mp-qemu model,
which does have the sender poll, but doesn’t need to handle incoming
requests.



> I also want to check that I understand the scenarios in which the
> vfio-user communication code is used:
> 
> 1. vhost-user-server
> 
> The vfio-user communication code should run in a given AioContext (it
> will be the main loop by default but maybe the user will be able to
> configure a specific IOThread in the future).
> 

	Jag would know more, but I believe it runs off the main loop.
Running it in an iothread doesn’t gain much, since it needs BQL to
run the device emulation code.


> 2. vCPU thread vfio-user clients
> 
> The vfio-user communication code is called from the vCPU thread where
> the proxy device executes. The MemoryRegion->read()/write() callbacks
> are synchronous, so the thread needs to wait for a vfio-user reply
> before it can return.
> 
> Is this what you had in mind?

	The client is also called from the main thread - the GET_*
messages from vfio_user_pci_realize() as well as MAP/DEMAP messages
from guest address space change transactions.  It is also called by
the migration thread, which is a separate thread that does not run
holding BQL.

							JJ
Stefan Hajnoczi Sept. 14, 2021, 1:06 p.m. UTC | #11
On Mon, Sep 13, 2021 at 05:23:33PM +0000, John Johnson wrote:
> >> On Sep 9, 2021, at 10:25 PM, John Johnson <john.g.johnson@oracle.com> wrote:
> >>> On Sep 8, 2021, at 11:29 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>> On Thu, Sep 09, 2021 at 05:11:49AM +0000, John Johnson wrote:
> >>>> 	I did look at coroutines, but they seemed to work when the sender
> >>>> is triggering the coroutine on send, not when request packets are arriving
> >>>> asynchronously to the sends.
> >>> 
> >>> This can be done with a receiver coroutine. Its job is to be the only
> >>> thing that reads vfio-user messages from the socket. A receiver
> >>> coroutine reads messages from the socket and wakes up the waiting
> >>> coroutine that yielded from vfio_user_send_recv() or
> >>> vfio_user_pci_process_req().
> >>> 
> >>> (Although vfio_user_pci_process_req() could be called directly from the
> >>> receiver coroutine, it seems safer to have a separate coroutine that
> >>> processes requests so that the receiver isn't blocked in case
> >>> vfio_user_pci_process_req() yields while processing a request.)
> >>> 
> >>> Going back to what you mentioned above, the receiver coroutine does
> >>> something like this:
> >>> 
> >>> if it's a reply
> >>>     reply = find_reply(...)
> >>>     qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
> >>> else
> >>>     QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
> >>>     if (pending_reqs_was_empty) {
> >>>         qemu_coroutine_enter(process_request_co);
> >>>     }
> >>> 
> >>> The pending_reqs queue holds incoming requests that the
> >>> process_request_co coroutine processes.
> >>> 
> >> 
> >> 
> >> 	How do coroutines work across threads?  There can be multiple vCPU
> >> threads waiting for replies, and I think the receiver coroutine will be
> >> running in the main loop thread.  Where would a vCPU block waiting for
> >> a reply?  I think coroutine_yield() returns to its coroutine_enter() caller
> > 
> > 
> > 
> > A vCPU thread holding the BQL can iterate the event loop if it has
> > reached a synchronous point that needs to wait for a reply before
> > returning. I think we have this situation when a MemoryRegion is
> > accessed on the proxy device.
> > 
> > For example, block/block-backend.c:blk_prw() kicks off a coroutine and
> > then runs the event loop until the coroutine finishes:
> > 
> >   Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
> >   bdrv_coroutine_enter(blk_bs(blk), co);
> >   BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
> > 
> > BDRV_POLL_WHILE() boils down to a loop like this:
> > 
> >   while ((cond)) {
> >     aio_poll(ctx, true);
> >   }
> > 
> 
> 	I think that would make vCPUs sending requests and the
> receiver coroutine all poll on the same socket.  If the “wrong”
> routine reads the message, I’d need a second level of synchronization
> to pass the message to the “right” one.  e.g., if the vCPU coroutine
> reads a request, it needs to pass it to the receiver; if the receiver
> coroutine reads a reply, it needs to pass it to a vCPU.
> 
> 	Avoiding this complexity is one of the reasons I went with
> a separate thread that only reads the socket over the mp-qemu model,
> which does have the sender poll, but doesn’t need to handle incoming
> requests.

Only one coroutine reads from the socket, the "receiver" coroutine. In a
previous reply I sketched what the receiver does:

  if it's a reply
      reply = find_reply(...)
      qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
  else
      QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
      if (pending_reqs_was_empty) {
          qemu_coroutine_enter(process_request_co);
      }

The qemu_coroutine_enter(reply->co) call re-enters the coroutine that
was created by the vCPU thread. Is this the "second level of
synchronization" that you described? It's very similar to signalling
reply->cv in the existing patch.

Now I'm actually thinking about whether this can be improved by keeping
the condvar so that the vCPU thread doesn't need to call aio_poll()
(which is awkward because it doesn't drop the BQL and therefore blocks
other vCPUs from making progress). That approach wouldn't require a
dedicated thread for vfio-user.

> > I also want to check that I understand the scenarios in which the
> > vfio-user communication code is used:
> > 
> > 1. vhost-user-server
> > 
> > The vfio-user communication code should run in a given AioContext (it
> > will be the main loop by default but maybe the user will be able to
> > configure a specific IOThread in the future).
> > 
> 
> 	Jag would know more, but I believe it runs off the main loop.
> Running it in an iothread doesn’t gain much, since it needs BQL to
> run the device emulation code.
> 
> 
> > 2. vCPU thread vfio-user clients
> > 
> > The vfio-user communication code is called from the vCPU thread where
> > the proxy device executes. The MemoryRegion->read()/write() callbacks
> > are synchronous, so the thread needs to wait for a vfio-user reply
> > before it can return.
> > 
> > Is this what you had in mind?
> 
> 	The client is also called from the main thread - the GET_*
> messages from vfio_user_pci_realize() as well as MAP/DEMAP messages
> from guest address space change transactions.  It is also called by
> the migration thread, which is a separate thread that does not run
> holding BQL.

Thanks for mentioning those additional cases.

Stefan
John Johnson Sept. 15, 2021, 12:21 a.m. UTC | #12
> On Sep 14, 2021, at 6:06 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Mon, Sep 13, 2021 at 05:23:33PM +0000, John Johnson wrote:
>>>> On Sep 9, 2021, at 10:25 PM, John Johnson <john.g.johnson@oracle.com> wrote:
>>>>> On Sep 8, 2021, at 11:29 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>>>> On Thu, Sep 09, 2021 at 05:11:49AM +0000, John Johnson wrote:
>>>>>> 	I did look at coroutines, but they seemed to work when the sender
>>>>>> is triggering the coroutine on send, not when request packets are arriving
>>>>>> asynchronously to the sends.
>>>>> 
>>>>> This can be done with a receiver coroutine. Its job is to be the only
>>>>> thing that reads vfio-user messages from the socket. A receiver
>>>>> coroutine reads messages from the socket and wakes up the waiting
>>>>> coroutine that yielded from vfio_user_send_recv() or
>>>>> vfio_user_pci_process_req().
>>>>> 
>>>>> (Although vfio_user_pci_process_req() could be called directly from the
>>>>> receiver coroutine, it seems safer to have a separate coroutine that
>>>>> processes requests so that the receiver isn't blocked in case
>>>>> vfio_user_pci_process_req() yields while processing a request.)
>>>>> 
>>>>> Going back to what you mentioned above, the receiver coroutine does
>>>>> something like this:
>>>>> 
>>>>> if it's a reply
>>>>>    reply = find_reply(...)
>>>>>    qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
>>>>> else
>>>>>    QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
>>>>>    if (pending_reqs_was_empty) {
>>>>>        qemu_coroutine_enter(process_request_co);
>>>>>    }
>>>>> 
>>>>> The pending_reqs queue holds incoming requests that the
>>>>> process_request_co coroutine processes.
>>>>> 
>>>> 
>>>> 
>>>> 	How do coroutines work across threads?  There can be multiple vCPU
>>>> threads waiting for replies, and I think the receiver coroutine will be
>>>> running in the main loop thread.  Where would a vCPU block waiting for
>>>> a reply?  I think coroutine_yield() returns to its coroutine_enter() caller
>>> 
>>> 
>>> 
>>> A vCPU thread holding the BQL can iterate the event loop if it has
>>> reached a synchronous point that needs to wait for a reply before
>>> returning. I think we have this situation when a MemoryRegion is
>>> accessed on the proxy device.
>>> 
>>> For example, block/block-backend.c:blk_prw() kicks off a coroutine and
>>> then runs the event loop until the coroutine finishes:
>>> 
>>>  Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
>>>  bdrv_coroutine_enter(blk_bs(blk), co);
>>>  BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
>>> 
>>> BDRV_POLL_WHILE() boils down to a loop like this:
>>> 
>>>  while ((cond)) {
>>>    aio_poll(ctx, true);
>>>  }
>>> 
>> 
>> 	I think that would make vCPUs sending requests and the
>> receiver coroutine all poll on the same socket.  If the “wrong”
>> routine reads the message, I’d need a second level of synchronization
>> to pass the message to the “right” one.  e.g., if the vCPU coroutine
>> reads a request, it needs to pass it to the receiver; if the receiver
>> coroutine reads a reply, it needs to pass it to a vCPU.
>> 
>> 	Avoiding this complexity is one of the reasons I went with
>> a separate thread that only reads the socket over the mp-qemu model,
>> which does have the sender poll, but doesn’t need to handle incoming
>> requests.
> 
> Only one coroutine reads from the socket, the "receiver" coroutine. In a
> previous reply I sketched what the receiver does:
> 
>  if it's a reply
>      reply = find_reply(...)
>      qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
>  else
>      QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
>      if (pending_reqs_was_empty) {
>          qemu_coroutine_enter(process_request_co);
>      }
> 

	Sorry, I was assuming when you said the coroutine will block with
aio_poll(), you implied it would also read messages from the socket.
 

> The qemu_coroutine_enter(reply->co) call re-enters the coroutine that
> was created by the vCPU thread. Is this the "second level of
> synchronization" that you described? It's very similar to signalling
> reply->cv in the existing patch.
> 

	Yes, the only difference is it would be woken on each message,
even though it doesn’t read them.  Which is what I think you’re addressing
below.


> Now I'm actually thinking about whether this can be improved by keeping
> the condvar so that the vCPU thread doesn't need to call aio_poll()
> (which is awkward because it doesn't drop the BQL and therefore blocks
> other vCPUs from making progress). That approach wouldn't require a
> dedicated thread for vfio-user.
> 

	Wouldn’t you need to acquire BQL twice for every vCPU reply: once to
run the receiver coroutine, and once when the vCPU thread wakes up and wants
to return to the VFIO code?  The migration thread would also add a BQL
dependency, where it didn’t have one before.

	Is your objection with using an iothread, or using a separate thread?
I can change to using qemu_thread_create() and running aio_poll() from the
thread routine, instead of creating an iothread.


	On a related subject:

On Aug 24, 2021, at 8:14 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:

>> +    ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds,
>> +                                 &local_err);
> 
> This is a blocking call. My understanding is that the IOThread is shared
> by all vfio-user devices, so other devices will have to wait if one of
> them is acting up (e.g. the device emulation process sent less than
> sizeof(msg) bytes).


	This shouldn’t block if the emulation process sends less than sizeof(msg)
bytes.  qio_channel_readv() will eventually call recvmsg(), which only blocks a
short read if MSG_WAITALL is set, and it’s not set in this case.  recvmsg() will
return the data available, and vfio_user_recv() will treat a short read as an error.
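
	For illustration only (plain POSIX recv(), not the QIO code
itself), the difference MSG_WAITALL makes:

  char buf[256];

  /* returns whatever is available right now; may be a short read */
  ssize_t n = recv(fd, buf, sizeof(buf), 0);

  /* blocks until sizeof(buf) bytes arrive (or error/EOF/signal) */
  ssize_t m = recv(fd, buf, sizeof(buf), MSG_WAITALL);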

								JJ
Stefan Hajnoczi Sept. 15, 2021, 1:04 p.m. UTC | #13
On Wed, Sep 15, 2021 at 12:21:10AM +0000, John Johnson wrote:
> 
> 
> > On Sep 14, 2021, at 6:06 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Mon, Sep 13, 2021 at 05:23:33PM +0000, John Johnson wrote:
> >>>> On Sep 9, 2021, at 10:25 PM, John Johnson <john.g.johnson@oracle.com> wrote:
> >>>>> On Sep 8, 2021, at 11:29 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>>>> On Thu, Sep 09, 2021 at 05:11:49AM +0000, John Johnson wrote:
> >>>>>> 	I did look at coroutines, but they seemed to work when the sender
> >>>>>> is triggering the coroutine on send, not when request packets are arriving
> >>>>>> asynchronously to the sends.
> >>>>> 
> >>>>> This can be done with a receiver coroutine. Its job is to be the only
> >>>>> thing that reads vfio-user messages from the socket. A receiver
> >>>>> coroutine reads messages from the socket and wakes up the waiting
> >>>>> coroutine that yielded from vfio_user_send_recv() or
> >>>>> vfio_user_pci_process_req().
> >>>>> 
> >>>>> (Although vfio_user_pci_process_req() could be called directly from the
> >>>>> receiver coroutine, it seems safer to have a separate coroutine that
> >>>>> processes requests so that the receiver isn't blocked in case
> >>>>> vfio_user_pci_process_req() yields while processing a request.)
> >>>>> 
> >>>>> Going back to what you mentioned above, the receiver coroutine does
> >>>>> something like this:
> >>>>> 
> >>>>> if it's a reply
> >>>>>    reply = find_reply(...)
> >>>>>    qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
> >>>>> else
> >>>>>    QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
> >>>>>    if (pending_reqs_was_empty) {
> >>>>>        qemu_coroutine_enter(process_request_co);
> >>>>>    }
> >>>>> 
> >>>>> The pending_reqs queue holds incoming requests that the
> >>>>> process_request_co coroutine processes.
> >>>>> 
> >>>> 
> >>>> 
> >>>> 	How do coroutines work across threads?  There can be multiple vCPU
> >>>> threads waiting for replies, and I think the receiver coroutine will be
> >>>> running in the main loop thread.  Where would a vCPU block waiting for
> >>>> a reply?  I think coroutine_yield() returns to its coroutine_enter() caller
> >>> 
> >>> 
> >>> 
> >>> A vCPU thread holding the BQL can iterate the event loop if it has
> >>> reached a synchronous point that needs to wait for a reply before
> >>> returning. I think we have this situation when a MemoryRegion is
> >>> accessed on the proxy device.
> >>> 
> >>> For example, block/block-backend.c:blk_prw() kicks off a coroutine and
> >>> then runs the event loop until the coroutine finishes:
> >>> 
> >>>  Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
> >>>  bdrv_coroutine_enter(blk_bs(blk), co);
> >>>  BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
> >>> 
> >>> BDRV_POLL_WHILE() boils down to a loop like this:
> >>> 
> >>>  while ((cond)) {
> >>>    aio_poll(ctx, true);
> >>>  }
> >>> 
> >> 
> >> 	I think that would make vCPUs sending requests and the
> >> receiver coroutine all poll on the same socket.  If the “wrong”
> >> routine reads the message, I’d need a second level of synchronization
> >> to pass the message to the “right” one.  e.g., if the vCPU coroutine
> >> reads a request, it needs to pass it to the receiver; if the receiver
> >> coroutine reads a reply, it needs to pass it to a vCPU.
> >> 
> >> 	Avoiding this complexity is one of the reasons I went with
> >> a separate thread that only reads the socket over the mp-qemu model,
> >> which does have the sender poll, but doesn’t need to handle incoming
> >> requests.
> > 
> > Only one coroutine reads from the socket, the "receiver" coroutine. In a
> > previous reply I sketched what the receiver does:
> > 
> >  if it's a reply
> >      reply = find_reply(...)
> >      qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
> >  else
> >      QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
> >      if (pending_reqs_was_empty) {
> >          qemu_coroutine_enter(process_request_co);
> >      }
> > 
> 
> 	Sorry, I was assuming when you said the coroutine will block with
> aio_poll(), you implied it would also read messages from the socket.

The vCPU thread calls aio_poll() outside the coroutine, similar to the
block/block-backend.c:blk_prw() example I posted above:

  Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
  bdrv_coroutine_enter(blk_bs(blk), co);
  BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);

(BDRV_POLL_WHILE() is a aio_poll() loop.)

The coroutine isn't aware of aio_poll(), it just yields when it needs to
wait.

> > The qemu_coroutine_enter(reply->co) call re-enters the coroutine that
> > was created by the vCPU thread. Is this the "second level of
> > synchronization" that you described? It's very similar to signalling
> > reply->cv in the existing patch.
> > 
> 
> 	Yes, the only difference is it would be woken on each message,
> even though it doesn’t read them.  Which is what I think you’re addressing
> below.
>
> > Now I'm actually thinking about whether this can be improved by keeping
> > the condvar so that the vCPU thread doesn't need to call aio_poll()
> > (which is awkward because it doesn't drop the BQL and therefore blocks
> > other vCPUs from making progress). That approach wouldn't require a
> > dedicated thread for vfio-user.
> > 
> 
> 	Wouldn’t you need to acquire BQL twice for every vCPU reply: once to
> run the receiver coroutine, and once when the vCPU thread wakes up and wants
> to return to the VFIO code.  The migration thread would also add a BQL
> dependency, where it didn’t have one before.

If aio_poll() is used then the vCPU thread doesn't drop the BQL at all.
The vCPU thread sends the message and waits for the reply while other
BQL threads are locked out.

If a condvar or similar mechanism is used then the vCPU sends the
message, drops the BQL, and waits on the condvar. The main loop thread
runs the receiver coroutine and re-enters the coroutine, which signals
the condvar. The vCPU then re-acquires the BQL.

> 	Is your objection with using an iothread, or using a separate thread?
> I can change to using qemu_thread_create() and running aio_poll() from the
> thread routine, instead of creating an iothread.

The vfio-user communication code shouldn't need to worry about threads
or locks. The code can be written in terms of AioContext so the caller
can use it from various environments without hardcoding details of the
BQL or threads into the communication code. This makes it easier to
understand and less tightly coupled.

I'll try to sketch how that could work:

The main concept is VFIOProxy, which has a QIOChannel (the socket
connection) and its main API is:

  void coroutine_fn vfio_user_co_send_recv(VFIOProxy *proxy,
          VFIOUserHdr *msg, VFIOUserFDs *fds, int rsize, int flags);

There is also a request callback for processing incoming requests:

  void coroutine_fn (*request)(void *opaque, char *buf,
                              VFIOUserFDs *fds);

The main loop thread can either use vfio_user_co_send_recv() from
coroutine context or use this blocking wrapper:

  typedef struct {
      VFIOProxy *proxy;
      VFIOUserHdr *msg;
      VFIOUserFDs *fds;
      int rsize;
      int flags;
      bool done;
  } VFIOUserSendRecvData;

  static void coroutine_fn vfu_send_recv_co(void *opaque)
  {
      VFIOUserSendRecvData *data = opaque;
      vfio_user_co_send_recv(data->proxy, data->msg, data->fds,
                             data->rsize, data->flags);
      data->done = true;
  }

  /* A blocking version of vfio_user_co_send_recv() */
  void vfio_user_send_recv(VFIOProxy *proxy, VFIOUserHdr *msg,
                           VFIOUserFDs *fds, int rsize, int flags)
  {
      VFIOUserSendRecvData data = {
          .proxy = proxy,
          .msg = msg,
          .fds = fds,
          .rsize = rsize,
          .flags = flags,
      };
      Coroutine *co = qemu_coroutine_create(vfu_send_recv_co, &data);
      qemu_coroutine_enter(co);
      while (!data.done) {
          aio_poll(proxy->ioc->ctx, true);
      }
  }

The vCPU thread can use vfio_user_send_recv() if it wants, although the
BQL will be held, preventing other threads from making progress. That
can be avoided by writing a similar wrapper that uses a QemuSemaphore:

  typedef struct {
      VFIOProxy *proxy;
      VFIOUserHdr *msg;
      VFIOUserFDs *fds;
      int rsize;
      int flags;
      QemuSemaphore sem;
  } VFIOUserSendRecvData;

  static void coroutine_fn vfu_vcpu_send_recv_co(void *opaque)
  {
      VFIOUserSendRecvData *data = opaque;
      vfio_user_co_send_recv(data->proxy, data->msg, data->fds,
                             data->rsize, data->flags);
      qemu_sem_post(&data->sem);
  }

  /*
   * A blocking version of vfio_user_co_send_recv() that relies on
   * another thread to run the event loop. This can be used from vCPU
   * threads to avoid hogging the BQL.
   */
  void vfio_user_vcpu_send_recv(VFIOProxy *proxy, VFIOUserHdr *msg,
                                VFIOUserFDs *fds, int rsize, int flags)
  {
      VFIOUserSendRecvData data = {
          .proxy = proxy,
          .msg = msg,
          .fds = fds,
          .rsize = rsize,
          .flags = flags,
      };
      Coroutine *co = qemu_coroutine_create(vfu_vcpu_send_recv_co, &data);

      qemu_sem_init(&data.sem, 0);

      qemu_coroutine_enter(co);

      qemu_mutex_unlock_iothread();
      qemu_sem_wait(&data.sem);
      qemu_mutex_lock_iothread();

      qemu_sem_destroy(&data.sem);
  }

With vfio_user_vcpu_send_recv() the vCPU thread doesn't call aio_poll()
itself but instead relies on the main loop thread to run the event loop.

By writing coroutines that run in proxy->ioc->ctx we keep the threading
model and locking in the caller. The communication code isn't aware of
or tied to specific threads. It's possible to drop proxy->lock because
state is only changed from within the AioContext, not multiple threads
that may run in parallel. I think this makes the communication code
simpler and cleaner.

It's possible to use IOThreads with this approach: set the QIOChannel's
AioContext to the IOThread AioContext. However, I don't think we can do
this in the vhost-user server yet because QEMU's device models expect to
run with the BQL and not in an IOThread.

I didn't go into detail about how vfio_user_co_send_recv() is
implemented. Please let me know if you want me to share ideas about
that, but it's what we've already discussed with a "receiver" coroutine
that re-enters the reply coroutines or calls ->request(). A CoMutex is
needed around qio_channel_write_all() to ensure that coroutines
sending messages don't interleave partial writes if the socket sndbuf is
exhausted.
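
A minimal sketch of that send path (proxy->send_lock and the function
name are assumptions, not settled API):

  static int coroutine_fn vfio_user_co_send(VFIOProxy *proxy, VFIOUserHdr *msg,
                                            Error **errp)
  {
      int ret;

      /* serialize writers so partial writes from different coroutines
       * don't interleave when the socket sndbuf is exhausted */
      qemu_co_mutex_lock(&proxy->send_lock);
      ret = qio_channel_write_all(proxy->ioc, (char *)msg, msg->size, errp);
      qemu_co_mutex_unlock(&proxy->send_lock);

      return ret;
  }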

> 	On a related subject:
> 
> On Aug 24, 2021, at 8:14 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> >> +    ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds,
> >> +                                 &local_err);
> > 
> > This is a blocking call. My understanding is that the IOThread is shared
> > by all vfio-user devices, so other devices will have to wait if one of
> > them is acting up (e.g. the device emulation process sent less than
> > sizeof(msg) bytes).
> 
> 
> 	This shouldn’t block if the emulation process sends less than sizeof(msg)
> bytes.  qio_channel_readv() will eventually call recvmsg(), which only blocks a
> short read if MSG_WAITALL is set, and it’s not set in this case.  recvmsg() will
> return the data available, and vfio_user_recv() will treat a short read as an error.

That's true but vfio_user_recv() can still block later on: if only
sizeof(msg) bytes are available and msg.size > sizeof(msg) then the
second call blocks.

  msgleft = msg.size - sizeof(msg);
  if (msgleft != 0) {
      ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err);

I think the code should either be non-blocking or it shouldn't be. Writing
code that is partially non-blocking is asking for trouble because it's
not obvious where it can block and misbehaving or malicious programs can
cause it to block.

Stefan
John Johnson Sept. 15, 2021, 7:14 p.m. UTC | #14
> On Sep 15, 2021, at 6:04 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Wed, Sep 15, 2021 at 12:21:10AM +0000, John Johnson wrote:
>> 
>> 
>>> On Sep 14, 2021, at 6:06 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> 
>>> On Mon, Sep 13, 2021 at 05:23:33PM +0000, John Johnson wrote:
>>>>>> On Sep 9, 2021, at 10:25 PM, John Johnson <john.g.johnson@oracle.com> wrote:
>>>>>>> On Sep 8, 2021, at 11:29 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>>>>>> On Thu, Sep 09, 2021 at 05:11:49AM +0000, John Johnson wrote:
>>>>>>>> 	I did look at coroutines, but they seemed to work when the sender
>>>>>>>> is triggering the coroutine on send, not when request packets are arriving
>>>>>>>> asynchronously to the sends.
>>>>>>> 
>>>>>>> This can be done with a receiver coroutine. Its job is to be the only
>>>>>>> thing that reads vfio-user messages from the socket. A receiver
>>>>>>> coroutine reads messages from the socket and wakes up the waiting
>>>>>>> coroutine that yielded from vfio_user_send_recv() or
>>>>>>> vfio_user_pci_process_req().
>>>>>>> 
>>>>>>> (Although vfio_user_pci_process_req() could be called directly from the
>>>>>>> receiver coroutine, it seems safer to have a separate coroutine that
>>>>>>> processes requests so that the receiver isn't blocked in case
>>>>>>> vfio_user_pci_process_req() yields while processing a request.)
>>>>>>> 
>>>>>>> Going back to what you mentioned above, the receiver coroutine does
>>>>>>> something like this:
>>>>>>> 
>>>>>>> if it's a reply
>>>>>>>   reply = find_reply(...)
>>>>>>>   qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
>>>>>>> else
>>>>>>>   QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
>>>>>>>   if (pending_reqs_was_empty) {
>>>>>>>       qemu_coroutine_enter(process_request_co);
>>>>>>>   }
>>>>>>> 
>>>>>>> The pending_reqs queue holds incoming requests that the
>>>>>>> process_request_co coroutine processes.
>>>>>>> 
>>>>>> 
>>>>>> 
>>>>>> 	How do coroutines work across threads?  There can be multiple vCPU
>>>>>> threads waiting for replies, and I think the receiver coroutine will be
>>>>>> running in the main loop thread.  Where would a vCPU block waiting for
>>>>>> a reply?  I think coroutine_yield() returns to its coroutine_enter() caller
>>>>> 
>>>>> 
>>>>> 
>>>>> A vCPU thread holding the BQL can iterate the event loop if it has
>>>>> reached a synchronous point that needs to wait for a reply before
>>>>> returning. I think we have this situation when a MemoryRegion is
>>>>> accessed on the proxy device.
>>>>> 
>>>>> For example, block/block-backend.c:blk_prw() kicks off a coroutine and
>>>>> then runs the event loop until the coroutine finishes:
>>>>> 
>>>>> Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
>>>>> bdrv_coroutine_enter(blk_bs(blk), co);
>>>>> BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
>>>>> 
>>>>> BDRV_POLL_WHILE() boils down to a loop like this:
>>>>> 
>>>>> while ((cond)) {
>>>>>   aio_poll(ctx, true);
>>>>> }
>>>>> 
>>>> 
>>>> 	I think that would make vCPUs sending requests and the
>>>> receiver coroutine all poll on the same socket.  If the “wrong”
>>>> routine reads the message, I’d need a second level of synchronization
>>>> to pass the message to the “right” one.  e.g., if the vCPU coroutine
>>>> reads a request, it needs to pass it to the receiver; if the receiver
>>>> coroutine reads a reply, it needs to pass it to a vCPU.
>>>> 
>>>> 	Avoiding this complexity is one of the reasons I went with
>>>> a separate thread that only reads the socket over the mp-qemu model,
>>>> which does have the sender poll, but doesn’t need to handle incoming
>>>> requests.
>>> 
>>> Only one coroutine reads from the socket, the "receiver" coroutine. In a
>>> previous reply I sketched what the receiver does:
>>> 
>>> if it's a reply
>>>     reply = find_reply(...)
>>>     qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
>>> else
>>>     QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
>>>     if (pending_reqs_was_empty) {
>>>         qemu_coroutine_enter(process_request_co);
>>>     }
>>> 
>> 
>> 	Sorry, I was assuming when you said the coroutine will block with
>> aio_poll(), you implied it would also read messages from the socket.
> 
> The vCPU thread calls aio_poll() outside the coroutine, similar to the
> block/block-backend.c:blk_prw() example I posted above:
> 
>  Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
>  bdrv_coroutine_enter(blk_bs(blk), co);
>  BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
> 
> (BDRV_POLL_WHILE() is a aio_poll() loop.)
> 
> The coroutine isn't aware of aio_poll(), it just yields when it needs to
> wait.
> 
>>> The qemu_coroutine_enter(reply->co) call re-enters the coroutine that
>>> was created by the vCPU thread. Is this the "second level of
>>> synchronization" that you described? It's very similar to signalling
>>> reply->cv in the existing patch.
>>> 
>> 
>> 	Yes, the only difference is it would be woken on each message,
>> even though it doesn’t read them.  Which is what I think you’re addressing
>> below.
>> 
>>> Now I'm actually thinking about whether this can be improved by keeping
>>> the condvar so that the vCPU thread doesn't need to call aio_poll()
>>> (which is awkward because it doesn't drop the BQL and therefore blocks
>>> other vCPUs from making progress). That approach wouldn't require a
>>> dedicated thread for vfio-user.
>>> 
>> 
>> 	Wouldn’t you need to acquire BQL twice for every vCPU reply: once to
>> run the receiver coroutine, and once when the vCPU thread wakes up and wants
>> to return to the VFIO code.  The migration thread would also add a BQL
>> dependency, where it didn’t have one before.
> 
> If aio_poll() is used then the vCPU thread doesn't drop the BQL at all.
> The vCPU thread sends the message and waits for the reply while other
> BQL threads are locked out.
> 
> If a condvar or similar mechanism is used then the vCPU sends the
> message, drops the BQL, and waits on the condvar. The main loop thread
> runs the receiver coroutine and re-enters the coroutine, which signals
> the condvar. The vCPU then re-acquires the BQL.
> 

	I understand this.  The point I was trying to make was that you'd need
to acquire the BQL twice for every reply: once by the main loop before it runs
the receiver coroutine, and once by the vCPU thread after it wakes up.  That
would seem to increase latency over the iothread model.


>> 	Is your objection with using an iothread, or using a separate thread?
>> I can change to using qemu_thread_create() and running aio_poll() from the
>> thread routine, instead of creating an iothread.
> 
> The vfio-user communication code shouldn't need to worry about threads
> or locks. The code can be written in terms of AioContext so the caller
> can use it from various environments without hardcoding details of the
> BQL or threads into the communication code. This makes it easier to
> understand and less tightly coupled.
> 
> I'll try to sketch how that could work:
> 
> The main concept is VFIOProxy, which has a QIOChannel (the socket
> connection) and its main API is:
> 
>  void coroutine_fn vfio_user_co_send_recv(VFIOProxy *proxy,
>          VFIOUserHdr *msg, VFIOUserFDs *fds, int rsize, int flags);
> 
> There is also a request callback for processing incoming requests:
> 
>  void coroutine_fn (*request)(void *opaque, char *buf,
>                              VFIOUserFDs *fds);
> 
> The main loop thread can either use vfio_user_co_send_recv() from
> coroutine context or use this blocking wrapper:
> 
>  typedef struct {
>      VFIOProxy *proxy;
>      VFIOUserHdr *msg;
>      VFIOUserFDs *fds;
>      int rsize;
>      int flags;
>      bool done;
>  } VFIOUserSendRecvData;
> 
>  static void coroutine_fn vfu_send_recv_co(void *opaque)
>  {
>      VFIOUserSendRecvData *data = opaque;
>      vfio_user_co_send_recv(data->proxy, data->msg, data->fds,
>                             data->rsize, data->flags);
>      data->done = true;
>  }
> 
>  /* A blocking version of vfio_user_co_send_recv() */
>  void vfio_user_send_recv(VFIOProxy *proxy, VFIOUserHdr *msg,
>                           VFIOUserFDs *fds, int rsize, int flags)
>  {
>      VFIOUserSendRecvData data = {
>          .proxy = proxy,
> 	  .msg = msg,
> 	  .fds = fds,
> 	  .rsize = rsize,
> 	  .flags = flags,
>      };
>      Coroutine *co = qemu_coroutine_create(vfu_send_recv_co, &data);
>      qemu_coroutine_enter(co);
>      while (!data.done) {
>          aio_poll(proxy->ioc->ctx, true);
>      }
>  }
> 
> The vCPU thread can use vfio_user_send_recv() if it wants, although the
> BQL will be held, preventing other threads from making progress. That
> can be avoided by writing a similar wrapper that uses a QemuSemaphore:
> 
>  typedef struct {
>      VFIOProxy *proxy;
>      VFIOUserHdr *msg;
>      VFIOUserFDs *fds;
>      int rsize;
>      int flags;
>      QemuSemaphore sem;
>  } VFIOUserSendRecvData;
> 
>  static void coroutine_fn vfu_send_recv_co(void *opaque)
>  {
>      VFIOUserSendRecvData *data = opaque;
>      vfio_user_co_send_recv(data->proxy, data->msg, data->fds,
>                             data->rsize, data->flags);
>      qemu_sem_post(&data->sem);
>  }
> 
>  /*
>   * A blocking version of vfio_user_co_send_recv() that relies on
>   * another thread to run the event loop. This can be used from vCPU
>   * threads to avoid hogging the BQL.
>   */
>  void vfio_user_vcpu_send_recv(VFIOProxy *proxy, VFIOUserHdr *msg,
>                                VFIOUserFDs *fds, int rsize, int flags)
>  {
>      VFIOUserSendRecvData data = {
>          .proxy = proxy,
> 	  .msg = msg,
> 	  .fds = fds,
> 	  .rsize = rsize,
> 	  .flags = flags,
>      };
>      Coroutine *co = qemu_coroutine_create(vfu_vcpu_send_recv_co, &data);
> 
>      qemu_sem_init(&data.sem, 0);
> 
>      qemu_coroutine_enter(co);
> 
>      qemu_mutex_unlock_iothread();
>      qemu_sem_wait(&data.sem);
>      qemu_mutex_lock_iothread();
> 
>      qemu_sem_destroy(&data.sem);
>  }
> 
> With vfio_user_vcpu_send_recv() the vCPU thread doesn't call aio_poll()
> itself but instead relies on the main loop thread to run the event loop.
> 

	I think this means I need 2 send algorithms: one for when called
from the main loop, and another for when called outside the main loop
(vCPU or migration).  I can’t use the semaphore version from the main
loop, since blocking the main loop would prevent the receiver routine
from being scheduled, so I’d want to use aio_poll() there.

	Some vfio_user calls can come from either place (e.g., realize
uses REGION_READ to read the device config space, and vCPU uses it
on a guest load to the device), so I’d need to detect which thread I’m
running in to choose the right sender.
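
	Very roughly, that choice could look like the sketch below, where
in_main_loop_thread() is only a placeholder for whatever test we end up
using (it is not an existing helper), and the two wrappers are the ones
you sketched above:

    static void vfio_user_send_recv_any(VFIOProxy *proxy, VFIOUserHdr *msg,
                                        VFIOUserFDs *fds, int rsize, int flags)
    {
        if (in_main_loop_thread()) {
            /* main loop: drive the event loop ourselves with aio_poll() */
            vfio_user_send_recv(proxy, msg, fds, rsize, flags);
        } else {
            /* vCPU/migration: drop the BQL and wait on a semaphore */
            vfio_user_vcpu_send_recv(proxy, msg, fds, rsize, flags);
        }
    }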


> By writing coroutines that run in proxy->ioc->ctx we keep the threading
> model and locking in the caller. The communication code isn't aware of
> or tied to specific threads. It's possible to drop proxy->lock because
> state is only changed from within the AioContext, not multiple threads
> that may run in parallel. I think this makes the communication code
> simpler and cleaner.
> 
> It's possible to use IOThreads with this approach: set the QIOChannel's
> AioContext to the IOThread AioContext. However, I don't think we can do
> this in the vhost-user server yet because QEMU's device models expect to
> run with the BQL and not in an IOThread.
> 
> I didn't go into detail about how vfio_user_co_send_recv() is
> implemented. Please let me know if you want me to share ideas about
> that, but it's what we've already discussed with a "receiver" coroutine
> that re-enters the reply coroutines or calls ->request(). A CoMutex is
> needed to around qio_channel_write_all() to ensure that coroutines
> sending messages don't interleave partial writes if the socket sndbuf is
> exhausted.
> 

	Here is where I questioned how coroutines work across threads.
When the reply waiter is not the main loop, would the receiver coroutine
re-enter the reply coroutine or signal the condvar it is waiting on?


>> 	On a related subject:
>> 
>> On Aug 24, 2021, at 8:14 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>> 
>>>> +    ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds,
>>>> +                                 &local_err);
>>> 
>>> This is a blocking call. My understanding is that the IOThread is shared
>>> by all vfio-user devices, so other devices will have to wait if one of
>>> them is acting up (e.g. the device emulation process sent less than
>>> sizeof(msg) bytes).
>> 
>> 
>> 	This shouldn’t block if the emulation process sends less than sizeof(msg)
>> bytes.  qio_channel_readv() will eventually call recvmsg(), which only blocks a
>> short read if MSG_WAITALL is set, and it’s not set in this case.  recvmsg() will
>> return the data available, and vfio_user_recv() will treat a short read as an error.
> 
> That's true but vfio_user_recv() can still block later on: if only
> sizeof(msg) bytes are available and msg.size > sizeof(msg) then the
> second call blocks.
> 
>  msgleft = msg.size - sizeof(msg);
>  if (msgleft != 0) {
>      ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err);
> 
> I think the code should be either fully non-blocking or fully blocking. Writing
> code that is partially non-blocking is asking for trouble because it's
> not obvious where it can block and misbehaving or malicious programs can
> cause it to block.
> 

	I wonder if I should just go fully non-blocking, and have the
senders queue messages for the sending routine, and have the receiving
routine either signal a reply waiter or schedule a request handling
routine.
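
	Something like the sketch below, where VFIOUserMsg, the outgoing
queue, and vfio_user_send_bh() are only illustrative and not part of this
patch:

    typedef struct VFIOUserMsg {                /* illustrative only */
        QTAILQ_ENTRY(VFIOUserMsg) next;
        VFIOUserHdr *hdr;
        VFIOUserFDs *fds;
    } VFIOUserMsg;

    static void vfio_user_send_queue(VFIOProxy *proxy, VFIOUserMsg *msg)
    {
        bool was_empty;

        qemu_mutex_lock(&proxy->lock);
        was_empty = QTAILQ_EMPTY(&proxy->outgoing);
        QTAILQ_INSERT_TAIL(&proxy->outgoing, msg, next);
        qemu_mutex_unlock(&proxy->lock);

        if (was_empty) {
            /* kick the writer running in the proxy's iothread */
            aio_bh_schedule_oneshot(
                iothread_get_aio_context(vfio_user_iothread),
                vfio_user_send_bh, proxy);
        }
    }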

								JJ
Stefan Hajnoczi Sept. 16, 2021, 11:49 a.m. UTC | #15
On Wed, Sep 15, 2021 at 07:14:30PM +0000, John Johnson wrote:
> 
> 
> > On Sep 15, 2021, at 6:04 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Wed, Sep 15, 2021 at 12:21:10AM +0000, John Johnson wrote:
> >> 
> >> 
> >>> On Sep 14, 2021, at 6:06 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>> 
> >>> On Mon, Sep 13, 2021 at 05:23:33PM +0000, John Johnson wrote:
> >>>>>> On Sep 9, 2021, at 10:25 PM, John Johnson <john.g.johnson@oracle.com> wrote:
> >>>>>>> On Sep 8, 2021, at 11:29 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>>>>>> On Thu, Sep 09, 2021 at 05:11:49AM +0000, John Johnson wrote:
> >>>>>>>> 	I did look at coroutines, but they seemed to work when the sender
> >>>>>>>> is triggering the coroutine on send, not when request packets are arriving
> >>>>>>>> asynchronously to the sends.
> >>>>>>> 
> >>>>>>> This can be done with a receiver coroutine. Its job is to be the only
> >>>>>>> thing that reads vfio-user messages from the socket. A receiver
> >>>>>>> coroutine reads messages from the socket and wakes up the waiting
> >>>>>>> coroutine that yielded from vfio_user_send_recv() or
> >>>>>>> vfio_user_pci_process_req().
> >>>>>>> 
> >>>>>>> (Although vfio_user_pci_process_req() could be called directly from the
> >>>>>>> receiver coroutine, it seems safer to have a separate coroutine that
> >>>>>>> processes requests so that the receiver isn't blocked in case
> >>>>>>> vfio_user_pci_process_req() yields while processing a request.)
> >>>>>>> 
> >>>>>>> Going back to what you mentioned above, the receiver coroutine does
> >>>>>>> something like this:
> >>>>>>> 
> >>>>>>> if it's a reply
> >>>>>>>   reply = find_reply(...)
> >>>>>>>   qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
> >>>>>>> else
> >>>>>>>   QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
> >>>>>>>   if (pending_reqs_was_empty) {
> >>>>>>>       qemu_coroutine_enter(process_request_co);
> >>>>>>>   }
> >>>>>>> 
> >>>>>>> The pending_reqs queue holds incoming requests that the
> >>>>>>> process_request_co coroutine processes.
> >>>>>>> 
> >>>>>> 
> >>>>>> 
> >>>>>> 	How do coroutines work across threads?  There can be multiple vCPU
> >>>>>> threads waiting for replies, and I think the receiver coroutine will be
> >>>>>> running in the main loop thread.  Where would a vCPU block waiting for
> >>>>>> a reply?  I think coroutine_yield() returns to its coroutine_enter() caller
> >>>>> 
> >>>>> 
> >>>>> 
> >>>>> A vCPU thread holding the BQL can iterate the event loop if it has
> >>>>> reached a synchronous point that needs to wait for a reply before
> >>>>> returning. I think we have this situation when a MemoryRegion is
> >>>>> accessed on the proxy device.
> >>>>> 
> >>>>> For example, block/block-backend.c:blk_prw() kicks off a coroutine and
> >>>>> then runs the event loop until the coroutine finishes:
> >>>>> 
> >>>>> Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
> >>>>> bdrv_coroutine_enter(blk_bs(blk), co);
> >>>>> BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
> >>>>> 
> >>>>> BDRV_POLL_WHILE() boils down to a loop like this:
> >>>>> 
> >>>>> while ((cond)) {
> >>>>>   aio_poll(ctx, true);
> >>>>> }
> >>>>> 
> >>>> 
> >>>> 	I think that would make vCPUs sending requests and the
> >>>> receiver coroutine all poll on the same socket.  If the “wrong”
> >>>> routine reads the message, I’d need a second level of synchronization
> >>>> to pass the message to the “right” one.  e.g., if the vCPU coroutine
> >>>> reads a request, it needs to pass it to the receiver; if the receiver
> >>>> coroutine reads a reply, it needs to pass it to a vCPU.
> >>>> 
> >>>> 	Avoiding this complexity is one of the reasons I went with
> >>>> a separate thread that only reads the socket over the mp-qemu model,
> >>>> which does have the sender poll, but doesn’t need to handle incoming
> >>>> requests.
> >>> 
> >>> Only one coroutine reads from the socket, the "receiver" coroutine. In a
> >>> previous reply I sketched what the receiver does:
> >>> 
> >>> if it's a reply
> >>>     reply = find_reply(...)
> >>>     qemu_coroutine_enter(reply->co) // instead of signalling reply->cv
> >>> else
> >>>     QSIMPLEQ_INSERT_TAIL(&pending_reqs, request, next);
> >>>     if (pending_reqs_was_empty) {
> >>>         qemu_coroutine_enter(process_request_co);
> >>>     }
> >>> 
> >> 
> >> 	Sorry, I was assuming when you said the coroutine will block with
> >> aio_poll(), you implied it would also read messages from the socket.
> > 
> > The vCPU thread calls aio_poll() outside the coroutine, similar to the
> > block/block-backend.c:blk_prw() example I posted above:
> > 
> >  Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
> >  bdrv_coroutine_enter(blk_bs(blk), co);
> >  BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
> > 
> > (BDRV_POLL_WHILE() is a aio_poll() loop.)
> > 
> > The coroutine isn't aware of aio_poll(), it just yields when it needs to
> > wait.
> > 
> >>> The qemu_coroutine_enter(reply->co) call re-enters the coroutine that
> >>> was created by the vCPU thread. Is this the "second level of
> >>> synchronization" that you described? It's very similar to signalling
> >>> reply->cv in the existing patch.
> >>> 
> >> 
> >> 	Yes, the only difference is it would be woken on each message,
> >> even though it doesn’t read them.  Which is what I think you’re addressing
> >> below.
> >> 
> >>> Now I'm actually thinking about whether this can be improved by keeping
> >>> the condvar so that the vCPU thread doesn't need to call aio_poll()
> >>> (which is awkward because it doesn't drop the BQL and therefore blocks
> >>> other vCPUs from making progress). That approach wouldn't require a
> >>> dedicated thread for vfio-user.
> >>> 
> >> 
> >> 	Wouldn’t you need to acquire BQL twice for every vCPU reply: once to
> >> run the receiver coroutine, and once when the vCPU thread wakes up and wants
> >> to return to the VFIO code.  The migration thread would also add a BQL
> >> dependency, where it didn’t have one before.
> > 
> > If aio_poll() is used then the vCPU thread doesn't drop the BQL at all.
> > The vCPU thread sends the message and waits for the reply while other
> > BQL threads are locked out.
> > 
> > If a condvar or similar mechanism is used then the vCPU sends the
> > message, drops the BQL, and waits on the condvar. The main loop thread
> > runs the receiver coroutine and re-enters the coroutine, which signals
> > the condvar. The vCPU then re-acquires the BQL.
> > 
> 
> 	I understand this.  The point I was trying to make was that you'd need
> to acquire the BQL twice for every reply: once by the main loop before it runs
> the receiver coroutine, and once by the vCPU thread after it wakes up.  That
> would seem to increase latency over the iothread model.

Yes, but if minimizing latency is critical then you can use the
aio_poll() approach. It's fastest since it doesn't context switch or
drop the BQL.

Regarding vfio-user performance in general, devices should use ioeventfd
and/or mmap regions to avoid going through the VMM in
performance-critical code paths.

> >> 	Is your objection with using an iothread, or using a separate thread?
> >> I can change to using qemu_thread_create() and running aio_poll() from the
> >> thread routine, instead of creating an iothread.
> > 
> > The vfio-user communication code shouldn't need to worry about threads
> > or locks. The code can be written in terms of AioContext so the caller
> > can use it from various environments without hardcoding details of the
> > BQL or threads into the communication code. This makes it easier to
> > understand and less tightly coupled.
> > 
> > I'll try to sketch how that could work:
> > 
> > The main concept is VFIOProxy, which has a QIOChannel (the socket
> > connection) and its main API is:
> > 
> >  void coroutine_fn vfio_user_co_send_recv(VFIOProxy *proxy,
> >          VFIOUserHdr *msg, VFIOUserFDs *fds, int rsize, int flags);
> > 
> > There is also a request callback for processing incoming requests:
> > 
> >  void coroutine_fn (*request)(void *opaque, char *buf,
> >                              VFIOUserFDs *fds);
> > 
> > The main loop thread can either use vfio_user_co_send_recv() from
> > coroutine context or use this blocking wrapper:
> > 
> >  typedef struct {
> >      VFIOProxy *proxy;
> >      VFIOUserHdr *msg;
> >      VFIOUserFDs *fds;
> >      int rsize;
> >      int flags;
> >      bool done;
> >  } VFIOUserSendRecvData;
> > 
> >  static void coroutine_fn vfu_send_recv_co(void *opaque)
> >  {
> >      VFIOUserSendRecvData *data = opaque;
> >      vfio_user_co_send_recv(data->proxy, data->msg, data->fds,
> >                             data->rsize, data->flags);
> >      data->done = true;
> >  }
> > 
> >  /* A blocking version of vfio_user_co_send_recv() */
> >  void vfio_user_send_recv(VFIOProxy *proxy, VFIOUserHdr *msg,
> >                           VFIOUserFDs *fds, int rsize, int flags)
> >  {
> >      VFIOUserSendRecvData data = {
> >          .proxy = proxy,
> > 	  .msg = msg,
> > 	  .fds = fds,
> > 	  .rsize = rsize,
> > 	  .flags = flags,
> >      };
> >      Coroutine *co = qemu_coroutine_create(vfu_send_recv_co, &data);
> >      qemu_coroutine_enter(co);
> >      while (!data.done) {
> >          aio_poll(proxy->ioc->ctx, true);
> >      }
> >  }
> > 
> > The vCPU thread can use vfio_user_send_recv() if it wants, although the
> > BQL will be held, preventing other threads from making progress. That
> > can be avoided by writing a similar wrapper that uses a QemuSemaphore:
> > 
> >  typedef struct {
> >      VFIOProxy *proxy;
> >      VFIOUserHdr *msg;
> >      VFIOUserFDs *fds;
> >      int rsize;
> >      int flags;
> >      QemuSemaphore sem;
> >  } VFIOUserSendRecvData;
> > 
> >  static void coroutine_fn vfu_send_recv_co(void *opaque)
> >  {
> >      VFIOUserSendRecvData *data = opaque;
> >      vfio_user_co_send_recv(data->proxy, data->msg, data->fds,
> >                             data->rsize, data->flags);
> >      qemu_sem_post(&data->sem);
> >  }
> > 
> >  /*
> >   * A blocking version of vfio_user_co_send_recv() that relies on
> >   * another thread to run the event loop. This can be used from vCPU
> >   * threads to avoid hogging the BQL.
> >   */
> >  void vfio_user_vcpu_send_recv(VFIOProxy *proxy, VFIOUserHdr *msg,
> >                                VFIOUserFDs *fds, int rsize, int flags)
> >  {
> >      VFIOUserSendRecvData data = {
> >          .proxy = proxy,
> > 	  .msg = msg,
> > 	  .fds = fds,
> > 	  .rsize = rsize,
> > 	  .flags = flags,
> >      };
> >      Coroutine *co = qemu_coroutine_create(vfu_vcpu_send_recv_co, &data);
> > 
> >      qemu_sem_init(&data.sem, 0);
> > 
> >      qemu_coroutine_enter(co);
> > 
> >      qemu_mutex_unlock_iothread();
> >      qemu_sem_wait(&data.sem);
> >      qemu_mutex_lock_iothread();
> > 
> >      qemu_sem_destroy(&data.sem);
> >  }
> > 
> > With vfio_user_vcpu_send_recv() the vCPU thread doesn't call aio_poll()
> > itself but instead relies on the main loop thread to run the event loop.
> > 
> 
> 	I think this means I need 2 send algorithms: one for when called
> from the main loop, and another for when called outside the main loop
> (vCPU or migration).  I can’t use the semaphore version from the main
> loop, since blocking the main loop would prevent the receiver routine
> from being scheduled, so I’d want to use aio_poll() there.
>
> 	Some vfio_user calls can come from either place (e.g., realize
> uses REGION_READ to read the device config space, and vCPU uses it
> on a guest load to the device), so I’d need to detect which thread I’m
> running in to choose the right sender.

The semaphore version is not really necessary, although it allows other
BQL threads to make progress while we wait for a reply (but is slower,
as you pointed out).

The aio_poll() approach can be used from either thread.

> > By writing coroutines that run in proxy->ioc->ctx we keep the threading
> > model and locking in the caller. The communication code isn't aware of
> > or tied to specific threads. It's possible to drop proxy->lock because
> > state is only changed from within the AioContext, not multiple threads
> > that may run in parallel. I think this makes the communication code
> > simpler and cleaner.
> > 
> > It's possible to use IOThreads with this approach: set the QIOChannel's
> > AioContext to the IOThread AioContext. However, I don't think we can do
> > this in the vhost-user server yet because QEMU's device models expect to
> > run with the BQL and not in an IOThread.
> > 
> > I didn't go into detail about how vfio_user_co_send_recv() is
> > implemented. Please let me know if you want me to share ideas about
> > that, but it's what we've already discussed with a "receiver" coroutine
> > that re-enters the reply coroutines or calls ->request(). A CoMutex is
> > needed to around qio_channel_write_all() to ensure that coroutines
> > sending messages don't interleave partial writes if the socket sndbuf is
> > exhausted.
> > 
> 
> 	Here is where I questioned how coroutines work across threads.
> When the reply waiter is not the main loop, would the receiver coroutine
> re-enter the reply coroutine or signal the condvar it is waiting on?

The global AioContext (qemu_get_aio_context()) is shared by multiple
threads, and mutual exclusion is provided by the BQL. Any of the threads
can run the coroutines, and during a coroutine's lifetime it can run in
multiple threads (but never in more than one at the same time).

The vfu_send_recv_co() coroutine above starts in the vCPU thread but
then yields and is re-entered from the main loop thread. The receiver
coroutine re-enters vfu_send_recv_co() in the main loop thread where it
calls qemu_sem_post() to wake up the vCPU thread.

(IOThread AioContexts are not shared by multiple threads, so in that
case we don't need to worry about threads since everything is done in
the IOThread.)
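
As a concrete sketch (reusing the vfio_user_iothread from this patch),
binding the proxy's channel to an IOThread is essentially:

  qio_channel_attach_aio_context(proxy->ioc,
                                 iothread_get_aio_context(vfio_user_iothread));

and from then on the receiver coroutine and its yields stay in that thread.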

> >> 	On a related subject:
> >> 
> >> On Aug 24, 2021, at 8:14 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >> 
> >>>> +    ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds,
> >>>> +                                 &local_err);
> >>> 
> >>> This is a blocking call. My understanding is that the IOThread is shared
> >>> by all vfio-user devices, so other devices will have to wait if one of
> >>> them is acting up (e.g. the device emulation process sent less than
> >>> sizeof(msg) bytes).
> >> 
> >> 
> >> 	This shouldn’t block if the emulation process sends less than sizeof(msg)
> >> bytes.  qio_channel_readv() will eventually call recvmsg(), which only blocks a
> >> short read if MSG_WAITALL is set, and it’s not set in this case.  recvmsg() will
> >> return the data available, and vfio_user_recv() will treat a short read as an error.
> > 
> > That's true but vfio_user_recv() can still block later on: if only
> > sizeof(msg) bytes are available and msg.size > sizeof(msg) then the
> > second call blocks.
> > 
> >  msgleft = msg.size - sizeof(msg);
> >  if (msgleft != 0) {
> >      ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err);
> > 
> > I think the code should be either fully non-blocking or fully blocking. Writing
> > code that is partially non-blocking is asking for trouble because it's
> > not obvious where it can block and misbehaving or malicious programs can
> > cause it to block.
> > 
> 
> 	I wonder if I should just go fully non-blocking, and have the
> senders queue messages for the sending routine, and have the receiving
> routine either signal a reply waiter or schedule a request handling
> routine.

That sounds good.

If messages are sent from coroutine context rather than from plain functions,
then a separate sender isn't needed: the coroutines can use a CoMutex for
mutual exclusion/queuing instead of an explicit send queue and sender routine.
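
A minimal sketch of that, assuming a new send_lock CoMutex field in
VFIOProxy (initialized with qemu_co_mutex_init() at connect time):

  static void coroutine_fn vfio_user_co_send(VFIOProxy *proxy,
                                             VFIOUserHdr *msg, Error **errp)
  {
      /* serialize writers so partial writes never interleave */
      qemu_co_mutex_lock(&proxy->send_lock);
      if (qio_channel_write_all(proxy->ioc, (char *)msg, msg->size, errp)) {
          /* errp is set; the caller decides how to recover */
      }
      qemu_co_mutex_unlock(&proxy->send_lock);
  }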

Stefan
diff mbox series

Patch

diff --git a/hw/vfio/user.h b/hw/vfio/user.h
new file mode 100644
index 0000000000..62b2d03d56
--- /dev/null
+++ b/hw/vfio/user.h
@@ -0,0 +1,66 @@ 
+#ifndef VFIO_USER_H
+#define VFIO_USER_H
+
+/*
+ * vfio protocol over a UNIX socket.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+typedef struct {
+    int send_fds;
+    int recv_fds;
+    int *fds;
+} VFIOUserFDs;
+
+typedef struct VFIOUserReply {
+    QTAILQ_ENTRY(VFIOUserReply) next;
+    VFIOUserFDs *fds;
+    uint32_t rsize;
+    uint32_t id;
+    QemuCond cv;
+    bool complete;
+    bool nowait;
+} VFIOUserReply;
+
+
+enum proxy_state {
+    VFIO_PROXY_CONNECTED = 1,
+    VFIO_PROXY_RECV_ERROR = 2,
+    VFIO_PROXY_CLOSING = 3,
+    VFIO_PROXY_CLOSED = 4,
+};
+
+typedef struct VFIOProxy {
+    QLIST_ENTRY(VFIOProxy) next;
+    char *sockname;
+    struct QIOChannel *ioc;
+    int (*request)(void *opaque, char *buf, VFIOUserFDs *fds);
+    void *reqarg;
+    int flags;
+    QemuCond close_cv;
+
+    /*
+     * above only changed when BQL is held
+     * below are protected by per-proxy lock
+     */
+    QemuMutex lock;
+    QTAILQ_HEAD(, VFIOUserReply) free;
+    QTAILQ_HEAD(, VFIOUserReply) pending;
+    VFIOUserReply *last_nowait;
+    enum proxy_state state;
+    bool close_wait;
+} VFIOProxy;
+
+/* VFIOProxy flags */
+#define VFIO_PROXY_CLIENT       0x1
+#define VFIO_PROXY_SECURE       0x2
+
+VFIOProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp);
+void vfio_user_disconnect(VFIOProxy *proxy);
+
+#endif /* VFIO_USER_H */
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 8af11b0a76..f43dc6e5d0 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -75,6 +75,7 @@  typedef struct VFIOAddressSpace {
 } VFIOAddressSpace;
 
 struct VFIOGroup;
+typedef struct VFIOProxy VFIOProxy;
 
 typedef struct VFIOContainer {
     VFIOAddressSpace *space;
@@ -143,6 +144,7 @@  typedef struct VFIODevice {
     VFIOMigration *migration;
     Error *migration_blocker;
     OnOffAuto pre_copy_dirty_page_tracking;
+    VFIOProxy *proxy;
 } VFIODevice;
 
 struct VFIODeviceOps {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d642aafb7f..7c2d245ca5 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -42,6 +42,7 @@ 
 #include "qapi/error.h"
 #include "migration/blocker.h"
 #include "migration/qemu-file.h"
+#include "hw/vfio/user.h"
 
 #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
 
@@ -3361,13 +3362,35 @@  static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
     VFIOUserPCIDevice *udev = VFIO_USER_PCI(pdev);
     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
     VFIODevice *vbasedev = &vdev->vbasedev;
+    SocketAddress addr;
+    VFIOProxy *proxy;
+    Error *err = NULL;
 
+    /*
+     * TODO: make option parser understand SocketAddress
+     * and use that instead of having scaler options
+     * for each socket type.
+     */
     if (!udev->sock_name) {
         error_setg(errp, "No socket specified");
         error_append_hint(errp, "Use -device vfio-user-pci,socket=<name>\n");
         return;
     }
 
+    memset(&addr, 0, sizeof(addr));
+    addr.type = SOCKET_ADDRESS_TYPE_UNIX;
+    addr.u.q_unix.path = udev->sock_name;
+    proxy = vfio_user_connect_dev(&addr, &err);
+    if (!proxy) {
+        error_setg(errp, "Remote proxy not found");
+        return;
+    }
+    vbasedev->proxy = proxy;
+
+    if (udev->secure_dma) {
+        proxy->flags |= VFIO_PROXY_SECURE;
+    }
+
     vbasedev->name = g_strdup_printf("VFIO user <%s>", udev->sock_name);
     vbasedev->dev = DEVICE(vdev);
     vbasedev->fd = -1;
@@ -3379,6 +3402,12 @@  static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
 
 static void vfio_user_instance_finalize(Object *obj)
 {
+    VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
+    VFIODevice *vbasedev = &vdev->vbasedev;
+
+    vfio_put_device(vdev);
+
+    vfio_user_disconnect(vbasedev->proxy);
 }
 
 static Property vfio_user_pci_dev_properties[] = {
diff --git a/hw/vfio/user.c b/hw/vfio/user.c
new file mode 100644
index 0000000000..3bd304e036
--- /dev/null
+++ b/hw/vfio/user.c
@@ -0,0 +1,160 @@ 
+/*
+ * vfio protocol over a UNIX socket.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include <linux/vfio.h>
+#include <sys/ioctl.h>
+
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "hw/hw.h"
+#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio.h"
+#include "qemu/sockets.h"
+#include "io/channel.h"
+#include "io/channel-socket.h"
+#include "io/channel-util.h"
+#include "sysemu/iothread.h"
+#include "user.h"
+
+static IOThread *vfio_user_iothread;
+static void vfio_user_shutdown(VFIOProxy *proxy);
+
+
+/*
+ * Functions called by main, CPU, or iothread threads
+ */
+
+static void vfio_user_shutdown(VFIOProxy *proxy)
+{
+    qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL);
+}
+
+
+/*
+ * Functions only called by iothread
+ */
+
+static void vfio_user_cb(void *opaque)
+{
+    VFIOProxy *proxy = opaque;
+
+    qemu_mutex_lock(&proxy->lock);
+    proxy->state = VFIO_PROXY_CLOSED;
+    qemu_mutex_unlock(&proxy->lock);
+    qemu_cond_signal(&proxy->close_cv);
+}
+
+
+/*
+ * Functions called by main or CPU threads
+ */
+
+static QLIST_HEAD(, VFIOProxy) vfio_user_sockets =
+    QLIST_HEAD_INITIALIZER(vfio_user_sockets);
+
+VFIOProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp)
+{
+    VFIOProxy *proxy;
+    QIOChannelSocket *sioc;
+    QIOChannel *ioc;
+    char *sockname;
+
+    if (addr->type != SOCKET_ADDRESS_TYPE_UNIX) {
+        error_setg(errp, "vfio_user_connect - bad address family");
+        return NULL;
+    }
+    sockname = addr->u.q_unix.path;
+
+    sioc = qio_channel_socket_new();
+    ioc = QIO_CHANNEL(sioc);
+    if (qio_channel_socket_connect_sync(sioc, addr, errp)) {
+        object_unref(OBJECT(ioc));
+        return NULL;
+    }
+    qio_channel_set_blocking(ioc, true, NULL);
+
+    proxy = g_malloc0(sizeof(VFIOProxy));
+    proxy->sockname = sockname;
+    proxy->ioc = ioc;
+    proxy->flags = VFIO_PROXY_CLIENT;
+    proxy->state = VFIO_PROXY_CONNECTED;
+    qemu_cond_init(&proxy->close_cv);
+
+    if (vfio_user_iothread == NULL) {
+        vfio_user_iothread = iothread_create("VFIO user", errp);
+    }
+
+    qemu_mutex_init(&proxy->lock);
+    QTAILQ_INIT(&proxy->free);
+    QTAILQ_INIT(&proxy->pending);
+    QLIST_INSERT_HEAD(&vfio_user_sockets, proxy, next);
+
+    return proxy;
+}
+
+void vfio_user_disconnect(VFIOProxy *proxy)
+{
+    VFIOUserReply *r1, *r2;
+
+    qemu_mutex_lock(&proxy->lock);
+
+    /* our side is quitting */
+    if (proxy->state == VFIO_PROXY_CONNECTED) {
+        vfio_user_shutdown(proxy);
+        if (!QTAILQ_EMPTY(&proxy->pending)) {
+            error_printf("vfio_user_disconnect: outstanding requests\n");
+        }
+    }
+    object_unref(OBJECT(proxy->ioc));
+    proxy->ioc = NULL;
+
+    proxy->state = VFIO_PROXY_CLOSING;
+    QTAILQ_FOREACH_SAFE(r1, &proxy->pending, next, r2) {
+        qemu_cond_destroy(&r1->cv);
+        QTAILQ_REMOVE(&proxy->pending, r1, next);
+        g_free(r1);
+    }
+    QTAILQ_FOREACH_SAFE(r1, &proxy->free, next, r2) {
+        qemu_cond_destroy(&r1->cv);
+        QTAILQ_REMOVE(&proxy->free, r1, next);
+        g_free(r1);
+    }
+
+    /*
+     * Make sure the iothread isn't blocking anywhere
+     * with a ref to this proxy by waiting for a BH
+     * handler to run after the proxy fd handlers were
+     * deleted above.
+     */
+    proxy->close_wait = 1;
+    aio_bh_schedule_oneshot(iothread_get_aio_context(vfio_user_iothread),
+                            vfio_user_cb, proxy);
+
+    /* drop locks so the iothread can make progress */
+    qemu_mutex_unlock_iothread();
+    qemu_cond_wait(&proxy->close_cv, &proxy->lock);
+
+    /* we now hold the only ref to proxy */
+    qemu_mutex_unlock(&proxy->lock);
+    qemu_cond_destroy(&proxy->close_cv);
+    qemu_mutex_destroy(&proxy->lock);
+
+    qemu_mutex_lock_iothread();
+
+    QLIST_REMOVE(proxy, next);
+    if (QLIST_EMPTY(&vfio_user_sockets)) {
+        iothread_destroy(vfio_user_iothread);
+        vfio_user_iothread = NULL;
+    }
+
+    g_free(proxy);
+}
diff --git a/MAINTAINERS b/MAINTAINERS
index d838b9e3f2..f429bab391 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1882,8 +1882,12 @@  L: qemu-s390x@nongnu.org
 vfio-user
 M: John G Johnson <john.g.johnson@oracle.com>
 M: Thanos Makatos <thanos.makatos@nutanix.com>
+M: Elena Ufimtseva <elena.ufimtseva@oracle.com>
+M: Jagannathan Raman <jag.raman@oracle.com>
 S: Supported
 F: docs/devel/vfio-user.rst
+F: hw/vfio/user.c
+F: hw/vfio/user.h
 
 vhost
 M: Michael S. Tsirkin <mst@redhat.com>
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index da9af297a0..739b30be73 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -8,6 +8,7 @@  vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
   'display.c',
   'pci-quirks.c',
   'pci.c',
+  'user.c',
 ))
 vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c'))
 vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c'))