diff mbox series

[RFC,03/19] vfio-user: define VFIO Proxy and communication functions

Message ID cd9d5d6214d957db61120d9c3cbdc99e799a3baa.1626675354.git.elena.ufimtseva@oracle.com (mailing list archive)
State New, archived
Headers show
Series vfio-user implementation | expand

Commit Message

Elena Ufimtseva July 19, 2021, 6:27 a.m. UTC
From: John G Johnson <john.g.johnson@oracle.com>

Add user.c and user.h files for vfio-user with the basic
send and receive functions.

Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 hw/vfio/user.h                | 120 ++++++++++++++
 include/hw/vfio/vfio-common.h |   2 +
 hw/vfio/user.c                | 286 ++++++++++++++++++++++++++++++++++
 MAINTAINERS                   |   4 +
 hw/vfio/meson.build           |   1 +
 5 files changed, 413 insertions(+)
 create mode 100644 hw/vfio/user.h
 create mode 100644 hw/vfio/user.c

Comments

Stefan Hajnoczi July 27, 2021, 4:34 p.m. UTC | #1
On Sun, Jul 18, 2021 at 11:27:42PM -0700, Elena Ufimtseva wrote:
> From: John G Johnson <john.g.johnson@oracle.com>
> 
> Add user.c and user.h files for vfio-user with the basic
> send and receive functions.
> 
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
>  hw/vfio/user.h                | 120 ++++++++++++++
>  include/hw/vfio/vfio-common.h |   2 +
>  hw/vfio/user.c                | 286 ++++++++++++++++++++++++++++++++++
>  MAINTAINERS                   |   4 +
>  hw/vfio/meson.build           |   1 +
>  5 files changed, 413 insertions(+)
>  create mode 100644 hw/vfio/user.h
>  create mode 100644 hw/vfio/user.c

The multi-threading, coroutine, and blocking I/O requirements of
vfio_user_recv() and vfio_user_send_reply() are unclear to me. Please
document them so it's clear what environment they can be called from. I
guess they are not called from coroutines and proxy->ioc is a blocking
IOChannel?

> 
> diff --git a/hw/vfio/user.h b/hw/vfio/user.h
> new file mode 100644
> index 0000000000..cdbc074579
> --- /dev/null
> +++ b/hw/vfio/user.h
> @@ -0,0 +1,120 @@
> +#ifndef VFIO_USER_H
> +#define VFIO_USER_H
> +
> +/*
> + * vfio protocol over a UNIX socket.
> + *
> + * Copyright © 2018, 2021 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Each message has a standard header that describes the command
> + * being sent, which is almost always a VFIO ioctl().
> + *
> + * The header may be followed by command-specfic data, such as the
> + * region and offset info for read and write commands.
> + */
> +
> +/* commands */
> +enum vfio_user_command {
> +    VFIO_USER_VERSION                   = 1,
> +    VFIO_USER_DMA_MAP                   = 2,
> +    VFIO_USER_DMA_UNMAP                 = 3,
> +    VFIO_USER_DEVICE_GET_INFO           = 4,
> +    VFIO_USER_DEVICE_GET_REGION_INFO    = 5,
> +    VFIO_USER_DEVICE_GET_REGION_IO_FDS  = 6,
> +    VFIO_USER_DEVICE_GET_IRQ_INFO       = 7,
> +    VFIO_USER_DEVICE_SET_IRQS           = 8,
> +    VFIO_USER_REGION_READ               = 9,
> +    VFIO_USER_REGION_WRITE              = 10,
> +    VFIO_USER_DMA_READ                  = 11,
> +    VFIO_USER_DMA_WRITE                 = 12,
> +    VFIO_USER_DEVICE_RESET              = 13,
> +    VFIO_USER_DIRTY_PAGES               = 14,
> +    VFIO_USER_MAX,
> +};
> +
> +/* flags */
> +#define VFIO_USER_REQUEST       0x0
> +#define VFIO_USER_REPLY         0x1
> +#define VFIO_USER_TYPE          0xF
> +
> +#define VFIO_USER_NO_REPLY      0x10
> +#define VFIO_USER_ERROR         0x20
> +
> +typedef struct vfio_user_hdr {
> +    uint16_t id;
> +    uint16_t command;
> +    uint32_t size;
> +    uint32_t flags;
> +    uint32_t error_reply;
> +} vfio_user_hdr_t;

Please use QEMU coding style in QEMU code (i.e. not shared with Linux or
external libraries):

  typedef struct {
      ...
  } VfioUserHdr;

You can also specify the struct VfioUserHdr tag if you want but it's
only necessary to reference the struct before the end of the typedef
definition.

https://qemu-project.gitlab.io/qemu/devel/style.html

> +
> +/*
> + * VFIO_USER_VERSION
> + */
> +#define VFIO_USER_MAJOR_VER     0
> +#define VFIO_USER_MINOR_VER     0
> +
> +struct vfio_user_version {
> +    vfio_user_hdr_t hdr;
> +    uint16_t major;
> +    uint16_t minor;
> +    char capabilities[];
> +};
> +
> +#define VFIO_USER_DEF_MAX_FDS   8
> +#define VFIO_USER_MAX_MAX_FDS   16
> +
> +#define VFIO_USER_DEF_MAX_XFER  (1024 * 1024)
> +#define VFIO_USER_MAX_MAX_XFER  (64 * 1024 * 1024)
> +
> +typedef struct VFIOUserFDs {
> +    int send_fds;
> +    int recv_fds;
> +    int *fds;
> +} VFIOUserFDs;

I think around here we switch from vfio-user spec definitions to QEMU
implementation details. It might be nice to keep the vfio-user spec
definitions in a separate header file so the boundary is clear.

> +
> +typedef struct VFIOUserReply {
> +    QTAILQ_ENTRY(VFIOUserReply) next;
> +    vfio_user_hdr_t *msg;
> +    VFIOUserFDs *fds;
> +    int rsize;
> +    uint32_t id;
> +    QemuCond cv;
> +    uint8_t complete;

Please use bool.

> +} VFIOUserReply;
> +
> +enum proxy_state {
> +    CONNECTED = 1,
> +    RECV_ERROR = 2,
> +    CLOSING = 3,
> +    CLOSED = 4,
> +};

These enum values probably need a prefix (VFIO_PROXY_*). Generic short
names like CONNECTED, CLOSED, etc could lead to namespace collisions.
Enum constants are in the global namespace.

> +
> +typedef struct VFIOProxy {
> +    QLIST_ENTRY(VFIOProxy) next;
> +    char *sockname;
> +    struct QIOChannel *ioc;
> +    int (*request)(void *opaque, char *buf, VFIOUserFDs *fds);
> +    void *reqarg;
> +    int flags;
> +    QemuCond close_cv;
> +
> +    /*
> +     * above only changed when iolock is held

Please use "BQL" instead of "iolock". git grep shows many results for
BQL and the only result for iolock is in mpqemu code.

> +     * below are protected by per-proxy lock
> +     */
> +    QemuMutex lock;
> +    QTAILQ_HEAD(, VFIOUserReply) free;
> +    QTAILQ_HEAD(, VFIOUserReply) pending;
> +    enum proxy_state state;
> +    int close_wait;

Is this a bool? Please use bool.

> +} VFIOProxy;
> +
> +#define VFIO_PROXY_CLIENT       0x1

A comment that shows which field VFIO_PROXY_CLIENT relates would make this clearer:

  /* VFIOProxy->flags */
  #define VFIO_PROXY_CLIENT 0x1

> +
> +void vfio_user_recv(void *opaque);
> +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret);
> +#endif /* VFIO_USER_H */
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 8af11b0a76..f43dc6e5d0 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -75,6 +75,7 @@ typedef struct VFIOAddressSpace {
>  } VFIOAddressSpace;
>  
>  struct VFIOGroup;
> +typedef struct VFIOProxy VFIOProxy;
>  
>  typedef struct VFIOContainer {
>      VFIOAddressSpace *space;
> @@ -143,6 +144,7 @@ typedef struct VFIODevice {
>      VFIOMigration *migration;
>      Error *migration_blocker;
>      OnOffAuto pre_copy_dirty_page_tracking;
> +    VFIOProxy *proxy;
>  } VFIODevice;
>  
>  struct VFIODeviceOps {
> diff --git a/hw/vfio/user.c b/hw/vfio/user.c
> new file mode 100644
> index 0000000000..021d5540e0
> --- /dev/null
> +++ b/hw/vfio/user.c
> @@ -0,0 +1,286 @@
> +/*
> + * vfio protocol over a UNIX socket.
> + *
> + * Copyright © 2018, 2021 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/osdep.h"
> +#include <linux/vfio.h>
> +#include <sys/ioctl.h>
> +
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "qemu/main-loop.h"
> +#include "hw/hw.h"
> +#include "hw/vfio/vfio-common.h"
> +#include "hw/vfio/vfio.h"
> +#include "qemu/sockets.h"
> +#include "io/channel.h"
> +#include "io/channel-util.h"
> +#include "sysemu/iothread.h"
> +#include "user.h"
> +
> +static uint64_t max_xfer_size = VFIO_USER_DEF_MAX_XFER;
> +static IOThread *vfio_user_iothread;
> +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
> +                                  VFIOUserFDs *fds);
> +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
> +                           VFIOUserFDs *fds);
> +static void vfio_user_shutdown(VFIOProxy *proxy);
> +
> +static void vfio_user_shutdown(VFIOProxy *proxy)
> +{
> +    qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL);
> +    qio_channel_set_aio_fd_handler(proxy->ioc,
> +                                   iothread_get_aio_context(vfio_user_iothread),
> +                                   NULL, NULL, NULL);

There is no other qio_channel_set_aio_fd_handler() call in this patch.
Why is this one necessary?

> +}
> +
> +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret)
> +{
> +    vfio_user_hdr_t *hdr = (vfio_user_hdr_t *)buf;
> +
> +    /*
> +     * convert header to associated reply
> +     * positive ret is reply size, negative is error code
> +     */
> +    hdr->flags = VFIO_USER_REPLY;
> +    if (ret > 0) {
> +        hdr->size = ret;
> +    } else if (ret < 0) {
> +        hdr->flags |= VFIO_USER_ERROR;
> +        hdr->error_reply = -ret;
> +        hdr->size = sizeof(*hdr);
> +    }

assert(ret != 0)? That case doesn't seem to be defined so maybe an
assertion is worthwhile.

> +    vfio_user_send(proxy, hdr, NULL);
> +}
> +
> +void vfio_user_recv(void *opaque)
> +{
> +    VFIODevice *vbasedev = opaque;
> +    VFIOProxy *proxy = vbasedev->proxy;
> +    VFIOUserReply *reply = NULL;
> +    g_autofree int *fdp = NULL;
> +    VFIOUserFDs reqfds = { 0, 0, fdp };
> +    vfio_user_hdr_t msg;
> +    struct iovec iov = {
> +        .iov_base = &msg,
> +        .iov_len = sizeof(msg),
> +    };
> +    int isreply, i, ret;
> +    size_t msgleft, numfds = 0;
> +    char *data = NULL;
> +    g_autofree char *buf = NULL;
> +    Error *local_err = NULL;
> +
> +    qemu_mutex_lock(&proxy->lock);
> +    if (proxy->state == CLOSING) {
> +        qemu_mutex_unlock(&proxy->lock);

QEMU_LOCK_GUARD() automatically unlocks mutexes when the function
returns and is less error-prone than manual lock/unlock calls.

> +        return;
> +    }
> +
> +    ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds,
> +                                 &local_err);
> +    if (ret <= 0) {
> +        /* read error or other side closed connection */
> +        error_setg_errno(&local_err, errno, "vfio_user_recv read error");

This will trigger an assertion failure when local_err was already set by
qio_channel_readv_full():

  static void error_setv(Error **errp,
                         const char *src, int line, const char *func,
                         ErrorClass err_class, const char *fmt, va_list ap,
                         const char *suffix)
  {
      Error *err;
      int saved_errno = errno;
  
      if (errp == NULL) {
          return;
      }
      assert(*errp == NULL);
      ^^^^^^^^^^^^^^^^^^^^^^

I think this error_setg_errno() call should be dropped. You can use
error_prepend() if you'd like to add more information to the error
message from qio_channel_readv_full().

> +        goto fatal;
> +    }
> +
> +    if (ret < sizeof(msg)) {
> +        error_setg(&local_err, "vfio_user_recv short read of header");
> +        goto err;
> +    }
> +
> +    /*
> +     * For replies, find the matching pending request
> +     */
> +    switch (msg.flags & VFIO_USER_TYPE) {
> +    case VFIO_USER_REQUEST:
> +        isreply = 0;
> +        break;
> +    case VFIO_USER_REPLY:
> +        isreply = 1;
> +        break;
> +    default:
> +        error_setg(&local_err, "vfio_user_recv unknown message type");
> +        goto err;
> +    }
> +
> +    if (isreply) {
> +        QTAILQ_FOREACH(reply, &proxy->pending, next) {
> +            if (msg.id == reply->id) {
> +                break;
> +            }
> +        }

I'm surprised to see this loop since proxy->lock prevents additional
requests from being sent while we're trying to receive a message. Can
there really be multiple replies pending with this locking scheme?

> +        if (reply == NULL) {
> +            error_setg(&local_err, "vfio_user_recv unexpected reply");
> +            goto err;
> +        }
> +        QTAILQ_REMOVE(&proxy->pending, reply, next);
> +
> +        /*
> +         * Process any received FDs
> +         */
> +        if (numfds != 0) {
> +            if (reply->fds == NULL || reply->fds->recv_fds < numfds) {
> +                error_setg(&local_err, "vfio_user_recv unexpected FDs");
> +                goto err;
> +            }
> +            reply->fds->recv_fds = numfds;
> +            memcpy(reply->fds->fds, fdp, numfds * sizeof(int));
> +        }
> +
> +    } else {
> +        /*
> +         * The client doesn't expect any FDs in requests, but
> +         * they will be expected on the server
> +         */
> +        if (numfds != 0 && (proxy->flags & VFIO_PROXY_CLIENT)) {
> +            error_setg(&local_err, "vfio_user_recv fd in client reply");
> +            goto err;
> +        }
> +        reqfds.recv_fds = numfds;
> +    }
> +
> +    /*
> +     * put the whole message into a single buffer
> +     */
> +    msgleft = msg.size - sizeof(msg);

msg.size has not been validated so this could underflow. Please validate
all inputs so malicious servers/clients cannot crash or compromise the
program.

> +    if (isreply) {
> +        if (msg.size > reply->rsize) {

rsize is an int. Should it be uint32_t like msg.size?

> +            error_setg(&local_err,
> +                       "vfio_user_recv reply larger than recv buffer");
> +            goto fatal;
> +        }
> +        *reply->msg = msg;
> +        data = (char *)reply->msg + sizeof(msg);
> +    } else {
> +        if (msg.size > max_xfer_size) {
> +            error_setg(&local_err, "vfio_user_recv request larger than max");
> +            goto fatal;
> +        }

Missing check to prevent buffer overflow:

  if (msg.size < sizeof(msg)) {
      error_setg(&local_err, "vfio_user_recv request too small");
      goto fatal;
  }

> +        buf = g_malloc0(msg.size);
> +        memcpy(buf, &msg, sizeof(msg));
> +        data = buf + sizeof(msg);
> +    }
> +
> +    if (msgleft != 0) {
> +        ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err);
> +        if (ret < 0) {
> +            goto fatal;
> +        }
> +        if (ret != msgleft) {
> +            error_setg(&local_err, "vfio_user_recv short read of msg body");
> +            goto err;
> +        }
> +    }
> +
> +    /*
> +     * Replies signal a waiter, requests get processed by vfio code
> +     * that may assume the iothread lock is held.
> +     */
> +    qemu_mutex_unlock(&proxy->lock);
> +    if (isreply) {
> +        reply->complete = 1;
> +        qemu_cond_signal(&reply->cv);

signal must be called with the mutex held to avoid race conditions. If
the waiter acquires the lock and still sees complete == 0, then we
signal before wait is entered, the signal is missed and the waiter is
stuck.

> +    } else {
> +        qemu_mutex_lock_iothread();
> +        /*
> +         * make sure proxy wasn't closed while we waited
> +         * checking without holding the proxy lock is safe
> +         * since state is only set to CLOSING when iolock is held

s/iolock/the BQL/

> +         */
> +        if (proxy->state != CLOSING) {
> +            ret = proxy->request(proxy->reqarg, buf, &reqfds);
> +            if (ret < 0 && !(msg.flags & VFIO_USER_NO_REPLY)) {
> +                vfio_user_send_reply(proxy, buf, ret);
> +            }
> +        }
> +        qemu_mutex_unlock_iothread();
> +    }
> +
> +    return;
> + fatal:
> +    vfio_user_shutdown(proxy);
> +    proxy->state = RECV_ERROR;
> +
> + err:
> +    qemu_mutex_unlock(&proxy->lock);
> +    for (i = 0; i < numfds; i++) {
> +        close(fdp[i]);
> +    }
> +    if (reply != NULL) {
> +        /* force an error to keep sending thread from hanging */
> +        reply->msg->flags |= VFIO_USER_ERROR;
> +        reply->msg->error_reply = EINVAL;
> +        reply->complete = 1;
> +        qemu_cond_signal(&reply->cv);

This has the race condition too.

> +    }
> +    error_report_err(local_err);
> +}
> +
> +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
> +                                  VFIOUserFDs *fds)
> +{
> +    struct iovec iov = {
> +        .iov_base = msg,
> +        .iov_len = msg->size,
> +    };
> +    size_t numfds = 0;
> +    int msgleft, ret, *fdp = NULL;
> +    char *buf;
> +    Error *local_err = NULL;
> +
> +    if (proxy->state != CONNECTED) {
> +        msg->flags |= VFIO_USER_ERROR;
> +        msg->error_reply = ECONNRESET;
> +        return;
> +    }
> +
> +    if (fds != NULL && fds->send_fds != 0) {
> +        numfds = fds->send_fds;
> +        fdp = fds->fds;
> +    }
> +    ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, &local_err);
> +    if (ret < 0) {
> +        goto err;
> +    }
> +    if (ret == msg->size) {
> +        return;
> +    }
> +
> +    buf = iov.iov_base + ret;
> +    msgleft = iov.iov_len - ret;
> +    do {
> +        ret = qio_channel_write(proxy->ioc, buf, msgleft, &local_err);
> +        if (ret < 0) {
> +            goto err;
> +        }
> +        buf += ret, msgleft -= ret;

Please use semicolon. Comma operators are rare, requiring readers to
check their exact semantics. There is no need to use comma here.

> +    } while (msgleft != 0);
> +    return;
> +
> + err:
> +    error_report_err(local_err);

State remains unchanged and msg->error_reply isn't set?

> +}
> +
> +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
> +                           VFIOUserFDs *fds)
> +{
> +    bool iolock = qemu_mutex_iothread_locked();
> +
> +    if (iolock) {
> +        qemu_mutex_unlock_iothread();
> +    }
> +    qemu_mutex_lock(&proxy->lock);
> +    vfio_user_send_locked(proxy, msg, fds);
> +    qemu_mutex_unlock(&proxy->lock);
> +    if (iolock) {
> +        qemu_mutex_lock_iothread();
> +    }
> +}
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 12d69f3a45..aa4df6c418 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1883,8 +1883,12 @@ L: qemu-s390x@nongnu.org
>  vfio-user
>  M: John G Johnson <john.g.johnson@oracle.com>
>  M: Thanos Makatos <thanos.makatos@nutanix.com>
> +M: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> +M: Jagannathan Raman <jag.raman@oracle.com>
>  S: Supported
>  F: docs/devel/vfio-user.rst
> +F: hw/vfio/user.c
> +F: hw/vfio/user.h
>  
>  vhost
>  M: Michael S. Tsirkin <mst@redhat.com>
> diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
> index da9af297a0..739b30be73 100644
> --- a/hw/vfio/meson.build
> +++ b/hw/vfio/meson.build
> @@ -8,6 +8,7 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
>    'display.c',
>    'pci-quirks.c',
>    'pci.c',
> +  'user.c',
>  ))
>  vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c'))
>  vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c'))
> -- 
> 2.25.1
>
John Johnson July 28, 2021, 6:08 p.m. UTC | #2
> On Jul 27, 2021, at 9:34 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Sun, Jul 18, 2021 at 11:27:42PM -0700, Elena Ufimtseva wrote:
>> From: John G Johnson <john.g.johnson@oracle.com>
>> 
>> Add user.c and user.h files for vfio-user with the basic
>> send and receive functions.
>> 
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> ---
>> hw/vfio/user.h                | 120 ++++++++++++++
>> include/hw/vfio/vfio-common.h |   2 +
>> hw/vfio/user.c                | 286 ++++++++++++++++++++++++++++++++++
>> MAINTAINERS                   |   4 +
>> hw/vfio/meson.build           |   1 +
>> 5 files changed, 413 insertions(+)
>> create mode 100644 hw/vfio/user.h
>> create mode 100644 hw/vfio/user.c
> 
> The multi-threading, coroutine, and blocking I/O requirements of
> vfio_user_recv() and vfio_user_send_reply() are unclear to me. Please
> document them so it's clear what environment they can be called from. I
> guess they are not called from coroutines and proxy->ioc is a blocking
> IOChannel?
> 

	Yes to both, moreover, a block comment above vfio_user_recv() would
be useful.  The call to setup vfio_user_recv() as the socket handler isn’t
in this patch, do you want the series re-org’d?



>> 
>> diff --git a/hw/vfio/user.h b/hw/vfio/user.h
>> new file mode 100644
>> index 0000000000..cdbc074579
>> --- /dev/null
>> +++ b/hw/vfio/user.h
>> @@ -0,0 +1,120 @@
>> +#ifndef VFIO_USER_H
>> +#define VFIO_USER_H
>> +
>> +/*
>> + * vfio protocol over a UNIX socket.
>> + *
>> + * Copyright © 2018, 2021 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + * Each message has a standard header that describes the command
>> + * being sent, which is almost always a VFIO ioctl().
>> + *
>> + * The header may be followed by command-specfic data, such as the
>> + * region and offset info for read and write commands.
>> + */
>> +
>> +/* commands */
>> +enum vfio_user_command {
>> +    VFIO_USER_VERSION                   = 1,
>> +    VFIO_USER_DMA_MAP                   = 2,
>> +    VFIO_USER_DMA_UNMAP                 = 3,
>> +    VFIO_USER_DEVICE_GET_INFO           = 4,
>> +    VFIO_USER_DEVICE_GET_REGION_INFO    = 5,
>> +    VFIO_USER_DEVICE_GET_REGION_IO_FDS  = 6,
>> +    VFIO_USER_DEVICE_GET_IRQ_INFO       = 7,
>> +    VFIO_USER_DEVICE_SET_IRQS           = 8,
>> +    VFIO_USER_REGION_READ               = 9,
>> +    VFIO_USER_REGION_WRITE              = 10,
>> +    VFIO_USER_DMA_READ                  = 11,
>> +    VFIO_USER_DMA_WRITE                 = 12,
>> +    VFIO_USER_DEVICE_RESET              = 13,
>> +    VFIO_USER_DIRTY_PAGES               = 14,
>> +    VFIO_USER_MAX,
>> +};
>> +
>> +/* flags */
>> +#define VFIO_USER_REQUEST       0x0
>> +#define VFIO_USER_REPLY         0x1
>> +#define VFIO_USER_TYPE          0xF
>> +
>> +#define VFIO_USER_NO_REPLY      0x10
>> +#define VFIO_USER_ERROR         0x20
>> +
>> +typedef struct vfio_user_hdr {
>> +    uint16_t id;
>> +    uint16_t command;
>> +    uint32_t size;
>> +    uint32_t flags;
>> +    uint32_t error_reply;
>> +} vfio_user_hdr_t;
> 
> Please use QEMU coding style in QEMU code (i.e. not shared with Linux or
> external libraries):
> 
>  typedef struct {
>      ...
>  } VfioUserHdr;
> 
> You can also specify the struct VfioUserHdr tag if you want but it's
> only necessary to reference the struct before the end of the typedef
> definition.
> 
> https://qemu-project.gitlab.io/qemu/devel/style.html
> 

	OK

>> +
>> +/*
>> + * VFIO_USER_VERSION
>> + */
>> +#define VFIO_USER_MAJOR_VER     0
>> +#define VFIO_USER_MINOR_VER     0
>> +
>> +struct vfio_user_version {
>> +    vfio_user_hdr_t hdr;
>> +    uint16_t major;
>> +    uint16_t minor;
>> +    char capabilities[];
>> +};
>> +
>> +#define VFIO_USER_DEF_MAX_FDS   8
>> +#define VFIO_USER_MAX_MAX_FDS   16
>> +
>> +#define VFIO_USER_DEF_MAX_XFER  (1024 * 1024)
>> +#define VFIO_USER_MAX_MAX_XFER  (64 * 1024 * 1024)
>> +
>> +typedef struct VFIOUserFDs {
>> +    int send_fds;
>> +    int recv_fds;
>> +    int *fds;
>> +} VFIOUserFDs;
> 
> I think around here we switch from vfio-user spec definitions to QEMU
> implementation details. It might be nice to keep the vfio-user spec
> definitions in a separate header file so the boundary is clear.
> 

	OK


>> +
>> +typedef struct VFIOUserReply {
>> +    QTAILQ_ENTRY(VFIOUserReply) next;
>> +    vfio_user_hdr_t *msg;
>> +    VFIOUserFDs *fds;
>> +    int rsize;
>> +    uint32_t id;
>> +    QemuCond cv;
>> +    uint8_t complete;
> 
> Please use bool.
> 

	OK

>> +} VFIOUserReply;
>> +
>> +enum proxy_state {
>> +    CONNECTED = 1,
>> +    RECV_ERROR = 2,
>> +    CLOSING = 3,
>> +    CLOSED = 4,
>> +};
> 
> These enum values probably need a prefix (VFIO_PROXY_*). Generic short
> names like CONNECTED, CLOSED, etc could lead to namespace collisions.
> Enum constants are in the global namespace.
> 

	OK


>> +
>> +typedef struct VFIOProxy {
>> +    QLIST_ENTRY(VFIOProxy) next;
>> +    char *sockname;
>> +    struct QIOChannel *ioc;
>> +    int (*request)(void *opaque, char *buf, VFIOUserFDs *fds);
>> +    void *reqarg;
>> +    int flags;
>> +    QemuCond close_cv;
>> +
>> +    /*
>> +     * above only changed when iolock is held
> 
> Please use "BQL" instead of "iolock". git grep shows many results for
> BQL and the only result for iolock is in mpqemu code.
> 

	OK

>> +     * below are protected by per-proxy lock
>> +     */
>> +    QemuMutex lock;
>> +    QTAILQ_HEAD(, VFIOUserReply) free;
>> +    QTAILQ_HEAD(, VFIOUserReply) pending;
>> +    enum proxy_state state;
>> +    int close_wait;
> 
> Is this a bool? Please use bool.

	yes it’s a bool


> 
>> +} VFIOProxy;
>> +
>> +#define VFIO_PROXY_CLIENT       0x1
> 
> A comment that shows which field VFIO_PROXY_CLIENT relates would make this clearer:
> 
>  /* VFIOProxy->flags */
>  #define VFIO_PROXY_CLIENT 0x1
> 

	OK

>> +
>> +void vfio_user_recv(void *opaque);
>> +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret);
>> +#endif /* VFIO_USER_H */
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 8af11b0a76..f43dc6e5d0 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -75,6 +75,7 @@ typedef struct VFIOAddressSpace {
>> } VFIOAddressSpace;
>> 
>> struct VFIOGroup;
>> +typedef struct VFIOProxy VFIOProxy;
>> 
>> typedef struct VFIOContainer {
>>     VFIOAddressSpace *space;
>> @@ -143,6 +144,7 @@ typedef struct VFIODevice {
>>     VFIOMigration *migration;
>>     Error *migration_blocker;
>>     OnOffAuto pre_copy_dirty_page_tracking;
>> +    VFIOProxy *proxy;
>> } VFIODevice;
>> 
>> struct VFIODeviceOps {
>> diff --git a/hw/vfio/user.c b/hw/vfio/user.c
>> new file mode 100644
>> index 0000000000..021d5540e0
>> --- /dev/null
>> +++ b/hw/vfio/user.c
>> @@ -0,0 +1,286 @@
>> +/*
>> + * vfio protocol over a UNIX socket.
>> + *
>> + * Copyright © 2018, 2021 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include <linux/vfio.h>
>> +#include <sys/ioctl.h>
>> +
>> +#include "qemu/error-report.h"
>> +#include "qapi/error.h"
>> +#include "qemu/main-loop.h"
>> +#include "hw/hw.h"
>> +#include "hw/vfio/vfio-common.h"
>> +#include "hw/vfio/vfio.h"
>> +#include "qemu/sockets.h"
>> +#include "io/channel.h"
>> +#include "io/channel-util.h"
>> +#include "sysemu/iothread.h"
>> +#include "user.h"
>> +
>> +static uint64_t max_xfer_size = VFIO_USER_DEF_MAX_XFER;
>> +static IOThread *vfio_user_iothread;
>> +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
>> +                                  VFIOUserFDs *fds);
>> +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
>> +                           VFIOUserFDs *fds);
>> +static void vfio_user_shutdown(VFIOProxy *proxy);
>> +
>> +static void vfio_user_shutdown(VFIOProxy *proxy)
>> +{
>> +    qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL);
>> +    qio_channel_set_aio_fd_handler(proxy->ioc,
>> +                                   iothread_get_aio_context(vfio_user_iothread),
>> +                                   NULL, NULL, NULL);
> 
> There is no other qio_channel_set_aio_fd_handler() call in this patch.
> Why is this one necessary?
> 

	See first comment.

>> +}
>> +
>> +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret)
>> +{
>> +    vfio_user_hdr_t *hdr = (vfio_user_hdr_t *)buf;
>> +
>> +    /*
>> +     * convert header to associated reply
>> +     * positive ret is reply size, negative is error code
>> +     */
>> +    hdr->flags = VFIO_USER_REPLY;
>> +    if (ret > 0) {
>> +        hdr->size = ret;
>> +    } else if (ret < 0) {
>> +        hdr->flags |= VFIO_USER_ERROR;
>> +        hdr->error_reply = -ret;
>> +        hdr->size = sizeof(*hdr);
>> +    }
> 
> assert(ret != 0)? That case doesn't seem to be defined so maybe an
> assertion is worthwhile.
> 

	I should test for positive size less than the header size as an error.


>> +    vfio_user_send(proxy, hdr, NULL);
>> +}
>> +
>> +void vfio_user_recv(void *opaque)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +    VFIOProxy *proxy = vbasedev->proxy;
>> +    VFIOUserReply *reply = NULL;
>> +    g_autofree int *fdp = NULL;
>> +    VFIOUserFDs reqfds = { 0, 0, fdp };
>> +    vfio_user_hdr_t msg;
>> +    struct iovec iov = {
>> +        .iov_base = &msg,
>> +        .iov_len = sizeof(msg),
>> +    };
>> +    int isreply, i, ret;
>> +    size_t msgleft, numfds = 0;
>> +    char *data = NULL;
>> +    g_autofree char *buf = NULL;
>> +    Error *local_err = NULL;
>> +
>> +    qemu_mutex_lock(&proxy->lock);
>> +    if (proxy->state == CLOSING) {
>> +        qemu_mutex_unlock(&proxy->lock);
> 
> QEMU_LOCK_GUARD() automatically unlocks mutexes when the function
> returns and is less error-prone than manual lock/unlock calls.
> 

	will look into it


>> +        return;
>> +    }
>> +
>> +    ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds,
>> +                                 &local_err);
>> +    if (ret <= 0) {
>> +        /* read error or other side closed connection */
>> +        error_setg_errno(&local_err, errno, "vfio_user_recv read error");
> 
> This will trigger an assertion failure when local_err was already set by
> qio_channel_readv_full():
> 
>  static void error_setv(Error **errp,
>                         const char *src, int line, const char *func,
>                         ErrorClass err_class, const char *fmt, va_list ap,
>                         const char *suffix)
>  {
>      Error *err;
>      int saved_errno = errno;
> 
>      if (errp == NULL) {
>          return;
>      }
>      assert(*errp == NULL);
>      ^^^^^^^^^^^^^^^^^^^^^^
> 
> I think this error_setg_errno() call should be dropped. You can use
> error_prepend() if you'd like to add more information to the error
> message from qio_channel_readv_full().
> 

	OK


>> +        goto fatal;
>> +    }
>> +
>> +    if (ret < sizeof(msg)) {
>> +        error_setg(&local_err, "vfio_user_recv short read of header");
>> +        goto err;
>> +    }
>> +
>> +    /*
>> +     * For replies, find the matching pending request
>> +     */
>> +    switch (msg.flags & VFIO_USER_TYPE) {
>> +    case VFIO_USER_REQUEST:
>> +        isreply = 0;
>> +        break;
>> +    case VFIO_USER_REPLY:
>> +        isreply = 1;
>> +        break;
>> +    default:
>> +        error_setg(&local_err, "vfio_user_recv unknown message type");
>> +        goto err;
>> +    }
>> +
>> +    if (isreply) {
>> +        QTAILQ_FOREACH(reply, &proxy->pending, next) {
>> +            if (msg.id == reply->id) {
>> +                break;
>> +            }
>> +        }
> 
> I'm surprised to see this loop since proxy->lock prevents additional
> requests from being sent while we're trying to receive a message. Can
> there really be multiple replies pending with this locking scheme?
> 

	I didn’t want to assume that was always true.  Note an email
exchange with Peter Xu where I can drop BQL in the middle of a memory
region transaction that causes dma_map/unmap messages to be sent.  The
fix to that issue will be to send the messages async, then wait for the
youngest reply when the transaction commits.



>> +        if (reply == NULL) {
>> +            error_setg(&local_err, "vfio_user_recv unexpected reply");
>> +            goto err;
>> +        }
>> +        QTAILQ_REMOVE(&proxy->pending, reply, next);
>> +
>> +        /*
>> +         * Process any received FDs
>> +         */
>> +        if (numfds != 0) {
>> +            if (reply->fds == NULL || reply->fds->recv_fds < numfds) {
>> +                error_setg(&local_err, "vfio_user_recv unexpected FDs");
>> +                goto err;
>> +            }
>> +            reply->fds->recv_fds = numfds;
>> +            memcpy(reply->fds->fds, fdp, numfds * sizeof(int));
>> +        }
>> +
>> +    } else {
>> +        /*
>> +         * The client doesn't expect any FDs in requests, but
>> +         * they will be expected on the server
>> +         */
>> +        if (numfds != 0 && (proxy->flags & VFIO_PROXY_CLIENT)) {
>> +            error_setg(&local_err, "vfio_user_recv fd in client reply");
>> +            goto err;
>> +        }
>> +        reqfds.recv_fds = numfds;
>> +    }
>> +
>> +    /*
>> +     * put the whole message into a single buffer
>> +     */
>> +    msgleft = msg.size - sizeof(msg);
> 
> msg.size has not been validated so this could underflow. Please validate
> all inputs so malicious servers/clients cannot crash or compromise the
> program.
> 

	OK

>> +    if (isreply) {
>> +        if (msg.size > reply->rsize) {
> 
> rsize is an int. Should it be uint32_t like msg.size?
> 

	OK

>> +            error_setg(&local_err,
>> +                       "vfio_user_recv reply larger than recv buffer");
>> +            goto fatal;
>> +        }
>> +        *reply->msg = msg;
>> +        data = (char *)reply->msg + sizeof(msg);
>> +    } else {
>> +        if (msg.size > max_xfer_size) {
>> +            error_setg(&local_err, "vfio_user_recv request larger than max");
>> +            goto fatal;
>> +        }
> 
> Missing check to prevent buffer overflow:
> 
>  if (msg.size < sizeof(msg)) {
>      error_setg(&local_err, "vfio_user_recv request too small");
>      goto fatal;
>  }
> 

	I will put this check up before the msgleft calculation in
the review comment above.


>> +        buf = g_malloc0(msg.size);
>> +        memcpy(buf, &msg, sizeof(msg));
>> +        data = buf + sizeof(msg);
>> +    }
>> +
>> +    if (msgleft != 0) {
>> +        ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err);
>> +        if (ret < 0) {
>> +            goto fatal;
>> +        }
>> +        if (ret != msgleft) {
>> +            error_setg(&local_err, "vfio_user_recv short read of msg body");
>> +            goto err;
>> +        }
>> +    }
>> +
>> +    /*
>> +     * Replies signal a waiter, requests get processed by vfio code
>> +     * that may assume the iothread lock is held.
>> +     */
>> +    qemu_mutex_unlock(&proxy->lock);
>> +    if (isreply) {
>> +        reply->complete = 1;
>> +        qemu_cond_signal(&reply->cv);
> 
> signal must be called with the mutex held to avoid race conditions. If
> the waiter acquires the lock and still sees complete == 0, then we
> signal before wait is entered, the signal is missed and the waiter is
> stuck.
> 

	Yes, this is a bug


>> +    } else {
>> +        qemu_mutex_lock_iothread();
>> +        /*
>> +         * make sure proxy wasn't closed while we waited
>> +         * checking without holding the proxy lock is safe
>> +         * since state is only set to CLOSING when iolock is held
> 
> s/iolock/the BQL/
> 

	OK


>> +         */
>> +        if (proxy->state != CLOSING) {
>> +            ret = proxy->request(proxy->reqarg, buf, &reqfds);
>> +            if (ret < 0 && !(msg.flags & VFIO_USER_NO_REPLY)) {
>> +                vfio_user_send_reply(proxy, buf, ret);
>> +            }
>> +        }
>> +        qemu_mutex_unlock_iothread();
>> +    }
>> +
>> +    return;
>> + fatal:
>> +    vfio_user_shutdown(proxy);
>> +    proxy->state = RECV_ERROR;
>> +
>> + err:
>> +    qemu_mutex_unlock(&proxy->lock);
>> +    for (i = 0; i < numfds; i++) {
>> +        close(fdp[i]);
>> +    }
>> +    if (reply != NULL) {
>> +        /* force an error to keep sending thread from hanging */
>> +        reply->msg->flags |= VFIO_USER_ERROR;
>> +        reply->msg->error_reply = EINVAL;
>> +        reply->complete = 1;
>> +        qemu_cond_signal(&reply->cv);
> 
> This has the race condition too.
> 

	Yes

>> +    }
>> +    error_report_err(local_err);
>> +}
>> +
>> +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
>> +                                  VFIOUserFDs *fds)
>> +{
>> +    struct iovec iov = {
>> +        .iov_base = msg,
>> +        .iov_len = msg->size,
>> +    };
>> +    size_t numfds = 0;
>> +    int msgleft, ret, *fdp = NULL;
>> +    char *buf;
>> +    Error *local_err = NULL;
>> +
>> +    if (proxy->state != CONNECTED) {
>> +        msg->flags |= VFIO_USER_ERROR;
>> +        msg->error_reply = ECONNRESET;
>> +        return;
>> +    }
>> +
>> +    if (fds != NULL && fds->send_fds != 0) {
>> +        numfds = fds->send_fds;
>> +        fdp = fds->fds;
>> +    }
>> +    ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, &local_err);
>> +    if (ret < 0) {
>> +        goto err;
>> +    }
>> +    if (ret == msg->size) {
>> +        return;
>> +    }
>> +
>> +    buf = iov.iov_base + ret;
>> +    msgleft = iov.iov_len - ret;
>> +    do {
>> +        ret = qio_channel_write(proxy->ioc, buf, msgleft, &local_err);
>> +        if (ret < 0) {
>> +            goto err;
>> +        }
>> +        buf += ret, msgleft -= ret;
> 
> Please use semicolon. Comma operators are rare, requiring readers to
> check their exact semantics. There is no need to use comma here.
> 

	OK


>> +    } while (msgleft != 0);
>> +    return;
>> +
>> + err:
>> +    error_report_err(local_err);
> 
> State remains unchanged and msg->error_reply isn't set?
> 

	They should be set.

						JJ



>> +}
>> +
>> +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
>> +                           VFIOUserFDs *fds)
>> +{
>> +    bool iolock = qemu_mutex_iothread_locked();
>> +
>> +    if (iolock) {
>> +        qemu_mutex_unlock_iothread();
>> +    }
>> +    qemu_mutex_lock(&proxy->lock);
>> +    vfio_user_send_locked(proxy, msg, fds);
>> +    qemu_mutex_unlock(&proxy->lock);
>> +    if (iolock) {
>> +        qemu_mutex_lock_iothread();
>> +    }
>> +}
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 12d69f3a45..aa4df6c418 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -1883,8 +1883,12 @@ L: qemu-s390x@nongnu.org
>> vfio-user
>> M: John G Johnson <john.g.johnson@oracle.com>
>> M: Thanos Makatos <thanos.makatos@nutanix.com>
>> +M: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> +M: Jagannathan Raman <jag.raman@oracle.com>
>> S: Supported
>> F: docs/devel/vfio-user.rst
>> +F: hw/vfio/user.c
>> +F: hw/vfio/user.h
>> 
>> vhost
>> M: Michael S. Tsirkin <mst@redhat.com>
>> diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
>> index da9af297a0..739b30be73 100644
>> --- a/hw/vfio/meson.build
>> +++ b/hw/vfio/meson.build
>> @@ -8,6 +8,7 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
>>   'display.c',
>>   'pci-quirks.c',
>>   'pci.c',
>> +  'user.c',
>> ))
>> vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c'))
>> vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c'))
>> -- 
>> 2.25.1
>>
Stefan Hajnoczi July 29, 2021, 8:06 a.m. UTC | #3
On Wed, Jul 28, 2021 at 06:08:26PM +0000, John Johnson wrote:
> 
> 
> > On Jul 27, 2021, at 9:34 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Sun, Jul 18, 2021 at 11:27:42PM -0700, Elena Ufimtseva wrote:
> >> From: John G Johnson <john.g.johnson@oracle.com>
> >> 
> >> Add user.c and user.h files for vfio-user with the basic
> >> send and receive functions.
> >> 
> >> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> >> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> >> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> >> ---
> >> hw/vfio/user.h                | 120 ++++++++++++++
> >> include/hw/vfio/vfio-common.h |   2 +
> >> hw/vfio/user.c                | 286 ++++++++++++++++++++++++++++++++++
> >> MAINTAINERS                   |   4 +
> >> hw/vfio/meson.build           |   1 +
> >> 5 files changed, 413 insertions(+)
> >> create mode 100644 hw/vfio/user.h
> >> create mode 100644 hw/vfio/user.c
> > 
> > The multi-threading, coroutine, and blocking I/O requirements of
> > vfio_user_recv() and vfio_user_send_reply() are unclear to me. Please
> > document them so it's clear what environment they can be called from. I
> > guess they are not called from coroutines and proxy->ioc is a blocking
> > IOChannel?
> > 
> 
> 	Yes to both, moreover, a block comment above vfio_user_recv() would
> be useful.  The call to setup vfio_user_recv() as the socket handler isn’t
> in this patch, do you want the series re-org’d?

That would help with review, thanks!

Stefan
diff mbox series

Patch

diff --git a/hw/vfio/user.h b/hw/vfio/user.h
new file mode 100644
index 0000000000..cdbc074579
--- /dev/null
+++ b/hw/vfio/user.h
@@ -0,0 +1,120 @@ 
+#ifndef VFIO_USER_H
+#define VFIO_USER_H
+
+/*
+ * vfio protocol over a UNIX socket.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Each message has a standard header that describes the command
+ * being sent, which is almost always a VFIO ioctl().
+ *
+ * The header may be followed by command-specfic data, such as the
+ * region and offset info for read and write commands.
+ */
+
+/* commands */
+enum vfio_user_command {
+    VFIO_USER_VERSION                   = 1,
+    VFIO_USER_DMA_MAP                   = 2,
+    VFIO_USER_DMA_UNMAP                 = 3,
+    VFIO_USER_DEVICE_GET_INFO           = 4,
+    VFIO_USER_DEVICE_GET_REGION_INFO    = 5,
+    VFIO_USER_DEVICE_GET_REGION_IO_FDS  = 6,
+    VFIO_USER_DEVICE_GET_IRQ_INFO       = 7,
+    VFIO_USER_DEVICE_SET_IRQS           = 8,
+    VFIO_USER_REGION_READ               = 9,
+    VFIO_USER_REGION_WRITE              = 10,
+    VFIO_USER_DMA_READ                  = 11,
+    VFIO_USER_DMA_WRITE                 = 12,
+    VFIO_USER_DEVICE_RESET              = 13,
+    VFIO_USER_DIRTY_PAGES               = 14,
+    VFIO_USER_MAX,
+};
+
+/* flags */
+#define VFIO_USER_REQUEST       0x0
+#define VFIO_USER_REPLY         0x1
+#define VFIO_USER_TYPE          0xF
+
+#define VFIO_USER_NO_REPLY      0x10
+#define VFIO_USER_ERROR         0x20
+
+typedef struct vfio_user_hdr {
+    uint16_t id;
+    uint16_t command;
+    uint32_t size;
+    uint32_t flags;
+    uint32_t error_reply;
+} vfio_user_hdr_t;
+
+/*
+ * VFIO_USER_VERSION
+ */
+#define VFIO_USER_MAJOR_VER     0
+#define VFIO_USER_MINOR_VER     0
+
+struct vfio_user_version {
+    vfio_user_hdr_t hdr;
+    uint16_t major;
+    uint16_t minor;
+    char capabilities[];
+};
+
+#define VFIO_USER_DEF_MAX_FDS   8
+#define VFIO_USER_MAX_MAX_FDS   16
+
+#define VFIO_USER_DEF_MAX_XFER  (1024 * 1024)
+#define VFIO_USER_MAX_MAX_XFER  (64 * 1024 * 1024)
+
+typedef struct VFIOUserFDs {
+    int send_fds;
+    int recv_fds;
+    int *fds;
+} VFIOUserFDs;
+
+typedef struct VFIOUserReply {
+    QTAILQ_ENTRY(VFIOUserReply) next;
+    vfio_user_hdr_t *msg;
+    VFIOUserFDs *fds;
+    int rsize;
+    uint32_t id;
+    QemuCond cv;
+    uint8_t complete;
+} VFIOUserReply;
+
+enum proxy_state {
+    CONNECTED = 1,
+    RECV_ERROR = 2,
+    CLOSING = 3,
+    CLOSED = 4,
+};
+
+typedef struct VFIOProxy {
+    QLIST_ENTRY(VFIOProxy) next;
+    char *sockname;
+    struct QIOChannel *ioc;
+    int (*request)(void *opaque, char *buf, VFIOUserFDs *fds);
+    void *reqarg;
+    int flags;
+    QemuCond close_cv;
+
+    /*
+     * above only changed when iolock is held
+     * below are protected by per-proxy lock
+     */
+    QemuMutex lock;
+    QTAILQ_HEAD(, VFIOUserReply) free;
+    QTAILQ_HEAD(, VFIOUserReply) pending;
+    enum proxy_state state;
+    int close_wait;
+} VFIOProxy;
+
+#define VFIO_PROXY_CLIENT       0x1
+
+void vfio_user_recv(void *opaque);
+void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret);
+#endif /* VFIO_USER_H */
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 8af11b0a76..f43dc6e5d0 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -75,6 +75,7 @@  typedef struct VFIOAddressSpace {
 } VFIOAddressSpace;
 
 struct VFIOGroup;
+typedef struct VFIOProxy VFIOProxy;
 
 typedef struct VFIOContainer {
     VFIOAddressSpace *space;
@@ -143,6 +144,7 @@  typedef struct VFIODevice {
     VFIOMigration *migration;
     Error *migration_blocker;
     OnOffAuto pre_copy_dirty_page_tracking;
+    VFIOProxy *proxy;
 } VFIODevice;
 
 struct VFIODeviceOps {
diff --git a/hw/vfio/user.c b/hw/vfio/user.c
new file mode 100644
index 0000000000..021d5540e0
--- /dev/null
+++ b/hw/vfio/user.c
@@ -0,0 +1,286 @@ 
+/*
+ * vfio protocol over a UNIX socket.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include <linux/vfio.h>
+#include <sys/ioctl.h>
+
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "hw/hw.h"
+#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio.h"
+#include "qemu/sockets.h"
+#include "io/channel.h"
+#include "io/channel-util.h"
+#include "sysemu/iothread.h"
+#include "user.h"
+
+static uint64_t max_xfer_size = VFIO_USER_DEF_MAX_XFER;
+static IOThread *vfio_user_iothread;
+static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
+                                  VFIOUserFDs *fds);
+static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
+                           VFIOUserFDs *fds);
+static void vfio_user_shutdown(VFIOProxy *proxy);
+
+static void vfio_user_shutdown(VFIOProxy *proxy)
+{
+    qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL);
+    qio_channel_set_aio_fd_handler(proxy->ioc,
+                                   iothread_get_aio_context(vfio_user_iothread),
+                                   NULL, NULL, NULL);
+}
+
+void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret)
+{
+    vfio_user_hdr_t *hdr = (vfio_user_hdr_t *)buf;
+
+    /*
+     * convert header to associated reply
+     * positive ret is reply size, negative is error code
+     */
+    hdr->flags = VFIO_USER_REPLY;
+    if (ret > 0) {
+        hdr->size = ret;
+    } else if (ret < 0) {
+        hdr->flags |= VFIO_USER_ERROR;
+        hdr->error_reply = -ret;
+        hdr->size = sizeof(*hdr);
+    }
+    vfio_user_send(proxy, hdr, NULL);
+}
+
+void vfio_user_recv(void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOProxy *proxy = vbasedev->proxy;
+    VFIOUserReply *reply = NULL;
+    g_autofree int *fdp = NULL;
+    VFIOUserFDs reqfds = { 0, 0, fdp };
+    vfio_user_hdr_t msg;
+    struct iovec iov = {
+        .iov_base = &msg,
+        .iov_len = sizeof(msg),
+    };
+    int isreply, i, ret;
+    size_t msgleft, numfds = 0;
+    char *data = NULL;
+    g_autofree char *buf = NULL;
+    Error *local_err = NULL;
+
+    qemu_mutex_lock(&proxy->lock);
+    if (proxy->state == CLOSING) {
+        qemu_mutex_unlock(&proxy->lock);
+        return;
+    }
+
+    ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds,
+                                 &local_err);
+    if (ret <= 0) {
+        /* read error or other side closed connection */
+        error_setg_errno(&local_err, errno, "vfio_user_recv read error");
+        goto fatal;
+    }
+
+    if (ret < sizeof(msg)) {
+        error_setg(&local_err, "vfio_user_recv short read of header");
+        goto err;
+    }
+
+    /*
+     * For replies, find the matching pending request
+     */
+    switch (msg.flags & VFIO_USER_TYPE) {
+    case VFIO_USER_REQUEST:
+        isreply = 0;
+        break;
+    case VFIO_USER_REPLY:
+        isreply = 1;
+        break;
+    default:
+        error_setg(&local_err, "vfio_user_recv unknown message type");
+        goto err;
+    }
+
+    if (isreply) {
+        QTAILQ_FOREACH(reply, &proxy->pending, next) {
+            if (msg.id == reply->id) {
+                break;
+            }
+        }
+        if (reply == NULL) {
+            error_setg(&local_err, "vfio_user_recv unexpected reply");
+            goto err;
+        }
+        QTAILQ_REMOVE(&proxy->pending, reply, next);
+
+        /*
+         * Process any received FDs
+         */
+        if (numfds != 0) {
+            if (reply->fds == NULL || reply->fds->recv_fds < numfds) {
+                error_setg(&local_err, "vfio_user_recv unexpected FDs");
+                goto err;
+            }
+            reply->fds->recv_fds = numfds;
+            memcpy(reply->fds->fds, fdp, numfds * sizeof(int));
+        }
+
+    } else {
+        /*
+         * The client doesn't expect any FDs in requests, but
+         * they will be expected on the server
+         */
+        if (numfds != 0 && (proxy->flags & VFIO_PROXY_CLIENT)) {
+            error_setg(&local_err, "vfio_user_recv fd in client reply");
+            goto err;
+        }
+        reqfds.recv_fds = numfds;
+    }
+
+    /*
+     * put the whole message into a single buffer
+     */
+    msgleft = msg.size - sizeof(msg);
+    if (isreply) {
+        if (msg.size > reply->rsize) {
+            error_setg(&local_err,
+                       "vfio_user_recv reply larger than recv buffer");
+            goto fatal;
+        }
+        *reply->msg = msg;
+        data = (char *)reply->msg + sizeof(msg);
+    } else {
+        if (msg.size > max_xfer_size) {
+            error_setg(&local_err, "vfio_user_recv request larger than max");
+            goto fatal;
+        }
+        buf = g_malloc0(msg.size);
+        memcpy(buf, &msg, sizeof(msg));
+        data = buf + sizeof(msg);
+    }
+
+    if (msgleft != 0) {
+        ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err);
+        if (ret < 0) {
+            goto fatal;
+        }
+        if (ret != msgleft) {
+            error_setg(&local_err, "vfio_user_recv short read of msg body");
+            goto err;
+        }
+    }
+
+    /*
+     * Replies signal a waiter, requests get processed by vfio code
+     * that may assume the iothread lock is held.
+     */
+    qemu_mutex_unlock(&proxy->lock);
+    if (isreply) {
+        reply->complete = 1;
+        qemu_cond_signal(&reply->cv);
+    } else {
+        qemu_mutex_lock_iothread();
+        /*
+         * make sure proxy wasn't closed while we waited
+         * checking without holding the proxy lock is safe
+         * since state is only set to CLOSING when iolock is held
+         */
+        if (proxy->state != CLOSING) {
+            ret = proxy->request(proxy->reqarg, buf, &reqfds);
+            if (ret < 0 && !(msg.flags & VFIO_USER_NO_REPLY)) {
+                vfio_user_send_reply(proxy, buf, ret);
+            }
+        }
+        qemu_mutex_unlock_iothread();
+    }
+
+    return;
+ fatal:
+    vfio_user_shutdown(proxy);
+    proxy->state = RECV_ERROR;
+
+ err:
+    qemu_mutex_unlock(&proxy->lock);
+    for (i = 0; i < numfds; i++) {
+        close(fdp[i]);
+    }
+    if (reply != NULL) {
+        /* force an error to keep sending thread from hanging */
+        reply->msg->flags |= VFIO_USER_ERROR;
+        reply->msg->error_reply = EINVAL;
+        reply->complete = 1;
+        qemu_cond_signal(&reply->cv);
+    }
+    error_report_err(local_err);
+}
+
+static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
+                                  VFIOUserFDs *fds)
+{
+    struct iovec iov = {
+        .iov_base = msg,
+        .iov_len = msg->size,
+    };
+    size_t numfds = 0;
+    int msgleft, ret, *fdp = NULL;
+    char *buf;
+    Error *local_err = NULL;
+
+    if (proxy->state != CONNECTED) {
+        msg->flags |= VFIO_USER_ERROR;
+        msg->error_reply = ECONNRESET;
+        return;
+    }
+
+    if (fds != NULL && fds->send_fds != 0) {
+        numfds = fds->send_fds;
+        fdp = fds->fds;
+    }
+    ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, &local_err);
+    if (ret < 0) {
+        goto err;
+    }
+    if (ret == msg->size) {
+        return;
+    }
+
+    buf = iov.iov_base + ret;
+    msgleft = iov.iov_len - ret;
+    do {
+        ret = qio_channel_write(proxy->ioc, buf, msgleft, &local_err);
+        if (ret < 0) {
+            goto err;
+        }
+        buf += ret, msgleft -= ret;
+    } while (msgleft != 0);
+    return;
+
+ err:
+    error_report_err(local_err);
+}
+
+static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
+                           VFIOUserFDs *fds)
+{
+    bool iolock = qemu_mutex_iothread_locked();
+
+    if (iolock) {
+        qemu_mutex_unlock_iothread();
+    }
+    qemu_mutex_lock(&proxy->lock);
+    vfio_user_send_locked(proxy, msg, fds);
+    qemu_mutex_unlock(&proxy->lock);
+    if (iolock) {
+        qemu_mutex_lock_iothread();
+    }
+}
diff --git a/MAINTAINERS b/MAINTAINERS
index 12d69f3a45..aa4df6c418 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1883,8 +1883,12 @@  L: qemu-s390x@nongnu.org
 vfio-user
 M: John G Johnson <john.g.johnson@oracle.com>
 M: Thanos Makatos <thanos.makatos@nutanix.com>
+M: Elena Ufimtseva <elena.ufimtseva@oracle.com>
+M: Jagannathan Raman <jag.raman@oracle.com>
 S: Supported
 F: docs/devel/vfio-user.rst
+F: hw/vfio/user.c
+F: hw/vfio/user.h
 
 vhost
 M: Michael S. Tsirkin <mst@redhat.com>
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index da9af297a0..739b30be73 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -8,6 +8,7 @@  vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
   'display.c',
   'pci-quirks.c',
   'pci.c',
+  'user.c',
 ))
 vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c'))
 vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c'))