Message ID: cd9d5d6214d957db61120d9c3cbdc99e799a3baa.1626675354.git.elena.ufimtseva@oracle.com
State: New, archived
Series: vfio-user implementation
On Sun, Jul 18, 2021 at 11:27:42PM -0700, Elena Ufimtseva wrote: > From: John G Johnson <john.g.johnson@oracle.com> > > Add user.c and user.h files for vfio-user with the basic > send and receive functions. > > Signed-off-by: John G Johnson <john.g.johnson@oracle.com> > Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com> > Signed-off-by: Jagannathan Raman <jag.raman@oracle.com> > --- > hw/vfio/user.h | 120 ++++++++++++++ > include/hw/vfio/vfio-common.h | 2 + > hw/vfio/user.c | 286 ++++++++++++++++++++++++++++++++++ > MAINTAINERS | 4 + > hw/vfio/meson.build | 1 + > 5 files changed, 413 insertions(+) > create mode 100644 hw/vfio/user.h > create mode 100644 hw/vfio/user.c The multi-threading, coroutine, and blocking I/O requirements of vfio_user_recv() and vfio_user_send_reply() are unclear to me. Please document them so it's clear what environment they can be called from. I guess they are not called from coroutines and proxy->ioc is a blocking IOChannel? > > diff --git a/hw/vfio/user.h b/hw/vfio/user.h > new file mode 100644 > index 0000000000..cdbc074579 > --- /dev/null > +++ b/hw/vfio/user.h > @@ -0,0 +1,120 @@ > +#ifndef VFIO_USER_H > +#define VFIO_USER_H > + > +/* > + * vfio protocol over a UNIX socket. > + * > + * Copyright © 2018, 2021 Oracle and/or its affiliates. > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. > + * > + * Each message has a standard header that describes the command > + * being sent, which is almost always a VFIO ioctl(). > + * > + * The header may be followed by command-specfic data, such as the > + * region and offset info for read and write commands. > + */ > + > +/* commands */ > +enum vfio_user_command { > + VFIO_USER_VERSION = 1, > + VFIO_USER_DMA_MAP = 2, > + VFIO_USER_DMA_UNMAP = 3, > + VFIO_USER_DEVICE_GET_INFO = 4, > + VFIO_USER_DEVICE_GET_REGION_INFO = 5, > + VFIO_USER_DEVICE_GET_REGION_IO_FDS = 6, > + VFIO_USER_DEVICE_GET_IRQ_INFO = 7, > + VFIO_USER_DEVICE_SET_IRQS = 8, > + VFIO_USER_REGION_READ = 9, > + VFIO_USER_REGION_WRITE = 10, > + VFIO_USER_DMA_READ = 11, > + VFIO_USER_DMA_WRITE = 12, > + VFIO_USER_DEVICE_RESET = 13, > + VFIO_USER_DIRTY_PAGES = 14, > + VFIO_USER_MAX, > +}; > + > +/* flags */ > +#define VFIO_USER_REQUEST 0x0 > +#define VFIO_USER_REPLY 0x1 > +#define VFIO_USER_TYPE 0xF > + > +#define VFIO_USER_NO_REPLY 0x10 > +#define VFIO_USER_ERROR 0x20 > + > +typedef struct vfio_user_hdr { > + uint16_t id; > + uint16_t command; > + uint32_t size; > + uint32_t flags; > + uint32_t error_reply; > +} vfio_user_hdr_t; Please use QEMU coding style in QEMU code (i.e. not shared with Linux or external libraries): typedef struct { ... } VfioUserHdr; You can also specify the struct VfioUserHdr tag if you want but it's only necessary to reference the struct before the end of the typedef definition. https://qemu-project.gitlab.io/qemu/devel/style.html > + > +/* > + * VFIO_USER_VERSION > + */ > +#define VFIO_USER_MAJOR_VER 0 > +#define VFIO_USER_MINOR_VER 0 > + > +struct vfio_user_version { > + vfio_user_hdr_t hdr; > + uint16_t major; > + uint16_t minor; > + char capabilities[]; > +}; > + > +#define VFIO_USER_DEF_MAX_FDS 8 > +#define VFIO_USER_MAX_MAX_FDS 16 > + > +#define VFIO_USER_DEF_MAX_XFER (1024 * 1024) > +#define VFIO_USER_MAX_MAX_XFER (64 * 1024 * 1024) > + > +typedef struct VFIOUserFDs { > + int send_fds; > + int recv_fds; > + int *fds; > +} VFIOUserFDs; I think around here we switch from vfio-user spec definitions to QEMU implementation details. 
It might be nice to keep the vfio-user spec definitions in a separate header file so the boundary is clear. > + > +typedef struct VFIOUserReply { > + QTAILQ_ENTRY(VFIOUserReply) next; > + vfio_user_hdr_t *msg; > + VFIOUserFDs *fds; > + int rsize; > + uint32_t id; > + QemuCond cv; > + uint8_t complete; Please use bool. > +} VFIOUserReply; > + > +enum proxy_state { > + CONNECTED = 1, > + RECV_ERROR = 2, > + CLOSING = 3, > + CLOSED = 4, > +}; These enum values probably need a prefix (VFIO_PROXY_*). Generic short names like CONNECTED, CLOSED, etc could lead to namespace collisions. Enum constants are in the global namespace. > + > +typedef struct VFIOProxy { > + QLIST_ENTRY(VFIOProxy) next; > + char *sockname; > + struct QIOChannel *ioc; > + int (*request)(void *opaque, char *buf, VFIOUserFDs *fds); > + void *reqarg; > + int flags; > + QemuCond close_cv; > + > + /* > + * above only changed when iolock is held Please use "BQL" instead of "iolock". git grep shows many results for BQL and the only result for iolock is in mpqemu code. > + * below are protected by per-proxy lock > + */ > + QemuMutex lock; > + QTAILQ_HEAD(, VFIOUserReply) free; > + QTAILQ_HEAD(, VFIOUserReply) pending; > + enum proxy_state state; > + int close_wait; Is this a bool? Please use bool. > +} VFIOProxy; > + > +#define VFIO_PROXY_CLIENT 0x1 A comment that shows which field VFIO_PROXY_CLIENT relates would make this clearer: /* VFIOProxy->flags */ #define VFIO_PROXY_CLIENT 0x1 > + > +void vfio_user_recv(void *opaque); > +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret); > +#endif /* VFIO_USER_H */ > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index 8af11b0a76..f43dc6e5d0 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -75,6 +75,7 @@ typedef struct VFIOAddressSpace { > } VFIOAddressSpace; > > struct VFIOGroup; > +typedef struct VFIOProxy VFIOProxy; > > typedef struct VFIOContainer { > VFIOAddressSpace *space; > @@ -143,6 +144,7 @@ typedef struct VFIODevice { > VFIOMigration *migration; > Error *migration_blocker; > OnOffAuto pre_copy_dirty_page_tracking; > + VFIOProxy *proxy; > } VFIODevice; > > struct VFIODeviceOps { > diff --git a/hw/vfio/user.c b/hw/vfio/user.c > new file mode 100644 > index 0000000000..021d5540e0 > --- /dev/null > +++ b/hw/vfio/user.c > @@ -0,0 +1,286 @@ > +/* > + * vfio protocol over a UNIX socket. > + * > + * Copyright © 2018, 2021 Oracle and/or its affiliates. > + * > + * This work is licensed under the terms of the GNU GPL, version 2 or later. > + * See the COPYING file in the top-level directory. 
> + * > + */ > + > +#include "qemu/osdep.h" > +#include <linux/vfio.h> > +#include <sys/ioctl.h> > + > +#include "qemu/error-report.h" > +#include "qapi/error.h" > +#include "qemu/main-loop.h" > +#include "hw/hw.h" > +#include "hw/vfio/vfio-common.h" > +#include "hw/vfio/vfio.h" > +#include "qemu/sockets.h" > +#include "io/channel.h" > +#include "io/channel-util.h" > +#include "sysemu/iothread.h" > +#include "user.h" > + > +static uint64_t max_xfer_size = VFIO_USER_DEF_MAX_XFER; > +static IOThread *vfio_user_iothread; > +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg, > + VFIOUserFDs *fds); > +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg, > + VFIOUserFDs *fds); > +static void vfio_user_shutdown(VFIOProxy *proxy); > + > +static void vfio_user_shutdown(VFIOProxy *proxy) > +{ > + qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL); > + qio_channel_set_aio_fd_handler(proxy->ioc, > + iothread_get_aio_context(vfio_user_iothread), > + NULL, NULL, NULL); There is no other qio_channel_set_aio_fd_handler() call in this patch. Why is this one necessary? > +} > + > +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret) > +{ > + vfio_user_hdr_t *hdr = (vfio_user_hdr_t *)buf; > + > + /* > + * convert header to associated reply > + * positive ret is reply size, negative is error code > + */ > + hdr->flags = VFIO_USER_REPLY; > + if (ret > 0) { > + hdr->size = ret; > + } else if (ret < 0) { > + hdr->flags |= VFIO_USER_ERROR; > + hdr->error_reply = -ret; > + hdr->size = sizeof(*hdr); > + } assert(ret != 0)? That case doesn't seem to be defined so maybe an assertion is worthwhile. > + vfio_user_send(proxy, hdr, NULL); > +} > + > +void vfio_user_recv(void *opaque) > +{ > + VFIODevice *vbasedev = opaque; > + VFIOProxy *proxy = vbasedev->proxy; > + VFIOUserReply *reply = NULL; > + g_autofree int *fdp = NULL; > + VFIOUserFDs reqfds = { 0, 0, fdp }; > + vfio_user_hdr_t msg; > + struct iovec iov = { > + .iov_base = &msg, > + .iov_len = sizeof(msg), > + }; > + int isreply, i, ret; > + size_t msgleft, numfds = 0; > + char *data = NULL; > + g_autofree char *buf = NULL; > + Error *local_err = NULL; > + > + qemu_mutex_lock(&proxy->lock); > + if (proxy->state == CLOSING) { > + qemu_mutex_unlock(&proxy->lock); QEMU_LOCK_GUARD() automatically unlocks mutexes when the function returns and is less error-prone than manual lock/unlock calls. > + return; > + } > + > + ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds, > + &local_err); > + if (ret <= 0) { > + /* read error or other side closed connection */ > + error_setg_errno(&local_err, errno, "vfio_user_recv read error"); This will trigger an assertion failure when local_err was already set by qio_channel_readv_full(): static void error_setv(Error **errp, const char *src, int line, const char *func, ErrorClass err_class, const char *fmt, va_list ap, const char *suffix) { Error *err; int saved_errno = errno; if (errp == NULL) { return; } assert(*errp == NULL); ^^^^^^^^^^^^^^^^^^^^^^ I think this error_setg_errno() call should be dropped. You can use error_prepend() if you'd like to add more information to the error message from qio_channel_readv_full(). 
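A minimal sketch of that approach, assuming qio_channel_readv_full() leaves local_err unset on EOF (ret == 0) and fills it in on failure (ret < 0), could look like this; it is only an illustration of the error_prepend() pattern, not the code in the series:

    ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds,
                                 &local_err);
    if (ret == 0) {
        /* EOF: the other side closed the connection, no Error has been set */
        error_setg(&local_err, "vfio_user_recv: remote closed connection");
        goto fatal;
    }
    if (ret < 0) {
        /* local_err was already filled in by qio_channel_readv_full() */
        error_prepend(&local_err, "vfio_user_recv: ");
        goto fatal;
    }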
> + goto fatal; > + } > + > + if (ret < sizeof(msg)) { > + error_setg(&local_err, "vfio_user_recv short read of header"); > + goto err; > + } > + > + /* > + * For replies, find the matching pending request > + */ > + switch (msg.flags & VFIO_USER_TYPE) { > + case VFIO_USER_REQUEST: > + isreply = 0; > + break; > + case VFIO_USER_REPLY: > + isreply = 1; > + break; > + default: > + error_setg(&local_err, "vfio_user_recv unknown message type"); > + goto err; > + } > + > + if (isreply) { > + QTAILQ_FOREACH(reply, &proxy->pending, next) { > + if (msg.id == reply->id) { > + break; > + } > + } I'm surprised to see this loop since proxy->lock prevents additional requests from being sent while we're trying to receive a message. Can there really be multiple replies pending with this locking scheme? > + if (reply == NULL) { > + error_setg(&local_err, "vfio_user_recv unexpected reply"); > + goto err; > + } > + QTAILQ_REMOVE(&proxy->pending, reply, next); > + > + /* > + * Process any received FDs > + */ > + if (numfds != 0) { > + if (reply->fds == NULL || reply->fds->recv_fds < numfds) { > + error_setg(&local_err, "vfio_user_recv unexpected FDs"); > + goto err; > + } > + reply->fds->recv_fds = numfds; > + memcpy(reply->fds->fds, fdp, numfds * sizeof(int)); > + } > + > + } else { > + /* > + * The client doesn't expect any FDs in requests, but > + * they will be expected on the server > + */ > + if (numfds != 0 && (proxy->flags & VFIO_PROXY_CLIENT)) { > + error_setg(&local_err, "vfio_user_recv fd in client reply"); > + goto err; > + } > + reqfds.recv_fds = numfds; > + } > + > + /* > + * put the whole message into a single buffer > + */ > + msgleft = msg.size - sizeof(msg); msg.size has not been validated so this could underflow. Please validate all inputs so malicious servers/clients cannot crash or compromise the program. > + if (isreply) { > + if (msg.size > reply->rsize) { rsize is an int. Should it be uint32_t like msg.size? > + error_setg(&local_err, > + "vfio_user_recv reply larger than recv buffer"); > + goto fatal; > + } > + *reply->msg = msg; > + data = (char *)reply->msg + sizeof(msg); > + } else { > + if (msg.size > max_xfer_size) { > + error_setg(&local_err, "vfio_user_recv request larger than max"); > + goto fatal; > + } Missing check to prevent buffer overflow: if (msg.size < sizeof(msg)) { error_setg(&local_err, "vfio_user_recv request too small"); goto fatal; } > + buf = g_malloc0(msg.size); > + memcpy(buf, &msg, sizeof(msg)); > + data = buf + sizeof(msg); > + } > + > + if (msgleft != 0) { > + ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err); > + if (ret < 0) { > + goto fatal; > + } > + if (ret != msgleft) { > + error_setg(&local_err, "vfio_user_recv short read of msg body"); > + goto err; > + } > + } > + > + /* > + * Replies signal a waiter, requests get processed by vfio code > + * that may assume the iothread lock is held. > + */ > + qemu_mutex_unlock(&proxy->lock); > + if (isreply) { > + reply->complete = 1; > + qemu_cond_signal(&reply->cv); signal must be called with the mutex held to avoid race conditions. If the waiter acquires the lock and still sees complete == 0, then we signal before wait is entered, the signal is missed and the waiter is stuck. 
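A minimal sketch of the handoff with the mutex held across the signal (illustrative only; it assumes complete becomes a bool as suggested earlier):

    /* receiver side: mark the reply done and wake the waiter under the lock */
    qemu_mutex_lock(&proxy->lock);
    reply->complete = true;
    qemu_cond_signal(&reply->cv);
    qemu_mutex_unlock(&proxy->lock);

    /* waiter side pairs with it */
    qemu_mutex_lock(&proxy->lock);
    while (!reply->complete) {
        qemu_cond_wait(&reply->cv, &proxy->lock);
    }
    qemu_mutex_unlock(&proxy->lock);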
> + } else { > + qemu_mutex_lock_iothread(); > + /* > + * make sure proxy wasn't closed while we waited > + * checking without holding the proxy lock is safe > + * since state is only set to CLOSING when iolock is held s/iolock/the BQL/ > + */ > + if (proxy->state != CLOSING) { > + ret = proxy->request(proxy->reqarg, buf, &reqfds); > + if (ret < 0 && !(msg.flags & VFIO_USER_NO_REPLY)) { > + vfio_user_send_reply(proxy, buf, ret); > + } > + } > + qemu_mutex_unlock_iothread(); > + } > + > + return; > + fatal: > + vfio_user_shutdown(proxy); > + proxy->state = RECV_ERROR; > + > + err: > + qemu_mutex_unlock(&proxy->lock); > + for (i = 0; i < numfds; i++) { > + close(fdp[i]); > + } > + if (reply != NULL) { > + /* force an error to keep sending thread from hanging */ > + reply->msg->flags |= VFIO_USER_ERROR; > + reply->msg->error_reply = EINVAL; > + reply->complete = 1; > + qemu_cond_signal(&reply->cv); This has the race condition too. > + } > + error_report_err(local_err); > +} > + > +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg, > + VFIOUserFDs *fds) > +{ > + struct iovec iov = { > + .iov_base = msg, > + .iov_len = msg->size, > + }; > + size_t numfds = 0; > + int msgleft, ret, *fdp = NULL; > + char *buf; > + Error *local_err = NULL; > + > + if (proxy->state != CONNECTED) { > + msg->flags |= VFIO_USER_ERROR; > + msg->error_reply = ECONNRESET; > + return; > + } > + > + if (fds != NULL && fds->send_fds != 0) { > + numfds = fds->send_fds; > + fdp = fds->fds; > + } > + ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, &local_err); > + if (ret < 0) { > + goto err; > + } > + if (ret == msg->size) { > + return; > + } > + > + buf = iov.iov_base + ret; > + msgleft = iov.iov_len - ret; > + do { > + ret = qio_channel_write(proxy->ioc, buf, msgleft, &local_err); > + if (ret < 0) { > + goto err; > + } > + buf += ret, msgleft -= ret; Please use semicolon. Comma operators are rare, requiring readers to check their exact semantics. There is no need to use comma here. > + } while (msgleft != 0); > + return; > + > + err: > + error_report_err(local_err); State remains unchanged and msg->error_reply isn't set? > +} > + > +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg, > + VFIOUserFDs *fds) > +{ > + bool iolock = qemu_mutex_iothread_locked(); > + > + if (iolock) { > + qemu_mutex_unlock_iothread(); > + } > + qemu_mutex_lock(&proxy->lock); > + vfio_user_send_locked(proxy, msg, fds); > + qemu_mutex_unlock(&proxy->lock); > + if (iolock) { > + qemu_mutex_lock_iothread(); > + } > +} > diff --git a/MAINTAINERS b/MAINTAINERS > index 12d69f3a45..aa4df6c418 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -1883,8 +1883,12 @@ L: qemu-s390x@nongnu.org > vfio-user > M: John G Johnson <john.g.johnson@oracle.com> > M: Thanos Makatos <thanos.makatos@nutanix.com> > +M: Elena Ufimtseva <elena.ufimtseva@oracle.com> > +M: Jagannathan Raman <jag.raman@oracle.com> > S: Supported > F: docs/devel/vfio-user.rst > +F: hw/vfio/user.c > +F: hw/vfio/user.h > > vhost > M: Michael S. Tsirkin <mst@redhat.com> > diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build > index da9af297a0..739b30be73 100644 > --- a/hw/vfio/meson.build > +++ b/hw/vfio/meson.build > @@ -8,6 +8,7 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( > 'display.c', > 'pci-quirks.c', > 'pci.c', > + 'user.c', > )) > vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c')) > vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c')) > -- > 2.25.1 >
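For reference, the QEMU_LOCK_GUARD() pattern mentioned above looks roughly like the sketch below. The names are illustrative, and since vfio_user_recv() drops the lock before dispatching requests, the locked section would likely need to move into a scoped block or a helper such as this hypothetical one:

    static void vfio_user_recv_locked(VFIOProxy *proxy)  /* hypothetical helper */
    {
        QEMU_LOCK_GUARD(&proxy->lock);   /* unlocked automatically on every return */

        if (proxy->state == VFIO_PROXY_CLOSING) {
            return;                      /* no manual qemu_mutex_unlock() needed */
        }
        /* ... read and validate the incoming message under the lock ... */
    }

WITH_QEMU_LOCK_GUARD(&proxy->lock) { ... } is the scoped variant when only part of a function needs the lock.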
> On Jul 27, 2021, at 9:34 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote: > > On Sun, Jul 18, 2021 at 11:27:42PM -0700, Elena Ufimtseva wrote: >> From: John G Johnson <john.g.johnson@oracle.com> >> >> Add user.c and user.h files for vfio-user with the basic >> send and receive functions. >> >> Signed-off-by: John G Johnson <john.g.johnson@oracle.com> >> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com> >> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com> >> --- >> hw/vfio/user.h | 120 ++++++++++++++ >> include/hw/vfio/vfio-common.h | 2 + >> hw/vfio/user.c | 286 ++++++++++++++++++++++++++++++++++ >> MAINTAINERS | 4 + >> hw/vfio/meson.build | 1 + >> 5 files changed, 413 insertions(+) >> create mode 100644 hw/vfio/user.h >> create mode 100644 hw/vfio/user.c > > The multi-threading, coroutine, and blocking I/O requirements of > vfio_user_recv() and vfio_user_send_reply() are unclear to me. Please > document them so it's clear what environment they can be called from. I > guess they are not called from coroutines and proxy->ioc is a blocking > IOChannel? > Yes to both, moreover, a block comment above vfio_user_recv() would be useful. The call to setup vfio_user_recv() as the socket handler isn’t in this patch, do you want the series re-org’d? >> >> diff --git a/hw/vfio/user.h b/hw/vfio/user.h >> new file mode 100644 >> index 0000000000..cdbc074579 >> --- /dev/null >> +++ b/hw/vfio/user.h >> @@ -0,0 +1,120 @@ >> +#ifndef VFIO_USER_H >> +#define VFIO_USER_H >> + >> +/* >> + * vfio protocol over a UNIX socket. >> + * >> + * Copyright © 2018, 2021 Oracle and/or its affiliates. >> + * >> + * This work is licensed under the terms of the GNU GPL, version 2. See >> + * the COPYING file in the top-level directory. >> + * >> + * Each message has a standard header that describes the command >> + * being sent, which is almost always a VFIO ioctl(). >> + * >> + * The header may be followed by command-specfic data, such as the >> + * region and offset info for read and write commands. >> + */ >> + >> +/* commands */ >> +enum vfio_user_command { >> + VFIO_USER_VERSION = 1, >> + VFIO_USER_DMA_MAP = 2, >> + VFIO_USER_DMA_UNMAP = 3, >> + VFIO_USER_DEVICE_GET_INFO = 4, >> + VFIO_USER_DEVICE_GET_REGION_INFO = 5, >> + VFIO_USER_DEVICE_GET_REGION_IO_FDS = 6, >> + VFIO_USER_DEVICE_GET_IRQ_INFO = 7, >> + VFIO_USER_DEVICE_SET_IRQS = 8, >> + VFIO_USER_REGION_READ = 9, >> + VFIO_USER_REGION_WRITE = 10, >> + VFIO_USER_DMA_READ = 11, >> + VFIO_USER_DMA_WRITE = 12, >> + VFIO_USER_DEVICE_RESET = 13, >> + VFIO_USER_DIRTY_PAGES = 14, >> + VFIO_USER_MAX, >> +}; >> + >> +/* flags */ >> +#define VFIO_USER_REQUEST 0x0 >> +#define VFIO_USER_REPLY 0x1 >> +#define VFIO_USER_TYPE 0xF >> + >> +#define VFIO_USER_NO_REPLY 0x10 >> +#define VFIO_USER_ERROR 0x20 >> + >> +typedef struct vfio_user_hdr { >> + uint16_t id; >> + uint16_t command; >> + uint32_t size; >> + uint32_t flags; >> + uint32_t error_reply; >> +} vfio_user_hdr_t; > > Please use QEMU coding style in QEMU code (i.e. not shared with Linux or > external libraries): > > typedef struct { > ... > } VfioUserHdr; > > You can also specify the struct VfioUserHdr tag if you want but it's > only necessary to reference the struct before the end of the typedef > definition. 
> > https://qemu-project.gitlab.io/qemu/devel/style.html > OK >> + >> +/* >> + * VFIO_USER_VERSION >> + */ >> +#define VFIO_USER_MAJOR_VER 0 >> +#define VFIO_USER_MINOR_VER 0 >> + >> +struct vfio_user_version { >> + vfio_user_hdr_t hdr; >> + uint16_t major; >> + uint16_t minor; >> + char capabilities[]; >> +}; >> + >> +#define VFIO_USER_DEF_MAX_FDS 8 >> +#define VFIO_USER_MAX_MAX_FDS 16 >> + >> +#define VFIO_USER_DEF_MAX_XFER (1024 * 1024) >> +#define VFIO_USER_MAX_MAX_XFER (64 * 1024 * 1024) >> + >> +typedef struct VFIOUserFDs { >> + int send_fds; >> + int recv_fds; >> + int *fds; >> +} VFIOUserFDs; > > I think around here we switch from vfio-user spec definitions to QEMU > implementation details. It might be nice to keep the vfio-user spec > definitions in a separate header file so the boundary is clear. > OK >> + >> +typedef struct VFIOUserReply { >> + QTAILQ_ENTRY(VFIOUserReply) next; >> + vfio_user_hdr_t *msg; >> + VFIOUserFDs *fds; >> + int rsize; >> + uint32_t id; >> + QemuCond cv; >> + uint8_t complete; > > Please use bool. > OK >> +} VFIOUserReply; >> + >> +enum proxy_state { >> + CONNECTED = 1, >> + RECV_ERROR = 2, >> + CLOSING = 3, >> + CLOSED = 4, >> +}; > > These enum values probably need a prefix (VFIO_PROXY_*). Generic short > names like CONNECTED, CLOSED, etc could lead to namespace collisions. > Enum constants are in the global namespace. > OK >> + >> +typedef struct VFIOProxy { >> + QLIST_ENTRY(VFIOProxy) next; >> + char *sockname; >> + struct QIOChannel *ioc; >> + int (*request)(void *opaque, char *buf, VFIOUserFDs *fds); >> + void *reqarg; >> + int flags; >> + QemuCond close_cv; >> + >> + /* >> + * above only changed when iolock is held > > Please use "BQL" instead of "iolock". git grep shows many results for > BQL and the only result for iolock is in mpqemu code. > OK >> + * below are protected by per-proxy lock >> + */ >> + QemuMutex lock; >> + QTAILQ_HEAD(, VFIOUserReply) free; >> + QTAILQ_HEAD(, VFIOUserReply) pending; >> + enum proxy_state state; >> + int close_wait; > > Is this a bool? Please use bool. yes it’s a bool > >> +} VFIOProxy; >> + >> +#define VFIO_PROXY_CLIENT 0x1 > > A comment that shows which field VFIO_PROXY_CLIENT relates would make this clearer: > > /* VFIOProxy->flags */ > #define VFIO_PROXY_CLIENT 0x1 > OK >> + >> +void vfio_user_recv(void *opaque); >> +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret); >> +#endif /* VFIO_USER_H */ >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >> index 8af11b0a76..f43dc6e5d0 100644 >> --- a/include/hw/vfio/vfio-common.h >> +++ b/include/hw/vfio/vfio-common.h >> @@ -75,6 +75,7 @@ typedef struct VFIOAddressSpace { >> } VFIOAddressSpace; >> >> struct VFIOGroup; >> +typedef struct VFIOProxy VFIOProxy; >> >> typedef struct VFIOContainer { >> VFIOAddressSpace *space; >> @@ -143,6 +144,7 @@ typedef struct VFIODevice { >> VFIOMigration *migration; >> Error *migration_blocker; >> OnOffAuto pre_copy_dirty_page_tracking; >> + VFIOProxy *proxy; >> } VFIODevice; >> >> struct VFIODeviceOps { >> diff --git a/hw/vfio/user.c b/hw/vfio/user.c >> new file mode 100644 >> index 0000000000..021d5540e0 >> --- /dev/null >> +++ b/hw/vfio/user.c >> @@ -0,0 +1,286 @@ >> +/* >> + * vfio protocol over a UNIX socket. >> + * >> + * Copyright © 2018, 2021 Oracle and/or its affiliates. >> + * >> + * This work is licensed under the terms of the GNU GPL, version 2 or later. >> + * See the COPYING file in the top-level directory. 
>> + * >> + */ >> + >> +#include "qemu/osdep.h" >> +#include <linux/vfio.h> >> +#include <sys/ioctl.h> >> + >> +#include "qemu/error-report.h" >> +#include "qapi/error.h" >> +#include "qemu/main-loop.h" >> +#include "hw/hw.h" >> +#include "hw/vfio/vfio-common.h" >> +#include "hw/vfio/vfio.h" >> +#include "qemu/sockets.h" >> +#include "io/channel.h" >> +#include "io/channel-util.h" >> +#include "sysemu/iothread.h" >> +#include "user.h" >> + >> +static uint64_t max_xfer_size = VFIO_USER_DEF_MAX_XFER; >> +static IOThread *vfio_user_iothread; >> +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg, >> + VFIOUserFDs *fds); >> +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg, >> + VFIOUserFDs *fds); >> +static void vfio_user_shutdown(VFIOProxy *proxy); >> + >> +static void vfio_user_shutdown(VFIOProxy *proxy) >> +{ >> + qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL); >> + qio_channel_set_aio_fd_handler(proxy->ioc, >> + iothread_get_aio_context(vfio_user_iothread), >> + NULL, NULL, NULL); > > There is no other qio_channel_set_aio_fd_handler() call in this patch. > Why is this one necessary? > See first comment. >> +} >> + >> +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret) >> +{ >> + vfio_user_hdr_t *hdr = (vfio_user_hdr_t *)buf; >> + >> + /* >> + * convert header to associated reply >> + * positive ret is reply size, negative is error code >> + */ >> + hdr->flags = VFIO_USER_REPLY; >> + if (ret > 0) { >> + hdr->size = ret; >> + } else if (ret < 0) { >> + hdr->flags |= VFIO_USER_ERROR; >> + hdr->error_reply = -ret; >> + hdr->size = sizeof(*hdr); >> + } > > assert(ret != 0)? That case doesn't seem to be defined so maybe an > assertion is worthwhile. > I should test for positive size less than the header size as an error. >> + vfio_user_send(proxy, hdr, NULL); >> +} >> + >> +void vfio_user_recv(void *opaque) >> +{ >> + VFIODevice *vbasedev = opaque; >> + VFIOProxy *proxy = vbasedev->proxy; >> + VFIOUserReply *reply = NULL; >> + g_autofree int *fdp = NULL; >> + VFIOUserFDs reqfds = { 0, 0, fdp }; >> + vfio_user_hdr_t msg; >> + struct iovec iov = { >> + .iov_base = &msg, >> + .iov_len = sizeof(msg), >> + }; >> + int isreply, i, ret; >> + size_t msgleft, numfds = 0; >> + char *data = NULL; >> + g_autofree char *buf = NULL; >> + Error *local_err = NULL; >> + >> + qemu_mutex_lock(&proxy->lock); >> + if (proxy->state == CLOSING) { >> + qemu_mutex_unlock(&proxy->lock); > > QEMU_LOCK_GUARD() automatically unlocks mutexes when the function > returns and is less error-prone than manual lock/unlock calls. > will look into it >> + return; >> + } >> + >> + ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds, >> + &local_err); >> + if (ret <= 0) { >> + /* read error or other side closed connection */ >> + error_setg_errno(&local_err, errno, "vfio_user_recv read error"); > > This will trigger an assertion failure when local_err was already set by > qio_channel_readv_full(): > > static void error_setv(Error **errp, > const char *src, int line, const char *func, > ErrorClass err_class, const char *fmt, va_list ap, > const char *suffix) > { > Error *err; > int saved_errno = errno; > > if (errp == NULL) { > return; > } > assert(*errp == NULL); > ^^^^^^^^^^^^^^^^^^^^^^ > > I think this error_setg_errno() call should be dropped. You can use > error_prepend() if you'd like to add more information to the error > message from qio_channel_readv_full(). 
> OK >> + goto fatal; >> + } >> + >> + if (ret < sizeof(msg)) { >> + error_setg(&local_err, "vfio_user_recv short read of header"); >> + goto err; >> + } >> + >> + /* >> + * For replies, find the matching pending request >> + */ >> + switch (msg.flags & VFIO_USER_TYPE) { >> + case VFIO_USER_REQUEST: >> + isreply = 0; >> + break; >> + case VFIO_USER_REPLY: >> + isreply = 1; >> + break; >> + default: >> + error_setg(&local_err, "vfio_user_recv unknown message type"); >> + goto err; >> + } >> + >> + if (isreply) { >> + QTAILQ_FOREACH(reply, &proxy->pending, next) { >> + if (msg.id == reply->id) { >> + break; >> + } >> + } > > I'm surprised to see this loop since proxy->lock prevents additional > requests from being sent while we're trying to receive a message. Can > there really be multiple replies pending with this locking scheme? > I didn’t want to assume that was always true. Note an email exchange with Peter Xu where I can drop BQL in the middle of a memory region transaction that causes dma_map/unmap messages to be sent. The fix to that issue will be to send the messages async, then wait for the youngest reply when the transaction commits. >> + if (reply == NULL) { >> + error_setg(&local_err, "vfio_user_recv unexpected reply"); >> + goto err; >> + } >> + QTAILQ_REMOVE(&proxy->pending, reply, next); >> + >> + /* >> + * Process any received FDs >> + */ >> + if (numfds != 0) { >> + if (reply->fds == NULL || reply->fds->recv_fds < numfds) { >> + error_setg(&local_err, "vfio_user_recv unexpected FDs"); >> + goto err; >> + } >> + reply->fds->recv_fds = numfds; >> + memcpy(reply->fds->fds, fdp, numfds * sizeof(int)); >> + } >> + >> + } else { >> + /* >> + * The client doesn't expect any FDs in requests, but >> + * they will be expected on the server >> + */ >> + if (numfds != 0 && (proxy->flags & VFIO_PROXY_CLIENT)) { >> + error_setg(&local_err, "vfio_user_recv fd in client reply"); >> + goto err; >> + } >> + reqfds.recv_fds = numfds; >> + } >> + >> + /* >> + * put the whole message into a single buffer >> + */ >> + msgleft = msg.size - sizeof(msg); > > msg.size has not been validated so this could underflow. Please validate > all inputs so malicious servers/clients cannot crash or compromise the > program. > OK >> + if (isreply) { >> + if (msg.size > reply->rsize) { > > rsize is an int. Should it be uint32_t like msg.size? > OK >> + error_setg(&local_err, >> + "vfio_user_recv reply larger than recv buffer"); >> + goto fatal; >> + } >> + *reply->msg = msg; >> + data = (char *)reply->msg + sizeof(msg); >> + } else { >> + if (msg.size > max_xfer_size) { >> + error_setg(&local_err, "vfio_user_recv request larger than max"); >> + goto fatal; >> + } > > Missing check to prevent buffer overflow: > > if (msg.size < sizeof(msg)) { > error_setg(&local_err, "vfio_user_recv request too small"); > goto fatal; > } > I will put this check up before the msgleft calculation in the review comment above. >> + buf = g_malloc0(msg.size); >> + memcpy(buf, &msg, sizeof(msg)); >> + data = buf + sizeof(msg); >> + } >> + >> + if (msgleft != 0) { >> + ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err); >> + if (ret < 0) { >> + goto fatal; >> + } >> + if (ret != msgleft) { >> + error_setg(&local_err, "vfio_user_recv short read of msg body"); >> + goto err; >> + } >> + } >> + >> + /* >> + * Replies signal a waiter, requests get processed by vfio code >> + * that may assume the iothread lock is held. 
>> + */ >> + qemu_mutex_unlock(&proxy->lock); >> + if (isreply) { >> + reply->complete = 1; >> + qemu_cond_signal(&reply->cv); > > signal must be called with the mutex held to avoid race conditions. If > the waiter acquires the lock and still sees complete == 0, then we > signal before wait is entered, the signal is missed and the waiter is > stuck. > Yes, this is a bug >> + } else { >> + qemu_mutex_lock_iothread(); >> + /* >> + * make sure proxy wasn't closed while we waited >> + * checking without holding the proxy lock is safe >> + * since state is only set to CLOSING when iolock is held > > s/iolock/the BQL/ > OK >> + */ >> + if (proxy->state != CLOSING) { >> + ret = proxy->request(proxy->reqarg, buf, &reqfds); >> + if (ret < 0 && !(msg.flags & VFIO_USER_NO_REPLY)) { >> + vfio_user_send_reply(proxy, buf, ret); >> + } >> + } >> + qemu_mutex_unlock_iothread(); >> + } >> + >> + return; >> + fatal: >> + vfio_user_shutdown(proxy); >> + proxy->state = RECV_ERROR; >> + >> + err: >> + qemu_mutex_unlock(&proxy->lock); >> + for (i = 0; i < numfds; i++) { >> + close(fdp[i]); >> + } >> + if (reply != NULL) { >> + /* force an error to keep sending thread from hanging */ >> + reply->msg->flags |= VFIO_USER_ERROR; >> + reply->msg->error_reply = EINVAL; >> + reply->complete = 1; >> + qemu_cond_signal(&reply->cv); > > This has the race condition too. > Yes >> + } >> + error_report_err(local_err); >> +} >> + >> +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg, >> + VFIOUserFDs *fds) >> +{ >> + struct iovec iov = { >> + .iov_base = msg, >> + .iov_len = msg->size, >> + }; >> + size_t numfds = 0; >> + int msgleft, ret, *fdp = NULL; >> + char *buf; >> + Error *local_err = NULL; >> + >> + if (proxy->state != CONNECTED) { >> + msg->flags |= VFIO_USER_ERROR; >> + msg->error_reply = ECONNRESET; >> + return; >> + } >> + >> + if (fds != NULL && fds->send_fds != 0) { >> + numfds = fds->send_fds; >> + fdp = fds->fds; >> + } >> + ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, &local_err); >> + if (ret < 0) { >> + goto err; >> + } >> + if (ret == msg->size) { >> + return; >> + } >> + >> + buf = iov.iov_base + ret; >> + msgleft = iov.iov_len - ret; >> + do { >> + ret = qio_channel_write(proxy->ioc, buf, msgleft, &local_err); >> + if (ret < 0) { >> + goto err; >> + } >> + buf += ret, msgleft -= ret; > > Please use semicolon. Comma operators are rare, requiring readers to > check their exact semantics. There is no need to use comma here. > OK >> + } while (msgleft != 0); >> + return; >> + >> + err: >> + error_report_err(local_err); > > State remains unchanged and msg->error_reply isn't set? > They should be set. 
JJ >> +} >> + >> +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg, >> + VFIOUserFDs *fds) >> +{ >> + bool iolock = qemu_mutex_iothread_locked(); >> + >> + if (iolock) { >> + qemu_mutex_unlock_iothread(); >> + } >> + qemu_mutex_lock(&proxy->lock); >> + vfio_user_send_locked(proxy, msg, fds); >> + qemu_mutex_unlock(&proxy->lock); >> + if (iolock) { >> + qemu_mutex_lock_iothread(); >> + } >> +} >> diff --git a/MAINTAINERS b/MAINTAINERS >> index 12d69f3a45..aa4df6c418 100644 >> --- a/MAINTAINERS >> +++ b/MAINTAINERS >> @@ -1883,8 +1883,12 @@ L: qemu-s390x@nongnu.org >> vfio-user >> M: John G Johnson <john.g.johnson@oracle.com> >> M: Thanos Makatos <thanos.makatos@nutanix.com> >> +M: Elena Ufimtseva <elena.ufimtseva@oracle.com> >> +M: Jagannathan Raman <jag.raman@oracle.com> >> S: Supported >> F: docs/devel/vfio-user.rst >> +F: hw/vfio/user.c >> +F: hw/vfio/user.h >> >> vhost >> M: Michael S. Tsirkin <mst@redhat.com> >> diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build >> index da9af297a0..739b30be73 100644 >> --- a/hw/vfio/meson.build >> +++ b/hw/vfio/meson.build >> @@ -8,6 +8,7 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( >> 'display.c', >> 'pci-quirks.c', >> 'pci.c', >> + 'user.c', >> )) >> vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c')) >> vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c')) >> -- >> 2.25.1 >>
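The reordered validation John describes, checking msg.size before computing msgleft, could be sketched as follows (illustrative only; the reply path would additionally bound msg.size by reply->rsize):

    /* validate the untrusted size before doing any arithmetic with it */
    if (msg.size < sizeof(msg) || msg.size > max_xfer_size) {
        error_setg(&local_err, "vfio_user_recv bad message size");
        goto fatal;
    }
    msgleft = msg.size - sizeof(msg);   /* cannot underflow after the check above */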
On Wed, Jul 28, 2021 at 06:08:26PM +0000, John Johnson wrote: > > > > On Jul 27, 2021, at 9:34 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > On Sun, Jul 18, 2021 at 11:27:42PM -0700, Elena Ufimtseva wrote: > >> From: John G Johnson <john.g.johnson@oracle.com> > >> > >> Add user.c and user.h files for vfio-user with the basic > >> send and receive functions. > >> > >> Signed-off-by: John G Johnson <john.g.johnson@oracle.com> > >> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com> > >> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com> > >> --- > >> hw/vfio/user.h | 120 ++++++++++++++ > >> include/hw/vfio/vfio-common.h | 2 + > >> hw/vfio/user.c | 286 ++++++++++++++++++++++++++++++++++ > >> MAINTAINERS | 4 + > >> hw/vfio/meson.build | 1 + > >> 5 files changed, 413 insertions(+) > >> create mode 100644 hw/vfio/user.h > >> create mode 100644 hw/vfio/user.c > > > > The multi-threading, coroutine, and blocking I/O requirements of > > vfio_user_recv() and vfio_user_send_reply() are unclear to me. Please > > document them so it's clear what environment they can be called from. I > > guess they are not called from coroutines and proxy->ioc is a blocking > > IOChannel? > > > > Yes to both, moreover, a block comment above vfio_user_recv() would > be useful. The call to setup vfio_user_recv() as the socket handler isn’t > in this patch, do you want the series re-org’d? That would help with review, thanks! Stefan
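Taken together, the style points agreed in this thread (CamelCase typedefs, VFIO_PROXY_* enum prefixes, bool and uint32_t fields) would give declarations along these lines; the exact names are illustrative, not the final v2 code:

    typedef struct {
        uint16_t id;
        uint16_t command;
        uint32_t size;
        uint32_t flags;
        uint32_t error_reply;
    } VFIOUserHdr;

    typedef enum {
        VFIO_PROXY_CONNECTED = 1,
        VFIO_PROXY_RECV_ERROR = 2,
        VFIO_PROXY_CLOSING = 3,
        VFIO_PROXY_CLOSED = 4,
    } VFIOProxyState;

    typedef struct VFIOUserReply {
        QTAILQ_ENTRY(VFIOUserReply) next;
        VFIOUserHdr *msg;
        VFIOUserFDs *fds;
        uint32_t rsize;
        uint32_t id;
        QemuCond cv;
        bool complete;
    } VFIOUserReply;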