Message ID | 20180723045956.27521-4-tiwei.bie@intel.com (mailing list archive) |
---|---|
State | New, archived |
Series | Supporting programming IOMMU in QEMU (vDPA/vhost-user) |
On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote: > Introduce a slave message to allow slave to share its > VFIO group fd to master and do the IOMMU programming > based on virtio device's DMA address space for this > group in QEMU. > > For the vhost backends which support vDPA, they could > leverage this message to ask master to do the IOMMU > programming in QEMU for the vDPA device in backend. > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com> > --- > docs/interop/vhost-user.txt | 16 ++++++++++++++ > hw/virtio/vhost-user.c | 40 ++++++++++++++++++++++++++++++++++ > include/hw/virtio/vhost-user.h | 2 ++ > 3 files changed, 58 insertions(+) > > diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt > index f59667f498..a57a8f9451 100644 > --- a/docs/interop/vhost-user.txt > +++ b/docs/interop/vhost-user.txt > @@ -397,6 +397,7 @@ Protocol features > #define VHOST_USER_PROTOCOL_F_CONFIG 9 > #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 > #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 > +#define VHOST_USER_PROTOCOL_F_VFIO_GROUP 12 > > Master message types > -------------------- > @@ -815,6 +816,21 @@ Slave message types > This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER > protocol feature has been successfully negotiated. > > + * VHOST_USER_SLAVE_VFIO_GROUP_MSG > + > + Id: 4 > + Equivalent ioctl: N/A > + Slave payload: N/A > + Master payload: N/A > + > + When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave > + could send this request to share its VFIO group fd via ancillary data > + to master. After receiving this request from slave, master will close > + the existing VFIO group if any and do the DMA programming based on the > + virtio device's DMA address space for the new group if the request is > + sent with a file descriptor. > + Should it also clear out any mappings that were set on the old fd before closing it? Also should we limit when can this message be received? If not you need to re-program mappings again whenever we get a new fd. 
> + > VHOST_USER_PROTOCOL_F_REPLY_ACK: > ------------------------------- > The original vhost-user specification only demands replies for certain > diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c > index b041343632..db958e24c7 100644 > --- a/hw/virtio/vhost-user.c > +++ b/hw/virtio/vhost-user.c > @@ -52,6 +52,7 @@ enum VhostUserProtocolFeature { > VHOST_USER_PROTOCOL_F_CONFIG = 9, > VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, > VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, > + VHOST_USER_PROTOCOL_F_VFIO_GROUP = 12, > VHOST_USER_PROTOCOL_F_MAX > }; > > @@ -97,6 +98,7 @@ typedef enum VhostUserSlaveRequest { > VHOST_USER_SLAVE_IOTLB_MSG = 1, > VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, > VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, > + VHOST_USER_SLAVE_VFIO_GROUP_MSG = 4, > VHOST_USER_SLAVE_MAX > } VhostUserSlaveRequest; > > @@ -949,6 +951,41 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev, > return 0; > } > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev, > + int *fd) > +{ > + struct vhost_user *u = dev->opaque; > + VhostUserState *user = u->user; > + VirtIODevice *vdev = dev->vdev; > + int groupfd = fd[0]; > + VFIOGroup *group; > + > + if (!virtio_has_feature(dev->protocol_features, > + VHOST_USER_PROTOCOL_F_VFIO_GROUP) || > + vdev == NULL) { > + return -1; > + } > + > + if (user->vfio_group) { > + vfio_put_group(user->vfio_group); > + user->vfio_group = NULL; > + } > + > + group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL); > + if (group == NULL) { > + return -1; > + } > + > + if (group->fd != groupfd) { > + close(groupfd); > + } > + > + user->vfio_group = group; > + fd[0] = -1; > + > + return 0; > +} > + What will cause propagating groups to this vfio fd? > static void slave_read(void *opaque) > { > struct vhost_dev *dev = opaque; > @@ -1021,6 +1058,9 @@ static void slave_read(void *opaque) > ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area, > fd[0]); > break; > + case VHOST_USER_SLAVE_VFIO_GROUP_MSG: > + ret = vhost_user_slave_handle_vfio_group(dev, fd); > + break; > default: > error_report("Received unexpected msg type."); > ret = -EINVAL; > diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h > index fd660393a0..9e11473274 100644 > --- a/include/hw/virtio/vhost-user.h > +++ b/include/hw/virtio/vhost-user.h > @@ -10,6 +10,7 @@ > > #include "chardev/char-fe.h" > #include "hw/virtio/virtio.h" > +#include "hw/vfio/vfio-common.h" > > typedef struct VhostUserHostNotifier { > MemoryRegion mr; > @@ -20,6 +21,7 @@ typedef struct VhostUserHostNotifier { > typedef struct VhostUserState { > CharBackend *chr; > VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX]; > + VFIOGroup *vfio_group; > } VhostUserState; > > VhostUserState *vhost_user_init(void); > -- > 2.18.0
On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote: > Introduce a slave message to allow slave to share its > VFIO group fd to master and do the IOMMU programming > based on virtio device's DMA address space for this > group in QEMU. > > For the vhost backends which support vDPA, they could > leverage this message to ask master to do the IOMMU > programming in QEMU for the vDPA device in backend. > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com> > --- > docs/interop/vhost-user.txt | 16 ++++++++++++++ > hw/virtio/vhost-user.c | 40 ++++++++++++++++++++++++++++++++++ > include/hw/virtio/vhost-user.h | 2 ++ > 3 files changed, 58 insertions(+) > > diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt > index f59667f498..a57a8f9451 100644 > --- a/docs/interop/vhost-user.txt > +++ b/docs/interop/vhost-user.txt > @@ -397,6 +397,7 @@ Protocol features > #define VHOST_USER_PROTOCOL_F_CONFIG 9 > #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 > #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 > +#define VHOST_USER_PROTOCOL_F_VFIO_GROUP 12 > > Master message types > -------------------- > @@ -815,6 +816,21 @@ Slave message types > This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER > protocol feature has been successfully negotiated. > > + * VHOST_USER_SLAVE_VFIO_GROUP_MSG > + > + Id: 4 > + Equivalent ioctl: N/A > + Slave payload: N/A > + Master payload: N/A > + > + When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave > + could send this request to share its VFIO group fd via ancillary data > + to master. After receiving this request from slave, master will close > + the existing VFIO group if any and do the DMA programming based on the > + virtio device's DMA address space for the new group if the request is > + sent with a file descriptor. > + > + > VHOST_USER_PROTOCOL_F_REPLY_ACK: > ------------------------------- > The original vhost-user specification only demands replies for certain > diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c > index b041343632..db958e24c7 100644 > --- a/hw/virtio/vhost-user.c > +++ b/hw/virtio/vhost-user.c > @@ -52,6 +52,7 @@ enum VhostUserProtocolFeature { > VHOST_USER_PROTOCOL_F_CONFIG = 9, > VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, > VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, > + VHOST_USER_PROTOCOL_F_VFIO_GROUP = 12, > VHOST_USER_PROTOCOL_F_MAX > }; > > @@ -97,6 +98,7 @@ typedef enum VhostUserSlaveRequest { > VHOST_USER_SLAVE_IOTLB_MSG = 1, > VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, > VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, > + VHOST_USER_SLAVE_VFIO_GROUP_MSG = 4, > VHOST_USER_SLAVE_MAX > } VhostUserSlaveRequest; > > @@ -949,6 +951,41 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev, > return 0; > } > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev, > + int *fd) > +{ > + struct vhost_user *u = dev->opaque; > + VhostUserState *user = u->user; > + VirtIODevice *vdev = dev->vdev; > + int groupfd = fd[0]; > + VFIOGroup *group; > + > + if (!virtio_has_feature(dev->protocol_features, > + VHOST_USER_PROTOCOL_F_VFIO_GROUP) || > + vdev == NULL) { > + return -1; > + } > + > + if (user->vfio_group) { > + vfio_put_group(user->vfio_group); > + user->vfio_group = NULL; Seems to create a window where mappings are invalid even if the same fd is re-sent. Is that OK? 
> + } > + > + group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL); > + if (group == NULL) { > + return -1; > + } > + > + if (group->fd != groupfd) { > + close(groupfd); > + } > + > + user->vfio_group = group; > + fd[0] = -1; > + > + return 0; > +} > + > static void slave_read(void *opaque) > { > struct vhost_dev *dev = opaque; > @@ -1021,6 +1058,9 @@ static void slave_read(void *opaque) > ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area, > fd[0]); > break; > + case VHOST_USER_SLAVE_VFIO_GROUP_MSG: > + ret = vhost_user_slave_handle_vfio_group(dev, fd); > + break; > default: > error_report("Received unexpected msg type."); > ret = -EINVAL; > diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h > index fd660393a0..9e11473274 100644 > --- a/include/hw/virtio/vhost-user.h > +++ b/include/hw/virtio/vhost-user.h > @@ -10,6 +10,7 @@ > > #include "chardev/char-fe.h" > #include "hw/virtio/virtio.h" > +#include "hw/vfio/vfio-common.h" > > typedef struct VhostUserHostNotifier { > MemoryRegion mr; > @@ -20,6 +21,7 @@ typedef struct VhostUserHostNotifier { > typedef struct VhostUserState { > CharBackend *chr; > VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX]; > + VFIOGroup *vfio_group; > } VhostUserState; > > VhostUserState *vhost_user_init(void); > -- > 2.18.0
On Mon, Jul 23, 2018 at 12:19:04PM +0300, Michael S. Tsirkin wrote: > On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote: [...] > > @@ -815,6 +816,21 @@ Slave message types > > This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER > > protocol feature has been successfully negotiated. > > > > + * VHOST_USER_SLAVE_VFIO_GROUP_MSG > > + > > + Id: 4 > > + Equivalent ioctl: N/A > > + Slave payload: N/A > > + Master payload: N/A > > + > > + When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave > > + could send this request to share its VFIO group fd via ancillary data > > + to master. After receiving this request from slave, master will close > > + the existing VFIO group if any and do the DMA programming based on the > > + virtio device's DMA address space for the new group if the request is > > + sent with a file descriptor. > > + > > Should it also clear out any mappings that were set on the old fd > before closing it? Yes, more exactly it will do below things: 1. Delete VFIO group from KVM device fd; 2. Unset the container fd for this group fd; 3. Close this VFIO group fd; Should I include above details in this doc? > > Also should we limit when can this message be received? > If not you need to re-program mappings again whenever > we get a new fd. To keep things simple, this proposal requires the slave to assume the mappings are invalid before receiving the REPLY from master when the slave sends this message to master, and master will destroy the existing VFIO group if any and do the setup for the (new) VFIO group if the message carries a fd. So if a VFIO group fd has been sent and the device has been started, before sending a VFIO group fd (could be the same fd that has been sent), the slave should stop the device first and shouldn't assume the mappings are valid before receiving the REPLY. > > > + [...] > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev, > > + int *fd) > > +{ > > + struct vhost_user *u = dev->opaque; > > + VhostUserState *user = u->user; > > + VirtIODevice *vdev = dev->vdev; > > + int groupfd = fd[0]; > > + VFIOGroup *group; > > + > > + if (!virtio_has_feature(dev->protocol_features, > > + VHOST_USER_PROTOCOL_F_VFIO_GROUP) || > > + vdev == NULL) { > > + return -1; > > + } > > + > > + if (user->vfio_group) { > > + vfio_put_group(user->vfio_group); > > + user->vfio_group = NULL; > > + } > > + > > + group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL); > > + if (group == NULL) { > > + return -1; > > + } > > + > > + if (group->fd != groupfd) { > > + close(groupfd); > > + } > > + > > + user->vfio_group = group; > > + fd[0] = -1; > > + > > + return 0; > > +} > > + > > What will cause propagating groups to this vfio fd? Do you mean when a VFIOGroup will be created for this VFIO group fd? A VFIOGroup will be created if there is no existing VFIOGroup that references to the same group id. Best regards, Tiwei Bie > > [...]
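[Editor's note: for reference, the three teardown steps Tiwei lists above map onto the kernel ioctls sketched below. This is only an illustrative sketch for a standalone program that already holds the relevant fds; "kvm_vfio_devfd" is assumed to be the KVM VFIO pseudo-device fd obtained earlier via KVM_CREATE_DEVICE, and it is not the hw/vfio code QEMU actually runs.]

    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>
    #include <linux/vfio.h>

    /* Teardown of a previously shared VFIO group, following the three steps
     * described above.  Error handling is omitted for brevity. */
    static void teardown_vfio_group(int kvm_vfio_devfd, int groupfd)
    {
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr  = KVM_DEV_VFIO_GROUP_DEL,
            .addr  = (__u64)(unsigned long)&groupfd,
        };

        /* 1. Delete the VFIO group from the KVM VFIO device. */
        ioctl(kvm_vfio_devfd, KVM_SET_DEVICE_ATTR, &attr);

        /* 2. Unset the container for this group fd; its DMA mappings go away
         *    once the last group leaves the container. */
        ioctl(groupfd, VFIO_GROUP_UNSET_CONTAINER);

        /* 3. Close the VFIO group fd. */
        close(groupfd);
    }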
On Mon, Jul 23, 2018 at 12:20:12PM +0300, Michael S. Tsirkin wrote: > On Mon, Jul 23, 2018 at 12:59:56PM +0800, Tiwei Bie wrote: [...] > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev, > > + int *fd) > > +{ > > + struct vhost_user *u = dev->opaque; > > + VhostUserState *user = u->user; > > + VirtIODevice *vdev = dev->vdev; > > + int groupfd = fd[0]; > > + VFIOGroup *group; > > + > > + if (!virtio_has_feature(dev->protocol_features, > > + VHOST_USER_PROTOCOL_F_VFIO_GROUP) || > > + vdev == NULL) { > > + return -1; > > + } > > + > > + if (user->vfio_group) { > > + vfio_put_group(user->vfio_group); > > + user->vfio_group = NULL; > > Seems to create a window where mappings are invalid > even if the same fd is re-sent. Is that OK? Yeah, there will be a window that mappings are invalid when the same fd is re-sent. Based on the proposal [1] of this patch, it should be OK. [1] http://lists.gnu.org/archive/html/qemu-devel/2018-07/msg04335.html """ To keep things simple, this proposal requires the slave to assume the mappings are invalid before receiving the REPLY from master when the slave sends this message to master, and master will destroy the existing VFIO group if any and do the setup for the (new) VFIO group if the message carries a fd. So if a VFIO group fd has been sent and the device has been started, before sending a VFIO group fd (could be the same fd that has been sent), the slave should stop the device first and shouldn't assume the mappings are valid before receiving the REPLY. """ Best regards, Tiwei Bie > > > + } [...]
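[Editor's note: as a concrete illustration of the slave-side flow in the quoted proposal, the sketch below sends the group fd as SCM_RIGHTS ancillary data on the slave channel. The three 32-bit header fields (request, flags, size) follow the vhost-user wire format and the VHOST_USER_* values are taken from the spec/patch; the function name and everything else are made up for the example.]

    #include <stdint.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    #define VHOST_USER_SLAVE_VFIO_GROUP_MSG 4           /* from this patch */
    #define VHOST_USER_VERSION              0x1
    #define VHOST_USER_NEED_REPLY           (1u << 3)   /* ask for a REPLY */

    /* Send VHOST_USER_SLAVE_VFIO_GROUP_MSG with the group fd attached as
     * ancillary data.  After calling this, the slave must treat its DMA
     * mappings as invalid until the master's reply arrives. */
    static int send_vfio_group_fd(int slave_sockfd, int groupfd)
    {
        uint32_t hdr[3] = {
            VHOST_USER_SLAVE_VFIO_GROUP_MSG,            /* request */
            VHOST_USER_VERSION | VHOST_USER_NEED_REPLY, /* flags */
            0,                                          /* payload size: N/A */
        };
        struct iovec iov = { .iov_base = hdr, .iov_len = sizeof(hdr) };
        char control[CMSG_SPACE(sizeof(int))];
        struct msghdr msg = {
            .msg_iov = &iov, .msg_iovlen = 1,
            .msg_control = control, .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cmsg;

        memset(control, 0, sizeof(control));
        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type  = SCM_RIGHTS;
        cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &groupfd, sizeof(int));

        return sendmsg(slave_sockfd, &msg, 0) == (ssize_t)sizeof(hdr) ? 0 : -1;
    }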
On Mon, 23 Jul 2018 12:59:56 +0800 Tiwei Bie <tiwei.bie@intel.com> wrote: > Introduce a slave message to allow slave to share its > VFIO group fd to master and do the IOMMU programming > based on virtio device's DMA address space for this > group in QEMU. > > For the vhost backends which support vDPA, they could > leverage this message to ask master to do the IOMMU > programming in QEMU for the vDPA device in backend. > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com> > --- > docs/interop/vhost-user.txt | 16 ++++++++++++++ > hw/virtio/vhost-user.c | 40 ++++++++++++++++++++++++++++++++++ > include/hw/virtio/vhost-user.h | 2 ++ > 3 files changed, 58 insertions(+) > > diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt > index f59667f498..a57a8f9451 100644 > --- a/docs/interop/vhost-user.txt > +++ b/docs/interop/vhost-user.txt > @@ -397,6 +397,7 @@ Protocol features > #define VHOST_USER_PROTOCOL_F_CONFIG 9 > #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 > #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 > +#define VHOST_USER_PROTOCOL_F_VFIO_GROUP 12 > > Master message types > -------------------- > @@ -815,6 +816,21 @@ Slave message types > This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER > protocol feature has been successfully negotiated. > > + * VHOST_USER_SLAVE_VFIO_GROUP_MSG > + > + Id: 4 > + Equivalent ioctl: N/A > + Slave payload: N/A > + Master payload: N/A > + > + When VHOST_USER_PROTOCOL_F_VFIO_GROUP is negotiated, vhost-user slave > + could send this request to share its VFIO group fd via ancillary data > + to master. After receiving this request from slave, master will close > + the existing VFIO group if any and do the DMA programming based on the > + virtio device's DMA address space for the new group if the request is > + sent with a file descriptor. 
> + > + > VHOST_USER_PROTOCOL_F_REPLY_ACK: > ------------------------------- > The original vhost-user specification only demands replies for certain > diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c > index b041343632..db958e24c7 100644 > --- a/hw/virtio/vhost-user.c > +++ b/hw/virtio/vhost-user.c > @@ -52,6 +52,7 @@ enum VhostUserProtocolFeature { > VHOST_USER_PROTOCOL_F_CONFIG = 9, > VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, > VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, > + VHOST_USER_PROTOCOL_F_VFIO_GROUP = 12, > VHOST_USER_PROTOCOL_F_MAX > }; > > @@ -97,6 +98,7 @@ typedef enum VhostUserSlaveRequest { > VHOST_USER_SLAVE_IOTLB_MSG = 1, > VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, > VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, > + VHOST_USER_SLAVE_VFIO_GROUP_MSG = 4, > VHOST_USER_SLAVE_MAX > } VhostUserSlaveRequest; > > @@ -949,6 +951,41 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev, > return 0; > } > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev, > + int *fd) > +{ > + struct vhost_user *u = dev->opaque; > + VhostUserState *user = u->user; > + VirtIODevice *vdev = dev->vdev; > + int groupfd = fd[0]; > + VFIOGroup *group; > + > + if (!virtio_has_feature(dev->protocol_features, > + VHOST_USER_PROTOCOL_F_VFIO_GROUP) || > + vdev == NULL) { > + return -1; > + } > + > + if (user->vfio_group) { > + vfio_put_group(user->vfio_group); > + user->vfio_group = NULL; > + } > + > + group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL); > + if (group == NULL) { > + return -1; > + } > + > + if (group->fd != groupfd) { > + close(groupfd); > + } > + > + user->vfio_group = group; > + fd[0] = -1; > + > + return 0; > +} This all looks very sketchy, we're reaching into vfio internal state and arbitrarily releasing data structures, reusing existing ones, and maybe creating new ones. We know that a vfio group only allows a single open, right? So if you have a groupfd, vfio will never have that same group opened already. Is that the goal? It's not obvious in the code. I don't really understand what vhost goes on to do with this group, but I'm pretty sure the existing state machine in vfio isn't designed for it. Thanks, Alex > + > static void slave_read(void *opaque) > { > struct vhost_dev *dev = opaque; > @@ -1021,6 +1058,9 @@ static void slave_read(void *opaque) > ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area, > fd[0]); > break; > + case VHOST_USER_SLAVE_VFIO_GROUP_MSG: > + ret = vhost_user_slave_handle_vfio_group(dev, fd); > + break; > default: > error_report("Received unexpected msg type."); > ret = -EINVAL; > diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h > index fd660393a0..9e11473274 100644 > --- a/include/hw/virtio/vhost-user.h > +++ b/include/hw/virtio/vhost-user.h > @@ -10,6 +10,7 @@ > > #include "chardev/char-fe.h" > #include "hw/virtio/virtio.h" > +#include "hw/vfio/vfio-common.h" > > typedef struct VhostUserHostNotifier { > MemoryRegion mr; > @@ -20,6 +21,7 @@ typedef struct VhostUserHostNotifier { > typedef struct VhostUserState { > CharBackend *chr; > VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX]; > + VFIOGroup *vfio_group; > } VhostUserState; > > VhostUserState *vhost_user_init(void);
On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote: > On Mon, 23 Jul 2018 12:59:56 +0800 > Tiwei Bie <tiwei.bie@intel.com> wrote: > [...] > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev, > > + int *fd) > > +{ > > + struct vhost_user *u = dev->opaque; > > + VhostUserState *user = u->user; > > + VirtIODevice *vdev = dev->vdev; > > + int groupfd = fd[0]; > > + VFIOGroup *group; > > + > > + if (!virtio_has_feature(dev->protocol_features, > > + VHOST_USER_PROTOCOL_F_VFIO_GROUP) || > > + vdev == NULL) { > > + return -1; > > + } > > + > > + if (user->vfio_group) { > > + vfio_put_group(user->vfio_group); > > + user->vfio_group = NULL; > > + } > > + There should be a below check here (I missed it in this patch, sorry..): if (groupfd < 0) { return 0; } > > + group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL); > > + if (group == NULL) { > > + return -1; > > + } > > + > > + if (group->fd != groupfd) { > > + close(groupfd); > > + } > > + > > + user->vfio_group = group; > > + fd[0] = -1; > > + > > + return 0; > > +} > > This all looks very sketchy, we're reaching into vfio internal state > and arbitrarily releasing data structures, reusing existing ones, and > maybe creating new ones. We know that a vfio group only allows a > single open, right? Yeah, exactly! When the vhost-user backend process has opened a VFIO group fd, QEMU won't be able to open this group file via open() any more. So vhost-user backend process will share the VFIO group fd to QEMU via the UNIX socket. And that's why we're introducing a new API: vfio_get_group_from_fd(groupfd, ...); for vfio/common.c to get (setup) a VFIOGroup structure. (Because in this case, vfio/common.c can't open this group file via open(), and what we have is a VFIO group fd shared by another process via UNIX socket). And ... > So if you have a groupfd, vfio will never have > that same group opened already. ... if the same vhost-user backend process shares the same VFIO group fd more than one times via different vhost-user slave channels to different vhost-user frontends in the same QEMU process, the same VFIOGroup could exist already. This case could happen when e.g. there are two vhost accelerator devices in the same VFIO group, and they are managed by the same vhost-user backend process, and used to accelerate two different virtio devices in QEMU. In this case, the same VFIO group fd could be sent twice. If we need to support this case, more changes in vfio/common.c will be needed, e.g. 1) add a refcnt in VFIOGroup to support getting and putting a VFIOGroup structure multiple times, 2) add a lock to make vfio_get_group*() and vfio_put_group() thread-safe. > Is that the goal? It's not obvious in > the code. I don't really understand what vhost goes on to do with this > group, The goal is that, we have a VFIO group opened by an external vhost-user backend process, this process will manage the VFIO device, but want QEMU to manage the VFIO group, including: 1 - program the DMA mappings for this VFIO group based on the front-end device's dma_as in QEMU. 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD). Vhost-user in QEMU as the vhost-user frontend just wants hw/vfio to do above things after receiving a VFIO group fd from the vhost-user backend process. Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup by calling vfio_get_group_from_fd() or put this VFIOGroup by calling vfio_put_group() when it's not needed anymore, and won't do anything else. 
Vhost-user will only deal with the VFIOGroups allocated by itself, and won't influence any other VFIOGroups used by the VFIO passthru devices (-device vfio-pci). Because the same VFIO group file can only be opened by one process via open(). > but I'm pretty sure the existing state machine in vfio isn't > designed for it. Thanks, Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to make it work robustly. Hmm.. Is above idea (e.g. how vhost-user is going to use VFIOGroup and how we are going to extend hw/vfio) acceptable to you? Thanks! Best regards, Tiwei Bie
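[Editor's note: stripped of QEMU's MemoryListener machinery, the two responsibilities Tiwei lists (1. program the DMA mappings, 2. add the group to KVM) reduce to roughly the ioctl sequence below, assuming a type1 IOMMU container that QEMU has already opened and a single IOVA->HVA mapping standing in for the device's DMA address space. The function and parameter names are illustrative, not the hw/vfio API.]

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>
    #include <linux/vfio.h>

    /* Attach a group fd received from the backend to QEMU's container,
     * program one DMA mapping, and register the group with KVM. */
    static int setup_vfio_group(int containerfd, int kvm_vfio_devfd, int groupfd,
                                uint64_t iova, uint64_t hva, uint64_t len)
    {
        struct vfio_iommu_type1_dma_map map = {
            .argsz = sizeof(map),
            .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
            .vaddr = hva,
            .iova  = iova,
            .size  = len,
        };
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr  = KVM_DEV_VFIO_GROUP_ADD,
            .addr  = (__u64)(unsigned long)&groupfd,
        };

        /* Put the group into the container and pick the IOMMU model
         * (the latter only matters for the first group in the container). */
        if (ioctl(groupfd, VFIO_GROUP_SET_CONTAINER, &containerfd) < 0 ||
            ioctl(containerfd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < 0) {
            return -1;
        }

        /* 1. Program the DMA mappings for this group (one range standing in
         *    for the whole dma_as here). */
        if (ioctl(containerfd, VFIO_IOMMU_MAP_DMA, &map) < 0) {
            return -1;
        }

        /* 2. Add this group to KVM (KVM_DEV_VFIO_GROUP_ADD). */
        return ioctl(kvm_vfio_devfd, KVM_SET_DEVICE_ATTR, &attr);
    }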
On Fri, 27 Jul 2018 09:58:05 +0800 Tiwei Bie <tiwei.bie@intel.com> wrote: > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote: > > On Mon, 23 Jul 2018 12:59:56 +0800 > > Tiwei Bie <tiwei.bie@intel.com> wrote: > > > [...] > > > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev, > > > + int *fd) > > > +{ > > > + struct vhost_user *u = dev->opaque; > > > + VhostUserState *user = u->user; > > > + VirtIODevice *vdev = dev->vdev; > > > + int groupfd = fd[0]; > > > + VFIOGroup *group; > > > + > > > + if (!virtio_has_feature(dev->protocol_features, > > > + VHOST_USER_PROTOCOL_F_VFIO_GROUP) || > > > + vdev == NULL) { > > > + return -1; > > > + } > > > + > > > + if (user->vfio_group) { > > > + vfio_put_group(user->vfio_group); > > > + user->vfio_group = NULL; > > > + } > > > + > > There should be a below check here (I missed it in this > patch, sorry..): > > if (groupfd < 0) { > return 0; > } > > > > + group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL); > > > + if (group == NULL) { > > > + return -1; > > > + } > > > + > > > + if (group->fd != groupfd) { > > > + close(groupfd); > > > + } > > > + > > > + user->vfio_group = group; > > > + fd[0] = -1; > > > + > > > + return 0; > > > +} > > > > This all looks very sketchy, we're reaching into vfio internal state > > and arbitrarily releasing data structures, reusing existing ones, and > > maybe creating new ones. We know that a vfio group only allows a > > single open, right? > > Yeah, exactly! > > When the vhost-user backend process has opened a VFIO group fd, > QEMU won't be able to open this group file via open() any more. > So vhost-user backend process will share the VFIO group fd to > QEMU via the UNIX socket. And that's why we're introducing a > new API: > > vfio_get_group_from_fd(groupfd, ...); > > for vfio/common.c to get (setup) a VFIOGroup structure. (Because > in this case, vfio/common.c can't open this group file via open(), > and what we have is a VFIO group fd shared by another process via > UNIX socket). And ... > > > So if you have a groupfd, vfio will never have > > that same group opened already. > > ... if the same vhost-user backend process shares the same VFIO > group fd more than one times via different vhost-user slave channels > to different vhost-user frontends in the same QEMU process, the > same VFIOGroup could exist already. > > This case could happen when e.g. there are two vhost accelerator > devices in the same VFIO group, and they are managed by the same > vhost-user backend process, and used to accelerate two different > virtio devices in QEMU. In this case, the same VFIO group fd could > be sent twice. > > If we need to support this case, more changes in vfio/common.c > will be needed, e.g. 1) add a refcnt in VFIOGroup to support > getting and putting a VFIOGroup structure multiple times, > 2) add a lock to make vfio_get_group*() and vfio_put_group() > thread-safe. This is the sort of case where it seems like we're just grabbing internal interfaces and using them from other modules. VFIOGroups have a device_list and currently handles multiple puts by testing whether that device list is empty (ie. we already have a refcnt effectively). Now we have a group user that has no devices, which is not something that it seems like we've designed or audited the code for handling. IOW, while the header file lives in a common location, it's not really intended to be an API within QEMU and it makes me a bit uncomfortable to use it as such. > > Is that the goal? 
It's not obvious in > > the code. I don't really understand what vhost goes on to do with this > > group, > > The goal is that, we have a VFIO group opened by an external > vhost-user backend process, this process will manage the VFIO > device, but want QEMU to manage the VFIO group, including: > > 1 - program the DMA mappings for this VFIO group based on > the front-end device's dma_as in QEMU. > > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD). > > Vhost-user in QEMU as the vhost-user frontend just wants > hw/vfio to do above things after receiving a VFIO group fd > from the vhost-user backend process. > > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup > by calling vfio_get_group_from_fd() or put this VFIOGroup by > calling vfio_put_group() when it's not needed anymore, and > won't do anything else. > > Vhost-user will only deal with the VFIOGroups allocated by > itself, and won't influence any other VFIOGroups used by the > VFIO passthru devices (-device vfio-pci). Because the same > VFIO group file can only be opened by one process via open(). But there's nothing here that guarantees the reverse. vhost cannot open the group of an existing vfio-pci device, but vfio-pci can re-use the group that vhost has already opened. This is again where it seems like we're trying to cherry pick from an internal API and leaving some loose ends dangling. > > but I'm pretty sure the existing state machine in vfio isn't > > designed for it. Thanks, > > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user > is going to use VFIOGroup and how we are going to extend hw/vfio) > acceptable to you? Thanks! I certainly would not want to duplicate managing the group, container, and MemoryListener, but I think we need a more formal interface with at least some notion of reference counting beyond the device list or explicit knowledge of an external user. I'm also curious if it really makes sense that the vhost backend is opening the group rather than letting QEMU do it. It seems that vhost wants to deal with the device and we can never have symmetry in the scenario above, where a group might contain multiple devices and order matters because of where the group is opened. If we still had a notion of a VFIODevice for an external user, then the device_list refcnt would just work. Thanks, Alex
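[Editor's note: purely as an illustration of the reference counting Alex asks for here, the toy code below keeps an explicit refcount on the group object so that an external user such as vhost-user can hold a reference without appearing on the device list, and re-sending the same group simply takes another reference. The structure and names are invented for the example; this is not hw/vfio code.]

    #include <stdlib.h>

    /* Toy group object with an explicit refcount. */
    typedef struct Group {
        int group_id;
        unsigned refcnt;
        struct Group *next;
        /* container, device list, ... elided */
    } Group;

    static Group *group_list;

    /* Reuse an existing group with the same id (e.g. the same group fd sent
     * twice over different slave channels) by taking another reference;
     * otherwise allocate a new one with refcnt == 1. */
    static Group *group_get(int group_id)
    {
        Group *g;

        for (g = group_list; g != NULL; g = g->next) {
            if (g->group_id == group_id) {
                g->refcnt++;
                return g;
            }
        }
        g = calloc(1, sizeof(*g));
        if (g == NULL) {
            return NULL;
        }
        g->group_id = group_id;
        g->refcnt = 1;
        g->next = group_list;
        group_list = g;
        return g;
    }

    static void group_put(Group *g)
    {
        if (--g->refcnt > 0) {
            return;
        }
        /* Last reference: unlink from group_list, disconnect the container,
         * close the group fd, then free the object (elided). */
    }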
On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote: > On Fri, 27 Jul 2018 09:58:05 +0800 > Tiwei Bie <tiwei.bie@intel.com> wrote: > > > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote: > > > On Mon, 23 Jul 2018 12:59:56 +0800 > > > Tiwei Bie <tiwei.bie@intel.com> wrote: > > > > > [...] > > > > > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev, > > > > + int *fd) > > > > +{ > > > > + struct vhost_user *u = dev->opaque; > > > > + VhostUserState *user = u->user; > > > > + VirtIODevice *vdev = dev->vdev; > > > > + int groupfd = fd[0]; > > > > + VFIOGroup *group; > > > > + > > > > + if (!virtio_has_feature(dev->protocol_features, > > > > + VHOST_USER_PROTOCOL_F_VFIO_GROUP) || > > > > + vdev == NULL) { > > > > + return -1; > > > > + } > > > > + > > > > + if (user->vfio_group) { > > > > + vfio_put_group(user->vfio_group); > > > > + user->vfio_group = NULL; > > > > + } > > > > + > > > > There should be a below check here (I missed it in this > > patch, sorry..): > > > > if (groupfd < 0) { > > return 0; > > } > > > > > > + group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL); > > > > + if (group == NULL) { > > > > + return -1; > > > > + } > > > > + > > > > + if (group->fd != groupfd) { > > > > + close(groupfd); > > > > + } > > > > + > > > > + user->vfio_group = group; > > > > + fd[0] = -1; > > > > + > > > > + return 0; > > > > +} > > > > > > This all looks very sketchy, we're reaching into vfio internal state > > > and arbitrarily releasing data structures, reusing existing ones, and > > > maybe creating new ones. We know that a vfio group only allows a > > > single open, right? > > > > Yeah, exactly! > > > > When the vhost-user backend process has opened a VFIO group fd, > > QEMU won't be able to open this group file via open() any more. > > So vhost-user backend process will share the VFIO group fd to > > QEMU via the UNIX socket. And that's why we're introducing a > > new API: > > > > vfio_get_group_from_fd(groupfd, ...); > > > > for vfio/common.c to get (setup) a VFIOGroup structure. (Because > > in this case, vfio/common.c can't open this group file via open(), > > and what we have is a VFIO group fd shared by another process via > > UNIX socket). And ... > > > > > So if you have a groupfd, vfio will never have > > > that same group opened already. > > > > ... if the same vhost-user backend process shares the same VFIO > > group fd more than one times via different vhost-user slave channels > > to different vhost-user frontends in the same QEMU process, the > > same VFIOGroup could exist already. > > > > This case could happen when e.g. there are two vhost accelerator > > devices in the same VFIO group, and they are managed by the same > > vhost-user backend process, and used to accelerate two different > > virtio devices in QEMU. In this case, the same VFIO group fd could > > be sent twice. > > > > If we need to support this case, more changes in vfio/common.c > > will be needed, e.g. 1) add a refcnt in VFIOGroup to support > > getting and putting a VFIOGroup structure multiple times, > > 2) add a lock to make vfio_get_group*() and vfio_put_group() > > thread-safe. > > This is the sort of case where it seems like we're just grabbing > internal interfaces and using them from other modules. VFIOGroups have > a device_list and currently handles multiple puts by testing whether > that device list is empty (ie. we already have a refcnt effectively). 
> Now we have a group user that has no devices, which is not something > that it seems like we've designed or audited the code for handling. > IOW, while the header file lives in a common location, it's not really > intended to be an API within QEMU and it makes me a bit uncomfortable > to use it as such. > > > > Is that the goal? It's not obvious in > > > the code. I don't really understand what vhost goes on to do with this > > > group, > > > > The goal is that, we have a VFIO group opened by an external > > vhost-user backend process, this process will manage the VFIO > > device, but want QEMU to manage the VFIO group, including: > > > > 1 - program the DMA mappings for this VFIO group based on > > the front-end device's dma_as in QEMU. > > > > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD). > > > > Vhost-user in QEMU as the vhost-user frontend just wants > > hw/vfio to do above things after receiving a VFIO group fd > > from the vhost-user backend process. > > > > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup > > by calling vfio_get_group_from_fd() or put this VFIOGroup by > > calling vfio_put_group() when it's not needed anymore, and > > won't do anything else. > > > > Vhost-user will only deal with the VFIOGroups allocated by > > itself, and won't influence any other VFIOGroups used by the > > VFIO passthru devices (-device vfio-pci). Because the same > > VFIO group file can only be opened by one process via open(). > > But there's nothing here that guarantees the reverse. vhost cannot > open the group of an existing vfio-pci device, but vfio-pci can re-use > the group that vhost has already opened. This is again where it seems > like we're trying to cherry pick from an internal API and leaving some > loose ends dangling. > > > > but I'm pretty sure the existing state machine in vfio isn't > > > designed for it. Thanks, > > > > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to > > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user > > is going to use VFIOGroup and how we are going to extend hw/vfio) > > acceptable to you? Thanks! > > I certainly would not want to duplicate managing the group, container, > and MemoryListener, but I think we need a more formal interface with at > least some notion of reference counting beyond the device list or > explicit knowledge of an external user. Got it! I'll try to address this. Thanks! > I'm also curious if it really > makes sense that the vhost backend is opening the group rather than > letting QEMU do it. It seems that vhost wants to deal with the device > and we can never have symmetry in the scenario above, where a group > might contain multiple devices and order matters because of where the > group is opened. If we still had a notion of a VFIODevice for an > external user, then the device_list refcnt would just work. Thanks, Vhost-user backend will but QEMU won't open the VFIO fds, because QEMU just has a vhost-user UNIX socket path that it should connect to or listen on. So QEMU doesn't know which group and device it should open. The vhost accelerator devices are probed and managed in the vhost-user backend process. And the vhost-user backend process will bind the vhost-user instances and vhost accelerator devices based on some sort of user's input. Best regards, Tiwei Bie
On Mon, Jul 30, 2018 at 04:10:04PM +0800, Tiwei Bie wrote: > On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote: > > On Fri, 27 Jul 2018 09:58:05 +0800 > > Tiwei Bie <tiwei.bie@intel.com> wrote: > > > > > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote: > > > > On Mon, 23 Jul 2018 12:59:56 +0800 > > > > Tiwei Bie <tiwei.bie@intel.com> wrote: > > > > > > > [...] > > > > > > > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev, > > > > > + int *fd) > > > > > +{ > > > > > + struct vhost_user *u = dev->opaque; > > > > > + VhostUserState *user = u->user; > > > > > + VirtIODevice *vdev = dev->vdev; > > > > > + int groupfd = fd[0]; > > > > > + VFIOGroup *group; > > > > > + > > > > > + if (!virtio_has_feature(dev->protocol_features, > > > > > + VHOST_USER_PROTOCOL_F_VFIO_GROUP) || > > > > > + vdev == NULL) { > > > > > + return -1; > > > > > + } > > > > > + > > > > > + if (user->vfio_group) { > > > > > + vfio_put_group(user->vfio_group); > > > > > + user->vfio_group = NULL; > > > > > + } > > > > > + > > > > > > There should be a below check here (I missed it in this > > > patch, sorry..): > > > > > > if (groupfd < 0) { > > > return 0; > > > } > > > > > > > > + group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL); > > > > > + if (group == NULL) { > > > > > + return -1; > > > > > + } > > > > > + > > > > > + if (group->fd != groupfd) { > > > > > + close(groupfd); > > > > > + } > > > > > + > > > > > + user->vfio_group = group; > > > > > + fd[0] = -1; > > > > > + > > > > > + return 0; > > > > > +} > > > > > > > > This all looks very sketchy, we're reaching into vfio internal state > > > > and arbitrarily releasing data structures, reusing existing ones, and > > > > maybe creating new ones. We know that a vfio group only allows a > > > > single open, right? > > > > > > Yeah, exactly! > > > > > > When the vhost-user backend process has opened a VFIO group fd, > > > QEMU won't be able to open this group file via open() any more. > > > So vhost-user backend process will share the VFIO group fd to > > > QEMU via the UNIX socket. And that's why we're introducing a > > > new API: > > > > > > vfio_get_group_from_fd(groupfd, ...); > > > > > > for vfio/common.c to get (setup) a VFIOGroup structure. (Because > > > in this case, vfio/common.c can't open this group file via open(), > > > and what we have is a VFIO group fd shared by another process via > > > UNIX socket). And ... > > > > > > > So if you have a groupfd, vfio will never have > > > > that same group opened already. > > > > > > ... if the same vhost-user backend process shares the same VFIO > > > group fd more than one times via different vhost-user slave channels > > > to different vhost-user frontends in the same QEMU process, the > > > same VFIOGroup could exist already. > > > > > > This case could happen when e.g. there are two vhost accelerator > > > devices in the same VFIO group, and they are managed by the same > > > vhost-user backend process, and used to accelerate two different > > > virtio devices in QEMU. In this case, the same VFIO group fd could > > > be sent twice. > > > > > > If we need to support this case, more changes in vfio/common.c > > > will be needed, e.g. 1) add a refcnt in VFIOGroup to support > > > getting and putting a VFIOGroup structure multiple times, > > > 2) add a lock to make vfio_get_group*() and vfio_put_group() > > > thread-safe. 
> > > > This is the sort of case where it seems like we're just grabbing > > internal interfaces and using them from other modules. VFIOGroups have > > a device_list and currently handles multiple puts by testing whether > > that device list is empty (ie. we already have a refcnt effectively). > > Now we have a group user that has no devices, which is not something > > that it seems like we've designed or audited the code for handling. > > IOW, while the header file lives in a common location, it's not really > > intended to be an API within QEMU and it makes me a bit uncomfortable > > to use it as such. > > > > > > Is that the goal? It's not obvious in > > > > the code. I don't really understand what vhost goes on to do with this > > > > group, > > > > > > The goal is that, we have a VFIO group opened by an external > > > vhost-user backend process, this process will manage the VFIO > > > device, but want QEMU to manage the VFIO group, including: > > > > > > 1 - program the DMA mappings for this VFIO group based on > > > the front-end device's dma_as in QEMU. > > > > > > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD). > > > > > > Vhost-user in QEMU as the vhost-user frontend just wants > > > hw/vfio to do above things after receiving a VFIO group fd > > > from the vhost-user backend process. > > > > > > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup > > > by calling vfio_get_group_from_fd() or put this VFIOGroup by > > > calling vfio_put_group() when it's not needed anymore, and > > > won't do anything else. > > > > > > Vhost-user will only deal with the VFIOGroups allocated by > > > itself, and won't influence any other VFIOGroups used by the > > > VFIO passthru devices (-device vfio-pci). Because the same > > > VFIO group file can only be opened by one process via open(). > > > > But there's nothing here that guarantees the reverse. vhost cannot > > open the group of an existing vfio-pci device, but vfio-pci can re-use > > the group that vhost has already opened. This is again where it seems > > like we're trying to cherry pick from an internal API and leaving some > > loose ends dangling. > > > > > > but I'm pretty sure the existing state machine in vfio isn't > > > > designed for it. Thanks, > > > > > > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to > > > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user > > > is going to use VFIOGroup and how we are going to extend hw/vfio) > > > acceptable to you? Thanks! > > > > I certainly would not want to duplicate managing the group, container, > > and MemoryListener, but I think we need a more formal interface with at > > least some notion of reference counting beyond the device list or > > explicit knowledge of an external user. > > Got it! I'll try to address this. Thanks! > > > I'm also curious if it really > > makes sense that the vhost backend is opening the group rather than > > letting QEMU do it. It seems that vhost wants to deal with the device > > and we can never have symmetry in the scenario above, where a group > > might contain multiple devices and order matters because of where the > > group is opened. If we still had a notion of a VFIODevice for an > > external user, then the device_list refcnt would just work. Thanks, > > Vhost-user backend will but QEMU won't open the VFIO > fds, because QEMU just has a vhost-user UNIX socket > path that it should connect to or listen on. So QEMU > doesn't know which group and device it should open. 
> The vhost accelerator devices are probed and managed > in the vhost-user backend process. And the vhost-user > backend process will bind the vhost-user instances > and vhost accelerator devices based on some sort of > user's input. > > Best regards, > Tiwei Bie Is the reason it's done like this because the backend is sharing the device with multiple QEMUs? I generally wonder how are restarts of the backend handled with this approach: closing the VFIO device tends to reset the whole device.
On Mon, 30 Jul 2018 12:30:58 +0300 "Michael S. Tsirkin" <mst@redhat.com> wrote: > On Mon, Jul 30, 2018 at 04:10:04PM +0800, Tiwei Bie wrote: > > On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote: > > > On Fri, 27 Jul 2018 09:58:05 +0800 > > > Tiwei Bie <tiwei.bie@intel.com> wrote: > > > > > > > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote: > > > > > On Mon, 23 Jul 2018 12:59:56 +0800 > > > > > Tiwei Bie <tiwei.bie@intel.com> wrote: > > > > > > > > > [...] > > > > > > > > > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev, > > > > > > + int *fd) > > > > > > +{ > > > > > > + struct vhost_user *u = dev->opaque; > > > > > > + VhostUserState *user = u->user; > > > > > > + VirtIODevice *vdev = dev->vdev; > > > > > > + int groupfd = fd[0]; > > > > > > + VFIOGroup *group; > > > > > > + > > > > > > + if (!virtio_has_feature(dev->protocol_features, > > > > > > + VHOST_USER_PROTOCOL_F_VFIO_GROUP) || > > > > > > + vdev == NULL) { > > > > > > + return -1; > > > > > > + } > > > > > > + > > > > > > + if (user->vfio_group) { > > > > > > + vfio_put_group(user->vfio_group); > > > > > > + user->vfio_group = NULL; > > > > > > + } > > > > > > + > > > > > > > > There should be a below check here (I missed it in this > > > > patch, sorry..): > > > > > > > > if (groupfd < 0) { > > > > return 0; > > > > } > > > > > > > > > > + group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL); > > > > > > + if (group == NULL) { > > > > > > + return -1; > > > > > > + } > > > > > > + > > > > > > + if (group->fd != groupfd) { > > > > > > + close(groupfd); > > > > > > + } > > > > > > + > > > > > > + user->vfio_group = group; > > > > > > + fd[0] = -1; > > > > > > + > > > > > > + return 0; > > > > > > +} > > > > > > > > > > This all looks very sketchy, we're reaching into vfio internal state > > > > > and arbitrarily releasing data structures, reusing existing ones, and > > > > > maybe creating new ones. We know that a vfio group only allows a > > > > > single open, right? > > > > > > > > Yeah, exactly! > > > > > > > > When the vhost-user backend process has opened a VFIO group fd, > > > > QEMU won't be able to open this group file via open() any more. > > > > So vhost-user backend process will share the VFIO group fd to > > > > QEMU via the UNIX socket. And that's why we're introducing a > > > > new API: > > > > > > > > vfio_get_group_from_fd(groupfd, ...); > > > > > > > > for vfio/common.c to get (setup) a VFIOGroup structure. (Because > > > > in this case, vfio/common.c can't open this group file via open(), > > > > and what we have is a VFIO group fd shared by another process via > > > > UNIX socket). And ... > > > > > > > > > So if you have a groupfd, vfio will never have > > > > > that same group opened already. > > > > > > > > ... if the same vhost-user backend process shares the same VFIO > > > > group fd more than one times via different vhost-user slave channels > > > > to different vhost-user frontends in the same QEMU process, the > > > > same VFIOGroup could exist already. > > > > > > > > This case could happen when e.g. there are two vhost accelerator > > > > devices in the same VFIO group, and they are managed by the same > > > > vhost-user backend process, and used to accelerate two different > > > > virtio devices in QEMU. In this case, the same VFIO group fd could > > > > be sent twice. > > > > > > > > If we need to support this case, more changes in vfio/common.c > > > > will be needed, e.g. 
1) add a refcnt in VFIOGroup to support > > > > getting and putting a VFIOGroup structure multiple times, > > > > 2) add a lock to make vfio_get_group*() and vfio_put_group() > > > > thread-safe. > > > > > > This is the sort of case where it seems like we're just grabbing > > > internal interfaces and using them from other modules. VFIOGroups have > > > a device_list and currently handles multiple puts by testing whether > > > that device list is empty (ie. we already have a refcnt effectively). > > > Now we have a group user that has no devices, which is not something > > > that it seems like we've designed or audited the code for handling. > > > IOW, while the header file lives in a common location, it's not really > > > intended to be an API within QEMU and it makes me a bit uncomfortable > > > to use it as such. > > > > > > > > Is that the goal? It's not obvious in > > > > > the code. I don't really understand what vhost goes on to do with this > > > > > group, > > > > > > > > The goal is that, we have a VFIO group opened by an external > > > > vhost-user backend process, this process will manage the VFIO > > > > device, but want QEMU to manage the VFIO group, including: > > > > > > > > 1 - program the DMA mappings for this VFIO group based on > > > > the front-end device's dma_as in QEMU. > > > > > > > > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD). > > > > > > > > Vhost-user in QEMU as the vhost-user frontend just wants > > > > hw/vfio to do above things after receiving a VFIO group fd > > > > from the vhost-user backend process. > > > > > > > > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup > > > > by calling vfio_get_group_from_fd() or put this VFIOGroup by > > > > calling vfio_put_group() when it's not needed anymore, and > > > > won't do anything else. > > > > > > > > Vhost-user will only deal with the VFIOGroups allocated by > > > > itself, and won't influence any other VFIOGroups used by the > > > > VFIO passthru devices (-device vfio-pci). Because the same > > > > VFIO group file can only be opened by one process via open(). > > > > > > But there's nothing here that guarantees the reverse. vhost cannot > > > open the group of an existing vfio-pci device, but vfio-pci can re-use > > > the group that vhost has already opened. This is again where it seems > > > like we're trying to cherry pick from an internal API and leaving some > > > loose ends dangling. > > > > > > > > but I'm pretty sure the existing state machine in vfio isn't > > > > > designed for it. Thanks, > > > > > > > > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to > > > > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user > > > > is going to use VFIOGroup and how we are going to extend hw/vfio) > > > > acceptable to you? Thanks! > > > > > > I certainly would not want to duplicate managing the group, container, > > > and MemoryListener, but I think we need a more formal interface with at > > > least some notion of reference counting beyond the device list or > > > explicit knowledge of an external user. > > > > Got it! I'll try to address this. Thanks! > > > > > I'm also curious if it really > > > makes sense that the vhost backend is opening the group rather than > > > letting QEMU do it. It seems that vhost wants to deal with the device > > > and we can never have symmetry in the scenario above, where a group > > > might contain multiple devices and order matters because of where the > > > group is opened. 
If we still had a notion of a VFIODevice for an > > > external user, then the device_list refcnt would just work. Thanks, > > > > Vhost-user backend will but QEMU won't open the VFIO > > fds, because QEMU just has a vhost-user UNIX socket > > path that it should connect to or listen on. So QEMU > > doesn't know which group and device it should open. > > The vhost accelerator devices are probed and managed > > in the vhost-user backend process. And the vhost-user > > backend process will bind the vhost-user instances > > and vhost accelerator devices based on some sort of > > user's input. > > > > Best regards, > > Tiwei Bie > > Is the reason it's done like this because the > backend is sharing the device with multiple QEMUs? Each QEMU instance that gets passed a vfio group would attempt to add it to a vfio container (ie. IOMMU domain), so this mechanism cannot support multiple QEMU instances attached to the same group. Beyond that, the address space of each device would need to be unique within the instance as we only have a single IOVA space for the group. Thanks, Alex
On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote: > On Mon, Jul 30, 2018 at 04:10:04PM +0800, Tiwei Bie wrote: > > On Fri, Jul 27, 2018 at 02:03:00PM -0600, Alex Williamson wrote: > > > On Fri, 27 Jul 2018 09:58:05 +0800 > > > Tiwei Bie <tiwei.bie@intel.com> wrote: > > > > > > > On Thu, Jul 26, 2018 at 02:45:39PM -0600, Alex Williamson wrote: > > > > > On Mon, 23 Jul 2018 12:59:56 +0800 > > > > > Tiwei Bie <tiwei.bie@intel.com> wrote: > > > > > > > > > [...] > > > > > > > > > > > > +static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev, > > > > > > + int *fd) > > > > > > +{ > > > > > > + struct vhost_user *u = dev->opaque; > > > > > > + VhostUserState *user = u->user; > > > > > > + VirtIODevice *vdev = dev->vdev; > > > > > > + int groupfd = fd[0]; > > > > > > + VFIOGroup *group; > > > > > > + > > > > > > + if (!virtio_has_feature(dev->protocol_features, > > > > > > + VHOST_USER_PROTOCOL_F_VFIO_GROUP) || > > > > > > + vdev == NULL) { > > > > > > + return -1; > > > > > > + } > > > > > > + > > > > > > + if (user->vfio_group) { > > > > > > + vfio_put_group(user->vfio_group); > > > > > > + user->vfio_group = NULL; > > > > > > + } > > > > > > + > > > > > > > > There should be a below check here (I missed it in this > > > > patch, sorry..): > > > > > > > > if (groupfd < 0) { > > > > return 0; > > > > } > > > > > > > > > > + group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL); > > > > > > + if (group == NULL) { > > > > > > + return -1; > > > > > > + } > > > > > > + > > > > > > + if (group->fd != groupfd) { > > > > > > + close(groupfd); > > > > > > + } > > > > > > + > > > > > > + user->vfio_group = group; > > > > > > + fd[0] = -1; > > > > > > + > > > > > > + return 0; > > > > > > +} > > > > > > > > > > This all looks very sketchy, we're reaching into vfio internal state > > > > > and arbitrarily releasing data structures, reusing existing ones, and > > > > > maybe creating new ones. We know that a vfio group only allows a > > > > > single open, right? > > > > > > > > Yeah, exactly! > > > > > > > > When the vhost-user backend process has opened a VFIO group fd, > > > > QEMU won't be able to open this group file via open() any more. > > > > So vhost-user backend process will share the VFIO group fd to > > > > QEMU via the UNIX socket. And that's why we're introducing a > > > > new API: > > > > > > > > vfio_get_group_from_fd(groupfd, ...); > > > > > > > > for vfio/common.c to get (setup) a VFIOGroup structure. (Because > > > > in this case, vfio/common.c can't open this group file via open(), > > > > and what we have is a VFIO group fd shared by another process via > > > > UNIX socket). And ... > > > > > > > > > So if you have a groupfd, vfio will never have > > > > > that same group opened already. > > > > > > > > ... if the same vhost-user backend process shares the same VFIO > > > > group fd more than one times via different vhost-user slave channels > > > > to different vhost-user frontends in the same QEMU process, the > > > > same VFIOGroup could exist already. > > > > > > > > This case could happen when e.g. there are two vhost accelerator > > > > devices in the same VFIO group, and they are managed by the same > > > > vhost-user backend process, and used to accelerate two different > > > > virtio devices in QEMU. In this case, the same VFIO group fd could > > > > be sent twice. > > > > > > > > If we need to support this case, more changes in vfio/common.c > > > > will be needed, e.g. 
1) add a refcnt in VFIOGroup to support > > > > getting and putting a VFIOGroup structure multiple times, > > > > 2) add a lock to make vfio_get_group*() and vfio_put_group() > > > > thread-safe. > > > > > > This is the sort of case where it seems like we're just grabbing > > > internal interfaces and using them from other modules. VFIOGroups have > > > a device_list and currently handles multiple puts by testing whether > > > that device list is empty (ie. we already have a refcnt effectively). > > > Now we have a group user that has no devices, which is not something > > > that it seems like we've designed or audited the code for handling. > > > IOW, while the header file lives in a common location, it's not really > > > intended to be an API within QEMU and it makes me a bit uncomfortable > > > to use it as such. > > > > > > > > Is that the goal? It's not obvious in > > > > > the code. I don't really understand what vhost goes on to do with this > > > > > group, > > > > > > > > The goal is that, we have a VFIO group opened by an external > > > > vhost-user backend process, this process will manage the VFIO > > > > device, but want QEMU to manage the VFIO group, including: > > > > > > > > 1 - program the DMA mappings for this VFIO group based on > > > > the front-end device's dma_as in QEMU. > > > > > > > > 2 - add this VFIO group to KVM (KVM_DEV_VFIO_GROUP_ADD). > > > > > > > > Vhost-user in QEMU as the vhost-user frontend just wants > > > > hw/vfio to do above things after receiving a VFIO group fd > > > > from the vhost-user backend process. > > > > > > > > Vhost-user will just ask hw/vfio to get (setup) a VFIOGroup > > > > by calling vfio_get_group_from_fd() or put this VFIOGroup by > > > > calling vfio_put_group() when it's not needed anymore, and > > > > won't do anything else. > > > > > > > > Vhost-user will only deal with the VFIOGroups allocated by > > > > itself, and won't influence any other VFIOGroups used by the > > > > VFIO passthru devices (-device vfio-pci). Because the same > > > > VFIO group file can only be opened by one process via open(). > > > > > > But there's nothing here that guarantees the reverse. vhost cannot > > > open the group of an existing vfio-pci device, but vfio-pci can re-use > > > the group that vhost has already opened. This is again where it seems > > > like we're trying to cherry pick from an internal API and leaving some > > > loose ends dangling. > > > > > > > > but I'm pretty sure the existing state machine in vfio isn't > > > > > designed for it. Thanks, > > > > > > > > Yeah, more changes (e.g. refcnt, ..) in hw/vfio are needed to > > > > make it work robustly. Hmm.. Is above idea (e.g. how vhost-user > > > > is going to use VFIOGroup and how we are going to extend hw/vfio) > > > > acceptable to you? Thanks! > > > > > > I certainly would not want to duplicate managing the group, container, > > > and MemoryListener, but I think we need a more formal interface with at > > > least some notion of reference counting beyond the device list or > > > explicit knowledge of an external user. > > > > Got it! I'll try to address this. Thanks! > > > > > I'm also curious if it really > > > makes sense that the vhost backend is opening the group rather than > > > letting QEMU do it. It seems that vhost wants to deal with the device > > > and we can never have symmetry in the scenario above, where a group > > > might contain multiple devices and order matters because of where the > > > group is opened. 
If we still had a notion of a VFIODevice for an > > > external user, then the device_list refcnt would just work. Thanks, > > > > Vhost-user backend will but QEMU won't open the VFIO > > fds, because QEMU just has a vhost-user UNIX socket > > path that it should connect to or listen on. So QEMU > > doesn't know which group and device it should open. > > The vhost accelerator devices are probed and managed > > in the vhost-user backend process. And the vhost-user > > backend process will bind the vhost-user instances > > and vhost accelerator devices based on some sort of > > user's input. > > > > Best regards, > > Tiwei Bie > > Is the reason it's done like this because the > backend is sharing the device with multiple QEMUs? IMO, in the vhost-user based approach, it's natural for QEMU users to just specify a vhost-user UNIX socket path when launching the QEMU. And the vhost-user backend behind this socket path can be implemented purely in software or accelerated by hardware, which is transparent to the QEMU users. If we want an approach that QEMU users need to specify the vhost accelerate device (similar to the case that users specify the vhost-user socket path) when launching the QEMU, we may not want to do it over vhost-user. Best regards, Tiwei Bie > > I generally wonder how are restarts of the backend handled > with this approach: closing the VFIO device tends to reset > the whole device. > > -- > MST
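To make the points raised in the message above concrete, here is a minimal sketch of the slave handler with the `if (groupfd < 0)` check Tiwei says is missing from the RFC. The reference counting described in the comments is the hypothetical hw/vfio extension discussed above (a refcnt plus a lock behind vfio_get_group_from_fd()/vfio_put_group()), not existing QEMU code, and is only hinted at here:

static int vhost_user_slave_handle_vfio_group(struct vhost_dev *dev,
                                              int *fd)
{
    struct vhost_user *u = dev->opaque;
    VhostUserState *user = u->user;
    VirtIODevice *vdev = dev->vdev;
    int groupfd = fd[0];
    VFIOGroup *group;

    if (!virtio_has_feature(dev->protocol_features,
                            VHOST_USER_PROTOCOL_F_VFIO_GROUP) ||
        vdev == NULL) {
        return -1;
    }

    /* Drop the reference on any group previously shared on this channel. */
    if (user->vfio_group) {
        vfio_put_group(user->vfio_group);
        user->vfio_group = NULL;
    }

    /* The check missing from the RFC: a message without a file
     * descriptor only detaches the previously shared group. */
    if (groupfd < 0) {
        return 0;
    }

    /* Hypothetically, vfio_get_group_from_fd() would take a group lock and
     * either return an existing VFIOGroup for this fd with its refcnt
     * bumped, or set up a new one bound to the virtio device's DMA address
     * space, so the same group fd can be shared over several slave
     * channels of the same backend. */
    group = vfio_get_group_from_fd(groupfd, vdev->dma_as, NULL);
    if (group == NULL) {
        return -1;
    }

    if (group->fd != groupfd) {
        close(groupfd);     /* the group already has an equivalent fd open */
    }

    user->vfio_group = group;
    fd[0] = -1;             /* ownership taken; slave_read() must not close it */

    return 0;
}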
On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote: [...] > > I generally wonder how are restarts of the backend handled > with this approach: closing the VFIO device tends to reset > the whole device.

Hi Michael,

I missed this comment previously. This is a good point! In this RFC, before sending the VFIO group fd to QEMU, the backend needs to close the VFIO device and unset the VFIO container first. Otherwise, QEMU won't be able to set the VFIO container for the VFIO group.

Another option is to share the container fd instead of the group fd with QEMU. In this case, the backend won't need to close any fd. But there is one problem: it's hard to unmap the old mappings, especially when QEMU crashes.

Do you have any suggestions? Thanks!

Best regards, Tiwei Bie
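A minimal backend-side sketch of the first option described above, assuming the slave channel is a UNIX socket and the message uses the standard vhost-user header (request/flags/size) with no payload, as the spec addition in this series defines; slave_fd, device_fd and group_fd are illustrative names, not part of the RFC:

#include <linux/vfio.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

#define VHOST_USER_SLAVE_VFIO_GROUP_MSG 4
#define VHOST_USER_VERSION              0x1

/* Release the VFIO device and container in the backend, then pass the
 * group fd to QEMU over the vhost-user slave channel as ancillary data. */
static int backend_share_vfio_group(int slave_fd, int device_fd, int group_fd)
{
    uint32_t hdr[3] = { VHOST_USER_SLAVE_VFIO_GROUP_MSG,  /* request */
                        VHOST_USER_VERSION,               /* flags   */
                        0 };                              /* no payload */
    char ctrl[CMSG_SPACE(sizeof(int))];
    struct iovec iov = { .iov_base = hdr, .iov_len = sizeof(hdr) };
    struct msghdr msg = {
        .msg_iov = &iov, .msg_iovlen = 1,
        .msg_control = ctrl, .msg_controllen = sizeof(ctrl),
    };
    struct cmsghdr *cmsg;

    close(device_fd);                             /* note: resets the device */
    ioctl(group_fd, VFIO_GROUP_UNSET_CONTAINER);  /* let QEMU attach its own */

    memset(ctrl, 0, sizeof(ctrl));
    cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
    memcpy(CMSG_DATA(cmsg), &group_fd, sizeof(int));

    return sendmsg(slave_fd, &msg, 0) < 0 ? -1 : 0;
}

This also shows the cost Michael points out below: closing the device fd resets the device, so any state in the accelerator is lost across the handover.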
On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote: > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote: > [...] > > > > I generally wonder how are restarts of the backend handled > > with this approach: closing the VFIO device tends to reset > > the whole device. > > Hi Michael, > > I missed this comment previously.. This is a good point! > In this RFC, before sending the VFIO group fd to QEMU, > backend needs to close the VFIO device and unset the VFIO > container first. Otherwise, QEMU won't be able to set the > VFIO container for the VFIO group. > > Another option is to share the container fd instead of > the group fd to QEMU. In this case, backend won't need > to close any fd. But there is one problem that, it's > hard to unmap the old mappings, especially when QEMU > crashes. What are these old mappings and who creates them? If you want to just reset everything the way it was on open, surely it would be easy to add such a reset ioctl. > Do you have any suggestions? Thanks! > > Best regards, > Tiwei Bie Donnu. Alex, any thoughts? Which approach would you prefer?
On Wed, 12 Sep 2018 12:14:44 -0400 "Michael S. Tsirkin" <mst@redhat.com> wrote: > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote: > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote: > > [...] > > > > > > I generally wonder how are restarts of the backend handled > > > with this approach: closing the VFIO device tends to reset > > > the whole device. > > > > Hi Michael, > > > > I missed this comment previously.. This is a good point! > > In this RFC, before sending the VFIO group fd to QEMU, > > backend needs to close the VFIO device and unset the VFIO > > container first. Otherwise, QEMU won't be able to set the > > VFIO container for the VFIO group. > > > > Another option is to share the container fd instead of > > the group fd to QEMU. In this case, backend won't need > > to close any fd. But there is one problem that, it's > > hard to unmap the old mappings, especially when QEMU > > crashes. > > What are these old mappings and who creates them? > If you want to just reset everything the way it was > on open, surely it would be easy to add such a reset ioctl. > > > Do you have any suggestions? Thanks! > > > > Best regards, > > Tiwei Bie > > Donnu. Alex, any thoughts? Which approach would you prefer? The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires that an unmap does not bisect previous mappings, ie. a previous mapping cannot be partially unmapped. Therefore you can already dump the entire IOVA space for a container with one UNMAP_DMA call, iova = 0, size = (u64)-1. Thanks, Alex
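For reference, a sketch of that single call against the existing type1 uAPI (struct vfio_iommu_type1_dma_unmap and VFIO_IOMMU_UNMAP_DMA). Alex's example uses size = (u64)-1; here the size is rounded down to a page boundary, assuming a 4 KiB minimum IOMMU page size, since type1 expects page-aligned sizes, so the very top of the IOVA space is still not covered, as discussed next:

#include <linux/vfio.h>
#include <stdint.h>
#include <sys/ioctl.h>

/* Drop every mapping in a type1 container with one UNMAP_DMA call,
 * starting at iova 0 and spanning (almost) the whole 64-bit space. */
static int unmap_whole_container(int container_fd)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova  = 0,
        .size  = (uint64_t)-4096,   /* 0xfffffffffffff000 */
    };

    return ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
}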
On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote: > On Wed, 12 Sep 2018 12:14:44 -0400 > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote: > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote: > > > [...] > > > > > > > > I generally wonder how are restarts of the backend handled > > > > with this approach: closing the VFIO device tends to reset > > > > the whole device. > > > > > > Hi Michael, > > > > > > I missed this comment previously.. This is a good point! > > > In this RFC, before sending the VFIO group fd to QEMU, > > > backend needs to close the VFIO device and unset the VFIO > > > container first. Otherwise, QEMU won't be able to set the > > > VFIO container for the VFIO group. > > > > > > Another option is to share the container fd instead of > > > the group fd to QEMU. In this case, backend won't need > > > to close any fd. But there is one problem that, it's > > > hard to unmap the old mappings, especially when QEMU > > > crashes. > > > > What are these old mappings and who creates them? > > If you want to just reset everything the way it was > > on open, surely it would be easy to add such a reset ioctl. > > > > > Do you have any suggestions? Thanks! > > > > > > Best regards, > > > Tiwei Bie > > > > Donnu. Alex, any thoughts? Which approach would you prefer? > > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires > that an unmap does not bisect previous mappings, ie. a previous mapping > cannot be partially unmapped. Therefore you can already dump the > entire IOVA space for a container with one UNMAP_DMA call, iova = 0, > size = (u64)-1. Thanks, > > Alex Hmm this would exclude the last byte (address (u64)-1). VTD does not support such iova values for now but something to keep in mind e.g. for virtio-iommu with nested virt which does.
On Wed, 12 Sep 2018 12:44:15 -0400 "Michael S. Tsirkin" <mst@redhat.com> wrote: > On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote: > > On Wed, 12 Sep 2018 12:14:44 -0400 > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote: > > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote: > > > > [...] > > > > > > > > > > I generally wonder how are restarts of the backend handled > > > > > with this approach: closing the VFIO device tends to reset > > > > > the whole device. > > > > > > > > Hi Michael, > > > > > > > > I missed this comment previously.. This is a good point! > > > > In this RFC, before sending the VFIO group fd to QEMU, > > > > backend needs to close the VFIO device and unset the VFIO > > > > container first. Otherwise, QEMU won't be able to set the > > > > VFIO container for the VFIO group. > > > > > > > > Another option is to share the container fd instead of > > > > the group fd to QEMU. In this case, backend won't need > > > > to close any fd. But there is one problem that, it's > > > > hard to unmap the old mappings, especially when QEMU > > > > crashes. > > > > > > What are these old mappings and who creates them? > > > If you want to just reset everything the way it was > > > on open, surely it would be easy to add such a reset ioctl. > > > > > > > Do you have any suggestions? Thanks! > > > > > > > > Best regards, > > > > Tiwei Bie > > > > > > Donnu. Alex, any thoughts? Which approach would you prefer? > > > > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires > > that an unmap does not bisect previous mappings, ie. a previous mapping > > cannot be partially unmapped. Therefore you can already dump the > > entire IOVA space for a container with one UNMAP_DMA call, iova = 0, > > size = (u64)-1. Thanks, > > > > Alex > > Hmm this would exclude the last byte (address (u64)-1). > VTD does not support such iova values for now but something > to keep in mind e.g. for virtio-iommu with nested virt > which does. Yes, technically you'd need another ioctl to get the last byte, the ioctls should have used start and end rather than start and size, but it's too late to change. Thanks, Alex
On Wed, Sep 12, 2018 at 11:15:32AM -0600, Alex Williamson wrote: > On Wed, 12 Sep 2018 12:44:15 -0400 > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote: > > > On Wed, 12 Sep 2018 12:14:44 -0400 > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote: > > > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote: > > > > > [...] > > > > > > > > > > > > I generally wonder how are restarts of the backend handled > > > > > > with this approach: closing the VFIO device tends to reset > > > > > > the whole device. > > > > > > > > > > Hi Michael, > > > > > > > > > > I missed this comment previously.. This is a good point! > > > > > In this RFC, before sending the VFIO group fd to QEMU, > > > > > backend needs to close the VFIO device and unset the VFIO > > > > > container first. Otherwise, QEMU won't be able to set the > > > > > VFIO container for the VFIO group. > > > > > > > > > > Another option is to share the container fd instead of > > > > > the group fd to QEMU. In this case, backend won't need > > > > > to close any fd. But there is one problem that, it's > > > > > hard to unmap the old mappings, especially when QEMU > > > > > crashes. > > > > > > > > What are these old mappings and who creates them? > > > > If you want to just reset everything the way it was > > > > on open, surely it would be easy to add such a reset ioctl. > > > > > > > > > Do you have any suggestions? Thanks! > > > > > > > > > > Best regards, > > > > > Tiwei Bie > > > > > > > > Donnu. Alex, any thoughts? Which approach would you prefer? > > > > > > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires > > > that an unmap does not bisect previous mappings, ie. a previous mapping > > > cannot be partially unmapped. Therefore you can already dump the > > > entire IOVA space for a container with one UNMAP_DMA call, iova = 0, > > > size = (u64)-1. Thanks, > > > > > > Alex > > > > Hmm this would exclude the last byte (address (u64)-1). > > VTD does not support such iova values for now but something > > to keep in mind e.g. for virtio-iommu with nested virt > > which does. > > Yes, technically you'd need another ioctl to get the last byte, It will violate the requirement of not unmapping part of a region, won't it? So it won't work. > the > ioctls should have used start and end rather than start and size, but > it's too late to change. Thanks, > > Alex At some point we'll have to fix above issue in some way. Maybe when we add support for atomic remaps.
On Wed, 12 Sep 2018 13:29:33 -0400 "Michael S. Tsirkin" <mst@redhat.com> wrote: > On Wed, Sep 12, 2018 at 11:15:32AM -0600, Alex Williamson wrote: > > On Wed, 12 Sep 2018 12:44:15 -0400 > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote: > > > > On Wed, 12 Sep 2018 12:14:44 -0400 > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote: > > > > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin wrote: > > > > > > [...] > > > > > > > > > > > > > > I generally wonder how are restarts of the backend handled > > > > > > > with this approach: closing the VFIO device tends to reset > > > > > > > the whole device. > > > > > > > > > > > > Hi Michael, > > > > > > > > > > > > I missed this comment previously.. This is a good point! > > > > > > In this RFC, before sending the VFIO group fd to QEMU, > > > > > > backend needs to close the VFIO device and unset the VFIO > > > > > > container first. Otherwise, QEMU won't be able to set the > > > > > > VFIO container for the VFIO group. > > > > > > > > > > > > Another option is to share the container fd instead of > > > > > > the group fd to QEMU. In this case, backend won't need > > > > > > to close any fd. But there is one problem that, it's > > > > > > hard to unmap the old mappings, especially when QEMU > > > > > > crashes. > > > > > > > > > > What are these old mappings and who creates them? > > > > > If you want to just reset everything the way it was > > > > > on open, surely it would be easy to add such a reset ioctl. > > > > > > > > > > > Do you have any suggestions? Thanks! > > > > > > > > > > > > Best regards, > > > > > > Tiwei Bie > > > > > > > > > > Donnu. Alex, any thoughts? Which approach would you prefer? > > > > > > > > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only requires > > > > that an unmap does not bisect previous mappings, ie. a previous mapping > > > > cannot be partially unmapped. Therefore you can already dump the > > > > entire IOVA space for a container with one UNMAP_DMA call, iova = 0, > > > > size = (u64)-1. Thanks, > > > > > > > > Alex > > > > > > Hmm this would exclude the last byte (address (u64)-1). > > > VTD does not support such iova values for now but something > > > to keep in mind e.g. for virtio-iommu with nested virt > > > which does. > > > > Yes, technically you'd need another ioctl to get the last byte, > > It will violate the requirement of not unmapping part of a region, > won't it? So it won't work. Yes, in theory it's more complicated, but in practice it's not because Intel only supports up to 48-bits of IOVA space (modulo round down by a page, not a byte). It would also be deterministic if we could ever get past the RMRR issues with IGD to implement an IOVA list as every IOMMU (AFAIK) has some reserved ranges that cannot be mapped and would bisect the IOVA space somewhere. > > the > > ioctls should have used start and end rather than start and size, but > > it's too late to change. Thanks, > > > > Alex > > At some point we'll have to fix above issue in some way. Maybe when we > add support for atomic remaps. "Patches welcome". Thanks, Alex
> From: Alex Williamson > Sent: Thursday, September 13, 2018 2:10 AM > > On Wed, 12 Sep 2018 13:29:33 -0400 > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > On Wed, Sep 12, 2018 at 11:15:32AM -0600, Alex Williamson wrote: > > > On Wed, 12 Sep 2018 12:44:15 -0400 > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > On Wed, Sep 12, 2018 at 10:34:43AM -0600, Alex Williamson wrote: > > > > > On Wed, 12 Sep 2018 12:14:44 -0400 > > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote: > > > > > > > > > > > On Wed, Sep 12, 2018 at 04:04:00PM +0800, Tiwei Bie wrote: > > > > > > > On Mon, Jul 30, 2018 at 12:30:58PM +0300, Michael S. Tsirkin > wrote: > > > > > > > [...] > > > > > > > > > > > > > > > > I generally wonder how are restarts of the backend handled > > > > > > > > with this approach: closing the VFIO device tends to reset > > > > > > > > the whole device. > > > > > > > > > > > > > > Hi Michael, > > > > > > > > > > > > > > I missed this comment previously.. This is a good point! > > > > > > > In this RFC, before sending the VFIO group fd to QEMU, > > > > > > > backend needs to close the VFIO device and unset the VFIO > > > > > > > container first. Otherwise, QEMU won't be able to set the > > > > > > > VFIO container for the VFIO group. > > > > > > > > > > > > > > Another option is to share the container fd instead of > > > > > > > the group fd to QEMU. In this case, backend won't need > > > > > > > to close any fd. But there is one problem that, it's > > > > > > > hard to unmap the old mappings, especially when QEMU > > > > > > > crashes. > > > > > > > > > > > > What are these old mappings and who creates them? > > > > > > If you want to just reset everything the way it was > > > > > > on open, surely it would be easy to add such a reset ioctl. > > > > > > > > > > > > > Do you have any suggestions? Thanks! > > > > > > > > > > > > > > Best regards, > > > > > > > Tiwei Bie > > > > > > > > > > > > Donnu. Alex, any thoughts? Which approach would you prefer? > > > > > > > > > > The existing UNMAP_DMA ioctl for the vfio type1 IOMMU only > requires > > > > > that an unmap does not bisect previous mappings, ie. a previous > mapping > > > > > cannot be partially unmapped. Therefore you can already dump the > > > > > entire IOVA space for a container with one UNMAP_DMA call, iova = > 0, > > > > > size = (u64)-1. Thanks, > > > > > > > > > > Alex > > > > > > > > Hmm this would exclude the last byte (address (u64)-1). > > > > VTD does not support such iova values for now but something > > > > to keep in mind e.g. for virtio-iommu with nested virt > > > > which does. > > > > > > Yes, technically you'd need another ioctl to get the last byte, > > > > It will violate the requirement of not unmapping part of a region, > > won't it? So it won't work. > > Yes, in theory it's more complicated, but in practice it's not because > Intel only supports up to 48-bits of IOVA space (modulo round down by > a page, not a byte). It would also be deterministic if we could ever > get past the RMRR issues with IGD to implement an IOVA list as every > IOMMU (AFAIK) has some reserved ranges that cannot be mapped and > would > bisect the IOVA space somewhere. > One correction - Intel VT-d supports 5-level paging now, which gives 57-bits IOVA space. though this comment doesn't affect your discussion... :-) Thanks Kevin