Message ID | 20210315194842.277740-12-eperezma@redhat.com (mailing list archive)
---|---
State | New, archived
Series | vDPA software assisted live migration
在 2021/3/16 上午3:48, Eugenio Pérez 写道: > Initial version of shadow virtqueue that actually forward buffers. > > It reuses the VirtQueue code for the device part. The driver part is > based on Linux's virtio_ring driver, but with stripped functionality > and optimizations so it's easier to review. > > These will be added in later commits. > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > --- > hw/virtio/vhost-shadow-virtqueue.c | 212 +++++++++++++++++++++++++++-- > hw/virtio/vhost.c | 113 ++++++++++++++- > 2 files changed, 312 insertions(+), 13 deletions(-) > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c > index 1460d1d5d1..68ed0f2740 100644 > --- a/hw/virtio/vhost-shadow-virtqueue.c > +++ b/hw/virtio/vhost-shadow-virtqueue.c > @@ -9,6 +9,7 @@ > > #include "hw/virtio/vhost-shadow-virtqueue.h" > #include "hw/virtio/vhost.h" > +#include "hw/virtio/virtio-access.h" > > #include "standard-headers/linux/vhost_types.h" > > @@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue { > /* Virtio device */ > VirtIODevice *vdev; > > + /* Map for returning guest's descriptors */ > + VirtQueueElement **ring_id_maps; > + > + /* Next head to expose to device */ > + uint16_t avail_idx_shadow; > + > + /* Next free descriptor */ > + uint16_t free_head; > + > + /* Last seen used idx */ > + uint16_t shadow_used_idx; > + > + /* Next head to consume from device */ > + uint16_t used_idx; > + > /* Descriptors copied from guest */ > vring_desc_t descs[]; > } VhostShadowVirtqueue; > > -/* Forward guest notifications */ > +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, > + const struct iovec *iovec, > + size_t num, bool more_descs, bool write) > +{ > + uint16_t i = svq->free_head, last = svq->free_head; > + unsigned n; > + uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0; > + vring_desc_t *descs = svq->vring.desc; > + > + if (num == 0) { > + return; > + } > + > + for (n = 0; n < num; n++) { > + if (more_descs || (n + 1 < num)) { > + descs[i].flags = flags | virtio_tswap16(svq->vdev, > + VRING_DESC_F_NEXT); > + } else { > + descs[i].flags = flags; > + } > + descs[i].addr = virtio_tswap64(svq->vdev, (hwaddr)iovec[n].iov_base); So unsing virtio_tswap() is probably not correct since we're talking with vhost backends which has its own endiness. For vhost-vDPA, we can assume that it's a 1.0 device. > + descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len); > + > + last = i; > + i = virtio_tswap16(svq->vdev, descs[i].next); > + } > + > + svq->free_head = virtio_tswap16(svq->vdev, descs[last].next); > +} > + > +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, > + VirtQueueElement *elem) > +{ > + int head; > + unsigned avail_idx; > + vring_avail_t *avail = svq->vring.avail; > + > + head = svq->free_head; > + > + /* We need some descriptors here */ > + assert(elem->out_num || elem->in_num); > + > + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, > + elem->in_num > 0, false); > + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); > + > + /* > + * Put entry in available array (but don't update avail->idx until they > + * do sync). 
> + */ > + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); > + avail->ring[avail_idx] = virtio_tswap16(svq->vdev, head); > + svq->avail_idx_shadow++; > + > + /* Expose descriptors to device */ > + smp_wmb(); > + avail->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow); > + > + return head; > + > +} > + > +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, > + VirtQueueElement *elem) > +{ > + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); > + > + svq->ring_id_maps[qemu_head] = elem; > +} > + > +/* Handle guest->device notifications */ > static void vhost_handle_guest_kick(EventNotifier *n) > { > VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > @@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n) > return; > } > > - event_notifier_set(&svq->kick_notifier); > + /* Make available as many buffers as possible */ > + do { > + if (virtio_queue_get_notification(svq->vq)) { > + /* No more notifications until process all available */ > + virtio_queue_set_notification(svq->vq, false); > + } > + > + while (true) { > + VirtQueueElement *elem; > + if (virtio_queue_full(svq->vq)) { > + break; So we've disabled guest notification. If buffer has been consumed, we need to retry the handle_guest_kick here. But I didn't find the code? > + } > + > + elem = virtqueue_pop(svq->vq, sizeof(*elem)); > + if (!elem) { > + break; > + } > + > + vhost_shadow_vq_add(svq, elem); > + event_notifier_set(&svq->kick_notifier); > + } > + > + virtio_queue_set_notification(svq->vq, true); > + } while (!virtio_queue_empty(svq->vq)); > +} > + > +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) > +{ > + if (svq->used_idx != svq->shadow_used_idx) { > + return true; > + } > + > + /* Get used idx must not be reordered */ > + smp_rmb(); > + svq->shadow_used_idx = virtio_tswap16(svq->vdev, svq->vring.used->idx); > + > + return svq->used_idx != svq->shadow_used_idx; > +} > + > +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) > +{ > + vring_desc_t *descs = svq->vring.desc; > + const vring_used_t *used = svq->vring.used; > + vring_used_elem_t used_elem; > + uint16_t last_used; > + > + if (!vhost_shadow_vq_more_used(svq)) { > + return NULL; > + } > + > + last_used = svq->used_idx & (svq->vring.num - 1); > + used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id); > + used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len); > + > + if (unlikely(used_elem.id >= svq->vring.num)) { > + error_report("Device %s says index %u is available", svq->vdev->name, > + used_elem.id); > + return NULL; > + } > + > + descs[used_elem.id].next = svq->free_head; > + svq->free_head = used_elem.id; > + > + svq->used_idx++; > + svq->ring_id_maps[used_elem.id]->len = used_elem.len; > + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); > } > > /* Forward vhost notifications */ > @@ -78,6 +229,7 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > call_notifier); > EventNotifier *masked_notifier; > + VirtQueue *vq = svq->vq; > > /* Signal start of using masked notifier */ > qemu_event_reset(&svq->masked_notifier.is_free); > @@ -86,14 +238,29 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > qemu_event_set(&svq->masked_notifier.is_free); > } > > - if (!masked_notifier) { > - unsigned n = virtio_get_queue_index(svq->vq); > - virtio_queue_invalidate_signalled_used(svq->vdev, n); > - virtio_notify_irqfd(svq->vdev, 
svq->vq); > - } else if (!svq->masked_notifier.signaled) { > - svq->masked_notifier.signaled = true; > - event_notifier_set(svq->masked_notifier.n); > - } > + /* Make as many buffers as possible used. */ > + do { > + unsigned i = 0; > + > + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ > + while (true) { > + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); > + if (!elem) { > + break; > + } > + > + assert(i < svq->vring.num); > + virtqueue_fill(vq, elem, elem->len, i++); > + } > + > + virtqueue_flush(vq, i); > + if (!masked_notifier) { > + virtio_notify_irqfd(svq->vdev, svq->vq); > + } else if (!svq->masked_notifier.signaled) { > + svq->masked_notifier.signaled = true; > + event_notifier_set(svq->masked_notifier.n); > + } > + } while (vhost_shadow_vq_more_used(svq)); > > if (masked_notifier) { > /* Signal not using it anymore */ > @@ -103,7 +270,6 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > > static void vhost_shadow_vq_handle_call(EventNotifier *n) > { > - > if (likely(event_notifier_test_and_clear(n))) { > vhost_shadow_vq_handle_call_no_test(n); > } > @@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, > unsigned idx, > VhostShadowVirtqueue *svq) > { > + int i; > int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq); > + > + assert(!dev->shadow_vqs_enabled); > + > if (unlikely(r < 0)) { > error_report("Couldn't restore vq kick fd: %s", strerror(-r)); > } > @@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, > /* Restore vhost call */ > vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx, > dev->vqs[idx].notifier_is_masked); > + > + > + for (i = 0; i < svq->vring.num; ++i) { > + g_autofree VirtQueueElement *elem = svq->ring_id_maps[i]; > + /* > + * Although the doc says we must unpop in order, it's ok to unpop > + * everything. > + */ > + if (elem) { > + virtqueue_unpop(svq->vq, elem, elem->len); Shouldn't we need to wait until all pending requests to be drained? Or we may end up duplicated requests? 
Thanks > + } > + } > } > > /* > @@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) > unsigned num = virtio_queue_get_num(dev->vdev, vq_idx); > size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE); > g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) + ring_size); > - int r; > + int r, i; > > r = event_notifier_init(&svq->kick_notifier, 0); > if (r != 0) { > @@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) > vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE); > svq->vq = virtio_get_queue(dev->vdev, vq_idx); > svq->vdev = dev->vdev; > + for (i = 0; i < num - 1; i++) { > + svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1); > + } > + > + svq->ring_id_maps = g_new0(VirtQueueElement *, num); > event_notifier_set_handler(&svq->call_notifier, > vhost_shadow_vq_handle_call); > qemu_event_init(&svq->masked_notifier.is_free, true); > @@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq) > event_notifier_cleanup(&vq->kick_notifier); > event_notifier_set_handler(&vq->call_notifier, NULL); > event_notifier_cleanup(&vq->call_notifier); > + g_free(vq->ring_id_maps); > g_free(vq); > } > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > index eab3e334f2..a373999bc4 100644 > --- a/hw/virtio/vhost.c > +++ b/hw/virtio/vhost.c > @@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) > > trace_vhost_iotlb_miss(dev, 1); > > + if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) { > + uaddr = iova; > + len = 4096; > + ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, > + IOMMU_RW); > + if (ret) { > + trace_vhost_iotlb_miss(dev, 2); > + error_report("Fail to update device iotlb"); > + } > + > + return ret; > + } > + > iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, > iova, write, > MEMTXATTRS_UNSPECIFIED); > @@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > /* Can be read by vhost_virtqueue_mask, from vm exit */ > qatomic_store_release(&dev->shadow_vqs_enabled, false); > > + dev->vhost_ops->vhost_set_vring_enable(dev, false); > + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { > + error_report("Fail to invalidate device iotlb"); > + } > + > for (idx = 0; idx < dev->nvqs; ++idx) { > + /* > + * Update used ring information for IOTLB to work correctly, > + * vhost-kernel code requires for this. > + */ > + struct vhost_virtqueue *vq = dev->vqs + idx; > + vhost_device_iotlb_miss(dev, vq->used_phys, true); > + > vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]); > + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], > + dev->vq_index + idx); > + } > + > + /* Enable guest's vq vring */ > + dev->vhost_ops->vhost_set_vring_enable(dev, true); > + > + for (idx = 0; idx < dev->nvqs; ++idx) { > vhost_shadow_vq_free(dev->shadow_vqs[idx]); > } > > @@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > return 0; > } > > +/* > + * Start shadow virtqueue in a given queue. > + * In failure case, this function leaves queue working as regular vhost mode. 
> + */ > +static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev, > + unsigned idx) > +{ > + struct vhost_vring_addr addr = { > + .index = idx, > + }; > + struct vhost_vring_state s = { > + .index = idx, > + }; > + int r; > + bool ok; > + > + vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); > + ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); > + if (unlikely(!ok)) { > + return false; > + } > + > + /* From this point, vhost_virtqueue_start can reset these changes */ > + vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr); > + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); > + if (unlikely(r != 0)) { > + VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed"); > + goto err; > + } > + > + r = dev->vhost_ops->vhost_set_vring_base(dev, &s); > + if (unlikely(r != 0)) { > + VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed"); > + goto err; > + } > + > + /* > + * Update used ring information for IOTLB to work correctly, > + * vhost-kernel code requires for this. > + */ > + r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true); > + if (unlikely(r != 0)) { > + /* Debug message already printed */ > + goto err; > + } > + > + return true; > + > +err: > + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); > + return false; > +} > + > static int vhost_sw_live_migration_start(struct vhost_dev *dev) > { > int idx, stop_idx; > @@ -1249,24 +1335,35 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev) > } > } > > + dev->vhost_ops->vhost_set_vring_enable(dev, false); > + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { > + error_report("Fail to invalidate device iotlb"); > + } > + > /* Can be read by vhost_virtqueue_mask, from vm exit */ > qatomic_store_release(&dev->shadow_vqs_enabled, true); > for (idx = 0; idx < dev->nvqs; ++idx) { > - bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); > + bool ok = vhost_sw_live_migration_start_vq(dev, idx); > if (unlikely(!ok)) { > goto err_start; > } > } > > + /* Enable shadow vq vring */ > + dev->vhost_ops->vhost_set_vring_enable(dev, true); > return 0; > > err_start: > qatomic_store_release(&dev->shadow_vqs_enabled, false); > for (stop_idx = 0; stop_idx < idx; stop_idx++) { > vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]); > + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], > + dev->vq_index + stop_idx); > } > > err_new: > + /* Enable guest's vring */ > + dev->vhost_ops->vhost_set_vring_enable(dev, true); > for (idx = 0; idx < dev->nvqs; ++idx) { > vhost_shadow_vq_free(dev->shadow_vqs[idx]); > } > @@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp) > > if (!hdev->started) { > err_cause = "Device is not started"; > + } else if (!vhost_dev_has_iommu(hdev)) { > + err_cause = "Does not support iommu"; > + } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { > + err_cause = "Is packed"; > + } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) { > + err_cause = "Have event idx"; > + } else if (hdev->acked_features & > + BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) { > + err_cause = "Supports indirect descriptors"; > + } else if (!hdev->vhost_ops->vhost_set_vring_enable) { > + err_cause = "Cannot pause device"; > + } > + > + if (err_cause) { > goto err; > } >
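[Editor's note: a minimal sketch of the direction Jason's endianness comment points at, assuming the vhost-vDPA backend is a non-transitional (VIRTIO 1.0) device whose rings are defined to be little-endian regardless of guest endianness. In that case QEMU's fixed-endian helpers from "qemu/bswap.h" could be used instead of virtio_tswap*(), which follows the guest-visible device endianness. The helper name below is illustrative and not part of the patch.]

#include "qemu/bswap.h"
#include "standard-headers/linux/virtio_ring.h"

/*
 * Illustrative only: write one shadow descriptor assuming the vhost
 * backend is a VIRTIO 1.0 (non-transitional) device, so the ring is
 * always little-endian no matter what endianness the guest uses.
 */
static void shadow_vring_desc_write_le(vring_desc_t *desc, uint64_t addr,
                                       uint32_t len, uint16_t flags,
                                       uint16_t next)
{
    desc->addr  = cpu_to_le64(addr);
    desc->len   = cpu_to_le32(len);
    desc->flags = cpu_to_le16(flags);
    desc->next  = cpu_to_le16(next);
}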
On Tue, Mar 16, 2021 at 9:15 AM Jason Wang <jasowang@redhat.com> wrote: > > > 在 2021/3/16 上午3:48, Eugenio Pérez 写道: > > Initial version of shadow virtqueue that actually forward buffers. > > > > It reuses the VirtQueue code for the device part. The driver part is > > based on Linux's virtio_ring driver, but with stripped functionality > > and optimizations so it's easier to review. > > > > These will be added in later commits. > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > > --- > > hw/virtio/vhost-shadow-virtqueue.c | 212 +++++++++++++++++++++++++++-- > > hw/virtio/vhost.c | 113 ++++++++++++++- > > 2 files changed, 312 insertions(+), 13 deletions(-) > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c > > index 1460d1d5d1..68ed0f2740 100644 > > --- a/hw/virtio/vhost-shadow-virtqueue.c > > +++ b/hw/virtio/vhost-shadow-virtqueue.c > > @@ -9,6 +9,7 @@ > > > > #include "hw/virtio/vhost-shadow-virtqueue.h" > > #include "hw/virtio/vhost.h" > > +#include "hw/virtio/virtio-access.h" > > > > #include "standard-headers/linux/vhost_types.h" > > > > @@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue { > > /* Virtio device */ > > VirtIODevice *vdev; > > > > + /* Map for returning guest's descriptors */ > > + VirtQueueElement **ring_id_maps; > > + > > + /* Next head to expose to device */ > > + uint16_t avail_idx_shadow; > > + > > + /* Next free descriptor */ > > + uint16_t free_head; > > + > > + /* Last seen used idx */ > > + uint16_t shadow_used_idx; > > + > > + /* Next head to consume from device */ > > + uint16_t used_idx; > > + > > /* Descriptors copied from guest */ > > vring_desc_t descs[]; > > } VhostShadowVirtqueue; > > > > -/* Forward guest notifications */ > > +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, > > + const struct iovec *iovec, > > + size_t num, bool more_descs, bool write) > > +{ > > + uint16_t i = svq->free_head, last = svq->free_head; > > + unsigned n; > > + uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0; > > + vring_desc_t *descs = svq->vring.desc; > > + > > + if (num == 0) { > > + return; > > + } > > + > > + for (n = 0; n < num; n++) { > > + if (more_descs || (n + 1 < num)) { > > + descs[i].flags = flags | virtio_tswap16(svq->vdev, > > + VRING_DESC_F_NEXT); > > + } else { > > + descs[i].flags = flags; > > + } > > + descs[i].addr = virtio_tswap64(svq->vdev, (hwaddr)iovec[n].iov_base); > > > So unsing virtio_tswap() is probably not correct since we're talking > with vhost backends which has its own endiness. > I was trying to expose the buffer with the same endianness as the driver originally offered, so if guest->qemu requires a bswap, I think it will always require a bswap again to expose to the device again. > For vhost-vDPA, we can assume that it's a 1.0 device. Isn't it needed if the host is big endian? 
> > > > + descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len); > > + > > + last = i; > > + i = virtio_tswap16(svq->vdev, descs[i].next); > > + } > > + > > + svq->free_head = virtio_tswap16(svq->vdev, descs[last].next); > > +} > > + > > +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, > > + VirtQueueElement *elem) > > +{ > > + int head; > > + unsigned avail_idx; > > + vring_avail_t *avail = svq->vring.avail; > > + > > + head = svq->free_head; > > + > > + /* We need some descriptors here */ > > + assert(elem->out_num || elem->in_num); > > + > > + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, > > + elem->in_num > 0, false); > > + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); > > + > > + /* > > + * Put entry in available array (but don't update avail->idx until they > > + * do sync). > > + */ > > + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); > > + avail->ring[avail_idx] = virtio_tswap16(svq->vdev, head); > > + svq->avail_idx_shadow++; > > + > > + /* Expose descriptors to device */ > > + smp_wmb(); > > + avail->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow); > > + > > + return head; > > + > > +} > > + > > +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, > > + VirtQueueElement *elem) > > +{ > > + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); > > + > > + svq->ring_id_maps[qemu_head] = elem; > > +} > > + > > +/* Handle guest->device notifications */ > > static void vhost_handle_guest_kick(EventNotifier *n) > > { > > VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > > @@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n) > > return; > > } > > > > - event_notifier_set(&svq->kick_notifier); > > + /* Make available as many buffers as possible */ > > + do { > > + if (virtio_queue_get_notification(svq->vq)) { > > + /* No more notifications until process all available */ > > + virtio_queue_set_notification(svq->vq, false); > > + } > > + > > + while (true) { > > + VirtQueueElement *elem; > > + if (virtio_queue_full(svq->vq)) { > > + break; > > > So we've disabled guest notification. If buffer has been consumed, we > need to retry the handle_guest_kick here. But I didn't find the code? > This code follows the pattern of virtio_blk_handle_vq: we jump out of the inner while, and we re-enable the notifications. After that, we check for updates on guest avail_idx. 
> > > + } > > + > > + elem = virtqueue_pop(svq->vq, sizeof(*elem)); > > + if (!elem) { > > + break; > > + } > > + > > + vhost_shadow_vq_add(svq, elem); > > + event_notifier_set(&svq->kick_notifier); > > + } > > + > > + virtio_queue_set_notification(svq->vq, true); > > + } while (!virtio_queue_empty(svq->vq)); > > +} > > + > > +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) > > +{ > > + if (svq->used_idx != svq->shadow_used_idx) { > > + return true; > > + } > > + > > + /* Get used idx must not be reordered */ > > + smp_rmb(); > > + svq->shadow_used_idx = virtio_tswap16(svq->vdev, svq->vring.used->idx); > > + > > + return svq->used_idx != svq->shadow_used_idx; > > +} > > + > > +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) > > +{ > > + vring_desc_t *descs = svq->vring.desc; > > + const vring_used_t *used = svq->vring.used; > > + vring_used_elem_t used_elem; > > + uint16_t last_used; > > + > > + if (!vhost_shadow_vq_more_used(svq)) { > > + return NULL; > > + } > > + > > + last_used = svq->used_idx & (svq->vring.num - 1); > > + used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id); > > + used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len); > > + > > + if (unlikely(used_elem.id >= svq->vring.num)) { > > + error_report("Device %s says index %u is available", svq->vdev->name, > > + used_elem.id); > > + return NULL; > > + } > > + > > + descs[used_elem.id].next = svq->free_head; > > + svq->free_head = used_elem.id; > > + > > + svq->used_idx++; > > + svq->ring_id_maps[used_elem.id]->len = used_elem.len; > > + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); > > } > > > > /* Forward vhost notifications */ > > @@ -78,6 +229,7 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > > VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > > call_notifier); > > EventNotifier *masked_notifier; > > + VirtQueue *vq = svq->vq; > > > > /* Signal start of using masked notifier */ > > qemu_event_reset(&svq->masked_notifier.is_free); > > @@ -86,14 +238,29 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > > qemu_event_set(&svq->masked_notifier.is_free); > > } > > > > - if (!masked_notifier) { > > - unsigned n = virtio_get_queue_index(svq->vq); > > - virtio_queue_invalidate_signalled_used(svq->vdev, n); > > - virtio_notify_irqfd(svq->vdev, svq->vq); > > - } else if (!svq->masked_notifier.signaled) { > > - svq->masked_notifier.signaled = true; > > - event_notifier_set(svq->masked_notifier.n); > > - } > > + /* Make as many buffers as possible used. 
*/ > > + do { > > + unsigned i = 0; > > + > > + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ > > + while (true) { > > + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); > > + if (!elem) { > > + break; > > + } > > + > > + assert(i < svq->vring.num); > > + virtqueue_fill(vq, elem, elem->len, i++); > > + } > > + > > + virtqueue_flush(vq, i); > > + if (!masked_notifier) { > > + virtio_notify_irqfd(svq->vdev, svq->vq); > > + } else if (!svq->masked_notifier.signaled) { > > + svq->masked_notifier.signaled = true; > > + event_notifier_set(svq->masked_notifier.n); > > + } > > + } while (vhost_shadow_vq_more_used(svq)); > > > > if (masked_notifier) { > > /* Signal not using it anymore */ > > @@ -103,7 +270,6 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > > > > static void vhost_shadow_vq_handle_call(EventNotifier *n) > > { > > - > > if (likely(event_notifier_test_and_clear(n))) { > > vhost_shadow_vq_handle_call_no_test(n); > > } > > @@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, > > unsigned idx, > > VhostShadowVirtqueue *svq) > > { > > + int i; > > int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq); > > + > > + assert(!dev->shadow_vqs_enabled); > > + > > if (unlikely(r < 0)) { > > error_report("Couldn't restore vq kick fd: %s", strerror(-r)); > > } > > @@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, > > /* Restore vhost call */ > > vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx, > > dev->vqs[idx].notifier_is_masked); > > + > > + > > + for (i = 0; i < svq->vring.num; ++i) { > > + g_autofree VirtQueueElement *elem = svq->ring_id_maps[i]; > > + /* > > + * Although the doc says we must unpop in order, it's ok to unpop > > + * everything. > > + */ > > + if (elem) { > > + virtqueue_unpop(svq->vq, elem, elem->len); > > > Shouldn't we need to wait until all pending requests to be drained? Or > we may end up duplicated requests? > Do you mean pending as in-flight/processing in the device? The device must be paused at this point. Currently there is no assertion for this, maybe we can track the device status for it. For the queue handlers to be running at this point, the main event loop should serialize QMP and handlers as far as I know (and they would make all state inconsistent if the device stops suddenly). It would need to be synchronized if the handlers run in their own AIO context. That would be nice to have but it's not included here. 
> Thanks > > > > + } > > + } > > } > > > > /* > > @@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) > > unsigned num = virtio_queue_get_num(dev->vdev, vq_idx); > > size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE); > > g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) + ring_size); > > - int r; > > + int r, i; > > > > r = event_notifier_init(&svq->kick_notifier, 0); > > if (r != 0) { > > @@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) > > vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE); > > svq->vq = virtio_get_queue(dev->vdev, vq_idx); > > svq->vdev = dev->vdev; > > + for (i = 0; i < num - 1; i++) { > > + svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1); > > + } > > + > > + svq->ring_id_maps = g_new0(VirtQueueElement *, num); > > event_notifier_set_handler(&svq->call_notifier, > > vhost_shadow_vq_handle_call); > > qemu_event_init(&svq->masked_notifier.is_free, true); > > @@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq) > > event_notifier_cleanup(&vq->kick_notifier); > > event_notifier_set_handler(&vq->call_notifier, NULL); > > event_notifier_cleanup(&vq->call_notifier); > > + g_free(vq->ring_id_maps); > > g_free(vq); > > } > > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > > index eab3e334f2..a373999bc4 100644 > > --- a/hw/virtio/vhost.c > > +++ b/hw/virtio/vhost.c > > @@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) > > > > trace_vhost_iotlb_miss(dev, 1); > > > > + if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) { > > + uaddr = iova; > > + len = 4096; > > + ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, > > + IOMMU_RW); > > + if (ret) { > > + trace_vhost_iotlb_miss(dev, 2); > > + error_report("Fail to update device iotlb"); > > + } > > + > > + return ret; > > + } > > + > > iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, > > iova, write, > > MEMTXATTRS_UNSPECIFIED); > > @@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > > /* Can be read by vhost_virtqueue_mask, from vm exit */ > > qatomic_store_release(&dev->shadow_vqs_enabled, false); > > > > + dev->vhost_ops->vhost_set_vring_enable(dev, false); > > + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { > > + error_report("Fail to invalidate device iotlb"); > > + } > > + > > for (idx = 0; idx < dev->nvqs; ++idx) { > > + /* > > + * Update used ring information for IOTLB to work correctly, > > + * vhost-kernel code requires for this. > > + */ > > + struct vhost_virtqueue *vq = dev->vqs + idx; > > + vhost_device_iotlb_miss(dev, vq->used_phys, true); > > + > > vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]); > > + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], > > + dev->vq_index + idx); > > + } > > + > > + /* Enable guest's vq vring */ > > + dev->vhost_ops->vhost_set_vring_enable(dev, true); > > + > > + for (idx = 0; idx < dev->nvqs; ++idx) { > > vhost_shadow_vq_free(dev->shadow_vqs[idx]); > > } > > > > @@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > > return 0; > > } > > > > +/* > > + * Start shadow virtqueue in a given queue. > > + * In failure case, this function leaves queue working as regular vhost mode. 
> > + */ > > +static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev, > > + unsigned idx) > > +{ > > + struct vhost_vring_addr addr = { > > + .index = idx, > > + }; > > + struct vhost_vring_state s = { > > + .index = idx, > > + }; > > + int r; > > + bool ok; > > + > > + vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); > > + ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); > > + if (unlikely(!ok)) { > > + return false; > > + } > > + > > + /* From this point, vhost_virtqueue_start can reset these changes */ > > + vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr); > > + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); > > + if (unlikely(r != 0)) { > > + VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed"); > > + goto err; > > + } > > + > > + r = dev->vhost_ops->vhost_set_vring_base(dev, &s); > > + if (unlikely(r != 0)) { > > + VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed"); > > + goto err; > > + } > > + > > + /* > > + * Update used ring information for IOTLB to work correctly, > > + * vhost-kernel code requires for this. > > + */ > > + r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true); > > + if (unlikely(r != 0)) { > > + /* Debug message already printed */ > > + goto err; > > + } > > + > > + return true; > > + > > +err: > > + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); > > + return false; > > +} > > + > > static int vhost_sw_live_migration_start(struct vhost_dev *dev) > > { > > int idx, stop_idx; > > @@ -1249,24 +1335,35 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev) > > } > > } > > > > + dev->vhost_ops->vhost_set_vring_enable(dev, false); > > + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { > > + error_report("Fail to invalidate device iotlb"); > > + } > > + > > /* Can be read by vhost_virtqueue_mask, from vm exit */ > > qatomic_store_release(&dev->shadow_vqs_enabled, true); > > for (idx = 0; idx < dev->nvqs; ++idx) { > > - bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); > > + bool ok = vhost_sw_live_migration_start_vq(dev, idx); > > if (unlikely(!ok)) { > > goto err_start; > > } > > } > > > > + /* Enable shadow vq vring */ > > + dev->vhost_ops->vhost_set_vring_enable(dev, true); > > return 0; > > > > err_start: > > qatomic_store_release(&dev->shadow_vqs_enabled, false); > > for (stop_idx = 0; stop_idx < idx; stop_idx++) { > > vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]); > > + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], > > + dev->vq_index + stop_idx); > > } > > > > err_new: > > + /* Enable guest's vring */ > > + dev->vhost_ops->vhost_set_vring_enable(dev, true); > > for (idx = 0; idx < dev->nvqs; ++idx) { > > vhost_shadow_vq_free(dev->shadow_vqs[idx]); > > } > > @@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp) > > > > if (!hdev->started) { > > err_cause = "Device is not started"; > > + } else if (!vhost_dev_has_iommu(hdev)) { > > + err_cause = "Does not support iommu"; > > + } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { > > + err_cause = "Is packed"; > > + } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) { > > + err_cause = "Have event idx"; > > + } else if (hdev->acked_features & > > + BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) { > > + err_cause = "Supports indirect descriptors"; > > + } else if (!hdev->vhost_ops->vhost_set_vring_enable) { > > + err_cause = "Cannot pause device"; > > + } > > + > > + if 
(err_cause) { > > goto err; > > } > > >
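[Editor's note: a condensed, hedged sketch of the virtio_blk_handle_vq-style pattern Eugenio refers to above, with the SVQ-specific details (descriptor copy, ownership tracking, kick) reduced to a comment. The function name is illustrative; this is not the patch's code.]

#include "hw/virtio/virtio.h"

/*
 * Illustrative pattern only (modeled on virtio_blk_handle_vq): disable
 * guest notifications while draining the avail ring, then re-enable and
 * re-check, so a buffer made available while notifications were off is
 * still picked up without waiting for the next guest kick.
 */
static void svq_kick_pattern(VirtQueue *vq)
{
    do {
        virtio_queue_set_notification(vq, false);

        while (true) {
            VirtQueueElement *elem = virtqueue_pop(vq, sizeof(*elem));
            if (!elem) {
                break;
            }
            /* ... add the element to the shadow ring (ownership tracked
             *     elsewhere) and kick the vhost backend ... */
        }

        virtio_queue_set_notification(vq, true);
    } while (!virtio_queue_empty(vq)); /* re-check closes the race */
}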
在 2021/3/17 上午12:05, Eugenio Perez Martin 写道: > On Tue, Mar 16, 2021 at 9:15 AM Jason Wang <jasowang@redhat.com> wrote: >> >> 在 2021/3/16 上午3:48, Eugenio Pérez 写道: >>> Initial version of shadow virtqueue that actually forward buffers. >>> >>> It reuses the VirtQueue code for the device part. The driver part is >>> based on Linux's virtio_ring driver, but with stripped functionality >>> and optimizations so it's easier to review. >>> >>> These will be added in later commits. >>> >>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com> >>> --- >>> hw/virtio/vhost-shadow-virtqueue.c | 212 +++++++++++++++++++++++++++-- >>> hw/virtio/vhost.c | 113 ++++++++++++++- >>> 2 files changed, 312 insertions(+), 13 deletions(-) >>> >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c >>> index 1460d1d5d1..68ed0f2740 100644 >>> --- a/hw/virtio/vhost-shadow-virtqueue.c >>> +++ b/hw/virtio/vhost-shadow-virtqueue.c >>> @@ -9,6 +9,7 @@ >>> >>> #include "hw/virtio/vhost-shadow-virtqueue.h" >>> #include "hw/virtio/vhost.h" >>> +#include "hw/virtio/virtio-access.h" >>> >>> #include "standard-headers/linux/vhost_types.h" >>> >>> @@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue { >>> /* Virtio device */ >>> VirtIODevice *vdev; >>> >>> + /* Map for returning guest's descriptors */ >>> + VirtQueueElement **ring_id_maps; >>> + >>> + /* Next head to expose to device */ >>> + uint16_t avail_idx_shadow; >>> + >>> + /* Next free descriptor */ >>> + uint16_t free_head; >>> + >>> + /* Last seen used idx */ >>> + uint16_t shadow_used_idx; >>> + >>> + /* Next head to consume from device */ >>> + uint16_t used_idx; >>> + >>> /* Descriptors copied from guest */ >>> vring_desc_t descs[]; >>> } VhostShadowVirtqueue; >>> >>> -/* Forward guest notifications */ >>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, >>> + const struct iovec *iovec, >>> + size_t num, bool more_descs, bool write) >>> +{ >>> + uint16_t i = svq->free_head, last = svq->free_head; >>> + unsigned n; >>> + uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0; >>> + vring_desc_t *descs = svq->vring.desc; >>> + >>> + if (num == 0) { >>> + return; >>> + } >>> + >>> + for (n = 0; n < num; n++) { >>> + if (more_descs || (n + 1 < num)) { >>> + descs[i].flags = flags | virtio_tswap16(svq->vdev, >>> + VRING_DESC_F_NEXT); >>> + } else { >>> + descs[i].flags = flags; >>> + } >>> + descs[i].addr = virtio_tswap64(svq->vdev, (hwaddr)iovec[n].iov_base); >> >> So unsing virtio_tswap() is probably not correct since we're talking >> with vhost backends which has its own endiness. >> > I was trying to expose the buffer with the same endianness as the > driver originally offered, so if guest->qemu requires a bswap, I think > it will always require a bswap again to expose to the device again. So assumes vhost-vDPA always provide a non-transitional device[1]. Then if Qemu present a transitional device, we need to do the endian conversion when necessary, if Qemu present a non-transitional device, we don't need to do that, guest driver will do that for us. But it looks to me the virtio_tswap() can't be used for this since it: static inline bool virtio_access_is_big_endian(VirtIODevice *vdev) { #if defined(LEGACY_VIRTIO_IS_BIENDIAN) return virtio_is_big_endian(vdev); #elif defined(TARGET_WORDS_BIGENDIAN) if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { /* Devices conforming to VIRTIO 1.0 or later are always LE. 
*/ return false; } return true; #else return false; #endif } So if we present a legacy device on top of a non-transitiaonl vDPA device. The VIRITIO_F_VERSION_1 check is wrong. > >> For vhost-vDPA, we can assume that it's a 1.0 device. > Isn't it needed if the host is big endian? [1] So non-transitional device always assume little endian. For vhost-vDPA, we don't want to present transitional device which may end up with a lot of burdens. I suspect the legacy driver plust vhost vDPA already break, so I plan to mandate VERSION_1 for all vDPA devices. > >> >>> + descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len); >>> + >>> + last = i; >>> + i = virtio_tswap16(svq->vdev, descs[i].next); >>> + } >>> + >>> + svq->free_head = virtio_tswap16(svq->vdev, descs[last].next); >>> +} >>> + >>> +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, >>> + VirtQueueElement *elem) >>> +{ >>> + int head; >>> + unsigned avail_idx; >>> + vring_avail_t *avail = svq->vring.avail; >>> + >>> + head = svq->free_head; >>> + >>> + /* We need some descriptors here */ >>> + assert(elem->out_num || elem->in_num); >>> + >>> + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, >>> + elem->in_num > 0, false); >>> + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); >>> + >>> + /* >>> + * Put entry in available array (but don't update avail->idx until they >>> + * do sync). >>> + */ >>> + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); >>> + avail->ring[avail_idx] = virtio_tswap16(svq->vdev, head); >>> + svq->avail_idx_shadow++; >>> + >>> + /* Expose descriptors to device */ >>> + smp_wmb(); >>> + avail->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow); >>> + >>> + return head; >>> + >>> +} >>> + >>> +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, >>> + VirtQueueElement *elem) >>> +{ >>> + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); >>> + >>> + svq->ring_id_maps[qemu_head] = elem; >>> +} >>> + >>> +/* Handle guest->device notifications */ >>> static void vhost_handle_guest_kick(EventNotifier *n) >>> { >>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, >>> @@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n) >>> return; >>> } >>> >>> - event_notifier_set(&svq->kick_notifier); >>> + /* Make available as many buffers as possible */ >>> + do { >>> + if (virtio_queue_get_notification(svq->vq)) { >>> + /* No more notifications until process all available */ >>> + virtio_queue_set_notification(svq->vq, false); >>> + } >>> + >>> + while (true) { >>> + VirtQueueElement *elem; >>> + if (virtio_queue_full(svq->vq)) { >>> + break; >> >> So we've disabled guest notification. If buffer has been consumed, we >> need to retry the handle_guest_kick here. But I didn't find the code? >> > This code follows the pattern of virtio_blk_handle_vq: we jump out of > the inner while, and we re-enable the notifications. After that, we > check for updates on guest avail_idx. Ok, but this will end up with a lot of unnecessary kicks without event index. 
> >>> + } >>> + >>> + elem = virtqueue_pop(svq->vq, sizeof(*elem)); >>> + if (!elem) { >>> + break; >>> + } >>> + >>> + vhost_shadow_vq_add(svq, elem); >>> + event_notifier_set(&svq->kick_notifier); >>> + } >>> + >>> + virtio_queue_set_notification(svq->vq, true); >>> + } while (!virtio_queue_empty(svq->vq)); >>> +} >>> + >>> +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) >>> +{ >>> + if (svq->used_idx != svq->shadow_used_idx) { >>> + return true; >>> + } >>> + >>> + /* Get used idx must not be reordered */ >>> + smp_rmb(); >>> + svq->shadow_used_idx = virtio_tswap16(svq->vdev, svq->vring.used->idx); >>> + >>> + return svq->used_idx != svq->shadow_used_idx; >>> +} >>> + >>> +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) >>> +{ >>> + vring_desc_t *descs = svq->vring.desc; >>> + const vring_used_t *used = svq->vring.used; >>> + vring_used_elem_t used_elem; >>> + uint16_t last_used; >>> + >>> + if (!vhost_shadow_vq_more_used(svq)) { >>> + return NULL; >>> + } >>> + >>> + last_used = svq->used_idx & (svq->vring.num - 1); >>> + used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id); >>> + used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len); >>> + >>> + if (unlikely(used_elem.id >= svq->vring.num)) { >>> + error_report("Device %s says index %u is available", svq->vdev->name, >>> + used_elem.id); >>> + return NULL; >>> + } >>> + >>> + descs[used_elem.id].next = svq->free_head; >>> + svq->free_head = used_elem.id; >>> + >>> + svq->used_idx++; >>> + svq->ring_id_maps[used_elem.id]->len = used_elem.len; >>> + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); >>> } >>> >>> /* Forward vhost notifications */ >>> @@ -78,6 +229,7 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, >>> call_notifier); >>> EventNotifier *masked_notifier; >>> + VirtQueue *vq = svq->vq; >>> >>> /* Signal start of using masked notifier */ >>> qemu_event_reset(&svq->masked_notifier.is_free); >>> @@ -86,14 +238,29 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>> qemu_event_set(&svq->masked_notifier.is_free); >>> } >>> >>> - if (!masked_notifier) { >>> - unsigned n = virtio_get_queue_index(svq->vq); >>> - virtio_queue_invalidate_signalled_used(svq->vdev, n); >>> - virtio_notify_irqfd(svq->vdev, svq->vq); >>> - } else if (!svq->masked_notifier.signaled) { >>> - svq->masked_notifier.signaled = true; >>> - event_notifier_set(svq->masked_notifier.n); >>> - } >>> + /* Make as many buffers as possible used. 
*/ >>> + do { >>> + unsigned i = 0; >>> + >>> + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ >>> + while (true) { >>> + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); >>> + if (!elem) { >>> + break; >>> + } >>> + >>> + assert(i < svq->vring.num); >>> + virtqueue_fill(vq, elem, elem->len, i++); >>> + } >>> + >>> + virtqueue_flush(vq, i); >>> + if (!masked_notifier) { >>> + virtio_notify_irqfd(svq->vdev, svq->vq); >>> + } else if (!svq->masked_notifier.signaled) { >>> + svq->masked_notifier.signaled = true; >>> + event_notifier_set(svq->masked_notifier.n); >>> + } >>> + } while (vhost_shadow_vq_more_used(svq)); >>> >>> if (masked_notifier) { >>> /* Signal not using it anymore */ >>> @@ -103,7 +270,6 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>> >>> static void vhost_shadow_vq_handle_call(EventNotifier *n) >>> { >>> - >>> if (likely(event_notifier_test_and_clear(n))) { >>> vhost_shadow_vq_handle_call_no_test(n); >>> } >>> @@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, >>> unsigned idx, >>> VhostShadowVirtqueue *svq) >>> { >>> + int i; >>> int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq); >>> + >>> + assert(!dev->shadow_vqs_enabled); >>> + >>> if (unlikely(r < 0)) { >>> error_report("Couldn't restore vq kick fd: %s", strerror(-r)); >>> } >>> @@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, >>> /* Restore vhost call */ >>> vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx, >>> dev->vqs[idx].notifier_is_masked); >>> + >>> + >>> + for (i = 0; i < svq->vring.num; ++i) { >>> + g_autofree VirtQueueElement *elem = svq->ring_id_maps[i]; >>> + /* >>> + * Although the doc says we must unpop in order, it's ok to unpop >>> + * everything. >>> + */ >>> + if (elem) { >>> + virtqueue_unpop(svq->vq, elem, elem->len); >> >> Shouldn't we need to wait until all pending requests to be drained? Or >> we may end up duplicated requests? >> > Do you mean pending as in-flight/processing in the device? The device > must be paused at this point. Ok. I see there's a vhost_set_vring_enable(dev, false) in vhost_sw_live_migration_start(). > Currently there is no assertion for > this, maybe we can track the device status for it. > > For the queue handlers to be running at this point, the main event > loop should serialize QMP and handlers as far as I know (and they > would make all state inconsistent if the device stops suddenly). It > would need to be synchronized if the handlers run in their own AIO > context. That would be nice to have but it's not included here. That's why I suggest to just drop the QMP stuffs and use cli parameters to enable shadow virtqueue. Things would be greatly simplified I guess. 
Thanks > >> Thanks >> >> >>> + } >>> + } >>> } >>> >>> /* >>> @@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) >>> unsigned num = virtio_queue_get_num(dev->vdev, vq_idx); >>> size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE); >>> g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) + ring_size); >>> - int r; >>> + int r, i; >>> >>> r = event_notifier_init(&svq->kick_notifier, 0); >>> if (r != 0) { >>> @@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) >>> vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE); >>> svq->vq = virtio_get_queue(dev->vdev, vq_idx); >>> svq->vdev = dev->vdev; >>> + for (i = 0; i < num - 1; i++) { >>> + svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1); >>> + } >>> + >>> + svq->ring_id_maps = g_new0(VirtQueueElement *, num); >>> event_notifier_set_handler(&svq->call_notifier, >>> vhost_shadow_vq_handle_call); >>> qemu_event_init(&svq->masked_notifier.is_free, true); >>> @@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq) >>> event_notifier_cleanup(&vq->kick_notifier); >>> event_notifier_set_handler(&vq->call_notifier, NULL); >>> event_notifier_cleanup(&vq->call_notifier); >>> + g_free(vq->ring_id_maps); >>> g_free(vq); >>> } >>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c >>> index eab3e334f2..a373999bc4 100644 >>> --- a/hw/virtio/vhost.c >>> +++ b/hw/virtio/vhost.c >>> @@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) >>> >>> trace_vhost_iotlb_miss(dev, 1); >>> >>> + if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) { >>> + uaddr = iova; >>> + len = 4096; >>> + ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, >>> + IOMMU_RW); >>> + if (ret) { >>> + trace_vhost_iotlb_miss(dev, 2); >>> + error_report("Fail to update device iotlb"); >>> + } >>> + >>> + return ret; >>> + } >>> + >>> iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, >>> iova, write, >>> MEMTXATTRS_UNSPECIFIED); >>> @@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) >>> /* Can be read by vhost_virtqueue_mask, from vm exit */ >>> qatomic_store_release(&dev->shadow_vqs_enabled, false); >>> >>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); >>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { >>> + error_report("Fail to invalidate device iotlb"); >>> + } >>> + >>> for (idx = 0; idx < dev->nvqs; ++idx) { >>> + /* >>> + * Update used ring information for IOTLB to work correctly, >>> + * vhost-kernel code requires for this. >>> + */ >>> + struct vhost_virtqueue *vq = dev->vqs + idx; >>> + vhost_device_iotlb_miss(dev, vq->used_phys, true); >>> + >>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]); >>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], >>> + dev->vq_index + idx); >>> + } >>> + >>> + /* Enable guest's vq vring */ >>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>> + >>> + for (idx = 0; idx < dev->nvqs; ++idx) { >>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); >>> } >>> >>> @@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) >>> return 0; >>> } >>> >>> +/* >>> + * Start shadow virtqueue in a given queue. >>> + * In failure case, this function leaves queue working as regular vhost mode. 
>>> + */ >>> +static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev, >>> + unsigned idx) >>> +{ >>> + struct vhost_vring_addr addr = { >>> + .index = idx, >>> + }; >>> + struct vhost_vring_state s = { >>> + .index = idx, >>> + }; >>> + int r; >>> + bool ok; >>> + >>> + vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); >>> + ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); >>> + if (unlikely(!ok)) { >>> + return false; >>> + } >>> + >>> + /* From this point, vhost_virtqueue_start can reset these changes */ >>> + vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr); >>> + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); >>> + if (unlikely(r != 0)) { >>> + VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed"); >>> + goto err; >>> + } >>> + >>> + r = dev->vhost_ops->vhost_set_vring_base(dev, &s); >>> + if (unlikely(r != 0)) { >>> + VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed"); >>> + goto err; >>> + } >>> + >>> + /* >>> + * Update used ring information for IOTLB to work correctly, >>> + * vhost-kernel code requires for this. >>> + */ >>> + r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true); >>> + if (unlikely(r != 0)) { >>> + /* Debug message already printed */ >>> + goto err; >>> + } >>> + >>> + return true; >>> + >>> +err: >>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); >>> + return false; >>> +} >>> + >>> static int vhost_sw_live_migration_start(struct vhost_dev *dev) >>> { >>> int idx, stop_idx; >>> @@ -1249,24 +1335,35 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev) >>> } >>> } >>> >>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); >>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { >>> + error_report("Fail to invalidate device iotlb"); >>> + } >>> + >>> /* Can be read by vhost_virtqueue_mask, from vm exit */ >>> qatomic_store_release(&dev->shadow_vqs_enabled, true); >>> for (idx = 0; idx < dev->nvqs; ++idx) { >>> - bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); >>> + bool ok = vhost_sw_live_migration_start_vq(dev, idx); >>> if (unlikely(!ok)) { >>> goto err_start; >>> } >>> } >>> >>> + /* Enable shadow vq vring */ >>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>> return 0; >>> >>> err_start: >>> qatomic_store_release(&dev->shadow_vqs_enabled, false); >>> for (stop_idx = 0; stop_idx < idx; stop_idx++) { >>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]); >>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], >>> + dev->vq_index + stop_idx); >>> } >>> >>> err_new: >>> + /* Enable guest's vring */ >>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>> for (idx = 0; idx < dev->nvqs; ++idx) { >>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); >>> } >>> @@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp) >>> >>> if (!hdev->started) { >>> err_cause = "Device is not started"; >>> + } else if (!vhost_dev_has_iommu(hdev)) { >>> + err_cause = "Does not support iommu"; >>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { >>> + err_cause = "Is packed"; >>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) { >>> + err_cause = "Have event idx"; >>> + } else if (hdev->acked_features & >>> + BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) { >>> + err_cause = "Supports indirect descriptors"; >>> + } else if (!hdev->vhost_ops->vhost_set_vring_enable) { >>> + err_cause = "Cannot pause device"; >>> + } >>> + >>> + if 
(err_cause) { >>> goto err; >>> } >>> >
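[Editor's note: a hedged sketch of the batching variant touched on in this exchange about unnecessary kicks without event idx, and which Eugenio mentions below as "moving the kick out of the inner loop": notify vhost once per drained batch instead of once per popped element. It reuses names from the patch (VhostShadowVirtqueue, vhost_shadow_vq_add, kick_notifier) but the restructuring is illustrative, not the author's final code.]

/*
 * Illustrative only: same drain loop as the patch's guest-kick handler,
 * but with a single event_notifier_set() per batch of exposed buffers.
 */
static void vhost_handle_guest_kick_batched(VhostShadowVirtqueue *svq)
{
    do {
        unsigned added = 0;

        virtio_queue_set_notification(svq->vq, false);

        while (!virtio_queue_full(svq->vq)) {
            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
            if (!elem) {
                break;
            }
            vhost_shadow_vq_add(svq, elem);
            added++;
        }

        if (added) {
            /* One kick for the whole batch */
            event_notifier_set(&svq->kick_notifier);
        }

        virtio_queue_set_notification(svq->vq, true);
    } while (!virtio_queue_empty(svq->vq));
}

As the reply below notes, this trades some added latency for fewer kicks.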
On Wed, Mar 17, 2021 at 3:51 AM Jason Wang <jasowang@redhat.com> wrote: > > > 在 2021/3/17 上午12:05, Eugenio Perez Martin 写道: > > On Tue, Mar 16, 2021 at 9:15 AM Jason Wang <jasowang@redhat.com> wrote: > >> > >> 在 2021/3/16 上午3:48, Eugenio Pérez 写道: > >>> Initial version of shadow virtqueue that actually forward buffers. > >>> > >>> It reuses the VirtQueue code for the device part. The driver part is > >>> based on Linux's virtio_ring driver, but with stripped functionality > >>> and optimizations so it's easier to review. > >>> > >>> These will be added in later commits. > >>> > >>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > >>> --- > >>> hw/virtio/vhost-shadow-virtqueue.c | 212 +++++++++++++++++++++++++++-- > >>> hw/virtio/vhost.c | 113 ++++++++++++++- > >>> 2 files changed, 312 insertions(+), 13 deletions(-) > >>> > >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c > >>> index 1460d1d5d1..68ed0f2740 100644 > >>> --- a/hw/virtio/vhost-shadow-virtqueue.c > >>> +++ b/hw/virtio/vhost-shadow-virtqueue.c > >>> @@ -9,6 +9,7 @@ > >>> > >>> #include "hw/virtio/vhost-shadow-virtqueue.h" > >>> #include "hw/virtio/vhost.h" > >>> +#include "hw/virtio/virtio-access.h" > >>> > >>> #include "standard-headers/linux/vhost_types.h" > >>> > >>> @@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue { > >>> /* Virtio device */ > >>> VirtIODevice *vdev; > >>> > >>> + /* Map for returning guest's descriptors */ > >>> + VirtQueueElement **ring_id_maps; > >>> + > >>> + /* Next head to expose to device */ > >>> + uint16_t avail_idx_shadow; > >>> + > >>> + /* Next free descriptor */ > >>> + uint16_t free_head; > >>> + > >>> + /* Last seen used idx */ > >>> + uint16_t shadow_used_idx; > >>> + > >>> + /* Next head to consume from device */ > >>> + uint16_t used_idx; > >>> + > >>> /* Descriptors copied from guest */ > >>> vring_desc_t descs[]; > >>> } VhostShadowVirtqueue; > >>> > >>> -/* Forward guest notifications */ > >>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, > >>> + const struct iovec *iovec, > >>> + size_t num, bool more_descs, bool write) > >>> +{ > >>> + uint16_t i = svq->free_head, last = svq->free_head; > >>> + unsigned n; > >>> + uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0; > >>> + vring_desc_t *descs = svq->vring.desc; > >>> + > >>> + if (num == 0) { > >>> + return; > >>> + } > >>> + > >>> + for (n = 0; n < num; n++) { > >>> + if (more_descs || (n + 1 < num)) { > >>> + descs[i].flags = flags | virtio_tswap16(svq->vdev, > >>> + VRING_DESC_F_NEXT); > >>> + } else { > >>> + descs[i].flags = flags; > >>> + } > >>> + descs[i].addr = virtio_tswap64(svq->vdev, (hwaddr)iovec[n].iov_base); > >> > >> So unsing virtio_tswap() is probably not correct since we're talking > >> with vhost backends which has its own endiness. > >> > > I was trying to expose the buffer with the same endianness as the > > driver originally offered, so if guest->qemu requires a bswap, I think > > it will always require a bswap again to expose to the device again. > > > So assumes vhost-vDPA always provide a non-transitional device[1]. > > Then if Qemu present a transitional device, we need to do the endian > conversion when necessary, if Qemu present a non-transitional device, we > don't need to do that, guest driver will do that for us. 
> > But it looks to me the virtio_tswap() can't be used for this since it: > > static inline bool virtio_access_is_big_endian(VirtIODevice *vdev) > { > #if defined(LEGACY_VIRTIO_IS_BIENDIAN) > return virtio_is_big_endian(vdev); > #elif defined(TARGET_WORDS_BIGENDIAN) > if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { > /* Devices conforming to VIRTIO 1.0 or later are always LE. */ > return false; > } > return true; > #else > return false; > #endif > } > > So if we present a legacy device on top of a non-transitiaonl vDPA > device. The VIRITIO_F_VERSION_1 check is wrong. > > > > > >> For vhost-vDPA, we can assume that it's a 1.0 device. > > Isn't it needed if the host is big endian? > > > [1] > > So non-transitional device always assume little endian. > > For vhost-vDPA, we don't want to present transitional device which may > end up with a lot of burdens. > > I suspect the legacy driver plust vhost vDPA already break, so I plan to > mandate VERSION_1 for all vDPA devices. > Right. I think it's the best then. However, then we will need a similar method to always expose address/length as little endian, isn't it? > > > > >> > >>> + descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len); > >>> + > >>> + last = i; > >>> + i = virtio_tswap16(svq->vdev, descs[i].next); > >>> + } > >>> + > >>> + svq->free_head = virtio_tswap16(svq->vdev, descs[last].next); > >>> +} > >>> + > >>> +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, > >>> + VirtQueueElement *elem) > >>> +{ > >>> + int head; > >>> + unsigned avail_idx; > >>> + vring_avail_t *avail = svq->vring.avail; > >>> + > >>> + head = svq->free_head; > >>> + > >>> + /* We need some descriptors here */ > >>> + assert(elem->out_num || elem->in_num); > >>> + > >>> + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, > >>> + elem->in_num > 0, false); > >>> + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); > >>> + > >>> + /* > >>> + * Put entry in available array (but don't update avail->idx until they > >>> + * do sync). > >>> + */ > >>> + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); > >>> + avail->ring[avail_idx] = virtio_tswap16(svq->vdev, head); > >>> + svq->avail_idx_shadow++; > >>> + > >>> + /* Expose descriptors to device */ > >>> + smp_wmb(); > >>> + avail->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow); > >>> + > >>> + return head; > >>> + > >>> +} > >>> + > >>> +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, > >>> + VirtQueueElement *elem) > >>> +{ > >>> + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); > >>> + > >>> + svq->ring_id_maps[qemu_head] = elem; > >>> +} > >>> + > >>> +/* Handle guest->device notifications */ > >>> static void vhost_handle_guest_kick(EventNotifier *n) > >>> { > >>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > >>> @@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n) > >>> return; > >>> } > >>> > >>> - event_notifier_set(&svq->kick_notifier); > >>> + /* Make available as many buffers as possible */ > >>> + do { > >>> + if (virtio_queue_get_notification(svq->vq)) { > >>> + /* No more notifications until process all available */ > >>> + virtio_queue_set_notification(svq->vq, false); > >>> + } > >>> + > >>> + while (true) { > >>> + VirtQueueElement *elem; > >>> + if (virtio_queue_full(svq->vq)) { > >>> + break; > >> > >> So we've disabled guest notification. If buffer has been consumed, we > >> need to retry the handle_guest_kick here. But I didn't find the code? 
> >> > > This code follows the pattern of virtio_blk_handle_vq: we jump out of > > the inner while, and we re-enable the notifications. After that, we > > check for updates on guest avail_idx. > > > Ok, but this will end up with a lot of unnecessary kicks without event > index. > I can move the kick out of the inner loop, but that could add latency. > > > > >>> + } > >>> + > >>> + elem = virtqueue_pop(svq->vq, sizeof(*elem)); > >>> + if (!elem) { > >>> + break; > >>> + } > >>> + > >>> + vhost_shadow_vq_add(svq, elem); > >>> + event_notifier_set(&svq->kick_notifier); > >>> + } > >>> + > >>> + virtio_queue_set_notification(svq->vq, true); > >>> + } while (!virtio_queue_empty(svq->vq)); > >>> +} > >>> + > >>> +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) > >>> +{ > >>> + if (svq->used_idx != svq->shadow_used_idx) { > >>> + return true; > >>> + } > >>> + > >>> + /* Get used idx must not be reordered */ > >>> + smp_rmb(); > >>> + svq->shadow_used_idx = virtio_tswap16(svq->vdev, svq->vring.used->idx); > >>> + > >>> + return svq->used_idx != svq->shadow_used_idx; > >>> +} > >>> + > >>> +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) > >>> +{ > >>> + vring_desc_t *descs = svq->vring.desc; > >>> + const vring_used_t *used = svq->vring.used; > >>> + vring_used_elem_t used_elem; > >>> + uint16_t last_used; > >>> + > >>> + if (!vhost_shadow_vq_more_used(svq)) { > >>> + return NULL; > >>> + } > >>> + > >>> + last_used = svq->used_idx & (svq->vring.num - 1); > >>> + used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id); > >>> + used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len); > >>> + > >>> + if (unlikely(used_elem.id >= svq->vring.num)) { > >>> + error_report("Device %s says index %u is available", svq->vdev->name, > >>> + used_elem.id); > >>> + return NULL; > >>> + } > >>> + > >>> + descs[used_elem.id].next = svq->free_head; > >>> + svq->free_head = used_elem.id; > >>> + > >>> + svq->used_idx++; > >>> + svq->ring_id_maps[used_elem.id]->len = used_elem.len; > >>> + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); > >>> } > >>> > >>> /* Forward vhost notifications */ > >>> @@ -78,6 +229,7 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > >>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > >>> call_notifier); > >>> EventNotifier *masked_notifier; > >>> + VirtQueue *vq = svq->vq; > >>> > >>> /* Signal start of using masked notifier */ > >>> qemu_event_reset(&svq->masked_notifier.is_free); > >>> @@ -86,14 +238,29 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > >>> qemu_event_set(&svq->masked_notifier.is_free); > >>> } > >>> > >>> - if (!masked_notifier) { > >>> - unsigned n = virtio_get_queue_index(svq->vq); > >>> - virtio_queue_invalidate_signalled_used(svq->vdev, n); > >>> - virtio_notify_irqfd(svq->vdev, svq->vq); > >>> - } else if (!svq->masked_notifier.signaled) { > >>> - svq->masked_notifier.signaled = true; > >>> - event_notifier_set(svq->masked_notifier.n); > >>> - } > >>> + /* Make as many buffers as possible used. 
*/ > >>> + do { > >>> + unsigned i = 0; > >>> + > >>> + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ > >>> + while (true) { > >>> + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); > >>> + if (!elem) { > >>> + break; > >>> + } > >>> + > >>> + assert(i < svq->vring.num); > >>> + virtqueue_fill(vq, elem, elem->len, i++); > >>> + } > >>> + > >>> + virtqueue_flush(vq, i); > >>> + if (!masked_notifier) { > >>> + virtio_notify_irqfd(svq->vdev, svq->vq); > >>> + } else if (!svq->masked_notifier.signaled) { > >>> + svq->masked_notifier.signaled = true; > >>> + event_notifier_set(svq->masked_notifier.n); > >>> + } > >>> + } while (vhost_shadow_vq_more_used(svq)); > >>> > >>> if (masked_notifier) { > >>> /* Signal not using it anymore */ > >>> @@ -103,7 +270,6 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > >>> > >>> static void vhost_shadow_vq_handle_call(EventNotifier *n) > >>> { > >>> - > >>> if (likely(event_notifier_test_and_clear(n))) { > >>> vhost_shadow_vq_handle_call_no_test(n); > >>> } > >>> @@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, > >>> unsigned idx, > >>> VhostShadowVirtqueue *svq) > >>> { > >>> + int i; > >>> int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq); > >>> + > >>> + assert(!dev->shadow_vqs_enabled); > >>> + > >>> if (unlikely(r < 0)) { > >>> error_report("Couldn't restore vq kick fd: %s", strerror(-r)); > >>> } > >>> @@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, > >>> /* Restore vhost call */ > >>> vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx, > >>> dev->vqs[idx].notifier_is_masked); > >>> + > >>> + > >>> + for (i = 0; i < svq->vring.num; ++i) { > >>> + g_autofree VirtQueueElement *elem = svq->ring_id_maps[i]; > >>> + /* > >>> + * Although the doc says we must unpop in order, it's ok to unpop > >>> + * everything. > >>> + */ > >>> + if (elem) { > >>> + virtqueue_unpop(svq->vq, elem, elem->len); > >> > >> Shouldn't we need to wait until all pending requests to be drained? Or > >> we may end up duplicated requests? > >> > > Do you mean pending as in-flight/processing in the device? The device > > must be paused at this point. > > > Ok. I see there's a vhost_set_vring_enable(dev, false) in > vhost_sw_live_migration_start(). > > > > Currently there is no assertion for > > this, maybe we can track the device status for it. > > > > For the queue handlers to be running at this point, the main event > > loop should serialize QMP and handlers as far as I know (and they > > would make all state inconsistent if the device stops suddenly). It > > would need to be synchronized if the handlers run in their own AIO > > context. That would be nice to have but it's not included here. > > > That's why I suggest to just drop the QMP stuffs and use cli parameters > to enable shadow virtqueue. Things would be greatly simplified I guess. > I can send a series without it, but SVQ will need to be able to kick in dynamically sooner or later if we want to use it for live migration. 
> Thanks > > > > > >> Thanks > >> > >> > >>> + } > >>> + } > >>> } > >>> > >>> /* > >>> @@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) > >>> unsigned num = virtio_queue_get_num(dev->vdev, vq_idx); > >>> size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE); > >>> g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) + ring_size); > >>> - int r; > >>> + int r, i; > >>> > >>> r = event_notifier_init(&svq->kick_notifier, 0); > >>> if (r != 0) { > >>> @@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) > >>> vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE); > >>> svq->vq = virtio_get_queue(dev->vdev, vq_idx); > >>> svq->vdev = dev->vdev; > >>> + for (i = 0; i < num - 1; i++) { > >>> + svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1); > >>> + } > >>> + > >>> + svq->ring_id_maps = g_new0(VirtQueueElement *, num); > >>> event_notifier_set_handler(&svq->call_notifier, > >>> vhost_shadow_vq_handle_call); > >>> qemu_event_init(&svq->masked_notifier.is_free, true); > >>> @@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq) > >>> event_notifier_cleanup(&vq->kick_notifier); > >>> event_notifier_set_handler(&vq->call_notifier, NULL); > >>> event_notifier_cleanup(&vq->call_notifier); > >>> + g_free(vq->ring_id_maps); > >>> g_free(vq); > >>> } > >>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > >>> index eab3e334f2..a373999bc4 100644 > >>> --- a/hw/virtio/vhost.c > >>> +++ b/hw/virtio/vhost.c > >>> @@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) > >>> > >>> trace_vhost_iotlb_miss(dev, 1); > >>> > >>> + if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) { > >>> + uaddr = iova; > >>> + len = 4096; > >>> + ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, > >>> + IOMMU_RW); > >>> + if (ret) { > >>> + trace_vhost_iotlb_miss(dev, 2); > >>> + error_report("Fail to update device iotlb"); > >>> + } > >>> + > >>> + return ret; > >>> + } > >>> + > >>> iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, > >>> iova, write, > >>> MEMTXATTRS_UNSPECIFIED); > >>> @@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > >>> /* Can be read by vhost_virtqueue_mask, from vm exit */ > >>> qatomic_store_release(&dev->shadow_vqs_enabled, false); > >>> > >>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); > >>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { > >>> + error_report("Fail to invalidate device iotlb"); > >>> + } > >>> + > >>> for (idx = 0; idx < dev->nvqs; ++idx) { > >>> + /* > >>> + * Update used ring information for IOTLB to work correctly, > >>> + * vhost-kernel code requires for this. > >>> + */ > >>> + struct vhost_virtqueue *vq = dev->vqs + idx; > >>> + vhost_device_iotlb_miss(dev, vq->used_phys, true); > >>> + > >>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]); > >>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], > >>> + dev->vq_index + idx); > >>> + } > >>> + > >>> + /* Enable guest's vq vring */ > >>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); > >>> + > >>> + for (idx = 0; idx < dev->nvqs; ++idx) { > >>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); > >>> } > >>> > >>> @@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > >>> return 0; > >>> } > >>> > >>> +/* > >>> + * Start shadow virtqueue in a given queue. 
> >>> + * In failure case, this function leaves queue working as regular vhost mode. > >>> + */ > >>> +static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev, > >>> + unsigned idx) > >>> +{ > >>> + struct vhost_vring_addr addr = { > >>> + .index = idx, > >>> + }; > >>> + struct vhost_vring_state s = { > >>> + .index = idx, > >>> + }; > >>> + int r; > >>> + bool ok; > >>> + > >>> + vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); > >>> + ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); > >>> + if (unlikely(!ok)) { > >>> + return false; > >>> + } > >>> + > >>> + /* From this point, vhost_virtqueue_start can reset these changes */ > >>> + vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr); > >>> + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); > >>> + if (unlikely(r != 0)) { > >>> + VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed"); > >>> + goto err; > >>> + } > >>> + > >>> + r = dev->vhost_ops->vhost_set_vring_base(dev, &s); > >>> + if (unlikely(r != 0)) { > >>> + VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed"); > >>> + goto err; > >>> + } > >>> + > >>> + /* > >>> + * Update used ring information for IOTLB to work correctly, > >>> + * vhost-kernel code requires for this. > >>> + */ > >>> + r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true); > >>> + if (unlikely(r != 0)) { > >>> + /* Debug message already printed */ > >>> + goto err; > >>> + } > >>> + > >>> + return true; > >>> + > >>> +err: > >>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); > >>> + return false; > >>> +} > >>> + > >>> static int vhost_sw_live_migration_start(struct vhost_dev *dev) > >>> { > >>> int idx, stop_idx; > >>> @@ -1249,24 +1335,35 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev) > >>> } > >>> } > >>> > >>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); > >>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { > >>> + error_report("Fail to invalidate device iotlb"); > >>> + } > >>> + > >>> /* Can be read by vhost_virtqueue_mask, from vm exit */ > >>> qatomic_store_release(&dev->shadow_vqs_enabled, true); > >>> for (idx = 0; idx < dev->nvqs; ++idx) { > >>> - bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); > >>> + bool ok = vhost_sw_live_migration_start_vq(dev, idx); > >>> if (unlikely(!ok)) { > >>> goto err_start; > >>> } > >>> } > >>> > >>> + /* Enable shadow vq vring */ > >>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); > >>> return 0; > >>> > >>> err_start: > >>> qatomic_store_release(&dev->shadow_vqs_enabled, false); > >>> for (stop_idx = 0; stop_idx < idx; stop_idx++) { > >>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]); > >>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], > >>> + dev->vq_index + stop_idx); > >>> } > >>> > >>> err_new: > >>> + /* Enable guest's vring */ > >>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); > >>> for (idx = 0; idx < dev->nvqs; ++idx) { > >>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); > >>> } > >>> @@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp) > >>> > >>> if (!hdev->started) { > >>> err_cause = "Device is not started"; > >>> + } else if (!vhost_dev_has_iommu(hdev)) { > >>> + err_cause = "Does not support iommu"; > >>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { > >>> + err_cause = "Is packed"; > >>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) { > >>> + err_cause 
= "Have event idx"; > >>> + } else if (hdev->acked_features & > >>> + BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) { > >>> + err_cause = "Supports indirect descriptors"; > >>> + } else if (!hdev->vhost_ops->vhost_set_vring_enable) { > >>> + err_cause = "Cannot pause device"; > >>> + } > >>> + > >>> + if (err_cause) { > >>> goto err; > >>> } > >>> > > >
在 2021/3/17 下午10:38, Eugenio Perez Martin 写道: > On Wed, Mar 17, 2021 at 3:51 AM Jason Wang <jasowang@redhat.com> wrote: >> >> 在 2021/3/17 上午12:05, Eugenio Perez Martin 写道: >>> On Tue, Mar 16, 2021 at 9:15 AM Jason Wang <jasowang@redhat.com> wrote: >>>> 在 2021/3/16 上午3:48, Eugenio Pérez 写道: >>>>> Initial version of shadow virtqueue that actually forward buffers. >>>>> >>>>> It reuses the VirtQueue code for the device part. The driver part is >>>>> based on Linux's virtio_ring driver, but with stripped functionality >>>>> and optimizations so it's easier to review. >>>>> >>>>> These will be added in later commits. >>>>> >>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com> >>>>> --- >>>>> hw/virtio/vhost-shadow-virtqueue.c | 212 +++++++++++++++++++++++++++-- >>>>> hw/virtio/vhost.c | 113 ++++++++++++++- >>>>> 2 files changed, 312 insertions(+), 13 deletions(-) >>>>> >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c >>>>> index 1460d1d5d1..68ed0f2740 100644 >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c >>>>> @@ -9,6 +9,7 @@ >>>>> >>>>> #include "hw/virtio/vhost-shadow-virtqueue.h" >>>>> #include "hw/virtio/vhost.h" >>>>> +#include "hw/virtio/virtio-access.h" >>>>> >>>>> #include "standard-headers/linux/vhost_types.h" >>>>> >>>>> @@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue { >>>>> /* Virtio device */ >>>>> VirtIODevice *vdev; >>>>> >>>>> + /* Map for returning guest's descriptors */ >>>>> + VirtQueueElement **ring_id_maps; >>>>> + >>>>> + /* Next head to expose to device */ >>>>> + uint16_t avail_idx_shadow; >>>>> + >>>>> + /* Next free descriptor */ >>>>> + uint16_t free_head; >>>>> + >>>>> + /* Last seen used idx */ >>>>> + uint16_t shadow_used_idx; >>>>> + >>>>> + /* Next head to consume from device */ >>>>> + uint16_t used_idx; >>>>> + >>>>> /* Descriptors copied from guest */ >>>>> vring_desc_t descs[]; >>>>> } VhostShadowVirtqueue; >>>>> >>>>> -/* Forward guest notifications */ >>>>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, >>>>> + const struct iovec *iovec, >>>>> + size_t num, bool more_descs, bool write) >>>>> +{ >>>>> + uint16_t i = svq->free_head, last = svq->free_head; >>>>> + unsigned n; >>>>> + uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0; >>>>> + vring_desc_t *descs = svq->vring.desc; >>>>> + >>>>> + if (num == 0) { >>>>> + return; >>>>> + } >>>>> + >>>>> + for (n = 0; n < num; n++) { >>>>> + if (more_descs || (n + 1 < num)) { >>>>> + descs[i].flags = flags | virtio_tswap16(svq->vdev, >>>>> + VRING_DESC_F_NEXT); >>>>> + } else { >>>>> + descs[i].flags = flags; >>>>> + } >>>>> + descs[i].addr = virtio_tswap64(svq->vdev, (hwaddr)iovec[n].iov_base); >>>> So unsing virtio_tswap() is probably not correct since we're talking >>>> with vhost backends which has its own endiness. >>>> >>> I was trying to expose the buffer with the same endianness as the >>> driver originally offered, so if guest->qemu requires a bswap, I think >>> it will always require a bswap again to expose to the device again. >> >> So assumes vhost-vDPA always provide a non-transitional device[1]. >> >> Then if Qemu present a transitional device, we need to do the endian >> conversion when necessary, if Qemu present a non-transitional device, we >> don't need to do that, guest driver will do that for us. 
>> >> But it looks to me the virtio_tswap() can't be used for this since it: >> >> static inline bool virtio_access_is_big_endian(VirtIODevice *vdev) >> { >> #if defined(LEGACY_VIRTIO_IS_BIENDIAN) >> return virtio_is_big_endian(vdev); >> #elif defined(TARGET_WORDS_BIGENDIAN) >> if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { >> /* Devices conforming to VIRTIO 1.0 or later are always LE. */ >> return false; >> } >> return true; >> #else >> return false; >> #endif >> } >> >> So if we present a legacy device on top of a non-transitiaonl vDPA >> device. The VIRITIO_F_VERSION_1 check is wrong. >> >> >>>> For vhost-vDPA, we can assume that it's a 1.0 device. >>> Isn't it needed if the host is big endian? >> >> [1] >> >> So non-transitional device always assume little endian. >> >> For vhost-vDPA, we don't want to present transitional device which may >> end up with a lot of burdens. >> >> I suspect the legacy driver plust vhost vDPA already break, so I plan to >> mandate VERSION_1 for all vDPA devices. >> > Right. I think it's the best then. > > However, then we will need a similar method to always expose > address/length as little endian, isn't it? Yes. > >>>>> + descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len); >>>>> + >>>>> + last = i; >>>>> + i = virtio_tswap16(svq->vdev, descs[i].next); >>>>> + } >>>>> + >>>>> + svq->free_head = virtio_tswap16(svq->vdev, descs[last].next); >>>>> +} >>>>> + >>>>> +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, >>>>> + VirtQueueElement *elem) >>>>> +{ >>>>> + int head; >>>>> + unsigned avail_idx; >>>>> + vring_avail_t *avail = svq->vring.avail; >>>>> + >>>>> + head = svq->free_head; >>>>> + >>>>> + /* We need some descriptors here */ >>>>> + assert(elem->out_num || elem->in_num); >>>>> + >>>>> + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, >>>>> + elem->in_num > 0, false); >>>>> + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); >>>>> + >>>>> + /* >>>>> + * Put entry in available array (but don't update avail->idx until they >>>>> + * do sync). >>>>> + */ >>>>> + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); >>>>> + avail->ring[avail_idx] = virtio_tswap16(svq->vdev, head); >>>>> + svq->avail_idx_shadow++; >>>>> + >>>>> + /* Expose descriptors to device */ >>>>> + smp_wmb(); >>>>> + avail->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow); >>>>> + >>>>> + return head; >>>>> + >>>>> +} >>>>> + >>>>> +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, >>>>> + VirtQueueElement *elem) >>>>> +{ >>>>> + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); >>>>> + >>>>> + svq->ring_id_maps[qemu_head] = elem; >>>>> +} >>>>> + >>>>> +/* Handle guest->device notifications */ >>>>> static void vhost_handle_guest_kick(EventNotifier *n) >>>>> { >>>>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, >>>>> @@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n) >>>>> return; >>>>> } >>>>> >>>>> - event_notifier_set(&svq->kick_notifier); >>>>> + /* Make available as many buffers as possible */ >>>>> + do { >>>>> + if (virtio_queue_get_notification(svq->vq)) { >>>>> + /* No more notifications until process all available */ >>>>> + virtio_queue_set_notification(svq->vq, false); >>>>> + } >>>>> + >>>>> + while (true) { >>>>> + VirtQueueElement *elem; >>>>> + if (virtio_queue_full(svq->vq)) { >>>>> + break; >>>> So we've disabled guest notification. If buffer has been consumed, we >>>> need to retry the handle_guest_kick here. 
But I didn't find the code? >>>> >>> This code follows the pattern of virtio_blk_handle_vq: we jump out of >>> the inner while, and we re-enable the notifications. After that, we >>> check for updates on guest avail_idx. >> >> Ok, but this will end up with a lot of unnecessary kicks without event >> index. >> > I can move the kick out of the inner loop, but that could add latency. So I think the right way is to disable the notification until some buffers are consumed by used ring. > >>>>> + } >>>>> + >>>>> + elem = virtqueue_pop(svq->vq, sizeof(*elem)); >>>>> + if (!elem) { >>>>> + break; >>>>> + } >>>>> + >>>>> + vhost_shadow_vq_add(svq, elem); >>>>> + event_notifier_set(&svq->kick_notifier); >>>>> + } >>>>> + >>>>> + virtio_queue_set_notification(svq->vq, true); >>>>> + } while (!virtio_queue_empty(svq->vq)); >>>>> +} >>>>> + >>>>> +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) >>>>> +{ >>>>> + if (svq->used_idx != svq->shadow_used_idx) { >>>>> + return true; >>>>> + } >>>>> + >>>>> + /* Get used idx must not be reordered */ >>>>> + smp_rmb(); >>>>> + svq->shadow_used_idx = virtio_tswap16(svq->vdev, svq->vring.used->idx); >>>>> + >>>>> + return svq->used_idx != svq->shadow_used_idx; >>>>> +} >>>>> + >>>>> +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) >>>>> +{ >>>>> + vring_desc_t *descs = svq->vring.desc; >>>>> + const vring_used_t *used = svq->vring.used; >>>>> + vring_used_elem_t used_elem; >>>>> + uint16_t last_used; >>>>> + >>>>> + if (!vhost_shadow_vq_more_used(svq)) { >>>>> + return NULL; >>>>> + } >>>>> + >>>>> + last_used = svq->used_idx & (svq->vring.num - 1); >>>>> + used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id); >>>>> + used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len); >>>>> + >>>>> + if (unlikely(used_elem.id >= svq->vring.num)) { >>>>> + error_report("Device %s says index %u is available", svq->vdev->name, >>>>> + used_elem.id); >>>>> + return NULL; >>>>> + } >>>>> + >>>>> + descs[used_elem.id].next = svq->free_head; >>>>> + svq->free_head = used_elem.id; >>>>> + >>>>> + svq->used_idx++; >>>>> + svq->ring_id_maps[used_elem.id]->len = used_elem.len; >>>>> + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); >>>>> } >>>>> >>>>> /* Forward vhost notifications */ >>>>> @@ -78,6 +229,7 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>>>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, >>>>> call_notifier); >>>>> EventNotifier *masked_notifier; >>>>> + VirtQueue *vq = svq->vq; >>>>> >>>>> /* Signal start of using masked notifier */ >>>>> qemu_event_reset(&svq->masked_notifier.is_free); >>>>> @@ -86,14 +238,29 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>>>> qemu_event_set(&svq->masked_notifier.is_free); >>>>> } >>>>> >>>>> - if (!masked_notifier) { >>>>> - unsigned n = virtio_get_queue_index(svq->vq); >>>>> - virtio_queue_invalidate_signalled_used(svq->vdev, n); >>>>> - virtio_notify_irqfd(svq->vdev, svq->vq); >>>>> - } else if (!svq->masked_notifier.signaled) { >>>>> - svq->masked_notifier.signaled = true; >>>>> - event_notifier_set(svq->masked_notifier.n); >>>>> - } >>>>> + /* Make as many buffers as possible used. 
*/ >>>>> + do { >>>>> + unsigned i = 0; >>>>> + >>>>> + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ >>>>> + while (true) { >>>>> + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); >>>>> + if (!elem) { >>>>> + break; >>>>> + } >>>>> + >>>>> + assert(i < svq->vring.num); >>>>> + virtqueue_fill(vq, elem, elem->len, i++); >>>>> + } >>>>> + >>>>> + virtqueue_flush(vq, i); >>>>> + if (!masked_notifier) { >>>>> + virtio_notify_irqfd(svq->vdev, svq->vq); >>>>> + } else if (!svq->masked_notifier.signaled) { >>>>> + svq->masked_notifier.signaled = true; >>>>> + event_notifier_set(svq->masked_notifier.n); >>>>> + } >>>>> + } while (vhost_shadow_vq_more_used(svq)); >>>>> >>>>> if (masked_notifier) { >>>>> /* Signal not using it anymore */ >>>>> @@ -103,7 +270,6 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>>>> >>>>> static void vhost_shadow_vq_handle_call(EventNotifier *n) >>>>> { >>>>> - >>>>> if (likely(event_notifier_test_and_clear(n))) { >>>>> vhost_shadow_vq_handle_call_no_test(n); >>>>> } >>>>> @@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, >>>>> unsigned idx, >>>>> VhostShadowVirtqueue *svq) >>>>> { >>>>> + int i; >>>>> int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq); >>>>> + >>>>> + assert(!dev->shadow_vqs_enabled); >>>>> + >>>>> if (unlikely(r < 0)) { >>>>> error_report("Couldn't restore vq kick fd: %s", strerror(-r)); >>>>> } >>>>> @@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, >>>>> /* Restore vhost call */ >>>>> vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx, >>>>> dev->vqs[idx].notifier_is_masked); >>>>> + >>>>> + >>>>> + for (i = 0; i < svq->vring.num; ++i) { >>>>> + g_autofree VirtQueueElement *elem = svq->ring_id_maps[i]; >>>>> + /* >>>>> + * Although the doc says we must unpop in order, it's ok to unpop >>>>> + * everything. >>>>> + */ >>>>> + if (elem) { >>>>> + virtqueue_unpop(svq->vq, elem, elem->len); >>>> Shouldn't we need to wait until all pending requests to be drained? Or >>>> we may end up duplicated requests? >>>> >>> Do you mean pending as in-flight/processing in the device? The device >>> must be paused at this point. >> >> Ok. I see there's a vhost_set_vring_enable(dev, false) in >> vhost_sw_live_migration_start(). >> >> >>> Currently there is no assertion for >>> this, maybe we can track the device status for it. >>> >>> For the queue handlers to be running at this point, the main event >>> loop should serialize QMP and handlers as far as I know (and they >>> would make all state inconsistent if the device stops suddenly). It >>> would need to be synchronized if the handlers run in their own AIO >>> context. That would be nice to have but it's not included here. >> >> That's why I suggest to just drop the QMP stuffs and use cli parameters >> to enable shadow virtqueue. Things would be greatly simplified I guess. >> > I can send a series without it, but SVQ will need to be able to kick > in dynamically sooner or later if we want to use it for live > migration. I'm not sure I get the issue here. My understnading is everyhing will be processed in the same aio context. 
Thanks > >> Thanks >> >> >>>> Thanks >>>> >>>> >>>>> + } >>>>> + } >>>>> } >>>>> >>>>> /* >>>>> @@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) >>>>> unsigned num = virtio_queue_get_num(dev->vdev, vq_idx); >>>>> size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE); >>>>> g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) + ring_size); >>>>> - int r; >>>>> + int r, i; >>>>> >>>>> r = event_notifier_init(&svq->kick_notifier, 0); >>>>> if (r != 0) { >>>>> @@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) >>>>> vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE); >>>>> svq->vq = virtio_get_queue(dev->vdev, vq_idx); >>>>> svq->vdev = dev->vdev; >>>>> + for (i = 0; i < num - 1; i++) { >>>>> + svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1); >>>>> + } >>>>> + >>>>> + svq->ring_id_maps = g_new0(VirtQueueElement *, num); >>>>> event_notifier_set_handler(&svq->call_notifier, >>>>> vhost_shadow_vq_handle_call); >>>>> qemu_event_init(&svq->masked_notifier.is_free, true); >>>>> @@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq) >>>>> event_notifier_cleanup(&vq->kick_notifier); >>>>> event_notifier_set_handler(&vq->call_notifier, NULL); >>>>> event_notifier_cleanup(&vq->call_notifier); >>>>> + g_free(vq->ring_id_maps); >>>>> g_free(vq); >>>>> } >>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c >>>>> index eab3e334f2..a373999bc4 100644 >>>>> --- a/hw/virtio/vhost.c >>>>> +++ b/hw/virtio/vhost.c >>>>> @@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) >>>>> >>>>> trace_vhost_iotlb_miss(dev, 1); >>>>> >>>>> + if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) { >>>>> + uaddr = iova; >>>>> + len = 4096; >>>>> + ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, >>>>> + IOMMU_RW); >>>>> + if (ret) { >>>>> + trace_vhost_iotlb_miss(dev, 2); >>>>> + error_report("Fail to update device iotlb"); >>>>> + } >>>>> + >>>>> + return ret; >>>>> + } >>>>> + >>>>> iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, >>>>> iova, write, >>>>> MEMTXATTRS_UNSPECIFIED); >>>>> @@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) >>>>> /* Can be read by vhost_virtqueue_mask, from vm exit */ >>>>> qatomic_store_release(&dev->shadow_vqs_enabled, false); >>>>> >>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); >>>>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { >>>>> + error_report("Fail to invalidate device iotlb"); >>>>> + } >>>>> + >>>>> for (idx = 0; idx < dev->nvqs; ++idx) { >>>>> + /* >>>>> + * Update used ring information for IOTLB to work correctly, >>>>> + * vhost-kernel code requires for this. >>>>> + */ >>>>> + struct vhost_virtqueue *vq = dev->vqs + idx; >>>>> + vhost_device_iotlb_miss(dev, vq->used_phys, true); >>>>> + >>>>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]); >>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], >>>>> + dev->vq_index + idx); >>>>> + } >>>>> + >>>>> + /* Enable guest's vq vring */ >>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>>>> + >>>>> + for (idx = 0; idx < dev->nvqs; ++idx) { >>>>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); >>>>> } >>>>> >>>>> @@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) >>>>> return 0; >>>>> } >>>>> >>>>> +/* >>>>> + * Start shadow virtqueue in a given queue. 
>>>>> + * In failure case, this function leaves queue working as regular vhost mode. >>>>> + */ >>>>> +static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev, >>>>> + unsigned idx) >>>>> +{ >>>>> + struct vhost_vring_addr addr = { >>>>> + .index = idx, >>>>> + }; >>>>> + struct vhost_vring_state s = { >>>>> + .index = idx, >>>>> + }; >>>>> + int r; >>>>> + bool ok; >>>>> + >>>>> + vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); >>>>> + ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); >>>>> + if (unlikely(!ok)) { >>>>> + return false; >>>>> + } >>>>> + >>>>> + /* From this point, vhost_virtqueue_start can reset these changes */ >>>>> + vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr); >>>>> + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); >>>>> + if (unlikely(r != 0)) { >>>>> + VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed"); >>>>> + goto err; >>>>> + } >>>>> + >>>>> + r = dev->vhost_ops->vhost_set_vring_base(dev, &s); >>>>> + if (unlikely(r != 0)) { >>>>> + VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed"); >>>>> + goto err; >>>>> + } >>>>> + >>>>> + /* >>>>> + * Update used ring information for IOTLB to work correctly, >>>>> + * vhost-kernel code requires for this. >>>>> + */ >>>>> + r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true); >>>>> + if (unlikely(r != 0)) { >>>>> + /* Debug message already printed */ >>>>> + goto err; >>>>> + } >>>>> + >>>>> + return true; >>>>> + >>>>> +err: >>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); >>>>> + return false; >>>>> +} >>>>> + >>>>> static int vhost_sw_live_migration_start(struct vhost_dev *dev) >>>>> { >>>>> int idx, stop_idx; >>>>> @@ -1249,24 +1335,35 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev) >>>>> } >>>>> } >>>>> >>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); >>>>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { >>>>> + error_report("Fail to invalidate device iotlb"); >>>>> + } >>>>> + >>>>> /* Can be read by vhost_virtqueue_mask, from vm exit */ >>>>> qatomic_store_release(&dev->shadow_vqs_enabled, true); >>>>> for (idx = 0; idx < dev->nvqs; ++idx) { >>>>> - bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); >>>>> + bool ok = vhost_sw_live_migration_start_vq(dev, idx); >>>>> if (unlikely(!ok)) { >>>>> goto err_start; >>>>> } >>>>> } >>>>> >>>>> + /* Enable shadow vq vring */ >>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>>>> return 0; >>>>> >>>>> err_start: >>>>> qatomic_store_release(&dev->shadow_vqs_enabled, false); >>>>> for (stop_idx = 0; stop_idx < idx; stop_idx++) { >>>>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]); >>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], >>>>> + dev->vq_index + stop_idx); >>>>> } >>>>> >>>>> err_new: >>>>> + /* Enable guest's vring */ >>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>>>> for (idx = 0; idx < dev->nvqs; ++idx) { >>>>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); >>>>> } >>>>> @@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp) >>>>> >>>>> if (!hdev->started) { >>>>> err_cause = "Device is not started"; >>>>> + } else if (!vhost_dev_has_iommu(hdev)) { >>>>> + err_cause = "Does not support iommu"; >>>>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { >>>>> + err_cause = "Is packed"; >>>>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) { >>>>> + err_cause 
= "Have event idx"; >>>>> + } else if (hdev->acked_features & >>>>> + BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) { >>>>> + err_cause = "Supports indirect descriptors"; >>>>> + } else if (!hdev->vhost_ops->vhost_set_vring_enable) { >>>>> + err_cause = "Cannot pause device"; >>>>> + } >>>>> + >>>>> + if (err_cause) { >>>>> goto err; >>>>> } >>>>>
On Thu, Mar 18, 2021 at 4:14 AM Jason Wang <jasowang@redhat.com> wrote: > > > 在 2021/3/17 下午10:38, Eugenio Perez Martin 写道: > > On Wed, Mar 17, 2021 at 3:51 AM Jason Wang <jasowang@redhat.com> wrote: > >> > >> 在 2021/3/17 上午12:05, Eugenio Perez Martin 写道: > >>> On Tue, Mar 16, 2021 at 9:15 AM Jason Wang <jasowang@redhat.com> wrote: > >>>> 在 2021/3/16 上午3:48, Eugenio Pérez 写道: > >>>>> Initial version of shadow virtqueue that actually forward buffers. > >>>>> > >>>>> It reuses the VirtQueue code for the device part. The driver part is > >>>>> based on Linux's virtio_ring driver, but with stripped functionality > >>>>> and optimizations so it's easier to review. > >>>>> > >>>>> These will be added in later commits. > >>>>> > >>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > >>>>> --- > >>>>> hw/virtio/vhost-shadow-virtqueue.c | 212 +++++++++++++++++++++++++++-- > >>>>> hw/virtio/vhost.c | 113 ++++++++++++++- > >>>>> 2 files changed, 312 insertions(+), 13 deletions(-) > >>>>> > >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c > >>>>> index 1460d1d5d1..68ed0f2740 100644 > >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c > >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c > >>>>> @@ -9,6 +9,7 @@ > >>>>> > >>>>> #include "hw/virtio/vhost-shadow-virtqueue.h" > >>>>> #include "hw/virtio/vhost.h" > >>>>> +#include "hw/virtio/virtio-access.h" > >>>>> > >>>>> #include "standard-headers/linux/vhost_types.h" > >>>>> > >>>>> @@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue { > >>>>> /* Virtio device */ > >>>>> VirtIODevice *vdev; > >>>>> > >>>>> + /* Map for returning guest's descriptors */ > >>>>> + VirtQueueElement **ring_id_maps; > >>>>> + > >>>>> + /* Next head to expose to device */ > >>>>> + uint16_t avail_idx_shadow; > >>>>> + > >>>>> + /* Next free descriptor */ > >>>>> + uint16_t free_head; > >>>>> + > >>>>> + /* Last seen used idx */ > >>>>> + uint16_t shadow_used_idx; > >>>>> + > >>>>> + /* Next head to consume from device */ > >>>>> + uint16_t used_idx; > >>>>> + > >>>>> /* Descriptors copied from guest */ > >>>>> vring_desc_t descs[]; > >>>>> } VhostShadowVirtqueue; > >>>>> > >>>>> -/* Forward guest notifications */ > >>>>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, > >>>>> + const struct iovec *iovec, > >>>>> + size_t num, bool more_descs, bool write) > >>>>> +{ > >>>>> + uint16_t i = svq->free_head, last = svq->free_head; > >>>>> + unsigned n; > >>>>> + uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0; > >>>>> + vring_desc_t *descs = svq->vring.desc; > >>>>> + > >>>>> + if (num == 0) { > >>>>> + return; > >>>>> + } > >>>>> + > >>>>> + for (n = 0; n < num; n++) { > >>>>> + if (more_descs || (n + 1 < num)) { > >>>>> + descs[i].flags = flags | virtio_tswap16(svq->vdev, > >>>>> + VRING_DESC_F_NEXT); > >>>>> + } else { > >>>>> + descs[i].flags = flags; > >>>>> + } > >>>>> + descs[i].addr = virtio_tswap64(svq->vdev, (hwaddr)iovec[n].iov_base); > >>>> So unsing virtio_tswap() is probably not correct since we're talking > >>>> with vhost backends which has its own endiness. > >>>> > >>> I was trying to expose the buffer with the same endianness as the > >>> driver originally offered, so if guest->qemu requires a bswap, I think > >>> it will always require a bswap again to expose to the device again. > >> > >> So assumes vhost-vDPA always provide a non-transitional device[1]. 
> >> > >> Then if Qemu present a transitional device, we need to do the endian > >> conversion when necessary, if Qemu present a non-transitional device, we > >> don't need to do that, guest driver will do that for us. > >> > >> But it looks to me the virtio_tswap() can't be used for this since it: > >> > >> static inline bool virtio_access_is_big_endian(VirtIODevice *vdev) > >> { > >> #if defined(LEGACY_VIRTIO_IS_BIENDIAN) > >> return virtio_is_big_endian(vdev); > >> #elif defined(TARGET_WORDS_BIGENDIAN) > >> if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { > >> /* Devices conforming to VIRTIO 1.0 or later are always LE. */ > >> return false; > >> } > >> return true; > >> #else > >> return false; > >> #endif > >> } > >> > >> So if we present a legacy device on top of a non-transitiaonl vDPA > >> device. The VIRITIO_F_VERSION_1 check is wrong. > >> > >> > >>>> For vhost-vDPA, we can assume that it's a 1.0 device. > >>> Isn't it needed if the host is big endian? > >> > >> [1] > >> > >> So non-transitional device always assume little endian. > >> > >> For vhost-vDPA, we don't want to present transitional device which may > >> end up with a lot of burdens. > >> > >> I suspect the legacy driver plust vhost vDPA already break, so I plan to > >> mandate VERSION_1 for all vDPA devices. > >> > > Right. I think it's the best then. > > > > However, then we will need a similar method to always expose > > address/length as little endian, isn't it? > > > Yes. > > > > > >>>>> + descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len); > >>>>> + > >>>>> + last = i; > >>>>> + i = virtio_tswap16(svq->vdev, descs[i].next); > >>>>> + } > >>>>> + > >>>>> + svq->free_head = virtio_tswap16(svq->vdev, descs[last].next); > >>>>> +} > >>>>> + > >>>>> +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, > >>>>> + VirtQueueElement *elem) > >>>>> +{ > >>>>> + int head; > >>>>> + unsigned avail_idx; > >>>>> + vring_avail_t *avail = svq->vring.avail; > >>>>> + > >>>>> + head = svq->free_head; > >>>>> + > >>>>> + /* We need some descriptors here */ > >>>>> + assert(elem->out_num || elem->in_num); > >>>>> + > >>>>> + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, > >>>>> + elem->in_num > 0, false); > >>>>> + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); > >>>>> + > >>>>> + /* > >>>>> + * Put entry in available array (but don't update avail->idx until they > >>>>> + * do sync). 
> >>>>> + */ > >>>>> + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); > >>>>> + avail->ring[avail_idx] = virtio_tswap16(svq->vdev, head); > >>>>> + svq->avail_idx_shadow++; > >>>>> + > >>>>> + /* Expose descriptors to device */ > >>>>> + smp_wmb(); > >>>>> + avail->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow); > >>>>> + > >>>>> + return head; > >>>>> + > >>>>> +} > >>>>> + > >>>>> +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, > >>>>> + VirtQueueElement *elem) > >>>>> +{ > >>>>> + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); > >>>>> + > >>>>> + svq->ring_id_maps[qemu_head] = elem; > >>>>> +} > >>>>> + > >>>>> +/* Handle guest->device notifications */ > >>>>> static void vhost_handle_guest_kick(EventNotifier *n) > >>>>> { > >>>>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > >>>>> @@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n) > >>>>> return; > >>>>> } > >>>>> > >>>>> - event_notifier_set(&svq->kick_notifier); > >>>>> + /* Make available as many buffers as possible */ > >>>>> + do { > >>>>> + if (virtio_queue_get_notification(svq->vq)) { > >>>>> + /* No more notifications until process all available */ > >>>>> + virtio_queue_set_notification(svq->vq, false); > >>>>> + } > >>>>> + > >>>>> + while (true) { > >>>>> + VirtQueueElement *elem; > >>>>> + if (virtio_queue_full(svq->vq)) { > >>>>> + break; > >>>> So we've disabled guest notification. If buffer has been consumed, we > >>>> need to retry the handle_guest_kick here. But I didn't find the code? > >>>> > >>> This code follows the pattern of virtio_blk_handle_vq: we jump out of > >>> the inner while, and we re-enable the notifications. After that, we > >>> check for updates on guest avail_idx. > >> > >> Ok, but this will end up with a lot of unnecessary kicks without event > >> index. > >> > > I can move the kick out of the inner loop, but that could add latency. > > > So I think the right way is to disable the notification until some > buffers are consumed by used ring. > I'm not sure if you mean: a) To limit to the maximum amount of buffers that can be available in Shadow Virtqueue at the same time. As I can see, the easiest way to do this would be to unregister vhost_handle_guest_kick from the event loop and let vhost_shadow_vq_handle_call to re-register it at some threshold of available buffers. I'm not sure how much this limit should be, but it seems wasteful for me to not to fill shadow virqueue naturally. b) To limit the amount of buffers that vhost_handle_guest_kick forwards to shadow virtqueue in one call. This already has a natural limit of the queue size, since the buffers will not be consumed (as forarded-to-guest) by qemu while this function is running. This limit could be reduced and vhost_handle_guest_kick could re-enqueue itself if its not reached. Same as previous, I'm not sure how much is a right limit, but vhost_handle_guest_kick will not make available more than queue size. c) To kick every N buffers made available, instead of N=1. I think this is not the solution you are proposing, but maybe is simpler than previous. 
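As a rough illustration of option (c), the kick could be issued once per batch of buffers made available rather than once per buffer, reusing the helpers the patch already calls. This is only a sketch of the idea, not a proposal of the final code:

/*
 * Sketch of option (c): one kick per batch of buffers made available,
 * instead of one kick per buffer.  Same context and helpers as
 * vhost_handle_guest_kick() in the patch; illustration only.
 */
static void vhost_handle_guest_kick_batched(VhostShadowVirtqueue *svq)
{
    do {
        unsigned added = 0;

        if (virtio_queue_get_notification(svq->vq)) {
            /* No more guest notifications until all available is processed */
            virtio_queue_set_notification(svq->vq, false);
        }

        while (!virtio_queue_full(svq->vq)) {
            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
            if (!elem) {
                break;
            }
            vhost_shadow_vq_add(svq, elem);
            added++;
        }

        if (added > 0) {
            /* A single kick covers the whole batch just exposed */
            event_notifier_set(&svq->kick_notifier);
        }

        virtio_queue_set_notification(svq->vq, true);
    } while (!virtio_queue_empty(svq->vq));
}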
> > > > >>>>> + } > >>>>> + > >>>>> + elem = virtqueue_pop(svq->vq, sizeof(*elem)); > >>>>> + if (!elem) { > >>>>> + break; > >>>>> + } > >>>>> + > >>>>> + vhost_shadow_vq_add(svq, elem); > >>>>> + event_notifier_set(&svq->kick_notifier); > >>>>> + } > >>>>> + > >>>>> + virtio_queue_set_notification(svq->vq, true); > >>>>> + } while (!virtio_queue_empty(svq->vq)); > >>>>> +} > >>>>> + > >>>>> +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) > >>>>> +{ > >>>>> + if (svq->used_idx != svq->shadow_used_idx) { > >>>>> + return true; > >>>>> + } > >>>>> + > >>>>> + /* Get used idx must not be reordered */ > >>>>> + smp_rmb(); > >>>>> + svq->shadow_used_idx = virtio_tswap16(svq->vdev, svq->vring.used->idx); > >>>>> + > >>>>> + return svq->used_idx != svq->shadow_used_idx; > >>>>> +} > >>>>> + > >>>>> +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) > >>>>> +{ > >>>>> + vring_desc_t *descs = svq->vring.desc; > >>>>> + const vring_used_t *used = svq->vring.used; > >>>>> + vring_used_elem_t used_elem; > >>>>> + uint16_t last_used; > >>>>> + > >>>>> + if (!vhost_shadow_vq_more_used(svq)) { > >>>>> + return NULL; > >>>>> + } > >>>>> + > >>>>> + last_used = svq->used_idx & (svq->vring.num - 1); > >>>>> + used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id); > >>>>> + used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len); > >>>>> + > >>>>> + if (unlikely(used_elem.id >= svq->vring.num)) { > >>>>> + error_report("Device %s says index %u is available", svq->vdev->name, > >>>>> + used_elem.id); > >>>>> + return NULL; > >>>>> + } > >>>>> + > >>>>> + descs[used_elem.id].next = svq->free_head; > >>>>> + svq->free_head = used_elem.id; > >>>>> + > >>>>> + svq->used_idx++; > >>>>> + svq->ring_id_maps[used_elem.id]->len = used_elem.len; > >>>>> + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); > >>>>> } > >>>>> > >>>>> /* Forward vhost notifications */ > >>>>> @@ -78,6 +229,7 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > >>>>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > >>>>> call_notifier); > >>>>> EventNotifier *masked_notifier; > >>>>> + VirtQueue *vq = svq->vq; > >>>>> > >>>>> /* Signal start of using masked notifier */ > >>>>> qemu_event_reset(&svq->masked_notifier.is_free); > >>>>> @@ -86,14 +238,29 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > >>>>> qemu_event_set(&svq->masked_notifier.is_free); > >>>>> } > >>>>> > >>>>> - if (!masked_notifier) { > >>>>> - unsigned n = virtio_get_queue_index(svq->vq); > >>>>> - virtio_queue_invalidate_signalled_used(svq->vdev, n); > >>>>> - virtio_notify_irqfd(svq->vdev, svq->vq); > >>>>> - } else if (!svq->masked_notifier.signaled) { > >>>>> - svq->masked_notifier.signaled = true; > >>>>> - event_notifier_set(svq->masked_notifier.n); > >>>>> - } > >>>>> + /* Make as many buffers as possible used. 
*/ > >>>>> + do { > >>>>> + unsigned i = 0; > >>>>> + > >>>>> + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ > >>>>> + while (true) { > >>>>> + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); > >>>>> + if (!elem) { > >>>>> + break; > >>>>> + } > >>>>> + > >>>>> + assert(i < svq->vring.num); > >>>>> + virtqueue_fill(vq, elem, elem->len, i++); > >>>>> + } > >>>>> + > >>>>> + virtqueue_flush(vq, i); > >>>>> + if (!masked_notifier) { > >>>>> + virtio_notify_irqfd(svq->vdev, svq->vq); > >>>>> + } else if (!svq->masked_notifier.signaled) { > >>>>> + svq->masked_notifier.signaled = true; > >>>>> + event_notifier_set(svq->masked_notifier.n); > >>>>> + } > >>>>> + } while (vhost_shadow_vq_more_used(svq)); > >>>>> > >>>>> if (masked_notifier) { > >>>>> /* Signal not using it anymore */ > >>>>> @@ -103,7 +270,6 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > >>>>> > >>>>> static void vhost_shadow_vq_handle_call(EventNotifier *n) > >>>>> { > >>>>> - > >>>>> if (likely(event_notifier_test_and_clear(n))) { > >>>>> vhost_shadow_vq_handle_call_no_test(n); > >>>>> } > >>>>> @@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, > >>>>> unsigned idx, > >>>>> VhostShadowVirtqueue *svq) > >>>>> { > >>>>> + int i; > >>>>> int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq); > >>>>> + > >>>>> + assert(!dev->shadow_vqs_enabled); > >>>>> + > >>>>> if (unlikely(r < 0)) { > >>>>> error_report("Couldn't restore vq kick fd: %s", strerror(-r)); > >>>>> } > >>>>> @@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, > >>>>> /* Restore vhost call */ > >>>>> vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx, > >>>>> dev->vqs[idx].notifier_is_masked); > >>>>> + > >>>>> + > >>>>> + for (i = 0; i < svq->vring.num; ++i) { > >>>>> + g_autofree VirtQueueElement *elem = svq->ring_id_maps[i]; > >>>>> + /* > >>>>> + * Although the doc says we must unpop in order, it's ok to unpop > >>>>> + * everything. > >>>>> + */ > >>>>> + if (elem) { > >>>>> + virtqueue_unpop(svq->vq, elem, elem->len); > >>>> Shouldn't we need to wait until all pending requests to be drained? Or > >>>> we may end up duplicated requests? > >>>> > >>> Do you mean pending as in-flight/processing in the device? The device > >>> must be paused at this point. > >> > >> Ok. I see there's a vhost_set_vring_enable(dev, false) in > >> vhost_sw_live_migration_start(). > >> > >> > >>> Currently there is no assertion for > >>> this, maybe we can track the device status for it. > >>> > >>> For the queue handlers to be running at this point, the main event > >>> loop should serialize QMP and handlers as far as I know (and they > >>> would make all state inconsistent if the device stops suddenly). It > >>> would need to be synchronized if the handlers run in their own AIO > >>> context. That would be nice to have but it's not included here. > >> > >> That's why I suggest to just drop the QMP stuffs and use cli parameters > >> to enable shadow virtqueue. Things would be greatly simplified I guess. > >> > > I can send a series without it, but SVQ will need to be able to kick > > in dynamically sooner or later if we want to use it for live > > migration. > > > I'm not sure I get the issue here. My understnading is everyhing will be > processed in the same aio context. > What I meant is that QMP allows us to activate the shadow virtqueue mode in any moment, similar to how live migration would activate it. 
To enable SVQ with a command line would imply that it runs the same way for all the time qemu runs. If we do that way, we don't need more synchronization, since we have deleted the event that could run concurrently with the masking. But this synchronization will be needed if we want to enable SVQ dynamically for live migration, so we are "just" delaying work. However, if we add vdpa iova range to this patch series, I think it would be a good idea to delay that synchronization work to future series, so they are smaller and the first one can be tested better. > Thanks > > > > > >> Thanks > >> > >> > >>>> Thanks > >>>> > >>>> > >>>>> + } > >>>>> + } > >>>>> } > >>>>> > >>>>> /* > >>>>> @@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) > >>>>> unsigned num = virtio_queue_get_num(dev->vdev, vq_idx); > >>>>> size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE); > >>>>> g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) + ring_size); > >>>>> - int r; > >>>>> + int r, i; > >>>>> > >>>>> r = event_notifier_init(&svq->kick_notifier, 0); > >>>>> if (r != 0) { > >>>>> @@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) > >>>>> vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE); > >>>>> svq->vq = virtio_get_queue(dev->vdev, vq_idx); > >>>>> svq->vdev = dev->vdev; > >>>>> + for (i = 0; i < num - 1; i++) { > >>>>> + svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1); > >>>>> + } > >>>>> + > >>>>> + svq->ring_id_maps = g_new0(VirtQueueElement *, num); > >>>>> event_notifier_set_handler(&svq->call_notifier, > >>>>> vhost_shadow_vq_handle_call); > >>>>> qemu_event_init(&svq->masked_notifier.is_free, true); > >>>>> @@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq) > >>>>> event_notifier_cleanup(&vq->kick_notifier); > >>>>> event_notifier_set_handler(&vq->call_notifier, NULL); > >>>>> event_notifier_cleanup(&vq->call_notifier); > >>>>> + g_free(vq->ring_id_maps); > >>>>> g_free(vq); > >>>>> } > >>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > >>>>> index eab3e334f2..a373999bc4 100644 > >>>>> --- a/hw/virtio/vhost.c > >>>>> +++ b/hw/virtio/vhost.c > >>>>> @@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) > >>>>> > >>>>> trace_vhost_iotlb_miss(dev, 1); > >>>>> > >>>>> + if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) { > >>>>> + uaddr = iova; > >>>>> + len = 4096; > >>>>> + ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, > >>>>> + IOMMU_RW); > >>>>> + if (ret) { > >>>>> + trace_vhost_iotlb_miss(dev, 2); > >>>>> + error_report("Fail to update device iotlb"); > >>>>> + } > >>>>> + > >>>>> + return ret; > >>>>> + } > >>>>> + > >>>>> iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, > >>>>> iova, write, > >>>>> MEMTXATTRS_UNSPECIFIED); > >>>>> @@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > >>>>> /* Can be read by vhost_virtqueue_mask, from vm exit */ > >>>>> qatomic_store_release(&dev->shadow_vqs_enabled, false); > >>>>> > >>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); > >>>>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { > >>>>> + error_report("Fail to invalidate device iotlb"); > >>>>> + } > >>>>> + > >>>>> for (idx = 0; idx < dev->nvqs; ++idx) { > >>>>> + /* > >>>>> + * Update used ring information for IOTLB to work correctly, > >>>>> + * vhost-kernel code requires for this. 
> >>>>> + */ > >>>>> + struct vhost_virtqueue *vq = dev->vqs + idx; > >>>>> + vhost_device_iotlb_miss(dev, vq->used_phys, true); > >>>>> + > >>>>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]); > >>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], > >>>>> + dev->vq_index + idx); > >>>>> + } > >>>>> + > >>>>> + /* Enable guest's vq vring */ > >>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); > >>>>> + > >>>>> + for (idx = 0; idx < dev->nvqs; ++idx) { > >>>>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); > >>>>> } > >>>>> > >>>>> @@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > >>>>> return 0; > >>>>> } > >>>>> > >>>>> +/* > >>>>> + * Start shadow virtqueue in a given queue. > >>>>> + * In failure case, this function leaves queue working as regular vhost mode. > >>>>> + */ > >>>>> +static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev, > >>>>> + unsigned idx) > >>>>> +{ > >>>>> + struct vhost_vring_addr addr = { > >>>>> + .index = idx, > >>>>> + }; > >>>>> + struct vhost_vring_state s = { > >>>>> + .index = idx, > >>>>> + }; > >>>>> + int r; > >>>>> + bool ok; > >>>>> + > >>>>> + vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); > >>>>> + ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); > >>>>> + if (unlikely(!ok)) { > >>>>> + return false; > >>>>> + } > >>>>> + > >>>>> + /* From this point, vhost_virtqueue_start can reset these changes */ > >>>>> + vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr); > >>>>> + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); > >>>>> + if (unlikely(r != 0)) { > >>>>> + VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed"); > >>>>> + goto err; > >>>>> + } > >>>>> + > >>>>> + r = dev->vhost_ops->vhost_set_vring_base(dev, &s); > >>>>> + if (unlikely(r != 0)) { > >>>>> + VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed"); > >>>>> + goto err; > >>>>> + } > >>>>> + > >>>>> + /* > >>>>> + * Update used ring information for IOTLB to work correctly, > >>>>> + * vhost-kernel code requires for this. 
> >>>>> + */ > >>>>> + r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true); > >>>>> + if (unlikely(r != 0)) { > >>>>> + /* Debug message already printed */ > >>>>> + goto err; > >>>>> + } > >>>>> + > >>>>> + return true; > >>>>> + > >>>>> +err: > >>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); > >>>>> + return false; > >>>>> +} > >>>>> + > >>>>> static int vhost_sw_live_migration_start(struct vhost_dev *dev) > >>>>> { > >>>>> int idx, stop_idx; > >>>>> @@ -1249,24 +1335,35 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev) > >>>>> } > >>>>> } > >>>>> > >>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); > >>>>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { > >>>>> + error_report("Fail to invalidate device iotlb"); > >>>>> + } > >>>>> + > >>>>> /* Can be read by vhost_virtqueue_mask, from vm exit */ > >>>>> qatomic_store_release(&dev->shadow_vqs_enabled, true); > >>>>> for (idx = 0; idx < dev->nvqs; ++idx) { > >>>>> - bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); > >>>>> + bool ok = vhost_sw_live_migration_start_vq(dev, idx); > >>>>> if (unlikely(!ok)) { > >>>>> goto err_start; > >>>>> } > >>>>> } > >>>>> > >>>>> + /* Enable shadow vq vring */ > >>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); > >>>>> return 0; > >>>>> > >>>>> err_start: > >>>>> qatomic_store_release(&dev->shadow_vqs_enabled, false); > >>>>> for (stop_idx = 0; stop_idx < idx; stop_idx++) { > >>>>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]); > >>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], > >>>>> + dev->vq_index + stop_idx); > >>>>> } > >>>>> > >>>>> err_new: > >>>>> + /* Enable guest's vring */ > >>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); > >>>>> for (idx = 0; idx < dev->nvqs; ++idx) { > >>>>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); > >>>>> } > >>>>> @@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp) > >>>>> > >>>>> if (!hdev->started) { > >>>>> err_cause = "Device is not started"; > >>>>> + } else if (!vhost_dev_has_iommu(hdev)) { > >>>>> + err_cause = "Does not support iommu"; > >>>>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { > >>>>> + err_cause = "Is packed"; > >>>>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) { > >>>>> + err_cause = "Have event idx"; > >>>>> + } else if (hdev->acked_features & > >>>>> + BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) { > >>>>> + err_cause = "Supports indirect descriptors"; > >>>>> + } else if (!hdev->vhost_ops->vhost_set_vring_enable) { > >>>>> + err_cause = "Cannot pause device"; > >>>>> + } > >>>>> + > >>>>> + if (err_cause) { > >>>>> goto err; > >>>>> } > >>>>> >
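For completeness, one possible shape of option (a) mentioned earlier in this message, sketched with hypothetical names: the field holding the guest kick notifier is not shown in the quoted hunks, so guest_kick_notifier below is an assumption, and the re-arm threshold is left open as in the thread.

/*
 * Sketch of option (a), hypothetical names and illustration only:
 * svq->guest_kick_notifier is assumed to be the notifier whose handler
 * is vhost_handle_guest_kick().
 */
static void vhost_shadow_vq_pause_guest_kick(VhostShadowVirtqueue *svq)
{
    /* Shadow ring is full: stop reacting to guest kicks for now */
    event_notifier_set_handler(&svq->guest_kick_notifier, NULL);
}

static void vhost_shadow_vq_resume_guest_kick(VhostShadowVirtqueue *svq)
{
    /*
     * Called from the used-ring handler once enough descriptors have been
     * returned.  Re-arm the handler and poke it so buffers queued in the
     * meantime are not missed.
     */
    event_notifier_set_handler(&svq->guest_kick_notifier,
                               vhost_handle_guest_kick);
    event_notifier_set(&svq->guest_kick_notifier);
}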
在 2021/3/18 下午4:06, Eugenio Perez Martin 写道: > On Thu, Mar 18, 2021 at 4:14 AM Jason Wang <jasowang@redhat.com> wrote: >> >> 在 2021/3/17 下午10:38, Eugenio Perez Martin 写道: >>> On Wed, Mar 17, 2021 at 3:51 AM Jason Wang <jasowang@redhat.com> wrote: >>>> 在 2021/3/17 上午12:05, Eugenio Perez Martin 写道: >>>>> On Tue, Mar 16, 2021 at 9:15 AM Jason Wang <jasowang@redhat.com> wrote: >>>>>> 在 2021/3/16 上午3:48, Eugenio Pérez 写道: >>>>>>> Initial version of shadow virtqueue that actually forward buffers. >>>>>>> >>>>>>> It reuses the VirtQueue code for the device part. The driver part is >>>>>>> based on Linux's virtio_ring driver, but with stripped functionality >>>>>>> and optimizations so it's easier to review. >>>>>>> >>>>>>> These will be added in later commits. >>>>>>> >>>>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com> >>>>>>> --- >>>>>>> hw/virtio/vhost-shadow-virtqueue.c | 212 +++++++++++++++++++++++++++-- >>>>>>> hw/virtio/vhost.c | 113 ++++++++++++++- >>>>>>> 2 files changed, 312 insertions(+), 13 deletions(-) >>>>>>> >>>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c >>>>>>> index 1460d1d5d1..68ed0f2740 100644 >>>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c >>>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c >>>>>>> @@ -9,6 +9,7 @@ >>>>>>> >>>>>>> #include "hw/virtio/vhost-shadow-virtqueue.h" >>>>>>> #include "hw/virtio/vhost.h" >>>>>>> +#include "hw/virtio/virtio-access.h" >>>>>>> >>>>>>> #include "standard-headers/linux/vhost_types.h" >>>>>>> >>>>>>> @@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue { >>>>>>> /* Virtio device */ >>>>>>> VirtIODevice *vdev; >>>>>>> >>>>>>> + /* Map for returning guest's descriptors */ >>>>>>> + VirtQueueElement **ring_id_maps; >>>>>>> + >>>>>>> + /* Next head to expose to device */ >>>>>>> + uint16_t avail_idx_shadow; >>>>>>> + >>>>>>> + /* Next free descriptor */ >>>>>>> + uint16_t free_head; >>>>>>> + >>>>>>> + /* Last seen used idx */ >>>>>>> + uint16_t shadow_used_idx; >>>>>>> + >>>>>>> + /* Next head to consume from device */ >>>>>>> + uint16_t used_idx; >>>>>>> + >>>>>>> /* Descriptors copied from guest */ >>>>>>> vring_desc_t descs[]; >>>>>>> } VhostShadowVirtqueue; >>>>>>> >>>>>>> -/* Forward guest notifications */ >>>>>>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, >>>>>>> + const struct iovec *iovec, >>>>>>> + size_t num, bool more_descs, bool write) >>>>>>> +{ >>>>>>> + uint16_t i = svq->free_head, last = svq->free_head; >>>>>>> + unsigned n; >>>>>>> + uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0; >>>>>>> + vring_desc_t *descs = svq->vring.desc; >>>>>>> + >>>>>>> + if (num == 0) { >>>>>>> + return; >>>>>>> + } >>>>>>> + >>>>>>> + for (n = 0; n < num; n++) { >>>>>>> + if (more_descs || (n + 1 < num)) { >>>>>>> + descs[i].flags = flags | virtio_tswap16(svq->vdev, >>>>>>> + VRING_DESC_F_NEXT); >>>>>>> + } else { >>>>>>> + descs[i].flags = flags; >>>>>>> + } >>>>>>> + descs[i].addr = virtio_tswap64(svq->vdev, (hwaddr)iovec[n].iov_base); >>>>>> So unsing virtio_tswap() is probably not correct since we're talking >>>>>> with vhost backends which has its own endiness. >>>>>> >>>>> I was trying to expose the buffer with the same endianness as the >>>>> driver originally offered, so if guest->qemu requires a bswap, I think >>>>> it will always require a bswap again to expose to the device again. >>>> So assumes vhost-vDPA always provide a non-transitional device[1]. 
>>>> >>>> Then if Qemu present a transitional device, we need to do the endian >>>> conversion when necessary, if Qemu present a non-transitional device, we >>>> don't need to do that, guest driver will do that for us. >>>> >>>> But it looks to me the virtio_tswap() can't be used for this since it: >>>> >>>> static inline bool virtio_access_is_big_endian(VirtIODevice *vdev) >>>> { >>>> #if defined(LEGACY_VIRTIO_IS_BIENDIAN) >>>> return virtio_is_big_endian(vdev); >>>> #elif defined(TARGET_WORDS_BIGENDIAN) >>>> if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { >>>> /* Devices conforming to VIRTIO 1.0 or later are always LE. */ >>>> return false; >>>> } >>>> return true; >>>> #else >>>> return false; >>>> #endif >>>> } >>>> >>>> So if we present a legacy device on top of a non-transitiaonl vDPA >>>> device. The VIRITIO_F_VERSION_1 check is wrong. >>>> >>>> >>>>>> For vhost-vDPA, we can assume that it's a 1.0 device. >>>>> Isn't it needed if the host is big endian? >>>> [1] >>>> >>>> So non-transitional device always assume little endian. >>>> >>>> For vhost-vDPA, we don't want to present transitional device which may >>>> end up with a lot of burdens. >>>> >>>> I suspect the legacy driver plust vhost vDPA already break, so I plan to >>>> mandate VERSION_1 for all vDPA devices. >>>> >>> Right. I think it's the best then. >>> >>> However, then we will need a similar method to always expose >>> address/length as little endian, isn't it? >> >> Yes. >> >> >>>>>>> + descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len); >>>>>>> + >>>>>>> + last = i; >>>>>>> + i = virtio_tswap16(svq->vdev, descs[i].next); >>>>>>> + } >>>>>>> + >>>>>>> + svq->free_head = virtio_tswap16(svq->vdev, descs[last].next); >>>>>>> +} >>>>>>> + >>>>>>> +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, >>>>>>> + VirtQueueElement *elem) >>>>>>> +{ >>>>>>> + int head; >>>>>>> + unsigned avail_idx; >>>>>>> + vring_avail_t *avail = svq->vring.avail; >>>>>>> + >>>>>>> + head = svq->free_head; >>>>>>> + >>>>>>> + /* We need some descriptors here */ >>>>>>> + assert(elem->out_num || elem->in_num); >>>>>>> + >>>>>>> + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, >>>>>>> + elem->in_num > 0, false); >>>>>>> + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); >>>>>>> + >>>>>>> + /* >>>>>>> + * Put entry in available array (but don't update avail->idx until they >>>>>>> + * do sync). 
>>>>>>> + */ >>>>>>> + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); >>>>>>> + avail->ring[avail_idx] = virtio_tswap16(svq->vdev, head); >>>>>>> + svq->avail_idx_shadow++; >>>>>>> + >>>>>>> + /* Expose descriptors to device */ >>>>>>> + smp_wmb(); >>>>>>> + avail->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow); >>>>>>> + >>>>>>> + return head; >>>>>>> + >>>>>>> +} >>>>>>> + >>>>>>> +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, >>>>>>> + VirtQueueElement *elem) >>>>>>> +{ >>>>>>> + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); >>>>>>> + >>>>>>> + svq->ring_id_maps[qemu_head] = elem; >>>>>>> +} >>>>>>> + >>>>>>> +/* Handle guest->device notifications */ >>>>>>> static void vhost_handle_guest_kick(EventNotifier *n) >>>>>>> { >>>>>>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, >>>>>>> @@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n) >>>>>>> return; >>>>>>> } >>>>>>> >>>>>>> - event_notifier_set(&svq->kick_notifier); >>>>>>> + /* Make available as many buffers as possible */ >>>>>>> + do { >>>>>>> + if (virtio_queue_get_notification(svq->vq)) { >>>>>>> + /* No more notifications until process all available */ >>>>>>> + virtio_queue_set_notification(svq->vq, false); >>>>>>> + } >>>>>>> + >>>>>>> + while (true) { >>>>>>> + VirtQueueElement *elem; >>>>>>> + if (virtio_queue_full(svq->vq)) { >>>>>>> + break; >>>>>> So we've disabled guest notification. If buffer has been consumed, we >>>>>> need to retry the handle_guest_kick here. But I didn't find the code? >>>>>> >>>>> This code follows the pattern of virtio_blk_handle_vq: we jump out of >>>>> the inner while, and we re-enable the notifications. After that, we >>>>> check for updates on guest avail_idx. >>>> Ok, but this will end up with a lot of unnecessary kicks without event >>>> index. >>>> >>> I can move the kick out of the inner loop, but that could add latency. >> >> So I think the right way is to disable the notification until some >> buffers are consumed by used ring. >> > I'm not sure if you mean: > > a) To limit to the maximum amount of buffers that can be available in > Shadow Virtqueue at the same time. > > As I can see, the easiest way to do this would be to unregister > vhost_handle_guest_kick from the event loop and let > vhost_shadow_vq_handle_call to re-register it at some threshold of > available buffers. > > I'm not sure how much this limit should be, but it seems wasteful for > me to not to fill shadow virqueue naturally. Yes, and I'm not sure how much we could gain from this extra complexity. > > b) To limit the amount of buffers that vhost_handle_guest_kick > forwards to shadow virtqueue in one call. > > This already has a natural limit of the queue size, since the buffers > will not be consumed (as forarded-to-guest) by qemu while this > function is running. This limit could be reduced and > vhost_handle_guest_kick could re-enqueue itself if its not reached. > Same as previous, I'm not sure how much is a right limit, but > vhost_handle_guest_kick will not make available more than queue size. Yes, so using queue size is how the code works currently and it should be fine if we know svq and vq are the same size. We can leave the kick notification for the future, (I guess at least for networking device, hitting virtio_queue_full() should be rare). It will be an real issue if svq and vq doesn't have the same size, but we can also leave this for future. > > c) To kick every N buffers made available, instead of N=1. 
> > I think this is not the solution you are proposing, but maybe is > simpler than previous. > >>>>>>> + } >>>>>>> + >>>>>>> + elem = virtqueue_pop(svq->vq, sizeof(*elem)); >>>>>>> + if (!elem) { >>>>>>> + break; >>>>>>> + } >>>>>>> + >>>>>>> + vhost_shadow_vq_add(svq, elem); >>>>>>> + event_notifier_set(&svq->kick_notifier); >>>>>>> + } >>>>>>> + >>>>>>> + virtio_queue_set_notification(svq->vq, true); >>>>>>> + } while (!virtio_queue_empty(svq->vq)); >>>>>>> +} >>>>>>> + >>>>>>> +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) >>>>>>> +{ >>>>>>> + if (svq->used_idx != svq->shadow_used_idx) { >>>>>>> + return true; >>>>>>> + } >>>>>>> + >>>>>>> + /* Get used idx must not be reordered */ >>>>>>> + smp_rmb(); >>>>>>> + svq->shadow_used_idx = virtio_tswap16(svq->vdev, svq->vring.used->idx); >>>>>>> + >>>>>>> + return svq->used_idx != svq->shadow_used_idx; >>>>>>> +} >>>>>>> + >>>>>>> +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) >>>>>>> +{ >>>>>>> + vring_desc_t *descs = svq->vring.desc; >>>>>>> + const vring_used_t *used = svq->vring.used; >>>>>>> + vring_used_elem_t used_elem; >>>>>>> + uint16_t last_used; >>>>>>> + >>>>>>> + if (!vhost_shadow_vq_more_used(svq)) { >>>>>>> + return NULL; >>>>>>> + } >>>>>>> + >>>>>>> + last_used = svq->used_idx & (svq->vring.num - 1); >>>>>>> + used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id); >>>>>>> + used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len); >>>>>>> + >>>>>>> + if (unlikely(used_elem.id >= svq->vring.num)) { >>>>>>> + error_report("Device %s says index %u is available", svq->vdev->name, >>>>>>> + used_elem.id); >>>>>>> + return NULL; >>>>>>> + } >>>>>>> + >>>>>>> + descs[used_elem.id].next = svq->free_head; >>>>>>> + svq->free_head = used_elem.id; >>>>>>> + >>>>>>> + svq->used_idx++; >>>>>>> + svq->ring_id_maps[used_elem.id]->len = used_elem.len; >>>>>>> + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); >>>>>>> } >>>>>>> >>>>>>> /* Forward vhost notifications */ >>>>>>> @@ -78,6 +229,7 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>>>>>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, >>>>>>> call_notifier); >>>>>>> EventNotifier *masked_notifier; >>>>>>> + VirtQueue *vq = svq->vq; >>>>>>> >>>>>>> /* Signal start of using masked notifier */ >>>>>>> qemu_event_reset(&svq->masked_notifier.is_free); >>>>>>> @@ -86,14 +238,29 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>>>>>> qemu_event_set(&svq->masked_notifier.is_free); >>>>>>> } >>>>>>> >>>>>>> - if (!masked_notifier) { >>>>>>> - unsigned n = virtio_get_queue_index(svq->vq); >>>>>>> - virtio_queue_invalidate_signalled_used(svq->vdev, n); >>>>>>> - virtio_notify_irqfd(svq->vdev, svq->vq); >>>>>>> - } else if (!svq->masked_notifier.signaled) { >>>>>>> - svq->masked_notifier.signaled = true; >>>>>>> - event_notifier_set(svq->masked_notifier.n); >>>>>>> - } >>>>>>> + /* Make as many buffers as possible used. 
*/ >>>>>>> + do { >>>>>>> + unsigned i = 0; >>>>>>> + >>>>>>> + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ >>>>>>> + while (true) { >>>>>>> + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); >>>>>>> + if (!elem) { >>>>>>> + break; >>>>>>> + } >>>>>>> + >>>>>>> + assert(i < svq->vring.num); >>>>>>> + virtqueue_fill(vq, elem, elem->len, i++); >>>>>>> + } >>>>>>> + >>>>>>> + virtqueue_flush(vq, i); >>>>>>> + if (!masked_notifier) { >>>>>>> + virtio_notify_irqfd(svq->vdev, svq->vq); >>>>>>> + } else if (!svq->masked_notifier.signaled) { >>>>>>> + svq->masked_notifier.signaled = true; >>>>>>> + event_notifier_set(svq->masked_notifier.n); >>>>>>> + } >>>>>>> + } while (vhost_shadow_vq_more_used(svq)); >>>>>>> >>>>>>> if (masked_notifier) { >>>>>>> /* Signal not using it anymore */ >>>>>>> @@ -103,7 +270,6 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>>>>>> >>>>>>> static void vhost_shadow_vq_handle_call(EventNotifier *n) >>>>>>> { >>>>>>> - >>>>>>> if (likely(event_notifier_test_and_clear(n))) { >>>>>>> vhost_shadow_vq_handle_call_no_test(n); >>>>>>> } >>>>>>> @@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, >>>>>>> unsigned idx, >>>>>>> VhostShadowVirtqueue *svq) >>>>>>> { >>>>>>> + int i; >>>>>>> int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq); >>>>>>> + >>>>>>> + assert(!dev->shadow_vqs_enabled); >>>>>>> + >>>>>>> if (unlikely(r < 0)) { >>>>>>> error_report("Couldn't restore vq kick fd: %s", strerror(-r)); >>>>>>> } >>>>>>> @@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, >>>>>>> /* Restore vhost call */ >>>>>>> vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx, >>>>>>> dev->vqs[idx].notifier_is_masked); >>>>>>> + >>>>>>> + >>>>>>> + for (i = 0; i < svq->vring.num; ++i) { >>>>>>> + g_autofree VirtQueueElement *elem = svq->ring_id_maps[i]; >>>>>>> + /* >>>>>>> + * Although the doc says we must unpop in order, it's ok to unpop >>>>>>> + * everything. >>>>>>> + */ >>>>>>> + if (elem) { >>>>>>> + virtqueue_unpop(svq->vq, elem, elem->len); >>>>>> Shouldn't we need to wait until all pending requests to be drained? Or >>>>>> we may end up duplicated requests? >>>>>> >>>>> Do you mean pending as in-flight/processing in the device? The device >>>>> must be paused at this point. >>>> Ok. I see there's a vhost_set_vring_enable(dev, false) in >>>> vhost_sw_live_migration_start(). >>>> >>>> >>>>> Currently there is no assertion for >>>>> this, maybe we can track the device status for it. >>>>> >>>>> For the queue handlers to be running at this point, the main event >>>>> loop should serialize QMP and handlers as far as I know (and they >>>>> would make all state inconsistent if the device stops suddenly). It >>>>> would need to be synchronized if the handlers run in their own AIO >>>>> context. That would be nice to have but it's not included here. >>>> That's why I suggest to just drop the QMP stuffs and use cli parameters >>>> to enable shadow virtqueue. Things would be greatly simplified I guess. >>>> >>> I can send a series without it, but SVQ will need to be able to kick >>> in dynamically sooner or later if we want to use it for live >>> migration. >> >> I'm not sure I get the issue here. My understnading is everyhing will be >> processed in the same aio context. >> > What I meant is that QMP allows us to activate the shadow virtqueue > mode in any moment, similar to how live migration would activate it. I get you. 
> To enable SVQ with a command line would imply that it runs the same > way for all the time qemu runs. Ok. > > If we do that way, we don't need more synchronization, since we have > deleted the event that could run concurrently with the masking. But > this synchronization will be needed if we want to enable SVQ > dynamically for live migration, so we are "just" delaying work. > > However, if we add vdpa iova range to this patch series, I think it > would be a good idea to delay that synchronization work to future > series, so they are smaller and the first one can be tested better. Yes, that's why I think we can start from simple case. E.g to let the shadow virtqueue logic run. Then we can consider to add synchronization in the future. I guess things like mutex or bh might help, it would be more easier to add those stuffs on top. Thanks > >> Thanks >> >> >>>> Thanks >>>> >>>> >>>>>> Thanks >>>>>> >>>>>> >>>>>>> + } >>>>>>> + } >>>>>>> } >>>>>>> >>>>>>> /* >>>>>>> @@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) >>>>>>> unsigned num = virtio_queue_get_num(dev->vdev, vq_idx); >>>>>>> size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE); >>>>>>> g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) + ring_size); >>>>>>> - int r; >>>>>>> + int r, i; >>>>>>> >>>>>>> r = event_notifier_init(&svq->kick_notifier, 0); >>>>>>> if (r != 0) { >>>>>>> @@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) >>>>>>> vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE); >>>>>>> svq->vq = virtio_get_queue(dev->vdev, vq_idx); >>>>>>> svq->vdev = dev->vdev; >>>>>>> + for (i = 0; i < num - 1; i++) { >>>>>>> + svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1); >>>>>>> + } >>>>>>> + >>>>>>> + svq->ring_id_maps = g_new0(VirtQueueElement *, num); >>>>>>> event_notifier_set_handler(&svq->call_notifier, >>>>>>> vhost_shadow_vq_handle_call); >>>>>>> qemu_event_init(&svq->masked_notifier.is_free, true); >>>>>>> @@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq) >>>>>>> event_notifier_cleanup(&vq->kick_notifier); >>>>>>> event_notifier_set_handler(&vq->call_notifier, NULL); >>>>>>> event_notifier_cleanup(&vq->call_notifier); >>>>>>> + g_free(vq->ring_id_maps); >>>>>>> g_free(vq); >>>>>>> } >>>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c >>>>>>> index eab3e334f2..a373999bc4 100644 >>>>>>> --- a/hw/virtio/vhost.c >>>>>>> +++ b/hw/virtio/vhost.c >>>>>>> @@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) >>>>>>> >>>>>>> trace_vhost_iotlb_miss(dev, 1); >>>>>>> >>>>>>> + if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) { >>>>>>> + uaddr = iova; >>>>>>> + len = 4096; >>>>>>> + ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, >>>>>>> + IOMMU_RW); >>>>>>> + if (ret) { >>>>>>> + trace_vhost_iotlb_miss(dev, 2); >>>>>>> + error_report("Fail to update device iotlb"); >>>>>>> + } >>>>>>> + >>>>>>> + return ret; >>>>>>> + } >>>>>>> + >>>>>>> iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, >>>>>>> iova, write, >>>>>>> MEMTXATTRS_UNSPECIFIED); >>>>>>> @@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) >>>>>>> /* Can be read by vhost_virtqueue_mask, from vm exit */ >>>>>>> qatomic_store_release(&dev->shadow_vqs_enabled, false); >>>>>>> >>>>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); >>>>>>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { >>>>>>> + 
error_report("Fail to invalidate device iotlb"); >>>>>>> + } >>>>>>> + >>>>>>> for (idx = 0; idx < dev->nvqs; ++idx) { >>>>>>> + /* >>>>>>> + * Update used ring information for IOTLB to work correctly, >>>>>>> + * vhost-kernel code requires for this. >>>>>>> + */ >>>>>>> + struct vhost_virtqueue *vq = dev->vqs + idx; >>>>>>> + vhost_device_iotlb_miss(dev, vq->used_phys, true); >>>>>>> + >>>>>>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]); >>>>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], >>>>>>> + dev->vq_index + idx); >>>>>>> + } >>>>>>> + >>>>>>> + /* Enable guest's vq vring */ >>>>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>>>>>> + >>>>>>> + for (idx = 0; idx < dev->nvqs; ++idx) { >>>>>>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); >>>>>>> } >>>>>>> >>>>>>> @@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) >>>>>>> return 0; >>>>>>> } >>>>>>> >>>>>>> +/* >>>>>>> + * Start shadow virtqueue in a given queue. >>>>>>> + * In failure case, this function leaves queue working as regular vhost mode. >>>>>>> + */ >>>>>>> +static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev, >>>>>>> + unsigned idx) >>>>>>> +{ >>>>>>> + struct vhost_vring_addr addr = { >>>>>>> + .index = idx, >>>>>>> + }; >>>>>>> + struct vhost_vring_state s = { >>>>>>> + .index = idx, >>>>>>> + }; >>>>>>> + int r; >>>>>>> + bool ok; >>>>>>> + >>>>>>> + vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); >>>>>>> + ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); >>>>>>> + if (unlikely(!ok)) { >>>>>>> + return false; >>>>>>> + } >>>>>>> + >>>>>>> + /* From this point, vhost_virtqueue_start can reset these changes */ >>>>>>> + vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr); >>>>>>> + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); >>>>>>> + if (unlikely(r != 0)) { >>>>>>> + VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed"); >>>>>>> + goto err; >>>>>>> + } >>>>>>> + >>>>>>> + r = dev->vhost_ops->vhost_set_vring_base(dev, &s); >>>>>>> + if (unlikely(r != 0)) { >>>>>>> + VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed"); >>>>>>> + goto err; >>>>>>> + } >>>>>>> + >>>>>>> + /* >>>>>>> + * Update used ring information for IOTLB to work correctly, >>>>>>> + * vhost-kernel code requires for this. 
>>>>>>> + */ >>>>>>> + r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true); >>>>>>> + if (unlikely(r != 0)) { >>>>>>> + /* Debug message already printed */ >>>>>>> + goto err; >>>>>>> + } >>>>>>> + >>>>>>> + return true; >>>>>>> + >>>>>>> +err: >>>>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); >>>>>>> + return false; >>>>>>> +} >>>>>>> + >>>>>>> static int vhost_sw_live_migration_start(struct vhost_dev *dev) >>>>>>> { >>>>>>> int idx, stop_idx; >>>>>>> @@ -1249,24 +1335,35 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev) >>>>>>> } >>>>>>> } >>>>>>> >>>>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); >>>>>>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { >>>>>>> + error_report("Fail to invalidate device iotlb"); >>>>>>> + } >>>>>>> + >>>>>>> /* Can be read by vhost_virtqueue_mask, from vm exit */ >>>>>>> qatomic_store_release(&dev->shadow_vqs_enabled, true); >>>>>>> for (idx = 0; idx < dev->nvqs; ++idx) { >>>>>>> - bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); >>>>>>> + bool ok = vhost_sw_live_migration_start_vq(dev, idx); >>>>>>> if (unlikely(!ok)) { >>>>>>> goto err_start; >>>>>>> } >>>>>>> } >>>>>>> >>>>>>> + /* Enable shadow vq vring */ >>>>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>>>>>> return 0; >>>>>>> >>>>>>> err_start: >>>>>>> qatomic_store_release(&dev->shadow_vqs_enabled, false); >>>>>>> for (stop_idx = 0; stop_idx < idx; stop_idx++) { >>>>>>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]); >>>>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], >>>>>>> + dev->vq_index + stop_idx); >>>>>>> } >>>>>>> >>>>>>> err_new: >>>>>>> + /* Enable guest's vring */ >>>>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>>>>>> for (idx = 0; idx < dev->nvqs; ++idx) { >>>>>>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); >>>>>>> } >>>>>>> @@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp) >>>>>>> >>>>>>> if (!hdev->started) { >>>>>>> err_cause = "Device is not started"; >>>>>>> + } else if (!vhost_dev_has_iommu(hdev)) { >>>>>>> + err_cause = "Does not support iommu"; >>>>>>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { >>>>>>> + err_cause = "Is packed"; >>>>>>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) { >>>>>>> + err_cause = "Have event idx"; >>>>>>> + } else if (hdev->acked_features & >>>>>>> + BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) { >>>>>>> + err_cause = "Supports indirect descriptors"; >>>>>>> + } else if (!hdev->vhost_ops->vhost_set_vring_enable) { >>>>>>> + err_cause = "Cannot pause device"; >>>>>>> + } >>>>>>> + >>>>>>> + if (err_cause) { >>>>>>> goto err; >>>>>>> } >>>>>>> >
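The endianness discussion above converges on mandating VIRTIO_F_VERSION_1 for vhost-vDPA, which means the shadow vring can always be written in little endian rather than going through virtio_tswap*(). A minimal standalone sketch of what that looks like, using glibc's htole*() helpers and a struct that simply mirrors the split-ring descriptor layout (this is an illustration of the idea, not the QEMU implementation):

/*
 * Write a split-ring descriptor in little endian regardless of host
 * endianness, as a VIRTIO 1.0 (non-transitional) device expects.
 */
#define _DEFAULT_SOURCE
#include <endian.h>   /* htole16/htole32/htole64 */
#include <stdint.h>
#include <stdio.h>

struct vring_desc_le {
    uint64_t addr;   /* buffer address, little endian */
    uint32_t len;    /* buffer length, little endian */
    uint16_t flags;  /* VRING_DESC_F_*, little endian */
    uint16_t next;   /* next descriptor index, little endian */
};

#define VRING_DESC_F_NEXT  1
#define VRING_DESC_F_WRITE 2

static void write_desc_le(struct vring_desc_le *d, uint64_t addr,
                          uint32_t len, uint16_t flags, uint16_t next)
{
    d->addr  = htole64(addr);
    d->len   = htole32(len);
    d->flags = htole16(flags);
    d->next  = htole16(next);
}

int main(void)
{
    struct vring_desc_le d;

    /* A device-writable descriptor chained to descriptor 1. */
    write_desc_le(&d, 0x100000, 4096,
                  VRING_DESC_F_WRITE | VRING_DESC_F_NEXT, 1);
    printf("flags as stored on the ring: 0x%04x\n", (unsigned)d.flags);
    return 0;
}

On a little-endian host the htole*() calls compile to nothing; on a big-endian host they byte-swap, so the bytes the device sees are identical either way.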
On Thu, Mar 18, 2021 at 10:22 AM Jason Wang <jasowang@redhat.com> wrote: > > > 在 2021/3/18 下午4:06, Eugenio Perez Martin 写道: > > On Thu, Mar 18, 2021 at 4:14 AM Jason Wang <jasowang@redhat.com> wrote: > >> > >> 在 2021/3/17 下午10:38, Eugenio Perez Martin 写道: > >>> On Wed, Mar 17, 2021 at 3:51 AM Jason Wang <jasowang@redhat.com> wrote: > >>>> 在 2021/3/17 上午12:05, Eugenio Perez Martin 写道: > >>>>> On Tue, Mar 16, 2021 at 9:15 AM Jason Wang <jasowang@redhat.com> wrote: > >>>>>> 在 2021/3/16 上午3:48, Eugenio Pérez 写道: > >>>>>>> Initial version of shadow virtqueue that actually forward buffers. > >>>>>>> > >>>>>>> It reuses the VirtQueue code for the device part. The driver part is > >>>>>>> based on Linux's virtio_ring driver, but with stripped functionality > >>>>>>> and optimizations so it's easier to review. > >>>>>>> > >>>>>>> These will be added in later commits. > >>>>>>> > >>>>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > >>>>>>> --- > >>>>>>> hw/virtio/vhost-shadow-virtqueue.c | 212 +++++++++++++++++++++++++++-- > >>>>>>> hw/virtio/vhost.c | 113 ++++++++++++++- > >>>>>>> 2 files changed, 312 insertions(+), 13 deletions(-) > >>>>>>> > >>>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c > >>>>>>> index 1460d1d5d1..68ed0f2740 100644 > >>>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c > >>>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c > >>>>>>> @@ -9,6 +9,7 @@ > >>>>>>> > >>>>>>> #include "hw/virtio/vhost-shadow-virtqueue.h" > >>>>>>> #include "hw/virtio/vhost.h" > >>>>>>> +#include "hw/virtio/virtio-access.h" > >>>>>>> > >>>>>>> #include "standard-headers/linux/vhost_types.h" > >>>>>>> > >>>>>>> @@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue { > >>>>>>> /* Virtio device */ > >>>>>>> VirtIODevice *vdev; > >>>>>>> > >>>>>>> + /* Map for returning guest's descriptors */ > >>>>>>> + VirtQueueElement **ring_id_maps; > >>>>>>> + > >>>>>>> + /* Next head to expose to device */ > >>>>>>> + uint16_t avail_idx_shadow; > >>>>>>> + > >>>>>>> + /* Next free descriptor */ > >>>>>>> + uint16_t free_head; > >>>>>>> + > >>>>>>> + /* Last seen used idx */ > >>>>>>> + uint16_t shadow_used_idx; > >>>>>>> + > >>>>>>> + /* Next head to consume from device */ > >>>>>>> + uint16_t used_idx; > >>>>>>> + > >>>>>>> /* Descriptors copied from guest */ > >>>>>>> vring_desc_t descs[]; > >>>>>>> } VhostShadowVirtqueue; > >>>>>>> > >>>>>>> -/* Forward guest notifications */ > >>>>>>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, > >>>>>>> + const struct iovec *iovec, > >>>>>>> + size_t num, bool more_descs, bool write) > >>>>>>> +{ > >>>>>>> + uint16_t i = svq->free_head, last = svq->free_head; > >>>>>>> + unsigned n; > >>>>>>> + uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0; > >>>>>>> + vring_desc_t *descs = svq->vring.desc; > >>>>>>> + > >>>>>>> + if (num == 0) { > >>>>>>> + return; > >>>>>>> + } > >>>>>>> + > >>>>>>> + for (n = 0; n < num; n++) { > >>>>>>> + if (more_descs || (n + 1 < num)) { > >>>>>>> + descs[i].flags = flags | virtio_tswap16(svq->vdev, > >>>>>>> + VRING_DESC_F_NEXT); > >>>>>>> + } else { > >>>>>>> + descs[i].flags = flags; > >>>>>>> + } > >>>>>>> + descs[i].addr = virtio_tswap64(svq->vdev, (hwaddr)iovec[n].iov_base); > >>>>>> So unsing virtio_tswap() is probably not correct since we're talking > >>>>>> with vhost backends which has its own endiness. 
> >>>>>> > >>>>> I was trying to expose the buffer with the same endianness as the > >>>>> driver originally offered, so if guest->qemu requires a bswap, I think > >>>>> it will always require a bswap again to expose to the device again. > >>>> So assumes vhost-vDPA always provide a non-transitional device[1]. > >>>> > >>>> Then if Qemu present a transitional device, we need to do the endian > >>>> conversion when necessary, if Qemu present a non-transitional device, we > >>>> don't need to do that, guest driver will do that for us. > >>>> > >>>> But it looks to me the virtio_tswap() can't be used for this since it: > >>>> > >>>> static inline bool virtio_access_is_big_endian(VirtIODevice *vdev) > >>>> { > >>>> #if defined(LEGACY_VIRTIO_IS_BIENDIAN) > >>>> return virtio_is_big_endian(vdev); > >>>> #elif defined(TARGET_WORDS_BIGENDIAN) > >>>> if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { > >>>> /* Devices conforming to VIRTIO 1.0 or later are always LE. */ > >>>> return false; > >>>> } > >>>> return true; > >>>> #else > >>>> return false; > >>>> #endif > >>>> } > >>>> > >>>> So if we present a legacy device on top of a non-transitiaonl vDPA > >>>> device. The VIRITIO_F_VERSION_1 check is wrong. > >>>> > >>>> > >>>>>> For vhost-vDPA, we can assume that it's a 1.0 device. > >>>>> Isn't it needed if the host is big endian? > >>>> [1] > >>>> > >>>> So non-transitional device always assume little endian. > >>>> > >>>> For vhost-vDPA, we don't want to present transitional device which may > >>>> end up with a lot of burdens. > >>>> > >>>> I suspect the legacy driver plust vhost vDPA already break, so I plan to > >>>> mandate VERSION_1 for all vDPA devices. > >>>> > >>> Right. I think it's the best then. > >>> > >>> However, then we will need a similar method to always expose > >>> address/length as little endian, isn't it? > >> > >> Yes. > >> > >> > >>>>>>> + descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len); > >>>>>>> + > >>>>>>> + last = i; > >>>>>>> + i = virtio_tswap16(svq->vdev, descs[i].next); > >>>>>>> + } > >>>>>>> + > >>>>>>> + svq->free_head = virtio_tswap16(svq->vdev, descs[last].next); > >>>>>>> +} > >>>>>>> + > >>>>>>> +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, > >>>>>>> + VirtQueueElement *elem) > >>>>>>> +{ > >>>>>>> + int head; > >>>>>>> + unsigned avail_idx; > >>>>>>> + vring_avail_t *avail = svq->vring.avail; > >>>>>>> + > >>>>>>> + head = svq->free_head; > >>>>>>> + > >>>>>>> + /* We need some descriptors here */ > >>>>>>> + assert(elem->out_num || elem->in_num); > >>>>>>> + > >>>>>>> + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, > >>>>>>> + elem->in_num > 0, false); > >>>>>>> + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); > >>>>>>> + > >>>>>>> + /* > >>>>>>> + * Put entry in available array (but don't update avail->idx until they > >>>>>>> + * do sync). 
> >>>>>>> + */ > >>>>>>> + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); > >>>>>>> + avail->ring[avail_idx] = virtio_tswap16(svq->vdev, head); > >>>>>>> + svq->avail_idx_shadow++; > >>>>>>> + > >>>>>>> + /* Expose descriptors to device */ > >>>>>>> + smp_wmb(); > >>>>>>> + avail->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow); > >>>>>>> + > >>>>>>> + return head; > >>>>>>> + > >>>>>>> +} > >>>>>>> + > >>>>>>> +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, > >>>>>>> + VirtQueueElement *elem) > >>>>>>> +{ > >>>>>>> + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); > >>>>>>> + > >>>>>>> + svq->ring_id_maps[qemu_head] = elem; > >>>>>>> +} > >>>>>>> + > >>>>>>> +/* Handle guest->device notifications */ > >>>>>>> static void vhost_handle_guest_kick(EventNotifier *n) > >>>>>>> { > >>>>>>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > >>>>>>> @@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n) > >>>>>>> return; > >>>>>>> } > >>>>>>> > >>>>>>> - event_notifier_set(&svq->kick_notifier); > >>>>>>> + /* Make available as many buffers as possible */ > >>>>>>> + do { > >>>>>>> + if (virtio_queue_get_notification(svq->vq)) { > >>>>>>> + /* No more notifications until process all available */ > >>>>>>> + virtio_queue_set_notification(svq->vq, false); > >>>>>>> + } > >>>>>>> + > >>>>>>> + while (true) { > >>>>>>> + VirtQueueElement *elem; > >>>>>>> + if (virtio_queue_full(svq->vq)) { > >>>>>>> + break; > >>>>>> So we've disabled guest notification. If buffer has been consumed, we > >>>>>> need to retry the handle_guest_kick here. But I didn't find the code? > >>>>>> > >>>>> This code follows the pattern of virtio_blk_handle_vq: we jump out of > >>>>> the inner while, and we re-enable the notifications. After that, we > >>>>> check for updates on guest avail_idx. > >>>> Ok, but this will end up with a lot of unnecessary kicks without event > >>>> index. > >>>> > >>> I can move the kick out of the inner loop, but that could add latency. > >> > >> So I think the right way is to disable the notification until some > >> buffers are consumed by used ring. > >> > > I'm not sure if you mean: > > > > a) To limit to the maximum amount of buffers that can be available in > > Shadow Virtqueue at the same time. > > > > As I can see, the easiest way to do this would be to unregister > > vhost_handle_guest_kick from the event loop and let > > vhost_shadow_vq_handle_call to re-register it at some threshold of > > available buffers. > > > > I'm not sure how much this limit should be, but it seems wasteful for > > me to not to fill shadow virqueue naturally. > > > Yes, and I'm not sure how much we could gain from this extra complexity. > > > > > > b) To limit the amount of buffers that vhost_handle_guest_kick > > forwards to shadow virtqueue in one call. > > > > This already has a natural limit of the queue size, since the buffers > > will not be consumed (as forarded-to-guest) by qemu while this > > function is running. This limit could be reduced and > > vhost_handle_guest_kick could re-enqueue itself if its not reached. > > Same as previous, I'm not sure how much is a right limit, but > > vhost_handle_guest_kick will not make available more than queue size. > > > Yes, so using queue size is how the code works currently and it should > be fine if we know svq and vq are the same size. We can leave the kick > notification for the future, (I guess at least for networking device, > hitting virtio_queue_full() should be rare). 
> It happens in the rx queue. I think it could also happen in the tx queue under some conditions, but I didn't test. > It will be an real issue if svq and vq doesn't have the same size, but > we can also leave this for future. > > > > > > c) To kick every N buffers made available, instead of N=1. > > > > I think this is not the solution you are proposing, but maybe is > > simpler than previous. > > > >>>>>>> + } > >>>>>>> + > >>>>>>> + elem = virtqueue_pop(svq->vq, sizeof(*elem)); > >>>>>>> + if (!elem) { > >>>>>>> + break; > >>>>>>> + } > >>>>>>> + > >>>>>>> + vhost_shadow_vq_add(svq, elem); > >>>>>>> + event_notifier_set(&svq->kick_notifier); > >>>>>>> + } > >>>>>>> + > >>>>>>> + virtio_queue_set_notification(svq->vq, true); > >>>>>>> + } while (!virtio_queue_empty(svq->vq)); > >>>>>>> +} > >>>>>>> + > >>>>>>> +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) > >>>>>>> +{ > >>>>>>> + if (svq->used_idx != svq->shadow_used_idx) { > >>>>>>> + return true; > >>>>>>> + } > >>>>>>> + > >>>>>>> + /* Get used idx must not be reordered */ > >>>>>>> + smp_rmb(); > >>>>>>> + svq->shadow_used_idx = virtio_tswap16(svq->vdev, svq->vring.used->idx); > >>>>>>> + > >>>>>>> + return svq->used_idx != svq->shadow_used_idx; > >>>>>>> +} > >>>>>>> + > >>>>>>> +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) > >>>>>>> +{ > >>>>>>> + vring_desc_t *descs = svq->vring.desc; > >>>>>>> + const vring_used_t *used = svq->vring.used; > >>>>>>> + vring_used_elem_t used_elem; > >>>>>>> + uint16_t last_used; > >>>>>>> + > >>>>>>> + if (!vhost_shadow_vq_more_used(svq)) { > >>>>>>> + return NULL; > >>>>>>> + } > >>>>>>> + > >>>>>>> + last_used = svq->used_idx & (svq->vring.num - 1); > >>>>>>> + used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id); > >>>>>>> + used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len); > >>>>>>> + > >>>>>>> + if (unlikely(used_elem.id >= svq->vring.num)) { > >>>>>>> + error_report("Device %s says index %u is available", svq->vdev->name, > >>>>>>> + used_elem.id); > >>>>>>> + return NULL; > >>>>>>> + } > >>>>>>> + > >>>>>>> + descs[used_elem.id].next = svq->free_head; > >>>>>>> + svq->free_head = used_elem.id; > >>>>>>> + > >>>>>>> + svq->used_idx++; > >>>>>>> + svq->ring_id_maps[used_elem.id]->len = used_elem.len; > >>>>>>> + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); > >>>>>>> } > >>>>>>> > >>>>>>> /* Forward vhost notifications */ > >>>>>>> @@ -78,6 +229,7 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > >>>>>>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > >>>>>>> call_notifier); > >>>>>>> EventNotifier *masked_notifier; > >>>>>>> + VirtQueue *vq = svq->vq; > >>>>>>> > >>>>>>> /* Signal start of using masked notifier */ > >>>>>>> qemu_event_reset(&svq->masked_notifier.is_free); > >>>>>>> @@ -86,14 +238,29 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > >>>>>>> qemu_event_set(&svq->masked_notifier.is_free); > >>>>>>> } > >>>>>>> > >>>>>>> - if (!masked_notifier) { > >>>>>>> - unsigned n = virtio_get_queue_index(svq->vq); > >>>>>>> - virtio_queue_invalidate_signalled_used(svq->vdev, n); > >>>>>>> - virtio_notify_irqfd(svq->vdev, svq->vq); > >>>>>>> - } else if (!svq->masked_notifier.signaled) { > >>>>>>> - svq->masked_notifier.signaled = true; > >>>>>>> - event_notifier_set(svq->masked_notifier.n); > >>>>>>> - } > >>>>>>> + /* Make as many buffers as possible used. 
*/ > >>>>>>> + do { > >>>>>>> + unsigned i = 0; > >>>>>>> + > >>>>>>> + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ > >>>>>>> + while (true) { > >>>>>>> + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); > >>>>>>> + if (!elem) { > >>>>>>> + break; > >>>>>>> + } > >>>>>>> + > >>>>>>> + assert(i < svq->vring.num); > >>>>>>> + virtqueue_fill(vq, elem, elem->len, i++); > >>>>>>> + } > >>>>>>> + > >>>>>>> + virtqueue_flush(vq, i); > >>>>>>> + if (!masked_notifier) { > >>>>>>> + virtio_notify_irqfd(svq->vdev, svq->vq); > >>>>>>> + } else if (!svq->masked_notifier.signaled) { > >>>>>>> + svq->masked_notifier.signaled = true; > >>>>>>> + event_notifier_set(svq->masked_notifier.n); > >>>>>>> + } > >>>>>>> + } while (vhost_shadow_vq_more_used(svq)); > >>>>>>> > >>>>>>> if (masked_notifier) { > >>>>>>> /* Signal not using it anymore */ > >>>>>>> @@ -103,7 +270,6 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) > >>>>>>> > >>>>>>> static void vhost_shadow_vq_handle_call(EventNotifier *n) > >>>>>>> { > >>>>>>> - > >>>>>>> if (likely(event_notifier_test_and_clear(n))) { > >>>>>>> vhost_shadow_vq_handle_call_no_test(n); > >>>>>>> } > >>>>>>> @@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, > >>>>>>> unsigned idx, > >>>>>>> VhostShadowVirtqueue *svq) > >>>>>>> { > >>>>>>> + int i; > >>>>>>> int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq); > >>>>>>> + > >>>>>>> + assert(!dev->shadow_vqs_enabled); > >>>>>>> + > >>>>>>> if (unlikely(r < 0)) { > >>>>>>> error_report("Couldn't restore vq kick fd: %s", strerror(-r)); > >>>>>>> } > >>>>>>> @@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, > >>>>>>> /* Restore vhost call */ > >>>>>>> vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx, > >>>>>>> dev->vqs[idx].notifier_is_masked); > >>>>>>> + > >>>>>>> + > >>>>>>> + for (i = 0; i < svq->vring.num; ++i) { > >>>>>>> + g_autofree VirtQueueElement *elem = svq->ring_id_maps[i]; > >>>>>>> + /* > >>>>>>> + * Although the doc says we must unpop in order, it's ok to unpop > >>>>>>> + * everything. > >>>>>>> + */ > >>>>>>> + if (elem) { > >>>>>>> + virtqueue_unpop(svq->vq, elem, elem->len); > >>>>>> Shouldn't we need to wait until all pending requests to be drained? Or > >>>>>> we may end up duplicated requests? > >>>>>> > >>>>> Do you mean pending as in-flight/processing in the device? The device > >>>>> must be paused at this point. > >>>> Ok. I see there's a vhost_set_vring_enable(dev, false) in > >>>> vhost_sw_live_migration_start(). > >>>> > >>>> > >>>>> Currently there is no assertion for > >>>>> this, maybe we can track the device status for it. > >>>>> > >>>>> For the queue handlers to be running at this point, the main event > >>>>> loop should serialize QMP and handlers as far as I know (and they > >>>>> would make all state inconsistent if the device stops suddenly). It > >>>>> would need to be synchronized if the handlers run in their own AIO > >>>>> context. That would be nice to have but it's not included here. > >>>> That's why I suggest to just drop the QMP stuffs and use cli parameters > >>>> to enable shadow virtqueue. Things would be greatly simplified I guess. > >>>> > >>> I can send a series without it, but SVQ will need to be able to kick > >>> in dynamically sooner or later if we want to use it for live > >>> migration. > >> > >> I'm not sure I get the issue here. My understnading is everyhing will be > >> processed in the same aio context. 
> >> > > What I meant is that QMP allows us to activate the shadow virtqueue > > mode in any moment, similar to how live migration would activate it. > > > I get you. > > > > To enable SVQ with a command line would imply that it runs the same > > way for all the time qemu runs. > > > Ok. > > > > > > If we do that way, we don't need more synchronization, since we have > > deleted the event that could run concurrently with the masking. But > > this synchronization will be needed if we want to enable SVQ > > dynamically for live migration, so we are "just" delaying work. > > > > However, if we add vdpa iova range to this patch series, I think it > > would be a good idea to delay that synchronization work to future > > series, so they are smaller and the first one can be tested better. > > > Yes, that's why I think we can start from simple case. E.g to let the > shadow virtqueue logic run. Then we can consider to add synchronization > in the future. > > I guess things like mutex or bh might help, it would be more easier to > add those stuffs on top. > Continuing this topic in patch 05/13, since both have converged. > Thanks > > > > > >> Thanks > >> > >> > >>>> Thanks > >>>> > >>>> > >>>>>> Thanks > >>>>>> > >>>>>> > >>>>>>> + } > >>>>>>> + } > >>>>>>> } > >>>>>>> > >>>>>>> /* > >>>>>>> @@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) > >>>>>>> unsigned num = virtio_queue_get_num(dev->vdev, vq_idx); > >>>>>>> size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE); > >>>>>>> g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) + ring_size); > >>>>>>> - int r; > >>>>>>> + int r, i; > >>>>>>> > >>>>>>> r = event_notifier_init(&svq->kick_notifier, 0); > >>>>>>> if (r != 0) { > >>>>>>> @@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) > >>>>>>> vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE); > >>>>>>> svq->vq = virtio_get_queue(dev->vdev, vq_idx); > >>>>>>> svq->vdev = dev->vdev; > >>>>>>> + for (i = 0; i < num - 1; i++) { > >>>>>>> + svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1); > >>>>>>> + } > >>>>>>> + > >>>>>>> + svq->ring_id_maps = g_new0(VirtQueueElement *, num); > >>>>>>> event_notifier_set_handler(&svq->call_notifier, > >>>>>>> vhost_shadow_vq_handle_call); > >>>>>>> qemu_event_init(&svq->masked_notifier.is_free, true); > >>>>>>> @@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq) > >>>>>>> event_notifier_cleanup(&vq->kick_notifier); > >>>>>>> event_notifier_set_handler(&vq->call_notifier, NULL); > >>>>>>> event_notifier_cleanup(&vq->call_notifier); > >>>>>>> + g_free(vq->ring_id_maps); > >>>>>>> g_free(vq); > >>>>>>> } > >>>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > >>>>>>> index eab3e334f2..a373999bc4 100644 > >>>>>>> --- a/hw/virtio/vhost.c > >>>>>>> +++ b/hw/virtio/vhost.c > >>>>>>> @@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) > >>>>>>> > >>>>>>> trace_vhost_iotlb_miss(dev, 1); > >>>>>>> > >>>>>>> + if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) { > >>>>>>> + uaddr = iova; > >>>>>>> + len = 4096; > >>>>>>> + ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, > >>>>>>> + IOMMU_RW); > >>>>>>> + if (ret) { > >>>>>>> + trace_vhost_iotlb_miss(dev, 2); > >>>>>>> + error_report("Fail to update device iotlb"); > >>>>>>> + } > >>>>>>> + > >>>>>>> + return ret; > >>>>>>> + } > >>>>>>> + > >>>>>>> iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, > >>>>>>> 
iova, write, > >>>>>>> MEMTXATTRS_UNSPECIFIED); > >>>>>>> @@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > >>>>>>> /* Can be read by vhost_virtqueue_mask, from vm exit */ > >>>>>>> qatomic_store_release(&dev->shadow_vqs_enabled, false); > >>>>>>> > >>>>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); > >>>>>>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { > >>>>>>> + error_report("Fail to invalidate device iotlb"); > >>>>>>> + } > >>>>>>> + > >>>>>>> for (idx = 0; idx < dev->nvqs; ++idx) { > >>>>>>> + /* > >>>>>>> + * Update used ring information for IOTLB to work correctly, > >>>>>>> + * vhost-kernel code requires for this. > >>>>>>> + */ > >>>>>>> + struct vhost_virtqueue *vq = dev->vqs + idx; > >>>>>>> + vhost_device_iotlb_miss(dev, vq->used_phys, true); > >>>>>>> + > >>>>>>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]); > >>>>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], > >>>>>>> + dev->vq_index + idx); > >>>>>>> + } > >>>>>>> + > >>>>>>> + /* Enable guest's vq vring */ > >>>>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); > >>>>>>> + > >>>>>>> + for (idx = 0; idx < dev->nvqs; ++idx) { > >>>>>>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); > >>>>>>> } > >>>>>>> > >>>>>>> @@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > >>>>>>> return 0; > >>>>>>> } > >>>>>>> > >>>>>>> +/* > >>>>>>> + * Start shadow virtqueue in a given queue. > >>>>>>> + * In failure case, this function leaves queue working as regular vhost mode. > >>>>>>> + */ > >>>>>>> +static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev, > >>>>>>> + unsigned idx) > >>>>>>> +{ > >>>>>>> + struct vhost_vring_addr addr = { > >>>>>>> + .index = idx, > >>>>>>> + }; > >>>>>>> + struct vhost_vring_state s = { > >>>>>>> + .index = idx, > >>>>>>> + }; > >>>>>>> + int r; > >>>>>>> + bool ok; > >>>>>>> + > >>>>>>> + vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); > >>>>>>> + ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); > >>>>>>> + if (unlikely(!ok)) { > >>>>>>> + return false; > >>>>>>> + } > >>>>>>> + > >>>>>>> + /* From this point, vhost_virtqueue_start can reset these changes */ > >>>>>>> + vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr); > >>>>>>> + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); > >>>>>>> + if (unlikely(r != 0)) { > >>>>>>> + VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed"); > >>>>>>> + goto err; > >>>>>>> + } > >>>>>>> + > >>>>>>> + r = dev->vhost_ops->vhost_set_vring_base(dev, &s); > >>>>>>> + if (unlikely(r != 0)) { > >>>>>>> + VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed"); > >>>>>>> + goto err; > >>>>>>> + } > >>>>>>> + > >>>>>>> + /* > >>>>>>> + * Update used ring information for IOTLB to work correctly, > >>>>>>> + * vhost-kernel code requires for this. 
> >>>>>>> + */ > >>>>>>> + r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true); > >>>>>>> + if (unlikely(r != 0)) { > >>>>>>> + /* Debug message already printed */ > >>>>>>> + goto err; > >>>>>>> + } > >>>>>>> + > >>>>>>> + return true; > >>>>>>> + > >>>>>>> +err: > >>>>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); > >>>>>>> + return false; > >>>>>>> +} > >>>>>>> + > >>>>>>> static int vhost_sw_live_migration_start(struct vhost_dev *dev) > >>>>>>> { > >>>>>>> int idx, stop_idx; > >>>>>>> @@ -1249,24 +1335,35 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev) > >>>>>>> } > >>>>>>> } > >>>>>>> > >>>>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); > >>>>>>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { > >>>>>>> + error_report("Fail to invalidate device iotlb"); > >>>>>>> + } > >>>>>>> + > >>>>>>> /* Can be read by vhost_virtqueue_mask, from vm exit */ > >>>>>>> qatomic_store_release(&dev->shadow_vqs_enabled, true); > >>>>>>> for (idx = 0; idx < dev->nvqs; ++idx) { > >>>>>>> - bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); > >>>>>>> + bool ok = vhost_sw_live_migration_start_vq(dev, idx); > >>>>>>> if (unlikely(!ok)) { > >>>>>>> goto err_start; > >>>>>>> } > >>>>>>> } > >>>>>>> > >>>>>>> + /* Enable shadow vq vring */ > >>>>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); > >>>>>>> return 0; > >>>>>>> > >>>>>>> err_start: > >>>>>>> qatomic_store_release(&dev->shadow_vqs_enabled, false); > >>>>>>> for (stop_idx = 0; stop_idx < idx; stop_idx++) { > >>>>>>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]); > >>>>>>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], > >>>>>>> + dev->vq_index + stop_idx); > >>>>>>> } > >>>>>>> > >>>>>>> err_new: > >>>>>>> + /* Enable guest's vring */ > >>>>>>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); > >>>>>>> for (idx = 0; idx < dev->nvqs; ++idx) { > >>>>>>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); > >>>>>>> } > >>>>>>> @@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp) > >>>>>>> > >>>>>>> if (!hdev->started) { > >>>>>>> err_cause = "Device is not started"; > >>>>>>> + } else if (!vhost_dev_has_iommu(hdev)) { > >>>>>>> + err_cause = "Does not support iommu"; > >>>>>>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { > >>>>>>> + err_cause = "Is packed"; > >>>>>>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) { > >>>>>>> + err_cause = "Have event idx"; > >>>>>>> + } else if (hdev->acked_features & > >>>>>>> + BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) { > >>>>>>> + err_cause = "Supports indirect descriptors"; > >>>>>>> + } else if (!hdev->vhost_ops->vhost_set_vring_enable) { > >>>>>>> + err_cause = "Cannot pause device"; > >>>>>>> + } > >>>>>>> + > >>>>>>> + if (err_cause) { > >>>>>>> goto err; > >>>>>>> } > >>>>>>> > > > >
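The kick-handling discussion above is about the window between draining the available ring and re-enabling guest notifications: a buffer added in that window may not generate a kick, so the queue has to be re-checked after notifications are turned back on. A standalone sketch of that disable/drain/re-enable/re-check pattern, with a toy fake_queue standing in for the VirtQueue accessors used in the patch:

/*
 * Disable further notifications, drain everything available,
 * re-enable notifications, then re-check the queue so a buffer added
 * just before re-enabling is not lost.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_queue {
    int pending;            /* buffers the guest made available */
    bool notifications_on;  /* stands in for the avail flags/event in virtio */
};

static bool queue_empty(struct fake_queue *q) { return q->pending == 0; }

static void process_one(struct fake_queue *q)
{
    q->pending--;
    printf("forwarded one buffer to the shadow vring\n");
}

static void handle_guest_kick(struct fake_queue *q)
{
    do {
        q->notifications_on = false;     /* no kicks while draining */

        while (!queue_empty(q)) {
            process_one(q);
        }

        q->notifications_on = true;      /* let the guest kick again */

        /*
         * Re-check after re-enabling: a buffer made available between
         * the drain and the re-enable may not have raised a kick, so
         * it must still be picked up by this loop.
         */
    } while (!queue_empty(q));
}

int main(void)
{
    struct fake_queue q = { .pending = 3, .notifications_on = true };
    handle_guest_kick(&q);
    return 0;
}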
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 1460d1d5d1..68ed0f2740 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -9,6 +9,7 @@ #include "hw/virtio/vhost-shadow-virtqueue.h" #include "hw/virtio/vhost.h" +#include "hw/virtio/virtio-access.h" #include "standard-headers/linux/vhost_types.h" @@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue { /* Virtio device */ VirtIODevice *vdev; + /* Map for returning guest's descriptors */ + VirtQueueElement **ring_id_maps; + + /* Next head to expose to device */ + uint16_t avail_idx_shadow; + + /* Next free descriptor */ + uint16_t free_head; + + /* Last seen used idx */ + uint16_t shadow_used_idx; + + /* Next head to consume from device */ + uint16_t used_idx; + /* Descriptors copied from guest */ vring_desc_t descs[]; } VhostShadowVirtqueue; -/* Forward guest notifications */ +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, + const struct iovec *iovec, + size_t num, bool more_descs, bool write) +{ + uint16_t i = svq->free_head, last = svq->free_head; + unsigned n; + uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0; + vring_desc_t *descs = svq->vring.desc; + + if (num == 0) { + return; + } + + for (n = 0; n < num; n++) { + if (more_descs || (n + 1 < num)) { + descs[i].flags = flags | virtio_tswap16(svq->vdev, + VRING_DESC_F_NEXT); + } else { + descs[i].flags = flags; + } + descs[i].addr = virtio_tswap64(svq->vdev, (hwaddr)iovec[n].iov_base); + descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len); + + last = i; + i = virtio_tswap16(svq->vdev, descs[i].next); + } + + svq->free_head = virtio_tswap16(svq->vdev, descs[last].next); +} + +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, + VirtQueueElement *elem) +{ + int head; + unsigned avail_idx; + vring_avail_t *avail = svq->vring.avail; + + head = svq->free_head; + + /* We need some descriptors here */ + assert(elem->out_num || elem->in_num); + + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, + elem->in_num > 0, false); + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); + + /* + * Put entry in available array (but don't update avail->idx until they + * do sync). 
+     */
+    avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
+    avail->ring[avail_idx] = virtio_tswap16(svq->vdev, head);
+    svq->avail_idx_shadow++;
+
+    /* Expose descriptors to device */
+    smp_wmb();
+    avail->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow);
+
+    return head;
+
+}
+
+static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq,
+                                VirtQueueElement *elem)
+{
+    unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem);
+
+    svq->ring_id_maps[qemu_head] = elem;
+}
+
+/* Handle guest->device notifications */
 static void vhost_handle_guest_kick(EventNotifier *n)
 {
     VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
@@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n)
         return;
     }
 
-    event_notifier_set(&svq->kick_notifier);
+    /* Make available as many buffers as possible */
+    do {
+        if (virtio_queue_get_notification(svq->vq)) {
+            /* No more notifications until process all available */
+            virtio_queue_set_notification(svq->vq, false);
+        }
+
+        while (true) {
+            VirtQueueElement *elem;
+            if (virtio_queue_full(svq->vq)) {
+                break;
+            }
+
+            elem = virtqueue_pop(svq->vq, sizeof(*elem));
+            if (!elem) {
+                break;
+            }
+
+            vhost_shadow_vq_add(svq, elem);
+            event_notifier_set(&svq->kick_notifier);
+        }
+
+        virtio_queue_set_notification(svq->vq, true);
+    } while (!virtio_queue_empty(svq->vq));
+}
+
+static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq)
+{
+    if (svq->used_idx != svq->shadow_used_idx) {
+        return true;
+    }
+
+    /* Get used idx must not be reordered */
+    smp_rmb();
+    svq->shadow_used_idx = virtio_tswap16(svq->vdev, svq->vring.used->idx);
+
+    return svq->used_idx != svq->shadow_used_idx;
+}
+
+static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq)
+{
+    vring_desc_t *descs = svq->vring.desc;
+    const vring_used_t *used = svq->vring.used;
+    vring_used_elem_t used_elem;
+    uint16_t last_used;
+
+    if (!vhost_shadow_vq_more_used(svq)) {
+        return NULL;
+    }
+
+    last_used = svq->used_idx & (svq->vring.num - 1);
+    used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id);
+    used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len);
+
+    if (unlikely(used_elem.id >= svq->vring.num)) {
+        error_report("Device %s says index %u is available", svq->vdev->name,
+                     used_elem.id);
+        return NULL;
+    }
+
+    descs[used_elem.id].next = svq->free_head;
+    svq->free_head = used_elem.id;
+
+    svq->used_idx++;
+    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
+    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
 }
 
 /* Forward vhost notifications */
@@ -78,6 +229,7 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n)
     VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                              call_notifier);
     EventNotifier *masked_notifier;
+    VirtQueue *vq = svq->vq;
 
     /* Signal start of using masked notifier */
     qemu_event_reset(&svq->masked_notifier.is_free);
@@ -86,14 +238,29 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n)
         qemu_event_set(&svq->masked_notifier.is_free);
     }
 
-    if (!masked_notifier) {
-        unsigned n = virtio_get_queue_index(svq->vq);
-        virtio_queue_invalidate_signalled_used(svq->vdev, n);
-        virtio_notify_irqfd(svq->vdev, svq->vq);
-    } else if (!svq->masked_notifier.signaled) {
-        svq->masked_notifier.signaled = true;
-        event_notifier_set(svq->masked_notifier.n);
-    }
+    /* Make as many buffers as possible used. */
+    do {
+        unsigned i = 0;
+
+        /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */
+        while (true) {
+            g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq);
+            if (!elem) {
+                break;
+            }
+
+            assert(i < svq->vring.num);
+            virtqueue_fill(vq, elem, elem->len, i++);
+        }
+
+        virtqueue_flush(vq, i);
+        if (!masked_notifier) {
+            virtio_notify_irqfd(svq->vdev, svq->vq);
+        } else if (!svq->masked_notifier.signaled) {
+            svq->masked_notifier.signaled = true;
+            event_notifier_set(svq->masked_notifier.n);
+        }
+    } while (vhost_shadow_vq_more_used(svq));
 
     if (masked_notifier) {
         /* Signal not using it anymore */
@@ -103,7 +270,6 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n)
 
 static void vhost_shadow_vq_handle_call(EventNotifier *n)
 {
-
     if (likely(event_notifier_test_and_clear(n))) {
         vhost_shadow_vq_handle_call_no_test(n);
     }
@@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev,
                           unsigned idx,
                           VhostShadowVirtqueue *svq)
 {
+    int i;
     int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq);
+
+    assert(!dev->shadow_vqs_enabled);
+
     if (unlikely(r < 0)) {
         error_report("Couldn't restore vq kick fd: %s", strerror(-r));
     }
@@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev,
     /* Restore vhost call */
     vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx,
                          dev->vqs[idx].notifier_is_masked);
+
+
+    for (i = 0; i < svq->vring.num; ++i) {
+        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
+        /*
+         * Although the doc says we must unpop in order, it's ok to unpop
+         * everything.
+         */
+        if (elem) {
+            virtqueue_unpop(svq->vq, elem, elem->len);
+        }
+    }
 }
 
 /*
@@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx)
     unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
     size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE);
     g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) + ring_size);
-    int r;
+    int r, i;
 
     r = event_notifier_init(&svq->kick_notifier, 0);
     if (r != 0) {
@@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx)
     vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE);
     svq->vq = virtio_get_queue(dev->vdev, vq_idx);
     svq->vdev = dev->vdev;
+    for (i = 0; i < num - 1; i++) {
+        svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1);
+    }
+
+    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
     event_notifier_set_handler(&svq->call_notifier,
                                vhost_shadow_vq_handle_call);
     qemu_event_init(&svq->masked_notifier.is_free, true);
@@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq)
     event_notifier_cleanup(&vq->kick_notifier);
     event_notifier_set_handler(&vq->call_notifier, NULL);
     event_notifier_cleanup(&vq->call_notifier);
+    g_free(vq->ring_id_maps);
     g_free(vq);
 }
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index eab3e334f2..a373999bc4 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
 
     trace_vhost_iotlb_miss(dev, 1);
 
+    if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) {
+        uaddr = iova;
+        len = 4096;
+        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len,
+                                                IOMMU_RW);
+        if (ret) {
+            trace_vhost_iotlb_miss(dev, 2);
+            error_report("Fail to update device iotlb");
+        }
+
+        return ret;
+    }
+
     iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                           iova, write,
                                           MEMTXATTRS_UNSPECIFIED);
@@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
     /* Can be read by vhost_virtqueue_mask, from vm exit */
     qatomic_store_release(&dev->shadow_vqs_enabled, false);
 
+    dev->vhost_ops->vhost_set_vring_enable(dev, false);
+    if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) {
+        error_report("Fail to invalidate device iotlb");
+    }
+
     for (idx = 0; idx < dev->nvqs; ++idx) {
+        /*
+         * Update used ring information for IOTLB to work correctly,
+         * vhost-kernel code requires for this.
+         */
+        struct vhost_virtqueue *vq = dev->vqs + idx;
+        vhost_device_iotlb_miss(dev, vq->used_phys, true);
+
         vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]);
+        vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx],
+                              dev->vq_index + idx);
+    }
+
+    /* Enable guest's vq vring */
+    dev->vhost_ops->vhost_set_vring_enable(dev, true);
+
+    for (idx = 0; idx < dev->nvqs; ++idx) {
         vhost_shadow_vq_free(dev->shadow_vqs[idx]);
     }
 
@@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
     return 0;
 }
 
+/*
+ * Start shadow virtqueue in a given queue.
+ * In failure case, this function leaves queue working as regular vhost mode.
+ */
+static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev,
+                                             unsigned idx)
+{
+    struct vhost_vring_addr addr = {
+        .index = idx,
+    };
+    struct vhost_vring_state s = {
+        .index = idx,
+    };
+    int r;
+    bool ok;
+
+    vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx);
+    ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]);
+    if (unlikely(!ok)) {
+        return false;
+    }
+
+    /* From this point, vhost_virtqueue_start can reset these changes */
+    vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr);
+    r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
+    if (unlikely(r != 0)) {
+        VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed");
+        goto err;
+    }
+
+    r = dev->vhost_ops->vhost_set_vring_base(dev, &s);
+    if (unlikely(r != 0)) {
+        VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed");
+        goto err;
+    }
+
+    /*
+     * Update used ring information for IOTLB to work correctly,
+     * vhost-kernel code requires for this.
+     */
+    r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true);
+    if (unlikely(r != 0)) {
+        /* Debug message already printed */
+        goto err;
+    }
+
+    return true;
+
+err:
+    vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx);
+    return false;
+}
+
 static int vhost_sw_live_migration_start(struct vhost_dev *dev)
 {
     int idx, stop_idx;
@@ -1249,24 +1335,35 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev)
         }
     }
 
+    dev->vhost_ops->vhost_set_vring_enable(dev, false);
+    if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) {
+        error_report("Fail to invalidate device iotlb");
+    }
+
     /* Can be read by vhost_virtqueue_mask, from vm exit */
     qatomic_store_release(&dev->shadow_vqs_enabled, true);
     for (idx = 0; idx < dev->nvqs; ++idx) {
-        bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]);
+        bool ok = vhost_sw_live_migration_start_vq(dev, idx);
         if (unlikely(!ok)) {
             goto err_start;
         }
     }
 
+    /* Enable shadow vq vring */
+    dev->vhost_ops->vhost_set_vring_enable(dev, true);
     return 0;
 
 err_start:
     qatomic_store_release(&dev->shadow_vqs_enabled, false);
     for (stop_idx = 0; stop_idx < idx; stop_idx++) {
         vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]);
+        vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx],
+                              dev->vq_index + stop_idx);
     }
 
 err_new:
+    /* Enable guest's vring */
+    dev->vhost_ops->vhost_set_vring_enable(dev, true);
     for (idx = 0; idx < dev->nvqs; ++idx) {
         vhost_shadow_vq_free(dev->shadow_vqs[idx]);
     }
@@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp)
 
     if (!hdev->started) {
         err_cause = "Device is not started";
+    } else if (!vhost_dev_has_iommu(hdev)) {
+        err_cause = "Does not support iommu";
+    } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
+        err_cause = "Is packed";
+    } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) {
+        err_cause = "Have event idx";
+    } else if (hdev->acked_features &
+               BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) {
+        err_cause = "Supports indirect descriptors";
+    } else if (!hdev->vhost_ops->vhost_set_vring_enable) {
+        err_cause = "Cannot pause device";
+    }
+
+    if (err_cause) {
         goto err;
     }
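For reference, the QMP handler at the end of the vhost.c hunk (qmp_x_vhost_enable_shadow_vq) is what toggles this mode at runtime. A minimal invocation sketch, assuming the command is exposed to QMP as x-vhost-enable-shadow-vq with the "name" and "enable" arguments matching the handler's signature, and with "vdpa0" standing in for whatever vhost device name the setup actually uses:

    { "execute": "x-vhost-enable-shadow-vq",
      "arguments": { "name": "vdpa0", "enable": true } }

Sending the same command with "enable": false would be expected to switch the queues back from shadow virtqueue mode to regular vhost operation.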