Message ID | 20201120185105.279030-8-eperezma@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | vDPA software assisted live migration | expand |
On Fri, Nov 20, 2020 at 07:50:45PM +0100, Eugenio Pérez wrote: > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > --- > hw/virtio/vhost-sw-lm-ring.h | 26 +++++++++ > include/hw/virtio/vhost.h | 3 ++ > hw/virtio/vhost-sw-lm-ring.c | 60 +++++++++++++++++++++ > hw/virtio/vhost.c | 100 +++++++++++++++++++++++++++++++++-- > hw/virtio/meson.build | 2 +- > 5 files changed, 187 insertions(+), 4 deletions(-) > create mode 100644 hw/virtio/vhost-sw-lm-ring.h > create mode 100644 hw/virtio/vhost-sw-lm-ring.c > > diff --git a/hw/virtio/vhost-sw-lm-ring.h b/hw/virtio/vhost-sw-lm-ring.h > new file mode 100644 > index 0000000000..86dc081b93 > --- /dev/null > +++ b/hw/virtio/vhost-sw-lm-ring.h > @@ -0,0 +1,26 @@ > +/* > + * vhost software live migration ring > + * > + * SPDX-FileCopyrightText: Red Hat, Inc. 2020 > + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> > + * > + * SPDX-License-Identifier: GPL-2.0-or-later > + */ > + > +#ifndef VHOST_SW_LM_RING_H > +#define VHOST_SW_LM_RING_H > + > +#include "qemu/osdep.h" > + > +#include "hw/virtio/virtio.h" > +#include "hw/virtio/vhost.h" > + > +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue; Here it's called a shadow virtqueue while the file calls it a sw-lm-ring. Please use a single name. > + > +bool vhost_vring_kick(VhostShadowVirtqueue *vq); vhost_shadow_vq_kick()? > + > +VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx); vhost_dev_get_shadow_vq()? This could be in include/hw/virtio/vhost.h with the other vhost_dev_*() functions. > + > +void vhost_sw_lm_shadow_vq_free(VhostShadowVirtqueue *vq); Hmm...now I wonder what the lifecycle is. Does vhost_sw_lm_shadow_vq() allocate it? Please add doc comments explaining these functions either in this header file or in vhost-sw-lm-ring.c. 
> + > +#endif > diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h > index b5b7496537..93cc3f1ae3 100644 > --- a/include/hw/virtio/vhost.h > +++ b/include/hw/virtio/vhost.h > @@ -54,6 +54,8 @@ struct vhost_iommu { > QLIST_ENTRY(vhost_iommu) iommu_next; > }; > > +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue; > + > typedef struct VhostDevConfigOps { > /* Vhost device config space changed callback > */ > @@ -83,6 +85,7 @@ struct vhost_dev { > bool started; > bool log_enabled; > uint64_t log_size; > + VhostShadowVirtqueue *sw_lm_shadow_vq[2]; The hardcoded 2 is probably for single-queue virtio-net? I guess this will eventually become VhostShadowVirtqueue *shadow_vqs or VhostShadowVirtqueue **shadow_vqs, depending on whether each one should be allocated individually. > VirtIOHandleOutput sw_lm_vq_handler; > Error *migration_blocker; > const VhostOps *vhost_ops; > diff --git a/hw/virtio/vhost-sw-lm-ring.c b/hw/virtio/vhost-sw-lm-ring.c > new file mode 100644 > index 0000000000..0192e77831 > --- /dev/null > +++ b/hw/virtio/vhost-sw-lm-ring.c > @@ -0,0 +1,60 @@ > +/* > + * vhost software live migration ring > + * > + * SPDX-FileCopyrightText: Red Hat, Inc. 2020 > + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> > + * > + * SPDX-License-Identifier: GPL-2.0-or-later > + */ > + > +#include "hw/virtio/vhost-sw-lm-ring.h" > +#include "hw/virtio/vhost.h" > + > +#include "standard-headers/linux/vhost_types.h" > +#include "standard-headers/linux/virtio_ring.h" > + > +#include "qemu/event_notifier.h" > + > +typedef struct VhostShadowVirtqueue { > + EventNotifier hdev_notifier; > + VirtQueue *vq; > +} VhostShadowVirtqueue; > + > +static inline bool vhost_vring_should_kick(VhostShadowVirtqueue *vq) > +{ > + return virtio_queue_get_used_notify_split(vq->vq); > +} > + > +bool vhost_vring_kick(VhostShadowVirtqueue *vq) > +{ > + return vhost_vring_should_kick(vq) ? 
event_notifier_set(&vq->hdev_notifier) > + : true; > +} How is the return value used? event_notifier_set() returns -errno so this function returns false on success, and true when notifications are disabled or event_notifier_set() failed. I'm not sure this return value can be used for anything. > + > +VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx) I see now that this function allocates the VhostShadowVirtqueue. Maybe adding _new() to the name would make that clear? > +{ > + struct vhost_vring_file file = { > + .index = idx > + }; > + VirtQueue *vq = virtio_get_queue(dev->vdev, idx); > + VhostShadowVirtqueue *svq; > + int r; > + > + svq = g_new0(VhostShadowVirtqueue, 1); > + svq->vq = vq; > + > + r = event_notifier_init(&svq->hdev_notifier, 0); > + assert(r == 0); > + > + file.fd = event_notifier_get_fd(&svq->hdev_notifier); > + r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); > + assert(r == 0); > + > + return svq; > +} I guess there are assumptions about the status of the device? Does the virtqueue need to be disabled when this function is called? > + > +void vhost_sw_lm_shadow_vq_free(VhostShadowVirtqueue *vq) > +{ > + event_notifier_cleanup(&vq->hdev_notifier); > + g_free(vq); > +} > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > index 9cbd52a7f1..a55b684b5f 100644 > --- a/hw/virtio/vhost.c > +++ b/hw/virtio/vhost.c > @@ -13,6 +13,8 @@ > * GNU GPL, version 2 or (at your option) any later version. 
> */ > > +#include "hw/virtio/vhost-sw-lm-ring.h" > + > #include "qemu/osdep.h" > #include "qapi/error.h" > #include "hw/virtio/vhost.h" > @@ -61,6 +63,20 @@ bool vhost_has_free_slot(void) > return slots_limit > used_memslots; > } > > +static struct vhost_dev *vhost_dev_from_virtio(const VirtIODevice *vdev) > +{ > + struct vhost_dev *hdev; > + > + QLIST_FOREACH(hdev, &vhost_devices, entry) { > + if (hdev->vdev == vdev) { > + return hdev; > + } > + } > + > + assert(hdev); > + return NULL; > +} > + > static bool vhost_dev_can_log(const struct vhost_dev *hdev) > { > return hdev->features & (0x1ULL << VHOST_F_LOG_ALL); > @@ -148,6 +164,12 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, > return 0; > } > > +static void vhost_log_sync_nop(MemoryListener *listener, > + MemoryRegionSection *section) > +{ > + return; > +} > + > static void vhost_log_sync(MemoryListener *listener, > MemoryRegionSection *section) > { > @@ -928,6 +950,71 @@ static void vhost_log_global_stop(MemoryListener *listener) > } > } > > +static void handle_sw_lm_vq(VirtIODevice *vdev, VirtQueue *vq) > +{ > + struct vhost_dev *hdev = vhost_dev_from_virtio(vdev); If this lookup becomes a performance bottleneck there are other options for determining the vhost_dev. For example VirtIODevice could have a field for stashing the vhost_dev pointer. > + uint16_t idx = virtio_get_queue_index(vq); > + > + VhostShadowVirtqueue *svq = hdev->sw_lm_shadow_vq[idx]; > + > + vhost_vring_kick(svq); > +} I'm a confused. Do we need to pop elements from vq and push equivalent elements onto svq before kicking? Either a todo comment is missing or I misunderstand how this works. 
> + > +static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > +{ > + int idx; > + > + vhost_dev_enable_notifiers(dev, dev->vdev); > + for (idx = 0; idx < dev->nvqs; ++idx) { > + vhost_sw_lm_shadow_vq_free(dev->sw_lm_shadow_vq[idx]); > + } > + > + return 0; > +} > + > +static int vhost_sw_live_migration_start(struct vhost_dev *dev) > +{ > + int idx; > + > + for (idx = 0; idx < dev->nvqs; ++idx) { > + dev->sw_lm_shadow_vq[idx] = vhost_sw_lm_shadow_vq(dev, idx); > + } > + > + vhost_dev_disable_notifiers(dev, dev->vdev); There is a race condition if the guest kicks the vq while this is happening. The shadow vq hdev_notifier needs to be set so the vhost device checks the virtqueue for requests that slipped in during the race window. > + > + return 0; > +} > + > +static int vhost_sw_live_migration_enable(struct vhost_dev *dev, > + bool enable_lm) > +{ > + if (enable_lm) { > + return vhost_sw_live_migration_start(dev); > + } else { > + return vhost_sw_live_migration_stop(dev); > + } > +} > + > +static void vhost_sw_lm_global_start(MemoryListener *listener) > +{ > + int r; > + > + r = vhost_migration_log(listener, true, vhost_sw_live_migration_enable); > + if (r < 0) { > + abort(); > + } > +} > + > +static void vhost_sw_lm_global_stop(MemoryListener *listener) > +{ > + int r; > + > + r = vhost_migration_log(listener, false, vhost_sw_live_migration_enable); > + if (r < 0) { > + abort(); > + } > +} > + > static void vhost_log_start(MemoryListener *listener, > MemoryRegionSection *section, > int old, int new) > @@ -1334,9 +1421,14 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, > .region_nop = vhost_region_addnop, > .log_start = vhost_log_start, > .log_stop = vhost_log_stop, > - .log_sync = vhost_log_sync, > - .log_global_start = vhost_log_global_start, > - .log_global_stop = vhost_log_global_stop, > + .log_sync = !vhost_dev_can_log(hdev) ? > + vhost_log_sync_nop : > + vhost_log_sync, Why is this change necessary now? 
It's not clear to me why it was previously okay to call vhost_log_sync(). > + .log_global_start = !vhost_dev_can_log(hdev) ? > + vhost_sw_lm_global_start : > + vhost_log_global_start, > + .log_global_stop = !vhost_dev_can_log(hdev) ? vhost_sw_lm_global_stop : > + vhost_log_global_stop, > .eventfd_add = vhost_eventfd_add, > .eventfd_del = vhost_eventfd_del, > .priority = 10 > @@ -1364,6 +1456,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, > error_free(hdev->migration_blocker); > goto fail_busyloop; > } > + } else { > + hdev->sw_lm_vq_handler = handle_sw_lm_vq; > } > > hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); > diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build > index fbff9bc9d4..17419cb13e 100644 > --- a/hw/virtio/meson.build > +++ b/hw/virtio/meson.build > @@ -11,7 +11,7 @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) > > virtio_ss = ss.source_set() > virtio_ss.add(files('virtio.c')) > -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c')) > +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-sw-lm-ring.c')) > virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) > virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) > virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) > -- > 2.18.4 >
On Mon, Dec 7, 2020 at 6:42 PM Stefan Hajnoczi <stefanha@gmail.com> wrote: > > On Fri, Nov 20, 2020 at 07:50:45PM +0100, Eugenio Pérez wrote: > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com> > > --- > > hw/virtio/vhost-sw-lm-ring.h | 26 +++++++++ > > include/hw/virtio/vhost.h | 3 ++ > > hw/virtio/vhost-sw-lm-ring.c | 60 +++++++++++++++++++++ > > hw/virtio/vhost.c | 100 +++++++++++++++++++++++++++++++++-- > > hw/virtio/meson.build | 2 +- > > 5 files changed, 187 insertions(+), 4 deletions(-) > > create mode 100644 hw/virtio/vhost-sw-lm-ring.h > > create mode 100644 hw/virtio/vhost-sw-lm-ring.c > > > > diff --git a/hw/virtio/vhost-sw-lm-ring.h b/hw/virtio/vhost-sw-lm-ring.h > > new file mode 100644 > > index 0000000000..86dc081b93 > > --- /dev/null > > +++ b/hw/virtio/vhost-sw-lm-ring.h > > @@ -0,0 +1,26 @@ > > +/* > > + * vhost software live migration ring > > + * > > + * SPDX-FileCopyrightText: Red Hat, Inc. 2020 > > + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> > > + * > > + * SPDX-License-Identifier: GPL-2.0-or-later > > + */ > > + > > +#ifndef VHOST_SW_LM_RING_H > > +#define VHOST_SW_LM_RING_H > > + > > +#include "qemu/osdep.h" > > + > > +#include "hw/virtio/virtio.h" > > +#include "hw/virtio/vhost.h" > > + > > +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue; > > Here it's called a shadow virtqueue while the file calls it a > sw-lm-ring. Please use a single name. > I will switch to shadow virtqueue. > > + > > +bool vhost_vring_kick(VhostShadowVirtqueue *vq); > > vhost_shadow_vq_kick()? > > > + > > +VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx); > > vhost_dev_get_shadow_vq()? This could be in include/hw/virtio/vhost.h > with the other vhost_dev_*() functions. > I agree, that is a better place. > > + > > +void vhost_sw_lm_shadow_vq_free(VhostShadowVirtqueue *vq); > > Hmm...now I wonder what the lifecycle is. Does vhost_sw_lm_shadow_vq() > allocate it? 
> > Please add doc comments explaining these functions either in this header > file or in vhost-sw-lm-ring.c. > Will document. > > + > > +#endif > > diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h > > index b5b7496537..93cc3f1ae3 100644 > > --- a/include/hw/virtio/vhost.h > > +++ b/include/hw/virtio/vhost.h > > @@ -54,6 +54,8 @@ struct vhost_iommu { > > QLIST_ENTRY(vhost_iommu) iommu_next; > > }; > > > > +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue; > > + > > typedef struct VhostDevConfigOps { > > /* Vhost device config space changed callback > > */ > > @@ -83,6 +85,7 @@ struct vhost_dev { > > bool started; > > bool log_enabled; > > uint64_t log_size; > > + VhostShadowVirtqueue *sw_lm_shadow_vq[2]; > > The hardcoded 2 is probably for single-queue virtio-net? I guess this > will eventually become VhostShadowVirtqueue *shadow_vqs or > VhostShadowVirtqueue **shadow_vqs, depending on whether each one should > be allocated individually. > Yes, I will switch to one way or another for the next series. > > VirtIOHandleOutput sw_lm_vq_handler; > > Error *migration_blocker; > > const VhostOps *vhost_ops; > > diff --git a/hw/virtio/vhost-sw-lm-ring.c b/hw/virtio/vhost-sw-lm-ring.c > > new file mode 100644 > > index 0000000000..0192e77831 > > --- /dev/null > > +++ b/hw/virtio/vhost-sw-lm-ring.c > > @@ -0,0 +1,60 @@ > > +/* > > + * vhost software live migration ring > > + * > > + * SPDX-FileCopyrightText: Red Hat, Inc. 
2020 > > + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> > > + * > > + * SPDX-License-Identifier: GPL-2.0-or-later > > + */ > > + > > +#include "hw/virtio/vhost-sw-lm-ring.h" > > +#include "hw/virtio/vhost.h" > > + > > +#include "standard-headers/linux/vhost_types.h" > > +#include "standard-headers/linux/virtio_ring.h" > > + > > +#include "qemu/event_notifier.h" > > + > > +typedef struct VhostShadowVirtqueue { > > + EventNotifier hdev_notifier; > > + VirtQueue *vq; > > +} VhostShadowVirtqueue; > > + > > +static inline bool vhost_vring_should_kick(VhostShadowVirtqueue *vq) > > +{ > > + return virtio_queue_get_used_notify_split(vq->vq); > > +} > > + > > +bool vhost_vring_kick(VhostShadowVirtqueue *vq) > > +{ > > + return vhost_vring_should_kick(vq) ? event_notifier_set(&vq->hdev_notifier) > > + : true; > > +} > > How is the return value used? event_notifier_set() returns -errno so > this function returns false on success, and true when notifications are > disabled or event_notifier_set() failed. I'm not sure this return value > can be used for anything. > I think you are right, this is bad. It could be used for retry, but the failure is unlikely and the fail path is easy to add in the future if needed. It will be void. > > + > > +VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx) > > I see now that this function allocates the VhostShadowVirtqueue. Maybe > adding _new() to the name would make that clear? > Yes, I will rename. 
> > +{ > > + struct vhost_vring_file file = { > > + .index = idx > > + }; > > + VirtQueue *vq = virtio_get_queue(dev->vdev, idx); > > + VhostShadowVirtqueue *svq; > > + int r; > > + > > + svq = g_new0(VhostShadowVirtqueue, 1); > > + svq->vq = vq; > > + > > + r = event_notifier_init(&svq->hdev_notifier, 0); > > + assert(r == 0); > > + > > + file.fd = event_notifier_get_fd(&svq->hdev_notifier); > > + r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); > > + assert(r == 0); > > + > > + return svq; > > +} > > I guess there are assumptions about the status of the device? Does the > virtqueue need to be disabled when this function is called? > Yes. Maybe an assertion checking the notification state? > > + > > +void vhost_sw_lm_shadow_vq_free(VhostShadowVirtqueue *vq) > > +{ > > + event_notifier_cleanup(&vq->hdev_notifier); > > + g_free(vq); > > +} > > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > > index 9cbd52a7f1..a55b684b5f 100644 > > --- a/hw/virtio/vhost.c > > +++ b/hw/virtio/vhost.c > > @@ -13,6 +13,8 @@ > > * GNU GPL, version 2 or (at your option) any later version. 
> > */ > > > > +#include "hw/virtio/vhost-sw-lm-ring.h" > > + > > #include "qemu/osdep.h" > > #include "qapi/error.h" > > #include "hw/virtio/vhost.h" > > @@ -61,6 +63,20 @@ bool vhost_has_free_slot(void) > > return slots_limit > used_memslots; > > } > > > > +static struct vhost_dev *vhost_dev_from_virtio(const VirtIODevice *vdev) > > +{ > > + struct vhost_dev *hdev; > > + > > + QLIST_FOREACH(hdev, &vhost_devices, entry) { > > + if (hdev->vdev == vdev) { > > + return hdev; > > + } > > + } > > + > > + assert(hdev); > > + return NULL; > > +} > > + > > static bool vhost_dev_can_log(const struct vhost_dev *hdev) > > { > > return hdev->features & (0x1ULL << VHOST_F_LOG_ALL); > > @@ -148,6 +164,12 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, > > return 0; > > } > > > > +static void vhost_log_sync_nop(MemoryListener *listener, > > + MemoryRegionSection *section) > > +{ > > + return; > > +} > > + > > static void vhost_log_sync(MemoryListener *listener, > > MemoryRegionSection *section) > > { > > @@ -928,6 +950,71 @@ static void vhost_log_global_stop(MemoryListener *listener) > > } > > } > > > > +static void handle_sw_lm_vq(VirtIODevice *vdev, VirtQueue *vq) > > +{ > > + struct vhost_dev *hdev = vhost_dev_from_virtio(vdev); > > If this lookup becomes a performance bottleneck there are other options > for determining the vhost_dev. For example VirtIODevice could have a > field for stashing the vhost_dev pointer. > I would like to have something like that for the definitive patch series, yes. I would not like to increase the virtio knowledge of vhost, but it seems the most straightforward change for it. > > + uint16_t idx = virtio_get_queue_index(vq); > > + > > + VhostShadowVirtqueue *svq = hdev->sw_lm_shadow_vq[idx]; > > + > > + vhost_vring_kick(svq); > > +} > > I'm a confused. Do we need to pop elements from vq and push equivalent > elements onto svq before kicking? Either a todo comment is missing or I > misunderstand how this works. 
> At this commit only notifications are forwarded, buffers are still fetched directly from the guest. A TODO comment would have been helpful, yes :). > > + > > +static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > > +{ > > + int idx; > > + > > + vhost_dev_enable_notifiers(dev, dev->vdev); > > + for (idx = 0; idx < dev->nvqs; ++idx) { > > + vhost_sw_lm_shadow_vq_free(dev->sw_lm_shadow_vq[idx]); > > + } > > + > > + return 0; > > +} > > + > > +static int vhost_sw_live_migration_start(struct vhost_dev *dev) > > +{ > > + int idx; > > + > > + for (idx = 0; idx < dev->nvqs; ++idx) { > > + dev->sw_lm_shadow_vq[idx] = vhost_sw_lm_shadow_vq(dev, idx); > > + } > > + > > + vhost_dev_disable_notifiers(dev, dev->vdev); > > There is a race condition if the guest kicks the vq while this is > happening. The shadow vq hdev_notifier needs to be set so the vhost > device checks the virtqueue for requests that slipped in during the > race window. > I'm not sure if I follow you. If I understand correctly, vhost_dev_disable_notifiers calls virtio_bus_cleanup_host_notifier, and the latter calls virtio_queue_host_notifier_read. That's why the documentation says "This might actually run the qemu handlers right away, so virtio in qemu must be completely setup when this is called.". Am I missing something? 
> > + > > + return 0; > > +} > > + > > +static int vhost_sw_live_migration_enable(struct vhost_dev *dev, > > + bool enable_lm) > > +{ > > + if (enable_lm) { > > + return vhost_sw_live_migration_start(dev); > > + } else { > > + return vhost_sw_live_migration_stop(dev); > > + } > > +} > > + > > +static void vhost_sw_lm_global_start(MemoryListener *listener) > > +{ > > + int r; > > + > > + r = vhost_migration_log(listener, true, vhost_sw_live_migration_enable); > > + if (r < 0) { > > + abort(); > > + } > > +} > > + > > +static void vhost_sw_lm_global_stop(MemoryListener *listener) > > +{ > > + int r; > > + > > + r = vhost_migration_log(listener, false, vhost_sw_live_migration_enable); > > + if (r < 0) { > > + abort(); > > + } > > +} > > + > > static void vhost_log_start(MemoryListener *listener, > > MemoryRegionSection *section, > > int old, int new) > > @@ -1334,9 +1421,14 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, > > .region_nop = vhost_region_addnop, > > .log_start = vhost_log_start, > > .log_stop = vhost_log_stop, > > - .log_sync = vhost_log_sync, > > - .log_global_start = vhost_log_global_start, > > - .log_global_stop = vhost_log_global_stop, > > + .log_sync = !vhost_dev_can_log(hdev) ? > > + vhost_log_sync_nop : > > + vhost_log_sync, > > Why is this change necessary now? It's not clear to me why it was > previously okay to call vhost_log_sync(). > This is only needed because I'm hijacking the vhost log system to know when migration has started. Since vhost log is not allocated, the call to vhost_log_sync() will fail to write in the bitmap. Likely, this change will be discarded in the final patch series, since another way of detecting live migration will be used. > > + .log_global_start = !vhost_dev_can_log(hdev) ? > > + vhost_sw_lm_global_start : > > + vhost_log_global_start, > > + .log_global_stop = !vhost_dev_can_log(hdev) ? 
vhost_sw_lm_global_stop : > > + vhost_log_global_stop, > > .eventfd_add = vhost_eventfd_add, > > .eventfd_del = vhost_eventfd_del, > > .priority = 10 > > @@ -1364,6 +1456,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, > > error_free(hdev->migration_blocker); > > goto fail_busyloop; > > } > > + } else { > > + hdev->sw_lm_vq_handler = handle_sw_lm_vq; > > } > > > > hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); > > diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build > > index fbff9bc9d4..17419cb13e 100644 > > --- a/hw/virtio/meson.build > > +++ b/hw/virtio/meson.build > > @@ -11,7 +11,7 @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) > > > > virtio_ss = ss.source_set() > > virtio_ss.add(files('virtio.c')) > > -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c')) > > +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-sw-lm-ring.c')) > > virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) > > virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) > > virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) > > -- > > 2.18.4 > >
On Wed, Dec 09, 2020 at 06:08:14PM +0100, Eugenio Perez Martin wrote: > On Mon, Dec 7, 2020 at 6:42 PM Stefan Hajnoczi <stefanha@gmail.com> wrote: > > On Fri, Nov 20, 2020 at 07:50:45PM +0100, Eugenio Pérez wrote: > > > +{ > > > + struct vhost_vring_file file = { > > > + .index = idx > > > + }; > > > + VirtQueue *vq = virtio_get_queue(dev->vdev, idx); > > > + VhostShadowVirtqueue *svq; > > > + int r; > > > + > > > + svq = g_new0(VhostShadowVirtqueue, 1); > > > + svq->vq = vq; > > > + > > > + r = event_notifier_init(&svq->hdev_notifier, 0); > > > + assert(r == 0); > > > + > > > + file.fd = event_notifier_get_fd(&svq->hdev_notifier); > > > + r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); > > > + assert(r == 0); > > > + > > > + return svq; > > > +} > > > > I guess there are assumptions about the status of the device? Does the > > virtqueue need to be disabled when this function is called? > > > > Yes. Maybe an assertion checking the notification state? Sounds good. > > > + > > > +static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > > > +{ > > > + int idx; > > > + > > > + vhost_dev_enable_notifiers(dev, dev->vdev); > > > + for (idx = 0; idx < dev->nvqs; ++idx) { > > > + vhost_sw_lm_shadow_vq_free(dev->sw_lm_shadow_vq[idx]); > > > + } > > > + > > > + return 0; > > > +} > > > + > > > +static int vhost_sw_live_migration_start(struct vhost_dev *dev) > > > +{ > > > + int idx; > > > + > > > + for (idx = 0; idx < dev->nvqs; ++idx) { > > > + dev->sw_lm_shadow_vq[idx] = vhost_sw_lm_shadow_vq(dev, idx); > > > + } > > > + > > > + vhost_dev_disable_notifiers(dev, dev->vdev); > > > > There is a race condition if the guest kicks the vq while this is > > happening. The shadow vq hdev_notifier needs to be set so the vhost > > device checks the virtqueue for requests that slipped in during the > > race window. > > > > I'm not sure if I follow you. 
If I understand correctly, > vhost_dev_disable_notifiers calls virtio_bus_cleanup_host_notifier, > and the latter calls virtio_queue_host_notifier_read. That's why the > documentation says "This might actually run the qemu handlers right > away, so virtio in qemu must be completely setup when this is > called.". Am I missing something? There are at least two cases: 1. Virtqueue kicks that come before vhost_dev_disable_notifiers(). vhost_dev_disable_notifiers() notices that and calls virtio_queue_notify_vq(). Will handle_sw_lm_vq() be invoked or is the device's vq handler function invoked? 2. Virtqueue kicks that come in after vhost_dev_disable_notifiers() returns. We hold the QEMU global mutex so the vCPU thread cannot perform MMIO/PIO dispatch immediately. The vCPU thread's ioctl(KVM_RUN) has already returned and will dispatch the MMIO/PIO access inside QEMU as soon as the global mutex is released. In other words, we're not taking the kvm.ko ioeventfd path but memory_region_dispatch_write_eventfds() should signal the ioeventfd that is registered at the time the dispatch occurs. Is that eventfd handled by handle_sw_lm_vq()? Neither of these cases is obvious from the code. At least comments would help but I suspect restructuring the code so the critical ioeventfd state changes happen in a sequence would make it even clearer.
On Thu, Dec 10, 2020 at 12:51 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > On Wed, Dec 09, 2020 at 06:08:14PM +0100, Eugenio Perez Martin wrote: > > On Mon, Dec 7, 2020 at 6:42 PM Stefan Hajnoczi <stefanha@gmail.com> wrote: > > > On Fri, Nov 20, 2020 at 07:50:45PM +0100, Eugenio Pérez wrote: > > > > +{ > > > > + struct vhost_vring_file file = { > > > > + .index = idx > > > > + }; > > > > + VirtQueue *vq = virtio_get_queue(dev->vdev, idx); > > > > + VhostShadowVirtqueue *svq; > > > > + int r; > > > > + > > > > + svq = g_new0(VhostShadowVirtqueue, 1); > > > > + svq->vq = vq; > > > > + > > > > + r = event_notifier_init(&svq->hdev_notifier, 0); > > > > + assert(r == 0); > > > > + > > > > + file.fd = event_notifier_get_fd(&svq->hdev_notifier); > > > > + r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); > > > > + assert(r == 0); > > > > + > > > > + return svq; > > > > +} > > > > > > I guess there are assumptions about the status of the device? Does the > > > virtqueue need to be disabled when this function is called? > > > > > > > Yes. Maybe an assertion checking the notification state? > > Sounds good. > > > > > + > > > > +static int vhost_sw_live_migration_stop(struct vhost_dev *dev) > > > > +{ > > > > + int idx; > > > > + > > > > + vhost_dev_enable_notifiers(dev, dev->vdev); > > > > + for (idx = 0; idx < dev->nvqs; ++idx) { > > > > + vhost_sw_lm_shadow_vq_free(dev->sw_lm_shadow_vq[idx]); > > > > + } > > > > + > > > > + return 0; > > > > +} > > > > + > > > > +static int vhost_sw_live_migration_start(struct vhost_dev *dev) > > > > +{ > > > > + int idx; > > > > + > > > > + for (idx = 0; idx < dev->nvqs; ++idx) { > > > > + dev->sw_lm_shadow_vq[idx] = vhost_sw_lm_shadow_vq(dev, idx); > > > > + } > > > > + > > > > + vhost_dev_disable_notifiers(dev, dev->vdev); > > > > > > There is a race condition if the guest kicks the vq while this is > > > happening. 
The shadow vq hdev_notifier needs to be set so the vhost > > > device checks the virtqueue for requests that slipped in during the > > > race window. > > > > > > > I'm not sure if I follow you. If I understand correctly, > > vhost_dev_disable_notifiers calls virtio_bus_cleanup_host_notifier, > > and the latter calls virtio_queue_host_notifier_read. That's why the > > documentation says "This might actually run the qemu handlers right > > away, so virtio in qemu must be completely setup when this is > > called.". Am I missing something? > > There are at least two cases: > > 1. Virtqueue kicks that come before vhost_dev_disable_notifiers(). > vhost_dev_disable_notifiers() notices that and calls > virtio_queue_notify_vq(). Will handle_sw_lm_vq() be invoked or is the > device's vq handler function invoked? > As I understand both the code and your question, no kick can call handle_sw_lm_vq before vhost_dev_disable_notifiers (in particular, before memory_region_add_eventfd calls in virtio_{pci,mmio,ccw}_ioeventfd_assign(true) calls. So these will be handled by the device. > 2. Virtqueue kicks that come in after vhost_dev_disable_notifiers() > returns. We hold the QEMU global mutex so the vCPU thread cannot > perform MMIO/PIO dispatch immediately. The vCPU thread's > ioctl(KVM_RUN) has already returned and will dispatch dispatch the > MMIO/PIO access inside QEMU as soon as the global mutex is released. > In other words, we're not taking the kvm.ko ioeventfd path but > memory_region_dispatch_write_eventfds() should signal the ioeventfd > that is registered at the time the dispatch occurs. Is that eventfd > handled by handle_sw_lm_vq()? > I didn't think on that case, but it's being very difficult for me to reproduce that behavior. It should be handled by handle_sw_lm_vq, but maybe I'm trusting too much in vhost_dev_disable_notifiers. > Neither of these cases are obvious from the code. 
At least comments > would help but I suspect restructuring the code so the critical > ioeventfd state changes happen in a sequence would make it even clearer. Could you expand on this? That change is managed entirely by virtio_bus_set_host_notifier, and the virtqueue callback is already changed before the call to vhost_dev_disable_notifiers(). Did you mean to restructure virtio_bus_set_host_notifier or vhost_dev_disable_notifiers maybe? Thanks!
diff --git a/hw/virtio/vhost-sw-lm-ring.h b/hw/virtio/vhost-sw-lm-ring.h new file mode 100644 index 0000000000..86dc081b93 --- /dev/null +++ b/hw/virtio/vhost-sw-lm-ring.h @@ -0,0 +1,26 @@ +/* + * vhost software live migration ring + * + * SPDX-FileCopyrightText: Red Hat, Inc. 2020 + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef VHOST_SW_LM_RING_H +#define VHOST_SW_LM_RING_H + +#include "qemu/osdep.h" + +#include "hw/virtio/virtio.h" +#include "hw/virtio/vhost.h" + +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue; + +bool vhost_vring_kick(VhostShadowVirtqueue *vq); + +VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx); + +void vhost_sw_lm_shadow_vq_free(VhostShadowVirtqueue *vq); + +#endif diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index b5b7496537..93cc3f1ae3 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -54,6 +54,8 @@ struct vhost_iommu { QLIST_ENTRY(vhost_iommu) iommu_next; }; +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue; + typedef struct VhostDevConfigOps { /* Vhost device config space changed callback */ @@ -83,6 +85,7 @@ struct vhost_dev { bool started; bool log_enabled; uint64_t log_size; + VhostShadowVirtqueue *sw_lm_shadow_vq[2]; VirtIOHandleOutput sw_lm_vq_handler; Error *migration_blocker; const VhostOps *vhost_ops; diff --git a/hw/virtio/vhost-sw-lm-ring.c b/hw/virtio/vhost-sw-lm-ring.c new file mode 100644 index 0000000000..0192e77831 --- /dev/null +++ b/hw/virtio/vhost-sw-lm-ring.c @@ -0,0 +1,60 @@ +/* + * vhost software live migration ring + * + * SPDX-FileCopyrightText: Red Hat, Inc. 
2020 + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "hw/virtio/vhost-sw-lm-ring.h" +#include "hw/virtio/vhost.h" + +#include "standard-headers/linux/vhost_types.h" +#include "standard-headers/linux/virtio_ring.h" + +#include "qemu/event_notifier.h" + +typedef struct VhostShadowVirtqueue { + EventNotifier hdev_notifier; + VirtQueue *vq; +} VhostShadowVirtqueue; + +static inline bool vhost_vring_should_kick(VhostShadowVirtqueue *vq) +{ + return virtio_queue_get_used_notify_split(vq->vq); +} + +bool vhost_vring_kick(VhostShadowVirtqueue *vq) +{ + return vhost_vring_should_kick(vq) ? event_notifier_set(&vq->hdev_notifier) + : true; +} + +VhostShadowVirtqueue *vhost_sw_lm_shadow_vq(struct vhost_dev *dev, int idx) +{ + struct vhost_vring_file file = { + .index = idx + }; + VirtQueue *vq = virtio_get_queue(dev->vdev, idx); + VhostShadowVirtqueue *svq; + int r; + + svq = g_new0(VhostShadowVirtqueue, 1); + svq->vq = vq; + + r = event_notifier_init(&svq->hdev_notifier, 0); + assert(r == 0); + + file.fd = event_notifier_get_fd(&svq->hdev_notifier); + r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); + assert(r == 0); + + return svq; +} + +void vhost_sw_lm_shadow_vq_free(VhostShadowVirtqueue *vq) +{ + event_notifier_cleanup(&vq->hdev_notifier); + g_free(vq); +} diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 9cbd52a7f1..a55b684b5f 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -13,6 +13,8 @@ * GNU GPL, version 2 or (at your option) any later version. 
*/ +#include "hw/virtio/vhost-sw-lm-ring.h" + #include "qemu/osdep.h" #include "qapi/error.h" #include "hw/virtio/vhost.h" @@ -61,6 +63,20 @@ bool vhost_has_free_slot(void) return slots_limit > used_memslots; } +static struct vhost_dev *vhost_dev_from_virtio(const VirtIODevice *vdev) +{ + struct vhost_dev *hdev; + + QLIST_FOREACH(hdev, &vhost_devices, entry) { + if (hdev->vdev == vdev) { + return hdev; + } + } + + assert(hdev); + return NULL; +} + static bool vhost_dev_can_log(const struct vhost_dev *hdev) { return hdev->features & (0x1ULL << VHOST_F_LOG_ALL); @@ -148,6 +164,12 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, return 0; } +static void vhost_log_sync_nop(MemoryListener *listener, + MemoryRegionSection *section) +{ + return; +} + static void vhost_log_sync(MemoryListener *listener, MemoryRegionSection *section) { @@ -928,6 +950,71 @@ static void vhost_log_global_stop(MemoryListener *listener) } } +static void handle_sw_lm_vq(VirtIODevice *vdev, VirtQueue *vq) +{ + struct vhost_dev *hdev = vhost_dev_from_virtio(vdev); + uint16_t idx = virtio_get_queue_index(vq); + + VhostShadowVirtqueue *svq = hdev->sw_lm_shadow_vq[idx]; + + vhost_vring_kick(svq); +} + +static int vhost_sw_live_migration_stop(struct vhost_dev *dev) +{ + int idx; + + vhost_dev_enable_notifiers(dev, dev->vdev); + for (idx = 0; idx < dev->nvqs; ++idx) { + vhost_sw_lm_shadow_vq_free(dev->sw_lm_shadow_vq[idx]); + } + + return 0; +} + +static int vhost_sw_live_migration_start(struct vhost_dev *dev) +{ + int idx; + + for (idx = 0; idx < dev->nvqs; ++idx) { + dev->sw_lm_shadow_vq[idx] = vhost_sw_lm_shadow_vq(dev, idx); + } + + vhost_dev_disable_notifiers(dev, dev->vdev); + + return 0; +} + +static int vhost_sw_live_migration_enable(struct vhost_dev *dev, + bool enable_lm) +{ + if (enable_lm) { + return vhost_sw_live_migration_start(dev); + } else { + return vhost_sw_live_migration_stop(dev); + } +} + +static void vhost_sw_lm_global_start(MemoryListener *listener) +{ + int r; + + r 
= vhost_migration_log(listener, true, vhost_sw_live_migration_enable); + if (r < 0) { + abort(); + } +} + +static void vhost_sw_lm_global_stop(MemoryListener *listener) +{ + int r; + + r = vhost_migration_log(listener, false, vhost_sw_live_migration_enable); + if (r < 0) { + abort(); + } +} + static void vhost_log_start(MemoryListener *listener, MemoryRegionSection *section, int old, int new) @@ -1334,9 +1421,14 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, .region_nop = vhost_region_addnop, .log_start = vhost_log_start, .log_stop = vhost_log_stop, - .log_sync = vhost_log_sync, - .log_global_start = vhost_log_global_start, - .log_global_stop = vhost_log_global_stop, + .log_sync = !vhost_dev_can_log(hdev) ? + vhost_log_sync_nop : + vhost_log_sync, + .log_global_start = !vhost_dev_can_log(hdev) ? + vhost_sw_lm_global_start : + vhost_log_global_start, + .log_global_stop = !vhost_dev_can_log(hdev) ? vhost_sw_lm_global_stop : + vhost_log_global_stop, .eventfd_add = vhost_eventfd_add, .eventfd_del = vhost_eventfd_del, .priority = 10 @@ -1364,6 +1456,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, error_free(hdev->migration_blocker); goto fail_busyloop; } + } else { + hdev->sw_lm_vq_handler = handle_sw_lm_vq; } hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build index fbff9bc9d4..17419cb13e 100644 --- a/hw/virtio/meson.build +++ b/hw/virtio/meson.build @@ -11,7 +11,7 @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) virtio_ss = ss.source_set() virtio_ss.add(files('virtio.c')) -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c')) +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-sw-lm-ring.c')) virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: 
files('virtio-balloon.c'))
Signed-off-by: Eugenio Pérez <eperezma@redhat.com> --- hw/virtio/vhost-sw-lm-ring.h | 26 +++++++++ include/hw/virtio/vhost.h | 3 ++ hw/virtio/vhost-sw-lm-ring.c | 60 +++++++++++++++++++++ hw/virtio/vhost.c | 100 +++++++++++++++++++++++++++++++++-- hw/virtio/meson.build | 2 +- 5 files changed, 187 insertions(+), 4 deletions(-) create mode 100644 hw/virtio/vhost-sw-lm-ring.h create mode 100644 hw/virtio/vhost-sw-lm-ring.c