Message ID | 1519980450-3404-3-git-send-email-wei.w.wang@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Mar 02, 2018 at 04:47:29PM +0800, Wei Wang wrote: > The new feature enables the virtio-balloon device to receive hints of > guest free pages from the free page vq. Callers call the > free_page_start API to start the reporting, which creates a thread to > poll for free page hints. The free_page_stop API stops the reporting and > makes the thread exit. > > Signed-off-by: Wei Wang <wei.w.wang@intel.com> > Signed-off-by: Liang Li <liang.z.li@intel.com> > CC: Michael S. Tsirkin <mst@redhat.com> > CC: Dr. David Alan Gilbert <dgilbert@redhat.com> > CC: Juan Quintela <quintela@redhat.com> > --- > balloon.c | 49 +++++-- > hw/virtio/virtio-balloon.c | 172 +++++++++++++++++++++--- > include/hw/virtio/virtio-balloon.h | 14 +- > include/standard-headers/linux/virtio_balloon.h | 7 + > include/sysemu/balloon.h | 15 ++- > 5 files changed, 228 insertions(+), 29 deletions(-) > > diff --git a/balloon.c b/balloon.c > index d8dd6fe..b0b0749 100644 > --- a/balloon.c > +++ b/balloon.c > @@ -36,6 +36,9 @@ > > static QEMUBalloonEvent *balloon_event_fn; > static QEMUBalloonStatus *balloon_stat_fn; > +static QEMUBalloonFreePageSupport *balloon_free_page_support_fn; > +static QEMUBalloonFreePageStart *balloon_free_page_start_fn; > +static QEMUBalloonFreePageStop *balloon_free_page_stop_fn; > static void *balloon_opaque; > static bool balloon_inhibited; > > @@ -64,19 +67,42 @@ static bool have_balloon(Error **errp) > return true; > } > > -int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, > - QEMUBalloonStatus *stat_func, void *opaque) > +bool balloon_free_page_support(void) > { > - if (balloon_event_fn || balloon_stat_fn || balloon_opaque) { > - /* We're already registered one balloon handler. How many can > - * a guest really have? > - */ > - return -1; > + return balloon_free_page_support_fn && > + balloon_free_page_support_fn(balloon_opaque); > +} > + > +void balloon_free_page_start(void) > +{ > + balloon_free_page_start_fn(balloon_opaque); > +} > + > +void balloon_free_page_stop(void) > +{ > + balloon_free_page_stop_fn(balloon_opaque); > +} > + > +void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn, > + QEMUBalloonStatus *stat_fn, > + QEMUBalloonFreePageSupport *free_page_support_fn, > + QEMUBalloonFreePageStart *free_page_start_fn, > + QEMUBalloonFreePageStop *free_page_stop_fn, > + void *opaque) > +{ > + if (balloon_event_fn || balloon_stat_fn || balloon_free_page_support_fn || > + balloon_free_page_start_fn || balloon_free_page_stop_fn || > + balloon_opaque) { > + /* We already registered one balloon handler. */ > + return; > } > - balloon_event_fn = event_func; > - balloon_stat_fn = stat_func; > + > + balloon_event_fn = event_fn; > + balloon_stat_fn = stat_fn; > + balloon_free_page_support_fn = free_page_support_fn; > + balloon_free_page_start_fn = free_page_start_fn; > + balloon_free_page_stop_fn = free_page_stop_fn; > balloon_opaque = opaque; > - return 0; > } > > void qemu_remove_balloon_handler(void *opaque) > @@ -86,6 +112,9 @@ void qemu_remove_balloon_handler(void *opaque) > } > balloon_event_fn = NULL; > balloon_stat_fn = NULL; > + balloon_free_page_support_fn = NULL; > + balloon_free_page_start_fn = NULL; > + balloon_free_page_stop_fn = NULL; > balloon_opaque = NULL; > } > > diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c > index 4822449..4607879 100644 > --- a/hw/virtio/virtio-balloon.c > +++ b/hw/virtio/virtio-balloon.c > @@ -31,6 +31,7 @@ > > #include "hw/virtio/virtio-bus.h" > #include "hw/virtio/virtio-access.h" > +#include "migration/misc.h" > > #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) > > @@ -308,6 +309,100 @@ out: > } > } > > +static void *virtio_balloon_poll_free_page_hints(void *opaque) > +{ > + VirtQueueElement *elem; > + VirtIOBalloon *dev = opaque; > + VirtQueue *vq = dev->free_page_vq; > + uint32_t id; > + size_t size; > + > + /* > + * Poll the vq till the status changed to STOP. This happens when > + * the guest finishes reporting hints or the migration thread actively > + * stops the reporting. > + */ > + while (dev->free_page_report_status != FREE_PAGE_REPORT_S_STOP) { > + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); > + if (!elem) { > + continue; > + } > + > + if (elem->out_num) { > + size = iov_to_buf(elem->out_sg, elem->out_num, 0, &id, sizeof(id)); > + virtqueue_push(vq, elem, size); > + g_free(elem); > + if (unlikely(size != sizeof(id))) { > + warn_report("%s: received an incorrect cmd id", __func__); > + break; > + } > + if (id == dev->free_page_report_cmd_id) { > + dev->free_page_report_status = FREE_PAGE_REPORT_S_START; > + } else if (dev->free_page_report_status == > + FREE_PAGE_REPORT_S_START) { > + /* > + * Stop the optimization only when it has started. This avoids > + * obsolete stop sign for the previous command. > + */ > + dev->free_page_report_status = FREE_PAGE_REPORT_S_STOP; > + break; > + } > + } > + > + if (elem->in_num) { > + if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START && > + !dev->poison_val) { > + qemu_guest_free_page_hint(elem->in_sg[0].iov_base, > + elem->in_sg[0].iov_len); > + } > + virtqueue_push(vq, elem, 0); > + g_free(elem); > + } > + } > + return NULL; > +} > + > +static bool virtio_balloon_free_page_support(void *opaque) > +{ > + VirtIOBalloon *s = opaque; > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > + > + return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT); > +} > + > +static void virtio_balloon_free_page_start(void *opaque) > +{ > + VirtIOBalloon *s = opaque; > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > + > + if (unlikely(s->free_page_report_cmd_id == UINT_MAX)) { > + s->free_page_report_cmd_id = > + VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN; > + } else { > + s->free_page_report_cmd_id++; > + } > + > + s->free_page_report_status = FREE_PAGE_REPORT_S_REQUESTED; > + virtio_notify_config(vdev); > + qemu_thread_create(&s->free_page_thread, "free_page_optimization_thread", > + virtio_balloon_poll_free_page_hints, s, > + QEMU_THREAD_JOINABLE); > +} > + > +static void virtio_balloon_free_page_stop(void *opaque) > +{ > + VirtIOBalloon *s = opaque; > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > + > + if (s->free_page_report_status == FREE_PAGE_REPORT_S_STOP) { > + return; > + } > + > + s->free_page_report_status = FREE_PAGE_REPORT_S_STOP; > + virtio_notify_config(vdev); > + qemu_thread_join(&s->free_page_thread); > +} > + > static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) > { > VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); > @@ -315,6 +410,15 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) > > config.num_pages = cpu_to_le32(dev->num_pages); > config.actual = cpu_to_le32(dev->actual); > + config.poison_val = cpu_to_le32(dev->poison_val); > + > + if (dev->free_page_report_status == FREE_PAGE_REPORT_S_STOP) { > + config.free_page_report_cmd_id = > + cpu_to_le32(VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID); > + } else { > + config.free_page_report_cmd_id = > + cpu_to_le32(dev->free_page_report_cmd_id); > + } > > trace_virtio_balloon_get_config(config.num_pages, config.actual); > memcpy(config_data, &config, sizeof(struct virtio_balloon_config)); > @@ -368,6 +472,7 @@ static void virtio_balloon_set_config(VirtIODevice *vdev, > ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT), > &error_abort); > } > + dev->poison_val = le32_to_cpu(config.poison_val); > trace_virtio_balloon_set_config(dev->actual, oldactual); > } > > @@ -377,6 +482,11 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, > VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); > f |= dev->host_features; > virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); > + > + if (dev->host_features & 1ULL << VIRTIO_BALLOON_F_FREE_PAGE_HINT) { > + virtio_add_feature(&f, VIRTIO_BALLOON_F_PAGE_POISON); > + } > + > return f; > } > > @@ -413,6 +523,18 @@ static int virtio_balloon_post_load_device(void *opaque, int version_id) > return 0; > } > > +static const VMStateDescription vmstate_virtio_balloon_free_page_report = { > + .name = "virtio-balloon-device/free-page-report", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = virtio_balloon_free_page_support, > + .fields = (VMStateField[]) { > + VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon), > + VMSTATE_UINT32(poison_val, VirtIOBalloon), > + VMSTATE_END_OF_LIST() > + } > +}; > + > static const VMStateDescription vmstate_virtio_balloon_device = { > .name = "virtio-balloon-device", > .version_id = 1, > @@ -423,30 +545,30 @@ static const VMStateDescription vmstate_virtio_balloon_device = { > VMSTATE_UINT32(actual, VirtIOBalloon), > VMSTATE_END_OF_LIST() > }, > + .subsections = (const VMStateDescription * []) { > + &vmstate_virtio_balloon_free_page_report, > + NULL > + } > }; > > static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) > { > VirtIODevice *vdev = VIRTIO_DEVICE(dev); > VirtIOBalloon *s = VIRTIO_BALLOON(dev); > - int ret; > > virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON, > sizeof(struct virtio_balloon_config)); > > - ret = qemu_add_balloon_handler(virtio_balloon_to_target, > - virtio_balloon_stat, s); > - > - if (ret < 0) { > - error_setg(errp, "Only one balloon device is supported"); > - virtio_cleanup(vdev); > - return; > - } > - > s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); > - > + if (virtio_has_feature(s->host_features, > + VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { > + s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, NULL); > + s->free_page_report_status = FREE_PAGE_REPORT_S_STOP; > + s->free_page_report_cmd_id = > + VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN - 1; > + } > reset_stats(s); > } > > @@ -475,11 +597,27 @@ static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status) > { > VirtIOBalloon *s = VIRTIO_BALLOON(vdev); > > - if (!s->stats_vq_elem && vdev->vm_running && > - (status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) { > - /* poll stats queue for the element we have discarded when the VM > - * was stopped */ > - virtio_balloon_receive_stats(vdev, s->svq); > + if (status & VIRTIO_CONFIG_S_DRIVER_OK) { > + if (!s->stats_vq_elem && vdev->vm_running && > + virtqueue_rewind(s->svq, 1)) { > + /* > + * Poll stats queue for the element we have discarded when the VM > + * was stopped. > + */ > + virtio_balloon_receive_stats(vdev, s->svq); > + } > + > + if (virtio_balloon_free_page_support(s)) { > + qemu_add_balloon_handler(virtio_balloon_to_target, > + virtio_balloon_stat, > + virtio_balloon_free_page_support, > + virtio_balloon_free_page_start, > + virtio_balloon_free_page_stop, > + s); > + } else { > + qemu_add_balloon_handler(virtio_balloon_to_target, > + virtio_balloon_stat, NULL, NULL, NULL, s); > + } > } > } > > @@ -509,6 +647,8 @@ static const VMStateDescription vmstate_virtio_balloon = { > static Property virtio_balloon_properties[] = { > DEFINE_PROP_BIT("deflate-on-oom", VirtIOBalloon, host_features, > VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false), > + DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features, > + VIRTIO_BALLOON_F_FREE_PAGE_HINT, false), > DEFINE_PROP_END_OF_LIST(), > }; > > diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h > index 1ea13bd..ce77382 100644 > --- a/include/hw/virtio/virtio-balloon.h > +++ b/include/hw/virtio/virtio-balloon.h > @@ -23,6 +23,8 @@ > #define VIRTIO_BALLOON(obj) \ > OBJECT_CHECK(VirtIOBalloon, (obj), TYPE_VIRTIO_BALLOON) > > +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN 0x80000000 > + > typedef struct virtio_balloon_stat VirtIOBalloonStat; > > typedef struct virtio_balloon_stat_modern { > @@ -31,15 +33,25 @@ typedef struct virtio_balloon_stat_modern { > uint64_t val; > } VirtIOBalloonStatModern; > > +enum virtio_balloon_free_page_report_status { > + FREE_PAGE_REPORT_S_REQUESTED, > + FREE_PAGE_REPORT_S_START, > + FREE_PAGE_REPORT_S_STOP, > +}; > + > typedef struct VirtIOBalloon { > VirtIODevice parent_obj; > - VirtQueue *ivq, *dvq, *svq; > + VirtQueue *ivq, *dvq, *svq, *free_page_vq; > + uint32_t free_page_report_status; > uint32_t num_pages; > uint32_t actual; > + uint32_t free_page_report_cmd_id; > + uint32_t poison_val; > uint64_t stats[VIRTIO_BALLOON_S_NR]; > VirtQueueElement *stats_vq_elem; > size_t stats_vq_offset; > QEMUTimer *stats_timer; > + QemuThread free_page_thread; > int64_t stats_last_update; > int64_t stats_poll_interval; > uint32_t host_features; > diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h > index 7b0a41b..f89e80f 100644 > --- a/include/standard-headers/linux/virtio_balloon.h > +++ b/include/standard-headers/linux/virtio_balloon.h > @@ -34,15 +34,22 @@ > #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ > #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ > #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ > +#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ > +#define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ > > /* Size of a PFN in the balloon interface. */ > #define VIRTIO_BALLOON_PFN_SHIFT 12 > > +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID 0 > struct virtio_balloon_config { > /* Number of pages host wants Guest to give up. */ > uint32_t num_pages; > /* Number of pages we've actually got in balloon. */ > uint32_t actual; > + /* Free page report command id, readonly by guest */ > + uint32_t free_page_report_cmd_id; > + /* Stores PAGE_POISON if page poisoning is in use */ > + uint32_t poison_val; > }; > > #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */ > diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h > index af49e19..16a2aae 100644 > --- a/include/sysemu/balloon.h > +++ b/include/sysemu/balloon.h > @@ -18,11 +18,22 @@ > > typedef void (QEMUBalloonEvent)(void *opaque, ram_addr_t target); > typedef void (QEMUBalloonStatus)(void *opaque, BalloonInfo *info); > +typedef bool (QEMUBalloonFreePageSupport)(void *opaque); > +typedef void (QEMUBalloonFreePageStart)(void *opaque); > +typedef void (QEMUBalloonFreePageStop)(void *opaque); Could you add some documentation about these? E.g. I think the assumption made here is that all memory is write-protected when QEMUBalloonFreePageStart runs. > > -int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, > - QEMUBalloonStatus *stat_func, void *opaque); > void qemu_remove_balloon_handler(void *opaque); > bool qemu_balloon_is_inhibited(void); > void qemu_balloon_inhibit(bool state); > +bool balloon_free_page_support(void); > +void balloon_free_page_start(void); > +void balloon_free_page_stop(void); > + > +void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn, > + QEMUBalloonStatus *stat_fn, > + QEMUBalloonFreePageSupport *free_page_support_fn, > + QEMUBalloonFreePageStart *free_page_start_fn, > + QEMUBalloonFreePageStop *free_page_stop_fn, > + void *opaque); > > #endif > -- > 1.8.3.1
On Fri, Mar 02, 2018 at 04:47:29PM +0800, Wei Wang wrote: > diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h > index af49e19..16a2aae 100644 > --- a/include/sysemu/balloon.h > +++ b/include/sysemu/balloon.h ... > +typedef void (QEMUBalloonFreePageStart)(void *opaque); > +typedef void (QEMUBalloonFreePageStop)(void *opaque); So I think the rule is that no bitmap sync must happen between these two, otherwise a hint might arrive and override the sync output. Should be documented I think.
On 03/03/2018 02:37 AM, Michael S. Tsirkin wrote: > On Fri, Mar 02, 2018 at 04:47:29PM +0800, Wei Wang wrote: >> diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h >> index af49e19..16a2aae 100644 >> --- a/include/sysemu/balloon.h >> +++ b/include/sysemu/balloon.h > ... > >> +typedef void (QEMUBalloonFreePageStart)(void *opaque); >> +typedef void (QEMUBalloonFreePageStop)(void *opaque); > So I think the rule is that no bitmap sync must happen > between these two, otherwise a hint might arrive and > override the sync output. > > Should be documented I think. > Yes, agree. How about adding the following new balloon API explanation to this patch's commit: - balloon_free_page_start: Callers call this API to obtain guest free page hints, and clear the related bits from the migration dirty bitmap. The whole process is implemented in a new thread independent of the migration thread. Free page hints imply the part of guest memory is likely to be free without a guarantee. That is, the reported free pages may not be free any more when QEMU receives them, so callers are responsible for detecting those pages that are not free pages after the bits are cleared from the dirty bitmap. To ensure the above, this API should be used when the migration dirty logging mechanism (e.g. guest memory write-protection) has started. - balloon_free_page_stop: Callers call this API to stop the guest from reporting free page hints. Bits from the dirty bitmap are safe to be cleared on condition that the dirty logging mechanism is recording pages that the guest has written to. To avoid the case that clearing bits of free page hints overrides the dirty bits offered by the dirty logging mechanism, this API is suggested to be called before QEMU synchronizes the dirty logging bitmap. - balloon_free_page_support: This API is called to check whether the balloon device supports the guest free page reporting feature. The balloon_free_page_start and balloon_free_page_stop APIs should be used only when this API returns true. Best, Wei
On Mon, Mar 05, 2018 at 11:36:15AM +0800, Wei Wang wrote: > On 03/03/2018 02:37 AM, Michael S. Tsirkin wrote: > > On Fri, Mar 02, 2018 at 04:47:29PM +0800, Wei Wang wrote: > > > diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h > > > index af49e19..16a2aae 100644 > > > --- a/include/sysemu/balloon.h > > > +++ b/include/sysemu/balloon.h > > ... > > > > > +typedef void (QEMUBalloonFreePageStart)(void *opaque); > > > +typedef void (QEMUBalloonFreePageStop)(void *opaque); > > So I think the rule is that no bitmap sync must happen > > between these two, otherwise a hint might arrive and > > override the sync output. > > > > Should be documented I think. > > > > Yes, agree. Ideally we'd also detect violations and trigger an assert. > How about adding the following new balloon API explanation to > this patch's commit: > > - balloon_free_page_start: Callers call this API to obtain guest free > page hints, and clear the related bits from the migration dirty > bitmap. > The whole process is implemented in a new thread independent of the > migration thread. Free page hints imply the part of guest memory is > likely to be free without a guarantee. That is, the reported free > pages > may not be free any more when QEMU receives them, so callers are > responsible for detecting those pages that are not free pages after > the > bits are cleared from the dirty bitmap. To ensure the above, this API > should be used when the migration dirty logging mechanism (e.g. > guest memory write-protection) has started. > > - balloon_free_page_stop: Callers call this API to stop the guest from > reporting free page hints. Bits from the dirty bitmap are safe to > be cleared on condition that the dirty logging mechanism is recording > pages that the guest has written to. To avoid the case that clearing > bits of free page hints overrides the dirty bits offered by the dirty > logging mechanism, this API is suggested to be called before QEMU > synchronizes the dirty logging bitmap. > > - balloon_free_page_support: This API is called to check whether the > balloon device supports the guest free page reporting feature. The > balloon_free_page_start and balloon_free_page_stop APIs should be used > only when this API returns true. > > > Best, > Wei I find this more confusing than explaining. Let me try balloon_free_page_start - start guest free page hint reporting. Note: balloon will report pages which were free at the time of this call. As the reporting happens asynchronously, we rely on dirty logging to be started before this call is made. The dirty logging bitmap must be synchronized before this call and then after balloon_free_page_stop. balloon_free_page_stop: stop the guest reporting of free pages. dirty logging bitmap can be synchronized after this point. No bitmap synchronizations are allowed between these two points.
On 03/05/2018 10:09 PM, Michael S. Tsirkin wrote: > On Mon, Mar 05, 2018 at 11:36:15AM +0800, Wei Wang wrote: >> On 03/03/2018 02:37 AM, Michael S. Tsirkin wrote: >>> On Fri, Mar 02, 2018 at 04:47:29PM +0800, Wei Wang wrote: >>>> diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h >>>> index af49e19..16a2aae 100644 >>>> --- a/include/sysemu/balloon.h >>>> +++ b/include/sysemu/balloon.h >>> ... >>> >>>> +typedef void (QEMUBalloonFreePageStart)(void *opaque); >>>> +typedef void (QEMUBalloonFreePageStop)(void *opaque); >>> So I think the rule is that no bitmap sync must happen >>> between these two, otherwise a hint might arrive and >>> override the sync output. >>> >>> Should be documented I think. >>> >> Yes, agree. > Ideally we'd also detect violations and trigger an assert. How about just invoking if (rs->free_page_support) balloon_free_page_stop(); at the beginning of migration_bitmap_sync()? (balloon_free_page_stop will just return if the optimization has stopped.) In this way, we can always have the guarantee that "no bitmap sync must happen between these two" > >> How about adding the following new balloon API explanation to >> this patch's commit: >> >> - balloon_free_page_start: Callers call this API to obtain guest free >> page hints, and clear the related bits from the migration dirty >> bitmap. >> The whole process is implemented in a new thread independent of the >> migration thread. Free page hints imply the part of guest memory is >> likely to be free without a guarantee. That is, the reported free >> pages >> may not be free any more when QEMU receives them, so callers are >> responsible for detecting those pages that are not free pages after >> the >> bits are cleared from the dirty bitmap. To ensure the above, this API >> should be used when the migration dirty logging mechanism (e.g. >> guest memory write-protection) has started. >> >> - balloon_free_page_stop: Callers call this API to stop the guest from >> reporting free page hints. Bits from the dirty bitmap are safe to >> be cleared on condition that the dirty logging mechanism is recording >> pages that the guest has written to. To avoid the case that clearing >> bits of free page hints overrides the dirty bits offered by the dirty >> logging mechanism, this API is suggested to be called before QEMU >> synchronizes the dirty logging bitmap. >> >> - balloon_free_page_support: This API is called to check whether the >> balloon device supports the guest free page reporting feature. The >> balloon_free_page_start and balloon_free_page_stop APIs should be used >> only when this API returns true. >> >> >> Best, >> Wei > I find this more confusing than explaining. > > Let me try > > balloon_free_page_start - start guest free page hint reporting. > Note: balloon will report pages which were free at the time > of this call. As the reporting happens asynchronously, > we rely on dirty logging to be started before this call is made. > > The dirty logging bitmap must be synchronized before this call > and then after balloon_free_page_stop. I think it would be better to remove the above one sentence. I agree "No dirty bitmap synchronizations are allowed between balloon_free_page_start and balloon_free_page_stop", but "The dirty logging bitmap MUST be synchronized before balloon_free_page_start" seems confusing, for example the bulk stage doesn't have to start with a bitmap sync. > > balloon_free_page_stop: stop the guest reporting > of free pages. dirty logging bitmap can be synchronized > after this point. > > No bitmap synchronizations are allowed between these two points. > Best, Wei
On Tue, Mar 06, 2018 at 09:54:49AM +0800, Wei Wang wrote: > On 03/05/2018 10:09 PM, Michael S. Tsirkin wrote: > > On Mon, Mar 05, 2018 at 11:36:15AM +0800, Wei Wang wrote: > > > On 03/03/2018 02:37 AM, Michael S. Tsirkin wrote: > > > > On Fri, Mar 02, 2018 at 04:47:29PM +0800, Wei Wang wrote: > > > > > diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h > > > > > index af49e19..16a2aae 100644 > > > > > --- a/include/sysemu/balloon.h > > > > > +++ b/include/sysemu/balloon.h > > > > ... > > > > > > > > > +typedef void (QEMUBalloonFreePageStart)(void *opaque); > > > > > +typedef void (QEMUBalloonFreePageStop)(void *opaque); > > > > So I think the rule is that no bitmap sync must happen > > > > between these two, otherwise a hint might arrive and > > > > override the sync output. > > > > > > > > Should be documented I think. > > > > > > > Yes, agree. > > Ideally we'd also detect violations and trigger an assert. > > How about just invoking > > if (rs->free_page_support) > balloon_free_page_stop(); > > at the beginning of migration_bitmap_sync()? (balloon_free_page_stop will > just return if the optimization has stopped.) > > In this way, we can always have the guarantee that "no bitmap sync must > happen between these two" Why not. And in fact you can do balloon_free_page_start at the end of sync. > > > > > > How about adding the following new balloon API explanation to > > > this patch's commit: > > > > > > - balloon_free_page_start: Callers call this API to obtain guest free > > > page hints, and clear the related bits from the migration dirty > > > bitmap. > > > The whole process is implemented in a new thread independent of the > > > migration thread. Free page hints imply the part of guest memory is > > > likely to be free without a guarantee. That is, the reported free > > > pages > > > may not be free any more when QEMU receives them, so callers are > > > responsible for detecting those pages that are not free pages after > > > the > > > bits are cleared from the dirty bitmap. To ensure the above, this API > > > should be used when the migration dirty logging mechanism (e.g. > > > guest memory write-protection) has started. > > > > > > - balloon_free_page_stop: Callers call this API to stop the guest from > > > reporting free page hints. Bits from the dirty bitmap are safe to > > > be cleared on condition that the dirty logging mechanism is recording > > > pages that the guest has written to. To avoid the case that clearing > > > bits of free page hints overrides the dirty bits offered by the dirty > > > logging mechanism, this API is suggested to be called before QEMU > > > synchronizes the dirty logging bitmap. > > > > > > - balloon_free_page_support: This API is called to check whether the > > > balloon device supports the guest free page reporting feature. The > > > balloon_free_page_start and balloon_free_page_stop APIs should be used > > > only when this API returns true. > > > > > > > > > Best, > > > Wei > > I find this more confusing than explaining. > > > > Let me try > > > > balloon_free_page_start - start guest free page hint reporting. > > Note: balloon will report pages which were free at the time > > of this call. As the reporting happens asynchronously, > > we rely on dirty logging to be started before this call is made. > > > > The dirty logging bitmap must be synchronized before this call > > and then after balloon_free_page_stop. > > I think it would be better to remove the above one sentence. > I agree "No dirty bitmap synchronizations are allowed between > balloon_free_page_start and balloon_free_page_stop", but "The dirty logging > bitmap MUST be synchronized before balloon_free_page_start" seems confusing, > for example the bulk stage doesn't have to start with a bitmap sync. OK I guess "dirty logging must be enabled" would be better. And with above we can say hinting must be disabled before logging bitmap is synchronized. > > > > > balloon_free_page_stop: stop the guest reporting > > of free pages. dirty logging bitmap can be synchronized > > after this point. > > > > No bitmap synchronizations are allowed between these two points. > > > > Best, > Wei
On 03/06/2018 10:38 AM, Michael S. Tsirkin wrote: > On Tue, Mar 06, 2018 at 09:54:49AM +0800, Wei Wang wrote: >> On 03/05/2018 10:09 PM, Michael S. Tsirkin wrote: >>> On Mon, Mar 05, 2018 at 11:36:15AM +0800, Wei Wang wrote: >>>> On 03/03/2018 02:37 AM, Michael S. Tsirkin wrote: >>>>> On Fri, Mar 02, 2018 at 04:47:29PM +0800, Wei Wang wrote: >>>>>> diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h >>>>>> index af49e19..16a2aae 100644 >>>>>> --- a/include/sysemu/balloon.h >>>>>> +++ b/include/sysemu/balloon.h >>>>> ... >>>>> >>>>>> +typedef void (QEMUBalloonFreePageStart)(void *opaque); >>>>>> +typedef void (QEMUBalloonFreePageStop)(void *opaque); >>>>> So I think the rule is that no bitmap sync must happen >>>>> between these two, otherwise a hint might arrive and >>>>> override the sync output. >>>>> >>>>> Should be documented I think. >>>>> >>>> Yes, agree. >>> Ideally we'd also detect violations and trigger an assert. >> How about just invoking >> >> if (rs->free_page_support) >> balloon_free_page_stop(); >> >> at the beginning of migration_bitmap_sync()? (balloon_free_page_stop will >> just return if the optimization has stopped.) >> >> In this way, we can always have the guarantee that "no bitmap sync must >> happen between these two" > Why not. And in fact you can do balloon_free_page_start at the > end of sync. Sounds good. I implemented it this way in v4. In this case, we actually extend the usage of this optimization beyond the bulk stage. Though it shows similar test results as v3 which optimizes bulk stage only, but still good to leave the optimization there for the 2nd stage onward as well. It will speed up 2nd stage onward, for example, in this scenario: the guest writes page A, and then free(A) soon. After QEMU syncs bitmap, it sees bit of A is set, but now A is free page, the optimization can help to clear A from the bitmap. Best, Wei
diff --git a/balloon.c b/balloon.c index d8dd6fe..b0b0749 100644 --- a/balloon.c +++ b/balloon.c @@ -36,6 +36,9 @@ static QEMUBalloonEvent *balloon_event_fn; static QEMUBalloonStatus *balloon_stat_fn; +static QEMUBalloonFreePageSupport *balloon_free_page_support_fn; +static QEMUBalloonFreePageStart *balloon_free_page_start_fn; +static QEMUBalloonFreePageStop *balloon_free_page_stop_fn; static void *balloon_opaque; static bool balloon_inhibited; @@ -64,19 +67,42 @@ static bool have_balloon(Error **errp) return true; } -int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, - QEMUBalloonStatus *stat_func, void *opaque) +bool balloon_free_page_support(void) { - if (balloon_event_fn || balloon_stat_fn || balloon_opaque) { - /* We're already registered one balloon handler. How many can - * a guest really have? - */ - return -1; + return balloon_free_page_support_fn && + balloon_free_page_support_fn(balloon_opaque); +} + +void balloon_free_page_start(void) +{ + balloon_free_page_start_fn(balloon_opaque); +} + +void balloon_free_page_stop(void) +{ + balloon_free_page_stop_fn(balloon_opaque); +} + +void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn, + QEMUBalloonStatus *stat_fn, + QEMUBalloonFreePageSupport *free_page_support_fn, + QEMUBalloonFreePageStart *free_page_start_fn, + QEMUBalloonFreePageStop *free_page_stop_fn, + void *opaque) +{ + if (balloon_event_fn || balloon_stat_fn || balloon_free_page_support_fn || + balloon_free_page_start_fn || balloon_free_page_stop_fn || + balloon_opaque) { + /* We already registered one balloon handler. */ + return; } - balloon_event_fn = event_func; - balloon_stat_fn = stat_func; + + balloon_event_fn = event_fn; + balloon_stat_fn = stat_fn; + balloon_free_page_support_fn = free_page_support_fn; + balloon_free_page_start_fn = free_page_start_fn; + balloon_free_page_stop_fn = free_page_stop_fn; balloon_opaque = opaque; - return 0; } void qemu_remove_balloon_handler(void *opaque) @@ -86,6 +112,9 @@ void qemu_remove_balloon_handler(void *opaque) } balloon_event_fn = NULL; balloon_stat_fn = NULL; + balloon_free_page_support_fn = NULL; + balloon_free_page_start_fn = NULL; + balloon_free_page_stop_fn = NULL; balloon_opaque = NULL; } diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 4822449..4607879 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -31,6 +31,7 @@ #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" +#include "migration/misc.h" #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) @@ -308,6 +309,100 @@ out: } } +static void *virtio_balloon_poll_free_page_hints(void *opaque) +{ + VirtQueueElement *elem; + VirtIOBalloon *dev = opaque; + VirtQueue *vq = dev->free_page_vq; + uint32_t id; + size_t size; + + /* + * Poll the vq till the status changed to STOP. This happens when + * the guest finishes reporting hints or the migration thread actively + * stops the reporting. + */ + while (dev->free_page_report_status != FREE_PAGE_REPORT_S_STOP) { + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + continue; + } + + if (elem->out_num) { + size = iov_to_buf(elem->out_sg, elem->out_num, 0, &id, sizeof(id)); + virtqueue_push(vq, elem, size); + g_free(elem); + if (unlikely(size != sizeof(id))) { + warn_report("%s: received an incorrect cmd id", __func__); + break; + } + if (id == dev->free_page_report_cmd_id) { + dev->free_page_report_status = FREE_PAGE_REPORT_S_START; + } else if (dev->free_page_report_status == + FREE_PAGE_REPORT_S_START) { + /* + * Stop the optimization only when it has started. This avoids + * obsolete stop sign for the previous command. + */ + dev->free_page_report_status = FREE_PAGE_REPORT_S_STOP; + break; + } + } + + if (elem->in_num) { + if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START && + !dev->poison_val) { + qemu_guest_free_page_hint(elem->in_sg[0].iov_base, + elem->in_sg[0].iov_len); + } + virtqueue_push(vq, elem, 0); + g_free(elem); + } + } + return NULL; +} + +static bool virtio_balloon_free_page_support(void *opaque) +{ + VirtIOBalloon *s = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT); +} + +static void virtio_balloon_free_page_start(void *opaque) +{ + VirtIOBalloon *s = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + if (unlikely(s->free_page_report_cmd_id == UINT_MAX)) { + s->free_page_report_cmd_id = + VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN; + } else { + s->free_page_report_cmd_id++; + } + + s->free_page_report_status = FREE_PAGE_REPORT_S_REQUESTED; + virtio_notify_config(vdev); + qemu_thread_create(&s->free_page_thread, "free_page_optimization_thread", + virtio_balloon_poll_free_page_hints, s, + QEMU_THREAD_JOINABLE); +} + +static void virtio_balloon_free_page_stop(void *opaque) +{ + VirtIOBalloon *s = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + if (s->free_page_report_status == FREE_PAGE_REPORT_S_STOP) { + return; + } + + s->free_page_report_status = FREE_PAGE_REPORT_S_STOP; + virtio_notify_config(vdev); + qemu_thread_join(&s->free_page_thread); +} + static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) { VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); @@ -315,6 +410,15 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) config.num_pages = cpu_to_le32(dev->num_pages); config.actual = cpu_to_le32(dev->actual); + config.poison_val = cpu_to_le32(dev->poison_val); + + if (dev->free_page_report_status == FREE_PAGE_REPORT_S_STOP) { + config.free_page_report_cmd_id = + cpu_to_le32(VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID); + } else { + config.free_page_report_cmd_id = + cpu_to_le32(dev->free_page_report_cmd_id); + } trace_virtio_balloon_get_config(config.num_pages, config.actual); memcpy(config_data, &config, sizeof(struct virtio_balloon_config)); @@ -368,6 +472,7 @@ static void virtio_balloon_set_config(VirtIODevice *vdev, ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT), &error_abort); } + dev->poison_val = le32_to_cpu(config.poison_val); trace_virtio_balloon_set_config(dev->actual, oldactual); } @@ -377,6 +482,11 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); f |= dev->host_features; virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); + + if (dev->host_features & 1ULL << VIRTIO_BALLOON_F_FREE_PAGE_HINT) { + virtio_add_feature(&f, VIRTIO_BALLOON_F_PAGE_POISON); + } + return f; } @@ -413,6 +523,18 @@ static int virtio_balloon_post_load_device(void *opaque, int version_id) return 0; } +static const VMStateDescription vmstate_virtio_balloon_free_page_report = { + .name = "virtio-balloon-device/free-page-report", + .version_id = 1, + .minimum_version_id = 1, + .needed = virtio_balloon_free_page_support, + .fields = (VMStateField[]) { + VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon), + VMSTATE_UINT32(poison_val, VirtIOBalloon), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_virtio_balloon_device = { .name = "virtio-balloon-device", .version_id = 1, @@ -423,30 +545,30 @@ static const VMStateDescription vmstate_virtio_balloon_device = { VMSTATE_UINT32(actual, VirtIOBalloon), VMSTATE_END_OF_LIST() }, + .subsections = (const VMStateDescription * []) { + &vmstate_virtio_balloon_free_page_report, + NULL + } }; static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOBalloon *s = VIRTIO_BALLOON(dev); - int ret; virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON, sizeof(struct virtio_balloon_config)); - ret = qemu_add_balloon_handler(virtio_balloon_to_target, - virtio_balloon_stat, s); - - if (ret < 0) { - error_setg(errp, "Only one balloon device is supported"); - virtio_cleanup(vdev); - return; - } - s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); - + if (virtio_has_feature(s->host_features, + VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, NULL); + s->free_page_report_status = FREE_PAGE_REPORT_S_STOP; + s->free_page_report_cmd_id = + VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN - 1; + } reset_stats(s); } @@ -475,11 +597,27 @@ static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status) { VirtIOBalloon *s = VIRTIO_BALLOON(vdev); - if (!s->stats_vq_elem && vdev->vm_running && - (status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) { - /* poll stats queue for the element we have discarded when the VM - * was stopped */ - virtio_balloon_receive_stats(vdev, s->svq); + if (status & VIRTIO_CONFIG_S_DRIVER_OK) { + if (!s->stats_vq_elem && vdev->vm_running && + virtqueue_rewind(s->svq, 1)) { + /* + * Poll stats queue for the element we have discarded when the VM + * was stopped. + */ + virtio_balloon_receive_stats(vdev, s->svq); + } + + if (virtio_balloon_free_page_support(s)) { + qemu_add_balloon_handler(virtio_balloon_to_target, + virtio_balloon_stat, + virtio_balloon_free_page_support, + virtio_balloon_free_page_start, + virtio_balloon_free_page_stop, + s); + } else { + qemu_add_balloon_handler(virtio_balloon_to_target, + virtio_balloon_stat, NULL, NULL, NULL, s); + } } } @@ -509,6 +647,8 @@ static const VMStateDescription vmstate_virtio_balloon = { static Property virtio_balloon_properties[] = { DEFINE_PROP_BIT("deflate-on-oom", VirtIOBalloon, host_features, VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false), + DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features, + VIRTIO_BALLOON_F_FREE_PAGE_HINT, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h index 1ea13bd..ce77382 100644 --- a/include/hw/virtio/virtio-balloon.h +++ b/include/hw/virtio/virtio-balloon.h @@ -23,6 +23,8 @@ #define VIRTIO_BALLOON(obj) \ OBJECT_CHECK(VirtIOBalloon, (obj), TYPE_VIRTIO_BALLOON) +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN 0x80000000 + typedef struct virtio_balloon_stat VirtIOBalloonStat; typedef struct virtio_balloon_stat_modern { @@ -31,15 +33,25 @@ typedef struct virtio_balloon_stat_modern { uint64_t val; } VirtIOBalloonStatModern; +enum virtio_balloon_free_page_report_status { + FREE_PAGE_REPORT_S_REQUESTED, + FREE_PAGE_REPORT_S_START, + FREE_PAGE_REPORT_S_STOP, +}; + typedef struct VirtIOBalloon { VirtIODevice parent_obj; - VirtQueue *ivq, *dvq, *svq; + VirtQueue *ivq, *dvq, *svq, *free_page_vq; + uint32_t free_page_report_status; uint32_t num_pages; uint32_t actual; + uint32_t free_page_report_cmd_id; + uint32_t poison_val; uint64_t stats[VIRTIO_BALLOON_S_NR]; VirtQueueElement *stats_vq_elem; size_t stats_vq_offset; QEMUTimer *stats_timer; + QemuThread free_page_thread; int64_t stats_last_update; int64_t stats_poll_interval; uint32_t host_features; diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h index 7b0a41b..f89e80f 100644 --- a/include/standard-headers/linux/virtio_balloon.h +++ b/include/standard-headers/linux/virtio_balloon.h @@ -34,15 +34,22 @@ #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ +#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ +#define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID 0 struct virtio_balloon_config { /* Number of pages host wants Guest to give up. */ uint32_t num_pages; /* Number of pages we've actually got in balloon. */ uint32_t actual; + /* Free page report command id, readonly by guest */ + uint32_t free_page_report_cmd_id; + /* Stores PAGE_POISON if page poisoning is in use */ + uint32_t poison_val; }; #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */ diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h index af49e19..16a2aae 100644 --- a/include/sysemu/balloon.h +++ b/include/sysemu/balloon.h @@ -18,11 +18,22 @@ typedef void (QEMUBalloonEvent)(void *opaque, ram_addr_t target); typedef void (QEMUBalloonStatus)(void *opaque, BalloonInfo *info); +typedef bool (QEMUBalloonFreePageSupport)(void *opaque); +typedef void (QEMUBalloonFreePageStart)(void *opaque); +typedef void (QEMUBalloonFreePageStop)(void *opaque); -int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, - QEMUBalloonStatus *stat_func, void *opaque); void qemu_remove_balloon_handler(void *opaque); bool qemu_balloon_is_inhibited(void); void qemu_balloon_inhibit(bool state); +bool balloon_free_page_support(void); +void balloon_free_page_start(void); +void balloon_free_page_stop(void); + +void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn, + QEMUBalloonStatus *stat_fn, + QEMUBalloonFreePageSupport *free_page_support_fn, + QEMUBalloonFreePageStart *free_page_start_fn, + QEMUBalloonFreePageStop *free_page_stop_fn, + void *opaque); #endif