Message ID | 20190710195303.19690-1-nitesh@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [QEMU] virtio-baloon: Support for page hinting | expand |
On Wed, Jul 10, 2019 at 12:53 PM Nitesh Narayan Lal <nitesh@redhat.com> wrote: > > Enables QEMU to perform madvise free on the memory range reported > by the vm. > > Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com> > --- > hw/virtio/trace-events | 1 + > hw/virtio/virtio-balloon.c | 59 +++++++++++++++++++ > include/hw/virtio/virtio-balloon.h | 2 +- > include/qemu/osdep.h | 7 +++ > .../standard-headers/linux/virtio_balloon.h | 1 + > 5 files changed, 69 insertions(+), 1 deletion(-) > > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events > index e28ba48da6..f703a22d36 100644 > --- a/hw/virtio/trace-events > +++ b/hw/virtio/trace-events > @@ -46,6 +46,7 @@ virtio_balloon_handle_output(const char *name, uint64_t gpa) "section name: %s g > virtio_balloon_get_config(uint32_t num_pages, uint32_t actual) "num_pages: %d actual: %d" > virtio_balloon_set_config(uint32_t actual, uint32_t oldactual) "actual: %d oldactual: %d" > virtio_balloon_to_target(uint64_t target, uint32_t num_pages) "balloon target: 0x%"PRIx64" num_pages: %d" > +virtio_balloon_hinting_request(unsigned long pfn, unsigned int num_pages) "Guest page hinting request PFN:%lu size: %d" > > # virtio-mmio.c > virtio_mmio_read(uint64_t offset) "virtio_mmio_read offset 0x%" PRIx64 > diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c > index 2112874055..5d186707b5 100644 > --- a/hw/virtio/virtio-balloon.c > +++ b/hw/virtio/virtio-balloon.c > @@ -34,6 +34,9 @@ > > #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) > > +#define VIRTIO_BALLOON_PAGE_HINTING_MAX_PAGES 16 > +void free_mem_range(uint64_t addr, uint64_t len); > + The definition you have here is unused. I think you can drop it. Also why do you need this forward declaration? Couldn't you just leave free_mem_range below as a static and still have this compile? > struct PartiallyBalloonedPage { > RAMBlock *rb; > ram_addr_t base; > @@ -328,6 +331,58 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, > balloon_stats_change_timer(s, 0); > } > > +void free_mem_range(uint64_t addr, uint64_t len) > +{ > + int ret = 0; > + void *hvaddr_to_free; > + MemoryRegionSection mrs = memory_region_find(get_system_memory(), > + addr, 1); > + if (!mrs.mr) { > + warn_report("%s:No memory is mapped at address 0x%lu", __func__, addr); > + return; > + } > + > + if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) { > + warn_report("%s:Memory at address 0x%s is not RAM:0x%lu", __func__, > + HWADDR_PRIx, addr); > + memory_region_unref(mrs.mr); > + return; > + } > + > + hvaddr_to_free = qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region); > + trace_virtio_balloon_hinting_request(addr, len); > + ret = qemu_madvise(hvaddr_to_free,len, QEMU_MADV_FREE); > + if (ret == -1) { > + warn_report("%s: Madvise failed with error:%d", __func__, ret); > + } > +} > + > +static void virtio_balloon_handle_page_hinting(VirtIODevice *vdev, > + VirtQueue *vq) > +{ > + VirtQueueElement *elem; > + size_t offset = 0; > + uint64_t gpa, len; > + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); > + if (!elem) { > + return; > + } > + /* For pending hints which are < max_pages(16), 'gpa != 0' ensures that we > + * only read the buffer which holds a valid PFN value. > + * TODO: Find a better way to do this. > + */ I'm not sure this comment makes much sense to me. Shouldn't the iov_to_buf be limiting you anyway? Why do you need the additional gpa check? > + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &gpa, 8) == 8 && gpa != 0) { > + offset += 8; > + offset += iov_to_buf(elem->out_sg, elem->out_num, offset, &len, 8); Why pull this out as two separate buffers? Why not just define a structure that consists of the two uint64_t values and then pull the entire thing as one buffer? I'm pretty sure the solution as you have it now opens you up to an error since you could have a malicious guest only give you a part of the structure and you really should be verifying you get the entire structure. > + if (!qemu_balloon_is_inhibited()) { > + free_mem_range(gpa, len); > + } > + } > + virtqueue_push(vq, elem, offset); > + virtio_notify(vdev, vq); > + g_free(elem); > +} > + > static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) > { > VirtIOBalloon *s = VIRTIO_BALLOON(vdev); > @@ -694,6 +749,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, > VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); > f |= dev->host_features; > virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); > + virtio_add_feature(&f, VIRTIO_BALLOON_F_HINTING); > > return f; > } > @@ -780,6 +836,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) > s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); > + s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_page_hinting); > > if (virtio_has_feature(s->host_features, > VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { > @@ -875,6 +932,8 @@ static void virtio_balloon_instance_init(Object *obj) > > object_property_add(obj, "guest-stats", "guest statistics", > balloon_stats_get_all, NULL, NULL, s, NULL); > + object_property_add(obj, "guest-page-hinting", "guest page hinting", > + NULL, NULL, NULL, s, NULL); > > object_property_add(obj, "guest-stats-polling-interval", "int", > balloon_stats_get_poll_interval, > diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h > index 1afafb12f6..a58b24fdf2 100644 > --- a/include/hw/virtio/virtio-balloon.h > +++ b/include/hw/virtio/virtio-balloon.h > @@ -44,7 +44,7 @@ enum virtio_balloon_free_page_report_status { > > typedef struct VirtIOBalloon { > VirtIODevice parent_obj; > - VirtQueue *ivq, *dvq, *svq, *free_page_vq; > + VirtQueue *ivq, *dvq, *svq, *free_page_vq, *hvq; > uint32_t free_page_report_status; > uint32_t num_pages; > uint32_t actual; > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > index af2b91f0b8..bb9207e7f4 100644 > --- a/include/qemu/osdep.h > +++ b/include/qemu/osdep.h > @@ -360,6 +360,11 @@ void qemu_anon_ram_free(void *ptr, size_t size); > #else > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > #endif > +#ifdef MADV_FREE > +#define QEMU_MADV_FREE MADV_FREE > +#else > +#define QEMU_MADV_FREE QEMU_MADV_INVALID > +#endif As I mentioned before it might make more sense to use MADV_DONTNEED instead of just disabling this functionality if the host kernel doesn't have MADV_FREE support. That way you would still have the functionality on kernels prior to 4.5 if they need it. > #elif defined(CONFIG_POSIX_MADVISE) > > @@ -373,6 +378,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); > #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > +#define QEMU_MADV_FREE QEMU_MADV_INVALID Same here. It might make more sense to use the POSIX_MADV_DONTNEED instead of just making it invalid. > #else /* no-op */ > > @@ -386,6 +392,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); > #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > +#define QEMU_MADV_FREE QEMU_MADV_INVALID > > #endif > > diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h > index 9375ca2a70..f9e3e82562 100644 > --- a/include/standard-headers/linux/virtio_balloon.h > +++ b/include/standard-headers/linux/virtio_balloon.h > @@ -36,6 +36,7 @@ > #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ > #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ > #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ > +#define VIRTIO_BALLOON_F_HINTING 5 /* Page hinting virtqueue */ > > /* Size of a PFN in the balloon interface. */ > #define VIRTIO_BALLOON_PFN_SHIFT 12 > -- > 2.21.0 >
On Wed, 10 Jul 2019 15:53:03 -0400 Nitesh Narayan Lal <nitesh@redhat.com> wrote: $SUBJECT: s/baloon/balloon/ > Enables QEMU to perform madvise free on the memory range reported > by the vm. [No comments on the actual functionality; just some stuff I noticed.] > > Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com> > --- > hw/virtio/trace-events | 1 + > hw/virtio/virtio-balloon.c | 59 +++++++++++++++++++ > include/hw/virtio/virtio-balloon.h | 2 +- > include/qemu/osdep.h | 7 +++ > .../standard-headers/linux/virtio_balloon.h | 1 + > 5 files changed, 69 insertions(+), 1 deletion(-) > (...) > diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c > index 2112874055..5d186707b5 100644 > --- a/hw/virtio/virtio-balloon.c > +++ b/hw/virtio/virtio-balloon.c > @@ -34,6 +34,9 @@ > > #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) > > +#define VIRTIO_BALLOON_PAGE_HINTING_MAX_PAGES 16 > +void free_mem_range(uint64_t addr, uint64_t len); > + > struct PartiallyBalloonedPage { > RAMBlock *rb; > ram_addr_t base; > @@ -328,6 +331,58 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, > balloon_stats_change_timer(s, 0); > } > > +void free_mem_range(uint64_t addr, uint64_t len) > +{ > + int ret = 0; > + void *hvaddr_to_free; > + MemoryRegionSection mrs = memory_region_find(get_system_memory(), > + addr, 1); > + if (!mrs.mr) { > + warn_report("%s:No memory is mapped at address 0x%lu", __func__, addr); Indentation seems to be off here (also in other places; please double check.) > + return; > + } > + > + if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) { > + warn_report("%s:Memory at address 0x%s is not RAM:0x%lu", __func__, > + HWADDR_PRIx, addr); > + memory_region_unref(mrs.mr); > + return; > + } > + > + hvaddr_to_free = qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region); > + trace_virtio_balloon_hinting_request(addr, len); > + ret = qemu_madvise(hvaddr_to_free,len, QEMU_MADV_FREE); > + if (ret == -1) { > + warn_report("%s: Madvise failed with error:%d", __func__, ret); > + } > +} > + > +static void virtio_balloon_handle_page_hinting(VirtIODevice *vdev, > + VirtQueue *vq) > +{ > + VirtQueueElement *elem; > + size_t offset = 0; > + uint64_t gpa, len; > + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); > + if (!elem) { > + return; > + } > + /* For pending hints which are < max_pages(16), 'gpa != 0' ensures that we > + * only read the buffer which holds a valid PFN value. > + * TODO: Find a better way to do this. > + */ > + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &gpa, 8) == 8 && gpa != 0) { > + offset += 8; > + offset += iov_to_buf(elem->out_sg, elem->out_num, offset, &len, 8); > + if (!qemu_balloon_is_inhibited()) { > + free_mem_range(gpa, len); > + } > + } > + virtqueue_push(vq, elem, offset); > + virtio_notify(vdev, vq); > + g_free(elem); > +} > + > static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) > { > VirtIOBalloon *s = VIRTIO_BALLOON(vdev); > @@ -694,6 +749,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, > VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); > f |= dev->host_features; > virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); > + virtio_add_feature(&f, VIRTIO_BALLOON_F_HINTING); I don't think you can add this unconditionally if you want to keep this migratable. This should be done via a property (as for deflate-on-oom and free-page-hint) so it can be turned off in compat machines. > > return f; > } > @@ -780,6 +836,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) > s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); > + s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_page_hinting); This should probably be conditional in the same way as the free page hint queue (also see above). > > if (virtio_has_feature(s->host_features, > VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { > @@ -875,6 +932,8 @@ static void virtio_balloon_instance_init(Object *obj) > > object_property_add(obj, "guest-stats", "guest statistics", > balloon_stats_get_all, NULL, NULL, s, NULL); > + object_property_add(obj, "guest-page-hinting", "guest page hinting", > + NULL, NULL, NULL, s, NULL); This object does not have any accessors; what purpose does it serve? > > object_property_add(obj, "guest-stats-polling-interval", "int", > balloon_stats_get_poll_interval, (...) > diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h > index 9375ca2a70..f9e3e82562 100644 > --- a/include/standard-headers/linux/virtio_balloon.h > +++ b/include/standard-headers/linux/virtio_balloon.h > @@ -36,6 +36,7 @@ > #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ > #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ > #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ > +#define VIRTIO_BALLOON_F_HINTING 5 /* Page hinting virtqueue */ > > /* Size of a PFN in the balloon interface. */ > #define VIRTIO_BALLOON_PFN_SHIFT 12 Please split off any update to these headers into a separate patch, so that it can be replaced by a proper headers update when it is merged.
On 7/11/19 4:49 AM, Cornelia Huck wrote: > On Wed, 10 Jul 2019 15:53:03 -0400 > Nitesh Narayan Lal <nitesh@redhat.com> wrote: > > > $SUBJECT: s/baloon/balloon/ > >> Enables QEMU to perform madvise free on the memory range reported >> by the vm. > [No comments on the actual functionality; just some stuff I noticed.] > >> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com> >> --- >> hw/virtio/trace-events | 1 + >> hw/virtio/virtio-balloon.c | 59 +++++++++++++++++++ >> include/hw/virtio/virtio-balloon.h | 2 +- >> include/qemu/osdep.h | 7 +++ >> .../standard-headers/linux/virtio_balloon.h | 1 + >> 5 files changed, 69 insertions(+), 1 deletion(-) >> > (...) > >> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c >> index 2112874055..5d186707b5 100644 >> --- a/hw/virtio/virtio-balloon.c >> +++ b/hw/virtio/virtio-balloon.c >> @@ -34,6 +34,9 @@ >> >> #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) >> >> +#define VIRTIO_BALLOON_PAGE_HINTING_MAX_PAGES 16 >> +void free_mem_range(uint64_t addr, uint64_t len); >> + >> struct PartiallyBalloonedPage { >> RAMBlock *rb; >> ram_addr_t base; >> @@ -328,6 +331,58 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, >> balloon_stats_change_timer(s, 0); >> } >> >> +void free_mem_range(uint64_t addr, uint64_t len) >> +{ >> + int ret = 0; >> + void *hvaddr_to_free; >> + MemoryRegionSection mrs = memory_region_find(get_system_memory(), >> + addr, 1); >> + if (!mrs.mr) { >> + warn_report("%s:No memory is mapped at address 0x%lu", __func__, addr); > Indentation seems to be off here (also in other places; please double > check.) Thanks, I will check it. > >> + return; >> + } >> + >> + if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) { >> + warn_report("%s:Memory at address 0x%s is not RAM:0x%lu", __func__, >> + HWADDR_PRIx, addr); >> + memory_region_unref(mrs.mr); >> + return; >> + } >> + >> + hvaddr_to_free = qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region); >> + trace_virtio_balloon_hinting_request(addr, len); >> + ret = qemu_madvise(hvaddr_to_free,len, QEMU_MADV_FREE); >> + if (ret == -1) { >> + warn_report("%s: Madvise failed with error:%d", __func__, ret); >> + } >> +} >> + >> +static void virtio_balloon_handle_page_hinting(VirtIODevice *vdev, >> + VirtQueue *vq) >> +{ >> + VirtQueueElement *elem; >> + size_t offset = 0; >> + uint64_t gpa, len; >> + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); >> + if (!elem) { >> + return; >> + } >> + /* For pending hints which are < max_pages(16), 'gpa != 0' ensures that we >> + * only read the buffer which holds a valid PFN value. >> + * TODO: Find a better way to do this. >> + */ >> + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &gpa, 8) == 8 && gpa != 0) { >> + offset += 8; >> + offset += iov_to_buf(elem->out_sg, elem->out_num, offset, &len, 8); >> + if (!qemu_balloon_is_inhibited()) { >> + free_mem_range(gpa, len); >> + } >> + } >> + virtqueue_push(vq, elem, offset); >> + virtio_notify(vdev, vq); >> + g_free(elem); >> +} >> + >> static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) >> { >> VirtIOBalloon *s = VIRTIO_BALLOON(vdev); >> @@ -694,6 +749,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, >> VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); >> f |= dev->host_features; >> virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); >> + virtio_add_feature(&f, VIRTIO_BALLOON_F_HINTING); > I don't think you can add this unconditionally if you want to keep this > migratable. This should be done via a property (as for deflate-on-oom > and free-page-hint) so it can be turned off in compat machines. I see, I will take a look at it. > >> >> return f; >> } >> @@ -780,6 +836,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) >> s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); >> s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); >> s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); >> + s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_page_hinting); > This should probably be conditional in the same way as the free page hint > queue (also see above). Makes sense. Thanks. > >> >> if (virtio_has_feature(s->host_features, >> VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { >> @@ -875,6 +932,8 @@ static void virtio_balloon_instance_init(Object *obj) >> >> object_property_add(obj, "guest-stats", "guest statistics", >> balloon_stats_get_all, NULL, NULL, s, NULL); >> + object_property_add(obj, "guest-page-hinting", "guest page hinting", >> + NULL, NULL, NULL, s, NULL); > This object does not have any accessors; what purpose does it serve? I think its not required. I will correct this. > >> >> object_property_add(obj, "guest-stats-polling-interval", "int", >> balloon_stats_get_poll_interval, > (...) > >> diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h >> index 9375ca2a70..f9e3e82562 100644 >> --- a/include/standard-headers/linux/virtio_balloon.h >> +++ b/include/standard-headers/linux/virtio_balloon.h >> @@ -36,6 +36,7 @@ >> #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ >> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ >> #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ >> +#define VIRTIO_BALLOON_F_HINTING 5 /* Page hinting virtqueue */ >> >> /* Size of a PFN in the balloon interface. */ >> #define VIRTIO_BALLOON_PFN_SHIFT 12 > Please split off any update to these headers into a separate patch, so > that it can be replaced by a proper headers update when it is merged. I will do that.
On 7/10/19 4:17 PM, Alexander Duyck wrote: > On Wed, Jul 10, 2019 at 12:53 PM Nitesh Narayan Lal <nitesh@redhat.com> wrote: >> Enables QEMU to perform madvise free on the memory range reported >> by the vm. >> >> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com> >> --- >> hw/virtio/trace-events | 1 + >> hw/virtio/virtio-balloon.c | 59 +++++++++++++++++++ >> include/hw/virtio/virtio-balloon.h | 2 +- >> include/qemu/osdep.h | 7 +++ >> .../standard-headers/linux/virtio_balloon.h | 1 + >> 5 files changed, 69 insertions(+), 1 deletion(-) >> >> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events >> index e28ba48da6..f703a22d36 100644 >> --- a/hw/virtio/trace-events >> +++ b/hw/virtio/trace-events >> @@ -46,6 +46,7 @@ virtio_balloon_handle_output(const char *name, uint64_t gpa) "section name: %s g >> virtio_balloon_get_config(uint32_t num_pages, uint32_t actual) "num_pages: %d actual: %d" >> virtio_balloon_set_config(uint32_t actual, uint32_t oldactual) "actual: %d oldactual: %d" >> virtio_balloon_to_target(uint64_t target, uint32_t num_pages) "balloon target: 0x%"PRIx64" num_pages: %d" >> +virtio_balloon_hinting_request(unsigned long pfn, unsigned int num_pages) "Guest page hinting request PFN:%lu size: %d" >> >> # virtio-mmio.c >> virtio_mmio_read(uint64_t offset) "virtio_mmio_read offset 0x%" PRIx64 >> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c >> index 2112874055..5d186707b5 100644 >> --- a/hw/virtio/virtio-balloon.c >> +++ b/hw/virtio/virtio-balloon.c >> @@ -34,6 +34,9 @@ >> >> #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) >> >> +#define VIRTIO_BALLOON_PAGE_HINTING_MAX_PAGES 16 >> +void free_mem_range(uint64_t addr, uint64_t len); >> + > The definition you have here is unused. I think you can drop it. Also > why do you need this forward declaration? Couldn't you just leave > free_mem_range below as a static and still have this compile? +1. Thanks for pointing this out. > >> struct PartiallyBalloonedPage { >> RAMBlock *rb; >> ram_addr_t base; >> @@ -328,6 +331,58 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, >> balloon_stats_change_timer(s, 0); >> } >> >> +void free_mem_range(uint64_t addr, uint64_t len) >> +{ >> + int ret = 0; >> + void *hvaddr_to_free; >> + MemoryRegionSection mrs = memory_region_find(get_system_memory(), >> + addr, 1); >> + if (!mrs.mr) { >> + warn_report("%s:No memory is mapped at address 0x%lu", __func__, addr); >> + return; >> + } >> + >> + if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) { >> + warn_report("%s:Memory at address 0x%s is not RAM:0x%lu", __func__, >> + HWADDR_PRIx, addr); >> + memory_region_unref(mrs.mr); >> + return; >> + } >> + >> + hvaddr_to_free = qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region); >> + trace_virtio_balloon_hinting_request(addr, len); >> + ret = qemu_madvise(hvaddr_to_free,len, QEMU_MADV_FREE); >> + if (ret == -1) { >> + warn_report("%s: Madvise failed with error:%d", __func__, ret); >> + } >> +} >> + >> +static void virtio_balloon_handle_page_hinting(VirtIODevice *vdev, >> + VirtQueue *vq) >> +{ >> + VirtQueueElement *elem; >> + size_t offset = 0; >> + uint64_t gpa, len; >> + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); >> + if (!elem) { >> + return; >> + } >> + /* For pending hints which are < max_pages(16), 'gpa != 0' ensures that we >> + * only read the buffer which holds a valid PFN value. >> + * TODO: Find a better way to do this. >> + */ > I'm not sure this comment makes much sense to me. Shouldn't the > iov_to_buf be limiting you anyway? Why do you need the additional gpa > check? > >> + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &gpa, 8) == 8 && gpa != 0) { >> + offset += 8; >> + offset += iov_to_buf(elem->out_sg, elem->out_num, offset, &len, 8); > Why pull this out as two separate buffers? Why not just define a > structure that consists of the two uint64_t values and then pull the > entire thing as one buffer? This does make sense. I will correct this. Thanks. > I'm pretty sure the solution as you have > it now opens you up to an error since you could have a malicious guest > only give you a part of the structure and you really should be > verifying you get the entire structure. > >> + if (!qemu_balloon_is_inhibited()) { >> + free_mem_range(gpa, len); >> + } >> + } >> + virtqueue_push(vq, elem, offset); >> + virtio_notify(vdev, vq); >> + g_free(elem); >> +} >> + >> static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) >> { >> VirtIOBalloon *s = VIRTIO_BALLOON(vdev); >> @@ -694,6 +749,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, >> VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); >> f |= dev->host_features; >> virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); >> + virtio_add_feature(&f, VIRTIO_BALLOON_F_HINTING); >> >> return f; >> } >> @@ -780,6 +836,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) >> s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); >> s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); >> s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); >> + s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_page_hinting); >> >> if (virtio_has_feature(s->host_features, >> VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { >> @@ -875,6 +932,8 @@ static void virtio_balloon_instance_init(Object *obj) >> >> object_property_add(obj, "guest-stats", "guest statistics", >> balloon_stats_get_all, NULL, NULL, s, NULL); >> + object_property_add(obj, "guest-page-hinting", "guest page hinting", >> + NULL, NULL, NULL, s, NULL); >> >> object_property_add(obj, "guest-stats-polling-interval", "int", >> balloon_stats_get_poll_interval, >> diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h >> index 1afafb12f6..a58b24fdf2 100644 >> --- a/include/hw/virtio/virtio-balloon.h >> +++ b/include/hw/virtio/virtio-balloon.h >> @@ -44,7 +44,7 @@ enum virtio_balloon_free_page_report_status { >> >> typedef struct VirtIOBalloon { >> VirtIODevice parent_obj; >> - VirtQueue *ivq, *dvq, *svq, *free_page_vq; >> + VirtQueue *ivq, *dvq, *svq, *free_page_vq, *hvq; >> uint32_t free_page_report_status; >> uint32_t num_pages; >> uint32_t actual; >> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h >> index af2b91f0b8..bb9207e7f4 100644 >> --- a/include/qemu/osdep.h >> +++ b/include/qemu/osdep.h >> @@ -360,6 +360,11 @@ void qemu_anon_ram_free(void *ptr, size_t size); >> #else >> #define QEMU_MADV_REMOVE QEMU_MADV_INVALID >> #endif >> +#ifdef MADV_FREE >> +#define QEMU_MADV_FREE MADV_FREE >> +#else >> +#define QEMU_MADV_FREE QEMU_MADV_INVALID >> +#endif > As I mentioned before it might make more sense to use MADV_DONTNEED > instead of just disabling this functionality if the host kernel > doesn't have MADV_FREE support. I have been trying to find the reason for it and later decided to just avoid hinting and print an error message instead. > That way you would still have the > functionality on kernels prior to 4.5 if they need it. I didn't think of this earlier. If that's the case it does make sense fallback to DONTNEED. > >> #elif defined(CONFIG_POSIX_MADVISE) >> >> @@ -373,6 +378,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); >> #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID >> #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID >> #define QEMU_MADV_REMOVE QEMU_MADV_INVALID >> +#define QEMU_MADV_FREE QEMU_MADV_INVALID > Same here. It might make more sense to use the POSIX_MADV_DONTNEED > instead of just making it invalid. > >> #else /* no-op */ >> >> @@ -386,6 +392,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); >> #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID >> #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID >> #define QEMU_MADV_REMOVE QEMU_MADV_INVALID >> +#define QEMU_MADV_FREE QEMU_MADV_INVALID >> >> #endif >> >> diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h >> index 9375ca2a70..f9e3e82562 100644 >> --- a/include/standard-headers/linux/virtio_balloon.h >> +++ b/include/standard-headers/linux/virtio_balloon.h >> @@ -36,6 +36,7 @@ >> #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ >> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ >> #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ >> +#define VIRTIO_BALLOON_F_HINTING 5 /* Page hinting virtqueue */ >> >> /* Size of a PFN in the balloon interface. */ >> #define VIRTIO_BALLOON_PFN_SHIFT 12 >> -- >> 2.21.0 >>
On Wed, Jul 10, 2019 at 03:53:03PM -0400, Nitesh Narayan Lal wrote: > Enables QEMU to perform madvise free on the memory range reported > by the vm. > > Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com> Missing second "l" in the subject :) > --- > hw/virtio/trace-events | 1 + > hw/virtio/virtio-balloon.c | 59 +++++++++++++++++++ > include/hw/virtio/virtio-balloon.h | 2 +- > include/qemu/osdep.h | 7 +++ > .../standard-headers/linux/virtio_balloon.h | 1 + > 5 files changed, 69 insertions(+), 1 deletion(-) > > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events > index e28ba48da6..f703a22d36 100644 > --- a/hw/virtio/trace-events > +++ b/hw/virtio/trace-events > @@ -46,6 +46,7 @@ virtio_balloon_handle_output(const char *name, uint64_t gpa) "section name: %s g > virtio_balloon_get_config(uint32_t num_pages, uint32_t actual) "num_pages: %d actual: %d" > virtio_balloon_set_config(uint32_t actual, uint32_t oldactual) "actual: %d oldactual: %d" > virtio_balloon_to_target(uint64_t target, uint32_t num_pages) "balloon target: 0x%"PRIx64" num_pages: %d" > +virtio_balloon_hinting_request(unsigned long pfn, unsigned int num_pages) "Guest page hinting request PFN:%lu size: %d" > > # virtio-mmio.c > virtio_mmio_read(uint64_t offset) "virtio_mmio_read offset 0x%" PRIx64 > diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c > index 2112874055..5d186707b5 100644 > --- a/hw/virtio/virtio-balloon.c > +++ b/hw/virtio/virtio-balloon.c > @@ -34,6 +34,9 @@ > > #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) > > +#define VIRTIO_BALLOON_PAGE_HINTING_MAX_PAGES 16 > +void free_mem_range(uint64_t addr, uint64_t len); > + > struct PartiallyBalloonedPage { > RAMBlock *rb; > ram_addr_t base; > @@ -328,6 +331,58 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, > balloon_stats_change_timer(s, 0); > } > > +void free_mem_range(uint64_t addr, uint64_t len) > +{ > + int ret = 0; > + void *hvaddr_to_free; > + MemoryRegionSection mrs = memory_region_find(get_system_memory(), > + addr, 1); > + if (!mrs.mr) { > + warn_report("%s:No memory is mapped at address 0x%lu", __func__, addr); > + return; > + } > + > + if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) { > + warn_report("%s:Memory at address 0x%s is not RAM:0x%lu", __func__, > + HWADDR_PRIx, addr); > + memory_region_unref(mrs.mr); > + return; > + } > + > + hvaddr_to_free = qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region); > + trace_virtio_balloon_hinting_request(addr, len); > + ret = qemu_madvise(hvaddr_to_free,len, QEMU_MADV_FREE); > + if (ret == -1) { > + warn_report("%s: Madvise failed with error:%d", __func__, ret); > + } > +} > + > +static void virtio_balloon_handle_page_hinting(VirtIODevice *vdev, > + VirtQueue *vq) > +{ > + VirtQueueElement *elem; > + size_t offset = 0; > + uint64_t gpa, len; > + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); > + if (!elem) { > + return; > + } > + /* For pending hints which are < max_pages(16), 'gpa != 0' ensures that we > + * only read the buffer which holds a valid PFN value. > + * TODO: Find a better way to do this. Indeed. In fact, what is wrong with passing the gpa as part of the element itself? > + */ > + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &gpa, 8) == 8 && gpa != 0) { > + offset += 8; > + offset += iov_to_buf(elem->out_sg, elem->out_num, offset, &len, 8); > + if (!qemu_balloon_is_inhibited()) { > + free_mem_range(gpa, len); > + } > + } > + virtqueue_push(vq, elem, offset); > + virtio_notify(vdev, vq); > + g_free(elem); > +} > + > static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) > { > VirtIOBalloon *s = VIRTIO_BALLOON(vdev); > @@ -694,6 +749,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, > VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); > f |= dev->host_features; > virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); > + virtio_add_feature(&f, VIRTIO_BALLOON_F_HINTING); > > return f; > } > @@ -780,6 +836,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) > s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); > + s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_page_hinting); > > if (virtio_has_feature(s->host_features, > VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { > @@ -875,6 +932,8 @@ static void virtio_balloon_instance_init(Object *obj) > > object_property_add(obj, "guest-stats", "guest statistics", > balloon_stats_get_all, NULL, NULL, s, NULL); > + object_property_add(obj, "guest-page-hinting", "guest page hinting", > + NULL, NULL, NULL, s, NULL); > > object_property_add(obj, "guest-stats-polling-interval", "int", > balloon_stats_get_poll_interval, > diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h > index 1afafb12f6..a58b24fdf2 100644 > --- a/include/hw/virtio/virtio-balloon.h > +++ b/include/hw/virtio/virtio-balloon.h > @@ -44,7 +44,7 @@ enum virtio_balloon_free_page_report_status { > > typedef struct VirtIOBalloon { > VirtIODevice parent_obj; > - VirtQueue *ivq, *dvq, *svq, *free_page_vq; > + VirtQueue *ivq, *dvq, *svq, *free_page_vq, *hvq; > uint32_t free_page_report_status; > uint32_t num_pages; > uint32_t actual; > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > index af2b91f0b8..bb9207e7f4 100644 > --- a/include/qemu/osdep.h > +++ b/include/qemu/osdep.h > @@ -360,6 +360,11 @@ void qemu_anon_ram_free(void *ptr, size_t size); > #else > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > #endif > +#ifdef MADV_FREE > +#define QEMU_MADV_FREE MADV_FREE > +#else > +#define QEMU_MADV_FREE QEMU_MADV_INVALID > +#endif > > #elif defined(CONFIG_POSIX_MADVISE) > > @@ -373,6 +378,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); > #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > +#define QEMU_MADV_FREE QEMU_MADV_INVALID > > #else /* no-op */ > > @@ -386,6 +392,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); > #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > +#define QEMU_MADV_FREE QEMU_MADV_INVALID > > #endif > > diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h > index 9375ca2a70..f9e3e82562 100644 > --- a/include/standard-headers/linux/virtio_balloon.h > +++ b/include/standard-headers/linux/virtio_balloon.h > @@ -36,6 +36,7 @@ > #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ > #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ > #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ > +#define VIRTIO_BALLOON_F_HINTING 5 /* Page hinting virtqueue */ > > /* Size of a PFN in the balloon interface. */ > #define VIRTIO_BALLOON_PFN_SHIFT 12 > -- > 2.21.0
On 7/11/19 2:55 PM, Michael S. Tsirkin wrote: > On Wed, Jul 10, 2019 at 03:53:03PM -0400, Nitesh Narayan Lal wrote: >> Enables QEMU to perform madvise free on the memory range reported >> by the vm. >> >> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com> > Missing second "l" in the subject :) > >> --- >> hw/virtio/trace-events | 1 + >> hw/virtio/virtio-balloon.c | 59 +++++++++++++++++++ >> include/hw/virtio/virtio-balloon.h | 2 +- >> include/qemu/osdep.h | 7 +++ >> .../standard-headers/linux/virtio_balloon.h | 1 + >> 5 files changed, 69 insertions(+), 1 deletion(-) >> >> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events >> index e28ba48da6..f703a22d36 100644 >> --- a/hw/virtio/trace-events >> +++ b/hw/virtio/trace-events >> @@ -46,6 +46,7 @@ virtio_balloon_handle_output(const char *name, uint64_t gpa) "section name: %s g >> virtio_balloon_get_config(uint32_t num_pages, uint32_t actual) "num_pages: %d actual: %d" >> virtio_balloon_set_config(uint32_t actual, uint32_t oldactual) "actual: %d oldactual: %d" >> virtio_balloon_to_target(uint64_t target, uint32_t num_pages) "balloon target: 0x%"PRIx64" num_pages: %d" >> +virtio_balloon_hinting_request(unsigned long pfn, unsigned int num_pages) "Guest page hinting request PFN:%lu size: %d" >> >> # virtio-mmio.c >> virtio_mmio_read(uint64_t offset) "virtio_mmio_read offset 0x%" PRIx64 >> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c >> index 2112874055..5d186707b5 100644 >> --- a/hw/virtio/virtio-balloon.c >> +++ b/hw/virtio/virtio-balloon.c >> @@ -34,6 +34,9 @@ >> >> #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) >> >> +#define VIRTIO_BALLOON_PAGE_HINTING_MAX_PAGES 16 >> +void free_mem_range(uint64_t addr, uint64_t len); >> + >> struct PartiallyBalloonedPage { >> RAMBlock *rb; >> ram_addr_t base; >> @@ -328,6 +331,58 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, >> balloon_stats_change_timer(s, 0); >> } >> >> +void free_mem_range(uint64_t addr, uint64_t len) >> +{ >> + int ret = 0; >> + void *hvaddr_to_free; >> + MemoryRegionSection mrs = memory_region_find(get_system_memory(), >> + addr, 1); >> + if (!mrs.mr) { >> + warn_report("%s:No memory is mapped at address 0x%lu", __func__, addr); >> + return; >> + } >> + >> + if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) { >> + warn_report("%s:Memory at address 0x%s is not RAM:0x%lu", __func__, >> + HWADDR_PRIx, addr); >> + memory_region_unref(mrs.mr); >> + return; >> + } >> + >> + hvaddr_to_free = qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region); >> + trace_virtio_balloon_hinting_request(addr, len); >> + ret = qemu_madvise(hvaddr_to_free,len, QEMU_MADV_FREE); >> + if (ret == -1) { >> + warn_report("%s: Madvise failed with error:%d", __func__, ret); >> + } >> +} >> + >> +static void virtio_balloon_handle_page_hinting(VirtIODevice *vdev, >> + VirtQueue *vq) >> +{ >> + VirtQueueElement *elem; >> + size_t offset = 0; >> + uint64_t gpa, len; >> + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); >> + if (!elem) { >> + return; >> + } >> + /* For pending hints which are < max_pages(16), 'gpa != 0' ensures that we >> + * only read the buffer which holds a valid PFN value. >> + * TODO: Find a better way to do this. > Indeed. In fact, what is wrong with passing the gpa as > part of the element itself? There are two values which I need to read 'gpa' and 'len'. I will have to check how to pass them both as part of the element. But, I will look into it. >> + */ >> + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &gpa, 8) == 8 && gpa != 0) { >> + offset += 8; >> + offset += iov_to_buf(elem->out_sg, elem->out_num, offset, &len, 8); >> + if (!qemu_balloon_is_inhibited()) { >> + free_mem_range(gpa, len); >> + } >> + } >> + virtqueue_push(vq, elem, offset); >> + virtio_notify(vdev, vq); >> + g_free(elem); >> +} >> + >> static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) >> { >> VirtIOBalloon *s = VIRTIO_BALLOON(vdev); >> @@ -694,6 +749,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, >> VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); >> f |= dev->host_features; >> virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); >> + virtio_add_feature(&f, VIRTIO_BALLOON_F_HINTING); >> >> return f; >> } >> @@ -780,6 +836,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) >> s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); >> s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); >> s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); >> + s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_page_hinting); >> >> if (virtio_has_feature(s->host_features, >> VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { >> @@ -875,6 +932,8 @@ static void virtio_balloon_instance_init(Object *obj) >> >> object_property_add(obj, "guest-stats", "guest statistics", >> balloon_stats_get_all, NULL, NULL, s, NULL); >> + object_property_add(obj, "guest-page-hinting", "guest page hinting", >> + NULL, NULL, NULL, s, NULL); >> >> object_property_add(obj, "guest-stats-polling-interval", "int", >> balloon_stats_get_poll_interval, >> diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h >> index 1afafb12f6..a58b24fdf2 100644 >> --- a/include/hw/virtio/virtio-balloon.h >> +++ b/include/hw/virtio/virtio-balloon.h >> @@ -44,7 +44,7 @@ enum virtio_balloon_free_page_report_status { >> >> typedef struct VirtIOBalloon { >> VirtIODevice parent_obj; >> - VirtQueue *ivq, *dvq, *svq, *free_page_vq; >> + VirtQueue *ivq, *dvq, *svq, *free_page_vq, *hvq; >> uint32_t free_page_report_status; >> uint32_t num_pages; >> uint32_t actual; >> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h >> index af2b91f0b8..bb9207e7f4 100644 >> --- a/include/qemu/osdep.h >> +++ b/include/qemu/osdep.h >> @@ -360,6 +360,11 @@ void qemu_anon_ram_free(void *ptr, size_t size); >> #else >> #define QEMU_MADV_REMOVE QEMU_MADV_INVALID >> #endif >> +#ifdef MADV_FREE >> +#define QEMU_MADV_FREE MADV_FREE >> +#else >> +#define QEMU_MADV_FREE QEMU_MADV_INVALID >> +#endif >> >> #elif defined(CONFIG_POSIX_MADVISE) >> >> @@ -373,6 +378,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); >> #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID >> #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID >> #define QEMU_MADV_REMOVE QEMU_MADV_INVALID >> +#define QEMU_MADV_FREE QEMU_MADV_INVALID >> >> #else /* no-op */ >> >> @@ -386,6 +392,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); >> #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID >> #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID >> #define QEMU_MADV_REMOVE QEMU_MADV_INVALID >> +#define QEMU_MADV_FREE QEMU_MADV_INVALID >> >> #endif >> >> diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h >> index 9375ca2a70..f9e3e82562 100644 >> --- a/include/standard-headers/linux/virtio_balloon.h >> +++ b/include/standard-headers/linux/virtio_balloon.h >> @@ -36,6 +36,7 @@ >> #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ >> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ >> #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ >> +#define VIRTIO_BALLOON_F_HINTING 5 /* Page hinting virtqueue */ >> >> /* Size of a PFN in the balloon interface. */ >> #define VIRTIO_BALLOON_PFN_SHIFT 12 >> -- >> 2.21.0
On Thu, Jul 11, 2019 at 12:06 PM Nitesh Narayan Lal <nitesh@redhat.com> wrote: > > > On 7/11/19 2:55 PM, Michael S. Tsirkin wrote: > > On Wed, Jul 10, 2019 at 03:53:03PM -0400, Nitesh Narayan Lal wrote: > >> Enables QEMU to perform madvise free on the memory range reported > >> by the vm. > >> > >> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com> > > Missing second "l" in the subject :) > > > >> --- > >> hw/virtio/trace-events | 1 + > >> hw/virtio/virtio-balloon.c | 59 +++++++++++++++++++ > >> include/hw/virtio/virtio-balloon.h | 2 +- > >> include/qemu/osdep.h | 7 +++ > >> .../standard-headers/linux/virtio_balloon.h | 1 + > >> 5 files changed, 69 insertions(+), 1 deletion(-) > >> > >> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events > >> index e28ba48da6..f703a22d36 100644 > >> --- a/hw/virtio/trace-events > >> +++ b/hw/virtio/trace-events > >> @@ -46,6 +46,7 @@ virtio_balloon_handle_output(const char *name, uint64_t gpa) "section name: %s g > >> virtio_balloon_get_config(uint32_t num_pages, uint32_t actual) "num_pages: %d actual: %d" > >> virtio_balloon_set_config(uint32_t actual, uint32_t oldactual) "actual: %d oldactual: %d" > >> virtio_balloon_to_target(uint64_t target, uint32_t num_pages) "balloon target: 0x%"PRIx64" num_pages: %d" > >> +virtio_balloon_hinting_request(unsigned long pfn, unsigned int num_pages) "Guest page hinting request PFN:%lu size: %d" > >> > >> # virtio-mmio.c > >> virtio_mmio_read(uint64_t offset) "virtio_mmio_read offset 0x%" PRIx64 > >> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c > >> index 2112874055..5d186707b5 100644 > >> --- a/hw/virtio/virtio-balloon.c > >> +++ b/hw/virtio/virtio-balloon.c > >> @@ -34,6 +34,9 @@ > >> > >> #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) > >> > >> +#define VIRTIO_BALLOON_PAGE_HINTING_MAX_PAGES 16 > >> +void free_mem_range(uint64_t addr, uint64_t len); > >> + > >> struct PartiallyBalloonedPage { > >> RAMBlock *rb; > >> ram_addr_t base; > >> @@ -328,6 +331,58 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, > >> balloon_stats_change_timer(s, 0); > >> } > >> > >> +void free_mem_range(uint64_t addr, uint64_t len) > >> +{ > >> + int ret = 0; > >> + void *hvaddr_to_free; > >> + MemoryRegionSection mrs = memory_region_find(get_system_memory(), > >> + addr, 1); > >> + if (!mrs.mr) { > >> + warn_report("%s:No memory is mapped at address 0x%lu", __func__, addr); > >> + return; > >> + } > >> + > >> + if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) { > >> + warn_report("%s:Memory at address 0x%s is not RAM:0x%lu", __func__, > >> + HWADDR_PRIx, addr); > >> + memory_region_unref(mrs.mr); > >> + return; > >> + } > >> + > >> + hvaddr_to_free = qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region); > >> + trace_virtio_balloon_hinting_request(addr, len); > >> + ret = qemu_madvise(hvaddr_to_free,len, QEMU_MADV_FREE); > >> + if (ret == -1) { > >> + warn_report("%s: Madvise failed with error:%d", __func__, ret); > >> + } > >> +} > >> + > >> +static void virtio_balloon_handle_page_hinting(VirtIODevice *vdev, > >> + VirtQueue *vq) > >> +{ > >> + VirtQueueElement *elem; > >> + size_t offset = 0; > >> + uint64_t gpa, len; > >> + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); > >> + if (!elem) { > >> + return; > >> + } > >> + /* For pending hints which are < max_pages(16), 'gpa != 0' ensures that we > >> + * only read the buffer which holds a valid PFN value. > >> + * TODO: Find a better way to do this. > > Indeed. In fact, what is wrong with passing the gpa as > > part of the element itself? > There are two values which I need to read 'gpa' and 'len'. I will have > to check how to pass them both as part of the element. > But, I will look into it. One advantage of doing it as a scatter-gather list being passed via the element is that you only get one completion. If you are going to do an element per page then you will need to somehow identify if the entire ring has been processed or not before you free your local page list. > >> + */ > >> + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &gpa, 8) == 8 && gpa != 0) { > >> + offset += 8; > >> + offset += iov_to_buf(elem->out_sg, elem->out_num, offset, &len, 8); > >> + if (!qemu_balloon_is_inhibited()) { > >> + free_mem_range(gpa, len); > >> + } > >> + } > >> + virtqueue_push(vq, elem, offset); > >> + virtio_notify(vdev, vq); > >> + g_free(elem); > >> +} > >> + > >> static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) > >> { > >> VirtIOBalloon *s = VIRTIO_BALLOON(vdev); > >> @@ -694,6 +749,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, > >> VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); > >> f |= dev->host_features; > >> virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); > >> + virtio_add_feature(&f, VIRTIO_BALLOON_F_HINTING); > >> > >> return f; > >> } > >> @@ -780,6 +836,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) > >> s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > >> s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > >> s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); > >> + s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_page_hinting); > >> > >> if (virtio_has_feature(s->host_features, > >> VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { > >> @@ -875,6 +932,8 @@ static void virtio_balloon_instance_init(Object *obj) > >> > >> object_property_add(obj, "guest-stats", "guest statistics", > >> balloon_stats_get_all, NULL, NULL, s, NULL); > >> + object_property_add(obj, "guest-page-hinting", "guest page hinting", > >> + NULL, NULL, NULL, s, NULL); > >> > >> object_property_add(obj, "guest-stats-polling-interval", "int", > >> balloon_stats_get_poll_interval, > >> diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h > >> index 1afafb12f6..a58b24fdf2 100644 > >> --- a/include/hw/virtio/virtio-balloon.h > >> +++ b/include/hw/virtio/virtio-balloon.h > >> @@ -44,7 +44,7 @@ enum virtio_balloon_free_page_report_status { > >> > >> typedef struct VirtIOBalloon { > >> VirtIODevice parent_obj; > >> - VirtQueue *ivq, *dvq, *svq, *free_page_vq; > >> + VirtQueue *ivq, *dvq, *svq, *free_page_vq, *hvq; > >> uint32_t free_page_report_status; > >> uint32_t num_pages; > >> uint32_t actual; > >> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > >> index af2b91f0b8..bb9207e7f4 100644 > >> --- a/include/qemu/osdep.h > >> +++ b/include/qemu/osdep.h > >> @@ -360,6 +360,11 @@ void qemu_anon_ram_free(void *ptr, size_t size); > >> #else > >> #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > >> #endif > >> +#ifdef MADV_FREE > >> +#define QEMU_MADV_FREE MADV_FREE > >> +#else > >> +#define QEMU_MADV_FREE QEMU_MADV_INVALID > >> +#endif > >> > >> #elif defined(CONFIG_POSIX_MADVISE) > >> > >> @@ -373,6 +378,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); > >> #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID > >> #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID > >> #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > >> +#define QEMU_MADV_FREE QEMU_MADV_INVALID > >> > >> #else /* no-op */ > >> > >> @@ -386,6 +392,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); > >> #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID > >> #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID > >> #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > >> +#define QEMU_MADV_FREE QEMU_MADV_INVALID > >> > >> #endif > >> > >> diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h > >> index 9375ca2a70..f9e3e82562 100644 > >> --- a/include/standard-headers/linux/virtio_balloon.h > >> +++ b/include/standard-headers/linux/virtio_balloon.h > >> @@ -36,6 +36,7 @@ > >> #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ > >> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ > >> #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ > >> +#define VIRTIO_BALLOON_F_HINTING 5 /* Page hinting virtqueue */ > >> > >> /* Size of a PFN in the balloon interface. */ > >> #define VIRTIO_BALLOON_PFN_SHIFT 12 > >> -- > >> 2.21.0 > -- > Thanks > Nitesh >
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index e28ba48da6..f703a22d36 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -46,6 +46,7 @@ virtio_balloon_handle_output(const char *name, uint64_t gpa) "section name: %s g virtio_balloon_get_config(uint32_t num_pages, uint32_t actual) "num_pages: %d actual: %d" virtio_balloon_set_config(uint32_t actual, uint32_t oldactual) "actual: %d oldactual: %d" virtio_balloon_to_target(uint64_t target, uint32_t num_pages) "balloon target: 0x%"PRIx64" num_pages: %d" +virtio_balloon_hinting_request(unsigned long pfn, unsigned int num_pages) "Guest page hinting request PFN:%lu size: %d" # virtio-mmio.c virtio_mmio_read(uint64_t offset) "virtio_mmio_read offset 0x%" PRIx64 diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 2112874055..5d186707b5 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -34,6 +34,9 @@ #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) +#define VIRTIO_BALLOON_PAGE_HINTING_MAX_PAGES 16 +void free_mem_range(uint64_t addr, uint64_t len); + struct PartiallyBalloonedPage { RAMBlock *rb; ram_addr_t base; @@ -328,6 +331,58 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, balloon_stats_change_timer(s, 0); } +void free_mem_range(uint64_t addr, uint64_t len) +{ + int ret = 0; + void *hvaddr_to_free; + MemoryRegionSection mrs = memory_region_find(get_system_memory(), + addr, 1); + if (!mrs.mr) { + warn_report("%s:No memory is mapped at address 0x%lu", __func__, addr); + return; + } + + if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) { + warn_report("%s:Memory at address 0x%s is not RAM:0x%lu", __func__, + HWADDR_PRIx, addr); + memory_region_unref(mrs.mr); + return; + } + + hvaddr_to_free = qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region); + trace_virtio_balloon_hinting_request(addr, len); + ret = qemu_madvise(hvaddr_to_free,len, QEMU_MADV_FREE); + if (ret == -1) { + warn_report("%s: Madvise failed with error:%d", __func__, ret); + } +} + +static void virtio_balloon_handle_page_hinting(VirtIODevice *vdev, + VirtQueue *vq) +{ + VirtQueueElement *elem; + size_t offset = 0; + uint64_t gpa, len; + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + return; + } + /* For pending hints which are < max_pages(16), 'gpa != 0' ensures that we + * only read the buffer which holds a valid PFN value. + * TODO: Find a better way to do this. + */ + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &gpa, 8) == 8 && gpa != 0) { + offset += 8; + offset += iov_to_buf(elem->out_sg, elem->out_num, offset, &len, 8); + if (!qemu_balloon_is_inhibited()) { + free_mem_range(gpa, len); + } + } + virtqueue_push(vq, elem, offset); + virtio_notify(vdev, vq); + g_free(elem); +} + static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) { VirtIOBalloon *s = VIRTIO_BALLOON(vdev); @@ -694,6 +749,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); f |= dev->host_features; virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); + virtio_add_feature(&f, VIRTIO_BALLOON_F_HINTING); return f; } @@ -780,6 +836,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); + s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_page_hinting); if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { @@ -875,6 +932,8 @@ static void virtio_balloon_instance_init(Object *obj) object_property_add(obj, "guest-stats", "guest statistics", balloon_stats_get_all, NULL, NULL, s, NULL); + object_property_add(obj, "guest-page-hinting", "guest page hinting", + NULL, NULL, NULL, s, NULL); object_property_add(obj, "guest-stats-polling-interval", "int", balloon_stats_get_poll_interval, diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h index 1afafb12f6..a58b24fdf2 100644 --- a/include/hw/virtio/virtio-balloon.h +++ b/include/hw/virtio/virtio-balloon.h @@ -44,7 +44,7 @@ enum virtio_balloon_free_page_report_status { typedef struct VirtIOBalloon { VirtIODevice parent_obj; - VirtQueue *ivq, *dvq, *svq, *free_page_vq; + VirtQueue *ivq, *dvq, *svq, *free_page_vq, *hvq; uint32_t free_page_report_status; uint32_t num_pages; uint32_t actual; diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index af2b91f0b8..bb9207e7f4 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -360,6 +360,11 @@ void qemu_anon_ram_free(void *ptr, size_t size); #else #define QEMU_MADV_REMOVE QEMU_MADV_INVALID #endif +#ifdef MADV_FREE +#define QEMU_MADV_FREE MADV_FREE +#else +#define QEMU_MADV_FREE QEMU_MADV_INVALID +#endif #elif defined(CONFIG_POSIX_MADVISE) @@ -373,6 +378,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_REMOVE QEMU_MADV_INVALID +#define QEMU_MADV_FREE QEMU_MADV_INVALID #else /* no-op */ @@ -386,6 +392,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_REMOVE QEMU_MADV_INVALID +#define QEMU_MADV_FREE QEMU_MADV_INVALID #endif diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h index 9375ca2a70..f9e3e82562 100644 --- a/include/standard-headers/linux/virtio_balloon.h +++ b/include/standard-headers/linux/virtio_balloon.h @@ -36,6 +36,7 @@ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ +#define VIRTIO_BALLOON_F_HINTING 5 /* Page hinting virtqueue */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12
Enables QEMU to perform madvise free on the memory range reported by the vm. Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com> --- hw/virtio/trace-events | 1 + hw/virtio/virtio-balloon.c | 59 +++++++++++++++++++ include/hw/virtio/virtio-balloon.h | 2 +- include/qemu/osdep.h | 7 +++ .../standard-headers/linux/virtio_balloon.h | 1 + 5 files changed, 69 insertions(+), 1 deletion(-)