diff mbox

[RFC,QEMU] kvm: Support for guest page hinting

Message ID 20180611152005.15591-1-nilal@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Nitesh Lal June 11, 2018, 3:20 p.m. UTC
From: Nitesh Narayan Lal <nilal@redhat.com>

This patch enables QEMU to handle page hinting requests
from the guest. Once the guest kicks QEMU to free a list of
pages, QEMU retrieves the guest physical addresses in the list,
converts each one to a host virtual address, and then calls
madvise(MADV_FREE) on that memory.

Signed-off-by: Nitesh Narayan Lal <nilal@redhat.com>
---
 hw/virtio/virtio-balloon.c                      | 117 +++++++++++++++++++++++-
 hw/virtio/virtio.c                              |  23 +++++
 include/hw/virtio/virtio-access.h               |   1 +
 include/hw/virtio/virtio-balloon.h              |   2 +-
 include/qemu/osdep.h                            |   7 ++
 include/standard-headers/linux/virtio_balloon.h |   1 +
 6 files changed, 149 insertions(+), 2 deletions(-)

Comments

Luiz Capitulino June 11, 2018, 8:32 p.m. UTC | #1
On Mon, 11 Jun 2018 11:20:05 -0400
nilal@redhat.com wrote:

> From: Nitesh Narayan Lal <nilal@redhat.com>
> 
> This patch enables QEMU to handle page hinting requests
> from the guest. Once the guest kicks QEMU to free a list of
> page, QEMU retrives the guest physical address in the list
> and converts each to host virtual address and then
> MADVISE that memory.
> 
> Signed-off-by: Nitesh Narayan Lal <nilal@redhat.com>
> ---
>  hw/virtio/virtio-balloon.c                      | 117 +++++++++++++++++++++++-
>  hw/virtio/virtio.c                              |  23 +++++
>  include/hw/virtio/virtio-access.h               |   1 +
>  include/hw/virtio/virtio-balloon.h              |   2 +-
>  include/qemu/osdep.h                            |   7 ++
>  include/standard-headers/linux/virtio_balloon.h |   1 +
>  6 files changed, 149 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
> index 1f7a87f..46deacb 100644
> --- a/hw/virtio/virtio-balloon.c
> +++ b/hw/virtio/virtio-balloon.c
> @@ -34,6 +34,8 @@
>  
>  #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
>  
> +void page_hinting_request(uint64_t addr, uint32_t len);
> +
>  static void balloon_page(void *addr, int deflate)
>  {
>      if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
> @@ -77,11 +79,22 @@ static bool balloon_stats_supported(const VirtIOBalloon *s)
>      return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
>  }
>  
> +static bool balloon_hinting_supported(const VirtIOBalloon *s)
> +{
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    return virtio_vdev_has_feature(vdev, VIRTIO_GUEST_PAGE_HINTING_VQ);
> +}
> +
>  static bool balloon_stats_enabled(const VirtIOBalloon *s)
>  {
>      return s->stats_poll_interval > 0;
>  }
>  
> +static bool page_hinting_enabled(const VirtIOBalloon *s)
> +{
> +    return s->stats_poll_interval > 0;
> +}
> +
>  static void balloon_stats_destroy_timer(VirtIOBalloon *s)
>  {
>      if (balloon_stats_enabled(s)) {
> @@ -97,14 +110,20 @@ static void balloon_stats_change_timer(VirtIOBalloon *s, int64_t secs)
>      timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000);
>  }
>  
> +static void page_hinting_change_timer(VirtIOBalloon *s, int64_t secs)
> +{
> +    timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000);
> +}
> +
>  static void balloon_stats_poll_cb(void *opaque)
>  {
>      VirtIOBalloon *s = opaque;
>      VirtIODevice *vdev = VIRTIO_DEVICE(s);
>  
> -    if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) {
> +    if (s->stats_vq_elem == NULL || !balloon_stats_supported(s) || !balloon_hinting_supported(s)) {
>          /* re-schedule */
>          balloon_stats_change_timer(s, s->stats_poll_interval);
> +        page_hinting_change_timer(s, s->stats_poll_interval);
>          return;
>      }
>  
> @@ -201,12 +220,101 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v,
>          balloon_stats_change_timer(s, value);
>          return;
>      }
> +    
> +    if (page_hinting_enabled(s)) {
> +        /* timer interval change */
> +        s->stats_poll_interval = value;
> +        page_hinting_change_timer(s, value);
> +        return;
> +    }
>  
>      /* create a new timer */
>      g_assert(s->stats_timer == NULL);
>      s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s);
>      s->stats_poll_interval = value;
>      balloon_stats_change_timer(s, 0);
> +    /* create a new timer */
> +    g_assert(s->stats_timer == NULL);
> +    s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s);
> +    s->stats_poll_interval = value;
> +    page_hinting_change_timer(s, 0);

So, I'm not sure I understand the changes to the balloon stats polling
code. Why is this needed?

> +}
> +
> +static void *gpa2hva(MemoryRegion **p_mr, hwaddr addr, Error **errp)
> +{
> +    MemoryRegionSection mrs = memory_region_find(get_system_memory(),
> +                                                 addr, 1);
> +
> +    if (!mrs.mr) {
> +        error_setg(errp, "No memory is mapped at address 0x%" HWADDR_PRIx, addr);
> +        return NULL;
> +    }
> +
> +    if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) {
> +        error_setg(errp, "Memory at address 0x%" HWADDR_PRIx "is not RAM", addr);
> +        memory_region_unref(mrs.mr);
> +        return NULL;
> +    }
> +
> +    *p_mr = mrs.mr;
> +    return qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region);
> +}
> +
> +struct guest_pages {
> +	unsigned long pfn;
> +	unsigned int pages;
> +};
> +
> +
> +void page_hinting_request(uint64_t addr, uint32_t len)
> +{
> +    Error *local_err = NULL;
> +    MemoryRegion *mr = NULL;
> +    void *hvaddr;
> +    int ret = 0;
> +    struct guest_pages *guest_obj;
> +    int i = 0;
> +    void *hvaddr_to_free;
> +    unsigned long pfn, pfn_end;
> +    uint64_t gpaddr_to_free;
> +
> +    hvaddr = gpa2hva(&mr, addr, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return;
> +    }
> +    guest_obj = hvaddr;
> +
> +    while (i < len) {
> +        pfn = guest_obj[i].pfn;
> +	pfn_end = guest_obj[i].pfn + guest_obj[i].pages - 1;
> +	while (pfn <= pfn_end) {
> +	        gpaddr_to_free = pfn << VIRTIO_BALLOON_PFN_SHIFT;
> +	        hvaddr_to_free = gpa2hva(&mr, gpaddr_to_free, &local_err);
> +	        if (local_err) {
> +			error_report_err(local_err);
> +		        return;
> +		}
> +		ret = qemu_madvise((void *)hvaddr_to_free, 4096, QEMU_MADV_FREE);

I asked this question on the kernel patches, but if I understood correctly
you're passing the whole free pages array to the host, right? If that's
correct, would it work to madvise an entire region instead of going page
by page? If that's possible, it would allow you to kill the inner while loop.

> +		if (ret == -1)
> +		    printf("\n%d:%s Error: Madvise failed with error:%d\n", __LINE__, __func__, ret);
> +		pfn++;
> +	}
> +	i++;
> +    }
> +}
> +
> +
> +static void virtio_balloon_page_hinting(VirtIODevice *vdev, VirtQueue *vq)
> +{
> +    uint64_t addr;
> +    uint32_t len;
> +    VirtQueueElement elem = {};
> +
> +    pop_hinting_addr(vq, &addr, &len);
> +    page_hinting_request(addr, len);
> +    virtqueue_push(vq, &elem, 0);
> +    virtio_notify(vdev, vq);
>  }
>  
>  static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
> @@ -379,6 +487,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
>      VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
>      f |= dev->host_features;
>      virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
> +    virtio_add_feature(&f, VIRTIO_GUEST_PAGE_HINTING_VQ);
>      return f;
>  }
>  
> @@ -412,6 +521,9 @@ static int virtio_balloon_post_load_device(void *opaque, int version_id)
>      if (balloon_stats_enabled(s)) {
>          balloon_stats_change_timer(s, s->stats_poll_interval);
>      }
> +    if (page_hinting_enabled(s)) {
> +        page_hinting_change_timer(s, s->stats_poll_interval);
> +    }
>      return 0;
>  }
>  
> @@ -448,6 +560,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
>      s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
>      s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
>      s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
> +    s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_page_hinting);
>  
>      reset_stats(s);
>  }
> @@ -491,6 +604,8 @@ static void virtio_balloon_instance_init(Object *obj)
>  
>      object_property_add(obj, "guest-stats", "guest statistics",
>                          balloon_stats_get_all, NULL, NULL, s, NULL);
> +    object_property_add(obj, "guest-page-hinting", "guest page hinting",
> +                        NULL, NULL, NULL, s, NULL);
>  
>      object_property_add(obj, "guest-stats-polling-interval", "int",
>                          balloon_stats_get_poll_interval,
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 1debb01..11ddc13 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -825,6 +825,29 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_nu
>      return elem;
>  }
>  
> +void pop_hinting_addr(VirtQueue *vq, uint64_t *addr, uint32_t *len)
> +{
> +   VRingMemoryRegionCaches *caches;
> +   VRingDesc desc;
> +   MemoryRegionCache *desc_cache;
> +   VirtIODevice *vdev = vq->vdev;
> +   unsigned int head, max;
> +
> +   max = vq->vring.num;
> +   if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) {
> +	printf("\n%d:%sError: Unable to read head\n", __LINE__, __func__);
> +   }
> +
> +   caches = vring_get_region_caches(vq);
> +   if (caches->desc.len < max * sizeof(VRingDesc)) {
> +       virtio_error(vdev, "Cannot map descriptor ring");
> +   }

Should you really continue on error?

> +   desc_cache = &caches->desc;
> +   vring_desc_read(vdev, &desc, desc_cache, head);
> +   *addr = desc.addr;
> +   *len = desc.len;
> +}
> +
>  void *virtqueue_pop(VirtQueue *vq, size_t sz)
>  {
>      unsigned int i, head, max;
> diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
> index 2e92074..568d71f 100644
> --- a/include/hw/virtio/virtio-access.h
> +++ b/include/hw/virtio/virtio-access.h
> @@ -24,6 +24,7 @@
>  #define LEGACY_VIRTIO_IS_BIENDIAN 1
>  #endif
>  
> +void pop_hinting_addr(VirtQueue *vq, uint64_t *addr, uint32_t *len);
>  static inline bool virtio_access_is_big_endian(VirtIODevice *vdev)
>  {
>  #if defined(LEGACY_VIRTIO_IS_BIENDIAN)
> diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h
> index e0df352..774498a 100644
> --- a/include/hw/virtio/virtio-balloon.h
> +++ b/include/hw/virtio/virtio-balloon.h
> @@ -32,7 +32,7 @@ typedef struct virtio_balloon_stat_modern {
>  
>  typedef struct VirtIOBalloon {
>      VirtIODevice parent_obj;
> -    VirtQueue *ivq, *dvq, *svq;
> +    VirtQueue *ivq, *dvq, *svq, *hvq;
>      uint32_t num_pages;
>      uint32_t actual;
>      uint64_t stats[VIRTIO_BALLOON_S_NR];
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index 4165806..dc62f17 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -305,6 +305,11 @@ void qemu_anon_ram_free(void *ptr, size_t size);
>  #else
>  #define QEMU_MADV_REMOVE QEMU_MADV_INVALID
>  #endif
> +#ifdef MADV_FREE
> +#define QEMU_MADV_FREE MADV_FREE
> +#else
> +#define QEMU_MADV_FREE QEMU_MADV_INVALID
> +#endif
>  
>  #elif defined(CONFIG_POSIX_MADVISE)
>  
> @@ -318,6 +323,7 @@ void qemu_anon_ram_free(void *ptr, size_t size);
>  #define QEMU_MADV_HUGEPAGE  QEMU_MADV_INVALID
>  #define QEMU_MADV_NOHUGEPAGE  QEMU_MADV_INVALID
>  #define QEMU_MADV_REMOVE QEMU_MADV_INVALID
> +#define QEMU_MADV_FREE QEMU_MADV_INVALID
>  
>  #else /* no-op */
>  
> @@ -331,6 +337,7 @@ void qemu_anon_ram_free(void *ptr, size_t size);
>  #define QEMU_MADV_HUGEPAGE  QEMU_MADV_INVALID
>  #define QEMU_MADV_NOHUGEPAGE  QEMU_MADV_INVALID
>  #define QEMU_MADV_REMOVE QEMU_MADV_INVALID
> +#define QEMU_MADV_FREE QEMU_MADV_INVALID
>  
>  #endif
>  
> diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
> index e446805..1372da1 100644
> --- a/include/standard-headers/linux/virtio_balloon.h
> +++ b/include/standard-headers/linux/virtio_balloon.h
> @@ -34,6 +34,7 @@
>  #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
>  #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
> +#define VIRTIO_GUEST_PAGE_HINTING_VQ	3 /* Page hinting virtqueue */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
Nitesh Lal June 12, 2018, 3:10 p.m. UTC | #2
On 06/11/2018 04:32 PM, Luiz Capitulino wrote:
> On Mon, 11 Jun 2018 11:20:05 -0400
> nilal@redhat.com wrote:
>
>> From: Nitesh Narayan Lal <nilal@redhat.com>
>>
>> This patch enables QEMU to handle page hinting requests
>> from the guest. Once the guest kicks QEMU to free a list of
>> page, QEMU retrives the guest physical address in the list
>> and converts each to host virtual address and then
>> MADVISE that memory.
>>
>> Signed-off-by: Nitesh Narayan Lal <nilal@redhat.com>
>> ---
>>  hw/virtio/virtio-balloon.c                      | 117 +++++++++++++++++++++++-
>>  hw/virtio/virtio.c                              |  23 +++++
>>  include/hw/virtio/virtio-access.h               |   1 +
>>  include/hw/virtio/virtio-balloon.h              |   2 +-
>>  include/qemu/osdep.h                            |   7 ++
>>  include/standard-headers/linux/virtio_balloon.h |   1 +
>>  6 files changed, 149 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
>> index 1f7a87f..46deacb 100644
>> --- a/hw/virtio/virtio-balloon.c
>> +++ b/hw/virtio/virtio-balloon.c
>> @@ -34,6 +34,8 @@
>>  
>>  #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
>>  
>> +void page_hinting_request(uint64_t addr, uint32_t len);
>> +
>>  static void balloon_page(void *addr, int deflate)
>>  {
>>      if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
>> @@ -77,11 +79,22 @@ static bool balloon_stats_supported(const VirtIOBalloon *s)
>>      return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
>>  }
>>  
>> +static bool balloon_hinting_supported(const VirtIOBalloon *s)
>> +{
>> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
>> +    return virtio_vdev_has_feature(vdev, VIRTIO_GUEST_PAGE_HINTING_VQ);
>> +}
>> +
>>  static bool balloon_stats_enabled(const VirtIOBalloon *s)
>>  {
>>      return s->stats_poll_interval > 0;
>>  }
>>  
>> +static bool page_hinting_enabled(const VirtIOBalloon *s)
>> +{
>> +    return s->stats_poll_interval > 0;
>> +}
>> +
>>  static void balloon_stats_destroy_timer(VirtIOBalloon *s)
>>  {
>>      if (balloon_stats_enabled(s)) {
>> @@ -97,14 +110,20 @@ static void balloon_stats_change_timer(VirtIOBalloon *s, int64_t secs)
>>      timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000);
>>  }
>>  
>> +static void page_hinting_change_timer(VirtIOBalloon *s, int64_t secs)
>> +{
>> +    timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000);
>> +}
>> +
>>  static void balloon_stats_poll_cb(void *opaque)
>>  {
>>      VirtIOBalloon *s = opaque;
>>      VirtIODevice *vdev = VIRTIO_DEVICE(s);
>>  
>> -    if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) {
>> +    if (s->stats_vq_elem == NULL || !balloon_stats_supported(s) || !balloon_hinting_supported(s)) {
>>          /* re-schedule */
>>          balloon_stats_change_timer(s, s->stats_poll_interval);
>> +        page_hinting_change_timer(s, s->stats_poll_interval);
>>          return;
>>      }
>>  
>> @@ -201,12 +220,101 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v,
>>          balloon_stats_change_timer(s, value);
>>          return;
>>      }
>> +    
>> +    if (page_hinting_enabled(s)) {
>> +        /* timer interval change */
>> +        s->stats_poll_interval = value;
>> +        page_hinting_change_timer(s, value);
>> +        return;
>> +    }
>>  
>>      /* create a new timer */
>>      g_assert(s->stats_timer == NULL);
>>      s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s);
>>      s->stats_poll_interval = value;
>>      balloon_stats_change_timer(s, 0);
>> +    /* create a new timer */
>> +    g_assert(s->stats_timer == NULL);
>> +    s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s);
>> +    s->stats_poll_interval = value;
>> +    page_hinting_change_timer(s, 0);
> So, I'm not sure I understand the changes to the balloon stats polling
> code. Why is this needed?
My bad, this is not required. I will remove this. Thank you for pointing
this out.
>
>> +}
>> +
>> +static void *gpa2hva(MemoryRegion **p_mr, hwaddr addr, Error **errp)
>> +{
>> +    MemoryRegionSection mrs = memory_region_find(get_system_memory(),
>> +                                                 addr, 1);
>> +
>> +    if (!mrs.mr) {
>> +        error_setg(errp, "No memory is mapped at address 0x%" HWADDR_PRIx, addr);
>> +        return NULL;
>> +    }
>> +
>> +    if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) {
>> +        error_setg(errp, "Memory at address 0x%" HWADDR_PRIx "is not RAM", addr);
>> +        memory_region_unref(mrs.mr);
>> +        return NULL;
>> +    }
>> +
>> +    *p_mr = mrs.mr;
>> +    return qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region);
>> +}
>> +
>> +struct guest_pages {
>> +	unsigned long pfn;
>> +	unsigned int pages;
>> +};
>> +
>> +
>> +void page_hinting_request(uint64_t addr, uint32_t len)
>> +{
>> +    Error *local_err = NULL;
>> +    MemoryRegion *mr = NULL;
>> +    void *hvaddr;
>> +    int ret = 0;
>> +    struct guest_pages *guest_obj;
>> +    int i = 0;
>> +    void *hvaddr_to_free;
>> +    unsigned long pfn, pfn_end;
>> +    uint64_t gpaddr_to_free;
>> +
>> +    hvaddr = gpa2hva(&mr, addr, &local_err);
>> +    if (local_err) {
>> +        error_report_err(local_err);
>> +        return;
>> +    }
>> +    guest_obj = hvaddr;
>> +
>> +    while (i < len) {
>> +        pfn = guest_obj[i].pfn;
>> +	pfn_end = guest_obj[i].pfn + guest_obj[i].pages - 1;
>> +	while (pfn <= pfn_end) {
>> +	        gpaddr_to_free = pfn << VIRTIO_BALLOON_PFN_SHIFT;
>> +	        hvaddr_to_free = gpa2hva(&mr, gpaddr_to_free, &local_err);
>> +	        if (local_err) {
>> +			error_report_err(local_err);
>> +		        return;
>> +		}
>> +		ret = qemu_madvise((void *)hvaddr_to_free, 4096, QEMU_MADV_FREE);
> I asked this question on the kernel patches, but if I understood correctly
> you're passing the whole free pages array to the host, right? If that's
> correct, would it work to madvise an entire region instead of going page
> by page? If that's possible, it would allow you to kill the inner while loop.
I will check if it is possible.
>
>> +		if (ret == -1)
>> +		    printf("\n%d:%s Error: Madvise failed with error:%d\n", __LINE__, __func__, ret);
>> +		pfn++;
>> +	}
>> +	i++;
>> +    }
>> +}
>> +
>> +
>> +static void virtio_balloon_page_hinting(VirtIODevice *vdev, VirtQueue *vq)
>> +{
>> +    uint64_t addr;
>> +    uint32_t len;
>> +    VirtQueueElement elem = {};
>> +
>> +    pop_hinting_addr(vq, &addr, &len);
>> +    page_hinting_request(addr, len);
>> +    virtqueue_push(vq, &elem, 0);
>> +    virtio_notify(vdev, vq);
>>  }
>>  
>>  static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
>> @@ -379,6 +487,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
>>      VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
>>      f |= dev->host_features;
>>      virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
>> +    virtio_add_feature(&f, VIRTIO_GUEST_PAGE_HINTING_VQ);
>>      return f;
>>  }
>>  
>> @@ -412,6 +521,9 @@ static int virtio_balloon_post_load_device(void *opaque, int version_id)
>>      if (balloon_stats_enabled(s)) {
>>          balloon_stats_change_timer(s, s->stats_poll_interval);
>>      }
>> +    if (page_hinting_enabled(s)) {
>> +        page_hinting_change_timer(s, s->stats_poll_interval);
>> +    }
>>      return 0;
>>  }
>>  
>> @@ -448,6 +560,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
>>      s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
>>      s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
>>      s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
>> +    s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_page_hinting);
>>  
>>      reset_stats(s);
>>  }
>> @@ -491,6 +604,8 @@ static void virtio_balloon_instance_init(Object *obj)
>>  
>>      object_property_add(obj, "guest-stats", "guest statistics",
>>                          balloon_stats_get_all, NULL, NULL, s, NULL);
>> +    object_property_add(obj, "guest-page-hinting", "guest page hinting",
>> +                        NULL, NULL, NULL, s, NULL);
>>  
>>      object_property_add(obj, "guest-stats-polling-interval", "int",
>>                          balloon_stats_get_poll_interval,
>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>> index 1debb01..11ddc13 100644
>> --- a/hw/virtio/virtio.c
>> +++ b/hw/virtio/virtio.c
>> @@ -825,6 +825,29 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_nu
>>      return elem;
>>  }
>>  
>> +void pop_hinting_addr(VirtQueue *vq, uint64_t *addr, uint32_t *len)
>> +{
>> +   VRingMemoryRegionCaches *caches;
>> +   VRingDesc desc;
>> +   MemoryRegionCache *desc_cache;
>> +   VirtIODevice *vdev = vq->vdev;
>> +   unsigned int head, max;
>> +
>> +   max = vq->vring.num;
>> +   if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) {
>> +	printf("\n%d:%sError: Unable to read head\n", __LINE__, __func__);
>> +   }
>> +
>> +   caches = vring_get_region_caches(vq);
>> +   if (caches->desc.len < max * sizeof(VRingDesc)) {
>> +       virtio_error(vdev, "Cannot map descriptor ring");
>> +   }
> Should you really continue on error?
No, I should not. I will make the change.
>
>> +   desc_cache = &caches->desc;
>> +   vring_desc_read(vdev, &desc, desc_cache, head);
>> +   *addr = desc.addr;
>> +   *len = desc.len;
>> +}
>> +
>>  void *virtqueue_pop(VirtQueue *vq, size_t sz)
>>  {
>>      unsigned int i, head, max;
>> diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
>> index 2e92074..568d71f 100644
>> --- a/include/hw/virtio/virtio-access.h
>> +++ b/include/hw/virtio/virtio-access.h
>> @@ -24,6 +24,7 @@
>>  #define LEGACY_VIRTIO_IS_BIENDIAN 1
>>  #endif
>>  
>> +void pop_hinting_addr(VirtQueue *vq, uint64_t *addr, uint32_t *len);
>>  static inline bool virtio_access_is_big_endian(VirtIODevice *vdev)
>>  {
>>  #if defined(LEGACY_VIRTIO_IS_BIENDIAN)
>> diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h
>> index e0df352..774498a 100644
>> --- a/include/hw/virtio/virtio-balloon.h
>> +++ b/include/hw/virtio/virtio-balloon.h
>> @@ -32,7 +32,7 @@ typedef struct virtio_balloon_stat_modern {
>>  
>>  typedef struct VirtIOBalloon {
>>      VirtIODevice parent_obj;
>> -    VirtQueue *ivq, *dvq, *svq;
>> +    VirtQueue *ivq, *dvq, *svq, *hvq;
>>      uint32_t num_pages;
>>      uint32_t actual;
>>      uint64_t stats[VIRTIO_BALLOON_S_NR];
>> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
>> index 4165806..dc62f17 100644
>> --- a/include/qemu/osdep.h
>> +++ b/include/qemu/osdep.h
>> @@ -305,6 +305,11 @@ void qemu_anon_ram_free(void *ptr, size_t size);
>>  #else
>>  #define QEMU_MADV_REMOVE QEMU_MADV_INVALID
>>  #endif
>> +#ifdef MADV_FREE
>> +#define QEMU_MADV_FREE MADV_FREE
>> +#else
>> +#define QEMU_MADV_FREE QEMU_MADV_INVALID
>> +#endif
>>  
>>  #elif defined(CONFIG_POSIX_MADVISE)
>>  
>> @@ -318,6 +323,7 @@ void qemu_anon_ram_free(void *ptr, size_t size);
>>  #define QEMU_MADV_HUGEPAGE  QEMU_MADV_INVALID
>>  #define QEMU_MADV_NOHUGEPAGE  QEMU_MADV_INVALID
>>  #define QEMU_MADV_REMOVE QEMU_MADV_INVALID
>> +#define QEMU_MADV_FREE QEMU_MADV_INVALID
>>  
>>  #else /* no-op */
>>  
>> @@ -331,6 +337,7 @@ void qemu_anon_ram_free(void *ptr, size_t size);
>>  #define QEMU_MADV_HUGEPAGE  QEMU_MADV_INVALID
>>  #define QEMU_MADV_NOHUGEPAGE  QEMU_MADV_INVALID
>>  #define QEMU_MADV_REMOVE QEMU_MADV_INVALID
>> +#define QEMU_MADV_FREE QEMU_MADV_INVALID
>>  
>>  #endif
>>  
>> diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
>> index e446805..1372da1 100644
>> --- a/include/standard-headers/linux/virtio_balloon.h
>> +++ b/include/standard-headers/linux/virtio_balloon.h
>> @@ -34,6 +34,7 @@
>>  #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
>>  #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
>>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>> +#define VIRTIO_GUEST_PAGE_HINTING_VQ	3 /* Page hinting virtqueue */
>>  
>>  /* Size of a PFN in the balloon interface. */
>>  #define VIRTIO_BALLOON_PFN_SHIFT 12
diff mbox

Patch

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 1f7a87f..46deacb 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -34,6 +34,8 @@ 
 
 #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
 
+void page_hinting_request(uint64_t addr, uint32_t len);
+
 static void balloon_page(void *addr, int deflate)
 {
     if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
@@ -77,11 +79,22 @@  static bool balloon_stats_supported(const VirtIOBalloon *s)
     return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
 }
 
+static bool balloon_hinting_supported(const VirtIOBalloon *s)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    return virtio_vdev_has_feature(vdev, VIRTIO_GUEST_PAGE_HINTING_VQ);
+}
+
 static bool balloon_stats_enabled(const VirtIOBalloon *s)
 {
     return s->stats_poll_interval > 0;
 }
 
+static bool page_hinting_enabled(const VirtIOBalloon *s)
+{
+    return s->stats_poll_interval > 0;
+}
+
 static void balloon_stats_destroy_timer(VirtIOBalloon *s)
 {
     if (balloon_stats_enabled(s)) {
@@ -97,14 +110,20 @@  static void balloon_stats_change_timer(VirtIOBalloon *s, int64_t secs)
     timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000);
 }
 
+static void page_hinting_change_timer(VirtIOBalloon *s, int64_t secs)
+{
+    timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000);
+}
+
 static void balloon_stats_poll_cb(void *opaque)
 {
     VirtIOBalloon *s = opaque;
     VirtIODevice *vdev = VIRTIO_DEVICE(s);
 
-    if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) {
+    if (s->stats_vq_elem == NULL || !balloon_stats_supported(s) || !balloon_hinting_supported(s)) {
         /* re-schedule */
         balloon_stats_change_timer(s, s->stats_poll_interval);
+        page_hinting_change_timer(s, s->stats_poll_interval);
         return;
     }
 
@@ -201,12 +220,101 @@  static void balloon_stats_set_poll_interval(Object *obj, Visitor *v,
         balloon_stats_change_timer(s, value);
         return;
     }
+    
+    if (page_hinting_enabled(s)) {
+        /* timer interval change */
+        s->stats_poll_interval = value;
+        page_hinting_change_timer(s, value);
+        return;
+    }
 
     /* create a new timer */
     g_assert(s->stats_timer == NULL);
     s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s);
     s->stats_poll_interval = value;
     balloon_stats_change_timer(s, 0);
+    /* create a new timer */
+    g_assert(s->stats_timer == NULL);
+    s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s);
+    s->stats_poll_interval = value;
+    page_hinting_change_timer(s, 0);
+}
+
+static void *gpa2hva(MemoryRegion **p_mr, hwaddr addr, Error **errp)
+{
+    MemoryRegionSection mrs = memory_region_find(get_system_memory(),
+                                                 addr, 1);
+
+    if (!mrs.mr) {
+        error_setg(errp, "No memory is mapped at address 0x%" HWADDR_PRIx, addr);
+        return NULL;
+    }
+
+    if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) {
+        error_setg(errp, "Memory at address 0x%" HWADDR_PRIx "is not RAM", addr);
+        memory_region_unref(mrs.mr);
+        return NULL;
+    }
+
+    *p_mr = mrs.mr;
+    return qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region);
+}
+
+struct guest_pages {
+	unsigned long pfn;
+	unsigned int pages;
+};
+
+
+void page_hinting_request(uint64_t addr, uint32_t len)
+{
+    Error *local_err = NULL;
+    MemoryRegion *mr = NULL;
+    void *hvaddr;
+    int ret = 0;
+    struct guest_pages *guest_obj;
+    int i = 0;
+    void *hvaddr_to_free;
+    unsigned long pfn, pfn_end;
+    uint64_t gpaddr_to_free;
+
+    hvaddr = gpa2hva(&mr, addr, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        return;
+    }
+    guest_obj = hvaddr;
+
+    while (i < len) {
+        pfn = guest_obj[i].pfn;
+	pfn_end = guest_obj[i].pfn + guest_obj[i].pages - 1;
+	while (pfn <= pfn_end) {
+	        gpaddr_to_free = pfn << VIRTIO_BALLOON_PFN_SHIFT;
+	        hvaddr_to_free = gpa2hva(&mr, gpaddr_to_free, &local_err);
+	        if (local_err) {
+			error_report_err(local_err);
+		        return;
+		}
+		ret = qemu_madvise((void *)hvaddr_to_free, 4096, QEMU_MADV_FREE);
+		if (ret == -1)
+		    printf("\n%d:%s Error: Madvise failed with error:%d\n", __LINE__, __func__, ret);
+		pfn++;
+	}
+	i++;
+    }
+}
+
+
+static void virtio_balloon_page_hinting(VirtIODevice *vdev, VirtQueue *vq)
+{
+    uint64_t addr;
+    uint32_t len;
+    VirtQueueElement elem = {};
+
+    pop_hinting_addr(vq, &addr, &len);
+    page_hinting_request(addr, len);
+    virtqueue_push(vq, &elem, 0);
+    virtio_notify(vdev, vq);
 }
 
 static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
@@ -379,6 +487,7 @@  static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
     VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
     f |= dev->host_features;
     virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
+    virtio_add_feature(&f, VIRTIO_GUEST_PAGE_HINTING_VQ);
     return f;
 }
 
@@ -412,6 +521,9 @@  static int virtio_balloon_post_load_device(void *opaque, int version_id)
     if (balloon_stats_enabled(s)) {
         balloon_stats_change_timer(s, s->stats_poll_interval);
     }
+    if (page_hinting_enabled(s)) {
+        page_hinting_change_timer(s, s->stats_poll_interval);
+    }
     return 0;
 }
 
@@ -448,6 +560,7 @@  static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
     s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
     s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
     s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
+    s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_page_hinting);
 
     reset_stats(s);
 }
@@ -491,6 +604,8 @@  static void virtio_balloon_instance_init(Object *obj)
 
     object_property_add(obj, "guest-stats", "guest statistics",
                         balloon_stats_get_all, NULL, NULL, s, NULL);
+    object_property_add(obj, "guest-page-hinting", "guest page hinting",
+                        NULL, NULL, NULL, s, NULL);
 
     object_property_add(obj, "guest-stats-polling-interval", "int",
                         balloon_stats_get_poll_interval,
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 1debb01..11ddc13 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -825,6 +825,29 @@  static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_nu
     return elem;
 }
 
+void pop_hinting_addr(VirtQueue *vq, uint64_t *addr, uint32_t *len)
+{
+   VRingMemoryRegionCaches *caches;
+   VRingDesc desc;
+   MemoryRegionCache *desc_cache;
+   VirtIODevice *vdev = vq->vdev;
+   unsigned int head, max;
+
+   max = vq->vring.num;
+   if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) {
+	printf("\n%d:%sError: Unable to read head\n", __LINE__, __func__);
+   }
+
+   caches = vring_get_region_caches(vq);
+   if (caches->desc.len < max * sizeof(VRingDesc)) {
+       virtio_error(vdev, "Cannot map descriptor ring");
+   }
+   desc_cache = &caches->desc;
+   vring_desc_read(vdev, &desc, desc_cache, head);
+   *addr = desc.addr;
+   *len = desc.len;
+}
+
 void *virtqueue_pop(VirtQueue *vq, size_t sz)
 {
     unsigned int i, head, max;
diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
index 2e92074..568d71f 100644
--- a/include/hw/virtio/virtio-access.h
+++ b/include/hw/virtio/virtio-access.h
@@ -24,6 +24,7 @@ 
 #define LEGACY_VIRTIO_IS_BIENDIAN 1
 #endif
 
+void pop_hinting_addr(VirtQueue *vq, uint64_t *addr, uint32_t *len);
 static inline bool virtio_access_is_big_endian(VirtIODevice *vdev)
 {
 #if defined(LEGACY_VIRTIO_IS_BIENDIAN)
diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h
index e0df352..774498a 100644
--- a/include/hw/virtio/virtio-balloon.h
+++ b/include/hw/virtio/virtio-balloon.h
@@ -32,7 +32,7 @@  typedef struct virtio_balloon_stat_modern {
 
 typedef struct VirtIOBalloon {
     VirtIODevice parent_obj;
-    VirtQueue *ivq, *dvq, *svq;
+    VirtQueue *ivq, *dvq, *svq, *hvq;
     uint32_t num_pages;
     uint32_t actual;
     uint64_t stats[VIRTIO_BALLOON_S_NR];
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 4165806..dc62f17 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -305,6 +305,11 @@  void qemu_anon_ram_free(void *ptr, size_t size);
 #else
 #define QEMU_MADV_REMOVE QEMU_MADV_INVALID
 #endif
+#ifdef MADV_FREE
+#define QEMU_MADV_FREE MADV_FREE
+#else
+#define QEMU_MADV_FREE QEMU_MADV_INVALID
+#endif
 
 #elif defined(CONFIG_POSIX_MADVISE)
 
@@ -318,6 +323,7 @@  void qemu_anon_ram_free(void *ptr, size_t size);
 #define QEMU_MADV_HUGEPAGE  QEMU_MADV_INVALID
 #define QEMU_MADV_NOHUGEPAGE  QEMU_MADV_INVALID
 #define QEMU_MADV_REMOVE QEMU_MADV_INVALID
+#define QEMU_MADV_FREE QEMU_MADV_INVALID
 
 #else /* no-op */
 
@@ -331,6 +337,7 @@  void qemu_anon_ram_free(void *ptr, size_t size);
 #define QEMU_MADV_HUGEPAGE  QEMU_MADV_INVALID
 #define QEMU_MADV_NOHUGEPAGE  QEMU_MADV_INVALID
 #define QEMU_MADV_REMOVE QEMU_MADV_INVALID
+#define QEMU_MADV_FREE QEMU_MADV_INVALID
 
 #endif
 
diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
index e446805..1372da1 100644
--- a/include/standard-headers/linux/virtio_balloon.h
+++ b/include/standard-headers/linux/virtio_balloon.h
@@ -34,6 +34,7 @@ 
 #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
 #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
+#define VIRTIO_GUEST_PAGE_HINTING_VQ	3 /* Page hinting virtqueue */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12