diff mbox series

[v14,6/6] virtio-balloon: Add support for providing unused page reports to host

Message ID 20191119214653.24996.90695.stgit@localhost.localdomain (mailing list archive)
State New, archived
Headers show
Series mm / virtio: Provide support for unused page reporting | expand

Commit Message

Alexander Duyck Nov. 19, 2019, 9:46 p.m. UTC
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>

Add support for the page reporting feature provided by virtio-balloon.
Reporting differs from the regular balloon functionality in that is is
much less durable than a standard memory balloon. Instead of creating a
list of pages that cannot be accessed the pages are only inaccessible
while they are being indicated to the virtio interface. Once the
interface has acknowledged them they are placed back into their respective
free lists and are once again accessible by the guest system.

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
---
 drivers/virtio/Kconfig              |    1 +
 drivers/virtio/virtio_balloon.c     |   65 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/virtio_balloon.h |    1 +
 3 files changed, 67 insertions(+)

Comments

David Hildenbrand Nov. 28, 2019, 3:25 p.m. UTC | #1
On 19.11.19 22:46, Alexander Duyck wrote:
> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> 
> Add support for the page reporting feature provided by virtio-balloon.
> Reporting differs from the regular balloon functionality in that is is
> much less durable than a standard memory balloon. Instead of creating a
> list of pages that cannot be accessed the pages are only inaccessible
> while they are being indicated to the virtio interface. Once the
> interface has acknowledged them they are placed back into their respective
> free lists and are once again accessible by the guest system.

Maybe add something like "In contrast to ordinary balloon
inflation/deflation, the guest can reuse all reported pages immediately
after reporting has finished, without having to notify the hypervisor
about it (e.g., VIRTIO_BALLOON_F_MUST_TELL_HOST does not apply)."

[...]

>  /*
>   * Balloon device works in 4K page units.  So each page is pointed to by
> @@ -37,6 +38,9 @@
>  #define VIRTIO_BALLOON_FREE_PAGE_SIZE \
>  	(1 << (VIRTIO_BALLOON_FREE_PAGE_ORDER + PAGE_SHIFT))
>  
> +/*  limit on the number of pages that can be on the reporting vq */
> +#define VIRTIO_BALLOON_VRING_HINTS_MAX	16

Maybe rename that from HINTS to REPORTS

> +
>  #ifdef CONFIG_BALLOON_COMPACTION
>  static struct vfsmount *balloon_mnt;
>  #endif
> @@ -46,6 +50,7 @@ enum virtio_balloon_vq {
>  	VIRTIO_BALLOON_VQ_DEFLATE,
>  	VIRTIO_BALLOON_VQ_STATS,
>  	VIRTIO_BALLOON_VQ_FREE_PAGE,
> +	VIRTIO_BALLOON_VQ_REPORTING,
>  	VIRTIO_BALLOON_VQ_MAX
>  };
>  
> @@ -113,6 +118,10 @@ struct virtio_balloon {
>  
>  	/* To register a shrinker to shrink memory upon memory pressure */
>  	struct shrinker shrinker;
> +
> +	/* Unused page reporting device */

Sounds like the device is unused :D

"Device info for reporting unused pages" ?

I am in general wondering, should we rename "unused" to "free". I.e.,
"free page reporting" instead of "unused page reporting"? Or what was
the motivation behind using "unused" ?

> +	struct virtqueue *reporting_vq;
> +	struct page_reporting_dev_info pr_dev_info;
>  };
>  
>  static struct virtio_device_id id_table[] = {
> @@ -152,6 +161,32 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
>  
>  }
>  
> +void virtballoon_unused_page_report(struct page_reporting_dev_info *pr_dev_info,
> +				    unsigned int nents)
> +{
> +	struct virtio_balloon *vb =
> +		container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
> +	struct virtqueue *vq = vb->reporting_vq;
> +	unsigned int unused, err;
> +
> +	/* We should always be able to add these buffers to an empty queue. */

This comment somewhat contradicts the error handling (and comment)
below. Maybe just drop it?

> +	err = virtqueue_add_inbuf(vq, pr_dev_info->sg, nents, vb,
> +				  GFP_NOWAIT | __GFP_NOWARN);
> +
> +	/*
> +	 * In the extremely unlikely case that something has changed and we
> +	 * are able to trigger an error we will simply display a warning
> +	 * and exit without actually processing the pages.
> +	 */
> +	if (WARN_ON(err))
> +		return;

Maybe WARN_ON_ONCE? (to not flood the log on recurring errors)

> +
> +	virtqueue_kick(vq);
> +
> +	/* When host has read buffer, this completes via balloon_ack */
> +	wait_event(vb->acked, virtqueue_get_buf(vq, &unused));

Is it safe to rely on the same ack-ing mechanism as the inflate/deflate
queue? What if both mechanisms are used concurrently and race/both wait
for the hypervisor?

Maybe we need a separate vb->acked + callback function.

> +}
> +
>  static void set_page_pfns(struct virtio_balloon *vb,
>  			  __virtio32 pfns[], struct page *page)
>  {
> @@ -476,6 +511,7 @@ static int init_vqs(struct virtio_balloon *vb)
>  	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
>  	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
>  	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> +	names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
>  
>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
>  		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> @@ -487,11 +523,19 @@ static int init_vqs(struct virtio_balloon *vb)
>  		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
>  	}
>  
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
> +		names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq";
> +		callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
> +	}
> +
>  	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
>  					 vqs, callbacks, names, NULL, NULL);
>  	if (err)
>  		return err;
>  
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> +		vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
> +

I'd register these in the same order they are defined (IOW, move this
further down)

>  	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
>  	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> @@ -932,12 +976,30 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  		if (err)
>  			goto out_del_balloon_wq;
>  	}
> +
> +	vb->pr_dev_info.report = virtballoon_unused_page_report;
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
> +		unsigned int capacity;
> +
> +		capacity = min_t(unsigned int,
> +				 virtqueue_get_vring_size(vb->reporting_vq),
> +				 VIRTIO_BALLOON_VRING_HINTS_MAX);
> +		vb->pr_dev_info.capacity = capacity;
> +
> +		err = page_reporting_register(&vb->pr_dev_info);
> +		if (err)
> +			goto out_unregister_shrinker;
> +	}

It can happen here that we start reporting before marking the device
ready. Can that be problematic?

Maybe we have to ignore any reports in virtballoon_unused_page_report()
until ready...

> +
>  	virtio_device_ready(vdev);
>  
>  	if (towards_target(vb))
>  		virtballoon_changed(vdev);
>  	return 0;
>  
> +out_unregister_shrinker:
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> +		virtio_balloon_unregister_shrinker(vb);

A sync is done implicitly, right? So after this call, we won't get any
new callbacks/are stuck in a callback.

>  out_del_balloon_wq:
>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
>  		destroy_workqueue(vb->balloon_wq);
> @@ -966,6 +1028,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
>  {
>  	struct virtio_balloon *vb = vdev->priv;
>  
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> +		page_reporting_unregister(&vb->pr_dev_info);

Dito, same question regarding syncs.

>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
>  		virtio_balloon_unregister_shrinker(vb);
>  	spin_lock_irq(&vb->stop_update_lock);
> @@ -1038,6 +1102,7 @@ static int virtballoon_validate(struct virtio_device *vdev)
>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
>  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>  	VIRTIO_BALLOON_F_PAGE_POISON,
> +	VIRTIO_BALLOON_F_REPORTING,
>  };
>  
>  static struct virtio_driver virtio_balloon_driver = {
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index a1966cd7b677..19974392d324 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -36,6 +36,7 @@
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> +#define VIRTIO_BALLOON_F_REPORTING	5 /* Page reporting virtqueue */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
> 
> 

Small and powerful patch :)
Michael S. Tsirkin Nov. 28, 2019, 5 p.m. UTC | #2
On Thu, Nov 28, 2019 at 04:25:54PM +0100, David Hildenbrand wrote:
> On 19.11.19 22:46, Alexander Duyck wrote:
> > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > 
> > Add support for the page reporting feature provided by virtio-balloon.
> > Reporting differs from the regular balloon functionality in that is is
> > much less durable than a standard memory balloon. Instead of creating a
> > list of pages that cannot be accessed the pages are only inaccessible
> > while they are being indicated to the virtio interface. Once the
> > interface has acknowledged them they are placed back into their respective
> > free lists and are once again accessible by the guest system.
> 
> Maybe add something like "In contrast to ordinary balloon
> inflation/deflation, the guest can reuse all reported pages immediately
> after reporting has finished, without having to notify the hypervisor
> about it (e.g., VIRTIO_BALLOON_F_MUST_TELL_HOST does not apply)."

Maybe we can make apply. The effect of reporting a page is effectively
putting it in a balloon then immediately taking it out. Maybe without
VIRTIO_BALLOON_F_MUST_TELL_HOST the pages can be reused before host
marked buffers used?

We didn't teach existing page hinting to behave like this, but maybe we
should, and maybe it's not too late, not a long time passed
since it was merged, and the whole shrinker based thing
seems to have been broken ...


BTW generally UAPI patches will have to be sent to virtio-dev
mailing list before they are merged.

> [...]
> 
> >  /*
> >   * Balloon device works in 4K page units.  So each page is pointed to by
> > @@ -37,6 +38,9 @@
> >  #define VIRTIO_BALLOON_FREE_PAGE_SIZE \
> >  	(1 << (VIRTIO_BALLOON_FREE_PAGE_ORDER + PAGE_SHIFT))
> >  
> > +/*  limit on the number of pages that can be on the reporting vq */
> > +#define VIRTIO_BALLOON_VRING_HINTS_MAX	16
> 
> Maybe rename that from HINTS to REPORTS
> 
> > +
> >  #ifdef CONFIG_BALLOON_COMPACTION
> >  static struct vfsmount *balloon_mnt;
> >  #endif
> > @@ -46,6 +50,7 @@ enum virtio_balloon_vq {
> >  	VIRTIO_BALLOON_VQ_DEFLATE,
> >  	VIRTIO_BALLOON_VQ_STATS,
> >  	VIRTIO_BALLOON_VQ_FREE_PAGE,
> > +	VIRTIO_BALLOON_VQ_REPORTING,
> >  	VIRTIO_BALLOON_VQ_MAX
> >  };
> >  
> > @@ -113,6 +118,10 @@ struct virtio_balloon {
> >  
> >  	/* To register a shrinker to shrink memory upon memory pressure */
> >  	struct shrinker shrinker;
> > +
> > +	/* Unused page reporting device */
> 
> Sounds like the device is unused :D
> 
> "Device info for reporting unused pages" ?
> 
> I am in general wondering, should we rename "unused" to "free". I.e.,
> "free page reporting" instead of "unused page reporting"? Or what was
> the motivation behind using "unused" ?
> 
> > +	struct virtqueue *reporting_vq;
> > +	struct page_reporting_dev_info pr_dev_info;
> >  };
> >  
> >  static struct virtio_device_id id_table[] = {
> > @@ -152,6 +161,32 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
> >  
> >  }
> >  
> > +void virtballoon_unused_page_report(struct page_reporting_dev_info *pr_dev_info,
> > +				    unsigned int nents)
> > +{
> > +	struct virtio_balloon *vb =
> > +		container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
> > +	struct virtqueue *vq = vb->reporting_vq;
> > +	unsigned int unused, err;
> > +
> > +	/* We should always be able to add these buffers to an empty queue. */
> 
> This comment somewhat contradicts the error handling (and comment)
> below. Maybe just drop it?
> 
> > +	err = virtqueue_add_inbuf(vq, pr_dev_info->sg, nents, vb,
> > +				  GFP_NOWAIT | __GFP_NOWARN);
> > +
> > +	/*
> > +	 * In the extremely unlikely case that something has changed and we
> > +	 * are able to trigger an error we will simply display a warning
> > +	 * and exit without actually processing the pages.
> > +	 */
> > +	if (WARN_ON(err))
> > +		return;
> 
> Maybe WARN_ON_ONCE? (to not flood the log on recurring errors)
> 
> > +
> > +	virtqueue_kick(vq);
> > +
> > +	/* When host has read buffer, this completes via balloon_ack */
> > +	wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
> 
> Is it safe to rely on the same ack-ing mechanism as the inflate/deflate
> queue? What if both mechanisms are used concurrently and race/both wait
> for the hypervisor?
> 
> Maybe we need a separate vb->acked + callback function.
> 
> > +}
> > +
> >  static void set_page_pfns(struct virtio_balloon *vb,
> >  			  __virtio32 pfns[], struct page *page)
> >  {
> > @@ -476,6 +511,7 @@ static int init_vqs(struct virtio_balloon *vb)
> >  	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> >  	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> >  	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> > +	names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
> >  
> >  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >  		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> > @@ -487,11 +523,19 @@ static int init_vqs(struct virtio_balloon *vb)
> >  		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >  	}
> >  
> > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
> > +		names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq";
> > +		callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
> > +	}
> > +
> >  	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> >  					 vqs, callbacks, names, NULL, NULL);
> >  	if (err)
> >  		return err;
> >  
> > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> > +		vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
> > +
> 
> I'd register these in the same order they are defined (IOW, move this
> further down)
> 
> >  	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> >  	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> >  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> > @@ -932,12 +976,30 @@ static int virtballoon_probe(struct virtio_device *vdev)
> >  		if (err)
> >  			goto out_del_balloon_wq;
> >  	}
> > +
> > +	vb->pr_dev_info.report = virtballoon_unused_page_report;
> > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
> > +		unsigned int capacity;
> > +
> > +		capacity = min_t(unsigned int,
> > +				 virtqueue_get_vring_size(vb->reporting_vq),
> > +				 VIRTIO_BALLOON_VRING_HINTS_MAX);
> > +		vb->pr_dev_info.capacity = capacity;
> > +
> > +		err = page_reporting_register(&vb->pr_dev_info);
> > +		if (err)
> > +			goto out_unregister_shrinker;
> > +	}
> 
> It can happen here that we start reporting before marking the device
> ready. Can that be problematic?
> 
> Maybe we have to ignore any reports in virtballoon_unused_page_report()
> until ready...
> 
> > +
> >  	virtio_device_ready(vdev);
> >  
> >  	if (towards_target(vb))
> >  		virtballoon_changed(vdev);
> >  	return 0;
> >  
> > +out_unregister_shrinker:
> > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> > +		virtio_balloon_unregister_shrinker(vb);
> 
> A sync is done implicitly, right? So after this call, we won't get any
> new callbacks/are stuck in a callback.
> 
> >  out_del_balloon_wq:
> >  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
> >  		destroy_workqueue(vb->balloon_wq);
> > @@ -966,6 +1028,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
> >  {
> >  	struct virtio_balloon *vb = vdev->priv;
> >  
> > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> > +		page_reporting_unregister(&vb->pr_dev_info);
> 
> Dito, same question regarding syncs.
> 
> >  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> >  		virtio_balloon_unregister_shrinker(vb);
> >  	spin_lock_irq(&vb->stop_update_lock);
> > @@ -1038,6 +1102,7 @@ static int virtballoon_validate(struct virtio_device *vdev)
> >  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> >  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> >  	VIRTIO_BALLOON_F_PAGE_POISON,
> > +	VIRTIO_BALLOON_F_REPORTING,
> >  };
> >  
> >  static struct virtio_driver virtio_balloon_driver = {
> > diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> > index a1966cd7b677..19974392d324 100644
> > --- a/include/uapi/linux/virtio_balloon.h
> > +++ b/include/uapi/linux/virtio_balloon.h
> > @@ -36,6 +36,7 @@
> >  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
> >  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
> >  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> > +#define VIRTIO_BALLOON_F_REPORTING	5 /* Page reporting virtqueue */
> >  
> >  /* Size of a PFN in the balloon interface. */
> >  #define VIRTIO_BALLOON_PFN_SHIFT 12
> > 
> > 
> 
> Small and powerful patch :)
> 
> -- 
> Thanks,
> 
> David / dhildenb
Alexander Duyck Nov. 29, 2019, 9:13 p.m. UTC | #3
On Thu, Nov 28, 2019 at 7:26 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 19.11.19 22:46, Alexander Duyck wrote:
> > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> >
> > Add support for the page reporting feature provided by virtio-balloon.
> > Reporting differs from the regular balloon functionality in that is is
> > much less durable than a standard memory balloon. Instead of creating a
> > list of pages that cannot be accessed the pages are only inaccessible
> > while they are being indicated to the virtio interface. Once the
> > interface has acknowledged them they are placed back into their respective
> > free lists and are once again accessible by the guest system.
>
> Maybe add something like "In contrast to ordinary balloon
> inflation/deflation, the guest can reuse all reported pages immediately
> after reporting has finished, without having to notify the hypervisor
> about it (e.g., VIRTIO_BALLOON_F_MUST_TELL_HOST does not apply)."

Okay. I'll make a note of it for next version.

> [...]
>
> >  /*
> >   * Balloon device works in 4K page units.  So each page is pointed to by
> > @@ -37,6 +38,9 @@
> >  #define VIRTIO_BALLOON_FREE_PAGE_SIZE \
> >       (1 << (VIRTIO_BALLOON_FREE_PAGE_ORDER + PAGE_SHIFT))
> >
> > +/*  limit on the number of pages that can be on the reporting vq */
> > +#define VIRTIO_BALLOON_VRING_HINTS_MAX       16
>
> Maybe rename that from HINTS to REPORTS

I'll fix it for the next version.

> > +
> >  #ifdef CONFIG_BALLOON_COMPACTION
> >  static struct vfsmount *balloon_mnt;
> >  #endif
> > @@ -46,6 +50,7 @@ enum virtio_balloon_vq {
> >       VIRTIO_BALLOON_VQ_DEFLATE,
> >       VIRTIO_BALLOON_VQ_STATS,
> >       VIRTIO_BALLOON_VQ_FREE_PAGE,
> > +     VIRTIO_BALLOON_VQ_REPORTING,
> >       VIRTIO_BALLOON_VQ_MAX
> >  };
> >
> > @@ -113,6 +118,10 @@ struct virtio_balloon {
> >
> >       /* To register a shrinker to shrink memory upon memory pressure */
> >       struct shrinker shrinker;
> > +
> > +     /* Unused page reporting device */
>
> Sounds like the device is unused :D
>
> "Device info for reporting unused pages" ?
>
> I am in general wondering, should we rename "unused" to "free". I.e.,
> "free page reporting" instead of "unused page reporting"? Or what was
> the motivation behind using "unused" ?

I honestly don't remember why I chose "unused" at this point. I can
switch over to "free" if that is what is preferred.

Looking over the code a bit more I suspect the reason for avoiding it
is because free page hinting also mentioned reporting in a few spots.

> > +     struct virtqueue *reporting_vq;
> > +     struct page_reporting_dev_info pr_dev_info;
> >  };
> >
> >  static struct virtio_device_id id_table[] = {
> > @@ -152,6 +161,32 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
> >
> >  }
> >
> > +void virtballoon_unused_page_report(struct page_reporting_dev_info *pr_dev_info,
> > +                                 unsigned int nents)
> > +{
> > +     struct virtio_balloon *vb =
> > +             container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
> > +     struct virtqueue *vq = vb->reporting_vq;
> > +     unsigned int unused, err;
> > +
> > +     /* We should always be able to add these buffers to an empty queue. */
>
> This comment somewhat contradicts the error handling (and comment)
> below. Maybe just drop it?
>
> > +     err = virtqueue_add_inbuf(vq, pr_dev_info->sg, nents, vb,
> > +                               GFP_NOWAIT | __GFP_NOWARN);
> > +
> > +     /*
> > +      * In the extremely unlikely case that something has changed and we
> > +      * are able to trigger an error we will simply display a warning
> > +      * and exit without actually processing the pages.
> > +      */
> > +     if (WARN_ON(err))
> > +             return;
>
> Maybe WARN_ON_ONCE? (to not flood the log on recurring errors)

Actually I might need to tweak things here a bit. It occurs to me that
this can fail for more than just there not being space in the ring. I
forgot that DMA mapping needs to also occur so in the case of a DMA
mapping failure we would also see an error.

I probably will switch it to a WARN_ON_ONCE. I may also need to add a
return value to the function so that we can indicate that an entire
batch has failed and that we need to abort.

> > +
> > +     virtqueue_kick(vq);
> > +
> > +     /* When host has read buffer, this completes via balloon_ack */
> > +     wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
>
> Is it safe to rely on the same ack-ing mechanism as the inflate/deflate
> queue? What if both mechanisms are used concurrently and race/both wait
> for the hypervisor?
>
> Maybe we need a separate vb->acked + callback function.

So if I understand correctly what is actually happening is that the
wait event is simply a trigger that will wake us up, and at that point
we check to see if the buffer we submitted is done. If not we go back
to sleep. As such all we are really waiting on is the notification
that the buffers we submitted have been processed. So it is using the
same function but on a different virtual queue.

> > +}
> > +
> >  static void set_page_pfns(struct virtio_balloon *vb,
> >                         __virtio32 pfns[], struct page *page)
> >  {
> > @@ -476,6 +511,7 @@ static int init_vqs(struct virtio_balloon *vb)
> >       names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> >       names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> >       names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> > +     names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
> >
> >       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >               names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> > @@ -487,11 +523,19 @@ static int init_vqs(struct virtio_balloon *vb)
> >               callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >       }
> >
> > +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
> > +             names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq";
> > +             callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
> > +     }
> > +
> >       err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> >                                        vqs, callbacks, names, NULL, NULL);
> >       if (err)
> >               return err;
> >
> > +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> > +             vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
> > +
>
> I'd register these in the same order they are defined (IOW, move this
> further down)

done.

> >       vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> >       vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> >       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> > @@ -932,12 +976,30 @@ static int virtballoon_probe(struct virtio_device *vdev)
> >               if (err)
> >                       goto out_del_balloon_wq;
> >       }
> > +
> > +     vb->pr_dev_info.report = virtballoon_unused_page_report;
> > +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
> > +             unsigned int capacity;
> > +
> > +             capacity = min_t(unsigned int,
> > +                              virtqueue_get_vring_size(vb->reporting_vq),
> > +                              VIRTIO_BALLOON_VRING_HINTS_MAX);
> > +             vb->pr_dev_info.capacity = capacity;
> > +
> > +             err = page_reporting_register(&vb->pr_dev_info);
> > +             if (err)
> > +                     goto out_unregister_shrinker;
> > +     }
>
> It can happen here that we start reporting before marking the device
> ready. Can that be problematic?
>
> Maybe we have to ignore any reports in virtballoon_unused_page_report()
> until ready...

I don't think there is an issue with us putting buffers on the ring
before it is ready. I think it will just cause our function to sleep.

I'm guessing that is the case since init_vqs will add a buffer to the
stats vq and that happens even earlier in virtballoon_probe.

> > +
> >       virtio_device_ready(vdev);
> >
> >       if (towards_target(vb))
> >               virtballoon_changed(vdev);
> >       return 0;
> >
> > +out_unregister_shrinker:
> > +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> > +             virtio_balloon_unregister_shrinker(vb);
>
> A sync is done implicitly, right? So after this call, we won't get any
> new callbacks/are stuck in a callback.

From what I can tell a read/write semaphore is used in
unregister_shrinker when we delete it from the list so it shouldn't be
an issue.

> >  out_del_balloon_wq:
> >       if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
> >               destroy_workqueue(vb->balloon_wq);
> > @@ -966,6 +1028,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
> >  {
> >       struct virtio_balloon *vb = vdev->priv;
> >
> > +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> > +             page_reporting_unregister(&vb->pr_dev_info);
>
> Dito, same question regarding syncs.

Yes, although for that one I was using pointer deletion, a barrier,
and a cancel_work_sync since I didn't support a list.

> >       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> >               virtio_balloon_unregister_shrinker(vb);
> >       spin_lock_irq(&vb->stop_update_lock);
> > @@ -1038,6 +1102,7 @@ static int virtballoon_validate(struct virtio_device *vdev)
> >       VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> >       VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> >       VIRTIO_BALLOON_F_PAGE_POISON,
> > +     VIRTIO_BALLOON_F_REPORTING,
> >  };
> >
> >  static struct virtio_driver virtio_balloon_driver = {
> > diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> > index a1966cd7b677..19974392d324 100644
> > --- a/include/uapi/linux/virtio_balloon.h
> > +++ b/include/uapi/linux/virtio_balloon.h
> > @@ -36,6 +36,7 @@
> >  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM      2 /* Deflate balloon on OOM */
> >  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT      3 /* VQ to report free pages */
> >  #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */
> > +#define VIRTIO_BALLOON_F_REPORTING   5 /* Page reporting virtqueue */
> >
> >  /* Size of a PFN in the balloon interface. */
> >  #define VIRTIO_BALLOON_PFN_SHIFT 12
> >
> >
>
> Small and powerful patch :)

Agreed. Although we will have to see if we can keep it that way.
Ideally I want to leave this with the ability so specify what size
scatterlist we receive. However if we have to flip it around then it
will force us to add logic for chopping up the scatterlist for
processing in chunks.

Thanks for the review.

- Alex
Michael S. Tsirkin Dec. 1, 2019, 11:46 a.m. UTC | #4
On Fri, Nov 29, 2019 at 01:13:32PM -0800, Alexander Duyck wrote:
> On Thu, Nov 28, 2019 at 7:26 AM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 19.11.19 22:46, Alexander Duyck wrote:
> > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > >
> > > Add support for the page reporting feature provided by virtio-balloon.
> > > Reporting differs from the regular balloon functionality in that is is
> > > much less durable than a standard memory balloon. Instead of creating a
> > > list of pages that cannot be accessed the pages are only inaccessible
> > > while they are being indicated to the virtio interface. Once the
> > > interface has acknowledged them they are placed back into their respective
> > > free lists and are once again accessible by the guest system.
> >
> > Maybe add something like "In contrast to ordinary balloon
> > inflation/deflation, the guest can reuse all reported pages immediately
> > after reporting has finished, without having to notify the hypervisor
> > about it (e.g., VIRTIO_BALLOON_F_MUST_TELL_HOST does not apply)."
> 
> Okay. I'll make a note of it for next version.


VIRTIO_BALLOON_F_MUST_TELL_HOST is IMHO misdocumented.
It states:
	VIRTIO_BALLOON_F_MUST_TELL_HOST (0) Host has to be told before pages from the balloon are
	used.
but really balloon always told host. The difference is in timing,
historically balloon gave up pages before sending the
message and before waiting for the buffer to be used by host.

I think this feature can be the same if we want.


> > [...]
> >
> > >  /*
> > >   * Balloon device works in 4K page units.  So each page is pointed to by
> > > @@ -37,6 +38,9 @@
> > >  #define VIRTIO_BALLOON_FREE_PAGE_SIZE \
> > >       (1 << (VIRTIO_BALLOON_FREE_PAGE_ORDER + PAGE_SHIFT))
> > >
> > > +/*  limit on the number of pages that can be on the reporting vq */
> > > +#define VIRTIO_BALLOON_VRING_HINTS_MAX       16
> >
> > Maybe rename that from HINTS to REPORTS
> 
> I'll fix it for the next version.
> 
> > > +
> > >  #ifdef CONFIG_BALLOON_COMPACTION
> > >  static struct vfsmount *balloon_mnt;
> > >  #endif
> > > @@ -46,6 +50,7 @@ enum virtio_balloon_vq {
> > >       VIRTIO_BALLOON_VQ_DEFLATE,
> > >       VIRTIO_BALLOON_VQ_STATS,
> > >       VIRTIO_BALLOON_VQ_FREE_PAGE,
> > > +     VIRTIO_BALLOON_VQ_REPORTING,
> > >       VIRTIO_BALLOON_VQ_MAX
> > >  };
> > >
> > > @@ -113,6 +118,10 @@ struct virtio_balloon {
> > >
> > >       /* To register a shrinker to shrink memory upon memory pressure */
> > >       struct shrinker shrinker;
> > > +
> > > +     /* Unused page reporting device */
> >
> > Sounds like the device is unused :D
> >
> > "Device info for reporting unused pages" ?
> >
> > I am in general wondering, should we rename "unused" to "free". I.e.,
> > "free page reporting" instead of "unused page reporting"? Or what was
> > the motivation behind using "unused" ?
> 
> I honestly don't remember why I chose "unused" at this point. I can
> switch over to "free" if that is what is preferred.
> 
> Looking over the code a bit more I suspect the reason for avoiding it
> is because free page hinting also mentioned reporting in a few spots.
> 
> > > +     struct virtqueue *reporting_vq;
> > > +     struct page_reporting_dev_info pr_dev_info;
> > >  };
> > >
> > >  static struct virtio_device_id id_table[] = {
> > > @@ -152,6 +161,32 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
> > >
> > >  }
> > >
> > > +void virtballoon_unused_page_report(struct page_reporting_dev_info *pr_dev_info,
> > > +                                 unsigned int nents)
> > > +{
> > > +     struct virtio_balloon *vb =
> > > +             container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
> > > +     struct virtqueue *vq = vb->reporting_vq;
> > > +     unsigned int unused, err;
> > > +
> > > +     /* We should always be able to add these buffers to an empty queue. */
> >
> > This comment somewhat contradicts the error handling (and comment)
> > below. Maybe just drop it?
> >
> > > +     err = virtqueue_add_inbuf(vq, pr_dev_info->sg, nents, vb,
> > > +                               GFP_NOWAIT | __GFP_NOWARN);
> > > +
> > > +     /*
> > > +      * In the extremely unlikely case that something has changed and we
> > > +      * are able to trigger an error we will simply display a warning
> > > +      * and exit without actually processing the pages.
> > > +      */
> > > +     if (WARN_ON(err))
> > > +             return;
> >
> > Maybe WARN_ON_ONCE? (to not flood the log on recurring errors)
> 
> Actually I might need to tweak things here a bit. It occurs to me that
> this can fail for more than just there not being space in the ring. I
> forgot that DMA mapping needs to also occur so in the case of a DMA
> mapping failure we would also see an error.

Balloon assumes DMA mapping is bypassed right now:

static int virtballoon_validate(struct virtio_device *vdev)
{
        if (!page_poisoning_enabled())
                __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON);

        __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM);

^^^^^^^^


        return 0;
}

I don't think it can work with things like a bounce buffer.

> I probably will switch it to a WARN_ON_ONCE. I may also need to add a
> return value to the function so that we can indicate that an entire
> batch has failed and that we need to abort.
> 
> > > +
> > > +     virtqueue_kick(vq);
> > > +
> > > +     /* When host has read buffer, this completes via balloon_ack */
> > > +     wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
> >
> > Is it safe to rely on the same ack-ing mechanism as the inflate/deflate
> > queue? What if both mechanisms are used concurrently and race/both wait
> > for the hypervisor?
> >
> > Maybe we need a separate vb->acked + callback function.
> 
> So if I understand correctly what is actually happening is that the
> wait event is simply a trigger that will wake us up, and at that point
> we check to see if the buffer we submitted is done. If not we go back
> to sleep. As such all we are really waiting on is the notification
> that the buffers we submitted have been processed. So it is using the
> same function but on a different virtual queue.
> 
> > > +}
> > > +
> > >  static void set_page_pfns(struct virtio_balloon *vb,
> > >                         __virtio32 pfns[], struct page *page)
> > >  {
> > > @@ -476,6 +511,7 @@ static int init_vqs(struct virtio_balloon *vb)
> > >       names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> > >       names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> > >       names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> > > +     names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
> > >
> > >       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> > >               names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> > > @@ -487,11 +523,19 @@ static int init_vqs(struct virtio_balloon *vb)
> > >               callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> > >       }
> > >
> > > +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
> > > +             names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq";
> > > +             callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
> > > +     }
> > > +
> > >       err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> > >                                        vqs, callbacks, names, NULL, NULL);
> > >       if (err)
> > >               return err;
> > >
> > > +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> > > +             vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
> > > +
> >
> > I'd register these in the same order they are defined (IOW, move this
> > further down)
> 
> done.
> 
> > >       vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> > >       vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> > >       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> > > @@ -932,12 +976,30 @@ static int virtballoon_probe(struct virtio_device *vdev)
> > >               if (err)
> > >                       goto out_del_balloon_wq;
> > >       }
> > > +
> > > +     vb->pr_dev_info.report = virtballoon_unused_page_report;
> > > +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
> > > +             unsigned int capacity;
> > > +
> > > +             capacity = min_t(unsigned int,
> > > +                              virtqueue_get_vring_size(vb->reporting_vq),
> > > +                              VIRTIO_BALLOON_VRING_HINTS_MAX);
> > > +             vb->pr_dev_info.capacity = capacity;
> > > +
> > > +             err = page_reporting_register(&vb->pr_dev_info);
> > > +             if (err)
> > > +                     goto out_unregister_shrinker;
> > > +     }
> >
> > It can happen here that we start reporting before marking the device
> > ready. Can that be problematic?
> >
> > Maybe we have to ignore any reports in virtballoon_unused_page_report()
> > until ready...
> 
> I don't think there is an issue with us putting buffers on the ring
> before it is ready. I think it will just cause our function to sleep.
> 
> I'm guessing that is the case since init_vqs will add a buffer to the
> stats vq and that happens even earlier in virtballoon_probe.
> 
> > > +
> > >       virtio_device_ready(vdev);
> > >
> > >       if (towards_target(vb))
> > >               virtballoon_changed(vdev);
> > >       return 0;
> > >
> > > +out_unregister_shrinker:
> > > +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> > > +             virtio_balloon_unregister_shrinker(vb);
> >
> > A sync is done implicitly, right? So after this call, we won't get any
> > new callbacks/are stuck in a callback.
> 
> >From what I can tell a read/write semaphore is used in
> unregister_shrinker when we delete it from the list so it shouldn't be
> an issue.
> 
> > >  out_del_balloon_wq:
> > >       if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
> > >               destroy_workqueue(vb->balloon_wq);
> > > @@ -966,6 +1028,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
> > >  {
> > >       struct virtio_balloon *vb = vdev->priv;
> > >
> > > +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> > > +             page_reporting_unregister(&vb->pr_dev_info);
> >
> > Dito, same question regarding syncs.
> 
> Yes, although for that one I was using pointer deletion, a barrier,
> and a cancel_work_sync since I didn't support a list.
> 
> > >       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> > >               virtio_balloon_unregister_shrinker(vb);
> > >       spin_lock_irq(&vb->stop_update_lock);
> > > @@ -1038,6 +1102,7 @@ static int virtballoon_validate(struct virtio_device *vdev)
> > >       VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> > >       VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> > >       VIRTIO_BALLOON_F_PAGE_POISON,
> > > +     VIRTIO_BALLOON_F_REPORTING,
> > >  };
> > >
> > >  static struct virtio_driver virtio_balloon_driver = {
> > > diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> > > index a1966cd7b677..19974392d324 100644
> > > --- a/include/uapi/linux/virtio_balloon.h
> > > +++ b/include/uapi/linux/virtio_balloon.h
> > > @@ -36,6 +36,7 @@
> > >  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM      2 /* Deflate balloon on OOM */
> > >  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT      3 /* VQ to report free pages */
> > >  #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */
> > > +#define VIRTIO_BALLOON_F_REPORTING   5 /* Page reporting virtqueue */
> > >
> > >  /* Size of a PFN in the balloon interface. */
> > >  #define VIRTIO_BALLOON_PFN_SHIFT 12
> > >
> > >
> >
> > Small and powerful patch :)
> 
> Agreed. Although we will have to see if we can keep it that way.
> Ideally I want to leave this with the ability so specify what size
> scatterlist we receive. However if we have to flip it around then it
> will force us to add logic for chopping up the scatterlist for
> processing in chunks.
> 
> Thanks for the review.
> 
> - Alex
Alexander Duyck Dec. 1, 2019, 6:25 p.m. UTC | #5
On Sun, Dec 1, 2019 at 3:46 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Nov 29, 2019 at 01:13:32PM -0800, Alexander Duyck wrote:
> > On Thu, Nov 28, 2019 at 7:26 AM David Hildenbrand <david@redhat.com> wrote:
> > >
> > > On 19.11.19 22:46, Alexander Duyck wrote:
> > > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > >
> > > > Add support for the page reporting feature provided by virtio-balloon.
> > > > Reporting differs from the regular balloon functionality in that is is
> > > > much less durable than a standard memory balloon. Instead of creating a
> > > > list of pages that cannot be accessed the pages are only inaccessible
> > > > while they are being indicated to the virtio interface. Once the
> > > > interface has acknowledged them they are placed back into their respective
> > > > free lists and are once again accessible by the guest system.
> > >
> > > Maybe add something like "In contrast to ordinary balloon
> > > inflation/deflation, the guest can reuse all reported pages immediately
> > > after reporting has finished, without having to notify the hypervisor
> > > about it (e.g., VIRTIO_BALLOON_F_MUST_TELL_HOST does not apply)."
> >
> > Okay. I'll make a note of it for next version.
>
>
> VIRTIO_BALLOON_F_MUST_TELL_HOST is IMHO misdocumented.
> It states:
>         VIRTIO_BALLOON_F_MUST_TELL_HOST (0) Host has to be told before pages from the balloon are
>         used.
> but really balloon always told host. The difference is in timing,
> historically balloon gave up pages before sending the
> message and before waiting for the buffer to be used by host.
>
> I think this feature can be the same if we want.

Okay. I'll still probably try to document the behavior a bit better though.

> > > [...]
> > >
> > > >  /*
> > > >   * Balloon device works in 4K page units.  So each page is pointed to by
> > > > @@ -37,6 +38,9 @@
> > > >  #define VIRTIO_BALLOON_FREE_PAGE_SIZE \
> > > >       (1 << (VIRTIO_BALLOON_FREE_PAGE_ORDER + PAGE_SHIFT))
> > > >
> > > > +/*  limit on the number of pages that can be on the reporting vq */
> > > > +#define VIRTIO_BALLOON_VRING_HINTS_MAX       16
> > >
> > > Maybe rename that from HINTS to REPORTS
> >
> > I'll fix it for the next version.
> >
> > > > +
> > > >  #ifdef CONFIG_BALLOON_COMPACTION
> > > >  static struct vfsmount *balloon_mnt;
> > > >  #endif
> > > > @@ -46,6 +50,7 @@ enum virtio_balloon_vq {
> > > >       VIRTIO_BALLOON_VQ_DEFLATE,
> > > >       VIRTIO_BALLOON_VQ_STATS,
> > > >       VIRTIO_BALLOON_VQ_FREE_PAGE,
> > > > +     VIRTIO_BALLOON_VQ_REPORTING,
> > > >       VIRTIO_BALLOON_VQ_MAX
> > > >  };
> > > >
> > > > @@ -113,6 +118,10 @@ struct virtio_balloon {
> > > >
> > > >       /* To register a shrinker to shrink memory upon memory pressure */
> > > >       struct shrinker shrinker;
> > > > +
> > > > +     /* Unused page reporting device */
> > >
> > > Sounds like the device is unused :D
> > >
> > > "Device info for reporting unused pages" ?
> > >
> > > I am in general wondering, should we rename "unused" to "free". I.e.,
> > > "free page reporting" instead of "unused page reporting"? Or what was
> > > the motivation behind using "unused" ?
> >
> > I honestly don't remember why I chose "unused" at this point. I can
> > switch over to "free" if that is what is preferred.
> >
> > Looking over the code a bit more I suspect the reason for avoiding it
> > is because free page hinting also mentioned reporting in a few spots.
> >
> > > > +     struct virtqueue *reporting_vq;
> > > > +     struct page_reporting_dev_info pr_dev_info;
> > > >  };
> > > >
> > > >  static struct virtio_device_id id_table[] = {
> > > > @@ -152,6 +161,32 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
> > > >
> > > >  }
> > > >
> > > > +void virtballoon_unused_page_report(struct page_reporting_dev_info *pr_dev_info,
> > > > +                                 unsigned int nents)
> > > > +{
> > > > +     struct virtio_balloon *vb =
> > > > +             container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
> > > > +     struct virtqueue *vq = vb->reporting_vq;
> > > > +     unsigned int unused, err;
> > > > +
> > > > +     /* We should always be able to add these buffers to an empty queue. */
> > >
> > > This comment somewhat contradicts the error handling (and comment)
> > > below. Maybe just drop it?
> > >
> > > > +     err = virtqueue_add_inbuf(vq, pr_dev_info->sg, nents, vb,
> > > > +                               GFP_NOWAIT | __GFP_NOWARN);
> > > > +
> > > > +     /*
> > > > +      * In the extremely unlikely case that something has changed and we
> > > > +      * are able to trigger an error we will simply display a warning
> > > > +      * and exit without actually processing the pages.
> > > > +      */
> > > > +     if (WARN_ON(err))
> > > > +             return;
> > >
> > > Maybe WARN_ON_ONCE? (to not flood the log on recurring errors)
> >
> > Actually I might need to tweak things here a bit. It occurs to me that
> > this can fail for more than just there not being space in the ring. I
> > forgot that DMA mapping needs to also occur so in the case of a DMA
> > mapping failure we would also see an error.
>
> Balloon assumes DMA mapping is bypassed right now:
>
> static int virtballoon_validate(struct virtio_device *vdev)
> {
>         if (!page_poisoning_enabled())
>                 __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON);
>
>         __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM);
>
> ^^^^^^^^
>
>
>         return 0;
> }
>
> I don't think it can work with things like a bounce buffer.

Right. It wouldn't work with a bounce buffer. I was thinking more of
something like an IOMMU. So it sounds like the device is doing direct
map always anyway.

In any case I will add some logic so that if we encounter an error we
will just abort the reporting. That way if another user has some issue
like that it can be dealt with sooner and we can avoid flagging pages
as reported that are not.

- Alex
David Hildenbrand Dec. 2, 2019, 10:43 a.m. UTC | #6
[...]

>> Sounds like the device is unused :D
>>
>> "Device info for reporting unused pages" ?
>>
>> I am in general wondering, should we rename "unused" to "free". I.e.,
>> "free page reporting" instead of "unused page reporting"? Or what was
>> the motivation behind using "unused" ?
> 
> I honestly don't remember why I chose "unused" at this point. I can
> switch over to "free" if that is what is preferred.
> 
> Looking over the code a bit more I suspect the reason for avoiding it
> is because free page hinting also mentioned reporting in a few spots.

Maybe we should fix these cases. FWIW, I'd prefer "free page reporting".
(e.g., pairs nicely with "free page hinting").

>>> +
>>> +     virtqueue_kick(vq);
>>> +
>>> +     /* When host has read buffer, this completes via balloon_ack */
>>> +     wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
>>
>> Is it safe to rely on the same ack-ing mechanism as the inflate/deflate
>> queue? What if both mechanisms are used concurrently and race/both wait
>> for the hypervisor?
>>
>> Maybe we need a separate vb->acked + callback function.
> 
> So if I understand correctly what is actually happening is that the
> wait event is simply a trigger that will wake us up, and at that point
> we check to see if the buffer we submitted is done. If not we go back
> to sleep. As such all we are really waiting on is the notification
> that the buffers we submitted have been processed. So it is using the
> same function but on a different virtual queue.

Very right, this is just a waitqueue (was only looking at this patch,
not the full code). This should indeed be fine.

>>>       vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
>>>       vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
>>>       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
>>> @@ -932,12 +976,30 @@ static int virtballoon_probe(struct virtio_device *vdev)
>>>               if (err)
>>>                       goto out_del_balloon_wq;
>>>       }
>>> +
>>> +     vb->pr_dev_info.report = virtballoon_unused_page_report;
>>> +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
>>> +             unsigned int capacity;
>>> +
>>> +             capacity = min_t(unsigned int,
>>> +                              virtqueue_get_vring_size(vb->reporting_vq),
>>> +                              VIRTIO_BALLOON_VRING_HINTS_MAX);
>>> +             vb->pr_dev_info.capacity = capacity;
>>> +
>>> +             err = page_reporting_register(&vb->pr_dev_info);
>>> +             if (err)
>>> +                     goto out_unregister_shrinker;
>>> +     }
>>
>> It can happen here that we start reporting before marking the device
>> ready. Can that be problematic?
>>
>> Maybe we have to ignore any reports in virtballoon_unused_page_report()
>> until ready...
> 
> I don't think there is an issue with us putting buffers on the ring
> before it is ready. I think it will just cause our function to sleep.
> 
> I'm guessing that is the case since init_vqs will add a buffer to the
> stats vq and that happens even earlier in virtballoon_probe.
> 

Interesting: "Note: vqs are enabled automatically after probe returns.".
Learned something new.

The virtballoon_changed(vdev) *after* virtio_device_ready(vdev) made me
wonder, because that could also fill the queues.

Maybe Michael can clarify.

>>> +
>>>       virtio_device_ready(vdev);
>>>
>>>       if (towards_target(vb))
>>>               virtballoon_changed(vdev);
>>>       return 0;
>>>
>>> +out_unregister_shrinker:
>>> +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
>>> +             virtio_balloon_unregister_shrinker(vb);
>>
>> A sync is done implicitly, right? So after this call, we won't get any
>> new callbacks/are stuck in a callback.
> 
> From what I can tell a read/write semaphore is used in
> unregister_shrinker when we delete it from the list so it shouldn't be
> an issue.

Yes, makes sense.

> 
>>>  out_del_balloon_wq:
>>>       if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
>>>               destroy_workqueue(vb->balloon_wq);
>>> @@ -966,6 +1028,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
>>>  {
>>>       struct virtio_balloon *vb = vdev->priv;
>>>
>>> +     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
>>> +             page_reporting_unregister(&vb->pr_dev_info);
>>
>> Dito, same question regarding syncs.
> 
> Yes, although for that one I was using pointer deletion, a barrier,
> and a cancel_work_sync since I didn't support a list.

Okay, perfect.

[...]
>>
>> Small and powerful patch :)
> 
> Agreed. Although we will have to see if we can keep it that way.
> Ideally I want to leave this with the ability so specify what size
> scatterlist we receive. However if we have to flip it around then it
> will force us to add logic for chopping up the scatterlist for
> processing in chunks.

I hope we can keep it like that. Otherwise each and every driver has to
implement this chopping-up (e.g., a hypervisor that can only send one
hint at a time - e.g., via  a simple hypercall - would have to implement
that).
Alexander Duyck Dec. 4, 2019, 5:48 p.m. UTC | #7
On Thu, Nov 28, 2019 at 9:00 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, Nov 28, 2019 at 04:25:54PM +0100, David Hildenbrand wrote:
> > On 19.11.19 22:46, Alexander Duyck wrote:
> > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > >
> > > Add support for the page reporting feature provided by virtio-balloon.
> > > Reporting differs from the regular balloon functionality in that is is
> > > much less durable than a standard memory balloon. Instead of creating a
> > > list of pages that cannot be accessed the pages are only inaccessible
> > > while they are being indicated to the virtio interface. Once the
> > > interface has acknowledged them they are placed back into their respective
> > > free lists and are once again accessible by the guest system.
> >
> > Maybe add something like "In contrast to ordinary balloon
> > inflation/deflation, the guest can reuse all reported pages immediately
> > after reporting has finished, without having to notify the hypervisor
> > about it (e.g., VIRTIO_BALLOON_F_MUST_TELL_HOST does not apply)."
>
> Maybe we can make apply. The effect of reporting a page is effectively
> putting it in a balloon then immediately taking it out. Maybe without
> VIRTIO_BALLOON_F_MUST_TELL_HOST the pages can be reused before host
> marked buffers used?
>
> We didn't teach existing page hinting to behave like this, but maybe we
> should, and maybe it's not too late, not a long time passed
> since it was merged, and the whole shrinker based thing
> seems to have been broken ...
>
>
> BTW generally UAPI patches will have to be sent to virtio-dev
> mailing list before they are merged.
>
> > [...]
> >
> > >  /*
> > >   * Balloon device works in 4K page units.  So each page is pointed to by
> > > @@ -37,6 +38,9 @@
> > >  #define VIRTIO_BALLOON_FREE_PAGE_SIZE \
> > >     (1 << (VIRTIO_BALLOON_FREE_PAGE_ORDER + PAGE_SHIFT))
> > >
> > > +/*  limit on the number of pages that can be on the reporting vq */
> > > +#define VIRTIO_BALLOON_VRING_HINTS_MAX     16
> >
> > Maybe rename that from HINTS to REPORTS
> >
> > > +
> > >  #ifdef CONFIG_BALLOON_COMPACTION
> > >  static struct vfsmount *balloon_mnt;
> > >  #endif
> > > @@ -46,6 +50,7 @@ enum virtio_balloon_vq {
> > >     VIRTIO_BALLOON_VQ_DEFLATE,
> > >     VIRTIO_BALLOON_VQ_STATS,
> > >     VIRTIO_BALLOON_VQ_FREE_PAGE,
> > > +   VIRTIO_BALLOON_VQ_REPORTING,
> > >     VIRTIO_BALLOON_VQ_MAX
> > >  };
> > >
> > > @@ -113,6 +118,10 @@ struct virtio_balloon {
> > >
> > >     /* To register a shrinker to shrink memory upon memory pressure */
> > >     struct shrinker shrinker;
> > > +
> > > +   /* Unused page reporting device */
> >
> > Sounds like the device is unused :D
> >
> > "Device info for reporting unused pages" ?
> >
> > I am in general wondering, should we rename "unused" to "free". I.e.,
> > "free page reporting" instead of "unused page reporting"? Or what was
> > the motivation behind using "unused" ?
> >
> > > +   struct virtqueue *reporting_vq;
> > > +   struct page_reporting_dev_info pr_dev_info;
> > >  };
> > >
> > >  static struct virtio_device_id id_table[] = {
> > > @@ -152,6 +161,32 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
> > >
> > >  }
> > >
> > > +void virtballoon_unused_page_report(struct page_reporting_dev_info *pr_dev_info,
> > > +                               unsigned int nents)
> > > +{
> > > +   struct virtio_balloon *vb =
> > > +           container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
> > > +   struct virtqueue *vq = vb->reporting_vq;
> > > +   unsigned int unused, err;
> > > +
> > > +   /* We should always be able to add these buffers to an empty queue. */
> >
> > This comment somewhat contradicts the error handling (and comment)
> > below. Maybe just drop it?
> >
> > > +   err = virtqueue_add_inbuf(vq, pr_dev_info->sg, nents, vb,
> > > +                             GFP_NOWAIT | __GFP_NOWARN);
> > > +
> > > +   /*
> > > +    * In the extremely unlikely case that something has changed and we
> > > +    * are able to trigger an error we will simply display a warning
> > > +    * and exit without actually processing the pages.
> > > +    */
> > > +   if (WARN_ON(err))
> > > +           return;
> >
> > Maybe WARN_ON_ONCE? (to not flood the log on recurring errors)
> >
> > > +
> > > +   virtqueue_kick(vq);
> > > +
> > > +   /* When host has read buffer, this completes via balloon_ack */
> > > +   wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
> >
> > Is it safe to rely on the same ack-ing mechanism as the inflate/deflate
> > queue? What if both mechanisms are used concurrently and race/both wait
> > for the hypervisor?
> >
> > Maybe we need a separate vb->acked + callback function.
> >
> > > +}
> > > +
> > >  static void set_page_pfns(struct virtio_balloon *vb,
> > >                       __virtio32 pfns[], struct page *page)
> > >  {
> > > @@ -476,6 +511,7 @@ static int init_vqs(struct virtio_balloon *vb)
> > >     names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> > >     names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> > >     names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> > > +   names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
> > >
> > >     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> > >             names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> > > @@ -487,11 +523,19 @@ static int init_vqs(struct virtio_balloon *vb)
> > >             callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> > >     }
> > >
> > > +   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
> > > +           names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq";
> > > +           callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
> > > +   }
> > > +
> > >     err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> > >                                      vqs, callbacks, names, NULL, NULL);
> > >     if (err)
> > >             return err;
> > >
> > > +   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> > > +           vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
> > > +
> >
> > I'd register these in the same order they are defined (IOW, move this
> > further down)
> >
> > >     vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> > >     vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> > >     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> > > @@ -932,12 +976,30 @@ static int virtballoon_probe(struct virtio_device *vdev)
> > >             if (err)
> > >                     goto out_del_balloon_wq;
> > >     }
> > > +
> > > +   vb->pr_dev_info.report = virtballoon_unused_page_report;
> > > +   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
> > > +           unsigned int capacity;
> > > +
> > > +           capacity = min_t(unsigned int,
> > > +                            virtqueue_get_vring_size(vb->reporting_vq),
> > > +                            VIRTIO_BALLOON_VRING_HINTS_MAX);
> > > +           vb->pr_dev_info.capacity = capacity;
> > > +
> > > +           err = page_reporting_register(&vb->pr_dev_info);
> > > +           if (err)
> > > +                   goto out_unregister_shrinker;
> > > +   }
> >
> > It can happen here that we start reporting before marking the device
> > ready. Can that be problematic?
> >
> > Maybe we have to ignore any reports in virtballoon_unused_page_report()
> > until ready...
> >
> > > +
> > >     virtio_device_ready(vdev);
> > >
> > >     if (towards_target(vb))
> > >             virtballoon_changed(vdev);
> > >     return 0;
> > >
> > > +out_unregister_shrinker:
> > > +   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> > > +           virtio_balloon_unregister_shrinker(vb);
> >
> > A sync is done implicitly, right? So after this call, we won't get any
> > new callbacks/are stuck in a callback.
> >
> > >  out_del_balloon_wq:
> > >     if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
> > >             destroy_workqueue(vb->balloon_wq);
> > > @@ -966,6 +1028,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
> > >  {
> > >     struct virtio_balloon *vb = vdev->priv;
> > >
> > > +   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> > > +           page_reporting_unregister(&vb->pr_dev_info);
> >
> > Dito, same question regarding syncs.
> >
> > >     if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> > >             virtio_balloon_unregister_shrinker(vb);
> > >     spin_lock_irq(&vb->stop_update_lock);
> > > @@ -1038,6 +1102,7 @@ static int virtballoon_validate(struct virtio_device *vdev)
> > >     VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> > >     VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> > >     VIRTIO_BALLOON_F_PAGE_POISON,
> > > +   VIRTIO_BALLOON_F_REPORTING,
> > >  };
> > >
> > >  static struct virtio_driver virtio_balloon_driver = {
> > > diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> > > index a1966cd7b677..19974392d324 100644
> > > --- a/include/uapi/linux/virtio_balloon.h
> > > +++ b/include/uapi/linux/virtio_balloon.h
> > > @@ -36,6 +36,7 @@
> > >  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM    2 /* Deflate balloon on OOM */
> > >  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT    3 /* VQ to report free pages */
> > >  #define VIRTIO_BALLOON_F_PAGE_POISON       4 /* Guest is using page poisoning */
> > > +#define VIRTIO_BALLOON_F_REPORTING 5 /* Page reporting virtqueue */
> > >
> > >  /* Size of a PFN in the balloon interface. */
> > >  #define VIRTIO_BALLOON_PFN_SHIFT 12
> > >
> > >
> >
> > Small and powerful patch :)
> >
> > --
> > Thanks,
> >
> > David / dhildenb
>
>
Alexander Duyck Dec. 4, 2019, 5:53 p.m. UTC | #8
On Thu, Nov 28, 2019 at 9:00 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, Nov 28, 2019 at 04:25:54PM +0100, David Hildenbrand wrote:
> > On 19.11.19 22:46, Alexander Duyck wrote:
> > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > >
> > > Add support for the page reporting feature provided by virtio-balloon.
> > > Reporting differs from the regular balloon functionality in that is is
> > > much less durable than a standard memory balloon. Instead of creating a
> > > list of pages that cannot be accessed the pages are only inaccessible
> > > while they are being indicated to the virtio interface. Once the
> > > interface has acknowledged them they are placed back into their respective
> > > free lists and are once again accessible by the guest system.
> >
> > Maybe add something like "In contrast to ordinary balloon
> > inflation/deflation, the guest can reuse all reported pages immediately
> > after reporting has finished, without having to notify the hypervisor
> > about it (e.g., VIRTIO_BALLOON_F_MUST_TELL_HOST does not apply)."
>
> Maybe we can make apply. The effect of reporting a page is effectively
> putting it in a balloon then immediately taking it out. Maybe without
> VIRTIO_BALLOON_F_MUST_TELL_HOST the pages can be reused before host
> marked buffers used?
>
> We didn't teach existing page hinting to behave like this, but maybe we
> should, and maybe it's not too late, not a long time passed
> since it was merged, and the whole shrinker based thing
> seems to have been broken ...

The problem is the existing hinting implementation relies on pushing
the memory to the point of OOM in order to avoid having to re-hint on
pages. What it is looking for is a snapshot rather than a running
tally. The page reporting bit approach would only work for the first
migration. The problem is the bit is persistent and would leave unused
pages flagged as reported if another migration starts so it wouldn't
re-report those pages.

> BTW generally UAPI patches will have to be sent to virtio-dev
> mailing list before they are merged.

Do you need just the QEMU patches submitted to virtio-dev or both the
virtio kernel patches and the QEMU patches?

One piece of feedback I got was that it was annoying that I was
including virtio-dev since it requires a subscription to send to it.
If you would like I could apply it on the QEMU patches which would
make the changes more visible at least.

Thanks.

- Alex
diff mbox series

Patch

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 078615cf2afc..4b2dd8259ff5 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -58,6 +58,7 @@  config VIRTIO_BALLOON
 	tristate "Virtio balloon driver"
 	depends on VIRTIO
 	select MEMORY_BALLOON
+	select PAGE_REPORTING
 	---help---
 	 This driver supports increasing and decreasing the amount
 	 of memory within a KVM guest.
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 92099298bc16..6f5c6555765a 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -19,6 +19,7 @@ 
 #include <linux/mount.h>
 #include <linux/magic.h>
 #include <linux/pseudo_fs.h>
+#include <linux/page_reporting.h>
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
@@ -37,6 +38,9 @@ 
 #define VIRTIO_BALLOON_FREE_PAGE_SIZE \
 	(1 << (VIRTIO_BALLOON_FREE_PAGE_ORDER + PAGE_SHIFT))
 
+/*  limit on the number of pages that can be on the reporting vq */
+#define VIRTIO_BALLOON_VRING_HINTS_MAX	16
+
 #ifdef CONFIG_BALLOON_COMPACTION
 static struct vfsmount *balloon_mnt;
 #endif
@@ -46,6 +50,7 @@  enum virtio_balloon_vq {
 	VIRTIO_BALLOON_VQ_DEFLATE,
 	VIRTIO_BALLOON_VQ_STATS,
 	VIRTIO_BALLOON_VQ_FREE_PAGE,
+	VIRTIO_BALLOON_VQ_REPORTING,
 	VIRTIO_BALLOON_VQ_MAX
 };
 
@@ -113,6 +118,10 @@  struct virtio_balloon {
 
 	/* To register a shrinker to shrink memory upon memory pressure */
 	struct shrinker shrinker;
+
+	/* Unused page reporting device */
+	struct virtqueue *reporting_vq;
+	struct page_reporting_dev_info pr_dev_info;
 };
 
 static struct virtio_device_id id_table[] = {
@@ -152,6 +161,32 @@  static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 
 }
 
+void virtballoon_unused_page_report(struct page_reporting_dev_info *pr_dev_info,
+				    unsigned int nents)
+{
+	struct virtio_balloon *vb =
+		container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
+	struct virtqueue *vq = vb->reporting_vq;
+	unsigned int unused, err;
+
+	/* We should always be able to add these buffers to an empty queue. */
+	err = virtqueue_add_inbuf(vq, pr_dev_info->sg, nents, vb,
+				  GFP_NOWAIT | __GFP_NOWARN);
+
+	/*
+	 * In the extremely unlikely case that something has changed and we
+	 * are able to trigger an error we will simply display a warning
+	 * and exit without actually processing the pages.
+	 */
+	if (WARN_ON(err))
+		return;
+
+	virtqueue_kick(vq);
+
+	/* When host has read buffer, this completes via balloon_ack */
+	wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
+}
+
 static void set_page_pfns(struct virtio_balloon *vb,
 			  __virtio32 pfns[], struct page *page)
 {
@@ -476,6 +511,7 @@  static int init_vqs(struct virtio_balloon *vb)
 	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
 	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
 	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
+	names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
 
 	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
 		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
@@ -487,11 +523,19 @@  static int init_vqs(struct virtio_balloon *vb)
 		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
 	}
 
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
+		names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq";
+		callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
+	}
+
 	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
 					 vqs, callbacks, names, NULL, NULL);
 	if (err)
 		return err;
 
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+		vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
+
 	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
 	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
 	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
@@ -932,12 +976,30 @@  static int virtballoon_probe(struct virtio_device *vdev)
 		if (err)
 			goto out_del_balloon_wq;
 	}
+
+	vb->pr_dev_info.report = virtballoon_unused_page_report;
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
+		unsigned int capacity;
+
+		capacity = min_t(unsigned int,
+				 virtqueue_get_vring_size(vb->reporting_vq),
+				 VIRTIO_BALLOON_VRING_HINTS_MAX);
+		vb->pr_dev_info.capacity = capacity;
+
+		err = page_reporting_register(&vb->pr_dev_info);
+		if (err)
+			goto out_unregister_shrinker;
+	}
+
 	virtio_device_ready(vdev);
 
 	if (towards_target(vb))
 		virtballoon_changed(vdev);
 	return 0;
 
+out_unregister_shrinker:
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+		virtio_balloon_unregister_shrinker(vb);
 out_del_balloon_wq:
 	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
 		destroy_workqueue(vb->balloon_wq);
@@ -966,6 +1028,8 @@  static void virtballoon_remove(struct virtio_device *vdev)
 {
 	struct virtio_balloon *vb = vdev->priv;
 
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+		page_reporting_unregister(&vb->pr_dev_info);
 	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
 		virtio_balloon_unregister_shrinker(vb);
 	spin_lock_irq(&vb->stop_update_lock);
@@ -1038,6 +1102,7 @@  static int virtballoon_validate(struct virtio_device *vdev)
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
 	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
 	VIRTIO_BALLOON_F_PAGE_POISON,
+	VIRTIO_BALLOON_F_REPORTING,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index a1966cd7b677..19974392d324 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -36,6 +36,7 @@ 
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
 #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
+#define VIRTIO_BALLOON_F_REPORTING	5 /* Page reporting virtqueue */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12