
[RFC,V7,2/7] KVM: Guest page hinting functionality

Message ID 20180611151902.14383-3-nilal@redhat.com (mailing list archive)
State New, archived

Commit Message

Nitesh Lal June 11, 2018, 3:18 p.m. UTC
From: Nitesh Narayan Lal <nilal@redhat.com>

This patch adds the guest implementation needed to maintain the list of
pages which are freed by the guest and are not reused. To avoid any
reallocation it takes a seqlock once the list is completely filled.
It does not yet carry the hypercall-related changes.

Signed-off-by: Nitesh Narayan Lal <nilal@redhat.com>
---
 virt/kvm/page_hinting.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 287 insertions(+)
 create mode 100644 virt/kvm/page_hinting.c

Comments

Luiz Capitulino June 11, 2018, 7:34 p.m. UTC | #1
On Mon, 11 Jun 2018 11:18:57 -0400
nilal@redhat.com wrote:

> From: Nitesh Narayan Lal <nilal@redhat.com>
> 
> This patch adds the guest implementation in order to maintain the list of
> pages which are freed by the guest and are not reused. To avoid any
> reallocation it includes seqlock once the list is completely filled.
> Though it doesn't carries the hypercall related changes.
> 
> Signed-off-by: Nitesh Narayan Lal <nilal@redhat.com>
> ---
>  virt/kvm/page_hinting.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 287 insertions(+)
>  create mode 100644 virt/kvm/page_hinting.c
> 
> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> new file mode 100644
> index 0000000..77dc865
> --- /dev/null
> +++ b/virt/kvm/page_hinting.c
> @@ -0,0 +1,287 @@
> +#include <linux/gfp.h>
> +#include <linux/mm.h>
> +#include <linux/page_ref.h>
> +#include <linux/kvm_host.h>
> +#include <linux/sort.h>
> +#include <linux/kernel.h>
> +
> +#define MAX_FGPT_ENTRIES	1000
> +#define HYPERLIST_THRESHOLD	1	/* FIXME: find a good threshold */
> +/*
> + * struct kvm_free_pages - Tracks the pages which are freed by the guest.
> + * @pfn	- page frame number for the page which is to be freed
> + * @pages - number of pages which are supposed to be freed.
> + * A global array object is used to hold the list of pfn and number of pages
> + * which are freed by the guest. This list may also have fragmentated pages so
> + * defragmentation is a must prior to the hypercall.
> + */
> +struct kvm_free_pages {
> +	unsigned long pfn;
> +	unsigned int pages;
> +};
> +
> +/*
> + * hypervisor_pages - It is a dummy structure passed with the hypercall.
> + * @pfn - page frame number for the page which is to be freed.
> + * @pages - number of pages which are supposed to be freed.
> + * A global array object is used to to hold the list of pfn and pages and is
> + * passed as part of the hypercall.
> + */
> +struct hypervisor_pages {
> +	unsigned long pfn;
> +	unsigned int pages;
> +};

Maybe rename 'pages' to 'nr_pages' in both structs?
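
For reference, this is roughly what the renamed structures would look like
(illustrative only, not part of the posted patch):

struct kvm_free_pages {
	unsigned long pfn;
	unsigned int nr_pages;
};

struct hypervisor_pages {
	unsigned long pfn;
	unsigned int nr_pages;
};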

More important comments below :)

> +
> +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(guest_page_lock);
> +DEFINE_PER_CPU(struct kvm_free_pages [MAX_FGPT_ENTRIES], kvm_pt);
> +DEFINE_PER_CPU(int, kvm_pt_idx);
> +struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES];
> +
> +static void empty_hyperlist(void)
> +{
> +	int i = 0;
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		hypervisor_pagelist[i].pfn = 0;
> +		hypervisor_pagelist[i].pages = 0;
> +		i++;
> +	}
> +}
> +
> +static void make_hypercall(void)
> +{
> +	/*
> +	 * Dummy function: Tobe filled later.
> +	 */
> +	empty_hyperlist();
> +}
> +
> +static int sort_pfn(const void *a1, const void *b1)
> +{
> +	const struct hypervisor_pages *a = a1;
> +	const struct hypervisor_pages *b = b1;
> +
> +	if (a->pfn > b->pfn)
> +		return 1;
> +
> +	if (a->pfn < b->pfn)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int pack_hyperlist(void)
> +{
> +	int i = 0, j = 0;
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		if (hypervisor_pagelist[i].pfn != 0) {
> +			if (i != j) {
> +				hypervisor_pagelist[j].pfn =
> +						hypervisor_pagelist[i].pfn;
> +				hypervisor_pagelist[j].pages =
> +						hypervisor_pagelist[i].pages;
> +			}
> +			j++;
> +		}
> +		i++;
> +	}
> +	i = j;
> +	while (j < MAX_FGPT_ENTRIES) {
> +		hypervisor_pagelist[j].pfn = 0;
> +		hypervisor_pagelist[j].pages = 0;
> +		j++;
> +	}
> +	return i;
> +}
> +
> +int compress_hyperlist(void)
> +{
> +	int i = 0, j = 1, merge_counter = 0, ret = 0;
> +
> +	sort(hypervisor_pagelist, MAX_FGPT_ENTRIES,
> +	     sizeof(struct hypervisor_pages), sort_pfn, NULL);
> +	while (i < MAX_FGPT_ENTRIES && j < MAX_FGPT_ENTRIES) {
> +		unsigned long pfni = hypervisor_pagelist[i].pfn;
> +		unsigned int pagesi = hypervisor_pagelist[i].pages;
> +		unsigned long pfnj = hypervisor_pagelist[j].pfn;
> +		unsigned int pagesj = hypervisor_pagelist[j].pages;
> +
> +		if (pfnj <= pfni) {
> +			if (((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) &&
> +			    ((pfnj + pagesj - 1) >= (pfni - 1))) {
> +				hypervisor_pagelist[i].pfn = pfnj;
> +				hypervisor_pagelist[i].pages += pfni - pfnj;
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			} else if ((pfnj + pagesj - 1) > (pfni + pagesi - 1)) {
> +				hypervisor_pagelist[i].pfn = pfnj;
> +				hypervisor_pagelist[i].pages = pagesj;
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			}
> +		} else if (pfnj > pfni) {
> +			if ((pfnj + pagesj - 1) > (pfni + pagesi - 1) &&
> +			    (pfnj <= pfni + pagesi)) {
> +				hypervisor_pagelist[i].pages +=
> +						(pfnj + pagesj - 1) -
> +						(pfni + pagesi - 1);
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			} else if ((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) {
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			}
> +		}
> +		i = j;
> +		j++;
> +	}
> +	if (merge_counter != 0)
> +		ret = pack_hyperlist() - 1;
> +	else
> +		ret = MAX_FGPT_ENTRIES - 1;
> +	return ret;
> +}
> +
> +void copy_hyperlist(int hyper_idx)
> +{
> +	int *idx = &get_cpu_var(kvm_pt_idx);
> +	struct kvm_free_pages *free_page_obj;
> +	int i = 0;
> +
> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
> +	while (i < hyper_idx) {
> +		free_page_obj[*idx].pfn = hypervisor_pagelist[i].pfn;
> +		free_page_obj[*idx].pages = hypervisor_pagelist[i].pages;
> +		*idx += 1;
> +		i++;
> +	}
> +	empty_hyperlist();
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +}
> +
> +/*
> + * arch_free_page_slowpath() - This function adds the guest free page entries
> + * to hypervisor_pages list and also ensures defragmentation prior to addition
> + * if it is present with any entry of the kvm_free_pages list.
> + */
> +void arch_free_page_slowpath(void)
> +{
> +	int idx = 0;
> +	int hyper_idx = -1;
> +	int *kvm_idx = &get_cpu_var(kvm_pt_idx);
> +	struct kvm_free_pages *free_page_obj = &get_cpu_var(kvm_pt)[0];
> +
> +	write_seqlock(&guest_page_lock);

I think this lock is used to synchronize CPUs wanting to allocate pages
with the CPU doing the free page reporting, right? In other words, once
a CPU takes this lock no other CPU will be able to allocate pages in the
guest. Is this correct?

I'm wondering whether we could use some page flag for this purpose. I mean,
the race condition here seems to be: once we decide to report a page as
free, the guest can't use that page until the host has received and acked
this page.

So, is there any page flag we could use which:

1. Will prevent the page from being allocated when set
2. Won't mess up with the page allocator when setting it at free time
   in arch_free_page()

I don't know this myself.

> +	while (idx < MAX_FGPT_ENTRIES) {
> +		unsigned long pfn = free_page_obj[idx].pfn;
> +		unsigned long pfn_end = free_page_obj[idx].pfn +
> +					free_page_obj[idx].pages - 1;
> +		bool prev_free = false;
> +
> +		while (pfn <= pfn_end) {
> +			struct page *p = pfn_to_page(pfn);
> +
> +			if (PageCompound(p)) {
> +				struct page *head_page = compound_head(p);
> +				unsigned long head_pfn = page_to_pfn(head_page);
> +				unsigned int alloc_pages =
> +					1 << compound_order(head_page);
> +
> +				pfn = head_pfn + alloc_pages;
> +				prev_free = false;
> +				continue;
> +			}
> +			if (page_ref_count(p)) {
> +				pfn++;
> +				prev_free = false;
> +				continue;
> +			}
> +			/*
> +			 * The page is free so add it to the list and free the
> +			 * hypervisor_pagelist if required.
> +			 */
> +			if (!prev_free) {
> +				hyper_idx++;
> +				hypervisor_pagelist[hyper_idx].pfn = pfn;
> +				hypervisor_pagelist[hyper_idx].pages = 1;
> +				if (hyper_idx == MAX_FGPT_ENTRIES - 1) {
> +					hyper_idx =  compress_hyperlist();
> +					if (hyper_idx >=
> +					    HYPERLIST_THRESHOLD) {
> +						make_hypercall();
> +						hyper_idx = 0;
> +					}

Do we have a clear definition of when free pages are reported to the host?

If I'm understanding the code correctly, the following conditions have
to be met:

1. The free_page_obj[] array fills up at least once (ie. we have
   MAX_FGPT_ENTRIES entries at least once)

2. We skip pages that might already be freed and merge contiguous regions
   in the same array entry

3. We only report free pages to the host when the number of non-contiguous
   free regions reaches HYPERLIST_THRESHOLD

If that's correct, then one problem with that approach is that we might
report only a few megabytes (if the memory being freed by the guest is
fragmented) or we might report several GBs at once (if the memory being
freed by the guest happens to be contiguous).

Does it make sense to instead allow users to configure the granularity they
want the guest to give to the host? For example, users could do:

(qemu) set-page-hinting 1GB

So, every time a CPU has 1GB free it would report it to the host. Of course,
this method also raises its own set of questions:

 - What's a good default?
 - How users can choose a good value for their workloads?
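
One way to picture the byte-granularity idea above (a rough sketch with
made-up names, not something in this series) is to replace the entry-count
check with a check against the amount of memory accumulated in
hypervisor_pagelist[]:

/* Hypothetical knob: how much free memory to accumulate before hinting. */
static unsigned long hinting_threshold_bytes = 1UL << 30;	/* e.g. 1GB */

static bool hyperlist_over_threshold(int hyper_idx)
{
	unsigned long total = 0;
	int i;

	for (i = 0; i <= hyper_idx; i++)
		total += (unsigned long)hypervisor_pagelist[i].pages * PAGE_SIZE;

	return total >= hinting_threshold_bytes;
}

Such a helper would take the place of the 'hyper_idx >= HYPERLIST_THRESHOLD'
comparison in arch_free_page_slowpath().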

> +				}
> +				/*
> +				 * If the next contiguous page is free, it can
> +				 * be added to this same entry.
> +				 */
> +				prev_free = true;
> +			} else {
> +				/*
> +				 * Multiple adjacent free pages
> +				 */
> +				hypervisor_pagelist[hyper_idx].pages++;
> +			}
> +			pfn++;
> +		}
> +		free_page_obj[idx].pfn = 0;
> +		free_page_obj[idx].pages = 0;
> +		idx++;
> +	}
> +	*kvm_idx = 0;
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +	write_sequnlock(&guest_page_lock);
> +}
> +
> +void arch_alloc_page(struct page *page, int order)
> +{
> +	unsigned int seq;
> +
> +	/*
> +	 * arch_free_page will acquire the lock once the list carrying guest
> +	 * free pages is full and a hypercall will be made. Until complete free
> +	 * page list is traversed no further allocaiton will be allowed.
> +	 */
> +	do {
> +		seq = read_seqbegin(&guest_page_lock);
> +	} while (read_seqretry(&guest_page_lock, seq));
> +}
> +
> +void arch_free_page(struct page *page, int order)
> +{
> +	int *free_page_idx = &get_cpu_var(kvm_pt_idx);

Since you're disabling interrupts, I think you can use this_cpu_ptr()
instead of get_cpu_var() (from inside the atomic section). And maybe pass
free_page_idx and free_page_obj to arch_free_pages() to simplify a bit.

But please ignore this if you like the suggestion below.

> +	struct kvm_free_pages *free_page_obj;
> +	unsigned long flags;
> +
> +	/*
> +	 * use of global variables may trigger a race condition between irq and
> +	 * process context causing unwanted overwrites. This will be replaced
> +	 * with a better solution to prevent such race conditions.
> +	 */
> +	local_irq_save(flags);

So, this is disabling interrupts on the CPU for the entire reporting
duration. I have no idea how long this process takes to tell whether
or not we're disabling interrupts for too long. But I think disabling
interrupts is protecting against free_page_obj being accessed from
interrupt context, right?

If that's correct and if we want to reduce the time duration in which
interrupts are disabled, then the following idea might work:

1. Disable interrupts only to add the page's PFN to free_page_obj[]
2. Only call arch_free_page_slowpath() if !in_interrupt() is true

We also have to protect against preemption, but get_cpu_var() will
take care of this. If we don't want to disable preemption for this
long either, then maybe we could use a mutex instead (in which case
I guess you probably want to skip the whole thing if the mutex is
already taken).
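
To make the idea concrete, here is a rough sketch of arch_free_page()
restructured along those lines (illustrative only, not the author's code;
overflow handling for the case where the array fills up while in interrupt
context is deliberately left out):

void arch_free_page(struct page *page, int order)
{
	int *free_page_idx;
	struct kvm_free_pages *free_page_obj;
	unsigned long flags;
	bool full;

	/* Interrupts are disabled only while the PFN is recorded. */
	local_irq_save(flags);
	free_page_idx = &get_cpu_var(kvm_pt_idx);
	free_page_obj = &get_cpu_var(kvm_pt)[0];
	free_page_obj[*free_page_idx].pfn = page_to_pfn(page);
	free_page_obj[*free_page_idx].pages = 1 << order;
	*free_page_idx += 1;
	full = (*free_page_idx == MAX_FGPT_ENTRIES);
	put_cpu_var(kvm_pt);
	put_cpu_var(kvm_pt_idx);
	local_irq_restore(flags);

	/* The expensive scan and report only runs from process context. */
	if (full && !in_interrupt())
		arch_free_page_slowpath();
}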

> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
> +	free_page_obj[*free_page_idx].pfn = page_to_pfn(page);
> +	free_page_obj[*free_page_idx].pages = 1 << order;
> +	*free_page_idx += 1;
> +	if (*free_page_idx == MAX_FGPT_ENTRIES)
> +		arch_free_page_slowpath();
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +	local_irq_restore(flags);
> +}
Nitesh Lal June 11, 2018, 9:27 p.m. UTC | #2
On 06/11/2018 03:34 PM, Luiz Capitulino wrote:
> On Mon, 11 Jun 2018 11:18:57 -0400
> nilal@redhat.com wrote:
>
>> From: Nitesh Narayan Lal <nilal@redhat.com>
>>
>> This patch adds the guest implementation in order to maintain the list of
>> pages which are freed by the guest and are not reused. To avoid any
>> reallocation it includes seqlock once the list is completely filled.
>> Though it doesn't carries the hypercall related changes.
>>
>> Signed-off-by: Nitesh Narayan Lal <nilal@redhat.com>
>> ---
>>  virt/kvm/page_hinting.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 287 insertions(+)
>>  create mode 100644 virt/kvm/page_hinting.c
>>
>> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
>> new file mode 100644
>> index 0000000..77dc865
>> --- /dev/null
>> +++ b/virt/kvm/page_hinting.c
>> @@ -0,0 +1,287 @@
>> +#include <linux/gfp.h>
>> +#include <linux/mm.h>
>> +#include <linux/page_ref.h>
>> +#include <linux/kvm_host.h>
>> +#include <linux/sort.h>
>> +#include <linux/kernel.h>
>> +
>> +#define MAX_FGPT_ENTRIES	1000
>> +#define HYPERLIST_THRESHOLD	1	/* FIXME: find a good threshold */
>> +/*
>> + * struct kvm_free_pages - Tracks the pages which are freed by the guest.
>> + * @pfn	- page frame number for the page which is to be freed
>> + * @pages - number of pages which are supposed to be freed.
>> + * A global array object is used to hold the list of pfn and number of pages
>> + * which are freed by the guest. This list may also have fragmentated pages so
>> + * defragmentation is a must prior to the hypercall.
>> + */
>> +struct kvm_free_pages {
>> +	unsigned long pfn;
>> +	unsigned int pages;
>> +};
>> +
>> +/*
>> + * hypervisor_pages - It is a dummy structure passed with the hypercall.
>> + * @pfn - page frame number for the page which is to be freed.
>> + * @pages - number of pages which are supposed to be freed.
>> + * A global array object is used to to hold the list of pfn and pages and is
>> + * passed as part of the hypercall.
>> + */
>> +struct hypervisor_pages {
>> +	unsigned long pfn;
>> +	unsigned int pages;
>> +};
> Maybe rename 'pages' to 'nr_pages' in both structs?
Yeap, I can do this.
>
> More important comments below :)
>
>> +
>> +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(guest_page_lock);
>> +DEFINE_PER_CPU(struct kvm_free_pages [MAX_FGPT_ENTRIES], kvm_pt);
>> +DEFINE_PER_CPU(int, kvm_pt_idx);
>> +struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES];
>> +
>> +static void empty_hyperlist(void)
>> +{
>> +	int i = 0;
>> +
>> +	while (i < MAX_FGPT_ENTRIES) {
>> +		hypervisor_pagelist[i].pfn = 0;
>> +		hypervisor_pagelist[i].pages = 0;
>> +		i++;
>> +	}
>> +}
>> +
>> +static void make_hypercall(void)
>> +{
>> +	/*
>> +	 * Dummy function: Tobe filled later.
>> +	 */
>> +	empty_hyperlist();
>> +}
>> +
>> +static int sort_pfn(const void *a1, const void *b1)
>> +{
>> +	const struct hypervisor_pages *a = a1;
>> +	const struct hypervisor_pages *b = b1;
>> +
>> +	if (a->pfn > b->pfn)
>> +		return 1;
>> +
>> +	if (a->pfn < b->pfn)
>> +		return -1;
>> +
>> +	return 0;
>> +}
>> +
>> +static int pack_hyperlist(void)
>> +{
>> +	int i = 0, j = 0;
>> +
>> +	while (i < MAX_FGPT_ENTRIES) {
>> +		if (hypervisor_pagelist[i].pfn != 0) {
>> +			if (i != j) {
>> +				hypervisor_pagelist[j].pfn =
>> +						hypervisor_pagelist[i].pfn;
>> +				hypervisor_pagelist[j].pages =
>> +						hypervisor_pagelist[i].pages;
>> +			}
>> +			j++;
>> +		}
>> +		i++;
>> +	}
>> +	i = j;
>> +	while (j < MAX_FGPT_ENTRIES) {
>> +		hypervisor_pagelist[j].pfn = 0;
>> +		hypervisor_pagelist[j].pages = 0;
>> +		j++;
>> +	}
>> +	return i;
>> +}
>> +
>> +int compress_hyperlist(void)
>> +{
>> +	int i = 0, j = 1, merge_counter = 0, ret = 0;
>> +
>> +	sort(hypervisor_pagelist, MAX_FGPT_ENTRIES,
>> +	     sizeof(struct hypervisor_pages), sort_pfn, NULL);
>> +	while (i < MAX_FGPT_ENTRIES && j < MAX_FGPT_ENTRIES) {
>> +		unsigned long pfni = hypervisor_pagelist[i].pfn;
>> +		unsigned int pagesi = hypervisor_pagelist[i].pages;
>> +		unsigned long pfnj = hypervisor_pagelist[j].pfn;
>> +		unsigned int pagesj = hypervisor_pagelist[j].pages;
>> +
>> +		if (pfnj <= pfni) {
>> +			if (((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) &&
>> +			    ((pfnj + pagesj - 1) >= (pfni - 1))) {
>> +				hypervisor_pagelist[i].pfn = pfnj;
>> +				hypervisor_pagelist[i].pages += pfni - pfnj;
>> +				hypervisor_pagelist[j].pfn = 0;
>> +				hypervisor_pagelist[j].pages = 0;
>> +				j++;
>> +				merge_counter++;
>> +				continue;
>> +			} else if ((pfnj + pagesj - 1) > (pfni + pagesi - 1)) {
>> +				hypervisor_pagelist[i].pfn = pfnj;
>> +				hypervisor_pagelist[i].pages = pagesj;
>> +				hypervisor_pagelist[j].pfn = 0;
>> +				hypervisor_pagelist[j].pages = 0;
>> +				j++;
>> +				merge_counter++;
>> +				continue;
>> +			}
>> +		} else if (pfnj > pfni) {
>> +			if ((pfnj + pagesj - 1) > (pfni + pagesi - 1) &&
>> +			    (pfnj <= pfni + pagesi)) {
>> +				hypervisor_pagelist[i].pages +=
>> +						(pfnj + pagesj - 1) -
>> +						(pfni + pagesi - 1);
>> +				hypervisor_pagelist[j].pfn = 0;
>> +				hypervisor_pagelist[j].pages = 0;
>> +				j++;
>> +				merge_counter++;
>> +				continue;
>> +			} else if ((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) {
>> +				hypervisor_pagelist[j].pfn = 0;
>> +				hypervisor_pagelist[j].pages = 0;
>> +				j++;
>> +				merge_counter++;
>> +				continue;
>> +			}
>> +		}
>> +		i = j;
>> +		j++;
>> +	}
>> +	if (merge_counter != 0)
>> +		ret = pack_hyperlist() - 1;
>> +	else
>> +		ret = MAX_FGPT_ENTRIES - 1;
>> +	return ret;
>> +}
>> +
>> +void copy_hyperlist(int hyper_idx)
>> +{
>> +	int *idx = &get_cpu_var(kvm_pt_idx);
>> +	struct kvm_free_pages *free_page_obj;
>> +	int i = 0;
>> +
>> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
>> +	while (i < hyper_idx) {
>> +		free_page_obj[*idx].pfn = hypervisor_pagelist[i].pfn;
>> +		free_page_obj[*idx].pages = hypervisor_pagelist[i].pages;
>> +		*idx += 1;
>> +		i++;
>> +	}
>> +	empty_hyperlist();
>> +	put_cpu_var(kvm_pt);
>> +	put_cpu_var(kvm_pt_idx);
>> +}
>> +
>> +/*
>> + * arch_free_page_slowpath() - This function adds the guest free page entries
>> + * to hypervisor_pages list and also ensures defragmentation prior to addition
>> + * if it is present with any entry of the kvm_free_pages list.
>> + */
>> +void arch_free_page_slowpath(void)
>> +{
>> +	int idx = 0;
>> +	int hyper_idx = -1;
>> +	int *kvm_idx = &get_cpu_var(kvm_pt_idx);
>> +	struct kvm_free_pages *free_page_obj = &get_cpu_var(kvm_pt)[0];
>> +
>> +	write_seqlock(&guest_page_lock);
> I think this lock is used to synchronize CPUs wanting to allocate pages
> with the CPU doing the free page reporting, right? In other words, once
> a CPU takes this lock no other CPU will be able to allocate pages in the
> guest. Is this correct?
Yes, that is correct. I have used the lock to prevent any allocations.
>
> I'm wondering whether we could use some page flag for this purpose. I mean,
> the race condition here seems to be: once we decide to report a page as
> free, the guest can't use that page until the host has received and acked
> this page.
>
> So, is there any page flag we could use which:
>
> 1. Will prevent the page from being allocated when set
> 2. Won't mess up with the page allocator when setting it at free time
>    in arch_free_page()
>
> I don't know this myself.
I am not sure about this, I will try to find out.
>> +	while (idx < MAX_FGPT_ENTRIES) {
>> +		unsigned long pfn = free_page_obj[idx].pfn;
>> +		unsigned long pfn_end = free_page_obj[idx].pfn +
>> +					free_page_obj[idx].pages - 1;
>> +		bool prev_free = false;
>> +
>> +		while (pfn <= pfn_end) {
>> +			struct page *p = pfn_to_page(pfn);
>> +
>> +			if (PageCompound(p)) {
>> +				struct page *head_page = compound_head(p);
>> +				unsigned long head_pfn = page_to_pfn(head_page);
>> +				unsigned int alloc_pages =
>> +					1 << compound_order(head_page);
>> +
>> +				pfn = head_pfn + alloc_pages;
>> +				prev_free = false;
>> +				continue;
>> +			}
>> +			if (page_ref_count(p)) {
>> +				pfn++;
>> +				prev_free = false;
>> +				continue;
>> +			}
>> +			/*
>> +			 * The page is free so add it to the list and free the
>> +			 * hypervisor_pagelist if required.
>> +			 */
>> +			if (!prev_free) {
>> +				hyper_idx++;
>> +				hypervisor_pagelist[hyper_idx].pfn = pfn;
>> +				hypervisor_pagelist[hyper_idx].pages = 1;
>> +				if (hyper_idx == MAX_FGPT_ENTRIES - 1) {
>> +					hyper_idx =  compress_hyperlist();
>> +					if (hyper_idx >=
>> +					    HYPERLIST_THRESHOLD) {
>> +						make_hypercall();
>> +						hyper_idx = 0;
>> +					}
> Do we have a clear definition of when free pages are reported to the host?
>
> If I'm understanding the code correctly, the following conditions have
> to be met:
>
> 1. The free_page_obj[] array fills up at least once (ie. we have
>    MAX_FGPT_ENTRIES entries at least once)
>
> 2. We skip pages that might already be freed and merge contiguous regions
>    in the same array entry
>
> 3. We only report free pages to the host when the number of non-contiguous
>    free regions reaches HYPERLIST_THRESHOLD
>
> If that's correct, then one problem with that approach is that we might
> report some megas (if the memory being freed by the guest is fragmented)
> or we might also report several GBs at once (if the memory being freed by
> the guest happens to be contiguous).
>
> Does it make sense to instead allow users to configure the granularity they
> want the guest to give to the host? For example, users could do:
Yes, this is the reason why I have changed the THRESHOLD value to 1 for
now. I will either have to allow the user to configure this value through
the qemu console or via a sysctl interface, or add another condition
and constant such as a memory threshold to take care of this situation.
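
For illustration, a sysctl knob for such a threshold could look roughly like
this (names, default and placement are hypothetical, not part of this series;
needs <linux/sysctl.h>):

static unsigned long page_hinting_threshold_bytes = 1UL << 30;	/* hypothetical default */

static struct ctl_table page_hinting_table[] = {
	{
		.procname	= "kvm_page_hinting_threshold",
		.data		= &page_hinting_threshold_bytes,
		.maxlen		= sizeof(page_hinting_threshold_bytes),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{ }
};

Registering it with register_sysctl("vm", page_hinting_table) would expose it
as /proc/sys/vm/kvm_page_hinting_threshold.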
>
> (qemu) set-page-hinting 1GB
>
> So, every time a CPU has 1GB free it would report it to the host. Of course,
> this method also raise its own set of questions:
>
>  - What's a good default?
>  - How users can choose a good value for their workloads?
I am not entirely sure about this.
>
>> +				}
>> +				/*
>> +				 * If the next contiguous page is free, it can
>> +				 * be added to this same entry.
>> +				 */
>> +				prev_free = true;
>> +			} else {
>> +				/*
>> +				 * Multiple adjacent free pages
>> +				 */
>> +				hypervisor_pagelist[hyper_idx].pages++;
>> +			}
>> +			pfn++;
>> +		}
>> +		free_page_obj[idx].pfn = 0;
>> +		free_page_obj[idx].pages = 0;
>> +		idx++;
>> +	}
>> +	*kvm_idx = 0;
>> +	put_cpu_var(kvm_pt);
>> +	put_cpu_var(kvm_pt_idx);
>> +	write_sequnlock(&guest_page_lock);
>> +}
>> +
>> +void arch_alloc_page(struct page *page, int order)
>> +{
>> +	unsigned int seq;
>> +
>> +	/*
>> +	 * arch_free_page will acquire the lock once the list carrying guest
>> +	 * free pages is full and a hypercall will be made. Until complete free
>> +	 * page list is traversed no further allocaiton will be allowed.
>> +	 */
>> +	do {
>> +		seq = read_seqbegin(&guest_page_lock);
>> +	} while (read_seqretry(&guest_page_lock, seq));
>> +}
>> +
>> +void arch_free_page(struct page *page, int order)
>> +{
>> +	int *free_page_idx = &get_cpu_var(kvm_pt_idx);
> Since you're disabling interrupts, I think you can use this_cpu_ptr()
> instead of get_cpu_var() (from inside the atomic section). And maybe pass
> free_page_idx and free_page_obj to arch_free_pages() to simplify a bit.
>
> But please ignore this if you like the suggestion below.
>
>> +	struct kvm_free_pages *free_page_obj;
>> +	unsigned long flags;
>> +
>> +	/*
>> +	 * use of global variables may trigger a race condition between irq and
>> +	 * process context causing unwanted overwrites. This will be replaced
>> +	 * with a better solution to prevent such race conditions.
>> +	 */
>> +	local_irq_save(flags);
> So, this is disabling interrupts on the CPU for the entire reporting
> duration. I have no idea how long this process takes to tell whether
> or not we're disabling interrupts for too long. But I think disabling
> interrupts is protecting against free_page_obj being accessed from
> interrupt context, right?
Yes, I introduced this because there was a race condition between
process context and the irq context.
>
> If that's correct and if we want to reduce the time duration in which
> interrupts are disabled, then the following idea might work:
>
> 1. Disable interrupts only to add the page's PFN to free_page_obj[]
> 2. Only call arch_free_page_slowpath() if !in_interrupt() is true
>
> We also have to protect against preemption, but get_cpu_var() will
> take care of this. If we don't want to disable preemption for this
> long either, then maybe we could use a mutex instead (in which case
> I guess you probably want to skip the whole thing if the mutex is
> already taken).
I can try using a mutex here. Thank you for the suggestion.
>
>> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
>> +	free_page_obj[*free_page_idx].pfn = page_to_pfn(page);
>> +	free_page_obj[*free_page_idx].pages = 1 << order;
>> +	*free_page_idx += 1;
>> +	if (*free_page_idx == MAX_FGPT_ENTRIES)
>> +		arch_free_page_slowpath();
>> +	put_cpu_var(kvm_pt);
>> +	put_cpu_var(kvm_pt_idx);
>> +	local_irq_restore(flags);
>> +}
Michael S. Tsirkin June 15, 2018, 3:09 p.m. UTC | #3
On Mon, Jun 11, 2018 at 11:18:57AM -0400, nilal@redhat.com wrote:
> From: Nitesh Narayan Lal <nilal@redhat.com>
> 
> This patch adds the guest implementation in order to maintain the list of
> pages which are freed by the guest and are not reused. To avoid any
> reallocation it includes seqlock once the list is completely filled.
> Though it doesn't carries the hypercall related changes.
> 
> Signed-off-by: Nitesh Narayan Lal <nilal@redhat.com>
> ---
>  virt/kvm/page_hinting.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 287 insertions(+)
>  create mode 100644 virt/kvm/page_hinting.c
> 
> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> new file mode 100644
> index 0000000..77dc865
> --- /dev/null
> +++ b/virt/kvm/page_hinting.c
> @@ -0,0 +1,287 @@
> +#include <linux/gfp.h>
> +#include <linux/mm.h>
> +#include <linux/page_ref.h>
> +#include <linux/kvm_host.h>
> +#include <linux/sort.h>
> +#include <linux/kernel.h>
> +
> +#define MAX_FGPT_ENTRIES	1000
> +#define HYPERLIST_THRESHOLD	1	/* FIXME: find a good threshold */
> +/*
> + * struct kvm_free_pages - Tracks the pages which are freed by the guest.
> + * @pfn	- page frame number for the page which is to be freed
> + * @pages - number of pages which are supposed to be freed.
> + * A global array object is used to hold the list of pfn and number of pages
> + * which are freed by the guest. This list may also have fragmentated pages so
> + * defragmentation is a must prior to the hypercall.
> + */
> +struct kvm_free_pages {
> +	unsigned long pfn;
> +	unsigned int pages;
> +};
> +
> +/*
> + * hypervisor_pages - It is a dummy structure passed with the hypercall.
> + * @pfn - page frame number for the page which is to be freed.
> + * @pages - number of pages which are supposed to be freed.
> + * A global array object is used to to hold the list of pfn and pages and is
> + * passed as part of the hypercall.
> + */
> +struct hypervisor_pages {
> +	unsigned long pfn;
> +	unsigned int pages;
> +};
> +
> +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(guest_page_lock);
> +DEFINE_PER_CPU(struct kvm_free_pages [MAX_FGPT_ENTRIES], kvm_pt);
> +DEFINE_PER_CPU(int, kvm_pt_idx);
> +struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES];
> +
> +static void empty_hyperlist(void)
> +{
> +	int i = 0;
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		hypervisor_pagelist[i].pfn = 0;
> +		hypervisor_pagelist[i].pages = 0;
> +		i++;
> +	}
> +}
> +
> +static void make_hypercall(void)
> +{
> +	/*
> +	 * Dummy function: Tobe filled later.
> +	 */
> +	empty_hyperlist();
> +}
> +
> +static int sort_pfn(const void *a1, const void *b1)
> +{
> +	const struct hypervisor_pages *a = a1;
> +	const struct hypervisor_pages *b = b1;
> +
> +	if (a->pfn > b->pfn)
> +		return 1;
> +
> +	if (a->pfn < b->pfn)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int pack_hyperlist(void)
> +{
> +	int i = 0, j = 0;
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		if (hypervisor_pagelist[i].pfn != 0) {
> +			if (i != j) {
> +				hypervisor_pagelist[j].pfn =
> +						hypervisor_pagelist[i].pfn;
> +				hypervisor_pagelist[j].pages =
> +						hypervisor_pagelist[i].pages;
> +			}
> +			j++;
> +		}
> +		i++;
> +	}
> +	i = j;
> +	while (j < MAX_FGPT_ENTRIES) {
> +		hypervisor_pagelist[j].pfn = 0;
> +		hypervisor_pagelist[j].pages = 0;
> +		j++;
> +	}
> +	return i;
> +}
> +
> +int compress_hyperlist(void)
> +{
> +	int i = 0, j = 1, merge_counter = 0, ret = 0;
> +
> +	sort(hypervisor_pagelist, MAX_FGPT_ENTRIES,
> +	     sizeof(struct hypervisor_pages), sort_pfn, NULL);
> +	while (i < MAX_FGPT_ENTRIES && j < MAX_FGPT_ENTRIES) {
> +		unsigned long pfni = hypervisor_pagelist[i].pfn;
> +		unsigned int pagesi = hypervisor_pagelist[i].pages;
> +		unsigned long pfnj = hypervisor_pagelist[j].pfn;
> +		unsigned int pagesj = hypervisor_pagelist[j].pages;
> +
> +		if (pfnj <= pfni) {
> +			if (((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) &&
> +			    ((pfnj + pagesj - 1) >= (pfni - 1))) {
> +				hypervisor_pagelist[i].pfn = pfnj;
> +				hypervisor_pagelist[i].pages += pfni - pfnj;
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			} else if ((pfnj + pagesj - 1) > (pfni + pagesi - 1)) {
> +				hypervisor_pagelist[i].pfn = pfnj;
> +				hypervisor_pagelist[i].pages = pagesj;
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			}
> +		} else if (pfnj > pfni) {
> +			if ((pfnj + pagesj - 1) > (pfni + pagesi - 1) &&
> +			    (pfnj <= pfni + pagesi)) {
> +				hypervisor_pagelist[i].pages +=
> +						(pfnj + pagesj - 1) -
> +						(pfni + pagesi - 1);
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			} else if ((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) {
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			}
> +		}
> +		i = j;
> +		j++;
> +	}
> +	if (merge_counter != 0)
> +		ret = pack_hyperlist() - 1;
> +	else
> +		ret = MAX_FGPT_ENTRIES - 1;
> +	return ret;
> +}
> +
> +void copy_hyperlist(int hyper_idx)
> +{
> +	int *idx = &get_cpu_var(kvm_pt_idx);
> +	struct kvm_free_pages *free_page_obj;
> +	int i = 0;
> +
> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
> +	while (i < hyper_idx) {
> +		free_page_obj[*idx].pfn = hypervisor_pagelist[i].pfn;
> +		free_page_obj[*idx].pages = hypervisor_pagelist[i].pages;
> +		*idx += 1;
> +		i++;
> +	}
> +	empty_hyperlist();
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +}
> +
> +/*
> + * arch_free_page_slowpath() - This function adds the guest free page entries
> + * to hypervisor_pages list and also ensures defragmentation prior to addition
> + * if it is present with any entry of the kvm_free_pages list.
> + */
> +void arch_free_page_slowpath(void)
> +{
> +	int idx = 0;
> +	int hyper_idx = -1;
> +	int *kvm_idx = &get_cpu_var(kvm_pt_idx);
> +	struct kvm_free_pages *free_page_obj = &get_cpu_var(kvm_pt)[0];
> +
> +	write_seqlock(&guest_page_lock);
> +	while (idx < MAX_FGPT_ENTRIES) {
> +		unsigned long pfn = free_page_obj[idx].pfn;
> +		unsigned long pfn_end = free_page_obj[idx].pfn +
> +					free_page_obj[idx].pages - 1;
> +		bool prev_free = false;
> +
> +		while (pfn <= pfn_end) {

It seems like this can be MAX_FGPT_ENTRIES^2 so 1000000 cycles in the
inner-most loop in the worst case, all with interrupts disabled and all
kinds of MM locks held. This kind of latency likely won't be acceptable
to the MM crowd even if it only happens once in a while.


> +			struct page *p = pfn_to_page(pfn);
> +
> +			if (PageCompound(p)) {
> +				struct page *head_page = compound_head(p);
> +				unsigned long head_pfn = page_to_pfn(head_page);
> +				unsigned int alloc_pages =
> +					1 << compound_order(head_page);
> +
> +				pfn = head_pfn + alloc_pages;
> +				prev_free = false;
> +				continue;
> +			}
> +			if (page_ref_count(p)) {
> +				pfn++;
> +				prev_free = false;
> +				continue;
> +			}
> +			/*
> +			 * The page is free so add it to the list and free the
> +			 * hypervisor_pagelist if required.
> +			 */
> +			if (!prev_free) {
> +				hyper_idx++;
> +				hypervisor_pagelist[hyper_idx].pfn = pfn;
> +				hypervisor_pagelist[hyper_idx].pages = 1;
> +				if (hyper_idx == MAX_FGPT_ENTRIES - 1) {
> +					hyper_idx =  compress_hyperlist();
> +					if (hyper_idx >=
> +					    HYPERLIST_THRESHOLD) {
> +						make_hypercall();
> +						hyper_idx = 0;
> +					}
> +				}
> +				/*
> +				 * If the next contiguous page is free, it can
> +				 * be added to this same entry.
> +				 */
> +				prev_free = true;
> +			} else {
> +				/*
> +				 * Multiple adjacent free pages
> +				 */
> +				hypervisor_pagelist[hyper_idx].pages++;
> +			}
> +			pfn++;
> +		}
> +		free_page_obj[idx].pfn = 0;
> +		free_page_obj[idx].pages = 0;
> +		idx++;
> +	}
> +	*kvm_idx = 0;
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +	write_sequnlock(&guest_page_lock);
> +}
> +
> +void arch_alloc_page(struct page *page, int order)
> +{
> +	unsigned int seq;
> +
> +	/*
> +	 * arch_free_page will acquire the lock once the list carrying guest
> +	 * free pages is full and a hypercall will be made. Until complete free
> +	 * page list is traversed no further allocaiton will be allowed.

Yes that's how locks work, but why are we doing it here?
You want a comment that supplies the reason for action,
rather than just restating it.

> +	 */
> +	do {
> +		seq = read_seqbegin(&guest_page_lock);
> +	} while (read_seqretry(&guest_page_lock, seq));

No actual action is taken here. Doesn't this mean pages can
get used after they are in the free_page_obj array
but before they are sent to the hypervisor?
Won't sending them to the hypervisor confuse things then?



> +}
> +
> +void arch_free_page(struct page *page, int order)
> +{
> +	int *free_page_idx = &get_cpu_var(kvm_pt_idx);
> +	struct kvm_free_pages *free_page_obj;
> +	unsigned long flags;
> +
> +	/*
> +	 * use of global variables may trigger a race condition between irq and
> +	 * process context causing unwanted overwrites. This will be replaced
> +	 * with a better solution to prevent such race conditions.
> +	 */
> +	local_irq_save(flags);
> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
> +	free_page_obj[*free_page_idx].pfn = page_to_pfn(page);
> +	free_page_obj[*free_page_idx].pages = 1 << order;
> +	*free_page_idx += 1;
> +	if (*free_page_idx == MAX_FGPT_ENTRIES)
> +		arch_free_page_slowpath();
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +	local_irq_restore(flags);
> +}
> -- 
> 2.9.5
Rik van Riel June 15, 2018, 4:14 p.m. UTC | #4
On Fri, 2018-06-15 at 18:09 +0300, Michael S. Tsirkin wrote:
> On Mon, Jun 11, 2018 at 11:18:57AM -0400, nilal@redhat.com wrote:
> > From: Nitesh Narayan Lal <nilal@redhat.com>

> > +	write_seqlock(&guest_page_lock);
> > +	while (idx < MAX_FGPT_ENTRIES) {
> > +		unsigned long pfn = free_page_obj[idx].pfn;
> > +		unsigned long pfn_end = free_page_obj[idx].pfn +
> > +					free_page_obj[idx].pages -
> > 1;
> > +		bool prev_free = false;
> > +
> > +		while (pfn <= pfn_end) {
> 
> It seems like this can be MAX_FGPT_ENTRIES^2 so 1000000 cycles in the
> inner-most loop in the worst case, all with interrupts disabled and
> all
> kind of MM locks held. This kind of latency likely won't be
> acceptable
> to the MM crowd even if it only happens once in a while.

That makes me wonder if, instead of spending time making
this code perfect, it would make sense to also implement
a prototype of your bitmap page hinting idea and compare
the two.

This project is probably at the "50% effort" stage, and
the bitmap page hinting approach seems like an interesting
possibility with a different tradeoff...
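
(The bitmap approach is not spelled out in this thread; purely as an
illustrative guess at its shape, with hypothetical names, it might record
free ranges in a PFN-indexed bitmap instead of a fixed-size array:

static unsigned long *free_pfn_bitmap;	/* one bit per guest page frame */

static void hint_mark_free(unsigned long pfn, unsigned int nr_pages)
{
	bitmap_set(free_pfn_bitmap, pfn, nr_pages);
}

static void hint_mark_allocated(unsigned long pfn, unsigned int nr_pages)
{
	bitmap_clear(free_pfn_bitmap, pfn, nr_pages);
}

The reporting pass would then walk the bitmap for set ranges rather than
sorting and merging an array.)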
Nitesh Lal June 15, 2018, 4:55 p.m. UTC | #5
On 06/15/2018 11:09 AM, Michael S. Tsirkin wrote:
> On Mon, Jun 11, 2018 at 11:18:57AM -0400, nilal@redhat.com wrote:
>> From: Nitesh Narayan Lal <nilal@redhat.com>
>>
>> This patch adds the guest implementation in order to maintain the list of
>> pages which are freed by the guest and are not reused. To avoid any
>> reallocation it includes seqlock once the list is completely filled.
>> Though it doesn't carries the hypercall related changes.
>>
>> Signed-off-by: Nitesh Narayan Lal <nilal@redhat.com>
>> ---
>>  virt/kvm/page_hinting.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 287 insertions(+)
>>  create mode 100644 virt/kvm/page_hinting.c
>>
>> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
>> new file mode 100644
>> index 0000000..77dc865
>> --- /dev/null
>> +++ b/virt/kvm/page_hinting.c
>> @@ -0,0 +1,287 @@
>> +#include <linux/gfp.h>
>> +#include <linux/mm.h>
>> +#include <linux/page_ref.h>
>> +#include <linux/kvm_host.h>
>> +#include <linux/sort.h>
>> +#include <linux/kernel.h>
>> +
>> +#define MAX_FGPT_ENTRIES	1000
>> +#define HYPERLIST_THRESHOLD	1	/* FIXME: find a good threshold */
>> +/*
>> + * struct kvm_free_pages - Tracks the pages which are freed by the guest.
>> + * @pfn	- page frame number for the page which is to be freed
>> + * @pages - number of pages which are supposed to be freed.
>> + * A global array object is used to hold the list of pfn and number of pages
>> + * which are freed by the guest. This list may also have fragmentated pages so
>> + * defragmentation is a must prior to the hypercall.
>> + */
>> +struct kvm_free_pages {
>> +	unsigned long pfn;
>> +	unsigned int pages;
>> +};
>> +
>> +/*
>> + * hypervisor_pages - It is a dummy structure passed with the hypercall.
>> + * @pfn - page frame number for the page which is to be freed.
>> + * @pages - number of pages which are supposed to be freed.
>> + * A global array object is used to to hold the list of pfn and pages and is
>> + * passed as part of the hypercall.
>> + */
>> +struct hypervisor_pages {
>> +	unsigned long pfn;
>> +	unsigned int pages;
>> +};
>> +
>> +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(guest_page_lock);
>> +DEFINE_PER_CPU(struct kvm_free_pages [MAX_FGPT_ENTRIES], kvm_pt);
>> +DEFINE_PER_CPU(int, kvm_pt_idx);
>> +struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES];
>> +
>> +static void empty_hyperlist(void)
>> +{
>> +	int i = 0;
>> +
>> +	while (i < MAX_FGPT_ENTRIES) {
>> +		hypervisor_pagelist[i].pfn = 0;
>> +		hypervisor_pagelist[i].pages = 0;
>> +		i++;
>> +	}
>> +}
>> +
>> +static void make_hypercall(void)
>> +{
>> +	/*
>> +	 * Dummy function: Tobe filled later.
>> +	 */
>> +	empty_hyperlist();
>> +}
>> +
>> +static int sort_pfn(const void *a1, const void *b1)
>> +{
>> +	const struct hypervisor_pages *a = a1;
>> +	const struct hypervisor_pages *b = b1;
>> +
>> +	if (a->pfn > b->pfn)
>> +		return 1;
>> +
>> +	if (a->pfn < b->pfn)
>> +		return -1;
>> +
>> +	return 0;
>> +}
>> +
>> +static int pack_hyperlist(void)
>> +{
>> +	int i = 0, j = 0;
>> +
>> +	while (i < MAX_FGPT_ENTRIES) {
>> +		if (hypervisor_pagelist[i].pfn != 0) {
>> +			if (i != j) {
>> +				hypervisor_pagelist[j].pfn =
>> +						hypervisor_pagelist[i].pfn;
>> +				hypervisor_pagelist[j].pages =
>> +						hypervisor_pagelist[i].pages;
>> +			}
>> +			j++;
>> +		}
>> +		i++;
>> +	}
>> +	i = j;
>> +	while (j < MAX_FGPT_ENTRIES) {
>> +		hypervisor_pagelist[j].pfn = 0;
>> +		hypervisor_pagelist[j].pages = 0;
>> +		j++;
>> +	}
>> +	return i;
>> +}
>> +
>> +int compress_hyperlist(void)
>> +{
>> +	int i = 0, j = 1, merge_counter = 0, ret = 0;
>> +
>> +	sort(hypervisor_pagelist, MAX_FGPT_ENTRIES,
>> +	     sizeof(struct hypervisor_pages), sort_pfn, NULL);
>> +	while (i < MAX_FGPT_ENTRIES && j < MAX_FGPT_ENTRIES) {
>> +		unsigned long pfni = hypervisor_pagelist[i].pfn;
>> +		unsigned int pagesi = hypervisor_pagelist[i].pages;
>> +		unsigned long pfnj = hypervisor_pagelist[j].pfn;
>> +		unsigned int pagesj = hypervisor_pagelist[j].pages;
>> +
>> +		if (pfnj <= pfni) {
>> +			if (((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) &&
>> +			    ((pfnj + pagesj - 1) >= (pfni - 1))) {
>> +				hypervisor_pagelist[i].pfn = pfnj;
>> +				hypervisor_pagelist[i].pages += pfni - pfnj;
>> +				hypervisor_pagelist[j].pfn = 0;
>> +				hypervisor_pagelist[j].pages = 0;
>> +				j++;
>> +				merge_counter++;
>> +				continue;
>> +			} else if ((pfnj + pagesj - 1) > (pfni + pagesi - 1)) {
>> +				hypervisor_pagelist[i].pfn = pfnj;
>> +				hypervisor_pagelist[i].pages = pagesj;
>> +				hypervisor_pagelist[j].pfn = 0;
>> +				hypervisor_pagelist[j].pages = 0;
>> +				j++;
>> +				merge_counter++;
>> +				continue;
>> +			}
>> +		} else if (pfnj > pfni) {
>> +			if ((pfnj + pagesj - 1) > (pfni + pagesi - 1) &&
>> +			    (pfnj <= pfni + pagesi)) {
>> +				hypervisor_pagelist[i].pages +=
>> +						(pfnj + pagesj - 1) -
>> +						(pfni + pagesi - 1);
>> +				hypervisor_pagelist[j].pfn = 0;
>> +				hypervisor_pagelist[j].pages = 0;
>> +				j++;
>> +				merge_counter++;
>> +				continue;
>> +			} else if ((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) {
>> +				hypervisor_pagelist[j].pfn = 0;
>> +				hypervisor_pagelist[j].pages = 0;
>> +				j++;
>> +				merge_counter++;
>> +				continue;
>> +			}
>> +		}
>> +		i = j;
>> +		j++;
>> +	}
>> +	if (merge_counter != 0)
>> +		ret = pack_hyperlist() - 1;
>> +	else
>> +		ret = MAX_FGPT_ENTRIES - 1;
>> +	return ret;
>> +}
>> +
>> +void copy_hyperlist(int hyper_idx)
>> +{
>> +	int *idx = &get_cpu_var(kvm_pt_idx);
>> +	struct kvm_free_pages *free_page_obj;
>> +	int i = 0;
>> +
>> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
>> +	while (i < hyper_idx) {
>> +		free_page_obj[*idx].pfn = hypervisor_pagelist[i].pfn;
>> +		free_page_obj[*idx].pages = hypervisor_pagelist[i].pages;
>> +		*idx += 1;
>> +		i++;
>> +	}
>> +	empty_hyperlist();
>> +	put_cpu_var(kvm_pt);
>> +	put_cpu_var(kvm_pt_idx);
>> +}
>> +
>> +/*
>> + * arch_free_page_slowpath() - This function adds the guest free page entries
>> + * to hypervisor_pages list and also ensures defragmentation prior to addition
>> + * if it is present with any entry of the kvm_free_pages list.
>> + */
>> +void arch_free_page_slowpath(void)
>> +{
>> +	int idx = 0;
>> +	int hyper_idx = -1;
>> +	int *kvm_idx = &get_cpu_var(kvm_pt_idx);
>> +	struct kvm_free_pages *free_page_obj = &get_cpu_var(kvm_pt)[0];
>> +
>> +	write_seqlock(&guest_page_lock);
>> +	while (idx < MAX_FGPT_ENTRIES) {
>> +		unsigned long pfn = free_page_obj[idx].pfn;
>> +		unsigned long pfn_end = free_page_obj[idx].pfn +
>> +					free_page_obj[idx].pages - 1;
>> +		bool prev_free = false;
>> +
>> +		while (pfn <= pfn_end) {
> It seems like this can be MAX_FGPT_ENTRIES^2 so 1000000 cycles in the
> inner-most loop in the worst case, all with interrupts disabled and all
> kind of MM locks held. This kind of latency likely won't be acceptable
> to the MM crowd even if it only happens once in a while.
Thank you for pointing this out. I will try to resolve this issue in my
next patch series.
>> +			struct page *p = pfn_to_page(pfn);
>> +
>> +			if (PageCompound(p)) {
>> +				struct page *head_page = compound_head(p);
>> +				unsigned long head_pfn = page_to_pfn(head_page);
>> +				unsigned int alloc_pages =
>> +					1 << compound_order(head_page);
>> +
>> +				pfn = head_pfn + alloc_pages;
>> +				prev_free = false;
>> +				continue;
>> +			}
>> +			if (page_ref_count(p)) {
>> +				pfn++;
>> +				prev_free = false;
>> +				continue;
>> +			}
>> +			/*
>> +			 * The page is free so add it to the list and free the
>> +			 * hypervisor_pagelist if required.
>> +			 */
>> +			if (!prev_free) {
>> +				hyper_idx++;
>> +				hypervisor_pagelist[hyper_idx].pfn = pfn;
>> +				hypervisor_pagelist[hyper_idx].pages = 1;
>> +				if (hyper_idx == MAX_FGPT_ENTRIES - 1) {
>> +					hyper_idx =  compress_hyperlist();
>> +					if (hyper_idx >=
>> +					    HYPERLIST_THRESHOLD) {
>> +						make_hypercall();
>> +						hyper_idx = 0;
>> +					}
>> +				}
>> +				/*
>> +				 * If the next contiguous page is free, it can
>> +				 * be added to this same entry.
>> +				 */
>> +				prev_free = true;
>> +			} else {
>> +				/*
>> +				 * Multiple adjacent free pages
>> +				 */
>> +				hypervisor_pagelist[hyper_idx].pages++;
>> +			}
>> +			pfn++;
>> +		}
>> +		free_page_obj[idx].pfn = 0;
>> +		free_page_obj[idx].pages = 0;
>> +		idx++;
>> +	}
>> +	*kvm_idx = 0;
>> +	put_cpu_var(kvm_pt);
>> +	put_cpu_var(kvm_pt_idx);
>> +	write_sequnlock(&guest_page_lock);
>> +}
>> +
>> +void arch_alloc_page(struct page *page, int order)
>> +{
>> +	unsigned int seq;
>> +
>> +	/*
>> +	 * arch_free_page will acquire the lock once the list carrying guest
>> +	 * free pages is full and a hypercall will be made. Until complete free
>> +	 * page list is traversed no further allocaiton will be allowed.
> Yes that's how locks work, but why are we doing it here?
> You want a comment that supplies the reason for action,
> rather that just restates it.
I will correct this. The intent is to prevent any allocation, as we don't
want pages to be reallocated while we are scanning the per-cpu list. It
is possible that, while pages were being added to the per-cpu list, some
of the earlier added pages were reused. So, to drop the reused pages
and to prevent further allocations during that time, I take the lock and
prepare a final list of free pages which is sent to the host.
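
Based on that explanation, the comment could state the reason directly; for
example (wording is only a suggestion):

void arch_alloc_page(struct page *page, int order)
{
	unsigned int seq;

	/*
	 * Wait for any in-progress reporting pass to finish.
	 * arch_free_page_slowpath() holds guest_page_lock while it rescans
	 * the per-cpu free list and sends it to the host; spinning here
	 * keeps this CPU from handing out a page that is about to be
	 * reported as free.
	 */
	do {
		seq = read_seqbegin(&guest_page_lock);
	} while (read_seqretry(&guest_page_lock, seq));
}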
>
>> +	 */
>> +	do {
>> +		seq = read_seqbegin(&guest_page_lock);
>> +	} while (read_seqretry(&guest_page_lock, seq));
> No actual action is taken here. Doesn't this mean pages can
> get used after they are in the free_page_obj array
> but before they are sent to hypervisor?
> Won't sending them to hypervisor confuse things then?
Yes, pages can be re-allocated as long as free_page_obj is not
completely filled. Once it is filled I take the lock to prevent any
further allocation. After that I scan free_page_obj for
re-allocations and prepare the final free page list, containing only free
pages, which is sent to the host.
>
>
>
>> +}
>> +
>> +void arch_free_page(struct page *page, int order)
>> +{
>> +	int *free_page_idx = &get_cpu_var(kvm_pt_idx);
>> +	struct kvm_free_pages *free_page_obj;
>> +	unsigned long flags;
>> +
>> +	/*
>> +	 * use of global variables may trigger a race condition between irq and
>> +	 * process context causing unwanted overwrites. This will be replaced
>> +	 * with a better solution to prevent such race conditions.
>> +	 */
>> +	local_irq_save(flags);
>> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
>> +	free_page_obj[*free_page_idx].pfn = page_to_pfn(page);
>> +	free_page_obj[*free_page_idx].pages = 1 << order;
>> +	*free_page_idx += 1;
>> +	if (*free_page_idx == MAX_FGPT_ENTRIES)
>> +		arch_free_page_slowpath();
>> +	put_cpu_var(kvm_pt);
>> +	put_cpu_var(kvm_pt_idx);
>> +	local_irq_restore(flags);
>> +}
>> -- 
>> 2.9.5
Michael S. Tsirkin July 11, 2018, 7:43 p.m. UTC | #6
On Fri, Jun 15, 2018 at 12:14:21PM -0400, Rik van Riel wrote:
> On Fri, 2018-06-15 at 18:09 +0300, Michael S. Tsirkin wrote:
> > On Mon, Jun 11, 2018 at 11:18:57AM -0400, nilal@redhat.com wrote:
> > > From: Nitesh Narayan Lal <nilal@redhat.com>
> 
> > > +	write_seqlock(&guest_page_lock);
> > > +	while (idx < MAX_FGPT_ENTRIES) {
> > > +		unsigned long pfn = free_page_obj[idx].pfn;
> > > +		unsigned long pfn_end = free_page_obj[idx].pfn +
> > > +					free_page_obj[idx].pages -
> > > 1;
> > > +		bool prev_free = false;
> > > +
> > > +		while (pfn <= pfn_end) {
> > 
> > It seems like this can be MAX_FGPT_ENTRIES^2 so 1000000 cycles in the
> > inner-most loop in the worst case, all with interrupts disabled and
> > all
> > kind of MM locks held. This kind of latency likely won't be
> > acceptable
> > to the MM crowd even if it only happens once in a while.
> 
> That makes me wonder if instead of spending time making
> this code perfect, it would make time to also implement
> a prototype of your bitmap page hinting idea, and compare
> the two.
> 
> This project is probably at the "50% effort" stage, and
> the bitmap page hinting approach seems like an interesting
> possible different tradeoff...

Since you mention a bitmap, this is suspiciously similar to
frontswap. Would it help to support tmem on kvm?

> -- 
> All Rights Reversed.

Patch

diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
new file mode 100644
index 0000000..77dc865
--- /dev/null
+++ b/virt/kvm/page_hinting.c
@@ -0,0 +1,287 @@ 
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/page_ref.h>
+#include <linux/kvm_host.h>
+#include <linux/sort.h>
+#include <linux/kernel.h>
+
+#define MAX_FGPT_ENTRIES	1000
+#define HYPERLIST_THRESHOLD	1	/* FIXME: find a good threshold */
+/*
+ * struct kvm_free_pages - Tracks the pages which are freed by the guest.
+ * @pfn	- page frame number for the page which is to be freed
+ * @pages - number of pages which are supposed to be freed.
+ * A global array object is used to hold the list of pfn and number of pages
+ * which are freed by the guest. This list may also have fragmented pages so
+ * defragmentation is a must prior to the hypercall.
+ */
+struct kvm_free_pages {
+	unsigned long pfn;
+	unsigned int pages;
+};
+
+/*
+ * hypervisor_pages - It is a dummy structure passed with the hypercall.
+ * @pfn - page frame number for the page which is to be freed.
+ * @pages - number of pages which are supposed to be freed.
+ * A global array object is used to hold the list of pfn and pages and is
+ * passed as part of the hypercall.
+ */
+struct hypervisor_pages {
+	unsigned long pfn;
+	unsigned int pages;
+};
+
+static __cacheline_aligned_in_smp DEFINE_SEQLOCK(guest_page_lock);
+DEFINE_PER_CPU(struct kvm_free_pages [MAX_FGPT_ENTRIES], kvm_pt);
+DEFINE_PER_CPU(int, kvm_pt_idx);
+struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES];
+
+static void empty_hyperlist(void)
+{
+	int i = 0;
+
+	while (i < MAX_FGPT_ENTRIES) {
+		hypervisor_pagelist[i].pfn = 0;
+		hypervisor_pagelist[i].pages = 0;
+		i++;
+	}
+}
+
+static void make_hypercall(void)
+{
+	/*
+	 * Dummy function: To be filled in later.
+	 */
+	empty_hyperlist();
+}
+
+static int sort_pfn(const void *a1, const void *b1)
+{
+	const struct hypervisor_pages *a = a1;
+	const struct hypervisor_pages *b = b1;
+
+	if (a->pfn > b->pfn)
+		return 1;
+
+	if (a->pfn < b->pfn)
+		return -1;
+
+	return 0;
+}
+
+static int pack_hyperlist(void)
+{
+	int i = 0, j = 0;
+
+	while (i < MAX_FGPT_ENTRIES) {
+		if (hypervisor_pagelist[i].pfn != 0) {
+			if (i != j) {
+				hypervisor_pagelist[j].pfn =
+						hypervisor_pagelist[i].pfn;
+				hypervisor_pagelist[j].pages =
+						hypervisor_pagelist[i].pages;
+			}
+			j++;
+		}
+		i++;
+	}
+	i = j;
+	while (j < MAX_FGPT_ENTRIES) {
+		hypervisor_pagelist[j].pfn = 0;
+		hypervisor_pagelist[j].pages = 0;
+		j++;
+	}
+	return i;
+}
+
+int compress_hyperlist(void)
+{
+	int i = 0, j = 1, merge_counter = 0, ret = 0;
+
+	sort(hypervisor_pagelist, MAX_FGPT_ENTRIES,
+	     sizeof(struct hypervisor_pages), sort_pfn, NULL);
+	while (i < MAX_FGPT_ENTRIES && j < MAX_FGPT_ENTRIES) {
+		unsigned long pfni = hypervisor_pagelist[i].pfn;
+		unsigned int pagesi = hypervisor_pagelist[i].pages;
+		unsigned long pfnj = hypervisor_pagelist[j].pfn;
+		unsigned int pagesj = hypervisor_pagelist[j].pages;
+
+		if (pfnj <= pfni) {
+			if (((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) &&
+			    ((pfnj + pagesj - 1) >= (pfni - 1))) {
+				hypervisor_pagelist[i].pfn = pfnj;
+				hypervisor_pagelist[i].pages += pfni - pfnj;
+				hypervisor_pagelist[j].pfn = 0;
+				hypervisor_pagelist[j].pages = 0;
+				j++;
+				merge_counter++;
+				continue;
+			} else if ((pfnj + pagesj - 1) > (pfni + pagesi - 1)) {
+				hypervisor_pagelist[i].pfn = pfnj;
+				hypervisor_pagelist[i].pages = pagesj;
+				hypervisor_pagelist[j].pfn = 0;
+				hypervisor_pagelist[j].pages = 0;
+				j++;
+				merge_counter++;
+				continue;
+			}
+		} else if (pfnj > pfni) {
+			if ((pfnj + pagesj - 1) > (pfni + pagesi - 1) &&
+			    (pfnj <= pfni + pagesi)) {
+				hypervisor_pagelist[i].pages +=
+						(pfnj + pagesj - 1) -
+						(pfni + pagesi - 1);
+				hypervisor_pagelist[j].pfn = 0;
+				hypervisor_pagelist[j].pages = 0;
+				j++;
+				merge_counter++;
+				continue;
+			} else if ((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) {
+				hypervisor_pagelist[j].pfn = 0;
+				hypervisor_pagelist[j].pages = 0;
+				j++;
+				merge_counter++;
+				continue;
+			}
+		}
+		i = j;
+		j++;
+	}
+	if (merge_counter != 0)
+		ret = pack_hyperlist() - 1;
+	else
+		ret = MAX_FGPT_ENTRIES - 1;
+	return ret;
+}
+
+void copy_hyperlist(int hyper_idx)
+{
+	int *idx = &get_cpu_var(kvm_pt_idx);
+	struct kvm_free_pages *free_page_obj;
+	int i = 0;
+
+	free_page_obj = &get_cpu_var(kvm_pt)[0];
+	while (i < hyper_idx) {
+		free_page_obj[*idx].pfn = hypervisor_pagelist[i].pfn;
+		free_page_obj[*idx].pages = hypervisor_pagelist[i].pages;
+		*idx += 1;
+		i++;
+	}
+	empty_hyperlist();
+	put_cpu_var(kvm_pt);
+	put_cpu_var(kvm_pt_idx);
+}
+
+/*
+ * arch_free_page_slowpath() - This function adds the guest free page entries
+ * to hypervisor_pages list and also ensures defragmentation prior to addition
+ * if it is present with any entry of the kvm_free_pages list.
+ */
+void arch_free_page_slowpath(void)
+{
+	int idx = 0;
+	int hyper_idx = -1;
+	int *kvm_idx = &get_cpu_var(kvm_pt_idx);
+	struct kvm_free_pages *free_page_obj = &get_cpu_var(kvm_pt)[0];
+
+	write_seqlock(&guest_page_lock);
+	while (idx < MAX_FGPT_ENTRIES) {
+		unsigned long pfn = free_page_obj[idx].pfn;
+		unsigned long pfn_end = free_page_obj[idx].pfn +
+					free_page_obj[idx].pages - 1;
+		bool prev_free = false;
+
+		while (pfn <= pfn_end) {
+			struct page *p = pfn_to_page(pfn);
+
+			if (PageCompound(p)) {
+				struct page *head_page = compound_head(p);
+				unsigned long head_pfn = page_to_pfn(head_page);
+				unsigned int alloc_pages =
+					1 << compound_order(head_page);
+
+				pfn = head_pfn + alloc_pages;
+				prev_free = false;
+				continue;
+			}
+			if (page_ref_count(p)) {
+				pfn++;
+				prev_free = false;
+				continue;
+			}
+			/*
+			 * The page is free so add it to the list and free the
+			 * hypervisor_pagelist if required.
+			 */
+			if (!prev_free) {
+				hyper_idx++;
+				hypervisor_pagelist[hyper_idx].pfn = pfn;
+				hypervisor_pagelist[hyper_idx].pages = 1;
+				if (hyper_idx == MAX_FGPT_ENTRIES - 1) {
+					hyper_idx =  compress_hyperlist();
+					if (hyper_idx >=
+					    HYPERLIST_THRESHOLD) {
+						make_hypercall();
+						hyper_idx = 0;
+					}
+				}
+				/*
+				 * If the next contiguous page is free, it can
+				 * be added to this same entry.
+				 */
+				prev_free = true;
+			} else {
+				/*
+				 * Multiple adjacent free pages
+				 */
+				hypervisor_pagelist[hyper_idx].pages++;
+			}
+			pfn++;
+		}
+		free_page_obj[idx].pfn = 0;
+		free_page_obj[idx].pages = 0;
+		idx++;
+	}
+	*kvm_idx = 0;
+	put_cpu_var(kvm_pt);
+	put_cpu_var(kvm_pt_idx);
+	write_sequnlock(&guest_page_lock);
+}
+
+void arch_alloc_page(struct page *page, int order)
+{
+	unsigned int seq;
+
+	/*
+	 * arch_free_page will acquire the lock once the list carrying guest
+	 * free pages is full and a hypercall will be made. Until the complete free
+	 * page list is traversed no further allocation will be allowed.
+	 */
+	do {
+		seq = read_seqbegin(&guest_page_lock);
+	} while (read_seqretry(&guest_page_lock, seq));
+}
+
+void arch_free_page(struct page *page, int order)
+{
+	int *free_page_idx = &get_cpu_var(kvm_pt_idx);
+	struct kvm_free_pages *free_page_obj;
+	unsigned long flags;
+
+	/*
+	 * use of global variables may trigger a race condition between irq and
+	 * process context causing unwanted overwrites. This will be replaced
+	 * with a better solution to prevent such race conditions.
+	 */
+	local_irq_save(flags);
+	free_page_obj = &get_cpu_var(kvm_pt)[0];
+	free_page_obj[*free_page_idx].pfn = page_to_pfn(page);
+	free_page_obj[*free_page_idx].pages = 1 << order;
+	*free_page_idx += 1;
+	if (*free_page_idx == MAX_FGPT_ENTRIES)
+		arch_free_page_slowpath();
+	put_cpu_var(kvm_pt);
+	put_cpu_var(kvm_pt_idx);
+	local_irq_restore(flags);
+}