[v4,2/6] KVM: Guest page hinting functionality

Message ID	20171103203013.9521-3-nilal@redhat.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <kvm-owner@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mx1.redhat.com F3C1A2CE970 From: nilal@redhat.com To: kvm@vger.kernel.org, pbonzini@redhat.com, pagupta@redhat.com, wei.w.wang@intel.com, yang.zhang.wz@gmail.com, riel@redhat.com, david@redhat.com, mst@redhat.com, dodgen@google.com, konrad.wilk@oracle.com Subject: [Patch v4 2/6] KVM: Guest page hinting functionality Date: Fri, 3 Nov 2017 16:30:09 -0400 Message-Id: <20171103203013.9521-3-nilal@redhat.com> In-Reply-To: <20171103203013.9521-1-nilal@redhat.com> References: <20171103203013.9521-1-nilal@redhat.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk

Message ID

20171103203013.9521-3-nilal@redhat.com (mailing list archive)

State

New, archived

Headers

DMARC-Filter: OpenDMARC Filter v1.3.2 mx1.redhat.com F3C1A2CE970
From: nilal@redhat.com
To: kvm@vger.kernel.org, pbonzini@redhat.com, pagupta@redhat.com,
	wei.w.wang@intel.com, yang.zhang.wz@gmail.com, riel@redhat.com,
	david@redhat.com, mst@redhat.com, dodgen@google.com,
	konrad.wilk@oracle.com
Subject: [Patch v4 2/6] KVM: Guest page hinting functionality
Date: Fri,  3 Nov 2017 16:30:09 -0400
Message-Id: <20171103203013.9521-3-nilal@redhat.com>
In-Reply-To: <20171103203013.9521-1-nilal@redhat.com>
References: <20171103203013.9521-1-nilal@redhat.com>
Sender: kvm-owner@vger.kernel.org
Precedence: bulk

Commit Message

Nitesh Lal Nov. 3, 2017, 8:30 p.m. UTC

From: Nitesh Narayan Lal <niteshnarayanlalleo@gmail.com>

This patch adds the guest-side implementation that maintains the list of
pages which have been freed by the guest and not reused. To avoid any
reallocation, it takes a seqlock once the list is completely filled.
It does not yet carry the hypercall-related changes.

Signed-off-by: Nitesh Narayan Lal <nilal@redhat.com>
---
 virt/kvm/page_hinting.c | 245 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 243 insertions(+), 2 deletions(-)

Comments

Michael S. Tsirkin Nov. 13, 2017, 6:03 p.m. UTC | #1

On Fri, Nov 03, 2017 at 04:30:09PM -0400, nilal@redhat.com wrote:
> From: Nitesh Narayan Lal <niteshnarayanlalleo@gmail.com>
> 
> This patch adds the guest implementation in order to maintain the list of
> pages which are freed by the guest and are not reused. To avoid any
> reallocation it includes seqlock once the list is completely filled.
> Though it doesn't carries the hypercall related changes.
> 
> Signed-off-by: Nitesh Narayan Lal <nilal@redhat.com>
> ---
>  virt/kvm/page_hinting.c | 245 +++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 243 insertions(+), 2 deletions(-)
> 
> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> index 39d2b1d..658856d 100644
> --- a/virt/kvm/page_hinting.c
> +++ b/virt/kvm/page_hinting.c
> @@ -3,8 +3,7 @@
>  #include <linux/page_ref.h>
>  #include <linux/kvm_host.h>
>  #include <linux/sort.h>
> -
> -#include <trace/events/kvm.h>
> +#include <linux/kernel.h>
>  
>  #define MAX_FGPT_ENTRIES	1000
>  #define HYPERLIST_THRESHOLD	500
> @@ -33,14 +32,256 @@ struct hypervisor_pages {
>  	unsigned int pages;
>  };
>  
> +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(guest_page_lock);
>  DEFINE_PER_CPU(struct kvm_free_pages [MAX_FGPT_ENTRIES], kvm_pt);
>  DEFINE_PER_CPU(int, kvm_pt_idx);
>  struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES];
>  
> +static void empty_hyperlist(void)
> +{
> +	int i = 0;
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		hypervisor_pagelist[i].pfn = 0;
> +		hypervisor_pagelist[i].pages = 0;
> +		i++;
> +	}
> +}
> +
> +static void make_hypercall(void)
> +{
> +	/*
> +	 * Dummy function: Tobe filled later.
> +	 */
> +	empty_hyperlist();
> +}
> +
> +static int sort_pfn(const void *a1, const void *b1)
> +{
> +	const struct hypervisor_pages *a = a1;
> +	const struct hypervisor_pages *b = b1;
> +
> +	if (a->pfn > b->pfn)
> +		return 1;
> +
> +	if (a->pfn < b->pfn)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int pack_hyperlist(void)
> +{
> +	int i = 0, j = 0;
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		if (hypervisor_pagelist[i].pfn != 0) {
> +			if (i != j) {
> +				hypervisor_pagelist[j].pfn =
> +						hypervisor_pagelist[i].pfn;
> +				hypervisor_pagelist[j].pages =
> +						hypervisor_pagelist[i].pages;
> +			}
> +			j++;
> +		}
> +		i++;
> +	}
> +	i = j;
> +	while (j < MAX_FGPT_ENTRIES) {
> +		hypervisor_pagelist[j].pfn = 0;
> +		hypervisor_pagelist[j].pages = 0;
> +		j++;
> +	}
> +	return i;
> +}
> +
> +int compress_hyperlist(void)
> +{
> +	int i = 0, j = 1, merge_counter = 0, ret = 0;
> +
> +	sort(hypervisor_pagelist, MAX_FGPT_ENTRIES,
> +	     sizeof(struct hypervisor_pages), sort_pfn, NULL);
> +	while (i < MAX_FGPT_ENTRIES && j < MAX_FGPT_ENTRIES) {
> +		unsigned long pfni = hypervisor_pagelist[i].pfn;
> +		unsigned int pagesi = hypervisor_pagelist[i].pages;
> +		unsigned long pfnj = hypervisor_pagelist[j].pfn;
> +		unsigned int pagesj = hypervisor_pagelist[j].pages;
> +
> +		if (pfnj <= pfni) {
> +			if (((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) &&
> +			    ((pfnj + pagesj - 1) >= (pfni - 1))) {
> +				hypervisor_pagelist[i].pfn = pfnj;
> +				hypervisor_pagelist[i].pages += pfni - pfnj;
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			} else if ((pfnj + pagesj - 1) > (pfni + pagesi - 1)) {
> +				hypervisor_pagelist[i].pfn = pfnj;
> +				hypervisor_pagelist[i].pages = pagesj;
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			}
> +		} else if (pfnj > pfni) {
> +			if ((pfnj + pagesj - 1) > (pfni + pagesi - 1) &&
> +			    (pfnj <= pfni + pagesi)) {
> +				hypervisor_pagelist[i].pages +=
> +						(pfnj + pagesj - 1) -
> +						(pfni + pagesi - 1);
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			} else if ((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) {
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			}
> +		}
> +		i = j;
> +		j++;
> +	}
> +	if (merge_counter != 0)
> +		ret = pack_hyperlist() - 1;
> +	else
> +		ret = MAX_FGPT_ENTRIES - 1;
> +	return ret;
> +}
> +
> +void copy_hyperlist(int hyper_idx)
> +{
> +	int *idx = &get_cpu_var(kvm_pt_idx);
> +	struct kvm_free_pages *free_page_obj;
> +	int i = 0;
> +
> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
> +	while (i < hyper_idx) {
> +		free_page_obj[*idx].pfn = hypervisor_pagelist[i].pfn;
> +		free_page_obj[*idx].pages = hypervisor_pagelist[i].pages;
> +		*idx += 1;
> +		i++;
> +	}
> +	empty_hyperlist();
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +}
> +
> +/*
> + * arch_free_page_slowpath() - This function adds the guest free page entries
> + * to hypervisor_pages list and also ensures defragmentation prior to addition
> + * if it is present with any entry of the kvm_free_pages list.
> + */
> +void arch_free_page_slowpath(void)
> +{
> +	int idx = 0;
> +	int hyper_idx = -1;
> +	int *kvm_idx = &get_cpu_var(kvm_pt_idx);
> +	struct kvm_free_pages *free_page_obj = &get_cpu_var(kvm_pt)[0];
> +
> +	write_seqlock(&guest_page_lock);
> +	while (idx < MAX_FGPT_ENTRIES) {
> +		unsigned long pfn = free_page_obj[idx].pfn;
> +		unsigned long pfn_end = free_page_obj[idx].pfn +
> +					free_page_obj[idx].pages - 1;
> +		bool prev_free = false;
> +
> +		while (pfn <= pfn_end) {
> +			struct page *p = pfn_to_page(pfn);
> +
> +			if (PageCompound(p)) {
> +				struct page *head_page = compound_head(p);
> +				unsigned long head_pfn = page_to_pfn(head_page);
> +				unsigned int alloc_pages =
> +					1 << compound_order(head_page);
> +
> +				pfn = head_pfn + alloc_pages;
> +				prev_free = false;
> +				continue;
> +			}
> +			if (page_ref_count(p)) {
> +				pfn++;
> +				prev_free = false;
> +				continue;
> +			}
> +			/*
> +			 * The page is free so add it to the list and free the
> +			 * hypervisor_pagelist if required.
> +			 */
> +			if (!prev_free) {
> +				hyper_idx++;
> +				hypervisor_pagelist[hyper_idx].pfn = pfn;
> +				hypervisor_pagelist[hyper_idx].pages = 1;
> +				if (hyper_idx == MAX_FGPT_ENTRIES - 1) {
> +					hyper_idx =  compress_hyperlist();
> +					if (hyper_idx >=
> +					    HYPERLIST_THRESHOLD) {
> +						make_hypercall();
> +						hyper_idx = 0;
> +					}
> +				}
> +				/*
> +				 * If the next contiguous page is free, it can
> +				 * be added to this same entry.
> +				 */
> +				prev_free = true;
> +			} else {
> +				/*
> +				 * Multiple adjacent free pages
> +				 */
> +				hypervisor_pagelist[hyper_idx].pages++;
> +			}
> +			pfn++;
> +		}
> +		free_page_obj[idx].pfn = 0;
> +		free_page_obj[idx].pages = 0;
> +		idx++;
> +	}
> +	*kvm_idx = 0;
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +	write_sequnlock(&guest_page_lock);
> +}
> +
>  void arch_alloc_page(struct page *page, int order)
>  {
> +	unsigned int seq;
> +
> +	/*
> +	 * arch_free_page will acquire the lock once the list carrying guest
> +	 * free pages is full and a hypercall will be made. Until complete free
> +	 * page list is traversed no further allocaiton will be allowed.
> +	 */
> +	do {
> +		seq = read_seqbegin(&guest_page_lock);
> +	} while (read_seqretry(&guest_page_lock, seq));
>  }
>  
>  void arch_free_page(struct page *page, int order)
>  {
> +	int *free_page_idx = &get_cpu_var(kvm_pt_idx);
> +	struct kvm_free_pages *free_page_obj;
> +	unsigned long flags;
> +
> +	/*
> +	 * use of global variables may trigger a race condition between irq and
> +	 * process context causing unwanted overwrites. This will be replaced
> +	 * with a better solution to prevent such race conditions.

When do you plan to replace this? If you don't want this to
be merged yet, please include RFC in the subject.

> +	 */
> +	local_irq_save(flags);
> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
> +	free_page_obj[*free_page_idx].pfn = page_to_pfn(page);
> +	free_page_obj[*free_page_idx].pages = 1 << order;
> +	*free_page_idx += 1;
> +	if (*free_page_idx == MAX_FGPT_ENTRIES)
> +		arch_free_page_slowpath();
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +	local_irq_restore(flags);
>  }
> -- 
> 2.9.4

diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
index 39d2b1d..658856d 100644
--- a/virt/kvm/page_hinting.c
+++ b/virt/kvm/page_hinting.c
@@ -3,8 +3,7 @@ 
 #include <linux/page_ref.h>
 #include <linux/kvm_host.h>
 #include <linux/sort.h>
-
-#include <trace/events/kvm.h>
+#include <linux/kernel.h>
 
 #define MAX_FGPT_ENTRIES	1000
 #define HYPERLIST_THRESHOLD	500
@@ -33,14 +32,256 @@  struct hypervisor_pages {
 	unsigned int pages;
 };
 
+static __cacheline_aligned_in_smp DEFINE_SEQLOCK(guest_page_lock); /* write-held by arch_free_page_slowpath(); polled by arch_alloc_page() */
 DEFINE_PER_CPU(struct kvm_free_pages [MAX_FGPT_ENTRIES], kvm_pt);
 DEFINE_PER_CPU(int, kvm_pt_idx);
 struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES];
 
+static void empty_hyperlist(void)	/* reset every hypervisor_pagelist slot to zero */
+{
+	int i = 0;
+
+	while (i < MAX_FGPT_ENTRIES) {
+		hypervisor_pagelist[i].pfn = 0;	/* pack_hyperlist() treats pfn 0 as unused */
+		hypervisor_pagelist[i].pages = 0;
+		i++;
+	}
+}
+
+static void make_hypercall(void)
+{
+	/*
+	 * Placeholder: no hypercall is issued yet, the list is only cleared.
+	 */
+	empty_hyperlist();
+}
+
+static int sort_pfn(const void *a1, const void *b1)	/* sort() comparator: ascending pfn */
+{
+	const struct hypervisor_pages *a = a1;
+	const struct hypervisor_pages *b = b1;
+
+	if (a->pfn > b->pfn)
+		return 1;
+
+	if (a->pfn < b->pfn)
+		return -1;
+
+	return 0;	/* equal pfn */
+}
+
+static int pack_hyperlist(void)	/* move used (pfn != 0) entries to the front; returns their count */
+{
+	int i = 0, j = 0;	/* i: read position, j: next write position */
+
+	while (i < MAX_FGPT_ENTRIES) {
+		if (hypervisor_pagelist[i].pfn != 0) {
+			if (i != j) {	/* only copy when the entry actually moves */
+				hypervisor_pagelist[j].pfn =
+						hypervisor_pagelist[i].pfn;
+				hypervisor_pagelist[j].pages =
+						hypervisor_pagelist[i].pages;
+			}
+			j++;
+		}
+		i++;
+	}
+	i = j;	/* remember how many entries are in use */
+	while (j < MAX_FGPT_ENTRIES) {	/* zero the tail left behind by the move */
+		hypervisor_pagelist[j].pfn = 0;
+		hypervisor_pagelist[j].pages = 0;
+		j++;
+	}
+	return i;
+}
+
+int compress_hyperlist(void)	/* sort by pfn, merge overlapping/adjacent ranges; returns last used index */
+{
+	int i = 0, j = 1, merge_counter = 0, ret = 0;	/* i: entry being grown, j: merge candidate */
+
+	sort(hypervisor_pagelist, MAX_FGPT_ENTRIES,
+	     sizeof(struct hypervisor_pages), sort_pfn, NULL);
+	while (i < MAX_FGPT_ENTRIES && j < MAX_FGPT_ENTRIES) {
+		unsigned long pfni = hypervisor_pagelist[i].pfn;
+		unsigned int pagesi = hypervisor_pagelist[i].pages;
+		unsigned long pfnj = hypervisor_pagelist[j].pfn;
+		unsigned int pagesj = hypervisor_pagelist[j].pages;
+
+		if (pfnj <= pfni) {	/* NOTE(review): after the ascending sort this looks reachable only for pfnj == pfni - confirm */
+			if (((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) &&
+			    ((pfnj + pagesj - 1) >= (pfni - 1))) {
+				hypervisor_pagelist[i].pfn = pfnj;	/* j ends within/abuts i: extend i down to pfnj */
+				hypervisor_pagelist[i].pages += pfni - pfnj;
+				hypervisor_pagelist[j].pfn = 0;	/* cleared slot is dropped by pack_hyperlist() */
+				hypervisor_pagelist[j].pages = 0;
+				j++;
+				merge_counter++;
+				continue;
+			} else if ((pfnj + pagesj - 1) > (pfni + pagesi - 1)) {
+				hypervisor_pagelist[i].pfn = pfnj;	/* j ends beyond i: i takes over j's range */
+				hypervisor_pagelist[i].pages = pagesj;
+				hypervisor_pagelist[j].pfn = 0;
+				hypervisor_pagelist[j].pages = 0;
+				j++;
+				merge_counter++;
+				continue;
+			}
+		} else if (pfnj > pfni) {
+			if ((pfnj + pagesj - 1) > (pfni + pagesi - 1) &&
+			    (pfnj <= pfni + pagesi)) {
+				hypervisor_pagelist[i].pages +=	/* j starts within/abuts i and ends later: extend i to j's end */
+						(pfnj + pagesj - 1) -
+						(pfni + pagesi - 1);
+				hypervisor_pagelist[j].pfn = 0;
+				hypervisor_pagelist[j].pages = 0;
+				j++;
+				merge_counter++;
+				continue;
+			} else if ((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) {
+				hypervisor_pagelist[j].pfn = 0;	/* j lies entirely inside i: just drop j */
+				hypervisor_pagelist[j].pages = 0;
+				j++;
+				merge_counter++;
+				continue;
+			}
+		}
+		i = j;	/* nothing merged: j becomes the entry to grow */
+		j++;
+	}
+	if (merge_counter != 0)
+		ret = pack_hyperlist() - 1;	/* index of the last entry still in use */
+	else
+		ret = MAX_FGPT_ENTRIES - 1;	/* nothing merged: last index of the array */
+	return ret;
+}
+
+void copy_hyperlist(int hyper_idx)	/* append the first hyper_idx list entries to this CPU's kvm_pt */
+{
+	int *idx = &get_cpu_var(kvm_pt_idx);	/* next free slot in the per-CPU array */
+	struct kvm_free_pages *free_page_obj;
+	int i = 0;
+
+	free_page_obj = &get_cpu_var(kvm_pt)[0];
+	while (i < hyper_idx) {	/* NOTE(review): *idx is not checked against MAX_FGPT_ENTRIES - confirm callers bound it */
+		free_page_obj[*idx].pfn = hypervisor_pagelist[i].pfn;
+		free_page_obj[*idx].pages = hypervisor_pagelist[i].pages;
+		*idx += 1;
+		i++;
+	}
+	empty_hyperlist();	/* entries have been copied out; reset the global list */
+	put_cpu_var(kvm_pt);
+	put_cpu_var(kvm_pt_idx);
+}
+
+/*
+ * arch_free_page_slowpath() - Walk this CPU's kvm_free_pages entries and record
+ * every run of still-free pages (not compound, zero refcount) in
+ * hypervisor_pagelist, compressing that list whenever its last slot is used.
+ */
+void arch_free_page_slowpath(void)
+{
+	int idx = 0;
+	int hyper_idx = -1;	/* index of the last hypervisor_pagelist slot written */
+	int *kvm_idx = &get_cpu_var(kvm_pt_idx);
+	struct kvm_free_pages *free_page_obj = &get_cpu_var(kvm_pt)[0];
+
+	write_seqlock(&guest_page_lock);	/* arch_alloc_page() polls this lock */
+	while (idx < MAX_FGPT_ENTRIES) {
+		unsigned long pfn = free_page_obj[idx].pfn;
+		unsigned long pfn_end = free_page_obj[idx].pfn +
+					free_page_obj[idx].pages - 1;
+		bool prev_free = false;	/* true while extending a run of free pages */
+
+		while (pfn <= pfn_end) {
+			struct page *p = pfn_to_page(pfn);
+
+			if (PageCompound(p)) {	/* skip past the whole compound page */
+				struct page *head_page = compound_head(p);
+				unsigned long head_pfn = page_to_pfn(head_page);
+				unsigned int alloc_pages =
+					1 << compound_order(head_page);
+
+				pfn = head_pfn + alloc_pages;
+				prev_free = false;
+				continue;
+			}
+			if (page_ref_count(p)) {	/* still referenced: not free, ends the run */
+				pfn++;
+				prev_free = false;
+				continue;
+			}
+			/*
+			 * The page is free: start a new entry for it, or extend
+			 * the current entry if the previous page was free too.
+			 */
+			if (!prev_free) {
+				hyper_idx++;
+				hypervisor_pagelist[hyper_idx].pfn = pfn;
+				hypervisor_pagelist[hyper_idx].pages = 1;
+				if (hyper_idx == MAX_FGPT_ENTRIES - 1) {	/* last slot used: list is full */
+					hyper_idx =  compress_hyperlist();	/* NOTE(review): list is re-sorted; may no longer index the run just added - confirm */
+					if (hyper_idx >=
+					    HYPERLIST_THRESHOLD) {
+						make_hypercall();	/* still too many entries: flush (currently only clears the list) */
+						hyper_idx = 0;	/* NOTE(review): slot 0 is empty here and the next run goes to slot 1 - confirm */
+					}
+				}
+				/*
+				 * If the next contiguous page is free, it can
+				 * be added to this same entry.
+				 */
+				prev_free = true;
+			} else {
+				/*
+				 * Adjacent free page: grow the current entry
+				 */
+				hypervisor_pagelist[hyper_idx].pages++;
+			}
+			pfn++;
+		}
+		free_page_obj[idx].pfn = 0;	/* per-CPU entry has been consumed */
+		free_page_obj[idx].pages = 0;
+		idx++;
+	}
+	*kvm_idx = 0;	/* per-CPU list is empty again */
+	put_cpu_var(kvm_pt);
+	put_cpu_var(kvm_pt_idx);
+	write_sequnlock(&guest_page_lock);
+}
+
 void arch_alloc_page(struct page *page, int order)
 {
+	unsigned int seq;
+
+	/*
+	 * arch_free_page will acquire the lock once the list carrying guest
+	 * free pages is full and a hypercall will be made. Until the complete
+	 * free page list is traversed, no further allocation will be allowed.
+	 */
+	do {
+		seq = read_seqbegin(&guest_page_lock);
+	} while (read_seqretry(&guest_page_lock, seq));	/* retry if the sequence changed meanwhile */
 }
 
 void arch_free_page(struct page *page, int order)
 {
+	int *free_page_idx = &get_cpu_var(kvm_pt_idx);	/* next free slot in this CPU's kvm_pt */
+	struct kvm_free_pages *free_page_obj;
+	unsigned long flags;
+
+	/*
+	 * Use of global variables may trigger a race condition between irq and
+	 * process context causing unwanted overwrites, so the update below runs
+	 * with interrupts disabled. This will be replaced with a better solution.
+	 */
+	local_irq_save(flags);
+	free_page_obj = &get_cpu_var(kvm_pt)[0];
+	free_page_obj[*free_page_idx].pfn = page_to_pfn(page);
+	free_page_obj[*free_page_idx].pages = 1 << order;	/* the freed range spans 2^order pages */
+	*free_page_idx += 1;
+	if (*free_page_idx == MAX_FGPT_ENTRIES)	/* per-CPU list is full: scan it */
+		arch_free_page_slowpath();
+	put_cpu_var(kvm_pt);
+	put_cpu_var(kvm_pt_idx);
+	local_irq_restore(flags);
 }