diff mbox

[v3,06/15] KVM: MMU: allow concurrently clearing spte on remove-only pte-list

Message ID 1366093973-2617-7-git-send-email-xiaoguangrong@linux.vnet.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Xiao Guangrong April 16, 2013, 6:32 a.m. UTC
This patch introduces PTE_LIST_SPTE_SKIP, a placeholder that is set on
the pte-list after removing a spte, so that the other sptes on this
pte-list are not moved and the pte-list-descs on the pte-list are not
freed.

If a vcpu can not add a spte to the pte-list (e.g. the rmap of an invalid
memslot) and sptes can not be freed during a pte-list walk, we can
concurrently clear sptes on the pte-list; the worst case is that we zap
a spte twice, which is safe.

This patch only ensures that concurrently zapping a pte-list is safe;
keeping the spte available during concurrent clearing is handled in
later patches.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |   62 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 57 insertions(+), 5 deletions(-)
diff mbox

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 99ad2a4..850eab5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -900,6 +900,18 @@  static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 }
 
 /*
+ * PTE_LIST_SPTE_SKIP is a placeholder that is stored on a pte-list in
+ * place of a removed spte, so that the other sptes on that pte-list are
+ * not moved and the pte-list-descs on the pte-list are not freed.
+ *
+ * If a vcpu can not add a spte to the pte-list (e.g. the rmap of an
+ * invalid memslot) and sptes can not be freed during a pte-list walk, we
+ * can concurrently clear sptes on the pte-list; the worst case is that
+ * we zap a spte twice, which is safe.
+ */
+#define PTE_LIST_SPTE_SKIP	(u64 *)((~0x0ul) & (~1))
+
+/*
  * Pte mapping structures:
  *
  * If pte_list bit zero is zero, then pte_list point to the spte.
@@ -1003,6 +1015,40 @@  static void pte_list_remove(u64 *spte, unsigned long *pte_list)
 	}
 }
 
+static void pte_list_clear_concurrently(u64 *spte, unsigned long *pte_list)
+{
+	struct pte_list_desc *desc;
+	unsigned long pte_value = *pte_list;
+	int i;
+
+	/* An empty pte-list stores no spte, so there is nothing to clear. */
+	WARN_ON(!pte_value);
+
+	if (!(pte_value & 1)) {
+		if ((u64 *)pte_value == spte) {
+			*pte_list = (unsigned long)PTE_LIST_SPTE_SKIP;
+			return;
+		}
+
+		/* Not our spte, so someone has already set the placeholder. */
+		WARN_ON(pte_value != (unsigned long)PTE_LIST_SPTE_SKIP);
+		return;
+	}
+
+	desc = (struct pte_list_desc *)(pte_value & ~1ul);
+	while (desc) {
+		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+			if (desc->sptes[i] == spte) {
+				desc->sptes[i] = PTE_LIST_SPTE_SKIP;
+				return;
+			}
+
+		desc = desc->more;
+	}
+
+	return;
+}
+
 typedef void (*pte_list_walk_fn) (u64 *spte);
 static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 {
@@ -1214,6 +1260,12 @@  spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
 	return false;
 }
 
+/* Warn on PTE_LIST_SPTE_SKIP (invalid rmap only) or a non-rmap spte. */
+static void check_valid_sptep(u64 *sptep)
+{
+	WARN_ON(sptep == PTE_LIST_SPTE_SKIP || !is_rmap_spte(*sptep));
+}
+
 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
 				 bool pt_protect)
 {
@@ -1222,7 +1274,7 @@  static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
 	bool flush = false;
 
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		check_valid_sptep(sptep);
 		if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
 			sptep = rmap_get_first(*rmapp, &iter);
 			continue;
@@ -1293,7 +1345,7 @@  static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
 	int need_tlb_flush = 0;
 
 	while ((sptep = rmap_get_first(*rmapp, &iter))) {
-		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		check_valid_sptep(sptep);
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
 
 		drop_spte(kvm, sptep);
@@ -1322,7 +1374,7 @@  static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	new_pfn = pte_pfn(*ptep);
 
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-		BUG_ON(!is_shadow_present_pte(*sptep));
+		check_valid_sptep(sptep);
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
 
 		need_flush = 1;
@@ -1455,7 +1507,7 @@  static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
 
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
 	     sptep = rmap_get_next(&iter)) {
-		BUG_ON(!is_shadow_present_pte(*sptep));
+		check_valid_sptep(sptep);
 
 		if (*sptep & shadow_accessed_mask) {
 			young = 1;
@@ -1493,7 +1545,7 @@  static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
 
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
 	     sptep = rmap_get_next(&iter)) {
-		BUG_ON(!is_shadow_present_pte(*sptep));
+		check_valid_sptep(sptep);
 
 		if (*sptep & shadow_accessed_mask) {
 			young = 1;