Message ID: 4DEE21E2.8000301@cn.fujitsu.com (mailing list archive)
State: New, archived
On Tue, Jun 07, 2011 at 09:04:34PM +0800, Xiao Guangrong wrote:
> Using rcu to protect shadow pages table to be freed, so we can safely walk it,
> it should run fast and is needed by mmio page fault

A couple of questions below.

							Thanx, Paul

> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    4 ++
>  arch/x86/kvm/mmu.c              |   79 ++++++++++++++++++++++++++++++---------
>  arch/x86/kvm/mmu.h              |    4 +-
>  arch/x86/kvm/vmx.c              |    2 +-
>  4 files changed, 69 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 326af42..260582b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -232,6 +232,8 @@ struct kvm_mmu_page {
>          unsigned int unsync_children;
>          unsigned long parent_ptes;      /* Reverse mapping for parent_pte */
>          DECLARE_BITMAP(unsync_child_bitmap, 512);
> +
> +        struct rcu_head rcu;
>  };
>
>  struct kvm_pv_mmu_op_buffer {
> @@ -478,6 +480,8 @@ struct kvm_arch {
>          u64 hv_guest_os_id;
>          u64 hv_hypercall;
>
> +        atomic_t reader_counter;
> +
>  #ifdef CONFIG_KVM_MMU_AUDIT
>          int audit_point;
>  #endif
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9f3a746..52d4682 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1675,6 +1675,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>          return ret;
>  }
>
> +static void free_mmu_pages_unlock_parts(struct list_head *invalid_list)
> +{
> +        struct kvm_mmu_page *sp;
> +
> +        list_for_each_entry(sp, invalid_list, link)
> +                kvm_mmu_free_lock_parts(sp);
> +}
> +
> +static void free_invalid_pages_rcu(struct rcu_head *head)
> +{
> +        struct kvm_mmu_page *next, *sp;
> +
> +        sp = container_of(head, struct kvm_mmu_page, rcu);
> +        while (sp) {
> +                if (!list_empty(&sp->link))
> +                        next = list_first_entry(&sp->link,
> +                                struct kvm_mmu_page, link);
> +                else
> +                        next = NULL;
> +                kvm_mmu_free_unlock_parts(sp);
> +                sp = next;
> +        }
> +}
> +
>  static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>                                      struct list_head *invalid_list)
>  {
> @@ -1685,6 +1709,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>
>          kvm_flush_remote_tlbs(kvm);
>
> +        if (atomic_read(&kvm->arch.reader_counter)) {

This is the slowpath to be executed if there are currently readers
in kvm->arch.reader_counter(), correct?

> +                free_mmu_pages_unlock_parts(invalid_list);
> +                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
> +                list_del_init(invalid_list);
> +                call_rcu(&sp->rcu, free_invalid_pages_rcu);
> +                return;
> +        }

OK, so it also looks like kvm->arch.reader_counter could transition from
zero to non-zero at this point due to a concurrent call from a reader in
the kvm_mmu_walk_shadow_page_lockless() function.  Does the following code
avoid messing up the reader?  If so, why bother with the slowpath above?

> +
>          do {
>                  sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
>                  WARN_ON(!sp->role.invalid || sp->root_count);
> @@ -2601,6 +2633,35 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
>          return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
>  }
>
> +int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
> +                                      u64 sptes[4])
> +{
> +        struct kvm_shadow_walk_iterator iterator;
> +        int nr_sptes = 0;
> +
> +        rcu_read_lock();
> +
> +        atomic_inc(&vcpu->kvm->arch.reader_counter);
> +        /* Increase the counter before walking shadow page table */
> +        smp_mb__after_atomic_inc();
> +
> +        for_each_shadow_entry(vcpu, addr, iterator) {
> +                sptes[iterator.level-1] = *iterator.sptep;
> +                nr_sptes++;
> +                if (!is_shadow_present_pte(*iterator.sptep))
> +                        break;
> +        }
> +
> +        /* Decrease the counter after walking shadow page table finished */
> +        smp_mb__before_atomic_dec();
> +        atomic_dec(&vcpu->kvm->arch.reader_counter);
> +
> +        rcu_read_unlock();
> +
> +        return nr_sptes;
> +}
> +EXPORT_SYMBOL_GPL(kvm_mmu_walk_shadow_page_lockless);
> +
>  static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
>                                  u32 error_code, bool prefault)
>  {
> @@ -3684,24 +3745,6 @@ out:
>          return r;
>  }
>
> -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
> -{
> -        struct kvm_shadow_walk_iterator iterator;
> -        int nr_sptes = 0;
> -
> -        spin_lock(&vcpu->kvm->mmu_lock);
> -        for_each_shadow_entry(vcpu, addr, iterator) {
> -                sptes[iterator.level-1] = *iterator.sptep;
> -                nr_sptes++;
> -                if (!is_shadow_present_pte(*iterator.sptep))
> -                        break;
> -        }
> -        spin_unlock(&vcpu->kvm->mmu_lock);
> -
> -        return nr_sptes;
> -}
> -EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
> -
>  void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
>  {
>          ASSERT(vcpu);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 05310b1..e7725c4 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -48,7 +48,9 @@
>  #define PFERR_RSVD_MASK (1U << 3)
>  #define PFERR_FETCH_MASK (1U << 4)
>
> -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
> +int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
> +                                      u64 sptes[4]);
> +
>  int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
>
>  static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index b54c186..20dbf7f 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -4681,7 +4681,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
>          printk(KERN_ERR "EPT: Misconfiguration.\n");
>          printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
>
> -        nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
> +        nr_sptes = kvm_mmu_walk_shadow_page_lockless(vcpu, gpa, sptes);
>
>          for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
>                  ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
> --
> 1.7.4.4
On 06/10/2011 04:09 AM, Paul E. McKenney wrote:
> On Tue, Jun 07, 2011 at 09:04:34PM +0800, Xiao Guangrong wrote:
>> Using rcu to protect shadow pages table to be freed, so we can safely walk it,
>> it should run fast and is needed by mmio page fault
>
> A couple of questions below.

Thanks for your review!

>> +        if (atomic_read(&kvm->arch.reader_counter)) {
>
> This is the slowpath to be executed if there are currently readers
> in kvm->arch.reader_counter(), correct?

Yes, we free the pages in an RCU callback if kvm->arch.reader_counter
shows that there are active readers.

>> +                free_mmu_pages_unlock_parts(invalid_list);
>> +                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
>> +                list_del_init(invalid_list);
>> +                call_rcu(&sp->rcu, free_invalid_pages_rcu);
>> +                return;
>> +        }
>
> OK, so it also looks like kvm->arch.reader_counter could transition from
> zero to non-zero at this point due to a concurrent call from a reader in
> the kvm_mmu_walk_shadow_page_lockless() function.  Does the following code
> avoid messing up the reader?  If so, why bother with the slowpath above?

Actually, the free operation is split into two steps. The first step,
kvm_mmu_prepare_zap_page(), isolates the page from the shadow page table,
so after it has been called the page can no longer be reached through the
shadow page table. The second step, kvm_mmu_commit_zap_page(), frees the
page.

kvm_mmu_walk_shadow_page_lockless() only reaches pages through the shadow
page table, so even if kvm->arch.reader_counter transitions from zero to
non-zero in the following code, we can be sure the page is not in use by
kvm_mmu_walk_shadow_page_lockless(), and we can free the page directly.
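[Editor's note: to make the argument above easier to follow, here is a minimal
sketch of how the two phases compose, using the function names from the patch;
mmu_lock handling, TLB flushing and the invalid_list bookkeeping are omitted,
so treat it as an illustration of the reasoning rather than the actual KVM code.]

/*
 * Illustration only (simplified): why a reader that shows up after the
 * reader_counter check cannot touch the pages being freed.
 */
static void zap_shadow_page_sketch(struct kvm *kvm, struct kvm_mmu_page *sp,
                                   struct list_head *invalid_list)
{
        /*
         * Phase 1: isolate.  After this call, @sp is no longer reachable
         * from the shadow page table, so any lockless walker that starts
         * later (from the root) can never find it.
         */
        kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);

        /*
         * Phase 2: free.  Only a walker that incremented reader_counter
         * before phase 1 can still hold a pointer into @sp; if such a
         * walker is seen by the check inside the commit path, the free is
         * deferred via call_rcu().  A walker that increments the counter
         * after that check starts from the root, cannot reach @sp, and is
         * therefore unaffected by the immediate free.
         */
        kvm_mmu_commit_zap_page(kvm, invalid_list);
}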
On Tue, Jun 07, 2011 at 09:04:34PM +0800, Xiao Guangrong wrote:
> Using rcu to protect shadow pages table to be freed, so we can safely walk it,
> it should run fast and is needed by mmio page fault
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    4 ++
>  arch/x86/kvm/mmu.c              |   79 ++++++++++++++++++++++++++++++---------
>  arch/x86/kvm/mmu.h              |    4 +-
>  arch/x86/kvm/vmx.c              |    2 +-
>  4 files changed, 69 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 326af42..260582b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -232,6 +232,8 @@ struct kvm_mmu_page {
>          unsigned int unsync_children;
>          unsigned long parent_ptes;      /* Reverse mapping for parent_pte */
>          DECLARE_BITMAP(unsync_child_bitmap, 512);
> +
> +        struct rcu_head rcu;
>  };
>
>  struct kvm_pv_mmu_op_buffer {
> @@ -478,6 +480,8 @@ struct kvm_arch {
>          u64 hv_guest_os_id;
>          u64 hv_hypercall;
>
> +        atomic_t reader_counter;
> +
>  #ifdef CONFIG_KVM_MMU_AUDIT
>          int audit_point;
>  #endif
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9f3a746..52d4682 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1675,6 +1675,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>          return ret;
>  }
>
> +static void free_mmu_pages_unlock_parts(struct list_head *invalid_list)
> +{
> +        struct kvm_mmu_page *sp;
> +
> +        list_for_each_entry(sp, invalid_list, link)
> +                kvm_mmu_free_lock_parts(sp);
> +}
> +
> +static void free_invalid_pages_rcu(struct rcu_head *head)
> +{
> +        struct kvm_mmu_page *next, *sp;
> +
> +        sp = container_of(head, struct kvm_mmu_page, rcu);
> +        while (sp) {
> +                if (!list_empty(&sp->link))
> +                        next = list_first_entry(&sp->link,
> +                                struct kvm_mmu_page, link);
> +                else
> +                        next = NULL;
> +                kvm_mmu_free_unlock_parts(sp);
> +                sp = next;
> +        }
> +}
> +
>  static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>                                      struct list_head *invalid_list)
>  {
> @@ -1685,6 +1709,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>
>          kvm_flush_remote_tlbs(kvm);
>
> +        if (atomic_read(&kvm->arch.reader_counter)) {
> +                free_mmu_pages_unlock_parts(invalid_list);
> +                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
> +                list_del_init(invalid_list);
> +                call_rcu(&sp->rcu, free_invalid_pages_rcu);
> +                return;
> +        }

This is probably wrong, the caller wants the page to be zapped by the
time the function returns, not scheduled sometime in the future.

> +
>          do {
>                  sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
>                  WARN_ON(!sp->role.invalid || sp->root_count);
> @@ -2601,6 +2633,35 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
>          return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
>  }
>
> +int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
> +                                      u64 sptes[4])
> +{
> +        struct kvm_shadow_walk_iterator iterator;
> +        int nr_sptes = 0;
> +
> +        rcu_read_lock();
> +
> +        atomic_inc(&vcpu->kvm->arch.reader_counter);
> +        /* Increase the counter before walking shadow page table */
> +        smp_mb__after_atomic_inc();
> +
> +        for_each_shadow_entry(vcpu, addr, iterator) {
> +                sptes[iterator.level-1] = *iterator.sptep;
> +                nr_sptes++;
> +                if (!is_shadow_present_pte(*iterator.sptep))
> +                        break;
> +        }

Why is lockless access needed for the MMIO optimization?  Note the spte
contents copied to the array here are used for debugging purposes only;
their contents are potentially stale.
On 06/21/2011 12:37 AM, Marcelo Tosatti wrote:
>> +        if (atomic_read(&kvm->arch.reader_counter)) {
>> +                free_mmu_pages_unlock_parts(invalid_list);
>> +                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
>> +                list_del_init(invalid_list);
>> +                call_rcu(&sp->rcu, free_invalid_pages_rcu);
>> +                return;
>> +        }
>
> This is probably wrong, the caller wants the page to be zapped by the
> time the function returns, not scheduled sometime in the future.

The pages are freed soon afterwards, and KVM does not reuse them in the
meantime... it is not too bad, is it?

>> +
>>          do {
>>                  sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
>>                  WARN_ON(!sp->role.invalid || sp->root_count);
>> @@ -2601,6 +2633,35 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
>>          return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
>>  }
>>
>> +int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
>> +                                      u64 sptes[4])
>> +{
>> +        struct kvm_shadow_walk_iterator iterator;
>> +        int nr_sptes = 0;
>> +
>> +        rcu_read_lock();
>> +
>> +        atomic_inc(&vcpu->kvm->arch.reader_counter);
>> +        /* Increase the counter before walking shadow page table */
>> +        smp_mb__after_atomic_inc();
>> +
>> +        for_each_shadow_entry(vcpu, addr, iterator) {
>> +                sptes[iterator.level-1] = *iterator.sptep;
>> +                nr_sptes++;
>> +                if (!is_shadow_present_pte(*iterator.sptep))
>> +                        break;
>> +        }
>
> Why is lockless access needed for the MMIO optimization?  Note the spte
> contents copied to the array here are used for debugging purposes only;
> their contents are potentially stale.

Well, we can use it to check whether an MMIO page fault is a real MMIO
access or a KVM bug; I discussed it with Avi:

===============================================
> Yes, it is, I just want to detect BUGs in KVM; it helps us to know whether
> an "ept misconfig" is a real MMIO access or a BUG.  I noticed that some
> "ept misconfig" BUGs were reported before, so I think doing this is
> necessary, and I think it is not too bad, since walking the spte hierarchy
> is lockless, it is really fast.

Okay.  We can later see if it shows up on profiles.
===============================================

And it is really fast; I will attach the perf results when v2 is posted.

Yes, their contents are potentially stale; we only use them to check for
MMIO. After all, if we read a stale spte, we just go through the page fault
path again, which will fix it.
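[Editor's note: for illustration, here is a rough sketch (not taken from the
patch) of how a handler such as handle_ept_misconfig() could use the lockless
walk to separate a genuine MMIO access from a KVM bug. spte_looks_like_mmio()
is a hypothetical placeholder for whatever MMIO-spte test the final series
uses, not an existing KVM helper.]

/*
 * Sketch only: distinguish "real MMIO" from "KVM bug" on an EPT misconfig
 * exit, using the snapshot returned by the new lockless walker.
 */
static bool ept_misconfig_is_real_mmio(struct kvm_vcpu *vcpu, u64 gpa)
{
        u64 sptes[4];
        int nr_sptes, i;

        /* Snapshot the spte hierarchy for @gpa without taking mmu_lock. */
        nr_sptes = kvm_mmu_walk_shadow_page_lockless(vcpu, gpa, sptes);

        /* Same level indexing as the loop in handle_ept_misconfig(). */
        for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
                if (spte_looks_like_mmio(sptes[i - 1]))  /* hypothetical */
                        return true;    /* real MMIO: emulate the access */

        /*
         * Nothing MMIO-like was found.  The snapshot may be stale, so the
         * real code would go through the page fault path again before
         * concluding that this misconfiguration is a KVM bug.
         */
        return false;
}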
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 326af42..260582b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -232,6 +232,8 @@ struct kvm_mmu_page {
         unsigned int unsync_children;
         unsigned long parent_ptes;      /* Reverse mapping for parent_pte */
         DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+        struct rcu_head rcu;
 };
 
 struct kvm_pv_mmu_op_buffer {
@@ -478,6 +480,8 @@ struct kvm_arch {
         u64 hv_guest_os_id;
         u64 hv_hypercall;
 
+        atomic_t reader_counter;
+
 #ifdef CONFIG_KVM_MMU_AUDIT
         int audit_point;
 #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9f3a746..52d4682 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1675,6 +1675,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
         return ret;
 }
 
+static void free_mmu_pages_unlock_parts(struct list_head *invalid_list)
+{
+        struct kvm_mmu_page *sp;
+
+        list_for_each_entry(sp, invalid_list, link)
+                kvm_mmu_free_lock_parts(sp);
+}
+
+static void free_invalid_pages_rcu(struct rcu_head *head)
+{
+        struct kvm_mmu_page *next, *sp;
+
+        sp = container_of(head, struct kvm_mmu_page, rcu);
+        while (sp) {
+                if (!list_empty(&sp->link))
+                        next = list_first_entry(&sp->link,
+                                struct kvm_mmu_page, link);
+                else
+                        next = NULL;
+                kvm_mmu_free_unlock_parts(sp);
+                sp = next;
+        }
+}
+
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                     struct list_head *invalid_list)
 {
@@ -1685,6 +1709,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
         kvm_flush_remote_tlbs(kvm);
 
+        if (atomic_read(&kvm->arch.reader_counter)) {
+                free_mmu_pages_unlock_parts(invalid_list);
+                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+                list_del_init(invalid_list);
+                call_rcu(&sp->rcu, free_invalid_pages_rcu);
+                return;
+        }
+
         do {
                 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
                 WARN_ON(!sp->role.invalid || sp->root_count);
@@ -2601,6 +2633,35 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
         return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
 }
 
+int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
+                                      u64 sptes[4])
+{
+        struct kvm_shadow_walk_iterator iterator;
+        int nr_sptes = 0;
+
+        rcu_read_lock();
+
+        atomic_inc(&vcpu->kvm->arch.reader_counter);
+        /* Increase the counter before walking shadow page table */
+        smp_mb__after_atomic_inc();
+
+        for_each_shadow_entry(vcpu, addr, iterator) {
+                sptes[iterator.level-1] = *iterator.sptep;
+                nr_sptes++;
+                if (!is_shadow_present_pte(*iterator.sptep))
+                        break;
+        }
+
+        /* Decrease the counter after walking shadow page table finished */
+        smp_mb__before_atomic_dec();
+        atomic_dec(&vcpu->kvm->arch.reader_counter);
+
+        rcu_read_unlock();
+
+        return nr_sptes;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_walk_shadow_page_lockless);
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                 u32 error_code, bool prefault)
 {
@@ -3684,24 +3745,6 @@ out:
         return r;
 }
 
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
-{
-        struct kvm_shadow_walk_iterator iterator;
-        int nr_sptes = 0;
-
-        spin_lock(&vcpu->kvm->mmu_lock);
-        for_each_shadow_entry(vcpu, addr, iterator) {
-                sptes[iterator.level-1] = *iterator.sptep;
-                nr_sptes++;
-                if (!is_shadow_present_pte(*iterator.sptep))
-                        break;
-        }
-        spin_unlock(&vcpu->kvm->mmu_lock);
-
-        return nr_sptes;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
-
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 {
         ASSERT(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 05310b1..e7725c4 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,7 +48,9 @@
 #define PFERR_RSVD_MASK (1U << 3)
 #define PFERR_FETCH_MASK (1U << 4)
 
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
+                                      u64 sptes[4]);
+
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b54c186..20dbf7f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4681,7 +4681,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
         printk(KERN_ERR "EPT: Misconfiguration.\n");
         printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
 
-        nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
+        nr_sptes = kvm_mmu_walk_shadow_page_lockless(vcpu, gpa, sptes);
 
         for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
                 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
Using rcu to protect shadow pages table to be freed, so we can safely walk it,
it should run fast and is needed by mmio page fault

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
 arch/x86/include/asm/kvm_host.h |    4 ++
 arch/x86/kvm/mmu.c              |   79 ++++++++++++++++++++++++++++++---------
 arch/x86/kvm/mmu.h              |    4 +-
 arch/x86/kvm/vmx.c              |    2 +-
 4 files changed, 69 insertions(+), 20 deletions(-)