[v2,4/6] KVM: x86/mmu: Fix wrong start gfn of tlb flushing with range

Message ID 888399c78eab9d965657c5983f8096c707664c30.1661331396.git.houwenlong.hwl@antgroup.com (mailing list archive)
State New, archived
Series KVM: x86/mmu: Fix wrong usages of range-based tlb flushing

Commit Message

Hou Wenlong Aug. 24, 2022, 9:29 a.m. UTC
When a spte is dropped, the start gfn of tlb flushing should be
the gfn of the spte, not the base gfn of the SP that contains the
spte. Also introduce a helper function to do range-based flushing
when a spte is dropped, which helps prevent future buggy use of
kvm_flush_remote_tlbs_with_address() in such cases.
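
For illustration, here is a minimal user-space sketch of the gfn arithmetic
behind this fix. The macro and helper below mirror kernel names but are a
simplified model (assuming 512-entry page tables and the direct-MMU case,
where kvm_mmu_page_get_gfn() reduces to "base gfn + index scaled by level");
it is not kernel code from this patch.

/* Simplified model of the range start used before and after the fix. */
#include <stdio.h>
#include <stdint.h>

#define SPTE_LEVEL_BITS		9
#define KVM_PAGES_PER_HPAGE(l)	(1ULL << (((l) - 1) * SPTE_LEVEL_BITS))

/* Per-SPTE gfn for a direct SP: base gfn plus the index scaled by level. */
static uint64_t spte_gfn(uint64_t sp_base_gfn, int level, int idx)
{
	return sp_base_gfn + ((uint64_t)idx << ((level - 1) * SPTE_LEVEL_BITS));
}

int main(void)
{
	uint64_t sp_gfn = 0x40000;	/* base gfn of a level-2 (2MiB-entry) SP */
	int level = 2, idx = 5;		/* the dropped SPTE sits at index 5 */
	uint64_t pages = KVM_PAGES_PER_HPAGE(level);
	uint64_t gfn = spte_gfn(sp_gfn, level, idx);

	/* Old code flushed starting at the SP's base gfn... */
	printf("buggy flush:   [%#llx, %#llx)\n",
	       (unsigned long long)sp_gfn,
	       (unsigned long long)(sp_gfn + pages));
	/* ...but the dropped SPTE actually maps this range. */
	printf("correct flush: [%#llx, %#llx)\n",
	       (unsigned long long)gfn,
	       (unsigned long long)(gfn + pages));
	return 0;
}

For any index other than 0 the two ranges do not even overlap; the flush
size was already the per-SPTE size, only the start gfn was wrong.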

Fixes: c3134ce240eed ("KVM: Replace old tlb flush function with new one to flush a specified range.")
Suggested-by: David Matlack <dmatlack@google.com>
Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
---
 arch/x86/kvm/mmu/mmu.c         | 20 +++++++++++++++-----
 arch/x86/kvm/mmu/paging_tmpl.h |  3 +--
 2 files changed, 16 insertions(+), 7 deletions(-)

Comments

David Matlack Sept. 7, 2022, 6:25 p.m. UTC | #1
On Wed, Aug 24, 2022 at 05:29:21PM +0800, Hou Wenlong wrote:
> When a spte is dropped, the start gfn of tlb flushing should
> be the gfn of spte not the base gfn of SP which contains the
> spte. Also introduce a helper function to do range-based
> flushing when a spte is dropped, which would help prevent
> future buggy use of kvm_flush_remote_tlbs_with_address() in
> such case.
> 
> Fixes: c3134ce240eed ("KVM: Replace old tlb flush function with new one to flush a specified range.")
> Suggested-by: David Matlack <dmatlack@google.com>
> Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
> ---
>  arch/x86/kvm/mmu/mmu.c         | 20 +++++++++++++++-----
>  arch/x86/kvm/mmu/paging_tmpl.h |  3 +--
>  2 files changed, 16 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 3bcff56df109..e0b9432b9491 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -260,6 +260,18 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
>  	kvm_flush_remote_tlbs_with_range(kvm, &range);
>  }
>  
> +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
> +
> +/* Flush the range of guest memory mapped by the given SPTE. */
> +static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
> +{
> +	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
> +	gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
> +
> +	kvm_flush_remote_tlbs_with_address(kvm, gfn,
> +					   KVM_PAGES_PER_HPAGE(sp->role.level));

How is the range-based TLB flushing supposed to work with indirect MMUs?
When KVM is using shadow paging, the gfn here is not part of the actual
translation.

For example, when TDP is disabled, KVM's shadow page tables translate
GVA to HPA. When Nested Virtualization is in use and running L2, KVM's
shadow page tables translate nGPA to HPA.

Ah, I see x86_ops.tlb_remote_flush_with_range is only set when running
on Hyper-V and TDP is enabled (VMX checks enable_ept and SVM checks
npt_enabled). But it looks like the nested case might still be broken?

> +}
> +
>  /* Flush all memory mapped by the given direct SP. */
>  static void kvm_flush_remote_tlbs_direct_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
> @@ -1156,8 +1168,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
>  	drop_spte(kvm, sptep);
>  
>  	if (flush)
> -		kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
> -			KVM_PAGES_PER_HPAGE(sp->role.level));
> +		kvm_flush_remote_tlbs_sptep(kvm, sptep);
>  }
>  
>  /*
> @@ -1608,7 +1619,7 @@ static void __rmap_add(struct kvm *kvm,
>  	if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
>  		kvm_zap_all_rmap_sptes(kvm, rmap_head);
>  		kvm_flush_remote_tlbs_with_address(
> -				kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
> +				kvm, gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
>  	}
>  }
>  
> @@ -6402,8 +6413,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
>  			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
>  
>  			if (kvm_available_flush_tlb_with_range())
> -				kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
> -					KVM_PAGES_PER_HPAGE(sp->role.level));
> +				kvm_flush_remote_tlbs_sptep(kvm, sptep);
>  			else
>  				need_tlb_flush = 1;
>  
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 39e0205e7300..04149c704d5b 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -937,8 +937,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
>  
>  			mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
>  			if (is_shadow_present_pte(old_spte))
> -				kvm_flush_remote_tlbs_with_address(vcpu->kvm,
> -					sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
> +				kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
>  
>  			if (!rmap_can_add(vcpu))
>  				break;
> -- 
> 2.31.1
>
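
For context on when the hook David mentions is actually used, the dispatch
path in mmu.c at this point looks roughly like the following (paraphrased
from memory, so details such as the exact error code are approximate): if no
backend provides tlb_remote_flush_with_range, KVM falls back to a full
remote TLB flush, so the wrong start gfn is only visible on Hyper-V with TDP
enabled.

void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
				      struct kvm_tlb_range *range)
{
	int ret = -EOPNOTSUPP;

	if (range && kvm_x86_ops.tlb_remote_flush_with_range)
		ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);

	if (ret)
		kvm_flush_remote_tlbs(kvm);	/* fall back to a full flush */
}

void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
					u64 start_gfn, u64 pages)
{
	struct kvm_tlb_range range;

	range.start_gfn = start_gfn;
	range.pages = pages;

	kvm_flush_remote_tlbs_with_range(kvm, &range);
}
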
Hou Wenlong Sept. 13, 2022, 12:50 p.m. UTC | #2
On Thu, Sep 08, 2022 at 02:25:42AM +0800, David Matlack wrote:
> On Wed, Aug 24, 2022 at 05:29:21PM +0800, Hou Wenlong wrote:
> > When a spte is dropped, the start gfn of tlb flushing should
> > be the gfn of spte not the base gfn of SP which contains the
> > spte. Also introduce a helper function to do range-based
> > flushing when a spte is dropped, which would help prevent
> > future buggy use of kvm_flush_remote_tlbs_with_address() in
> > such case.
> > 
> > Fixes: c3134ce240eed ("KVM: Replace old tlb flush function with new one to flush a specified range.")
> > Suggested-by: David Matlack <dmatlack@google.com>
> > Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
> > ---
> >  arch/x86/kvm/mmu/mmu.c         | 20 +++++++++++++++-----
> >  arch/x86/kvm/mmu/paging_tmpl.h |  3 +--
> >  2 files changed, 16 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 3bcff56df109..e0b9432b9491 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -260,6 +260,18 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
> >  	kvm_flush_remote_tlbs_with_range(kvm, &range);
> >  }
> >  
> > +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
> > +
> > +/* Flush the range of guest memory mapped by the given SPTE. */
> > +static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
> > +{
> > +	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
> > +	gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
> > +
> > +	kvm_flush_remote_tlbs_with_address(kvm, gfn,
> > +					   KVM_PAGES_PER_HPAGE(sp->role.level));
> 
> How is the range-based TLB flushing supposed to work with indirect MMUs?
> When KVM is using shadow paging, the gfn here is not part of the actual
> translation.
> 
> For example, when TDP is disabled, KVM's shadow page tables translate
> GVA to HPA. When Nested Virtualization is in use and running L2, KVM's
> shadow page tables translate nGPA to HPA.
> 
> Ah, I see x86_ops.tlb_remote_flush_with_range is only set when running
> on Hyper-V and TDP is enabled (VMX checks enable_ept and SVM checks
> npt_enabled). But it looks like the nested case might still be broken?
>
Yeah, range-based tlb flushing is only used on Hyper-V, as Paolo said.
Actually, I don't know how Hyper-V implements the range-based tlb
flushing hypercall: KVM on Hyper-V is already nested, so the gfn here
is already an nGPA. It seems that the currently used TDP root is passed
in the hypercall, so maybe Hyper-V could do the nGPA to HPA translation
by looking up that TDP page table. Then it could work well for the
nested case too?

> > +}
> > +
> >  /* Flush all memory mapped by the given direct SP. */
> >  static void kvm_flush_remote_tlbs_direct_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> >  {
> > @@ -1156,8 +1168,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
> >  	drop_spte(kvm, sptep);
> >  
> >  	if (flush)
> > -		kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
> > -			KVM_PAGES_PER_HPAGE(sp->role.level));
> > +		kvm_flush_remote_tlbs_sptep(kvm, sptep);
> >  }
> >  
> >  /*
> > @@ -1608,7 +1619,7 @@ static void __rmap_add(struct kvm *kvm,
> >  	if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
> >  		kvm_zap_all_rmap_sptes(kvm, rmap_head);
> >  		kvm_flush_remote_tlbs_with_address(
> > -				kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
> > +				kvm, gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
> >  	}
> >  }
> >  
> > @@ -6402,8 +6413,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
> >  			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
> >  
> >  			if (kvm_available_flush_tlb_with_range())
> > -				kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
> > -					KVM_PAGES_PER_HPAGE(sp->role.level));
> > +				kvm_flush_remote_tlbs_sptep(kvm, sptep);
> >  			else
> >  				need_tlb_flush = 1;
> >  
> > diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> > index 39e0205e7300..04149c704d5b 100644
> > --- a/arch/x86/kvm/mmu/paging_tmpl.h
> > +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> > @@ -937,8 +937,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
> >  
> >  			mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
> >  			if (is_shadow_present_pte(old_spte))
> > -				kvm_flush_remote_tlbs_with_address(vcpu->kvm,
> > -					sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
> > +				kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
> >  
> >  			if (!rmap_can_add(vcpu))
> >  				break;
> > -- 
> > 2.31.1
> >
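
For reference on the point above, the VMX Hyper-V enlightenment issues the
flush hypercall against a specific EPT root, roughly as below (paraphrased
from memory of vmx.c in this era; names and signatures may be slightly off,
so treat this as a sketch rather than the exact code):

/* Sketch: the flush is requested for a gfn range under a given EPT root. */
static inline int hv_remote_flush_root_ept(hpa_t root_ept,
					   struct kvm_tlb_range *range)
{
	if (range)
		return hyperv_flush_guest_mapping_range(root_ept,
				kvm_fill_hv_flush_list_func, (void *)range);
	else
		return hyperv_flush_guest_mapping(root_ept);
}

If that reading is right, Hyper-V interprets the passed range relative to
that root, which would match Hou's reasoning for the nested case.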

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3bcff56df109..e0b9432b9491 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -260,6 +260,18 @@  void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
 	kvm_flush_remote_tlbs_with_range(kvm, &range);
 }
 
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
+
+/* Flush the range of guest memory mapped by the given SPTE. */
+static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
+{
+	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+	gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
+
+	kvm_flush_remote_tlbs_with_address(kvm, gfn,
+					   KVM_PAGES_PER_HPAGE(sp->role.level));
+}
+
 /* Flush all memory mapped by the given direct SP. */
 static void kvm_flush_remote_tlbs_direct_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
@@ -1156,8 +1168,7 @@  static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
 	drop_spte(kvm, sptep);
 
 	if (flush)
-		kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
-			KVM_PAGES_PER_HPAGE(sp->role.level));
+		kvm_flush_remote_tlbs_sptep(kvm, sptep);
 }
 
 /*
@@ -1608,7 +1619,7 @@  static void __rmap_add(struct kvm *kvm,
 	if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
 		kvm_zap_all_rmap_sptes(kvm, rmap_head);
 		kvm_flush_remote_tlbs_with_address(
-				kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
+				kvm, gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
 	}
 }
 
@@ -6402,8 +6413,7 @@  static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
 
 			if (kvm_available_flush_tlb_with_range())
-				kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
-					KVM_PAGES_PER_HPAGE(sp->role.level));
+				kvm_flush_remote_tlbs_sptep(kvm, sptep);
 			else
 				need_tlb_flush = 1;
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 39e0205e7300..04149c704d5b 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -937,8 +937,7 @@  static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
 
 			mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
 			if (is_shadow_present_pte(old_spte))
-				kvm_flush_remote_tlbs_with_address(vcpu->kvm,
-					sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
+				kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
 
 			if (!rmap_can_add(vcpu))
 				break;