[v4,09/12] KVM: x86/mmu: serialize vCPUs to zap gfn when guest MTRRs are honored

Message ID 20230714065454.20688-1-yan.y.zhao@intel.com (mailing list archive)
State New, archived
Series KVM: x86/mmu: refine memtype related mmu zap

Commit Message

Yan Zhao July 14, 2023, 6:54 a.m. UTC
Serialize concurrent and repeated calls of kvm_zap_gfn_range() from every
vCPU for CR0.CD toggles and MTRR updates when guest MTRRs are honored.

During guest boot-up, if guest MTRRs are honored by TDP, TDP zaps are
triggered several times by each vCPU for CR0.CD toggles and MTRR updates.
This can take unexpectedly long because of contention on kvm->mmu_lock.

Therefore, introduce an mtrr_zap_list to remove duplicated zaps and an atomic
mtrr_zapping flag to allow only one vCPU to do the real zap work at a time.

Cc: Yuan Yao <yuan.yao@linux.intel.com>
Suggested-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 arch/x86/include/asm/kvm_host.h |   4 ++
 arch/x86/kvm/mtrr.c             | 122 +++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              |   5 +-
 arch/x86/kvm/x86.h              |   1 +
 4 files changed, 130 insertions(+), 2 deletions(-)

Comments

Sean Christopherson Aug. 25, 2023, 10:47 p.m. UTC | #1
On Fri, Jul 14, 2023, Yan Zhao wrote:
> +/*
> + * Add @range into kvm->arch.mtrr_zap_list and sort the list in
> + * "length" ascending + "start" descending order, so that
> + * ranges consuming more zap cycles can be dequeued later and their
> + * chances of being found duplicated are increased.

Wrap comments as close to 80 chars as possible.

> + */
> +static void kvm_add_mtrr_zap_list(struct kvm *kvm, struct mtrr_zap_range *range)
> +{
> +	struct list_head *head = &kvm->arch.mtrr_zap_list;
> +	u64 len = range->end - range->start;
> +	struct mtrr_zap_range *cur, *n;
> +	bool added = false;
> +
> +	spin_lock(&kvm->arch.mtrr_zap_list_lock);
> +
> +	if (list_empty(head)) {
> +		list_add(&range->node, head);
> +		spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> +		return;

Make this

		goto out;

or
		goto out_unlock;

and then do the same instead of the break; in the loop.  Then "added" goes away
and there's a single unlock.

> +	}
> +
> +	list_for_each_entry_safe(cur, n, head, node) {

This shouldn't need to use the _safe() variant, it's not deleting anything.

> +		u64 cur_len = cur->end - cur->start;
> +
> +		if (len < cur_len)
> +			break;
> +
> +		if (len > cur_len)
> +			continue;
> +
> +		if (range->start > cur->start)
> +			break;
> +
> +		if (range->start < cur->start)
> +			continue;

Looking at kvm_zap_mtrr_zap_list(), wouldn't we be better off sorting by start,
and then batching in kvm_zap_mtrr_zap_list()?  And maybe make the batching "fuzzy"
for fixed MTRRs?  I.e. if KVM is zapping any fixed MTRRs, zap all fixed MTRR ranges
even if there's a gap.
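
Purely as an illustrative sketch of that direction (the helper name below is
invented and locking is omitted for brevity), batching a start-sorted list
could look something like:

  static void kvm_zap_mtrr_zap_list_batched(struct kvm *kvm)
  {
	struct list_head *head = &kvm->arch.mtrr_zap_list;
	struct mtrr_zap_range *cur, *n;
	gfn_t start = 0, end = 0;
	bool pending = false;

	/* Assumes the list is kept sorted by ->start. */
	list_for_each_entry_safe(cur, n, head, node) {
		if (pending && cur->start <= end) {
			/* Overlaps or abuts the pending batch; extend it. */
			end = max(end, cur->end);
		} else {
			if (pending)
				kvm_zap_gfn_range(kvm, start, end);
			start = cur->start;
			end = cur->end;
			pending = true;
		}
		list_del(&cur->node);
		kfree(cur);
	}

	if (pending)
		kvm_zap_gfn_range(kvm, start, end);
  }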

> +
> +		/* equal len & start, no need to add */
> +		added = true;
> +		kfree(range);


Hmm, the memory allocations are a bit of complexity that I'd prefer to avoid.
At a minimum, I think kvm_add_mtrr_zap_list() should do the allocation.  That'll
dedup a decent amount of code.

At the risk of rehashing the old memslots implementation, I think we should simply
have a statically sized array in struct kvm to hold "range to zap".  E.g. use 16
entries, bin all fixed MTRRs into a single range, and if the remaining 15 fill up,
purge and fall back to a full zap.

128 bytes per VM is totally acceptable, especially since we're burning waaay
more than that to deal with per-vCPU MTRRs.  And a well-behaved guest should have
identical MTRRs across all vCPUs, or maybe at worst one config for the BSP and
one for APs.
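
A minimal sketch of that shape, purely illustrative (all names below are
invented, and locking plus the fixed-MTRR binning are omitted):

  #define KVM_MAX_MTRR_ZAP_RANGES	16

  struct kvm_mtrr_zap_ranges {
	int nr;			/* -1: overflowed, needs a full zap */
	struct {
		gfn_t start;
		gfn_t end;	/* exclusive */
	} range[KVM_MAX_MTRR_ZAP_RANGES];
  };

  static void mtrr_zap_stash_range(struct kvm_mtrr_zap_ranges *z,
				   gfn_t start, gfn_t end)
  {
	if (z->nr < 0)
		return;		/* already overflowed, full zap pending */

	if (z->nr == KVM_MAX_MTRR_ZAP_RANGES) {
		z->nr = -1;	/* purge and fall back to zapping 0 => ~0ull */
		return;
	}

	z->range[z->nr].start = start;
	z->range[z->nr].end = end;
	z->nr++;
  }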

> +		break;
> +	}
> +
> +	if (!added)
> +		list_add_tail(&range->node, &cur->node);
> +
> +	spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> +}
> +
> +static void kvm_zap_mtrr_zap_list(struct kvm *kvm)
> +{
> +	struct list_head *head = &kvm->arch.mtrr_zap_list;
> +	struct mtrr_zap_range *cur = NULL;
> +
> +	spin_lock(&kvm->arch.mtrr_zap_list_lock);
> +
> +	while (!list_empty(head)) {
> +		u64 start, end;
> +
> +		cur = list_first_entry(head, typeof(*cur), node);
> +		start = cur->start;
> +		end = cur->end;
> +		list_del(&cur->node);
> +		kfree(cur);

Hmm, the memory allocations are a bit of complexity that I'd prefer to avoid.

> +		spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> +
> +		kvm_zap_gfn_range(kvm, start, end);
> +
> +		spin_lock(&kvm->arch.mtrr_zap_list_lock);
> +	}
> +
> +	spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> +}
> +
> +static void kvm_zap_or_wait_mtrr_zap_list(struct kvm *kvm)
> +{
> +	if (atomic_cmpxchg_acquire(&kvm->arch.mtrr_zapping, 0, 1) == 0) {
> +		kvm_zap_mtrr_zap_list(kvm);
> +		atomic_set_release(&kvm->arch.mtrr_zapping, 0);
> +		return;
> +	}
> +
> +	while (atomic_read(&kvm->arch.mtrr_zapping))
> +		cpu_relax();
> +}
> +
> +static void kvm_mtrr_zap_gfn_range(struct kvm_vcpu *vcpu,
> +				   gfn_t gfn_start, gfn_t gfn_end)
> +{
> +	struct mtrr_zap_range *range;
> +
> +	range = kmalloc(sizeof(*range), GFP_KERNEL_ACCOUNT);
> +	if (!range)
> +		goto fail;
> +
> +	range->start = gfn_start;
> +	range->end = gfn_end;
> +
> +	kvm_add_mtrr_zap_list(vcpu->kvm, range);
> +
> +	kvm_zap_or_wait_mtrr_zap_list(vcpu->kvm);
> +	return;
> +
> +fail:
> +	kvm_zap_gfn_range(vcpu->kvm, gfn_start, gfn_end);
> +}
> +
> +void kvm_honors_guest_mtrrs_zap_on_cd_toggle(struct kvm_vcpu *vcpu)

Rather than provide a one-liner, add something like

  void kvm_mtrr_cr0_cd_changed(struct kvm_vcpu *vcpu)
  {
	if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
		return;

	return kvm_zap_gfn_range(vcpu, 0, -1ull);
  }

that avoids the comically long function name, and keeps the MTRR logic more
contained in the MTRR code.

> +{
> +	return kvm_mtrr_zap_gfn_range(vcpu, gpa_to_gfn(0), gpa_to_gfn(~0ULL));

Meh, just zap 0 => ~0ull.  That 51:0 happens to be the theoretical max gfn on
x86 is coincidence (AFAIK).  And if the guest.MAXPHYADDR < 52, shifting ~0ull
still doesn't yield a "legal" gfn.
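
For reference, the arithmetic behind that remark (illustrative comment only):

  /*
   * gpa_to_gfn() is a PAGE_SHIFT (12-bit) right shift, so
   * gpa_to_gfn(~0ULL) == 0x000fffffffffffff, i.e. bits 51:0 set --
   * the theoretical max gfn only by coincidence, and not a "legal"
   * gfn at all when guest.MAXPHYADDR < 52.
   */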

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 32cc8bfaa5f1..bb79154cf465 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -943,7 +943,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
>  
>  	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
>  	    kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
> -		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
> +		kvm_honors_guest_mtrrs_zap_on_cd_toggle(vcpu);
>  }
>  EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
>  
> @@ -12310,6 +12310,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>  	kvm->arch.guest_can_read_msr_platform_info = true;
>  	kvm->arch.enable_pmu = enable_pmu;
>  
> +	spin_lock_init(&kvm->arch.mtrr_zap_list_lock);
> +	INIT_LIST_HEAD(&kvm->arch.mtrr_zap_list);
> +
>  #if IS_ENABLED(CONFIG_HYPERV)
>  	spin_lock_init(&kvm->arch.hv_root_tdp_lock);
>  	kvm->arch.hv_root_tdp = INVALID_PAGE;
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index e7733dc4dccc..56d8755b2560 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -315,6 +315,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
>  					  int page_num);
>  void kvm_honors_guest_mtrrs_get_cd_memtype(struct kvm_vcpu *vcpu,
>  					   u8 *type, bool *ipat);
> +void kvm_honors_guest_mtrrs_zap_on_cd_toggle(struct kvm_vcpu *vcpu);
>  bool kvm_vector_hashing_enabled(void);
>  void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
>  int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
> -- 
> 2.17.1
>
Yan Zhao Sept. 4, 2023, 8:24 a.m. UTC | #2
On Fri, Aug 25, 2023 at 03:47:11PM -0700, Sean Christopherson wrote:
> On Fri, Jul 14, 2023, Yan Zhao wrote:
> > +/*
> > + * Add @range into kvm->arch.mtrr_zap_list and sort the list in
> > + * "length" ascending + "start" descending order, so that
> > + * ranges consuming more zap cycles can be dequeued later and their
> > + * chances of being found duplicated are increased.
> 
> Wrap comments as close to 80 chars as possible.
Got it!
I thought it was easier to read when a group of words stays on one line :)


> > + */
> > +static void kvm_add_mtrr_zap_list(struct kvm *kvm, struct mtrr_zap_range *range)
> > +{
> > +	struct list_head *head = &kvm->arch.mtrr_zap_list;
> > +	u64 len = range->end - range->start;
> > +	struct mtrr_zap_range *cur, *n;
> > +	bool added = false;
> > +
> > +	spin_lock(&kvm->arch.mtrr_zap_list_lock);
> > +
> > +	if (list_empty(head)) {
> > +		list_add(&range->node, head);
> > +		spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> > +		return;
> 
> Make this
> 
> 		goto out;
> 
> or
> 		goto out_unlock;
> 
> and then do the same instead of the break; in the loop.  Then "added" goes away
> and there's a single unlock.
>
Ok.

> > +	}
> > +
> > +	list_for_each_entry_safe(cur, n, head, node) {
> 
> This shouldn't need to use the _safe() variant, it's not deleting anything.
Right. Will remove it.
The _safe() version is a leftover from my initial test versions, where items were
merged and deleted; I later found that has no performance benefit.

> > +		u64 cur_len = cur->end - cur->start;
> > +
> > +		if (len < cur_len)
> > +			break;
> > +
> > +		if (len > cur_len)
> > +			continue;
> > +
> > +		if (range->start > cur->start)
> > +			break;
> > +
> > +		if (range->start < cur->start)
> > +			continue;
> 
> Looking at kvm_zap_mtrr_zap_list(), wouldn't we be better off sorting by start,
> and then batching in kvm_zap_mtrr_zap_list()?  And maybe make the batching "fuzzy"
> for fixed MTRRs?  I.e. if KVM is zapping any fixed MTRRs, zap all fixed MTRR ranges
> even if there's a gap.
Yes, this "fuzzy" is done in the next patch.
In prepare_zaplist_fixed_mtrr_of_non_type(),
	range->start = gpa_to_gfn(fixed_seg_table[0].start);
	range->end = gpa_to_gfn(fixed_seg_table[seg_end].end);
The range start is set to the start of the first fixed range, and the end to the
end of the last fixed range.
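
For illustration only (the low-1MiB numbers come from the architectural
fixed-MTRR layout, not from the next patch itself), that binning amounts to:

	range->start = gpa_to_gfn(0x0);		/* gfn 0x0 */
	range->end   = gpa_to_gfn(0x100000);	/* gfn 0x100; fixed MTRRs cover the low 1MiB */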

> 
> > +
> > +		/* equal len & start, no need to add */
> > +		added = true;
> > +		kfree(range);
> 
> 
> Hmm, the memory allocations are a bit of complexity that I'd prefer to avoid.
> At a minimum, I think kvm_add_mtrr_zap_list() should do the allocation.  That'll
> dedup a decent amount of code.
> 
> At the risk of rehashing the old memslots implementation, I think we should simply
> have a statically sized array in struct kvm to hold "range to zap".  E.g. use 16
> entries, bin all fixed MTRRs into a single range, and if the remaining 15 fill up,
> purge and fall back to a full zap.
> 
> 128 bytes per VM is totally acceptable, especially since we're burning waaay
> more than that to deal with per-vCPU MTRRs.  And a well-behaved guest should have
> identical MTRRs across all vCPUs, or maybe at worst one config for the BSP and
> one for APs.

Ok, will do it in the next version.

> 
> > +		break;
> > +	}
> > +
> > +	if (!added)
> > +		list_add_tail(&range->node, &cur->node);
> > +
> > +	spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> > +}
> > +
> > +static void kvm_zap_mtrr_zap_list(struct kvm *kvm)
> > +{
> > +	struct list_head *head = &kvm->arch.mtrr_zap_list;
> > +	struct mtrr_zap_range *cur = NULL;
> > +
> > +	spin_lock(&kvm->arch.mtrr_zap_list_lock);
> > +
> > +	while (!list_empty(head)) {
> > +		u64 start, end;
> > +
> > +		cur = list_first_entry(head, typeof(*cur), node);
> > +		start = cur->start;
> > +		end = cur->end;
> > +		list_del(&cur->node);
> > +		kfree(cur);
> 
> Hmm, the memory allocations are a bit of complexity that I'd prefer to avoid.
yes.

> 
> > +		spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> > +
> > +		kvm_zap_gfn_range(kvm, start, end);
> > +
> > +		spin_lock(&kvm->arch.mtrr_zap_list_lock);
> > +	}
> > +
> > +	spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> > +}
> > +
> > +static void kvm_zap_or_wait_mtrr_zap_list(struct kvm *kvm)
> > +{
> > +	if (atomic_cmpxchg_acquire(&kvm->arch.mtrr_zapping, 0, 1) == 0) {
> > +		kvm_zap_mtrr_zap_list(kvm);
> > +		atomic_set_release(&kvm->arch.mtrr_zapping, 0);
> > +		return;
> > +	}
> > +
> > +	while (atomic_read(&kvm->arch.mtrr_zapping))
> > +		cpu_relax();
> > +}
> > +
> > +static void kvm_mtrr_zap_gfn_range(struct kvm_vcpu *vcpu,
> > +				   gfn_t gfn_start, gfn_t gfn_end)
> > +{
> > +	struct mtrr_zap_range *range;
> > +
> > +	range = kmalloc(sizeof(*range), GFP_KERNEL_ACCOUNT);
> > +	if (!range)
> > +		goto fail;
> > +
> > +	range->start = gfn_start;
> > +	range->end = gfn_end;
> > +
> > +	kvm_add_mtrr_zap_list(vcpu->kvm, range);
> > +
> > +	kvm_zap_or_wait_mtrr_zap_list(vcpu->kvm);
> > +	return;
> > +
> > +fail:
> > +	kvm_zap_gfn_range(vcpu->kvm, gfn_start, gfn_end);
> > +}
> > +
> > +void kvm_honors_guest_mtrrs_zap_on_cd_toggle(struct kvm_vcpu *vcpu)
> 
> Rather than provide a one-liner, add something like
> 
>   void kvm_mtrr_cr0_cd_changed(struct kvm_vcpu *vcpu)
>   {
> 	if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
> 		return;
> 
> 	return kvm_zap_gfn_range(vcpu, 0, -1ull);
>   }
> 
> that avoids the comically long function name, and keeps the MTRR logic more
> contained in the MTRR code.
Yes, it's better!
Thanks for your guidance :)

> 
> > +{
> > +	return kvm_mtrr_zap_gfn_range(vcpu, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
> 
> Meh, just zap 0 => ~0ull.  That 51:0 happens to be the theoretical max gfn on
> x86 is coincidence (AFAIK).  And if the guest.MAXPHYADDR < 52, shifting ~0ull
> still doesn't yield a "legal" gfn.
Yes. I think I just wanted to keep the page count smaller in kvm_zap_gfn_range():

kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);

> 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 32cc8bfaa5f1..bb79154cf465 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -943,7 +943,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
> >  
> >  	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
> >  	    kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
> > -		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
> > +		kvm_honors_guest_mtrrs_zap_on_cd_toggle(vcpu);
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
> >  
> > @@ -12310,6 +12310,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> >  	kvm->arch.guest_can_read_msr_platform_info = true;
> >  	kvm->arch.enable_pmu = enable_pmu;
> >  
> > +	spin_lock_init(&kvm->arch.mtrr_zap_list_lock);
> > +	INIT_LIST_HEAD(&kvm->arch.mtrr_zap_list);
> > +
> >  #if IS_ENABLED(CONFIG_HYPERV)
> >  	spin_lock_init(&kvm->arch.hv_root_tdp_lock);
> >  	kvm->arch.hv_root_tdp = INVALID_PAGE;
> > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> > index e7733dc4dccc..56d8755b2560 100644
> > --- a/arch/x86/kvm/x86.h
> > +++ b/arch/x86/kvm/x86.h
> > @@ -315,6 +315,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
> >  					  int page_num);
> >  void kvm_honors_guest_mtrrs_get_cd_memtype(struct kvm_vcpu *vcpu,
> >  					   u8 *type, bool *ipat);
> > +void kvm_honors_guest_mtrrs_zap_on_cd_toggle(struct kvm_vcpu *vcpu);
> >  bool kvm_vector_hashing_enabled(void);
> >  void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
> >  int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
> > -- 
> > 2.17.1
> >

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 28bd38303d70..8da1517a1513 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1444,6 +1444,10 @@  struct kvm_arch {
 	 */
 #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
 	struct kvm_mmu_memory_cache split_desc_cache;
+
+	struct list_head mtrr_zap_list;
+	spinlock_t mtrr_zap_list_lock;
+	atomic_t mtrr_zapping;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 64c6daa659c8..996a274cee40 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -25,6 +25,8 @@ 
 #define IA32_MTRR_DEF_TYPE_FE		(1ULL << 10)
 #define IA32_MTRR_DEF_TYPE_TYPE_MASK	(0xff)
 
+static void kvm_mtrr_zap_gfn_range(struct kvm_vcpu *vcpu,
+				   gfn_t gfn_start, gfn_t gfn_end);
 static bool is_mtrr_base_msr(unsigned int msr)
 {
 	/* MTRR base MSRs use even numbers, masks use odd numbers. */
@@ -341,7 +343,7 @@  static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
 		var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end);
 	}
 
-	kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+	kvm_mtrr_zap_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(end));
 }
 
 static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
@@ -737,3 +739,121 @@  void kvm_honors_guest_mtrrs_get_cd_memtype(struct kvm_vcpu *vcpu,
 	}
 }
 EXPORT_SYMBOL_GPL(kvm_honors_guest_mtrrs_get_cd_memtype);
+
+struct mtrr_zap_range {
+	gfn_t start;
+	/* end is exclusive */
+	gfn_t end;
+	struct list_head node;
+};
+
+/*
+ * Add @range into kvm->arch.mtrr_zap_list and sort the list in
+ * "length" ascending + "start" descending order, so that
+ * ranges consuming more zap cycles can be dequeued later and their
+ * chances of being found duplicated are increased.
+ */
+static void kvm_add_mtrr_zap_list(struct kvm *kvm, struct mtrr_zap_range *range)
+{
+	struct list_head *head = &kvm->arch.mtrr_zap_list;
+	u64 len = range->end - range->start;
+	struct mtrr_zap_range *cur, *n;
+	bool added = false;
+
+	spin_lock(&kvm->arch.mtrr_zap_list_lock);
+
+	if (list_empty(head)) {
+		list_add(&range->node, head);
+		spin_unlock(&kvm->arch.mtrr_zap_list_lock);
+		return;
+	}
+
+	list_for_each_entry_safe(cur, n, head, node) {
+		u64 cur_len = cur->end - cur->start;
+
+		if (len < cur_len)
+			break;
+
+		if (len > cur_len)
+			continue;
+
+		if (range->start > cur->start)
+			break;
+
+		if (range->start < cur->start)
+			continue;
+
+		/* equal len & start, no need to add */
+		added = true;
+		kfree(range);
+		break;
+	}
+
+	if (!added)
+		list_add_tail(&range->node, &cur->node);
+
+	spin_unlock(&kvm->arch.mtrr_zap_list_lock);
+}
+
+static void kvm_zap_mtrr_zap_list(struct kvm *kvm)
+{
+	struct list_head *head = &kvm->arch.mtrr_zap_list;
+	struct mtrr_zap_range *cur = NULL;
+
+	spin_lock(&kvm->arch.mtrr_zap_list_lock);
+
+	while (!list_empty(head)) {
+		u64 start, end;
+
+		cur = list_first_entry(head, typeof(*cur), node);
+		start = cur->start;
+		end = cur->end;
+		list_del(&cur->node);
+		kfree(cur);
+		spin_unlock(&kvm->arch.mtrr_zap_list_lock);
+
+		kvm_zap_gfn_range(kvm, start, end);
+
+		spin_lock(&kvm->arch.mtrr_zap_list_lock);
+	}
+
+	spin_unlock(&kvm->arch.mtrr_zap_list_lock);
+}
+
+static void kvm_zap_or_wait_mtrr_zap_list(struct kvm *kvm)
+{
+	if (atomic_cmpxchg_acquire(&kvm->arch.mtrr_zapping, 0, 1) == 0) {
+		kvm_zap_mtrr_zap_list(kvm);
+		atomic_set_release(&kvm->arch.mtrr_zapping, 0);
+		return;
+	}
+
+	while (atomic_read(&kvm->arch.mtrr_zapping))
+		cpu_relax();
+}
+
+static void kvm_mtrr_zap_gfn_range(struct kvm_vcpu *vcpu,
+				   gfn_t gfn_start, gfn_t gfn_end)
+{
+	struct mtrr_zap_range *range;
+
+	range = kmalloc(sizeof(*range), GFP_KERNEL_ACCOUNT);
+	if (!range)
+		goto fail;
+
+	range->start = gfn_start;
+	range->end = gfn_end;
+
+	kvm_add_mtrr_zap_list(vcpu->kvm, range);
+
+	kvm_zap_or_wait_mtrr_zap_list(vcpu->kvm);
+	return;
+
+fail:
+	kvm_zap_gfn_range(vcpu->kvm, gfn_start, gfn_end);
+}
+
+void kvm_honors_guest_mtrrs_zap_on_cd_toggle(struct kvm_vcpu *vcpu)
+{
+	return kvm_mtrr_zap_gfn_range(vcpu, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
+}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 32cc8bfaa5f1..bb79154cf465 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -943,7 +943,7 @@  void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
 
 	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
 	    kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
-		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+		kvm_honors_guest_mtrrs_zap_on_cd_toggle(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
 
@@ -12310,6 +12310,9 @@  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm->arch.guest_can_read_msr_platform_info = true;
 	kvm->arch.enable_pmu = enable_pmu;
 
+	spin_lock_init(&kvm->arch.mtrr_zap_list_lock);
+	INIT_LIST_HEAD(&kvm->arch.mtrr_zap_list);
+
 #if IS_ENABLED(CONFIG_HYPERV)
 	spin_lock_init(&kvm->arch.hv_root_tdp_lock);
 	kvm->arch.hv_root_tdp = INVALID_PAGE;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e7733dc4dccc..56d8755b2560 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -315,6 +315,7 @@  bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
 					  int page_num);
 void kvm_honors_guest_mtrrs_get_cd_memtype(struct kvm_vcpu *vcpu,
 					   u8 *type, bool *ipat);
+void kvm_honors_guest_mtrrs_zap_on_cd_toggle(struct kvm_vcpu *vcpu);
 bool kvm_vector_hashing_enabled(void);
 void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
 int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,