
[v3,08/15] KVM: MMU: allow unmap invalid rmap out of mmu-lock

Message ID 1366093973-2617-9-git-send-email-xiaoguangrong@linux.vnet.ibm.com (mailing list archive)
State New, archived

Commit Message

Xiao Guangrong April 16, 2013, 6:32 a.m. UTC
pte_list_clear_concurrently() allows us to reset a pte-desc entry
outside of mmu-lock. We can also reset an spte outside of mmu-lock as long
as we protect the lifecycle of the sp; the following scheme achieves that:

unmap_memslot_rmap_nolock():
for-each-rmap-in-slot:
      preempt_disable
      kvm->arch.being_unmapped_rmap = rmapp
      clear spte and reset rmap entry
      kvm->arch.being_unmapped_rmap = NULL
      preempt_enable

Other paths, such as zap-sp and mmu-notify, which run under
mmu-lock:
      clear spte and reset rmap entry
retry:
      if (kvm->arch.being_unmapped_rmap == rmap)
		goto retry
(the wait is very rare and clearing one rmap is very fast, so
even the occasional wait is cheap)

Then we can be sure the spte is always valid while
unmap_memslot_rmap_nolock() runs.
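
The handshake can be modelled in plain C roughly as follows. This is only an
illustrative userspace sketch, not code from the patch: it uses C11 atomics in
place of smp_mb()/ACCESS_ONCE()/cpu_relax(), omits preempt_disable()/enable(),
and the helper names are made up for the example.

#include <stdatomic.h>
#include <stddef.h>

static _Atomic(unsigned long *) being_unmapped_rmap;

/* Lock-free side: announce the rmap, clear it, then retract the announcement. */
static void unmap_rmap_nolock_model(unsigned long *rmapp,
				    void (*clear_rmap)(unsigned long *))
{
	atomic_store(&being_unmapped_rmap, rmapp);
	/* The announcement must be visible before any spte is touched (smp_mb()). */
	atomic_thread_fence(memory_order_seq_cst);

	clear_rmap(rmapp);		/* clear sptes and reset the rmap entries */

	/* All clearing must be visible before the announcement is retracted (smp_mb()). */
	atomic_thread_fence(memory_order_seq_cst);
	atomic_store(&being_unmapped_rmap, NULL);
}

/* mmu-lock side: after resetting its own entries, wait out a concurrent
 * lock-free walker before the spte may be freed. */
static void sync_being_unmapped_model(unsigned long *rmapp)
{
	/* The entries must be reset before the announcement is checked (smp_mb()). */
	atomic_thread_fence(memory_order_seq_cst);
	while (atomic_load(&being_unmapped_rmap) == rmapp)
		;			/* cpu_relax() in the real code */
}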

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/include/asm/kvm_host.h |    2 +
 arch/x86/kvm/mmu.c              |  114 ++++++++++++++++++++++++++++++++++++---
 arch/x86/kvm/mmu.h              |    2 +-
 3 files changed, 110 insertions(+), 8 deletions(-)

Comments

Gleb Natapov April 18, 2013, 11 a.m. UTC | #1
On Tue, Apr 16, 2013 at 02:32:46PM +0800, Xiao Guangrong wrote:
> pte_list_clear_concurrently allows us to reset pte-desc entry
> out of mmu-lock. We can reset spte out of mmu-lock if we can protect the
> lifecycle of sp, we use this way to achieve the goal:
> 
> unmap_memslot_rmap_nolock():
> for-each-rmap-in-slot:
>       preempt_disable
>       kvm->arch.being_unmapped_rmap = rmapp
>       clear spte and reset rmap entry
>       kvm->arch.being_unmapped_rmap = NULL
>       preempt_enable
> 
> Other patch like zap-sp and mmu-notify which are protected
> by mmu-lock:
>       clear spte and reset rmap entry
> retry:
>       if (kvm->arch.being_unmapped_rmap == rmap)
> 		goto retry
> (the wait is very rare and clear one rmap is very fast, it
> is not bad even if wait is needed)
> 
I do not understand how this achieves the goal. Suppose that rmap
== X and kvm->arch.being_unmapped_rmap == NULL, so "goto retry" is skipped,
but a moment later unmap_memslot_rmap_nolock() does
kvm->arch.being_unmapped_rmap = X.
 
> Then, we can sure the spte is always available when we do
> unmap_memslot_rmap_nolock
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    2 +
>  arch/x86/kvm/mmu.c              |  114 ++++++++++++++++++++++++++++++++++++---
>  arch/x86/kvm/mmu.h              |    2 +-
>  3 files changed, 110 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 5fd6ed1..1ad9a34 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -536,6 +536,8 @@ struct kvm_arch {
>  	 * Hash table of struct kvm_mmu_page.
>  	 */
>  	struct list_head active_mmu_pages;
> +	unsigned long *being_unmapped_rmap;
> +
>  	struct list_head assigned_dev_head;
>  	struct iommu_domain *iommu_domain;
>  	int iommu_flags;
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 2a7a5d0..e6414d2 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1104,10 +1104,10 @@ static int slot_rmap_add(struct kvm_memory_slot *slot,
>  	return slot->arch.ops->rmap_add(vcpu, spte, rmapp);
>  }
>  
> -static void slot_rmap_remove(struct kvm_memory_slot *slot,
> +static void slot_rmap_remove(struct kvm_memory_slot *slot, struct kvm *kvm,
>  			     unsigned long *rmapp, u64 *spte)
>  {
> -	slot->arch.ops->rmap_remove(spte, rmapp);
> +	slot->arch.ops->rmap_remove(kvm, spte, rmapp);
>  }
>  
>  static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
> @@ -1132,7 +1132,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
>  	sp = page_header(__pa(spte));
>  	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
>  	rmapp = gfn_to_rmap(kvm, &slot, gfn, sp->role.level);
> -	slot_rmap_remove(slot, rmapp, spte);
> +	slot_rmap_remove(slot, kvm, rmapp, spte);
>  }
>  
>  /*
> @@ -1589,9 +1589,14 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
>  	return kvm_handle_hva(kvm, hva, 0, slot_rmap_test_age);
>  }
>  
> +static void rmap_remove_spte(struct kvm *kvm, u64 *spte, unsigned long *rmapp)
> +{
> +	pte_list_remove(spte, rmapp);
> +}
> +
>  static struct rmap_operations normal_rmap_ops = {
>  	.rmap_add = pte_list_add,
> -	.rmap_remove = pte_list_remove,
> +	.rmap_remove = rmap_remove_spte,
>  
>  	.rmap_write_protect = __rmap_write_protect,
>  
> @@ -1613,9 +1618,27 @@ static int invalid_rmap_add(struct kvm_vcpu *vcpu, u64 *spte,
>  	return 0;
>  }
>  
> -static void invalid_rmap_remove(u64 *spte, unsigned long *rmapp)
> +static void sync_being_unmapped_rmap(struct kvm *kvm, unsigned long *rmapp)
> +{
> +	/*
> +	 * Ensure all the sptes on the rmap have been zapped and
> +	 * the rmap's entries have been reset so that
> +	 * unmap_invalid_rmap_nolock can not get any spte from the
> +	 * rmap after calling sync_being_unmapped_rmap().
> +	 */
> +	smp_mb();
> +retry:
> +	if (unlikely(ACCESS_ONCE(kvm->arch.being_unmapped_rmap) == rmapp)) {
> +		cpu_relax();
> +		goto retry;
> +	}
> +}
> +
> +static void
> +invalid_rmap_remove(struct kvm *kvm, u64 *spte, unsigned long *rmapp)
>  {
>  	pte_list_clear_concurrently(spte, rmapp);
> +	sync_being_unmapped_rmap(kvm, rmapp);
>  }
>  
>  static bool invalid_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
> @@ -1635,7 +1658,11 @@ static int __kvm_unmap_invalid_rmapp(unsigned long *rmapp)
>  		if (sptep == PTE_LIST_SPTE_SKIP)
>  			continue;
>  
> -		/* Do not call .rmap_remove(). */
> +		/*
> +		 * Do not call .rmap_remove() since we do not want to wait
> +		 * on sync_being_unmapped_rmap() when all sptes should be
> +		 * removed from the rmap.
> +		 */
>  		if (mmu_spte_clear_track_bits(sptep))
>  			pte_list_clear_concurrently(sptep, rmapp);
>  	}
> @@ -1645,7 +1672,10 @@ static int __kvm_unmap_invalid_rmapp(unsigned long *rmapp)
>  
>  static int kvm_unmap_invalid_rmapp(struct kvm *kvm, unsigned long *rmapp)
>  {
> -	return __kvm_unmap_invalid_rmapp(rmapp);
> +	int ret = __kvm_unmap_invalid_rmapp(rmapp);
> +
> +	sync_being_unmapped_rmap(kvm, rmapp);
> +	return ret;
>  }
>  
>  static int invalid_rmap_set_pte(struct kvm *kvm, unsigned long *rmapp,
> @@ -1686,6 +1716,76 @@ static struct rmap_operations invalid_rmap_ops = {
>  	.rmap_unmap = kvm_unmap_invalid_rmapp
>  };
>  
> +typedef void (*handle_rmap_fun)(unsigned long *rmapp, void *data);
> +static void walk_memslot_rmap_nolock(struct kvm_memory_slot *slot,
> +				     handle_rmap_fun fun, void *data)
> +{
> +	int level;
> +
> +	for (level = PT_PAGE_TABLE_LEVEL;
> +	      level < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++level) {
> +		unsigned long idx, *rmapp;
> +
> +		rmapp = slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL];
> +		idx = gfn_to_index(slot->base_gfn + slot->npages - 1,
> +				   slot->base_gfn, level) + 1;
> +		/*
> +		 * Walk rmap from the high index to low index to reduce
> +		 * possible wait in sync_being_unmapped_rmap().
> +		 */
> +		while (idx--)
> +			fun(rmapp + idx, data);
> +	}
> +}
> +
> +static void unmap_rmap_no_lock_begin(struct kvm *kvm, unsigned long *rmapp)
> +{
> +	preempt_disable();
> +	kvm->arch.being_unmapped_rmap = rmapp;
> +
> +	/*
> +	 * Set being_unmapped_rmap should be before read/write any
> +	 * sptes on the rmaps.
> +	 * See the comment in sync_being_unmapped_rmap().
> +	 */
> +	smp_mb();
> +}
> +
> +static void unmap_rmap_no_lock_end(struct kvm *kvm)
> +{
> +	/*
> +	 * Ensure clearing spte and resetting rmap's entries has
> +	 * been finished.
> +	 * See the comment in sync_being_unmapped_rmap().
> +	 */
> +	smp_mb();
> +	kvm->arch.being_unmapped_rmap = NULL;
> +	preempt_enable();
> +}
> +
> +static void unmap_invalid_rmap_nolock(unsigned long *rmapp, void *data)
> +{
> +	struct kvm *kvm = (struct kvm *)data;
> +
> +	if (!ACCESS_ONCE(*rmapp))
> +		return;
> +
> +	unmap_rmap_no_lock_begin(kvm, rmapp);
> +	__kvm_unmap_invalid_rmapp(rmapp);
> +	unmap_rmap_no_lock_end(kvm);
> +}
> +
> +static void
> +unmap_memslot_rmap_nolock(struct kvm *kvm, struct kvm_memory_slot *slot)
> +{
> +	/* Only invalid rmaps can be unmapped out of mmu-lock. */
> +	WARN_ON(slot->arch.ops != &invalid_rmap_ops);
> +	/* Use slots_lock to protect kvm->arch.being_unmapped_rmap. */
> +	WARN_ON(!mutex_is_locked(&kvm->slots_lock));
> +
> +	walk_memslot_rmap_nolock(slot, unmap_invalid_rmap_nolock, kvm);
> +}
> +
>  #ifdef MMU_DEBUG
>  static int is_empty_shadow_page(u64 *spt)
>  {
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index bb2b22e..d6aa31a 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -117,7 +117,7 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
>  struct rmap_operations {
>  	int (*rmap_add)(struct kvm_vcpu *vcpu, u64 *spte,
>  			unsigned long *rmap);
> -	void (*rmap_remove)(u64 *spte, unsigned long *rmap);
> +	void (*rmap_remove)(struct kvm *kvm, u64 *spte, unsigned long *rmap);
>  
>  	bool (*rmap_write_protect)(struct kvm *kvm, unsigned long *rmap,
>  				   bool pt_protect);
> -- 
> 1.7.7.6

--
			Gleb.
Xiao Guangrong April 18, 2013, 11:22 a.m. UTC | #2
On 04/18/2013 07:00 PM, Gleb Natapov wrote:
> On Tue, Apr 16, 2013 at 02:32:46PM +0800, Xiao Guangrong wrote:
>> pte_list_clear_concurrently allows us to reset pte-desc entry
>> out of mmu-lock. We can reset spte out of mmu-lock if we can protect the
>> lifecycle of sp, we use this way to achieve the goal:
>>
>> unmap_memslot_rmap_nolock():
>> for-each-rmap-in-slot:
>>       preempt_disable
>>       kvm->arch.being_unmapped_rmap = rmapp
>>       clear spte and reset rmap entry
>>       kvm->arch.being_unmapped_rmap = NULL
>>       preempt_enable
>>
>> Other patch like zap-sp and mmu-notify which are protected
>> by mmu-lock:
>>       clear spte and reset rmap entry
>> retry:
>>       if (kvm->arch.being_unmapped_rmap == rmap)
>> 		goto retry
>> (the wait is very rare and clear one rmap is very fast, it
>> is not bad even if wait is needed)
>>
> I do not understand what how this achieve the goal. Suppose that rmap
> == X and kvm->arch.being_unmapped_rmap == NULL so "goto retry" is skipped,
> but moment later unmap_memslot_rmap_nolock() does
> vm->arch.being_unmapped_rmap = X.

Accessing the rmap is always safe, since the rmap and its entries stay valid
until the memslot is destroyed.

This algorithm protects the spte, because an spte can only be freed under
the protection of mmu-lock.

In your scenario:

======
   CPU 1                                      CPU 2

vcpu / mmu-notify access the RMAP         unmap rmap out of mmu-lock which is under
which is under mmu-lock                   slot-lock

zap spte1
clear RMAP entry

see kvm->arch.being_unmapped_rmap == NULL,
so do not wait

free spte1

                                        set kvm->arch.being_unmapped_rmap = RMAP
                                        walk the RMAP and do not see spte1 on it
                                        (the entry of spte1 has been reset by CPU 1)
                                        set kvm->arch.being_unmapped_rmap = NULL
======

That is what prevents CPU 2 from accessing the freed spte.


Gleb Natapov April 18, 2013, 11:38 a.m. UTC | #3
On Thu, Apr 18, 2013 at 07:22:23PM +0800, Xiao Guangrong wrote:
> On 04/18/2013 07:00 PM, Gleb Natapov wrote:
> > On Tue, Apr 16, 2013 at 02:32:46PM +0800, Xiao Guangrong wrote:
> >> pte_list_clear_concurrently allows us to reset pte-desc entry
> >> out of mmu-lock. We can reset spte out of mmu-lock if we can protect the
> >> lifecycle of sp, we use this way to achieve the goal:
> >>
> >> unmap_memslot_rmap_nolock():
> >> for-each-rmap-in-slot:
> >>       preempt_disable
> >>       kvm->arch.being_unmapped_rmap = rmapp
> >>       clear spte and reset rmap entry
> >>       kvm->arch.being_unmapped_rmap = NULL
> >>       preempt_enable
> >>
> >> Other patch like zap-sp and mmu-notify which are protected
> >> by mmu-lock:
> >>       clear spte and reset rmap entry
> >> retry:
> >>       if (kvm->arch.being_unmapped_rmap == rmap)
> >> 		goto retry
> >> (the wait is very rare and clear one rmap is very fast, it
> >> is not bad even if wait is needed)
> >>
> > I do not understand what how this achieve the goal. Suppose that rmap
> > == X and kvm->arch.being_unmapped_rmap == NULL so "goto retry" is skipped,
> > but moment later unmap_memslot_rmap_nolock() does
> > vm->arch.being_unmapped_rmap = X.
> 
> Access rmap is always safe since rmap and its entries are valid until
> memslot is destroyed.
> 
> This algorithm protects spte since it can be freed in the protection of mmu-lock.
> 
> In your scenario:
> 
> ======
>    CPU 1                                      CPU 2
> 
> vcpu / mmu-notify access the RMAP         unmap rmap out of mmu-lock which is under
> which is under mmu-lock                   slot-lock
> 
> zap spte1
> clear RMAP entry
> 
> kvm->arch.being_unmapped_rmap = NULL,
> do not wait
> 
> free spte1
> 
>                                         set kvm->arch.being_unmapped_rmap = RMAP
>                                         walking RMAP and do not see spet1 on RMAP
>                                         (the entry of spte 1 has been reset by CPU 1)
and what prevents this from happening concurrently with "clear RMAP
entry"? Is it safe?

>                                         set kvm->arch.being_unmapped_rmap = NULL
> ======
> 
> That protect CPU 2 can not access the freed-spte.
>

--
			Gleb.
Xiao Guangrong April 18, 2013, 12:10 p.m. UTC | #4
On 04/18/2013 07:38 PM, Gleb Natapov wrote:
> On Thu, Apr 18, 2013 at 07:22:23PM +0800, Xiao Guangrong wrote:
>> On 04/18/2013 07:00 PM, Gleb Natapov wrote:
>>> On Tue, Apr 16, 2013 at 02:32:46PM +0800, Xiao Guangrong wrote:
>>>> pte_list_clear_concurrently allows us to reset pte-desc entry
>>>> out of mmu-lock. We can reset spte out of mmu-lock if we can protect the
>>>> lifecycle of sp, we use this way to achieve the goal:
>>>>
>>>> unmap_memslot_rmap_nolock():
>>>> for-each-rmap-in-slot:
>>>>       preempt_disable
>>>>       kvm->arch.being_unmapped_rmap = rmapp
>>>>       clear spte and reset rmap entry
>>>>       kvm->arch.being_unmapped_rmap = NULL
>>>>       preempt_enable
>>>>
>>>> Other patch like zap-sp and mmu-notify which are protected
>>>> by mmu-lock:
>>>>       clear spte and reset rmap entry
>>>> retry:
>>>>       if (kvm->arch.being_unmapped_rmap == rmap)
>>>> 		goto retry
>>>> (the wait is very rare and clear one rmap is very fast, it
>>>> is not bad even if wait is needed)
>>>>
>>> I do not understand what how this achieve the goal. Suppose that rmap
>>> == X and kvm->arch.being_unmapped_rmap == NULL so "goto retry" is skipped,
>>> but moment later unmap_memslot_rmap_nolock() does
>>> vm->arch.being_unmapped_rmap = X.
>>
>> Access rmap is always safe since rmap and its entries are valid until
>> memslot is destroyed.
>>
>> This algorithm protects spte since it can be freed in the protection of mmu-lock.
>>
>> In your scenario:
>>
>> ======
>>    CPU 1                                      CPU 2
>>
>> vcpu / mmu-notify access the RMAP         unmap rmap out of mmu-lock which is under
>> which is under mmu-lock                   slot-lock
>>
>> zap spte1
>> clear RMAP entry
>>
>> kvm->arch.being_unmapped_rmap = NULL,
>> do not wait
>>
>> free spte1
>>
>>                                         set kvm->arch.being_unmapped_rmap = RMAP
>>                                         walking RMAP and do not see spet1 on RMAP
>>                                         (the entry of spte 1 has been reset by CPU 1)
> and what prevents this from happening concurrently with "clear RMAP
> entry"? Is it safe?

The only possible change to an RMAP entry is from a valid spte to PTE_LIST_SPTE_SKIP
(there is no valid-spte to valid-spte transition and no empty-entry to new-spte transition).

There are three possible cases:
case 1): both paths see the valid spte.
         The worst case is that the host page is A/D-tracked twice
         (multiple calls of kvm_set_pfn_accessed/kvm_set_pfn_dirty), which is safe.

case 2): only the path under the protection of mmu-lock sees the valid spte.
         This is safe since the RMAP and the spte are always valid under mmu-lock.

case 3): only the path outside of mmu-lock sees the valid spte.
         Then the path under mmu-lock waits until the lock-free path has
         finished. The spte is still valid, and the lock-free path can safely
         call kvm_set_pfn_accessed/kvm_set_pfn_dirty.
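
The argument can be reduced to the entry state machine. The sketch below is an
illustrative userspace model, not code from this series: SPTE_SKIP and the
helper names are stand-ins for PTE_LIST_SPTE_SKIP and the real pte-list
accessors, and pte-desc chains are ignored.

#include <stdatomic.h>
#include <stdint.h>

#define SPTE_SKIP ((uintptr_t)1)	/* stand-in for PTE_LIST_SPTE_SKIP */

/* Either path retires an entry the same way: valid spte -> SKIP.  There is
 * never SKIP -> valid spte and never valid spte -> different valid spte. */
static uintptr_t clear_entry_model(_Atomic uintptr_t *entry)
{
	return atomic_exchange(entry, SPTE_SKIP);
}

/* A concurrent walker therefore sees either the old valid spte (cases 1/3)
 * or the SKIP marker (case 2); doing the A/D tracking twice is harmless,
 * as case 1 above notes. */
static void walk_entry_model(_Atomic uintptr_t *entry,
			     void (*track)(uintptr_t spte))
{
	uintptr_t spte = atomic_load(entry);

	if (spte && spte != SPTE_SKIP)
		track(spte);
}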

Do you see any potential issue?



Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5fd6ed1..1ad9a34 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -536,6 +536,8 @@  struct kvm_arch {
 	 * Hash table of struct kvm_mmu_page.
 	 */
 	struct list_head active_mmu_pages;
+	unsigned long *being_unmapped_rmap;
+
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
 	int iommu_flags;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a7a5d0..e6414d2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1104,10 +1104,10 @@  static int slot_rmap_add(struct kvm_memory_slot *slot,
 	return slot->arch.ops->rmap_add(vcpu, spte, rmapp);
 }
 
-static void slot_rmap_remove(struct kvm_memory_slot *slot,
+static void slot_rmap_remove(struct kvm_memory_slot *slot, struct kvm *kvm,
 			     unsigned long *rmapp, u64 *spte)
 {
-	slot->arch.ops->rmap_remove(spte, rmapp);
+	slot->arch.ops->rmap_remove(kvm, spte, rmapp);
 }
 
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -1132,7 +1132,7 @@  static void rmap_remove(struct kvm *kvm, u64 *spte)
 	sp = page_header(__pa(spte));
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	rmapp = gfn_to_rmap(kvm, &slot, gfn, sp->role.level);
-	slot_rmap_remove(slot, rmapp, spte);
+	slot_rmap_remove(slot, kvm, rmapp, spte);
 }
 
 /*
@@ -1589,9 +1589,14 @@  int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
 	return kvm_handle_hva(kvm, hva, 0, slot_rmap_test_age);
 }
 
+static void rmap_remove_spte(struct kvm *kvm, u64 *spte, unsigned long *rmapp)
+{
+	pte_list_remove(spte, rmapp);
+}
+
 static struct rmap_operations normal_rmap_ops = {
 	.rmap_add = pte_list_add,
-	.rmap_remove = pte_list_remove,
+	.rmap_remove = rmap_remove_spte,
 
 	.rmap_write_protect = __rmap_write_protect,
 
@@ -1613,9 +1618,27 @@  static int invalid_rmap_add(struct kvm_vcpu *vcpu, u64 *spte,
 	return 0;
 }
 
-static void invalid_rmap_remove(u64 *spte, unsigned long *rmapp)
+static void sync_being_unmapped_rmap(struct kvm *kvm, unsigned long *rmapp)
+{
+	/*
+	 * Ensure all the sptes on the rmap have been zapped and
+	 * the rmap's entries have been reset so that
+	 * unmap_invalid_rmap_nolock can not get any spte from the
+	 * rmap after calling sync_being_unmapped_rmap().
+	 */
+	smp_mb();
+retry:
+	if (unlikely(ACCESS_ONCE(kvm->arch.being_unmapped_rmap) == rmapp)) {
+		cpu_relax();
+		goto retry;
+	}
+}
+
+static void
+invalid_rmap_remove(struct kvm *kvm, u64 *spte, unsigned long *rmapp)
 {
 	pte_list_clear_concurrently(spte, rmapp);
+	sync_being_unmapped_rmap(kvm, rmapp);
 }
 
 static bool invalid_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
@@ -1635,7 +1658,11 @@  static int __kvm_unmap_invalid_rmapp(unsigned long *rmapp)
 		if (sptep == PTE_LIST_SPTE_SKIP)
 			continue;
 
-		/* Do not call .rmap_remove(). */
+		/*
+		 * Do not call .rmap_remove() since we do not want to wait
+		 * on sync_being_unmapped_rmap() when all sptes should be
+		 * removed from the rmap.
+		 */
 		if (mmu_spte_clear_track_bits(sptep))
 			pte_list_clear_concurrently(sptep, rmapp);
 	}
@@ -1645,7 +1672,10 @@  static int __kvm_unmap_invalid_rmapp(unsigned long *rmapp)
 
 static int kvm_unmap_invalid_rmapp(struct kvm *kvm, unsigned long *rmapp)
 {
-	return __kvm_unmap_invalid_rmapp(rmapp);
+	int ret = __kvm_unmap_invalid_rmapp(rmapp);
+
+	sync_being_unmapped_rmap(kvm, rmapp);
+	return ret;
 }
 
 static int invalid_rmap_set_pte(struct kvm *kvm, unsigned long *rmapp,
@@ -1686,6 +1716,76 @@  static struct rmap_operations invalid_rmap_ops = {
 	.rmap_unmap = kvm_unmap_invalid_rmapp
 };
 
+typedef void (*handle_rmap_fun)(unsigned long *rmapp, void *data);
+static void walk_memslot_rmap_nolock(struct kvm_memory_slot *slot,
+				     handle_rmap_fun fun, void *data)
+{
+	int level;
+
+	for (level = PT_PAGE_TABLE_LEVEL;
+	      level < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++level) {
+		unsigned long idx, *rmapp;
+
+		rmapp = slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL];
+		idx = gfn_to_index(slot->base_gfn + slot->npages - 1,
+				   slot->base_gfn, level) + 1;
+		/*
+		 * Walk rmap from the high index to low index to reduce
+		 * possible wait in sync_being_unmapped_rmap().
+		 */
+		while (idx--)
+			fun(rmapp + idx, data);
+	}
+}
+
+static void unmap_rmap_no_lock_begin(struct kvm *kvm, unsigned long *rmapp)
+{
+	preempt_disable();
+	kvm->arch.being_unmapped_rmap = rmapp;
+
+	/*
+	 * Set being_unmapped_rmap should be before read/write any
+	 * sptes on the rmaps.
+	 * See the comment in sync_being_unmapped_rmap().
+	 */
+	smp_mb();
+}
+
+static void unmap_rmap_no_lock_end(struct kvm *kvm)
+{
+	/*
+	 * Ensure clearing spte and resetting rmap's entries has
+	 * been finished.
+	 * See the comment in sync_being_unmapped_rmap().
+	 */
+	smp_mb();
+	kvm->arch.being_unmapped_rmap = NULL;
+	preempt_enable();
+}
+
+static void unmap_invalid_rmap_nolock(unsigned long *rmapp, void *data)
+{
+	struct kvm *kvm = (struct kvm *)data;
+
+	if (!ACCESS_ONCE(*rmapp))
+		return;
+
+	unmap_rmap_no_lock_begin(kvm, rmapp);
+	__kvm_unmap_invalid_rmapp(rmapp);
+	unmap_rmap_no_lock_end(kvm);
+}
+
+static void
+unmap_memslot_rmap_nolock(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	/* Only invalid rmaps can be unmapped out of mmu-lock. */
+	WARN_ON(slot->arch.ops != &invalid_rmap_ops);
+	/* Use slots_lock to protect kvm->arch.being_unmapped_rmap. */
+	WARN_ON(!mutex_is_locked(&kvm->slots_lock));
+
+	walk_memslot_rmap_nolock(slot, unmap_invalid_rmap_nolock, kvm);
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bb2b22e..d6aa31a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -117,7 +117,7 @@  static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
 struct rmap_operations {
 	int (*rmap_add)(struct kvm_vcpu *vcpu, u64 *spte,
 			unsigned long *rmap);
-	void (*rmap_remove)(u64 *spte, unsigned long *rmap);
+	void (*rmap_remove)(struct kvm *kvm, u64 *spte, unsigned long *rmap);
 
 	bool (*rmap_write_protect)(struct kvm *kvm, unsigned long *rmap,
 				   bool pt_protect);