diff mbox

[v5,3/8] KVM: MMU: fast invalidate all pages

Message ID 1368706673-8530-4-git-send-email-xiaoguangrong@linux.vnet.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Xiao Guangrong May 16, 2013, 12:17 p.m. UTC
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while it
walks and zaps all shadow pages one by one, and it also needs to zap every
guest page's rmap and every shadow page's parent spte list. Things become
even worse if the guest uses more memory or vcpus. This is not good for
scalability.

In this patch, we introduce a faster way to invalidate all shadow pages.
KVM maintains a global mmu generation number, which is stored in
kvm->arch.mmu_valid_gen, and every shadow page stores the current global
generation number in sp->mmu_valid_gen when it is created.

When KVM needs to zap all shadow pages' sptes, it simply increases the
global generation number and then reloads the root shadow pages on all
vcpus. Each vcpu will create a new shadow page table according to kvm's
current generation number. This ensures the old pages are not used any more.
Then the invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
are zapped using the lock-break technique.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/include/asm/kvm_host.h |    2 +
 arch/x86/kvm/mmu.c              |   98 +++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu.h              |    2 +
 3 files changed, 102 insertions(+), 0 deletions(-)

Comments

Gleb Natapov May 16, 2013, 12:43 p.m. UTC | #1
On Thu, May 16, 2013 at 08:17:48PM +0800, Xiao Guangrong wrote:
> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
> walk and zap all shadow pages one by one, also it need to zap all guest
> page's rmap and all shadow page's parent spte list. Particularly, things
> become worse if guest uses more memory or vcpus. It is not good for
> scalability
> 
> In this patch, we introduce a faster way to invalidate all shadow pages.
> KVM maintains a global mmu invalid generation-number which is stored in
> kvm->arch.mmu_valid_gen and every shadow page stores the current global
> generation-number into sp->mmu_valid_gen when it is created
> 
> When KVM need zap all shadow pages sptes, it just simply increase the
> global generation-number then reload root shadow pages on all vcpus.
> Vcpu will create a new shadow page table according to current kvm's
> generation-number. It ensures the old pages are not used any more.
> Then the invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
> are zapped by using lock-break technique
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    2 +
>  arch/x86/kvm/mmu.c              |   98 +++++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu.h              |    2 +
>  3 files changed, 102 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3741c65..bff7d46 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -222,6 +222,7 @@ struct kvm_mmu_page {
>  	int root_count;          /* Currently serving as active root */
>  	unsigned int unsync_children;
>  	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
> +	unsigned long mmu_valid_gen;
>  	DECLARE_BITMAP(unsync_child_bitmap, 512);
>  
>  #ifdef CONFIG_X86_32
> @@ -529,6 +530,7 @@ struct kvm_arch {
>  	unsigned int n_requested_mmu_pages;
>  	unsigned int n_max_mmu_pages;
>  	unsigned int indirect_shadow_pages;
> +	unsigned long mmu_valid_gen;
>  	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
>  	/*
>  	 * Hash table of struct kvm_mmu_page.
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 682ecb4..d9343fe 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1839,6 +1839,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
>  	__clear_sp_write_flooding_count(sp);
>  }
>  
> +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> +{
> +	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
> +}
> +
>  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>  					     gfn_t gfn,
>  					     gva_t gaddr,
> @@ -1865,6 +1870,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>  		role.quadrant = quadrant;
>  	}
>  	for_each_gfn_sp(vcpu->kvm, sp, gfn) {
> +		if (is_obsolete_sp(vcpu->kvm, sp))
> +			continue;
> +
>  		if (!need_sync && sp->unsync)
>  			need_sync = true;
>  
> @@ -1901,6 +1909,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>  
>  		account_shadowed(vcpu->kvm, gfn);
>  	}
> +	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
>  	init_shadow_page_table(sp);
>  	trace_kvm_mmu_get_page(sp, true);
>  	return sp;
> @@ -2071,8 +2080,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>  	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
>  	kvm_mmu_page_unlink_children(kvm, sp);
>  	kvm_mmu_unlink_parents(kvm, sp);
> +
>  	if (!sp->role.invalid && !sp->role.direct)
>  		unaccount_shadowed(kvm, sp->gfn);
> +
>  	if (sp->unsync)
>  		kvm_unlink_unsync_page(kvm, sp);
>  
> @@ -4196,6 +4207,93 @@ restart:
>  	spin_unlock(&kvm->mmu_lock);
>  }
>  
> +static void zap_invalid_pages(struct kvm *kvm)
> +{
> +	struct kvm_mmu_page *sp, *node;
> +	LIST_HEAD(invalid_list);
> +
> +restart:
> +	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
> +		if (!is_obsolete_sp(kvm, sp))
> +			continue;
What if we save kvm->arch.active_mmu_pages on the stack and init
kvm->arch.active_mmu_pages to be empty at the entrance to
zap_invalid_pages()? This loop will then iterate over the saved list. This
will allow us to drop the is_obsolete_sp() check and will save time since
we will not be iterating over newly created sps.

> +
> +		/*
> +		 * Do not repeatedly zap a root page to avoid unnecessary
> +		 * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
> +		 * progress:
> +		 *    vcpu 0                        vcpu 1
> +		 *                         call vcpu_enter_guest():
> +		 *                            1): handle KVM_REQ_MMU_RELOAD
> +		 *                                and require mmu-lock to
> +		 *                                load mmu
> +		 * repeat:
> +		 *    1): zap root page and
> +		 *        send KVM_REQ_MMU_RELOAD
> +		 *
> +		 *    2): if (cond_resched_lock(mmu-lock))
> +		 *
> +		 *                            2): hold mmu-lock and load mmu
> +		 *
> +		 *                            3): see KVM_REQ_MMU_RELOAD bit
> +		 *                                on vcpu->requests is set
> +		 *                                then return 1 to call
> +		 *                                vcpu_enter_guest() again.
> +		 *            goto repeat;
> +		 *
> +		 */
> +		if (sp->role.invalid)
> +			continue;
> +		/*
> +		 * Need not flush tlb since we only zap the sp with invalid
> +		 * generation number.
> +		 */
> +		if (cond_resched_lock(&kvm->mmu_lock))
> +			goto restart;
> +
> +		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
> +			goto restart;
> +	}
> +
> +	/*
> +	 * Should flush tlb before free page tables since lockless-walking
> +	 * may use the pages.
> +	 */
> +	kvm_mmu_commit_zap_page(kvm, &invalid_list);
> +}
> +
> +/*
> + * Fast invalidate all shadow pages belong to @slot.
> + *
> + * @slot != NULL means the invalidation is caused the memslot specified
> + * by @slot is being deleted, in this case, we should ensure that rmap
> + * and lpage-info of the @slot can not be used after calling the function.
> + *
> + * @slot == NULL means the invalidation due to other reasons, we need
> + * not care rmap and lpage-info since they are still valid after calling
> + * the function.
> + */
> +void kvm_mmu_invalidate_memslot_pages(struct kvm *kvm,
> +				      struct kvm_memory_slot *slot)
> +{
> +	spin_lock(&kvm->mmu_lock);
> +	kvm->arch.mmu_valid_gen++;
> +
> +	/*
> +	 * Notify all vcpus to reload its shadow page table
> +	 * and flush TLB. Then all vcpus will switch to new
> +	 * shadow page table with the new mmu_valid_gen.
> +	 *
> +	 * Note: we should do this under the protection of
> +	 * mmu-lock, otherwise, vcpu would purge shadow page
> +	 * but miss tlb flush.
> +	 */
> +	kvm_reload_remote_mmus(kvm);
> +
> +	if (slot)
> +		zap_invalid_pages(kvm);
> +	spin_unlock(&kvm->mmu_lock);
> +}
> +
>  void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
>  {
>  	struct kvm_mmu_page *sp, *node;
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 2adcbc2..bd57466 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -97,4 +97,6 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
>  	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
>  }
>  
> +void kvm_mmu_invalidate_memslot_pages(struct kvm *kvm,
> +				      struct kvm_memory_slot *slot);
>  #endif
> -- 
> 1.7.7.6

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini May 16, 2013, 1:14 p.m. UTC | #2
Il 16/05/2013 14:43, Gleb Natapov ha scritto:
>> > +restart:
>> > +	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
>> > +		if (!is_obsolete_sp(kvm, sp))
>> > +			continue;
> What if we save kvm->arch.active_mmu_pages on the stack and init
> kvm->arch.active_mmu_pages to be empty at the entrance to
> zap_invalid_pages(). This loop will iterate over saved list. This will
> allow us to drop the is_obsolete_sp() check and will save time since we
> will not be iterating over newly created sps.
> 

But when you add cond_resched_lock a thread may want to zap pages itself
(e.g. from prepare_zap_oldest_mmu_page) and it won't find them.

Here is another proposal...  The idea is to avoid looking at new pages
more than necessary after a "goto restart".

Basically, you alternate between two phases:

- look for pages to be zapped, group them together

- zap the pages

Something like:

      moved = 0;
restart:
      zapping = true;
      for each page in active_mmu_pages [reverse and safe] {
             if (!is_obsolete || invalid) {
                 /*
                  * Found a new page, stop zapping for now and
                  * try to segregate the invalid ones at one end
                  * of the list.
                  */
                 zapping = false;
                 continue;
             }

             if (batch > 10 && ...) {
                 cond_resched_lock
                 batch = 0;
                 goto restart;
             }

             if (!zapping) {
                 /*
                  * Segregate pages to one end of the list where
                  * new pages don't get in the way.
                  */
                 list_move_tail(page, active_mmu_pages)
                 batch++; /* or maybe not? */
                 moved++;
             } else {
                 batch += prepare_zap_page
                 goto restart;
             }
     }

     /* Need another pass to look at segregated pages?  */
     if (moved) {
         moved = 0;
         goto restart;
     }
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong May 16, 2013, 1:25 p.m. UTC | #3
On 05/16/2013 08:43 PM, Gleb Natapov wrote:
> On Thu, May 16, 2013 at 08:17:48PM +0800, Xiao Guangrong wrote:
>> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
>> walk and zap all shadow pages one by one, also it need to zap all guest
>> page's rmap and all shadow page's parent spte list. Particularly, things
>> become worse if guest uses more memory or vcpus. It is not good for
>> scalability
>>
>> In this patch, we introduce a faster way to invalidate all shadow pages.
>> KVM maintains a global mmu invalid generation-number which is stored in
>> kvm->arch.mmu_valid_gen and every shadow page stores the current global
>> generation-number into sp->mmu_valid_gen when it is created
>>
>> When KVM need zap all shadow pages sptes, it just simply increase the
>> global generation-number then reload root shadow pages on all vcpus.
>> Vcpu will create a new shadow page table according to current kvm's
>> generation-number. It ensures the old pages are not used any more.
>> Then the invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
>> are zapped by using lock-break technique
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h |    2 +
>>  arch/x86/kvm/mmu.c              |   98 +++++++++++++++++++++++++++++++++++++++
>>  arch/x86/kvm/mmu.h              |    2 +
>>  3 files changed, 102 insertions(+), 0 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 3741c65..bff7d46 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -222,6 +222,7 @@ struct kvm_mmu_page {
>>  	int root_count;          /* Currently serving as active root */
>>  	unsigned int unsync_children;
>>  	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
>> +	unsigned long mmu_valid_gen;
>>  	DECLARE_BITMAP(unsync_child_bitmap, 512);
>>  
>>  #ifdef CONFIG_X86_32
>> @@ -529,6 +530,7 @@ struct kvm_arch {
>>  	unsigned int n_requested_mmu_pages;
>>  	unsigned int n_max_mmu_pages;
>>  	unsigned int indirect_shadow_pages;
>> +	unsigned long mmu_valid_gen;
>>  	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
>>  	/*
>>  	 * Hash table of struct kvm_mmu_page.
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 682ecb4..d9343fe 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -1839,6 +1839,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
>>  	__clear_sp_write_flooding_count(sp);
>>  }
>>  
>> +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>> +{
>> +	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
>> +}
>> +
>>  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>>  					     gfn_t gfn,
>>  					     gva_t gaddr,
>> @@ -1865,6 +1870,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>>  		role.quadrant = quadrant;
>>  	}
>>  	for_each_gfn_sp(vcpu->kvm, sp, gfn) {
>> +		if (is_obsolete_sp(vcpu->kvm, sp))
>> +			continue;
>> +
>>  		if (!need_sync && sp->unsync)
>>  			need_sync = true;
>>  
>> @@ -1901,6 +1909,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>>  
>>  		account_shadowed(vcpu->kvm, gfn);
>>  	}
>> +	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
>>  	init_shadow_page_table(sp);
>>  	trace_kvm_mmu_get_page(sp, true);
>>  	return sp;
>> @@ -2071,8 +2080,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>>  	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
>>  	kvm_mmu_page_unlink_children(kvm, sp);
>>  	kvm_mmu_unlink_parents(kvm, sp);
>> +
>>  	if (!sp->role.invalid && !sp->role.direct)
>>  		unaccount_shadowed(kvm, sp->gfn);
>> +
>>  	if (sp->unsync)
>>  		kvm_unlink_unsync_page(kvm, sp);
>>  
>> @@ -4196,6 +4207,93 @@ restart:
>>  	spin_unlock(&kvm->mmu_lock);
>>  }
>>  
>> +static void zap_invalid_pages(struct kvm *kvm)
>> +{
>> +	struct kvm_mmu_page *sp, *node;
>> +	LIST_HEAD(invalid_list);
>> +
>> +restart:
>> +	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
>> +		if (!is_obsolete_sp(kvm, sp))
>> +			continue;
> What if we save kvm->arch.active_mmu_pages on the stack and init
> kvm->arch.active_mmu_pages to be empty at the entrance to
> zap_invalid_pages(). This loop will iterate over saved list. This will
> allow us to drop the is_obsolete_sp() check and will save time since we
> will not be iterating over newly created sps.

This idea is really smart.

It also seems tricky: a vcpu can still see the page in its page table and hash
table even though it has already been deleted from kvm->active_list, but I do
not see any issue.

Hmm, can we walk kvm->active_mmu_pages from tail to head and then stop walking
when we meet an sp with sp->valid_gen == kvm->valid_gen? This way we can also
skip walking newly created sps, and it is more straightforward.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov May 16, 2013, 1:41 p.m. UTC | #4
On Thu, May 16, 2013 at 03:14:35PM +0200, Paolo Bonzini wrote:
> Il 16/05/2013 14:43, Gleb Natapov ha scritto:
> >> > +restart:
> >> > +	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
> >> > +		if (!is_obsolete_sp(kvm, sp))
> >> > +			continue;
> > What if we save kvm->arch.active_mmu_pages on the stack and init
> > kvm->arch.active_mmu_pages to be empty at the entrance to
> > zap_invalid_pages(). This loop will iterate over saved list. This will
> > allow us to drop the is_obsolete_sp() check and will save time since we
> > will not be iterating over newly created sps.
> > 
> 
> But when you add cond_resched_lock a thread may want to zap pages itself
> (e.g. from prepare_zap_oldest_mmu_page) and it won't find them.
> 
Yes, this will break mmu page accounting. We can make
prepare_zap_oldest_mmu_page() wait while zap_invalid_pages()
frees the needed amount of pages if one is in progress.

> Here is another proposal...  The idea is to avoid looking at new pages
> more than necessary after a "goto restart".
> 
> Basically, you alternate between two phases:
> 
> - look for pages to be zapped, group them together
> 
> - zap the pages
> 
> Something like:
> 
>       moved = 0;
> restart:
>       zapping = true;
>       for each page in active_mmu_pages [reverse and safe] {
>              if (!is_obsolete || invalid) {
>                  /*
>                   * Found a new page, stop zapping for now and
>                   * try to segregate the invalid ones at one end
>                   * of the list.
>                   */
>                  zapping = false;
>                  continue;
>              }
> 
>              if (batch > 10 && ...) {
>                  cond_resched_lock
>                  batch = 0;
>                  goto restart;
>              }
> 
>              if (!zapping) {
>                  /*
>                   * Segregate pages to one end of the list where
>                   * new pages don't get in the way.
>                   */
>                  list_move_tail(page, active_mmu_pages)
>                  batch++; /* or maybe not? */
>                  moved++;
>              } else {
>                  batch += prepare_zap_page
>                  goto restart;
>              }
>      }
> 
>      /* Need another pass to look at segregated pages?  */
>      if (moved) {
>          moved = 0;
>          goto restart;
>      }
Not sure what you are trying to achieve with the "moved" tricks. Just
walking the list from the end and stopping at the first valid sp should be
enough since the active_mmu_pages list is a FIFO right now.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov May 16, 2013, 1:43 p.m. UTC | #5
On Thu, May 16, 2013 at 09:25:28PM +0800, Xiao Guangrong wrote:
> On 05/16/2013 08:43 PM, Gleb Natapov wrote:
> > On Thu, May 16, 2013 at 08:17:48PM +0800, Xiao Guangrong wrote:
> >> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
> >> walk and zap all shadow pages one by one, also it need to zap all guest
> >> page's rmap and all shadow page's parent spte list. Particularly, things
> >> become worse if guest uses more memory or vcpus. It is not good for
> >> scalability
> >>
> >> In this patch, we introduce a faster way to invalidate all shadow pages.
> >> KVM maintains a global mmu invalid generation-number which is stored in
> >> kvm->arch.mmu_valid_gen and every shadow page stores the current global
> >> generation-number into sp->mmu_valid_gen when it is created
> >>
> >> When KVM need zap all shadow pages sptes, it just simply increase the
> >> global generation-number then reload root shadow pages on all vcpus.
> >> Vcpu will create a new shadow page table according to current kvm's
> >> generation-number. It ensures the old pages are not used any more.
> >> Then the invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
> >> are zapped by using lock-break technique
> >>
> >> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >> ---
> >>  arch/x86/include/asm/kvm_host.h |    2 +
> >>  arch/x86/kvm/mmu.c              |   98 +++++++++++++++++++++++++++++++++++++++
> >>  arch/x86/kvm/mmu.h              |    2 +
> >>  3 files changed, 102 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index 3741c65..bff7d46 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -222,6 +222,7 @@ struct kvm_mmu_page {
> >>  	int root_count;          /* Currently serving as active root */
> >>  	unsigned int unsync_children;
> >>  	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
> >> +	unsigned long mmu_valid_gen;
> >>  	DECLARE_BITMAP(unsync_child_bitmap, 512);
> >>  
> >>  #ifdef CONFIG_X86_32
> >> @@ -529,6 +530,7 @@ struct kvm_arch {
> >>  	unsigned int n_requested_mmu_pages;
> >>  	unsigned int n_max_mmu_pages;
> >>  	unsigned int indirect_shadow_pages;
> >> +	unsigned long mmu_valid_gen;
> >>  	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
> >>  	/*
> >>  	 * Hash table of struct kvm_mmu_page.
> >> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >> index 682ecb4..d9343fe 100644
> >> --- a/arch/x86/kvm/mmu.c
> >> +++ b/arch/x86/kvm/mmu.c
> >> @@ -1839,6 +1839,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
> >>  	__clear_sp_write_flooding_count(sp);
> >>  }
> >>  
> >> +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> >> +{
> >> +	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
> >> +}
> >> +
> >>  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> >>  					     gfn_t gfn,
> >>  					     gva_t gaddr,
> >> @@ -1865,6 +1870,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> >>  		role.quadrant = quadrant;
> >>  	}
> >>  	for_each_gfn_sp(vcpu->kvm, sp, gfn) {
> >> +		if (is_obsolete_sp(vcpu->kvm, sp))
> >> +			continue;
> >> +
> >>  		if (!need_sync && sp->unsync)
> >>  			need_sync = true;
> >>  
> >> @@ -1901,6 +1909,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> >>  
> >>  		account_shadowed(vcpu->kvm, gfn);
> >>  	}
> >> +	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
> >>  	init_shadow_page_table(sp);
> >>  	trace_kvm_mmu_get_page(sp, true);
> >>  	return sp;
> >> @@ -2071,8 +2080,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
> >>  	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
> >>  	kvm_mmu_page_unlink_children(kvm, sp);
> >>  	kvm_mmu_unlink_parents(kvm, sp);
> >> +
> >>  	if (!sp->role.invalid && !sp->role.direct)
> >>  		unaccount_shadowed(kvm, sp->gfn);
> >> +
> >>  	if (sp->unsync)
> >>  		kvm_unlink_unsync_page(kvm, sp);
> >>  
> >> @@ -4196,6 +4207,93 @@ restart:
> >>  	spin_unlock(&kvm->mmu_lock);
> >>  }
> >>  
> >> +static void zap_invalid_pages(struct kvm *kvm)
> >> +{
> >> +	struct kvm_mmu_page *sp, *node;
> >> +	LIST_HEAD(invalid_list);
> >> +
> >> +restart:
> >> +	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
> >> +		if (!is_obsolete_sp(kvm, sp))
> >> +			continue;
> > What if we save kvm->arch.active_mmu_pages on the stack and init
> > kvm->arch.active_mmu_pages to be empty at the entrance to
> > zap_invalid_pages(). This loop will iterate over saved list. This will
> > allow us to drop the is_obsolete_sp() check and will save time since we
> > will not be iterating over newly created sps.
> 
> This idea is really smart.
> 
> It also seems tricky, vcpu can see the page in its page table and hash table but
> it has already been deleted from kvm->active_list, but i do not see any issue.
> 
Paolo pointed out that it breaks mmu page accounting. It can be solved, but
not trivially.

> Hmm, can we walk kvm->ative_mmu_pages from tail to head then break the walking
> if we meet the sp->valid_gen == kvm->valid_gen? This way also can skip walking
> new created sps and more straight.
> 
Yes, that should be better than walking it from the start each time.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini May 16, 2013, 1:49 p.m. UTC | #6
Il 16/05/2013 15:41, Gleb Natapov ha scritto:
> On Thu, May 16, 2013 at 03:14:35PM +0200, Paolo Bonzini wrote:
>> Il 16/05/2013 14:43, Gleb Natapov ha scritto:
>>>>> +restart:
>>>>> +	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
>>>>> +		if (!is_obsolete_sp(kvm, sp))
>>>>> +			continue;
>>> What if we save kvm->arch.active_mmu_pages on the stack and init
>>> kvm->arch.active_mmu_pages to be empty at the entrance to
>>> zap_invalid_pages(). This loop will iterate over saved list. This will
>>> allow us to drop the is_obsolete_sp() check and will save time since we
>>> will not be iterating over newly created sps.
>>>
>>
>> But when you add cond_resched_lock a thread may want to zap pages itself
>> (e.g. from prepare_zap_oldest_mmu_page) and it won't find them.
>>
> Yes, this will break mmu pages accounting. We can make
> prepare_zap_oldest_mmu_page() wait while zap_invalid_pages()
> frees needed amount of pages if one is in progress.
> 
>> Here is another proposal...  The idea is to avoid looking at new pages
>> more than necessary after a "goto restart".
>>
>> Basically, you alternate between two phases:
>>
>> - look for pages to be zapped, group them together
>>
>> - zap the pages
>>
>> Something like:
>>
>>       moved = 0;
>> restart:
>>       zapping = true;
>>       for each page in active_mmu_pages [reverse and safe] {
>>              if (!is_obsolete || invalid) {
>>                  /*
>>                   * Found a new page, stop zapping for now and
>>                   * try to segregate the invalid ones at one end
>>                   * of the list.
>>                   */
>>                  zapping = false;
>>                  continue;
>>              }
>>
>>              if (batch > 10 && ...) {
>>                  cond_resched_lock
>>                  batch = 0;
>>                  goto restart;
>>              }
>>
>>              if (!zapping) {
>>                  /*
>>                   * Segregate pages to one end of the list where
>>                   * new pages don't get in the way.
>>                   */
>>                  list_move_tail(page, active_mmu_pages)
>>                  batch++; /* or maybe not? */
>>                  moved++;
>>              } else {
>>                  batch += prepare_zap_page
>>                  goto restart;
>>              }
>>      }
>>
>>      /* Need another pass to look at segregated pages?  */
>>      if (moved) {
>>          moved = 0;
>>          goto restart;
>>      }
> Not sure what are you trying to achieve with "moved" tricks. Just
> walking the list from the end and stopping on first valid sp should be
> enough since active_mmu_pages list is a FIFO right now.

Right, I missed that "sp->role.invalid = 1" will ensure anyway that
pages are visited at most twice.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov May 16, 2013, 3:57 p.m. UTC | #7
On Thu, May 16, 2013 at 04:43:21PM +0300, Gleb Natapov wrote:
> On Thu, May 16, 2013 at 09:25:28PM +0800, Xiao Guangrong wrote:
> > On 05/16/2013 08:43 PM, Gleb Natapov wrote:
> > > On Thu, May 16, 2013 at 08:17:48PM +0800, Xiao Guangrong wrote:
> > >> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
> > >> walk and zap all shadow pages one by one, also it need to zap all guest
> > >> page's rmap and all shadow page's parent spte list. Particularly, things
> > >> become worse if guest uses more memory or vcpus. It is not good for
> > >> scalability
> > >>
> > >> In this patch, we introduce a faster way to invalidate all shadow pages.
> > >> KVM maintains a global mmu invalid generation-number which is stored in
> > >> kvm->arch.mmu_valid_gen and every shadow page stores the current global
> > >> generation-number into sp->mmu_valid_gen when it is created
> > >>
> > >> When KVM need zap all shadow pages sptes, it just simply increase the
> > >> global generation-number then reload root shadow pages on all vcpus.
> > >> Vcpu will create a new shadow page table according to current kvm's
> > >> generation-number. It ensures the old pages are not used any more.
> > >> Then the invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
> > >> are zapped by using lock-break technique
> > >>
> > >> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> > >> ---
> > >>  arch/x86/include/asm/kvm_host.h |    2 +
> > >>  arch/x86/kvm/mmu.c              |   98 +++++++++++++++++++++++++++++++++++++++
> > >>  arch/x86/kvm/mmu.h              |    2 +
> > >>  3 files changed, 102 insertions(+), 0 deletions(-)
> > >>
> > >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > >> index 3741c65..bff7d46 100644
> > >> --- a/arch/x86/include/asm/kvm_host.h
> > >> +++ b/arch/x86/include/asm/kvm_host.h
> > >> @@ -222,6 +222,7 @@ struct kvm_mmu_page {
> > >>  	int root_count;          /* Currently serving as active root */
> > >>  	unsigned int unsync_children;
> > >>  	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
> > >> +	unsigned long mmu_valid_gen;
> > >>  	DECLARE_BITMAP(unsync_child_bitmap, 512);
> > >>  
> > >>  #ifdef CONFIG_X86_32
> > >> @@ -529,6 +530,7 @@ struct kvm_arch {
> > >>  	unsigned int n_requested_mmu_pages;
> > >>  	unsigned int n_max_mmu_pages;
> > >>  	unsigned int indirect_shadow_pages;
> > >> +	unsigned long mmu_valid_gen;
> > >>  	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
> > >>  	/*
> > >>  	 * Hash table of struct kvm_mmu_page.
> > >> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > >> index 682ecb4..d9343fe 100644
> > >> --- a/arch/x86/kvm/mmu.c
> > >> +++ b/arch/x86/kvm/mmu.c
> > >> @@ -1839,6 +1839,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
> > >>  	__clear_sp_write_flooding_count(sp);
> > >>  }
> > >>  
> > >> +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> > >> +{
> > >> +	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
> > >> +}
> > >> +
> > >>  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > >>  					     gfn_t gfn,
> > >>  					     gva_t gaddr,
> > >> @@ -1865,6 +1870,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > >>  		role.quadrant = quadrant;
> > >>  	}
> > >>  	for_each_gfn_sp(vcpu->kvm, sp, gfn) {
> > >> +		if (is_obsolete_sp(vcpu->kvm, sp))
> > >> +			continue;
> > >> +
> > >>  		if (!need_sync && sp->unsync)
> > >>  			need_sync = true;
> > >>  
> > >> @@ -1901,6 +1909,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > >>  
> > >>  		account_shadowed(vcpu->kvm, gfn);
> > >>  	}
> > >> +	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
> > >>  	init_shadow_page_table(sp);
> > >>  	trace_kvm_mmu_get_page(sp, true);
> > >>  	return sp;
> > >> @@ -2071,8 +2080,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
> > >>  	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
> > >>  	kvm_mmu_page_unlink_children(kvm, sp);
> > >>  	kvm_mmu_unlink_parents(kvm, sp);
> > >> +
> > >>  	if (!sp->role.invalid && !sp->role.direct)
> > >>  		unaccount_shadowed(kvm, sp->gfn);
> > >> +
> > >>  	if (sp->unsync)
> > >>  		kvm_unlink_unsync_page(kvm, sp);
> > >>  
> > >> @@ -4196,6 +4207,93 @@ restart:
> > >>  	spin_unlock(&kvm->mmu_lock);
> > >>  }
> > >>  
> > >> +static void zap_invalid_pages(struct kvm *kvm)
> > >> +{
> > >> +	struct kvm_mmu_page *sp, *node;
> > >> +	LIST_HEAD(invalid_list);
> > >> +
> > >> +restart:
> > >> +	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
> > >> +		if (!is_obsolete_sp(kvm, sp))
> > >> +			continue;
> > > What if we save kvm->arch.active_mmu_pages on the stack and init
> > > kvm->arch.active_mmu_pages to be empty at the entrance to
> > > zap_invalid_pages(). This loop will iterate over saved list. This will
> > > allow us to drop the is_obsolete_sp() check and will save time since we
> > > will not be iterating over newly created sps.
> > 
> > This idea is really smart.
> > 
> > It also seems tricky, vcpu can see the page in its page table and hash table but
> > it has already been deleted from kvm->active_list, but i do not see any issue.
> > 
> Paolo pointed that it breaks mmu pages accounting. Can be solved, but
> not trivial.
> 
> > Hmm, can we walk kvm->ative_mmu_pages from tail to head then break the walking
> > if we meet the sp->valid_gen == kvm->valid_gen? This way also can skip walking
> > new created sps and more straight.
> > 
> Yes, that should be better than walking it from the start each time.
> 
One more thought. With the current patch, if zap_invalid_pages() is
called a second time while another zap_invalid_pages() is still running
(can that happen?), they will both run concurrently, fighting for the
mmu_lock. Is this a problem?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov May 16, 2013, 4:18 p.m. UTC | #8
On Thu, May 16, 2013 at 08:17:48PM +0800, Xiao Guangrong wrote:
> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
> walk and zap all shadow pages one by one, also it need to zap all guest
> page's rmap and all shadow page's parent spte list. Particularly, things
> become worse if guest uses more memory or vcpus. It is not good for
> scalability
> 
> In this patch, we introduce a faster way to invalidate all shadow pages.
> KVM maintains a global mmu invalid generation-number which is stored in
> kvm->arch.mmu_valid_gen and every shadow page stores the current global
> generation-number into sp->mmu_valid_gen when it is created
> 
> When KVM need zap all shadow pages sptes, it just simply increase the
> global generation-number then reload root shadow pages on all vcpus.
> Vcpu will create a new shadow page table according to current kvm's
> generation-number. It ensures the old pages are not used any more.
> Then the invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
> are zapped by using lock-break technique
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    2 +
>  arch/x86/kvm/mmu.c              |   98 +++++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu.h              |    2 +
>  3 files changed, 102 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3741c65..bff7d46 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -222,6 +222,7 @@ struct kvm_mmu_page {
>  	int root_count;          /* Currently serving as active root */
>  	unsigned int unsync_children;
>  	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
> +	unsigned long mmu_valid_gen;
>  	DECLARE_BITMAP(unsync_child_bitmap, 512);
>  
>  #ifdef CONFIG_X86_32
> @@ -529,6 +530,7 @@ struct kvm_arch {
>  	unsigned int n_requested_mmu_pages;
>  	unsigned int n_max_mmu_pages;
>  	unsigned int indirect_shadow_pages;
> +	unsigned long mmu_valid_gen;
>  	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
>  	/*
>  	 * Hash table of struct kvm_mmu_page.
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 682ecb4..d9343fe 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1839,6 +1839,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
>  	__clear_sp_write_flooding_count(sp);
>  }
>  
> +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> +{
> +	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
> +}
> +
>  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>  					     gfn_t gfn,
>  					     gva_t gaddr,
> @@ -1865,6 +1870,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>  		role.quadrant = quadrant;
>  	}
>  	for_each_gfn_sp(vcpu->kvm, sp, gfn) {
> +		if (is_obsolete_sp(vcpu->kvm, sp))
> +			continue;
> +
>  		if (!need_sync && sp->unsync)
>  			need_sync = true;
>  
> @@ -1901,6 +1909,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>  
>  		account_shadowed(vcpu->kvm, gfn);
>  	}
> +	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
>  	init_shadow_page_table(sp);
>  	trace_kvm_mmu_get_page(sp, true);
>  	return sp;
> @@ -2071,8 +2080,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>  	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
>  	kvm_mmu_page_unlink_children(kvm, sp);
>  	kvm_mmu_unlink_parents(kvm, sp);
> +
>  	if (!sp->role.invalid && !sp->role.direct)
>  		unaccount_shadowed(kvm, sp->gfn);
> +
>  	if (sp->unsync)
>  		kvm_unlink_unsync_page(kvm, sp);
>  
> @@ -4196,6 +4207,93 @@ restart:
>  	spin_unlock(&kvm->mmu_lock);
>  }
>  
> +static void zap_invalid_pages(struct kvm *kvm)
> +{
> +	struct kvm_mmu_page *sp, *node;
> +	LIST_HEAD(invalid_list);
> +
> +restart:
> +	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
> +		if (!is_obsolete_sp(kvm, sp))
> +			continue;
> +
> +		/*
> +		 * Do not repeatedly zap a root page to avoid unnecessary
> +		 * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
> +		 * progress:
> +		 *    vcpu 0                        vcpu 1
> +		 *                         call vcpu_enter_guest():
> +		 *                            1): handle KVM_REQ_MMU_RELOAD
> +		 *                                and require mmu-lock to
> +		 *                                load mmu
> +		 * repeat:
> +		 *    1): zap root page and
> +		 *        send KVM_REQ_MMU_RELOAD
> +		 *
> +		 *    2): if (cond_resched_lock(mmu-lock))
> +		 *
> +		 *                            2): hold mmu-lock and load mmu
> +		 *
> +		 *                            3): see KVM_REQ_MMU_RELOAD bit
> +		 *                                on vcpu->requests is set
> +		 *                                then return 1 to call
> +		 *                                vcpu_enter_guest() again.
> +		 *            goto repeat;
> +		 *
> +		 */
> +		if (sp->role.invalid)
> +			continue;
> +		/*
> +		 * Need not flush tlb since we only zap the sp with invalid
> +		 * generation number.
> +		 */
> +		if (cond_resched_lock(&kvm->mmu_lock))
> +			goto restart;
> +
> +		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
> +			goto restart;
> +	}
> +
> +	/*
> +	 * Should flush tlb before free page tables since lockless-walking
> +	 * may use the pages.
> +	 */
> +	kvm_mmu_commit_zap_page(kvm, &invalid_list);
> +}
> +
> +/*
> + * Fast invalidate all shadow pages belong to @slot.
> + *
> + * @slot != NULL means the invalidation is caused the memslot specified
> + * by @slot is being deleted, in this case, we should ensure that rmap
> + * and lpage-info of the @slot can not be used after calling the function.
> + *
> + * @slot == NULL means the invalidation due to other reasons, we need
> + * not care rmap and lpage-info since they are still valid after calling
> + * the function.
> + */
> +void kvm_mmu_invalidate_memslot_pages(struct kvm *kvm,
> +				      struct kvm_memory_slot *slot)

Why pass "slot" here? If we want the function to sometimes wait for the
purge and sometimes not, the more straightforward way is to have a
"bool wait" parameter instead.

> +{
> +	spin_lock(&kvm->mmu_lock);
> +	kvm->arch.mmu_valid_gen++;
> +
> +	/*
> +	 * Notify all vcpus to reload its shadow page table
> +	 * and flush TLB. Then all vcpus will switch to new
> +	 * shadow page table with the new mmu_valid_gen.
> +	 *
> +	 * Note: we should do this under the protection of
> +	 * mmu-lock, otherwise, vcpu would purge shadow page
> +	 * but miss tlb flush.
> +	 */
> +	kvm_reload_remote_mmus(kvm);
> +
> +	if (slot)
> +		zap_invalid_pages(kvm);
> +	spin_unlock(&kvm->mmu_lock);
> +}
> +
>  void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
>  {
>  	struct kvm_mmu_page *sp, *node;
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 2adcbc2..bd57466 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -97,4 +97,6 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
>  	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
>  }
>  
> +void kvm_mmu_invalidate_memslot_pages(struct kvm *kvm,
> +				      struct kvm_memory_slot *slot);
>  #endif
> -- 
> 1.7.7.6

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong May 16, 2013, 6:39 p.m. UTC | #9
On 05/16/2013 11:57 PM, Gleb Natapov wrote:

> One more thought. With current patch if zap_invalid_page() will be
> called second time while another zap_invalid_page() is still running
> (can that happen?) they will both run concurrently fighting for the

Currently, it cannot happen, since zap_invalid_pages() is needed when a slot
is being deleted, which is protected by the slot-lock.

But we allow it to be concurrent, as you commented: we can use it in
->release() instead of calling kvm_mmu_zap_all(); in that case, multiple
concurrent calls to zap_invalid_pages() can happen.

> mmu_lock. Is this a problem?

Are you worried that it cannot make progress due to lock contention when
walking the active_list? Zapping at least 10 pages before releasing the lock
should ensure that it can make progress.

Do you see any potential issue?



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong May 16, 2013, 6:40 p.m. UTC | #10
On 05/17/2013 12:18 AM, Gleb Natapov wrote:

>> +
>> +/*
>> + * Fast invalidate all shadow pages belong to @slot.
>> + *
>> + * @slot != NULL means the invalidation is caused the memslot specified
>> + * by @slot is being deleted, in this case, we should ensure that rmap
>> + * and lpage-info of the @slot can not be used after calling the function.
>> + *
>> + * @slot == NULL means the invalidation due to other reasons, we need
>> + * not care rmap and lpage-info since they are still valid after calling
>> + * the function.
>> + */
>> +void kvm_mmu_invalidate_memslot_pages(struct kvm *kvm,
>> +				      struct kvm_memory_slot *slot)
> 
> Why pass "slot" here? If we want the function to sometimes wait for purge
> and sometimes not the more straightforward way is to have a "bool wait"
> parameter instead.
> 

That's my fault; I forgot to update it. I will use 'bool zap_invalid_pages'
instead.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov May 16, 2013, 7:57 p.m. UTC | #11
On Fri, May 17, 2013 at 02:39:07AM +0800, Xiao Guangrong wrote:
> On 05/16/2013 11:57 PM, Gleb Natapov wrote:
> 
> > One more thought. With current patch if zap_invalid_page() will be
> > called second time while another zap_invalid_page() is still running
> > (can that happen?) they will both run concurrently fighting for the
> 
> Currently, it can not happen since zap_invalid_page is needed when slot
> is being deleted which protected by slot-lock.
> 
> But we allow it to be concurrent as you commented: we can use it in
> ->release() instead of calling kvm_mmu_zap_all(), in that case, multiple
> call zap_invalid_page() can happen.
> 
That's only during VM destruction; no need to optimize for it.

> > mmu_lock. Is this a problem?
> 
> Are you worry about that it can not progress due to lock contention when
> walking active_list? Zapping at least 10 pages before releasing the lock
> should ensure that it can progress.
Yes, it will progress, but it will bounce between two threads after every 10
pages. This is less efficient than letting one thread finish zapping.
Theoretically, we can make one thread wait for the other.

> 
> Do you see any potential issue?
> 
Just thinking out loud :). If this cannot happen during normal
operation, no reason to optimize for it.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3741c65..bff7d46 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -222,6 +222,7 @@  struct kvm_mmu_page {
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
+	unsigned long mmu_valid_gen;
 	DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -529,6 +530,7 @@  struct kvm_arch {
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_max_mmu_pages;
 	unsigned int indirect_shadow_pages;
+	unsigned long mmu_valid_gen;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 682ecb4..d9343fe 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1839,6 +1839,11 @@  static void clear_sp_write_flooding_count(u64 *spte)
 	__clear_sp_write_flooding_count(sp);
 }
 
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -1865,6 +1870,9 @@  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		role.quadrant = quadrant;
 	}
 	for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+		if (is_obsolete_sp(vcpu->kvm, sp))
+			continue;
+
 		if (!need_sync && sp->unsync)
 			need_sync = true;
 
@@ -1901,6 +1909,7 @@  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
 		account_shadowed(vcpu->kvm, gfn);
 	}
+	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	init_shadow_page_table(sp);
 	trace_kvm_mmu_get_page(sp, true);
 	return sp;
@@ -2071,8 +2080,10 @@  static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
+
 	if (!sp->role.invalid && !sp->role.direct)
 		unaccount_shadowed(kvm, sp->gfn);
+
 	if (sp->unsync)
 		kvm_unlink_unsync_page(kvm, sp);
 
@@ -4196,6 +4207,93 @@  restart:
 	spin_unlock(&kvm->mmu_lock);
 }
 
+static void zap_invalid_pages(struct kvm *kvm)
+{
+	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
+
+restart:
+	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+		if (!is_obsolete_sp(kvm, sp))
+			continue;
+
+		/*
+		 * Do not repeatedly zap a root page to avoid unnecessary
+		 * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
+		 * progress:
+		 *    vcpu 0                        vcpu 1
+		 *                         call vcpu_enter_guest():
+		 *                            1): handle KVM_REQ_MMU_RELOAD
+		 *                                and require mmu-lock to
+		 *                                load mmu
+		 * repeat:
+		 *    1): zap root page and
+		 *        send KVM_REQ_MMU_RELOAD
+		 *
+		 *    2): if (cond_resched_lock(mmu-lock))
+		 *
+		 *                            2): hold mmu-lock and load mmu
+		 *
+		 *                            3): see KVM_REQ_MMU_RELOAD bit
+		 *                                on vcpu->requests is set
+		 *                                then return 1 to call
+		 *                                vcpu_enter_guest() again.
+		 *            goto repeat;
+		 *
+		 */
+		if (sp->role.invalid)
+			continue;
+		/*
+		 * Need not flush tlb since we only zap the sp with invalid
+		 * generation number.
+		 */
+		if (cond_resched_lock(&kvm->mmu_lock))
+			goto restart;
+
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+			goto restart;
+	}
+
+	/*
+	 * Should flush tlb before free page tables since lockless-walking
+	 * may use the pages.
+	 */
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+}
+
+/*
+ * Fast invalidate all shadow pages belong to @slot.
+ *
+ * @slot != NULL means the invalidation is caused the memslot specified
+ * by @slot is being deleted, in this case, we should ensure that rmap
+ * and lpage-info of the @slot can not be used after calling the function.
+ *
+ * @slot == NULL means the invalidation due to other reasons, we need
+ * not care rmap and lpage-info since they are still valid after calling
+ * the function.
+ */
+void kvm_mmu_invalidate_memslot_pages(struct kvm *kvm,
+				      struct kvm_memory_slot *slot)
+{
+	spin_lock(&kvm->mmu_lock);
+	kvm->arch.mmu_valid_gen++;
+
+	/*
+	 * Notify all vcpus to reload its shadow page table
+	 * and flush TLB. Then all vcpus will switch to new
+	 * shadow page table with the new mmu_valid_gen.
+	 *
+	 * Note: we should do this under the protection of
+	 * mmu-lock, otherwise, vcpu would purge shadow page
+	 * but miss tlb flush.
+	 */
+	kvm_reload_remote_mmus(kvm);
+
+	if (slot)
+		zap_invalid_pages(kvm);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2adcbc2..bd57466 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -97,4 +97,6 @@  static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
 	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
 }
 
+void kvm_mmu_invalidate_memslot_pages(struct kvm *kvm,
+				      struct kvm_memory_slot *slot);
 #endif