
[6/6] KVM: MMU: fast zap all shadow pages

Message ID 514007A0.1040400@linux.vnet.ibm.com (mailing list archive)
State New, archived

Commit Message

Xiao Guangrong March 13, 2013, 4:59 a.m. UTC
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while it
walks and zaps all shadow pages one by one, and it also needs to zap
every guest page's rmap and every shadow page's parent spte list.
Things get worse as the guest uses more memory or vcpus, so it does not
scale well.

Since all shadow pages will be zapped, we can directly zap the mmu-cache
and rmap so that vcpus fault on the new mmu-cache; after that, we can
directly free the memory used by the old mmu-cache.

The root shadow pages are a little special since they are currently in
use by vcpus and can not be freed directly. So we zap the root shadow
pages and re-add them to the new mmu-cache.

After this patch, kvm_mmu_zap_all is 113% faster than before.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 56 insertions(+), 6 deletions(-)

Comments

Marcelo Tosatti March 14, 2013, 1:07 a.m. UTC | #1
On Wed, Mar 13, 2013 at 12:59:12PM +0800, Xiao Guangrong wrote:
> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
> walk and zap all shadow pages one by one, also it need to zap all guest
> page's rmap and all shadow page's parent spte list. Particularly, things
> become worse if guest uses more memory or vcpus. It is not good for
> scalability.
> 
> Since all shadow page will be zapped, we can directly zap the mmu-cache
> and rmap so that vcpu will fault on the new mmu-cache, after that, we can
> directly free the memory used by old mmu-cache.
> 
> The root shadow page is little especial since they are currently used by
> vcpus, we can not directly free them. So, we zap the root shadow pages and
> re-add them into the new mmu-cache.
> 
> After this patch, kvm_mmu_zap_all can be faster 113% than before
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/kvm/mmu.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 files changed, 56 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index e326099..536d9ce 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -4186,18 +4186,68 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
> 
>  void kvm_mmu_zap_all(struct kvm *kvm)
>  {
> -	struct kvm_mmu_page *sp, *node;
> +	LIST_HEAD(root_mmu_pages);
>  	LIST_HEAD(invalid_list);
> +	struct list_head pte_list_descs;
> +	struct kvm_mmu_cache *cache = &kvm->arch.mmu_cache;
> +	struct kvm_mmu_page *sp, *node;
> +	struct pte_list_desc *desc, *ndesc;
> +	int root_sp = 0;
> 
>  	spin_lock(&kvm->mmu_lock);
> +
>  restart:
> -	list_for_each_entry_safe(sp, node,
> -	      &kvm->arch.mmu_cache.active_mmu_pages, link)
> -		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
> -			goto restart;
> +	/*
> +	 * The root shadow pages are being used on vcpus that can not
> +	 * directly removed, we filter them out and re-add them to the
> +	 * new mmu cache.
> +	 */
> +	list_for_each_entry_safe(sp, node, &cache->active_mmu_pages, link)
> +		if (sp->root_count) {
> +			int ret;
> +
> +			root_sp++;
> +			ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
> +			list_move(&sp->link, &root_mmu_pages);
> +			if (ret)
> +				goto restart;
> +		}

Why is it safe to skip flushing of root pages, for all
kvm_flush_shadow() callers?

Should revisit KVM_REQ_MMU_RELOAD... not clear it is necessary for NPT
(unrelated).

Marcelo Tosatti March 14, 2013, 1:35 a.m. UTC | #2
On Wed, Mar 13, 2013 at 10:07:06PM -0300, Marcelo Tosatti wrote:
> On Wed, Mar 13, 2013 at 12:59:12PM +0800, Xiao Guangrong wrote:
> > The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
> > walk and zap all shadow pages one by one, also it need to zap all guest
> > page's rmap and all shadow page's parent spte list. Particularly, things
> > become worse if guest uses more memory or vcpus. It is not good for
> > scalability.
> > 
> > Since all shadow page will be zapped, we can directly zap the mmu-cache
> > and rmap so that vcpu will fault on the new mmu-cache, after that, we can
> > directly free the memory used by old mmu-cache.
> > 
> > The root shadow page is little especial since they are currently used by
> > vcpus, we can not directly free them. So, we zap the root shadow pages and
> > re-add them into the new mmu-cache.
> > 
> > After this patch, kvm_mmu_zap_all can be faster 113% than before
> > 
> > Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> > ---
> >  arch/x86/kvm/mmu.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++-----
> >  1 files changed, 56 insertions(+), 6 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index e326099..536d9ce 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -4186,18 +4186,68 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
> > 
> >  void kvm_mmu_zap_all(struct kvm *kvm)
> >  {
> > -	struct kvm_mmu_page *sp, *node;
> > +	LIST_HEAD(root_mmu_pages);
> >  	LIST_HEAD(invalid_list);
> > +	struct list_head pte_list_descs;
> > +	struct kvm_mmu_cache *cache = &kvm->arch.mmu_cache;
> > +	struct kvm_mmu_page *sp, *node;
> > +	struct pte_list_desc *desc, *ndesc;
> > +	int root_sp = 0;
> > 
> >  	spin_lock(&kvm->mmu_lock);
> > +
> >  restart:
> > -	list_for_each_entry_safe(sp, node,
> > -	      &kvm->arch.mmu_cache.active_mmu_pages, link)
> > -		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
> > -			goto restart;
> > +	/*
> > +	 * The root shadow pages are being used on vcpus that can not
> > +	 * directly removed, we filter them out and re-add them to the
> > +	 * new mmu cache.
> > +	 */
> > +	list_for_each_entry_safe(sp, node, &cache->active_mmu_pages, link)
> > +		if (sp->root_count) {
> > +			int ret;
> > +
> > +			root_sp++;
> > +			ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
> > +			list_move(&sp->link, &root_mmu_pages);
> > +			if (ret)
> > +				goto restart;
> > +		}
> 
> Why is it safe to skip flushing of root pages, for all
> kvm_flush_shadow() callers?

You are not skipping the flush, only moving to the new mmu cache.

> Should revisit KVM_REQ_MMU_RELOAD... not clear it is necessary for NPT
> (unrelated).

Actually, what I meant is: you can batch the KVM_REQ_MMU_RELOAD requests at
the end of kvm_mmu_zap_all. Waking up vcpus is not optimal since they're
going to contend for mmu_lock anyway.

I need more time to come up with more useful comments on this patchset, sorry.


Xiao Guangrong March 14, 2013, 4:42 a.m. UTC | #3
On 03/14/2013 09:35 AM, Marcelo Tosatti wrote:
> On Wed, Mar 13, 2013 at 10:07:06PM -0300, Marcelo Tosatti wrote:
>> On Wed, Mar 13, 2013 at 12:59:12PM +0800, Xiao Guangrong wrote:
>>> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
>>> walk and zap all shadow pages one by one, also it need to zap all guest
>>> page's rmap and all shadow page's parent spte list. Particularly, things
>>> become worse if guest uses more memory or vcpus. It is not good for
>>> scalability.
>>>
>>> Since all shadow page will be zapped, we can directly zap the mmu-cache
>>> and rmap so that vcpu will fault on the new mmu-cache, after that, we can
>>> directly free the memory used by old mmu-cache.
>>>
>>> The root shadow page is little especial since they are currently used by
>>> vcpus, we can not directly free them. So, we zap the root shadow pages and
>>> re-add them into the new mmu-cache.
>>>
>>> After this patch, kvm_mmu_zap_all can be faster 113% than before
>>>
>>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>>> ---
>>>  arch/x86/kvm/mmu.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++-----
>>>  1 files changed, 56 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>> index e326099..536d9ce 100644
>>> --- a/arch/x86/kvm/mmu.c
>>> +++ b/arch/x86/kvm/mmu.c
>>> @@ -4186,18 +4186,68 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
>>>
>>>  void kvm_mmu_zap_all(struct kvm *kvm)
>>>  {
>>> -	struct kvm_mmu_page *sp, *node;
>>> +	LIST_HEAD(root_mmu_pages);
>>>  	LIST_HEAD(invalid_list);
>>> +	struct list_head pte_list_descs;
>>> +	struct kvm_mmu_cache *cache = &kvm->arch.mmu_cache;
>>> +	struct kvm_mmu_page *sp, *node;
>>> +	struct pte_list_desc *desc, *ndesc;
>>> +	int root_sp = 0;
>>>
>>>  	spin_lock(&kvm->mmu_lock);
>>> +
>>>  restart:
>>> -	list_for_each_entry_safe(sp, node,
>>> -	      &kvm->arch.mmu_cache.active_mmu_pages, link)
>>> -		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
>>> -			goto restart;
>>> +	/*
>>> +	 * The root shadow pages are being used on vcpus that can not
>>> +	 * directly removed, we filter them out and re-add them to the
>>> +	 * new mmu cache.
>>> +	 */
>>> +	list_for_each_entry_safe(sp, node, &cache->active_mmu_pages, link)
>>> +		if (sp->root_count) {
>>> +			int ret;
>>> +
>>> +			root_sp++;
>>> +			ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
>>> +			list_move(&sp->link, &root_mmu_pages);
>>> +			if (ret)
>>> +				goto restart;
>>> +		}
>>
>> Why is it safe to skip flushing of root pages, for all
>> kvm_flush_shadow() callers?
> 
> You are not skipping the flush, only moving to the new mmu cache.
> 
>> Should revisit KVM_REQ_MMU_RELOAD... not clear it is necessary for NPT
>> (unrelated).
> 
> Actually, what i meant is: you can batch KVM_REQ_MMU_RELOAD requests to
> the end of kvm_mmu_zap_all. Waking up vcpus is not optimal since they're
> going to contend for mmu_lock anyway.

Yes, I agree. I will move KVM_REQ_MMU_RELOAD to the end of kvm_mmu_zap_all
in V2.

BTW, the TLB flush is not needed if no root shadow page was zapped, since in
that case no vcpu is using the shadow pages at all. The code may be
simplified to (after batching KVM_REQ_MMU_RELOAD):

if (root_sp)
	kvm_reload_remote_mmus(kvm);
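
In context, the tail of kvm_mmu_zap_all in V2 could then look roughly like
this (only a sketch of the batching idea, not actual V2 code):

	/*
	 * Sketch: one batched reload request once all root pages have been
	 * zapped. KVM_REQ_MMU_RELOAD forces a TLB flush on the next vcpu
	 * entry; if no root was in use, no vcpu holds translations from the
	 * old shadow pages, so no explicit flush is needed either.
	 */
	if (root_sp)
		kvm_reload_remote_mmus(kvm);

	spin_unlock(&kvm->mmu_lock);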
> 
> Need more time to have more useful comments to this patchset, sorry.

No problem. ;) The current comments are already really useful for me.


Marcelo Tosatti March 18, 2013, 8:46 p.m. UTC | #4
On Wed, Mar 13, 2013 at 12:59:12PM +0800, Xiao Guangrong wrote:
> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
> walk and zap all shadow pages one by one, also it need to zap all guest
> page's rmap and all shadow page's parent spte list. Particularly, things
> become worse if guest uses more memory or vcpus. It is not good for
> scalability.
> 
> Since all shadow page will be zapped, we can directly zap the mmu-cache
> and rmap so that vcpu will fault on the new mmu-cache, after that, we can
> directly free the memory used by old mmu-cache.
> 
> The root shadow page is little especial since they are currently used by
> vcpus, we can not directly free them. So, we zap the root shadow pages and
> re-add them into the new mmu-cache.
> 
> After this patch, kvm_mmu_zap_all can be faster 113% than before
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/kvm/mmu.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 files changed, 56 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index e326099..536d9ce 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -4186,18 +4186,68 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
> 
>  void kvm_mmu_zap_all(struct kvm *kvm)
>  {
> -	struct kvm_mmu_page *sp, *node;
> +	LIST_HEAD(root_mmu_pages);
>  	LIST_HEAD(invalid_list);
> +	struct list_head pte_list_descs;
> +	struct kvm_mmu_cache *cache = &kvm->arch.mmu_cache;
> +	struct kvm_mmu_page *sp, *node;
> +	struct pte_list_desc *desc, *ndesc;
> +	int root_sp = 0;
> 
>  	spin_lock(&kvm->mmu_lock);
> +
>  restart:
> -	list_for_each_entry_safe(sp, node,
> -	      &kvm->arch.mmu_cache.active_mmu_pages, link)
> -		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
> -			goto restart;
> +	/*
> +	 * The root shadow pages are being used on vcpus that can not
> +	 * directly removed, we filter them out and re-add them to the
> +	 * new mmu cache.
> +	 */
> +	list_for_each_entry_safe(sp, node, &cache->active_mmu_pages, link)
> +		if (sp->root_count) {
> +			int ret;
> +
> +			root_sp++;
> +			ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
> +			list_move(&sp->link, &root_mmu_pages);
> +			if (ret)
> +				goto restart;
> +		}
> +
> +	list_splice(&cache->active_mmu_pages, &invalid_list);
> +	list_replace(&cache->pte_list_descs, &pte_list_descs);
> +
> +	/*
> +	 * Reset the mmu cache so that later vcpu will fault on the new
> +	 * mmu cache.
> +	 */
> +	memset(cache, 0, sizeof(*cache));
> +	kvm_mmu_init(kvm);

Xiao,

I suppose zeroing of kvm_mmu_cache can be avoided, if the links are
removed at prepare_zap_page. So perhaps

- spin_lock(mmu_lock)
- for each page
	- zero sp->spt[], remove page from linked lists
- flush remote TLB (batched)
- spin_unlock(mmu_lock)
- free data (which is safe because freeing has its own serialization)
- spin_lock(mmu_lock)
- account for the pages freed
- spin_unlock(mmu_lock)

(or if you think of some other way to not have the mmu_cache zeroing step).
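
Roughly, in C (hypothetical glue only, to make the ordering above concrete;
free_zapped_sp and the 'freed' counter are made up for illustration):

	LIST_HEAD(free_list);
	struct kvm_mmu_page *sp, *node;
	int freed = 0;

	spin_lock(&kvm->mmu_lock);
	list_for_each_entry_safe(sp, node,
				 &kvm->arch.mmu_cache.active_mmu_pages, link) {
		memset(sp->spt, 0, PAGE_SIZE);	  /* zero the sptes */
		hlist_del(&sp->hash_link);	  /* drop from the hash list */
		list_move(&sp->link, &free_list); /* and from the active list */
		freed++;
	}
	kvm_flush_remote_tlbs(kvm);		  /* one batched flush */
	spin_unlock(&kvm->mmu_lock);

	/* freeing is safe outside the lock, it has its own serialization */
	list_for_each_entry_safe(sp, node, &free_list, link)
		free_zapped_sp(sp);		  /* hypothetical helper */

	spin_lock(&kvm->mmu_lock);
	kvm_mod_used_mmu_pages(kvm, -freed);	  /* account afterwards */
	spin_unlock(&kvm->mmu_lock);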

Note that the "account for the pages freed" step comes after the pages are
actually freed: as discussed with Takuya, having page freeing and freed-page
accounting out of sync across mmu_lock is potentially problematic, because
kvm->arch.n_used_mmu_pages and friends do not reflect reality, which can
cause problems for SLAB freeing and page allocation throttling.

Xiao Guangrong March 19, 2013, 3:06 a.m. UTC | #5
On 03/19/2013 04:46 AM, Marcelo Tosatti wrote:
> On Wed, Mar 13, 2013 at 12:59:12PM +0800, Xiao Guangrong wrote:
>> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
>> walk and zap all shadow pages one by one, also it need to zap all guest
>> page's rmap and all shadow page's parent spte list. Particularly, things
>> become worse if guest uses more memory or vcpus. It is not good for
>> scalability.
>>
>> Since all shadow page will be zapped, we can directly zap the mmu-cache
>> and rmap so that vcpu will fault on the new mmu-cache, after that, we can
>> directly free the memory used by old mmu-cache.
>>
>> The root shadow page is little especial since they are currently used by
>> vcpus, we can not directly free them. So, we zap the root shadow pages and
>> re-add them into the new mmu-cache.
>>
>> After this patch, kvm_mmu_zap_all can be faster 113% than before
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>> ---
>>  arch/x86/kvm/mmu.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++-----
>>  1 files changed, 56 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index e326099..536d9ce 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -4186,18 +4186,68 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
>>
>>  void kvm_mmu_zap_all(struct kvm *kvm)
>>  {
>> -	struct kvm_mmu_page *sp, *node;
>> +	LIST_HEAD(root_mmu_pages);
>>  	LIST_HEAD(invalid_list);
>> +	struct list_head pte_list_descs;
>> +	struct kvm_mmu_cache *cache = &kvm->arch.mmu_cache;
>> +	struct kvm_mmu_page *sp, *node;
>> +	struct pte_list_desc *desc, *ndesc;
>> +	int root_sp = 0;
>>
>>  	spin_lock(&kvm->mmu_lock);
>> +
>>  restart:
>> -	list_for_each_entry_safe(sp, node,
>> -	      &kvm->arch.mmu_cache.active_mmu_pages, link)
>> -		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
>> -			goto restart;
>> +	/*
>> +	 * The root shadow pages are being used on vcpus that can not
>> +	 * directly removed, we filter them out and re-add them to the
>> +	 * new mmu cache.
>> +	 */
>> +	list_for_each_entry_safe(sp, node, &cache->active_mmu_pages, link)
>> +		if (sp->root_count) {
>> +			int ret;
>> +
>> +			root_sp++;
>> +			ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
>> +			list_move(&sp->link, &root_mmu_pages);
>> +			if (ret)
>> +				goto restart;
>> +		}
>> +
>> +	list_splice(&cache->active_mmu_pages, &invalid_list);
>> +	list_replace(&cache->pte_list_descs, &pte_list_descs);
>> +
>> +	/*
>> +	 * Reset the mmu cache so that later vcpu will fault on the new
>> +	 * mmu cache.
>> +	 */
>> +	memset(cache, 0, sizeof(*cache));
>> +	kvm_mmu_init(kvm);
> 
> Xiao,
> 
> I suppose zeroing of kvm_mmu_cache can be avoided, if the links are
> removed at prepare_zap_page. So perhaps

The purpose of zeroing kvm_mmu_cache is to reset the hashtable and some
counters.
[.n_request_mmu_pages and .n_max_mmu_pages should not be changed; I will
fix this.]

> 
> - spin_lock(mmu_lock)
> - for each page
> 	- zero sp->spt[], remove page from linked lists

sizeof(mmu_cache) is:
(1 << 10) * sizeof(hlist_head) + 4 * sizeof(unsigned int) = 2^13 + 16
and it is constant. In your way, for every sp, we need to zap:
512 entries + a hash-node = 2^12 + 8
so the workload depends on the size of guest memory.
Why do you think this way is better?
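
To spell the numbers out (assuming 64-bit pointers and the 1 << 10 entry
mmu_page_hash; the per-sp figure follows from 512 sptes per page table):

	/*
	 * Per-VM, constant cost of resetting the cache:
	 *   (1 << 10) * sizeof(struct hlist_head) = 1024 * 8 = 8192 (2^13)
	 *   4 * sizeof(unsigned int)              =       16
	 *
	 * Per shadow page, in the alternative scheme:
	 *   512 sptes * sizeof(u64)               =  512 * 8 = 4096 (2^12)
	 *   plus unlinking the page from the hash =        8
	 *
	 * The memset is O(1) per kvm_mmu_zap_all call, while zeroing every
	 * sp->spt[] scales with the number of shadow pages, i.e. with the
	 * amount of mapped guest memory.
	 */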

> - flush remote TLB (batched)
> - spin_unlock(mmu_lock)
> - free data (which is safe because freeing has its own serialization)

We should free the root sps under mmu-lock, as my patch does.

> - spin_lock(mmu_lock)
> - account for the pages freed
> - spin_unlock(mmu_lock)

The count numbers are still inconsistent if another thread takes mmu-lock
between zeroing the shadow pages and the recount.

Marcelo, I am really confused about what the benefit of this way is, but I
might be completely misunderstanding it.

Marcelo Tosatti March 19, 2013, 2:40 p.m. UTC | #6
On Tue, Mar 19, 2013 at 11:06:35AM +0800, Xiao Guangrong wrote:
> On 03/19/2013 04:46 AM, Marcelo Tosatti wrote:
> > On Wed, Mar 13, 2013 at 12:59:12PM +0800, Xiao Guangrong wrote:
> >> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
> >> walk and zap all shadow pages one by one, also it need to zap all guest
> >> page's rmap and all shadow page's parent spte list. Particularly, things
> >> become worse if guest uses more memory or vcpus. It is not good for
> >> scalability.
> >>
> >> Since all shadow page will be zapped, we can directly zap the mmu-cache
> >> and rmap so that vcpu will fault on the new mmu-cache, after that, we can
> >> directly free the memory used by old mmu-cache.
> >>
> >> The root shadow page is little especial since they are currently used by
> >> vcpus, we can not directly free them. So, we zap the root shadow pages and
> >> re-add them into the new mmu-cache.
> >>
> >> After this patch, kvm_mmu_zap_all can be faster 113% than before
> >>
> >> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >> ---
> >>  arch/x86/kvm/mmu.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++-----
> >>  1 files changed, 56 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >> index e326099..536d9ce 100644
> >> --- a/arch/x86/kvm/mmu.c
> >> +++ b/arch/x86/kvm/mmu.c
> >> @@ -4186,18 +4186,68 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
> >>
> >>  void kvm_mmu_zap_all(struct kvm *kvm)
> >>  {
> >> -	struct kvm_mmu_page *sp, *node;
> >> +	LIST_HEAD(root_mmu_pages);
> >>  	LIST_HEAD(invalid_list);
> >> +	struct list_head pte_list_descs;
> >> +	struct kvm_mmu_cache *cache = &kvm->arch.mmu_cache;
> >> +	struct kvm_mmu_page *sp, *node;
> >> +	struct pte_list_desc *desc, *ndesc;
> >> +	int root_sp = 0;
> >>
> >>  	spin_lock(&kvm->mmu_lock);
> >> +
> >>  restart:
> >> -	list_for_each_entry_safe(sp, node,
> >> -	      &kvm->arch.mmu_cache.active_mmu_pages, link)
> >> -		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
> >> -			goto restart;
> >> +	/*
> >> +	 * The root shadow pages are being used on vcpus that can not
> >> +	 * directly removed, we filter them out and re-add them to the
> >> +	 * new mmu cache.
> >> +	 */
> >> +	list_for_each_entry_safe(sp, node, &cache->active_mmu_pages, link)
> >> +		if (sp->root_count) {
> >> +			int ret;
> >> +
> >> +			root_sp++;
> >> +			ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
> >> +			list_move(&sp->link, &root_mmu_pages);
> >> +			if (ret)
> >> +				goto restart;
> >> +		}
> >> +
> >> +	list_splice(&cache->active_mmu_pages, &invalid_list);
> >> +	list_replace(&cache->pte_list_descs, &pte_list_descs);
> >> +
> >> +	/*
> >> +	 * Reset the mmu cache so that later vcpu will fault on the new
> >> +	 * mmu cache.
> >> +	 */
> >> +	memset(cache, 0, sizeof(*cache));
> >> +	kvm_mmu_init(kvm);
> > 
> > Xiao,
> > 
> > I suppose zeroing of kvm_mmu_cache can be avoided, if the links are
> > removed at prepare_zap_page. So perhaps
> 
> The purpose of zeroing of kvm_mmu_cache is resetting the hashtable and
> some count numbers.
> [.n_request_mmu_pages and .n_max_mmu_pages should not be changed, i will
> fix this].
> 
> > 
> > - spin_lock(mmu_lock)
> > - for each page
> > 	- zero sp->spt[], remove page from linked lists
> 
> sizeof(mmu_cache) is:
> (1 << 10) * sizeof (hlist_head) + 4 * sizeof(unsigned int) = 2^13 + 16
> and it is constant. In your way, for every sp, we need to zap:
> 512 entries + a hash-node = 2^12 + 8
> especially the workload depends on the size of guest memory.
> Why you think this way is better?

It's not, of course.

> > - flush remote TLB (batched)
> > - spin_unlock(mmu_lock)
> > - free data (which is safe because freeing has its own serialization)
> 
> We should free the root sp in mmu-lock like my patch.
> 
> > - spin_lock(mmu_lock)
> > - account for the pages freed
> > - spin_unlock(mmu_lock)
> 
> The count numbers are still inconsistent if other thread hold mmu-lock between
> zero shadow page and recount.
> 
> Marcelo, i really confused what is the benefit in this way but i might
> completely misunderstand it.

I misunderstood the benefit of your idea (now I get it: zapping the roots
and flushing the TLB guarantees vcpus will refault). What I'd like to avoid is

memset(cache, 0, sizeof(*cache));
kvm_mmu_init(kvm);

I'd prefer normal operations on those data structures (in mmu_cache).
The page accounting is also a problem.

Perhaps you can use a generation number to decide whether shadow pages
are valid? So: 

find_sp(gfn_t gfn)
lookup hash
if sp->generation_number != mmu->current_generation_number
	initialize page as if it were just allocated (but keep it in the hash list)

And on kvm_mmu_zap_all()
spin_lock(mmu_lock)
for each page
if page->root_count
	zero sp->spt[]

flush TLB
mmu->current_generation_number++
spin_unlock(mmu_lock)

Then have kvm_mmu_free_all() that actually frees all data.
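
In C it might look something like this (a rough sketch only; mmu_valid_gen
and the helpers below are made-up names, not anything in the current
patchset):

	/* generation check used on the hash lookup path */
	static bool sp_is_obsolete(struct kvm *kvm, struct kvm_mmu_page *sp)
	{
		/* hypothetical per-sp and per-VM generation fields */
		return sp->mmu_valid_gen != kvm->arch.mmu_valid_gen;
	}

	/* lookup: a stale page is reinitialized in place, staying hashed */
	static struct kvm_mmu_page *check_generation(struct kvm *kvm,
						     struct kvm_mmu_page *sp)
	{
		if (sp_is_obsolete(kvm, sp))
			reinit_shadow_page(kvm, sp);	/* hypothetical helper */
		return sp;
	}

	/* kvm_mmu_zap_all() then only needs to touch the roots */
	void kvm_mmu_zap_all(struct kvm *kvm)
	{
		struct kvm_mmu_page *sp;

		spin_lock(&kvm->mmu_lock);
		list_for_each_entry(sp, &kvm->arch.mmu_cache.active_mmu_pages, link)
			if (sp->root_count)
				memset(sp->spt, 0, PAGE_SIZE);
		kvm_flush_remote_tlbs(kvm);
		kvm->arch.mmu_valid_gen++;	/* hypothetical counter */
		spin_unlock(&kvm->mmu_lock);
		/* a later kvm_mmu_free_all() would actually free the data */
	}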

Hmm, not sure if that's any better than your current patchset.
Well, maybe resend the patchset with bug fixes / improvements and we can go
from there.


Xiao Guangrong March 19, 2013, 3:37 p.m. UTC | #7
On 03/19/2013 10:40 PM, Marcelo Tosatti wrote:

> 
> I misunderstood the benefit of your idea (now i got it: zap root
> and flush TLB guarantees vcpus will refault). What i'd like to avoid is
> 
> memset(cache, 0, sizeof(*cache));
> kvm_mmu_init(kvm);
> 
> I'd prefer normal operations on those data structures (in mmu_cache).
> And also the page accounting is a problem.
> 
> Perhaps you can use a generation number to consider whether shadow pages
> are valid? So: 
> 
> find_sp(gfn_t gfn)
> lookup hash
> if sp->generation_number != mmu->current_generation_number
> 	initialize page as if it were just allocated (but keep it in the hash list)
> 
> And on kvm_mmu_zap_all()
> spin_lock(mmu_lock)
> for each page
> if page->root_count
> 	zero sp->spt[]
> 
> flush TLB
> mmu->current_generation_number++
> spin_unlock(mmu_lock)
> 
> Then have kvm_mmu_free_all() that actually frees all data.
> 
> Hum, not sure if thats any better than your current patchset.

I also had the idea of using a generation number, like yours, which I
mentioned in [PATCH 0/6]:

* TODO
To avoid Marcelo beating me to it :), there is some work not included here,
to keep the patchset smaller:
(1): batch kvm_reload_remote_mmus for zapping root shadow pages
(2): free shadow pages by using generation-number
(3): remove unneeded kvm_reload_remote_mmus after kvm_mmu_zap_all
(4): drop unnecessary @npages from kvm_arch_create_memslot
(5): rename init_kvm_mmu to init_vcpu_mmu

> Well, maybe resend patchset with bug fixes / improvements and 
> we go from there.

I agree. Thanks for your time, Marcelo!


Marcelo Tosatti March 19, 2013, 10:37 p.m. UTC | #8
On Tue, Mar 19, 2013 at 11:37:38PM +0800, Xiao Guangrong wrote:
> On 03/19/2013 10:40 PM, Marcelo Tosatti wrote:
> 
> > 
> > I misunderstood the benefit of your idea (now i got it: zap root
> > and flush TLB guarantees vcpus will refault). What i'd like to avoid is
> > 
> > memset(cache, 0, sizeof(*cache));
> > kvm_mmu_init(kvm);
> > 
> > I'd prefer normal operations on those data structures (in mmu_cache).
> > And also the page accounting is a problem.
> > 
> > Perhaps you can use a generation number to consider whether shadow pages
> > are valid? So: 
> > 
> > find_sp(gfn_t gfn)
> > lookup hash
> > if sp->generation_number != mmu->current_generation_number
> > 	initialize page as if it were just allocated (but keep it in the hash list)
> > 
> > And on kvm_mmu_zap_all()
> > spin_lock(mmu_lock)
> > for each page
> > if page->root_count
> > 	zero sp->spt[]
> > 
> > flush TLB
> > mmu->current_generation_number++
> > spin_unlock(mmu_lock)
> > 
> > Then have kvm_mmu_free_all() that actually frees all data.
> > 
> > Hum, not sure if thats any better than your current patchset.
> 
> I also got the idea of generation number like yours which i mentioned
> it in the [PATCH 0/6]:
> 
> * TODO
> Avoiding Marcelo beat me :), they are some works not attached to make the
> patchset more smaller:
> (1): batch kvm_reload_remote_mmus for zapping root shadow pages
> (2): free shadow pages by using generation-number
> (3): remove unneeded kvm_reload_remote_mmus after kvm_mmu_zap_all
> (4): drop unnecessary @npages from kvm_arch_create_memslot
> (5): rename init_kvm_mmu to init_vcpu_mmu

Long patch sets are OK. The problem is unrelated changes in a patchset.


Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e326099..536d9ce 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4186,18 +4186,68 @@  void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)

 void kvm_mmu_zap_all(struct kvm *kvm)
 {
-	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(root_mmu_pages);
 	LIST_HEAD(invalid_list);
+	struct list_head pte_list_descs;
+	struct kvm_mmu_cache *cache = &kvm->arch.mmu_cache;
+	struct kvm_mmu_page *sp, *node;
+	struct pte_list_desc *desc, *ndesc;
+	int root_sp = 0;

 	spin_lock(&kvm->mmu_lock);
+
 restart:
-	list_for_each_entry_safe(sp, node,
-	      &kvm->arch.mmu_cache.active_mmu_pages, link)
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-			goto restart;
+	/*
+	 * The root shadow pages are being used on vcpus that can not
+	 * directly removed, we filter them out and re-add them to the
+	 * new mmu cache.
+	 */
+	list_for_each_entry_safe(sp, node, &cache->active_mmu_pages, link)
+		if (sp->root_count) {
+			int ret;
+
+			root_sp++;
+			ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+			list_move(&sp->link, &root_mmu_pages);
+			if (ret)
+				goto restart;
+		}
+
+	list_splice(&cache->active_mmu_pages, &invalid_list);
+	list_replace(&cache->pte_list_descs, &pte_list_descs);
+
+	/*
+	 * Reset the mmu cache so that later vcpu will fault on the new
+	 * mmu cache.
+	 */
+	memset(cache, 0, sizeof(*cache));
+	kvm_mmu_init(kvm);
+
+	/*
+	 * Now, the mmu cache has been reset, we can re-add the root shadow
+	 * pages into the cache.
+	 */
+	list_replace(&root_mmu_pages, &cache->active_mmu_pages);
+	kvm_mod_used_mmu_pages(kvm, root_sp);
+
+	/* Reset gfn's rmap and lpage info. */
+	kvm_clear_all_gfn_page_info(kvm);
+
+	/*
+	 * Flush all TLBs so that vcpu can not use the invalid mappings.
+	 * Do not disturb vcpus if root shadow pages have been zapped
+	 * since KVM_REQ_MMU_RELOAD will force TLB to be flushed.
+	 */
+	if (!root_sp && !list_empty(&invalid_list))
+		kvm_flush_remote_tlbs(kvm);

-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 	spin_unlock(&kvm->mmu_lock);
+
+	list_for_each_entry_safe(sp, node, &invalid_list, link)
+		kvm_mmu_free_page(sp);
+
+	list_for_each_entry_safe(desc, ndesc, &pte_list_descs, list)
+		mmu_free_pte_list_desc(desc);
 }

 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)