diff mbox

[05/10] KVM: x86: MMU: Use for_each_rmap_spte macro instead of pte_list_walk()

Message ID 20151112205245.6bd773737cfa78422dac5a79@lab.ntt.co.jp (mailing list archive)
State New, archived
Headers show

Commit Message

Takuya Yoshikawa Nov. 12, 2015, 11:52 a.m. UTC
kvm_mmu_mark_parents_unsync() alone uses pte_list_walk(), witch does
nearly the same as the for_each_rmap_spte macro.  The only difference
is that is_shadow_present_pte() checks cannot be placed there because
kvm_mmu_mark_parents_unsync() can be called with a new parent pointer
whose entry is not set yet.

By calling mark_unsync() separately for the parent and adding the parent
pointer to the parent_ptes chain later in kvm_mmu_get_page(), the macro
works with no problem.

Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
---
 arch/x86/kvm/mmu.c | 36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

Comments

Marcelo Tosatti Nov. 13, 2015, 9:47 p.m. UTC | #1
On Thu, Nov 12, 2015 at 08:52:45PM +0900, Takuya Yoshikawa wrote:
> kvm_mmu_mark_parents_unsync() alone uses pte_list_walk(), witch does
> nearly the same as the for_each_rmap_spte macro.  The only difference
> is that is_shadow_present_pte() checks cannot be placed there because
> kvm_mmu_mark_parents_unsync() can be called with a new parent pointer
> whose entry is not set yet.
> 
> By calling mark_unsync() separately for the parent and adding the parent
> pointer to the parent_ptes chain later in kvm_mmu_get_page(), the macro
> works with no problem.
> 
> Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
> ---
>  arch/x86/kvm/mmu.c | 36 +++++++++++++-----------------------
>  1 file changed, 13 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index e8cfdc4..1691171 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1007,26 +1007,6 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list)
>  	}
>  }
>  
> -typedef void (*pte_list_walk_fn) (u64 *spte);
> -static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
> -{
> -	struct pte_list_desc *desc;
> -	int i;
> -
> -	if (!*pte_list)
> -		return;
> -
> -	if (!(*pte_list & 1))
> -		return fn((u64 *)*pte_list);
> -
> -	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> -	while (desc) {
> -		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
> -			fn(desc->sptes[i]);
> -		desc = desc->more;
> -	}
> -}
> -
>  static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
>  				    struct kvm_memory_slot *slot)
>  {
> @@ -1741,7 +1721,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
>  static void mark_unsync(u64 *spte);
>  static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
>  {
> -	pte_list_walk(&sp->parent_ptes, mark_unsync);
> +	u64 *sptep;
> +	struct rmap_iterator iter;
> +
> +	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
> +		mark_unsync(sptep);
> +	}
>  }
>  
>  static void mark_unsync(u64 *spte)
> @@ -2111,12 +2096,17 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,

Faulting a spte, and one of the levels of sptes, either


		spte-1
		spte-2
		spte-3

has present bit clear. So we're searching for a guest page to shadow, with
gfn "gfn".

>  		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
>  			break;

If a shadow for gfn exists, but is unsync, sync guest-page ---to--> kvm
sptes.

> -		mmu_page_add_parent_pte(vcpu, sp, parent_pte);

add "gfn" (actually its "struct kvm_mmu_page *sp" pointer) to
the parent.
>  		if (sp->unsync_children) {
>  			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
>  			kvm_mmu_mark_parents_unsync(sp);

kvm_mmu_mark_parents_unsync relied on the links from current level all
the way to top level to mark all levels unsync, so that on guest entry,
KVM_REQ_MMU_SYNC is processed and any level is brought from guest -->
kvm pages. This now fails, because you removed "mmu_page_add_parent_pte"
(the link is not formed all the way to root).

Unless i am missing something, this is not correct.

> -		} else if (sp->unsync)
> +			if (parent_pte)
> +				mark_unsync(parent_pte);
> +		} else if (sp->unsync) {
>  			kvm_mmu_mark_parents_unsync(sp);
> +			if (parent_pte)
> +				mark_unsync(parent_pte);
> +		}
> +		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
>  
>  		__clear_sp_write_flooding_count(sp);
>  		trace_kvm_mmu_get_page(sp, false);

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Nov. 14, 2015, 9:20 a.m. UTC | #2
On Fri, Nov 13, 2015 at 07:47:28PM -0200, Marcelo Tosatti wrote:
> On Thu, Nov 12, 2015 at 08:52:45PM +0900, Takuya Yoshikawa wrote:
> > kvm_mmu_mark_parents_unsync() alone uses pte_list_walk(), witch does
> > nearly the same as the for_each_rmap_spte macro.  The only difference
> > is that is_shadow_present_pte() checks cannot be placed there because
> > kvm_mmu_mark_parents_unsync() can be called with a new parent pointer
> > whose entry is not set yet.
> > 
> > By calling mark_unsync() separately for the parent and adding the parent
> > pointer to the parent_ptes chain later in kvm_mmu_get_page(), the macro
> > works with no problem.
> > 
> > Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
> > ---
> >  arch/x86/kvm/mmu.c | 36 +++++++++++++-----------------------
> >  1 file changed, 13 insertions(+), 23 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index e8cfdc4..1691171 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -1007,26 +1007,6 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list)
> >  	}
> >  }
> >  
> > -typedef void (*pte_list_walk_fn) (u64 *spte);
> > -static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
> > -{
> > -	struct pte_list_desc *desc;
> > -	int i;
> > -
> > -	if (!*pte_list)
> > -		return;
> > -
> > -	if (!(*pte_list & 1))
> > -		return fn((u64 *)*pte_list);
> > -
> > -	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> > -	while (desc) {
> > -		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
> > -			fn(desc->sptes[i]);
> > -		desc = desc->more;
> > -	}
> > -}
> > -
> >  static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
> >  				    struct kvm_memory_slot *slot)
> >  {
> > @@ -1741,7 +1721,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
> >  static void mark_unsync(u64 *spte);
> >  static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
> >  {
> > -	pte_list_walk(&sp->parent_ptes, mark_unsync);
> > +	u64 *sptep;
> > +	struct rmap_iterator iter;
> > +
> > +	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
> > +		mark_unsync(sptep);
> > +	}
> >  }
> >  
> >  static void mark_unsync(u64 *spte)
> > @@ -2111,12 +2096,17 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> 
> Faulting a spte, and one of the levels of sptes, either
> 
> 
> 		spte-1
> 		spte-2
> 		spte-3
> 
> has present bit clear. So we're searching for a guest page to shadow, with
> gfn "gfn".
> 
> >  		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
> >  			break;
> 
> If a shadow for gfn exists, but is unsync, sync guest-page ---to--> kvm
> sptes.
> 
> > -		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
> 
> add "gfn" (actually its "struct kvm_mmu_page *sp" pointer) to
> the parent.
> >  		if (sp->unsync_children) {
> >  			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
> >  			kvm_mmu_mark_parents_unsync(sp);
> 
> kvm_mmu_mark_parents_unsync relied on the links from current level all
> the way to top level to mark all levels unsync, so that on guest entry,
> KVM_REQ_MMU_SYNC is processed and any level is brought from guest -->
> kvm pages. This now fails, because you removed "mmu_page_add_parent_pte"
> (the link is not formed all the way to root).
> 
> Unless i am missing something, this is not correct.

The actual issue is this: a higher level page that had, under its children,
no out of sync pages, now, due to your addition, a child that is unsync:

initial state:
	level1 

final state:

	level1 -x-> level2 -x-> level3

Where -x-> are the links created by this pagefault fixing round.

If _any_ page under you is unsync (not necessarily the ones this
pagefault is accessing), you have to mark parents unsync.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Takuya Yoshikawa Nov. 16, 2015, 2:51 a.m. UTC | #3
On 2015/11/14 18:20, Marcelo Tosatti wrote:

> The actual issue is this: a higher level page that had, under its children,
> no out of sync pages, now, due to your addition, a child that is unsync:
>
> initial state:
> 	level1
>
> final state:
>
> 	level1 -x-> level2 -x-> level3
>
> Where -x-> are the links created by this pagefault fixing round.
>
> If _any_ page under you is unsync (not necessarily the ones this
> pagefault is accessing), you have to mark parents unsync.

I understand this, but I don't think my patch will break this.

What kvm_mmu_mark_parents_unsync() does is:

   for each p_i in sp->parent_ptes rmap chain
     mark_unsync(p_i);

Then, mark_unsync() finds the parent sp including that p_i to
set ->unsync_child_bitmap and increment ->unsync_children if
necessary.  It may also call kvm_mmu_mark_parents_unsync()
recursively.

I understand we need to tell the parents "you have an unsync
child/descendant" until this information reaches the top level
by that recursive calls.

But since these recursive calls cannot come back to the starting sp,
the child->parent graph has no loop, each mark_unsync(p_i) will not
be affected by other parents in that sp->parent_ptes rmap chain,
from which we started the recursive calls.


As the following code shows, my patch does mark_unsync(parent_pte)
separately, and then mmu_page_add_parent_pte(vcpu, sp, parent_pte):

> -		} else if (sp->unsync)
> +			if (parent_pte)
> +				mark_unsync(parent_pte);
> +		} else if (sp->unsync) {
>  			kvm_mmu_mark_parents_unsync(sp);
> +			if (parent_pte)
> +				mark_unsync(parent_pte);
> +		}
> +		mmu_page_add_parent_pte(vcpu, sp, parent_pte);

So, as you worried, during each mark_unsync(p_i) is processed,
this parent_pte does not exist in that sp->parent_ptes rmap chain.

But as I explained above, this does not change anything about what
each mark_unsync(p_i) call does, so keeps the original behaviour.


By the way, I think "kvm_mmu_mark_parents_unsync" and "mark_unsync"
do not tell what they actually do well. When I first saw the names,
I thought they would just set the parents' sp->unsync.

To reflect the following meaning better, it should be
propagate_unsync(_to_parents) or something:

   Tell the parents "you have an unsync child/descendant"
   until this unsync information reaches the top level


Thanks,
   Takuya


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Nov. 17, 2015, 5:58 p.m. UTC | #4
On 16/11/2015 03:51, Takuya Yoshikawa wrote:
> What kvm_mmu_mark_parents_unsync() does is:
> 
>   for each p_i in sp->parent_ptes rmap chain
>     mark_unsync(p_i);
> 
> Then, mark_unsync() finds the parent sp including that p_i to
> set ->unsync_child_bitmap and increment ->unsync_children if
> necessary.  It may also call kvm_mmu_mark_parents_unsync()
> recursively.

I agree.  sp->parent_ptes goes up one level from sp;
kvm_mmu_mark_parents_unsync(sp) visits the level above sp, while
mark_unsync(sp) visit sp and all the levels above it.

Calling mark_unsync(parent_pte) is enough to complete the visit, after
kvm_mmu_mark_parents_unsync has already processed the level above sp.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Nov. 18, 2015, 3:07 a.m. UTC | #5
On 11/12/2015 07:52 PM, Takuya Yoshikawa wrote:
> kvm_mmu_mark_parents_unsync() alone uses pte_list_walk(), witch does
> nearly the same as the for_each_rmap_spte macro.  The only difference
> is that is_shadow_present_pte() checks cannot be placed there because
> kvm_mmu_mark_parents_unsync() can be called with a new parent pointer
> whose entry is not set yet.
>
> By calling mark_unsync() separately for the parent and adding the parent
> pointer to the parent_ptes chain later in kvm_mmu_get_page(), the macro
> works with no problem.
>
> Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
> ---
>   arch/x86/kvm/mmu.c | 36 +++++++++++++-----------------------
>   1 file changed, 13 insertions(+), 23 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index e8cfdc4..1691171 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1007,26 +1007,6 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list)
>   	}
>   }
>
> -typedef void (*pte_list_walk_fn) (u64 *spte);
> -static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
> -{
> -	struct pte_list_desc *desc;
> -	int i;
> -
> -	if (!*pte_list)
> -		return;
> -
> -	if (!(*pte_list & 1))
> -		return fn((u64 *)*pte_list);
> -
> -	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> -	while (desc) {
> -		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
> -			fn(desc->sptes[i]);
> -		desc = desc->more;
> -	}
> -}
> -
>   static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
>   				    struct kvm_memory_slot *slot)
>   {
> @@ -1741,7 +1721,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
>   static void mark_unsync(u64 *spte);
>   static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
>   {
> -	pte_list_walk(&sp->parent_ptes, mark_unsync);
> +	u64 *sptep;
> +	struct rmap_iterator iter;
> +
> +	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
> +		mark_unsync(sptep);
> +	}
>   }
>
>   static void mark_unsync(u64 *spte)
> @@ -2111,12 +2096,17 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>   		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
>   			break;
>
> -		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
>   		if (sp->unsync_children) {
>   			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
>   			kvm_mmu_mark_parents_unsync(sp);

After your change above, the @sp's parents have not been changed, no need to call it now.

> -		} else if (sp->unsync)
> +			if (parent_pte)
> +				mark_unsync(parent_pte);
> +		} else if (sp->unsync) {
>   			kvm_mmu_mark_parents_unsync(sp);

Ditto.

> +			if (parent_pte)
> +				mark_unsync(parent_pte);
> +		}
> +		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
>
>   		__clear_sp_write_flooding_count(sp);
>   		trace_kvm_mmu_get_page(sp, false);
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e8cfdc4..1691171 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1007,26 +1007,6 @@  static void pte_list_remove(u64 *spte, unsigned long *pte_list)
 	}
 }
 
-typedef void (*pte_list_walk_fn) (u64 *spte);
-static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
-{
-	struct pte_list_desc *desc;
-	int i;
-
-	if (!*pte_list)
-		return;
-
-	if (!(*pte_list & 1))
-		return fn((u64 *)*pte_list);
-
-	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
-	while (desc) {
-		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
-			fn(desc->sptes[i]);
-		desc = desc->more;
-	}
-}
-
 static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
 				    struct kvm_memory_slot *slot)
 {
@@ -1741,7 +1721,12 @@  static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 static void mark_unsync(u64 *spte);
 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
-	pte_list_walk(&sp->parent_ptes, mark_unsync);
+	u64 *sptep;
+	struct rmap_iterator iter;
+
+	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
+		mark_unsync(sptep);
+	}
 }
 
 static void mark_unsync(u64 *spte)
@@ -2111,12 +2096,17 @@  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
 			break;
 
-		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 		if (sp->unsync_children) {
 			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
 			kvm_mmu_mark_parents_unsync(sp);
-		} else if (sp->unsync)
+			if (parent_pte)
+				mark_unsync(parent_pte);
+		} else if (sp->unsync) {
 			kvm_mmu_mark_parents_unsync(sp);
+			if (parent_pte)
+				mark_unsync(parent_pte);
+		}
+		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 
 		__clear_sp_write_flooding_count(sp);
 		trace_kvm_mmu_get_page(sp, false);