
[4/4] KVM: MMU: Split out the main body of walk_addr_generic()

Message ID 20110609230524.346b3d1c.takuya.yoshikawa@gmail.com (mailing list archive)
State New, archived

Commit Message

Takuya Yoshikawa June 9, 2011, 2:05 p.m. UTC
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>

The code has clearly suffered from over-inlining.  So make the body of
the walk loop a separate function: do_walk().

This will make it easy to do more cleanups and optimizations later.

This was suggested by Ingo Molnar.

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
---
 arch/x86/kvm/mmu.c         |   21 ++++
 arch/x86/kvm/paging_tmpl.h |  227 ++++++++++++++++++++++++--------------------
 2 files changed, 145 insertions(+), 103 deletions(-)

Comments

Ingo Molnar June 9, 2011, 2:18 p.m. UTC | #1
* Takuya Yoshikawa <takuya.yoshikawa@gmail.com> wrote:

> +/*
> + * do_walk() returns one of these.
> + *
> + * WALK_NEXT:		Continue the walk loop.
> + * WALK_DONE:		Break from the walk loop.
> + * WALK_RETRY:		Retry walk.
> + * WALK_NOT_PRESENT:	Set PFERR_PRESENT_MASK and goto error.
> + * WALK_RSVD_FAULT:	Set PFERR_RSVD_MASK and goto error.
> + * WALK_ERROR:		Goto error.
> + * WALK_ABORT:		Return immediately.

Hm, this iterator turned out to be more complex than I thought it 
would become. Avi, are you still happy with that?

> +	if (!*eperm && unlikely(!(*pte & PT_ACCESSED_MASK))) {
> +		int ret;
> +
> +		trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(*pte));
> +		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, *ptep_user, index,
> +					  *pte, *pte|PT_ACCESSED_MASK);
> +		if (unlikely(ret < 0))
> +			return WALK_NOT_PRESENT;
> +		else if (ret)
> +			return WALK_RETRY;
> +
> +		mark_page_dirty(vcpu->kvm, table_gfn);
> +		*pte |= PT_ACCESSED_MASK;
> +	}

This wants to move into a set-accessed-bit helper inline.
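For illustration, a minimal sketch of such a helper, lifted straight from
the hunk quoted above; the name FNAME(update_accessed_bit) and its exact
signature are hypothetical, not part of the patch:

static inline int
FNAME(update_accessed_bit)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			   pt_element_t __user *ptep_user, unsigned index,
			   gfn_t table_gfn, pt_element_t *pte)
{
	int ret;

	trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(*pte));
	/* Atomically set the accessed bit in the guest pte. */
	ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
				  *pte, *pte | PT_ACCESSED_MASK);
	if (unlikely(ret < 0))
		return WALK_NOT_PRESENT;
	else if (ret)
		return WALK_RETRY;	/* pte changed under us; restart walk */

	mark_page_dirty(vcpu->kvm, table_gfn);
	*pte |= PT_ACCESSED_MASK;
	return WALK_NEXT;
}

do_walk() would then reduce the whole block to a single call guarded by
!*eperm && !(*pte & PT_ACCESSED_MASK).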

> +	if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
> +	    ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(*pte) &&
> +	     (PTTYPE == 64 || is_pse(vcpu))) ||
> +	    ((walker->level == PT_PDPE_LEVEL) && is_large_pte(*pte) &&
> +	     (mmu->root_level == PT64_ROOT_LEVEL))) {

This condition wants to move into a is-pte-large inline function.

> +		gpa_t real_gpa;
> +		gfn_t gfn;
> +		u32 ac;
> +
> +		gfn = gpte_to_gfn_lvl(*pte, walker->level);
> +		gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
> +
> +		if (PTTYPE == 32 && (walker->level == PT_DIRECTORY_LEVEL) &&
> +		    is_cpuid_PSE36())
> +			gfn += pse36_gfn_delta(*pte);
> +
> +		ac = write_fault | fetch_fault | user_fault;
> +
> +		real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), ac);
> +		if (real_gpa == UNMAPPED_GVA)
> +			return WALK_ABORT;
> +
> +		walker->gfn = real_gpa >> PAGE_SHIFT;
> +
> +		return WALK_DONE;

And this would look cleaner if it were in a handle-large-pte inline function?
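Assuming the WALK_* return codes above, the hunk could be folded into
something like the sketch below; the name FNAME(handle_large_pte) and the
signature are illustrative only:

static inline int
FNAME(handle_large_pte)(struct guest_walker *walker, struct kvm_vcpu *vcpu,
			struct kvm_mmu *mmu, gva_t addr, u32 ac,
			pt_element_t pte)
{
	gpa_t real_gpa;
	gfn_t gfn;

	/* Base frame of the large page plus the offset within it. */
	gfn = gpte_to_gfn_lvl(pte, walker->level);
	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;

	/* PSE-36: the pde carries extra physical-address bits above bit 31. */
	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL &&
	    is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);

	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), ac);
	if (real_gpa == UNMAPPED_GVA)
		return WALK_ABORT;

	walker->gfn = real_gpa >> PAGE_SHIFT;
	return WALK_DONE;
}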

> +		ret = FNAME(do_walk)(walker, vcpu, mmu, addr, access,
> +				     &eperm, &pte, &ptep_user);
> +		switch (ret) {
> +		case WALK_NEXT:
> +			break;
> +		case WALK_DONE:
> +			goto walk_done;
> +		case WALK_RETRY:
> +			goto walk_retry;
> +		case WALK_NOT_PRESENT:
>  			errcode |= PFERR_PRESENT_MASK;
>  			goto error;
> +		case WALK_RSVD_FAULT:
>  			errcode |= PFERR_RSVD_MASK;
>  			goto error;
> +		case WALK_ERROR:
> +			goto error;
> +		case WALK_ABORT:
> +			return 0;

Btw., there's a stylistic trick you could use here to make the 
iteration logic even clearer:

		switch (ret) {
		case WALK_NEXT:						break;
		case WALK_DONE:						goto walk_done;
		case WALK_RETRY:					goto walk_retry;
		case WALK_NOT_PRESENT:	errcode |= PFERR_PRESENT_MASK;	goto error;
		case WALK_RSVD_FAULT:	errcode |= PFERR_RSVD_MASK;	goto error;
		case WALK_ERROR:					goto error;
		case WALK_ABORT:					return 0;
		}

But it's a pure matter of taste - it might not really fit into KVM 
code. Avi's call :-)

Thanks,

	Ingo
Avi Kivity June 12, 2011, 3:47 p.m. UTC | #2
On 06/09/2011 05:18 PM, Ingo Molnar wrote:
* Takuya Yoshikawa <takuya.yoshikawa@gmail.com> wrote:
>
> >  +/*
> >  + * do_walk() returns one of these.
> >  + *
> >  + * WALK_NEXT:		Continue the walk loop.
> >  + * WALK_DONE:		Break from the walk loop.
> >  + * WALK_RETRY:		Retry walk.
> >  + * WALK_NOT_PRESENT:	Set PFERR_PRESENT_MASK and goto error.
> >  + * WALK_RSVD_FAULT:	Set PFERR_RSVD_MASK and goto error.
> >  + * WALK_ERROR:		Goto error.
> >  + * WALK_ABORT:		Return immediately.
>
> Hm, this iterator turned out to be more complex than I thought it
> would become. Avi, are you still happy with that?

No - a lot of code is spent just communicating between the two 
functions.  Best to leave it in a single function.

> >  +	if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
> >  +	    ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(*pte) &&
> >  +	     (PTTYPE == 64 || is_pse(vcpu))) ||
> >  +	    ((walker->level == PT_PDPE_LEVEL) && is_large_pte(*pte) &&
> >  +	     (mmu->root_level == PT64_ROOT_LEVEL))) {
>
> This condition wants to move into a is-pte-large inline function.

Better, is_last_gpte().  We already have an is_last_spte().
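
Following that naming, the condition could become a predicate along these
lines; this is only a sketch built from the code quoted above, and the
helper that actually landed in KVM may differ:

static inline bool FNAME(is_last_gpte)(struct guest_walker *walker,
				       struct kvm_vcpu *vcpu,
				       struct kvm_mmu *mmu, pt_element_t gpte)
{
	/* A 4k pte is always the last level of the walk. */
	if (walker->level == PT_PAGE_TABLE_LEVEL)
		return true;

	/* A large pde maps 4M (32-bit with PSE) or 2M (PAE/long mode). */
	if (walker->level == PT_DIRECTORY_LEVEL && is_large_pte(gpte) &&
	    (PTTYPE == 64 || is_pse(vcpu)))
		return true;

	/* A large pdpte maps 1G, long mode only. */
	if (walker->level == PT_PDPE_LEVEL && is_large_pte(gpte) &&
	    mmu->root_level == PT64_ROOT_LEVEL)
		return true;

	return false;
}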

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2d14434..16ccf4b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -69,6 +69,27 @@  char *audit_point_name[] = {
 	"post sync"
 };
 
+/*
+ * do_walk() returns one of these.
+ *
+ * WALK_NEXT:		Continue the walk loop.
+ * WALK_DONE:		Break from the walk loop.
+ * WALK_RETRY:		Retry walk.
+ * WALK_NOT_PRESENT:	Set PFERR_PRESENT_MASK and goto error.
+ * WALK_RSVD_FAULT:	Set PFERR_RSVD_MASK and goto error.
+ * WALK_ERROR:		Goto error.
+ * WALK_ABORT:		Return immediately.
+ */
+enum {
+	WALK_NEXT,
+	WALK_DONE,
+	WALK_RETRY,
+	WALK_NOT_PRESENT,
+	WALK_RSVD_FAULT,
+	WALK_ERROR,
+	WALK_ABORT
+};
+
 #undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 711336b..4913aa5 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -114,6 +114,111 @@  static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
 }
 
 /*
+ * Walk one level.
+ * Guest pte and its user address will be put in *pte and *ptep_user.
+ */
+static inline int
+FNAME(do_walk)(struct guest_walker *walker, struct kvm_vcpu *vcpu,
+	       struct kvm_mmu *mmu, gva_t addr, u32 access, bool *eperm,
+	       pt_element_t *pte, pt_element_t __user **ptep_user)
+{
+	gfn_t real_gfn;
+	unsigned long host_addr;
+	unsigned index  = PT_INDEX(addr, walker->level);
+	int offset      = index * sizeof(pt_element_t);
+	gfn_t table_gfn = gpte_to_gfn(*pte);
+	gpa_t pte_gpa   = gfn_to_gpa(table_gfn) + offset;
+	const int write_fault = access & PFERR_WRITE_MASK;
+	const int user_fault  = access & PFERR_USER_MASK;
+	const int fetch_fault = access & PFERR_FETCH_MASK;
+
+	walker->table_gfn[walker->level - 1] = table_gfn;
+	walker->pte_gpa[walker->level - 1] = pte_gpa;
+
+	real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
+				      PFERR_USER_MASK|PFERR_WRITE_MASK);
+	if (unlikely(real_gfn == UNMAPPED_GVA))
+		return WALK_NOT_PRESENT;
+	real_gfn = gpa_to_gfn(real_gfn);
+
+	host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
+	if (unlikely(kvm_is_error_hva(host_addr)))
+		return WALK_NOT_PRESENT;
+
+	*ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
+	if (unlikely(__copy_from_user(pte, *ptep_user, sizeof(*pte))))
+		return WALK_NOT_PRESENT;
+
+	trace_kvm_mmu_paging_element(*pte, walker->level);
+
+	if (unlikely(!is_present_gpte(*pte)))
+		return WALK_NOT_PRESENT;
+
+	if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, *pte, walker->level)))
+		return WALK_RSVD_FAULT;
+
+	if (unlikely(write_fault && !is_writable_pte(*pte)
+		     && (user_fault || is_write_protection(vcpu))))
+		*eperm = true;
+
+	if (unlikely(user_fault && !(*pte & PT_USER_MASK)))
+		*eperm = true;
+
+#if PTTYPE == 64
+	if (unlikely(fetch_fault && (*pte & PT64_NX_MASK)))
+		*eperm = true;
+#endif
+
+	if (!*eperm && unlikely(!(*pte & PT_ACCESSED_MASK))) {
+		int ret;
+
+		trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(*pte));
+		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, *ptep_user, index,
+					  *pte, *pte|PT_ACCESSED_MASK);
+		if (unlikely(ret < 0))
+			return WALK_NOT_PRESENT;
+		else if (ret)
+			return WALK_RETRY;
+
+		mark_page_dirty(vcpu->kvm, table_gfn);
+		*pte |= PT_ACCESSED_MASK;
+	}
+
+	walker->pte_access = walker->pt_access & FNAME(gpte_access)(vcpu, *pte);
+
+	walker->ptes[walker->level - 1] = *pte;
+
+	if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
+	    ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(*pte) &&
+	     (PTTYPE == 64 || is_pse(vcpu))) ||
+	    ((walker->level == PT_PDPE_LEVEL) && is_large_pte(*pte) &&
+	     (mmu->root_level == PT64_ROOT_LEVEL))) {
+		gpa_t real_gpa;
+		gfn_t gfn;
+		u32 ac;
+
+		gfn = gpte_to_gfn_lvl(*pte, walker->level);
+		gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
+
+		if (PTTYPE == 32 && (walker->level == PT_DIRECTORY_LEVEL) &&
+		    is_cpuid_PSE36())
+			gfn += pse36_gfn_delta(*pte);
+
+		ac = write_fault | fetch_fault | user_fault;
+
+		real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), ac);
+		if (real_gpa == UNMAPPED_GVA)
+			return WALK_ABORT;
+
+		walker->gfn = real_gpa >> PAGE_SHIFT;
+
+		return WALK_DONE;
+	}
+
+	return WALK_NEXT;
+}
+
+/*
  * Fetch a guest pte for a guest virtual address
  */
 static int FNAME(walk_addr_generic)(struct guest_walker *walker,
@@ -130,7 +235,7 @@  static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 
 	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
 				     fetch_fault);
-walk:
+walk_retry:
 	eperm = false;
 	walker->level = mmu->root_level;
 	pte           = mmu->get_cr3(vcpu);
@@ -152,118 +257,34 @@  walk:
 	walker->pt_access = ACC_ALL;
 
 	for (;;) {
-		gfn_t real_gfn;
-		unsigned long host_addr;
-		unsigned index  = PT_INDEX(addr, walker->level);
-		int offset      = index * sizeof(pt_element_t);
-		gfn_t table_gfn = gpte_to_gfn(pte);
-		gpa_t pte_gpa   = gfn_to_gpa(table_gfn) + offset;
-
-		walker->table_gfn[walker->level - 1] = table_gfn;
-		walker->pte_gpa[walker->level - 1] = pte_gpa;
-
-		real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
-					      PFERR_USER_MASK|PFERR_WRITE_MASK);
-		if (unlikely(real_gfn == UNMAPPED_GVA)) {
-			errcode |= PFERR_PRESENT_MASK;
-			goto error;
-		}
-		real_gfn = gpa_to_gfn(real_gfn);
-
-		host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
-		if (unlikely(kvm_is_error_hva(host_addr))) {
-			errcode |= PFERR_PRESENT_MASK;
-			goto error;
-		}
-
-		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
-		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
-			errcode |= PFERR_PRESENT_MASK;
-			goto error;
-		}
-
-		trace_kvm_mmu_paging_element(pte, walker->level);
+		int ret;
 
-		if (unlikely(!is_present_gpte(pte))) {
+		ret = FNAME(do_walk)(walker, vcpu, mmu, addr, access,
+				     &eperm, &pte, &ptep_user);
+		switch (ret) {
+		case WALK_NEXT:
+			break;
+		case WALK_DONE:
+			goto walk_done;
+		case WALK_RETRY:
+			goto walk_retry;
+		case WALK_NOT_PRESENT:
 			errcode |= PFERR_PRESENT_MASK;
 			goto error;
-		}
-
-		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
-					      walker->level))) {
+		case WALK_RSVD_FAULT:
 			errcode |= PFERR_RSVD_MASK;
 			goto error;
-		}
-
-		if (unlikely(write_fault && !is_writable_pte(pte)
-			     && (user_fault || is_write_protection(vcpu))))
-			eperm = true;
-
-		if (unlikely(user_fault && !(pte & PT_USER_MASK)))
-			eperm = true;
-
-#if PTTYPE == 64
-		if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
-			eperm = true;
-#endif
-
-		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
-			int ret;
-			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
-						       sizeof(pte));
-			ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
-						  pte, pte|PT_ACCESSED_MASK);
-			if (unlikely(ret < 0)) {
-				errcode |= PFERR_PRESENT_MASK;
-				goto error;
-			} else if (ret)
-				goto walk;
-
-			mark_page_dirty(vcpu->kvm, table_gfn);
-			pte |= PT_ACCESSED_MASK;
-		}
-
-		walker->pte_access = walker->pt_access &
-				     FNAME(gpte_access)(vcpu, pte);
-
-		walker->ptes[walker->level - 1] = pte;
-
-		if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
-		    ((walker->level == PT_DIRECTORY_LEVEL) &&
-				is_large_pte(pte) &&
-				(PTTYPE == 64 || is_pse(vcpu))) ||
-		    ((walker->level == PT_PDPE_LEVEL) &&
-				is_large_pte(pte) &&
-				mmu->root_level == PT64_ROOT_LEVEL)) {
-			int lvl = walker->level;
-			gpa_t real_gpa;
-			gfn_t gfn;
-			u32 ac;
-
-			gfn = gpte_to_gfn_lvl(pte, lvl);
-			gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
-
-			if (PTTYPE == 32 &&
-			    walker->level == PT_DIRECTORY_LEVEL &&
-			    is_cpuid_PSE36())
-				gfn += pse36_gfn_delta(pte);
-
-			ac = write_fault | fetch_fault | user_fault;
-
-			real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
-						      ac);
-			if (real_gpa == UNMAPPED_GVA)
-				return 0;
-
-			walker->gfn = real_gpa >> PAGE_SHIFT;
-
-			break;
+		case WALK_ERROR:
+			goto error;
+		case WALK_ABORT:
+			return 0;
 		}
 
 		walker->pt_access = walker->pte_access;
 		--walker->level;
 	}
 
+walk_done:
 	if (unlikely(eperm))
 		goto error;
 
@@ -279,7 +300,7 @@  walk:
 			errcode |= PFERR_PRESENT_MASK;
 			goto error;
 		} else if (ret)
-			goto walk;
+			goto walk_retry;
 
 		mark_page_dirty(vcpu->kvm, table_gfn);
 		pte |= PT_DIRTY_MASK;