
[02/10] nEPT: Add EPT tables support to paging_tmpl.h

Message ID 201208011437.q71Ebfif023819@rice.haifa.ibm.com (mailing list archive)
State New, archived

Commit Message

Nadav Har'El Aug. 1, 2012, 2:37 p.m. UTC
This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.
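
For readers unfamiliar with this template trick, here is a condensed sketch of the mechanism (simplified; the real paging_tmpl.h defines many more per-PTTYPE macros, and the prefixes shown follow the existing code):

	/* mmu.c instantiates the template once per guest page-table format: */
	#define PTTYPE 64
	#include "paging_tmpl.h"	/* generates paging64_walk_addr_generic(), ... */
	#undef PTTYPE

	#define PTTYPE 32
	#include "paging_tmpl.h"	/* generates paging32_walk_addr_generic(), ... */
	#undef PTTYPE

	/* paging_tmpl.h picks per-format types and a name prefix: */
	#if PTTYPE == 64
		#define pt_element_t u64
		#define FNAME(name) paging##64_##name
	#elif PTTYPE == 32
		#define pt_element_t u32
		#define FNAME(name) paging##32_##name
	#endif

	/* so every function in the template, e.g. FNAME(walk_addr_generic),
	 * expands to paging64_walk_addr_generic or paging32_walk_addr_generic. */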

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.
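
Concretely, the walker cares about bits like these (values as in the x86 page-table format and the kernel's vmx.h definitions of the time, reproduced here only for illustration):

	/* x86 64-bit page-table entry bits used by the guest walker */
	#define PT_PRESENT_MASK		(1ULL << 0)	/* P   */
	#define PT_WRITABLE_MASK	(1ULL << 1)	/* R/W */
	#define PT_USER_MASK		(1ULL << 2)	/* U/S */
	#define PT_ACCESSED_MASK	(1ULL << 5)	/* A   */
	#define PT_DIRTY_MASK		(1ULL << 6)	/* D   */
	#define PT64_NX_MASK		(1ULL << 63)	/* XD  */

	/* EPT entry bits: there is no single present bit - an entry is present
	 * if it is readable, writable or executable; execute permission is a
	 * positive bit rather than an NX bit, there is no user/supervisor
	 * distinction, and (on the CPUs targeted here) no accessed/dirty bits. */
	#define VMX_EPT_READABLE_MASK	0x1ULL
	#define VMX_EPT_WRITABLE_MASK	0x2ULL
	#define VMX_EPT_EXECUTABLE_MASK	0x4ULL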

So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") which correctly read and write EPT tables.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
---
 arch/x86/kvm/mmu.c         |   14 +----
 arch/x86/kvm/paging_tmpl.h |   98 ++++++++++++++++++++++++++++++++---
 2 files changed, 96 insertions(+), 16 deletions(-)



Comments

Xiao Guangrong Aug. 2, 2012, 4 a.m. UTC | #1
On 08/01/2012 10:37 PM, Nadav Har'El wrote:
> This is the first patch in a series which adds nested EPT support to KVM's
> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
> to set its own cr3 and take its own page faults without either of L0 or L1
> getting involved. This often significantly improves L2's performance over the
> previous two alternatives (shadow page tables over EPT, and shadow page
> tables over shadow page tables).
> 
> This patch adds EPT support to paging_tmpl.h.
> 
> paging_tmpl.h contains the code for reading and writing page tables. The code
> for 32-bit and 64-bit tables is very similar, but not identical, so
> paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
> with PTTYPE=64, and this generates the two sets of similar functions.
> 
> There are subtle but important differences between the format of EPT tables
> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> third set of functions to read the guest EPT table and to write the shadow
> EPT table.
> 
> So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
> with "EPT") which correctly read and write EPT tables.
> 

Now paging_tmpl.h becomes really untidy and hard to read; maybe we need
to abstract the operations that depend on PTTYPE.
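
One possible shape for such an abstraction (a purely hypothetical sketch, not something proposed verbatim in this thread; the PT_GUEST_* name is invented for illustration):

	/* Hypothetical sketch: choose a per-PTTYPE "guest present" mask once,
	 * so the walker body needs no #if PTTYPE == PTTYPE_EPT checks. */
	#if PTTYPE == PTTYPE_EPT
		#define PT_GUEST_PRESENT_MASK	(VMX_EPT_READABLE_MASK | \
						 VMX_EPT_WRITABLE_MASK | \
						 VMX_EPT_EXECUTABLE_MASK)
	#else
		#define PT_GUEST_PRESENT_MASK	PT_PRESENT_MASK
	#endif

	static inline int FNAME(is_present_gpte)(u64 pte)
	{
		return pte & PT_GUEST_PRESENT_MASK;
	}

Similar per-format masks could cover the writable/executable and accessed/dirty bits, which is roughly what the comments further down suggest.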

> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> ---
>  arch/x86/kvm/mmu.c         |   14 +----
>  arch/x86/kvm/paging_tmpl.h |   98 ++++++++++++++++++++++++++++++++---
>  2 files changed, 96 insertions(+), 16 deletions(-)
> 
> --- .before/arch/x86/kvm/mmu.c	2012-08-01 17:22:46.000000000 +0300
> +++ .after/arch/x86/kvm/mmu.c	2012-08-01 17:22:46.000000000 +0300
> @@ -1971,15 +1971,6 @@ static void shadow_walk_next(struct kvm_
>  	return __shadow_walk_next(iterator, *iterator->sptep);
>  }
> 
> -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
> -{
> -	u64 spte;
> -
> -	spte = __pa(sp->spt)
> -		| PT_PRESENT_MASK | PT_ACCESSED_MASK
> -		| PT_WRITABLE_MASK | PT_USER_MASK;
> -	mmu_spte_set(sptep, spte);
> -}
> 
>  static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  				   unsigned direct_access)
> @@ -3427,6 +3418,11 @@ static bool sync_mmio_spte(u64 *sptep, g
>  	return false;
>  }
> 
> +#define PTTYPE_EPT 18 /* arbitrary */
> +#define PTTYPE PTTYPE_EPT
> +#include "paging_tmpl.h"
> +#undef PTTYPE
> +
>  #define PTTYPE 64
>  #include "paging_tmpl.h"
>  #undef PTTYPE
> --- .before/arch/x86/kvm/paging_tmpl.h	2012-08-01 17:22:46.000000000 +0300
> +++ .after/arch/x86/kvm/paging_tmpl.h	2012-08-01 17:22:46.000000000 +0300
> @@ -50,6 +50,22 @@
>  	#define PT_LEVEL_BITS PT32_LEVEL_BITS
>  	#define PT_MAX_FULL_LEVELS 2
>  	#define CMPXCHG cmpxchg
> +#elif PTTYPE == PTTYPE_EPT
> +	#define pt_element_t u64
> +	#define guest_walker guest_walkerEPT
> +	#define FNAME(name) EPT_##name
> +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> +	#ifdef CONFIG_X86_64
> +	#define PT_MAX_FULL_LEVELS 4
> +	#define CMPXCHG cmpxchg
> +	#else
> +	#define CMPXCHG cmpxchg64
> +	#define PT_MAX_FULL_LEVELS 2
> +	#endif

Missing the case of FULL_LEVELS == 3? Oh, you mentioned it
as the PAE case in PATCH 0.

>  #else
>  	#error Invalid PTTYPE value
>  #endif
> @@ -78,6 +94,7 @@ static gfn_t gpte_to_gfn_lvl(pt_element_
>  	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
>  }
> 
> +#if PTTYPE != PTTYPE_EPT
>  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  			       pt_element_t __user *ptep_user, unsigned index,
>  			       pt_element_t orig_pte, pt_element_t new_pte)
> @@ -100,15 +117,22 @@ static int FNAME(cmpxchg_gpte)(struct kv
> 
>  	return (ret != orig_pte);
>  }
> +#endif
> 

Note that A/D bits are supported on new Intel CPUs, so this function should be reworked
for nEPT. I know you did not expose this feature to the guest, but we can reduce
the difference between nEPT and the other MMU models if A/D bits are supported.
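
For reference, EPT accessed/dirty bits live in bits 8 and 9 of an EPT entry (dirty only in leaf entries) on the CPUs that support them; a hypothetical sketch of per-format A/D masks, in the same spirit as the sketch above (names invented here, not taken from the patch):

	/* Hypothetical per-format guest A/D masks (illustrative names only). */
	#if PTTYPE == PTTYPE_EPT
		#define PT_GUEST_ACCESSED_MASK	(1ULL << 8)	/* EPT accessed bit */
		#define PT_GUEST_DIRTY_MASK	(1ULL << 9)	/* EPT dirty bit */
	#else
		#define PT_GUEST_ACCESSED_MASK	PT_ACCESSED_MASK
		#define PT_GUEST_DIRTY_MASK	PT_DIRTY_MASK
	#endif

With masks like these, the accessed-bit update inside walk_addr_generic() and the dirty-bit update after the walk could test the mask instead of being compiled out with #if PTTYPE != PTTYPE_EPT.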

>  static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
>  				   bool last)
>  {
>  	unsigned access;
> 
> +#if PTTYPE == PTTYPE_EPT
> +	/* We rely here that ACC_WRITE_MASK==VMX_EPT_WRITABLE_MASK */
> +	access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
> +			((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);
> +#else
>  	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
>  	if (last && !is_dirty_gpte(gpte))
>  		access &= ~ACC_WRITE_MASK;
> +#endif
> 

Maybe we can introduce PT_xxx_MASK macros to abstract the access bits.

>  #if PTTYPE == 64
>  	if (vcpu->arch.mmu.nx)
> @@ -135,6 +159,30 @@ static bool FNAME(is_last_gpte)(struct g
>  	return false;
>  }
> 
> +static inline int FNAME(is_present_gpte)(unsigned long pte)
> +{
> +#if PTTYPE == PTTYPE_EPT
> +	return pte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
> +			VMX_EPT_EXECUTABLE_MASK);
> +#else
> +	return is_present_gpte(pte);
> +#endif
> +}
> +

Introducing PT_PRESENT_BITS can eliminate the #if dependence, and we need
to rework is_present_gpte() since it is also used outside of paging_tmpl.h.

> +static inline int FNAME(check_write_user_access)(struct kvm_vcpu *vcpu,
> +					   bool write_fault, bool user_fault,
> +					   unsigned long pte)
> +{
> +#if PTTYPE == PTTYPE_EPT
> +	if (unlikely(write_fault && !(pte & VMX_EPT_WRITABLE_MASK)
> +	      && (user_fault || is_write_protection(vcpu))))
> +		return false;
> +	return true;
> +#else
> +	return check_write_user_access(vcpu, write_fault, user_fault, pte);
> +#endif
> +}
> +

Ditto; check_write_user_access() needs to be reworked as well.

>  /*
>   * Fetch a guest pte for a guest virtual address
>   */
> @@ -155,7 +203,9 @@ static int FNAME(walk_addr_generic)(stru
>  	u16 errcode = 0;
> 
>  	trace_kvm_mmu_pagetable_walk(addr, access);
> +#if PTTYPE != PTTYPE_EPT
>  retry_walk:
> +#endif
>  	eperm = false;
>  	walker->level = mmu->root_level;
>  	pte           = mmu->get_cr3(vcpu);
> @@ -202,7 +252,7 @@ retry_walk:
> 
>  		trace_kvm_mmu_paging_element(pte, walker->level);
> 
> -		if (unlikely(!is_present_gpte(pte)))
> +		if (unlikely(!FNAME(is_present_gpte)(pte)))
>  			goto error;
> 
>  		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
> @@ -211,13 +261,16 @@ retry_walk:
>  			goto error;
>  		}
> 
> -		if (!check_write_user_access(vcpu, write_fault, user_fault,
> -					  pte))
> +		if (!FNAME(check_write_user_access)(vcpu, write_fault,
> +					  user_fault, pte))
>  			eperm = true;
> 
>  #if PTTYPE == 64
>  		if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
>  			eperm = true;
> +#elif PTTYPE == PTTYPE_EPT
> +		if (unlikely(fetch_fault && !(pte & VMX_EPT_EXECUTABLE_MASK)))
> +			eperm = true;
>  #endif
> 
>  		last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
> @@ -225,12 +278,15 @@ retry_walk:
>  			pte_access = pt_access &
>  				     FNAME(gpte_access)(vcpu, pte, true);
>  			/* check if the kernel is fetching from user page */
> +#if PTTYPE != PTTYPE_EPT
>  			if (unlikely(pte_access & PT_USER_MASK) &&
>  			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
>  				if (fetch_fault && !user_fault)
>  					eperm = true;
> +#endif
>  		}
> 
> +#if PTTYPE != PTTYPE_EPT
>  		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
>  			int ret;
>  			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
> @@ -245,6 +301,7 @@ retry_walk:
>  			mark_page_dirty(vcpu->kvm, table_gfn);
>  			pte |= PT_ACCESSED_MASK;
>  		}
> +#endif

If A/D bits are supported, can these differences be removed?

> 
>  		walker->ptes[walker->level - 1] = pte;
> 
> @@ -283,6 +340,7 @@ retry_walk:
>  		goto error;
>  	}
> 
> +#if PTTYPE != PTTYPE_EPT
>  	if (write_fault && unlikely(!is_dirty_gpte(pte))) {
>  		int ret;
> 
> @@ -298,6 +356,7 @@ retry_walk:
>  		pte |= PT_DIRTY_MASK;
>  		walker->ptes[walker->level - 1] = pte;
>  	}
> +#endif
> 
>  	walker->pt_access = pt_access;
>  	walker->pte_access = pte_access;
> @@ -328,6 +387,7 @@ static int FNAME(walk_addr)(struct guest
>  					access);
>  }
> 
> +#if PTTYPE != PTTYPE_EPT
>  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>  				   struct kvm_vcpu *vcpu, gva_t addr,
>  				   u32 access)
> @@ -335,6 +395,7 @@ static int FNAME(walk_addr_nested)(struc
>  	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
>  					addr, access);
>  }
> +#endif
> 

Hmm, you do not need the special walking functions?

>  static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
>  				    struct kvm_mmu_page *sp, u64 *spte,
> @@ -343,11 +404,13 @@ static bool FNAME(prefetch_invalid_gpte)
>  	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
>  		goto no_present;
> 
> -	if (!is_present_gpte(gpte))
> +	if (!FNAME(is_present_gpte)(gpte))
>  		goto no_present;
> 
> +#if PTTYPE != PTTYPE_EPT
>  	if (!(gpte & PT_ACCESSED_MASK))
>  		goto no_present;
> +#endif
> 
>  	return false;
> 
> @@ -458,6 +521,20 @@ static void FNAME(pte_prefetch)(struct k
>  			     pfn, true, true);
>  	}
>  }
> +static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp)
> +{
> +	u64 spte;
> +
> +	spte = __pa(sp->spt)
> +#if PTTYPE == PTTYPE_EPT
> +		| VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK
> +		| VMX_EPT_EXECUTABLE_MASK;
> +#else
> +		| PT_PRESENT_MASK | PT_ACCESSED_MASK
> +		| PT_WRITABLE_MASK | PT_USER_MASK;
> +#endif
> +	mmu_spte_set(sptep, spte);
> +}
> 
>  /*
>   * Fetch a shadow pte for a specific level in the paging hierarchy.
> @@ -474,7 +551,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>  	unsigned direct_access;
>  	struct kvm_shadow_walk_iterator it;
> 
> -	if (!is_present_gpte(gw->ptes[gw->level - 1]))
> +	if (!FNAME(is_present_gpte)(gw->ptes[gw->level - 1]))
>  		return NULL;
> 
>  	direct_access = gw->pte_access;
> @@ -514,7 +591,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>  			goto out_gpte_changed;
> 
>  		if (sp)
> -			link_shadow_page(it.sptep, sp);
> +			FNAME(link_shadow_page)(it.sptep, sp);
>  	}
> 
>  	for (;
> @@ -534,10 +611,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
> 
>  		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
>  				      true, direct_access, it.sptep);
> -		link_shadow_page(it.sptep, sp);
> +		FNAME(link_shadow_page)(it.sptep, sp);
>  	}
> 
>  	clear_sp_write_flooding_count(it.sptep);
> +	/* TODO: Consider if everything that set_spte() does is correct when
> +	   the shadow page table is actually EPT. Most is fine (for direct_map)
> +	   but it appears there may be a few wrong corner cases with
> +	   PT_USER_MASK, PT64_NX_MASK, etc., and I need to review everything
> +	 */

Maybe it is OK, but you need to take care of A/D bits (currently you do not
expose A/D bits to the guest; however, they may be supported on L0).

>  	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
>  		     user_fault, write_fault, emulate, it.level,
>  		     gw->gfn, pfn, prefault, map_writable);
> @@ -733,6 +815,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
>  	return gpa;
>  }
> 
> +#if PTTYPE != PTTYPE_EPT
>  static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
>  				      u32 access,
>  				      struct x86_exception *exception)
> @@ -751,6 +834,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(st
> 
>  	return gpa;
>  }
> +#endif

Why is it not needed?



Nadav Har'El Aug. 2, 2012, 9:25 p.m. UTC | #2
On Thu, Aug 02, 2012, Xiao Guangrong wrote about "Re: [PATCH 02/10] nEPT: Add EPT tables support to paging_tmpl.h":
> > +	#ifdef CONFIG_X86_64
> > +	#define PT_MAX_FULL_LEVELS 4
> > +	#define CMPXCHG cmpxchg
> > +	#else
> > +	#define CMPXCHG cmpxchg64
> > +	#define PT_MAX_FULL_LEVELS 2
> > +	#endif
> 
> Missing the case of FULL_LEVELS == 3? Oh, you mentioned it
> as the PAE case in PATCH 0.

I understood this differently (and it would not be surprising if
wrongly...): With nested EPT, we only deal with two *EPT* tables -
the shadowed page table and shadow page table are both EPT.
And EPT tables cannot have three levels - even if PAE is used. Or at least,
that's what I thought...

> Note that A/D bits are supported on new Intel CPUs, so this function should be reworked
> for nEPT. I know you did not expose this feature to the guest, but we can reduce
> the difference between nEPT and the other MMU models if A/D bits are supported.

I'm not sure what you meant: if the access/dirty bits are supported in
newer CPUs, do you think we *should* also support them in the L1
processor, or are you saying that it would be easier to support them
because this is what the shadow page table code normally does anyway,
so *not* supporting them will take effort?

> > +#if PTTYPE != PTTYPE_EPT
> >  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
> >  				   struct kvm_vcpu *vcpu, gva_t addr,
> >  				   u32 access)
> > @@ -335,6 +395,7 @@ static int FNAME(walk_addr_nested)(struc
> >  	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
> >  					addr, access);
> >  }
> > +#endif
> > 
> 
> Hmm, you do not need the special walking functions?

Since these functions are static, the compiler warns me on every
function that is never used, so I had to #if them out...
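
(For completeness, one hypothetical alternative to the #if guards would be to keep the helpers compiled for every PTTYPE and silence the warning instead; a sketch, not what the patch does:)

	static int __maybe_unused FNAME(walk_addr_nested)(struct guest_walker *walker,
							  struct kvm_vcpu *vcpu,
							  gva_t addr, u32 access)
	{
		return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
						addr, access);
	}
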
Xiao Guangrong Aug. 3, 2012, 8:08 a.m. UTC | #3
On 08/03/2012 05:25 AM, Nadav Har'El wrote:
> On Thu, Aug 02, 2012, Xiao Guangrong wrote about "Re: [PATCH 02/10] nEPT: Add EPT tables support to paging_tmpl.h":
>>> +	#ifdef CONFIG_X86_64
>>> +	#define PT_MAX_FULL_LEVELS 4
>>> +	#define CMPXCHG cmpxchg
>>> +	#else
>>> +	#define CMPXCHG cmpxchg64
>>> +	#define PT_MAX_FULL_LEVELS 2
>>> +	#endif
>>
>> Missing the case of FULL_LEVELS == 3? Oh, you mentioned it
>> as the PAE case in PATCH 0.
> 
> I understood this differently (and it would not be surprising if
> wrongly...): With nested EPT, we only deal with two *EPT* tables -
> the shadowed page table and shadow page table are both EPT.
> And EPT tables cannot have three levels - even if PAE is used. Or at least,
> that's what I thought...
> 
>> Note that A/D bits are supported on new Intel CPUs, so this function should be reworked
>> for nEPT. I know you did not expose this feature to the guest, but we can reduce
>> the difference between nEPT and the other MMU models if A/D bits are supported.
> 
> I'm not sure what you meant: if the access/dirty bits are supported in
> newer CPUs, do you think we *should* also support them in the L1
> processor, or are you saying that it would be easier to support them
> because this is what the shadow page table code normally does anyway,
> so *not* supporting them will take effort?

I mean "it would be easier to support them
 because this is what the shadow page table code normally does anyway,
 so *not* supporting them will take effort" :)

Then we can drop the "#if PTTYPE != PTTYPE_EPT" guards...

Actually, we can redefine some bits (like PRESENT, WRITABLE, DIRTY...) to
let the paging_tmpl code work for all models.

> 
>>> +#if PTTYPE != PTTYPE_EPT
>>>  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>>>  				   struct kvm_vcpu *vcpu, gva_t addr,
>>>  				   u32 access)
>>> @@ -335,6 +395,7 @@ static int FNAME(walk_addr_nested)(struc
>>>  	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
>>>  					addr, access);
>>>  }
>>> +#endif
>>>
>>
>> Hmm, you do not need the special walking functions?
> 
> Since these functions are static, the compiler warns me on every
> function that is never used, so I had to #if them out...
> 
> 

IIUC, you did not implement the functions (like walk_addr_nested) that translate
an L2 VA to an L2 PA, yes? (They are needed for emulation.)


Patch

--- .before/arch/x86/kvm/mmu.c	2012-08-01 17:22:46.000000000 +0300
+++ .after/arch/x86/kvm/mmu.c	2012-08-01 17:22:46.000000000 +0300
@@ -1971,15 +1971,6 @@  static void shadow_walk_next(struct kvm_
 	return __shadow_walk_next(iterator, *iterator->sptep);
 }
 
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
-{
-	u64 spte;
-
-	spte = __pa(sp->spt)
-		| PT_PRESENT_MASK | PT_ACCESSED_MASK
-		| PT_WRITABLE_MASK | PT_USER_MASK;
-	mmu_spte_set(sptep, spte);
-}
 
 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 				   unsigned direct_access)
@@ -3427,6 +3418,11 @@  static bool sync_mmio_spte(u64 *sptep, g
 	return false;
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
--- .before/arch/x86/kvm/paging_tmpl.h	2012-08-01 17:22:46.000000000 +0300
+++ .after/arch/x86/kvm/paging_tmpl.h	2012-08-01 17:22:46.000000000 +0300
@@ -50,6 +50,22 @@ 
 	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
 	#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+	#define pt_element_t u64
+	#define guest_walker guest_walkerEPT
+	#define FNAME(name) EPT_##name
+	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+	#define PT_LEVEL_BITS PT64_LEVEL_BITS
+	#ifdef CONFIG_X86_64
+	#define PT_MAX_FULL_LEVELS 4
+	#define CMPXCHG cmpxchg
+	#else
+	#define CMPXCHG cmpxchg64
+	#define PT_MAX_FULL_LEVELS 2
+	#endif
 #else
 	#error Invalid PTTYPE value
 #endif
@@ -78,6 +94,7 @@  static gfn_t gpte_to_gfn_lvl(pt_element_
 	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+#if PTTYPE != PTTYPE_EPT
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			       pt_element_t __user *ptep_user, unsigned index,
 			       pt_element_t orig_pte, pt_element_t new_pte)
@@ -100,15 +117,22 @@  static int FNAME(cmpxchg_gpte)(struct kv
 
 	return (ret != orig_pte);
 }
+#endif
 
 static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
 				   bool last)
 {
 	unsigned access;
 
+#if PTTYPE == PTTYPE_EPT
+	/* We rely here that ACC_WRITE_MASK==VMX_EPT_WRITABLE_MASK */
+	access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
+			((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);
+#else
 	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
 	if (last && !is_dirty_gpte(gpte))
 		access &= ~ACC_WRITE_MASK;
+#endif
 
 #if PTTYPE == 64
 	if (vcpu->arch.mmu.nx)
@@ -135,6 +159,30 @@  static bool FNAME(is_last_gpte)(struct g
 	return false;
 }
 
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE == PTTYPE_EPT
+	return pte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+			VMX_EPT_EXECUTABLE_MASK);
+#else
+	return is_present_gpte(pte);
+#endif
+}
+
+static inline int FNAME(check_write_user_access)(struct kvm_vcpu *vcpu,
+					   bool write_fault, bool user_fault,
+					   unsigned long pte)
+{
+#if PTTYPE == PTTYPE_EPT
+	if (unlikely(write_fault && !(pte & VMX_EPT_WRITABLE_MASK)
+	      && (user_fault || is_write_protection(vcpu))))
+		return false;
+	return true;
+#else
+	return check_write_user_access(vcpu, write_fault, user_fault, pte);
+#endif
+}
+
 /*
  * Fetch a guest pte for a guest virtual address
  */
@@ -155,7 +203,9 @@  static int FNAME(walk_addr_generic)(stru
 	u16 errcode = 0;
 
 	trace_kvm_mmu_pagetable_walk(addr, access);
+#if PTTYPE != PTTYPE_EPT
 retry_walk:
+#endif
 	eperm = false;
 	walker->level = mmu->root_level;
 	pte           = mmu->get_cr3(vcpu);
@@ -202,7 +252,7 @@  retry_walk:
 
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
-		if (unlikely(!is_present_gpte(pte)))
+		if (unlikely(!FNAME(is_present_gpte)(pte)))
 			goto error;
 
 		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
@@ -211,13 +261,16 @@  retry_walk:
 			goto error;
 		}
 
-		if (!check_write_user_access(vcpu, write_fault, user_fault,
-					  pte))
+		if (!FNAME(check_write_user_access)(vcpu, write_fault,
+					  user_fault, pte))
 			eperm = true;
 
 #if PTTYPE == 64
 		if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
 			eperm = true;
+#elif PTTYPE == PTTYPE_EPT
+		if (unlikely(fetch_fault && !(pte & VMX_EPT_EXECUTABLE_MASK)))
+			eperm = true;
 #endif
 
 		last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
@@ -225,12 +278,15 @@  retry_walk:
 			pte_access = pt_access &
 				     FNAME(gpte_access)(vcpu, pte, true);
 			/* check if the kernel is fetching from user page */
+#if PTTYPE != PTTYPE_EPT
 			if (unlikely(pte_access & PT_USER_MASK) &&
 			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
 				if (fetch_fault && !user_fault)
 					eperm = true;
+#endif
 		}
 
+#if PTTYPE != PTTYPE_EPT
 		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
 			int ret;
 			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
@@ -245,6 +301,7 @@  retry_walk:
 			mark_page_dirty(vcpu->kvm, table_gfn);
 			pte |= PT_ACCESSED_MASK;
 		}
+#endif
 
 		walker->ptes[walker->level - 1] = pte;
 
@@ -283,6 +340,7 @@  retry_walk:
 		goto error;
 	}
 
+#if PTTYPE != PTTYPE_EPT
 	if (write_fault && unlikely(!is_dirty_gpte(pte))) {
 		int ret;
 
@@ -298,6 +356,7 @@  retry_walk:
 		pte |= PT_DIRTY_MASK;
 		walker->ptes[walker->level - 1] = pte;
 	}
+#endif
 
 	walker->pt_access = pt_access;
 	walker->pte_access = pte_access;
@@ -328,6 +387,7 @@  static int FNAME(walk_addr)(struct guest
 					access);
 }
 
+#if PTTYPE != PTTYPE_EPT
 static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 				   struct kvm_vcpu *vcpu, gva_t addr,
 				   u32 access)
@@ -335,6 +395,7 @@  static int FNAME(walk_addr_nested)(struc
 	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
 					addr, access);
 }
+#endif
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp, u64 *spte,
@@ -343,11 +404,13 @@  static bool FNAME(prefetch_invalid_gpte)
 	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
 		goto no_present;
 
-	if (!is_present_gpte(gpte))
+	if (!FNAME(is_present_gpte)(gpte))
 		goto no_present;
 
+#if PTTYPE != PTTYPE_EPT
 	if (!(gpte & PT_ACCESSED_MASK))
 		goto no_present;
+#endif
 
 	return false;
 
@@ -458,6 +521,20 @@  static void FNAME(pte_prefetch)(struct k
 			     pfn, true, true);
 	}
 }
+static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp)
+{
+	u64 spte;
+
+	spte = __pa(sp->spt)
+#if PTTYPE == PTTYPE_EPT
+		| VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK
+		| VMX_EPT_EXECUTABLE_MASK;
+#else
+		| PT_PRESENT_MASK | PT_ACCESSED_MASK
+		| PT_WRITABLE_MASK | PT_USER_MASK;
+#endif
+	mmu_spte_set(sptep, spte);
+}
 
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
@@ -474,7 +551,7 @@  static u64 *FNAME(fetch)(struct kvm_vcpu
 	unsigned direct_access;
 	struct kvm_shadow_walk_iterator it;
 
-	if (!is_present_gpte(gw->ptes[gw->level - 1]))
+	if (!FNAME(is_present_gpte)(gw->ptes[gw->level - 1]))
 		return NULL;
 
 	direct_access = gw->pte_access;
@@ -514,7 +591,7 @@  static u64 *FNAME(fetch)(struct kvm_vcpu
 			goto out_gpte_changed;
 
 		if (sp)
-			link_shadow_page(it.sptep, sp);
+			FNAME(link_shadow_page)(it.sptep, sp);
 	}
 
 	for (;
@@ -534,10 +611,15 @@  static u64 *FNAME(fetch)(struct kvm_vcpu
 
 		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
 				      true, direct_access, it.sptep);
-		link_shadow_page(it.sptep, sp);
+		FNAME(link_shadow_page)(it.sptep, sp);
 	}
 
 	clear_sp_write_flooding_count(it.sptep);
+	/* TODO: Consider if everything that set_spte() does is correct when
+	   the shadow page table is actually EPT. Most is fine (for direct_map)
+	   but it appears there may be a few wrong corner cases with
+	   PT_USER_MASK, PT64_NX_MASK, etc., and I need to review everything
+	 */
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
 		     user_fault, write_fault, emulate, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
@@ -733,6 +815,7 @@  static gpa_t FNAME(gva_to_gpa)(struct kv
 	return gpa;
 }
 
+#if PTTYPE != PTTYPE_EPT
 static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 				      u32 access,
 				      struct x86_exception *exception)
@@ -751,6 +834,7 @@  static gpa_t FNAME(gva_to_gpa_nested)(st
 
 	return gpa;
 }
+#endif
 
 /*
  * Using the cached information from sp->gfns is safe because: