
[02/12] Subject: [PATCH 02/10] nEPT: Add EPT tables support to paging_tmpl.h

Message ID CAL54oT2ZXBhGr8KkpGyzy0Ve8DoRtik6gJCq6kb+gXzBL3DoWw@mail.gmail.com (mailing list archive)

Commit Message

Nakajima, Jun April 25, 2013, 7:50 a.m. UTC
This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).
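
As a rough illustration of the translation chain involved (this is a toy
sketch, not part of the patch; none of these names are KVM symbols), the C
program below models L2's own page tables, L1's EPT (commonly called EPT12)
and L0's EPT (EPT01) as three composed mappings. The shadow EPT table that
L0 builds (EPT02) folds the last two into one step so the hardware can walk
them without exiting to L0 or L1:

/*
 * Toy model of nested EPT translation: L2 gva -> L2 gpa -> L1 gpa -> host pa.
 * All names and offsets are made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t l2_page_table(uint64_t gva) { return gva + 0x1000; }      /* L2's cr3 tables */
static uint64_t ept12(uint64_t l2_gpa)      { return l2_gpa + 0x100000; } /* L1's EPT */
static uint64_t ept01(uint64_t l1_gpa)      { return l1_gpa + 0x200000; } /* L0's EPT */

int main(void)
{
	uint64_t gva = 0x7f000;

	/*
	 * L0 folds ept12 and ept01 into one shadow table (EPT02), so the
	 * CPU translates L2 gpa -> host pa directly; L2's own cr3 walk
	 * stays entirely inside L2.
	 */
	uint64_t hpa = ept01(ept12(l2_page_table(gva)));

	printf("L2 gva 0x%llx -> host pa 0x%llx\n",
	       (unsigned long long)gva, (unsigned long long)hpa);
	return 0;
}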

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.
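
A minimal, self-contained sketch of that preprocessor "template" pattern
(toy types and a made-up read_pte function, not the real header contents):
compiling the same body twice under different pt_element_t/FNAME definitions
yields two independent sets of functions, just as the double inclusion of
paging_tmpl.h does:

#include <stdio.h>
#include <stdint.h>

/* First "inclusion": the 32-bit flavor. */
#define pt_element_t uint32_t
#define FNAME(name) paging32_##name
static pt_element_t FNAME(read_pte)(pt_element_t pte) { return pte & ~1u; }
#undef FNAME
#undef pt_element_t

/* Second "inclusion": the 64-bit flavor of the same body. */
#define pt_element_t uint64_t
#define FNAME(name) paging64_##name
static pt_element_t FNAME(read_pte)(pt_element_t pte) { return pte & ~1ull; }
#undef FNAME
#undef pt_element_t

int main(void)
{
	/* Both flavors coexist, each with its own pt_element_t width. */
	printf("%u %llu\n", paging32_read_pte(0x1001u),
	       (unsigned long long)paging64_read_pte(0x100000001ull));
	return 0;
}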

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds a third PTTYPE, PTTYPE_EPT, which generates functions
(prefixed with "EPT_") that correctly read and write EPT tables.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>

modified:   arch/x86/kvm/mmu.c
modified:   arch/x86/kvm/paging_tmpl.h
---
 arch/x86/kvm/mmu.c         |   5 ++
 arch/x86/kvm/paging_tmpl.h | 135 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 131 insertions(+), 9 deletions(-)

      struct guest_walker *walker,
@@ -139,6 +201,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
  }
  return 0;
 }
+#endif

 /*
  * Fetch a guest pte for a guest virtual address
@@ -147,7 +210,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
     struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
     gva_t addr, u32 access)
 {
- int ret;
  pt_element_t pte;
  pt_element_t __user *uninitialized_var(ptep_user);
  gfn_t table_gfn;
@@ -162,7 +224,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
  gfn_t gfn;

  trace_kvm_mmu_pagetable_walk(addr, access);
+#if PTTYPE != PTTYPE_EPT
 retry_walk:
+#endif
  walker->level = mmu->root_level;
  pte           = mmu->get_cr3(vcpu);

@@ -215,18 +279,21 @@ retry_walk:

  trace_kvm_mmu_paging_element(pte, walker->level);

- if (unlikely(!is_present_gpte(pte)))
+ if (unlikely(!FNAME(is_present_gpte)(pte)))
  goto error;

  if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
       walker->level))) {
- errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+ errcode |= PFERR_PRESENT_MASK;
+#if PTTYPE != PTTYPE_EPT
+ errcode |= PFERR_RSVD_MASK;
+#endif
  goto error;
  }

  accessed_dirty &= pte;
- pte_access = pt_access & gpte_access(vcpu, pte);
-
+ pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
+
  walker->ptes[walker->level - 1] = pte;
  } while (!is_last_gpte(mmu, walker->level, pte));

@@ -247,6 +314,7 @@ retry_walk:

  walker->gfn = real_gpa >> PAGE_SHIFT;

+#if PTTYPE != PTTYPE_EPT
  if (!write_fault)
  protect_clean_gpte(&pte_access, pte);
  else
@@ -257,12 +325,15 @@ retry_walk:
  accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);

  if (unlikely(!accessed_dirty)) {
+ int ret;
+
  ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
  if (unlikely(ret < 0))
  goto error;
  else if (ret)
  goto retry_walk;
  }
+#endif

  walker->pt_access = pt_access;
  walker->pte_access = pte_access;
@@ -293,6 +364,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
  access);
 }

+#if PTTYPE != PTTYPE_EPT
 static int FNAME(walk_addr_nested)(struct guest_walker *walker,
    struct kvm_vcpu *vcpu, gva_t addr,
    u32 access)
@@ -300,6 +372,29 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
  return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
  addr, access);
 }
+#endif
+
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+  struct kvm_mmu_page *sp, u64 *spte,
+  u64 gpte)
+{
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!is_present_gpte(gpte))
+ goto no_present;
+
+#if PTTYPE != PTTYPE_EPT
+ if (!(gpte & PT_ACCESSED_MASK))
+ goto no_present;
+#endif
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte);
+ return true;
+}

 static bool
 FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -309,13 +404,13 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
  gfn_t gfn;
  pfn_t pfn;

- if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
  return false;

  pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);

  gfn = gpte_to_gfn(gpte);
- pte_access = sp->role.access & gpte_access(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
  protect_clean_gpte(&pte_access, gpte);
  pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
  no_dirty_log && (pte_access & ACC_WRITE_MASK));
@@ -394,6 +489,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
  }
 }

+#if PTTYPE == PTTYPE_EPT
+static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp)
+{
+ u64 spte;
+
+ spte = __pa(sp->spt) | VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+ VMX_EPT_EXECUTABLE_MASK;
+
+ mmu_spte_set(sptep, spte);
+}
+#endif
+
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  * If the guest tries to write a write-protected page, we need to
@@ -446,7 +553,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
  goto out_gpte_changed;

  if (sp)
+#if PTTYPE == PTTYPE_EPT
+ FNAME(link_shadow_page)(it.sptep, sp);
+#else
  link_shadow_page(it.sptep, sp);
+#endif
  }

  for (;
@@ -466,7 +577,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,

  sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
       true, direct_access, it.sptep);
+#if PTTYPE == PTTYPE_EPT
+ FNAME(link_shadow_page)(it.sptep, sp);
+#else
  link_shadow_page(it.sptep, sp);
+#endif
  }

  clear_sp_write_flooding_count(it.sptep);
@@ -724,6 +839,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
  return gpa;
 }

+#if PTTYPE != PTTYPE_EPT
 static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
       u32 access,
       struct x86_exception *exception)
@@ -742,6 +858,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,

  return gpa;
 }
+#endif

 /*
  * Using the cached information from sp->gfns is safe because:
@@ -782,14 +899,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
   sizeof(pt_element_t)))
  return -EINVAL;

- if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
  vcpu->kvm->tlbs_dirty++;
  continue;
  }

  gfn = gpte_to_gfn(gpte);
  pte_access = sp->role.access;
- pte_access &= gpte_access(vcpu, gpte);
+ pte_access &= FNAME(gpte_access)(vcpu, gpte);
  protect_clean_gpte(&pte_access, gpte);

  if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
--
1.8.2.1.610.g562af5b

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 956ca35..91cac19 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3418,6 +3418,11 @@  static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
  return mmu->last_pte_bitmap & (1 << index);
 }

+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 105dd5b..6226b51 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,22 @@ 
  #define PT_LEVEL_BITS PT32_LEVEL_BITS
  #define PT_MAX_FULL_LEVELS 2
  #define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+ #define pt_element_t u64
+ #define guest_walker guest_walkerEPT
+ #define FNAME(name) EPT_##name
+ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #ifdef CONFIG_X86_64
+ #define PT_MAX_FULL_LEVELS 4
+ #define CMPXCHG cmpxchg
+ #else
+ #define CMPXCHG cmpxchg64
+ #define PT_MAX_FULL_LEVELS 2
+ #endif
 #else
  #error Invalid PTTYPE value
 #endif
@@ -80,6 +96,7 @@  static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
  return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }

+#if PTTYPE != PTTYPE_EPT
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
        pt_element_t __user *ptep_user, unsigned index,
        pt_element_t orig_pte, pt_element_t new_pte)
@@ -102,7 +119,52 @@  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,

  return (ret != orig_pte);
 }
+#endif
+
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned access;
+
+#if PTTYPE == PTTYPE_EPT
+ /* We rely here that ACC_WRITE_MASK==VMX_EPT_WRITABLE_MASK */
+ access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
+ ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);
+#else
+ access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+ access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+
+ return access;
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE == PTTYPE_EPT
+ return pte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+ VMX_EPT_EXECUTABLE_MASK);
+#else
+ return is_present_gpte(pte);
+#endif
+}
+
+static inline int FNAME(check_write_user_access)(struct kvm_vcpu *vcpu,
+   bool write_fault, bool user_fault,
+   unsigned long pte)
+{
+#if PTTYPE == PTTYPE_EPT
+ if (unlikely(write_fault && !(pte & VMX_EPT_WRITABLE_MASK)
+ && (user_fault || is_write_protection(vcpu))))
+ return false;
+ return true;
+#else
+ u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+                | (write_fault ? PFERR_WRITE_MASK : 0);
+
+ return !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access);
+#endif
+}

+#if PTTYPE != PTTYPE_EPT
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
      struct kvm_mmu *mmu,