@@ -69,6 +69,27 @@ char *audit_point_name[] = {
"post sync"
};
+/*
+ * do_walk() returns one of these; the caller acts on the value:
+ *
+ * WALK_NEXT: Descend to the next level of the walk.
+ * WALK_DONE: The final gpte was reached; leave the walk loop.
+ * WALK_RETRY: A gpte changed under us; restart the walk from the root.
+ * WALK_NOT_PRESENT: Set PFERR_PRESENT_MASK and goto error.
+ * WALK_RSVD_FAULT: Set PFERR_RSVD_MASK and goto error.
+ * WALK_ERROR: Goto error without changing the error code.
+ * WALK_ABORT: Return 0 immediately.
+ */
+enum {
+ WALK_NEXT,
+ WALK_DONE,
+ WALK_RETRY,
+ WALK_NOT_PRESENT,
+ WALK_RSVD_FAULT,
+ WALK_ERROR,
+ WALK_ABORT
+};
+
#undef MMU_DEBUG
#ifdef MMU_DEBUG
@@ -114,6 +114,135 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
}
/*
+ * Walk one level of the guest page table.
+ *
+ * On entry *pte holds the gpte that mapped this level (CR3 or a pdptr
+ * for the first iteration); on success it is overwritten with the gpte
+ * read at this level, and *ptep_user receives that gpte's user-space
+ * address.
+ */
+static inline int
+FNAME(do_walk)(struct guest_walker *walker, struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu, gva_t addr, u32 access, bool *eperm,
+ pt_element_t *pte, pt_element_t __user **ptep_user)
+{
+ gfn_t real_gfn;
+ unsigned long host_addr;
+ unsigned index = PT_INDEX(addr, walker->level);
+ int offset = index * sizeof(pt_element_t);
+ gfn_t table_gfn = gpte_to_gfn(*pte);
+ gpa_t pte_gpa = gfn_to_gpa(table_gfn) + offset;
+ const int write_fault = access & PFERR_WRITE_MASK;
+ const int user_fault = access & PFERR_USER_MASK;
+ const int fetch_fault = access & PFERR_FETCH_MASK;
+
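+ /* Remember this level's table gfn and gpte location for the caller. */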
+ walker->table_gfn[walker->level - 1] = table_gfn;
+ walker->pte_gpa[walker->level - 1] = pte_gpa;
+
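+ /*
+ * With nested paging the guest's page tables live in L2 guest
+ * physical memory, so the table gfn itself needs translating.
+ */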
+ real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
+ PFERR_USER_MASK|PFERR_WRITE_MASK);
+ if (unlikely(real_gfn == UNMAPPED_GVA))
+ return WALK_NOT_PRESENT;
+ real_gfn = gpa_to_gfn(real_gfn);
+
+ host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
+ if (unlikely(kvm_is_error_hva(host_addr)))
+ return WALK_NOT_PRESENT;
+
+ *ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
+ if (unlikely(__copy_from_user(pte, *ptep_user, sizeof(*pte))))
+ return WALK_NOT_PRESENT;
+
+ trace_kvm_mmu_paging_element(*pte, walker->level);
+
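+ /* A not-present gpte terminates the walk with a fault. */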
+ if (unlikely(!is_present_gpte(*pte)))
+ return WALK_NOT_PRESENT;
+
+ if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, *pte, walker->level)))
+ return WALK_RSVD_FAULT;
+
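+ /*
+ * Permission violations are only noted in *eperm here; the caller
+ * raises the fault once the walk has completed.
+ */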
+ if (unlikely(write_fault && !is_writable_pte(*pte)
+ && (user_fault || is_write_protection(vcpu))))
+ *eperm = true;
+
+ if (unlikely(user_fault && !(*pte & PT_USER_MASK)))
+ *eperm = true;
+
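+ /* The NX bit exists only in the 64-bit (and PAE) gpte format. */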
+#if PTTYPE == 64
+ if (unlikely(fetch_fault && (*pte & PT64_NX_MASK)))
+ *eperm = true;
+#endif
+
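+ /*
+ * Set the accessed bit atomically; if the gpte changed under us,
+ * cmpxchg_gpte() reports it and the whole walk is retried.
+ */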
+ if (!*eperm && unlikely(!(*pte & PT_ACCESSED_MASK))) {
+ int ret;
+
+ trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(*pte));
+ ret = FNAME(cmpxchg_gpte)(vcpu, mmu, *ptep_user, index,
+ *pte, *pte|PT_ACCESSED_MASK);
+ if (unlikely(ret < 0))
+ return WALK_NOT_PRESENT;
+ else if (ret)
+ return WALK_RETRY;
+
+ mark_page_dirty(vcpu->kvm, table_gfn);
+ *pte |= PT_ACCESSED_MASK;
+ }
+
+ walker->pte_access = walker->pt_access & FNAME(gpte_access)(vcpu, *pte);
+
+ walker->ptes[walker->level - 1] = *pte;
+
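+ /*
+ * The walk terminates at a 4k pte, at a large pde when PSE or
+ * long mode allows it, or at a 1G pdpte in long mode.
+ */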
+ if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
+ ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(*pte) &&
+ (PTTYPE == 64 || is_pse(vcpu))) ||
+ ((walker->level == PT_PDPE_LEVEL) && is_large_pte(*pte) &&
+ (mmu->root_level == PT64_ROOT_LEVEL))) {
+ gpa_t real_gpa;
+ gfn_t gfn;
+ u32 ac;
+
+ gfn = gpte_to_gfn_lvl(*pte, walker->level);
+ gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
+
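+ /* PSE-36 pdes carry extra high physical address bits. */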
+ if (PTTYPE == 32 && (walker->level == PT_DIRECTORY_LEVEL) &&
+ is_cpuid_PSE36())
+ gfn += pse36_gfn_delta(*pte);
+
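+ /* Let the nested translation check the final access type. */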
+ ac = write_fault | fetch_fault | user_fault;
+
+ real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), ac);
+ if (real_gpa == UNMAPPED_GVA)
+ return WALK_ABORT;
+
+ walker->gfn = real_gpa >> PAGE_SHIFT;
+
+ return WALK_DONE;
+ }
+
+ return WALK_NEXT;
+}
+
+/*
* Fetch a guest pte for a guest virtual address
*/
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
@@ -130,7 +259,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
fetch_fault);
-walk:
+walk_retry:
eperm = false;
walker->level = mmu->root_level;
pte = mmu->get_cr3(vcpu);
@@ -152,118 +281,34 @@ walk:
walker->pt_access = ACC_ALL;
for (;;) {
- gfn_t real_gfn;
- unsigned long host_addr;
- unsigned index = PT_INDEX(addr, walker->level);
- int offset = index * sizeof(pt_element_t);
- gfn_t table_gfn = gpte_to_gfn(pte);
- gpa_t pte_gpa = gfn_to_gpa(table_gfn) + offset;
-
- walker->table_gfn[walker->level - 1] = table_gfn;
- walker->pte_gpa[walker->level - 1] = pte_gpa;
-
- real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
- PFERR_USER_MASK|PFERR_WRITE_MASK);
- if (unlikely(real_gfn == UNMAPPED_GVA)) {
- errcode |= PFERR_PRESENT_MASK;
- goto error;
- }
- real_gfn = gpa_to_gfn(real_gfn);
-
- host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
- if (unlikely(kvm_is_error_hva(host_addr))) {
- errcode |= PFERR_PRESENT_MASK;
- goto error;
- }
-
- ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
- if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
- errcode |= PFERR_PRESENT_MASK;
- goto error;
- }
-
- trace_kvm_mmu_paging_element(pte, walker->level);
+ int ret;
- if (unlikely(!is_present_gpte(pte))) {
+ ret = FNAME(do_walk)(walker, vcpu, mmu, addr, access,
+ &eperm, &pte, &ptep_user);
+ switch (ret) {
+ case WALK_NEXT:
+ break;
+ case WALK_DONE:
+ goto walk_done;
+ case WALK_RETRY:
+ goto walk_retry;
+ case WALK_NOT_PRESENT:
errcode |= PFERR_PRESENT_MASK;
goto error;
- }
-
- if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
- walker->level))) {
+ case WALK_RSVD_FAULT:
errcode |= PFERR_RSVD_MASK;
goto error;
- }
-
- if (unlikely(write_fault && !is_writable_pte(pte)
- && (user_fault || is_write_protection(vcpu))))
- eperm = true;
-
- if (unlikely(user_fault && !(pte & PT_USER_MASK)))
- eperm = true;
-
-#if PTTYPE == 64
- if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
- eperm = true;
-#endif
-
- if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
- int ret;
- trace_kvm_mmu_set_accessed_bit(table_gfn, index,
- sizeof(pte));
- ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
- pte, pte|PT_ACCESSED_MASK);
- if (unlikely(ret < 0)) {
- errcode |= PFERR_PRESENT_MASK;
- goto error;
- } else if (ret)
- goto walk;
-
- mark_page_dirty(vcpu->kvm, table_gfn);
- pte |= PT_ACCESSED_MASK;
- }
-
- walker->pte_access = walker->pt_access &
- FNAME(gpte_access)(vcpu, pte);
-
- walker->ptes[walker->level - 1] = pte;
-
- if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
- ((walker->level == PT_DIRECTORY_LEVEL) &&
- is_large_pte(pte) &&
- (PTTYPE == 64 || is_pse(vcpu))) ||
- ((walker->level == PT_PDPE_LEVEL) &&
- is_large_pte(pte) &&
- mmu->root_level == PT64_ROOT_LEVEL)) {
- int lvl = walker->level;
- gpa_t real_gpa;
- gfn_t gfn;
- u32 ac;
-
- gfn = gpte_to_gfn_lvl(pte, lvl);
- gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
-
- if (PTTYPE == 32 &&
- walker->level == PT_DIRECTORY_LEVEL &&
- is_cpuid_PSE36())
- gfn += pse36_gfn_delta(pte);
-
- ac = write_fault | fetch_fault | user_fault;
-
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
- ac);
- if (real_gpa == UNMAPPED_GVA)
- return 0;
-
- walker->gfn = real_gpa >> PAGE_SHIFT;
-
- break;
+ case WALK_ERROR:
+ goto error;
+ case WALK_ABORT:
+ return 0;
}
walker->pt_access = walker->pte_access;
--walker->level;
}
+walk_done:
if (unlikely(eperm))
goto error;
@@ -279,7 +324,7 @@ walk:
errcode |= PFERR_PRESENT_MASK;
goto error;
} else if (ret)
- goto walk;
+ goto walk_retry;
mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_DIRTY_MASK;