| Message ID | 4E01FDE0.5080800@cn.fujitsu.com (mailing list archive) |
|---|---|
| State | New, archived |
Xiao, On Wed, Jun 22, 2011 at 10:36:16PM +0800, Xiao Guangrong wrote: > The idea is from Avi: > > | We could cache the result of a miss in an spte by using a reserved bit, and > | checking the page fault error code (or seeing if we get an ept violation or > | ept misconfiguration), so if we get repeated mmio on a page, we don't need to > | search the slot list/tree. > | (https://lkml.org/lkml/2011/2/22/221) > > When the page fault is caused by mmio, we cache the info in the shadow page > table, and also set the reserved bits in the shadow page table, so if the mmio > is caused again, we can quickly identify it and emulate it directly > > Searching mmio gfn in memslots is heavy since we need to walk all memeslots, it > can be reduced by this feature, and also avoid walking guest page table for > soft mmu. > > Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> > --- > arch/x86/kvm/mmu.c | 156 ++++++++++++++++++++++++++++++++++++++++++-- > arch/x86/kvm/mmu.h | 1 + > arch/x86/kvm/paging_tmpl.h | 18 ++++-- > arch/x86/kvm/vmx.c | 10 +++- > 4 files changed, 173 insertions(+), 12 deletions(-) > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > index 1319050..e69a47a 100644 > --- a/arch/x86/kvm/mmu.c > +++ b/arch/x86/kvm/mmu.c > @@ -197,6 +197,41 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ > static u64 __read_mostly shadow_user_mask; > static u64 __read_mostly shadow_accessed_mask; > static u64 __read_mostly shadow_dirty_mask; > +static u64 __read_mostly shadow_mmio_mask = (0xffull << 49 | 1ULL); > + > +static void mmu_spte_set(u64 *sptep, u64 spte); > + > +static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) > +{ > + access &= ACC_WRITE_MASK | ACC_USER_MASK; > + > + mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); > +} > + > +static bool is_mmio_spte(u64 spte) > +{ > + return (spte & shadow_mmio_mask) == shadow_mmio_mask; > +} > + > +static gfn_t get_mmio_spte_gfn(u64 spte) > +{ > + return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; > +} > + > +static unsigned get_mmio_spte_access(u64 spte) > +{ > + return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; > +} > + > +static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) > +{ > + if (unlikely(is_mmio_pfn(pfn))) { > + mark_mmio_spte(sptep, gfn, access); > + return true; > + } > + > + return false; > +} > > static inline u64 rsvd_bits(int s, int e) > { > @@ -226,7 +261,7 @@ static int is_nx(struct kvm_vcpu *vcpu) > > static int is_shadow_present_pte(u64 pte) > { > - return pte != 0ull; > + return pte != 0ull && !is_mmio_spte(pte); > } > > static int is_large_pte(u64 pte) > @@ -2123,6 +2158,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, > u64 spte, entry = *sptep; > int ret = 0; > > + if (set_mmio_spte(sptep, gfn, pfn, pte_access)) > + return 0; > + Should zap all mmio sptes when creating new memory slots (current qemu never exhibits that pattern, but its not an unsupported scenario). Easier to zap all mmu in that case. For shadow you need to remove mmio sptes on invlpg and all mmio sptes under CR3 (mmu_sync_roots), other than clearing the gva/gpa variables. Otherwise you can move the mmio info from an mmio spte back to mmio_gva/mmio_gfn after a TLB flush, without rereading the guest pagetable. 
> +{ > + struct kvm_shadow_walk_iterator iterator; > + u64 spte, ret = 0ull; > + > + walk_shadow_page_lockless_begin(vcpu); > + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { > + if (iterator.level == 1) { > + ret = spte; > + break; > + } Have you verified it is safe to walk sptes with parallel modifications to them? Other than shadow page deletion which is in theory covered by RCU. It would be good to have all cases written down. > + > + if (!is_shadow_present_pte(spte)) > + break; > + } > + walk_shadow_page_lockless_end(vcpu); > + > + return ret; > +} > + > +/* > + * If it is a real mmio page fault, return 1 and emulat the instruction > + * directly, return 0 if it needs page fault path to fix it, -1 is > + * returned if bug is detected. > + */ > +int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) > +{ > + u64 spte; > + > + if (quickly_check_mmio_pf(vcpu, addr, direct)) > + return 1; > + > + spte = walk_shadow_page_get_mmio_spte(vcpu, addr); > + > + if (is_mmio_spte(spte)) { > + gfn_t gfn = get_mmio_spte_gfn(spte); > + unsigned access = get_mmio_spte_access(spte); > + > + if (direct) > + addr = 0; > + vcpu_cache_mmio_info(vcpu, addr, gfn, access); > + return 1; > + } > + > + /* > + * It's ok if the gva is remapped by other cpus on shadow guest, > + * it's a BUG if the gfn is not a mmio page. > + */ If the gva is remapped there should be no mmio page fault in the first place. > + if (direct && is_shadow_present_pte(spte)) > + return -1; An spte does not have to contain the present bit to generate a valid EPT misconfiguration (and an spte dump is still required in that case). Use !is_mmio_spte() instead. > + > + /* > + * If the page table is zapped by other cpus, let the page > + * fault path to fix it. > + */ > + return 0; > +} I don't understand when would this happen, can you please explain? 
> +EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); > + > +static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, > + u32 error_code, bool direct) > +{ > + int ret; > + > + if (likely(!(error_code & PFERR_RSVD_MASK))) > + return 0; > + > + ret = handle_mmio_page_fault_common(vcpu, addr, direct); > + WARN_ON(ret < 0); > + return ret; > +} > + > static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, > u32 error_code, bool prefault) > { > @@ -2823,6 +2941,11 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, > int r; > > pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); > + > + r = handle_mmio_page_fault(vcpu, gva, error_code, true); > + if (r) > + return r; > + > r = mmu_topup_memory_caches(vcpu); > if (r) > return r; > @@ -2899,6 +3022,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, > ASSERT(vcpu); > ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); > > + r = handle_mmio_page_fault(vcpu, gpa, error_code, true); > + if (r) > + return r; > + > r = mmu_topup_memory_caches(vcpu); > if (r) > return r; > @@ -2996,6 +3123,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) > return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; > } > > +static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, > + int *nr_present) > +{ > + if (unlikely(is_mmio_spte(*sptep))) { > + if (gfn != get_mmio_spte_gfn(*sptep)) { > + mmu_spte_clear_no_track(sptep); > + return true; > + } > + > + (*nr_present)++; > + mark_mmio_spte(sptep, gfn, access); > + return true; > + } > + > + return false; > +} > + > #define PTTYPE 64 > #include "paging_tmpl.h" > #undef PTTYPE > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h > index 05310b1..ebf09bf 100644 > --- a/arch/x86/kvm/mmu.h > +++ b/arch/x86/kvm/mmu.h > @@ -49,6 +49,7 @@ > #define PFERR_FETCH_MASK (1U << 4) > > int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); > +int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); > int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); > > static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h > index 870be69..cbdc1ae 100644 > --- a/arch/x86/kvm/paging_tmpl.h > +++ b/arch/x86/kvm/paging_tmpl.h > @@ -583,6 +583,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, > > pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); > > + r = handle_mmio_page_fault(vcpu, addr, error_code, mmu_is_nested(vcpu)); > + if (r) > + return r; > + > r = mmu_topup_memory_caches(vcpu); > if (r) > return r; > @@ -786,7 +790,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) > gpa_t pte_gpa; > gfn_t gfn; > > - if (!is_shadow_present_pte(sp->spt[i])) > + if (!sp->spt[i]) > continue; > > pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); > @@ -795,13 +799,18 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) > sizeof(pt_element_t))) > return -EINVAL; > > - gfn = gpte_to_gfn(gpte); > - > if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { > vcpu->kvm->tlbs_dirty++; > continue; > } > > + gfn = gpte_to_gfn(gpte); > + pte_access = sp->role.access; > + pte_access &= FNAME(gpte_access)(vcpu, gpte, true); > + > + if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) > + continue; > + > if (gfn != sp->gfns[i]) { > drop_spte(vcpu->kvm, &sp->spt[i]); > vcpu->kvm->tlbs_dirty++; > 
@@ -809,8 +818,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) > } > > nr_present++; > - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, > - true); > + > host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; > > set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index a644acb..37a49f1 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -4671,11 +4671,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, > static int handle_ept_misconfig(struct kvm_vcpu *vcpu) > { > u64 sptes[4]; > - int nr_sptes, i; > + int nr_sptes, i, ret; > gpa_t gpa; > > gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); > > + ret = handle_mmio_page_fault_common(vcpu, gpa, true); > + if (likely(ret == 1)) > + return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == > + EMULATE_DONE; > + if (unlikely(!ret)) > + return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); > + > + /* It is the real ept misconfig */ > printk(KERN_ERR "EPT: Misconfiguration.\n"); > printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); > > -- > 1.7.5.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
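Marcelo's first point above, zapping every cached MMIO spte when the memslot layout changes, is simplest to implement by zapping the whole MMU from the slot-commit path. A minimal sketch of that idea follows, assuming the arch commit hook of this kernel era; the wiring is an assumption for illustration, not part of the posted patch, and the real hook also does unrelated work (such as write-protecting the new slot for dirty logging) that is omitted here.

    /*
     * Sketch only: drop every cached MMIO spte when the memslot layout
     * changes, following Marcelo's suggestion.  A gfn that used to be an
     * MMIO hole may become RAM after a new slot is added, so any spte
     * carrying the reserved-bit MMIO pattern for that range is stale.
     * The call site is an assumption; kvm_mmu_zap_all() is the existing
     * KVM helper that frees all shadow pages.
     */
    void kvm_arch_commit_memory_region(struct kvm *kvm,
                                       struct kvm_userspace_memory_region *mem,
                                       struct kvm_memory_slot old,
                                       int user_alloc)
    {
            /* All roots are rebuilt on the next fault, so MMIO sptes go away. */
            kvm_mmu_zap_all(kvm);
    }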
Marcelo, Thanks for your review! On 06/23/2011 05:59 AM, Marcelo Tosatti wrote: >> static int is_large_pte(u64 pte) >> @@ -2123,6 +2158,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, >> u64 spte, entry = *sptep; >> int ret = 0; >> >> + if (set_mmio_spte(sptep, gfn, pfn, pte_access)) >> + return 0; >> + > > > Should zap all mmio sptes when creating new memory slots (current qemu > never exhibits that pattern, but its not an unsupported scenario). > Easier to zap all mmu in that case. > OK, will do it in the next version. > For shadow you need to remove mmio sptes on invlpg and all mmio sptes > under CR3 (mmu_sync_roots), other than clearing the gva/gpa variables. > Oh, kvm_mmu_pte_write and invlpg do not zap mmio spte, i will fix these, thanks for your point it out. For mmu_sync_roots, we properly do it in FNAME(sync_page) in this patch: static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { mmu_spte_clear_no_track(sptep); return true; } (*nr_present)++; mark_mmio_spte(sptep, gfn, access); return true; } return false; } > Otherwise you can move the mmio info from an mmio spte back to > mmio_gva/mmio_gfn after a TLB flush, without rereading the guest > pagetable. > We do not read the guest page table when mmio page fault occurred, we just do it as you say: - Firstly, walk the shadow page table to get the mmio spte - Then, cache the mmio spte info to mmio_gva/mmio_gfn I missed your meaning? >> +{ >> + struct kvm_shadow_walk_iterator iterator; >> + u64 spte, ret = 0ull; >> + >> + walk_shadow_page_lockless_begin(vcpu); >> + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { >> + if (iterator.level == 1) { >> + ret = spte; >> + break; >> + } > > Have you verified it is safe to walk sptes with parallel modifications > to them? Other than shadow page deletion which is in theory covered by > RCU. It would be good to have all cases written down. > Um, i think it is safe, when spte is being write, we can get the old spte or the new spte, both cases are ok for us: it is just like the page structure cache on the real machine, the cpu can fetch the old spte even if the page structure is changed by other cpus. >> + >> + if (!is_shadow_present_pte(spte)) >> + break; >> + } >> + walk_shadow_page_lockless_end(vcpu); >> + >> + return ret; >> +} >> + >> +/* >> + * If it is a real mmio page fault, return 1 and emulat the instruction >> + * directly, return 0 if it needs page fault path to fix it, -1 is >> + * returned if bug is detected. >> + */ >> +int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) >> +{ >> + u64 spte; >> + >> + if (quickly_check_mmio_pf(vcpu, addr, direct)) >> + return 1; >> + >> + spte = walk_shadow_page_get_mmio_spte(vcpu, addr); >> + >> + if (is_mmio_spte(spte)) { >> + gfn_t gfn = get_mmio_spte_gfn(spte); >> + unsigned access = get_mmio_spte_access(spte); >> + >> + if (direct) >> + addr = 0; >> + vcpu_cache_mmio_info(vcpu, addr, gfn, access); >> + return 1; >> + } >> + >> + /* >> + * It's ok if the gva is remapped by other cpus on shadow guest, >> + * it's a BUG if the gfn is not a mmio page. >> + */ > > If the gva is remapped there should be no mmio page fault in the first > place. > For example, as follow case: VCPU 0 VCPU 1 gva is mapped to a mmio region access gva remap gva to other page emulate mmio access by kvm (*the point we are discussing*) flush all tlb In theory, it can happen. 
>> + if (direct && is_shadow_present_pte(spte)) >> + return -1; > > An spte does not have to contain the present bit to generate a valid EPT > misconfiguration (and an spte dump is still required in that case). > Use !is_mmio_spte() instead. > We can not use !is_mmio_spte() here, since the shadow page can be zapped anytime, for example: sp.spt[i] = mmio-spte VCPU 0 VCPU 1 Access sp.spte[i], ept misconfig is occurred delete sp (if the number of shadow page is out of the limit or page shrink is required, and other events...) Walk shadow page out of the lock and get the non-present spte (*the point we are discussing*) So, the bug we can detect is: it is the mmio access but the spte is point to the normal page. > >> + >> + /* >> + * If the page table is zapped by other cpus, let the page >> + * fault path to fix it. >> + */ >> + return 0; >> +} > > I don't understand when would this happen, can you please explain? > The case is above :-) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/23/2011 11:19 AM, Xiao Guangrong wrote:
>> Otherwise you can move the mmio info from an mmio spte back to
>> mmio_gva/mmio_gfn after a TLB flush, without rereading the guest
>> pagetable.
>>
>
> We do not read the guest page table when mmio page fault occurred,
> we just do it as you say:
> - Firstly, walk the shadow page table to get the mmio spte
> - Then, cache the mmio spte info to mmio_gva/mmio_gfn
>
> I missed your meaning?

Marcelo, sorry for my poor English; I misread your point, please ignore this. :(
On Thu, Jun 23, 2011 at 11:19:26AM +0800, Xiao Guangrong wrote: > Marcelo, > > Thanks for your review! > > On 06/23/2011 05:59 AM, Marcelo Tosatti wrote: > > >> static int is_large_pte(u64 pte) > >> @@ -2123,6 +2158,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, > >> u64 spte, entry = *sptep; > >> int ret = 0; > >> > >> + if (set_mmio_spte(sptep, gfn, pfn, pte_access)) > >> + return 0; > >> + > > > > > > Should zap all mmio sptes when creating new memory slots (current qemu > > never exhibits that pattern, but its not an unsupported scenario). > > Easier to zap all mmu in that case. > > > > OK, will do it in the next version. > > > For shadow you need to remove mmio sptes on invlpg and all mmio sptes > > under CR3 (mmu_sync_roots), other than clearing the gva/gpa variables. > > > > Oh, kvm_mmu_pte_write and invlpg do not zap mmio spte, i will > fix these, thanks for your point it out. > > For mmu_sync_roots, we properly do it in FNAME(sync_page) in this patch: > > static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, > int *nr_present) > { > if (unlikely(is_mmio_spte(*sptep))) { > if (gfn != get_mmio_spte_gfn(*sptep)) { > mmu_spte_clear_no_track(sptep); > return true; > } > > (*nr_present)++; > mark_mmio_spte(sptep, gfn, access); > return true; > } > > return false; > } > > > Otherwise you can move the mmio info from an mmio spte back to > > mmio_gva/mmio_gfn after a TLB flush, without rereading the guest > > pagetable. > > > > We do not read the guest page table when mmio page fault occurred, > we just do it as you say: > - Firstly, walk the shadow page table to get the mmio spte > - Then, cache the mmio spte info to mmio_gva/mmio_gfn > > I missed your meaning? I meant that guest pagetables must be read again on CR3 switch or invlpg (GVA->GPA can change in that case), which only happens if mmio sptes are zapped. > >> + struct kvm_shadow_walk_iterator iterator; > >> + u64 spte, ret = 0ull; > >> + > >> + walk_shadow_page_lockless_begin(vcpu); > >> + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { > >> + if (iterator.level == 1) { > >> + ret = spte; > >> + break; > >> + } > > > > Have you verified it is safe to walk sptes with parallel modifications > > to them? Other than shadow page deletion which is in theory covered by > > RCU. It would be good to have all cases written down. > > > > Um, i think it is safe, when spte is being write, we can get the old spte > or the new spte, both cases are ok for us: it is just like the page structure > cache on the real machine, the cpu can fetch the old spte even if the page > structure is changed by other cpus. > >> + > >> + if (!is_shadow_present_pte(spte)) > >> + break; > >> + } > >> + walk_shadow_page_lockless_end(vcpu); > >> + > >> + return ret; > >> +} > >> + > >> +/* > >> + * If it is a real mmio page fault, return 1 and emulat the instruction > >> + * directly, return 0 if it needs page fault path to fix it, -1 is > >> + * returned if bug is detected. 
> >> + */ > >> +int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) > >> +{ > >> + u64 spte; > >> + > >> + if (quickly_check_mmio_pf(vcpu, addr, direct)) > >> + return 1; > >> + > >> + spte = walk_shadow_page_get_mmio_spte(vcpu, addr); > >> + > >> + if (is_mmio_spte(spte)) { > >> + gfn_t gfn = get_mmio_spte_gfn(spte); > >> + unsigned access = get_mmio_spte_access(spte); > >> + > >> + if (direct) > >> + addr = 0; > >> + vcpu_cache_mmio_info(vcpu, addr, gfn, access); > >> + return 1; > >> + } > >> + > >> + /* > >> + * It's ok if the gva is remapped by other cpus on shadow guest, > >> + * it's a BUG if the gfn is not a mmio page. > >> + */ > > > > If the gva is remapped there should be no mmio page fault in the first > > place. > > > > For example, as follow case: > > VCPU 0 VCPU 1 > gva is mapped to a mmio region > access gva > remap gva to other page > emulate mmio access by kvm > (*the point we are discussing*) > flush all tlb > > In theory, it can happen. > > >> + if (direct && is_shadow_present_pte(spte)) > >> + return -1; > > > > An spte does not have to contain the present bit to generate a valid EPT > > misconfiguration (and an spte dump is still required in that case). > > Use !is_mmio_spte() instead. > > > > We can not use !is_mmio_spte() here, since the shadow page can be zapped anytime, > for example: > > sp.spt[i] = mmio-spte > > VCPU 0 VCPU 1 > Access sp.spte[i], ept misconfig is occurred > delete sp > (if the number of shadow page is out of the limit > or page shrink is required, and other events...) > > Walk shadow page out of the lock and get the > non-present spte > (*the point we are discussing*) Then is_mmio_spte(non-present spte) == false, right? Point is that it only sptes with precise mmio spte pattern should be considered mmio sptes, otherwise consider a genuine EPT misconfiguration error (which must be reported). What about using fault code instead of spte as Avi suggested instead? > So, the bug we can detect is: it is the mmio access but the spte is point to the normal > page. > > > > >> + > >> + /* > >> + * If the page table is zapped by other cpus, let the page > >> + * fault path to fix it. > >> + */ > >> + return 0; > >> +} > > > > I don't understand when would this happen, can you please explain? > > > > The case is above :-) No need to jump to page fault handler, can let CPU fault again on non present spte. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/23/2011 10:21 PM, Marcelo Tosatti wrote: >>> An spte does not have to contain the present bit to generate a valid EPT >>> misconfiguration (and an spte dump is still required in that case). >>> Use !is_mmio_spte() instead. >>> >> >> We can not use !is_mmio_spte() here, since the shadow page can be zapped anytime, >> for example: >> >> sp.spt[i] = mmio-spte >> >> VCPU 0 VCPU 1 >> Access sp.spte[i], ept misconfig is occurred >> delete sp >> (if the number of shadow page is out of the limit >> or page shrink is required, and other events...) >> >> Walk shadow page out of the lock and get the >> non-present spte >> (*the point we are discussing*) > > Then is_mmio_spte(non-present spte) == false, right? Point is that it > only sptes with precise mmio spte pattern should be considered mmio > sptes, otherwise consider a genuine EPT misconfiguration error (which > must be reported). > No, not all no mmio spte is considered a genuine EPT misconfig, as the above case, we can get !is_mmio_spte(), but it is not the genuine EPT misconfig since it is caused by shadow page zapped > What about using fault code instead of spte as Avi suggested instead? > Do you mean waking guest page table to get mmio gva/mmio gpa for softmmu instead of walking shadow page table? I think it is unsafe, since guest can change the mapping anytime, we can get the wrong mmio gva/mmio gpa to mmio emulate, consider follow case: gva is mapped to the mmio region, we set the reserved bits in the spte: VCPU 0 VCPU 1 Access gva, reserved page fault is occurred map gva to the RAM region Walking guest page table and get the RAM gpa TLB flush (*the point we are discussing*) Then we can get the wrong gpa to mmio emulate, so - VMM can detected the invalid mmio access - the event is missed, it neither accesses the mmio region nor the RAM region, it is not as the real cpu does Anyway, mmio spte is needed to detect bugs for hard mmu >> So, the bug we can detect is: it is the mmio access but the spte is point to the normal >> page. >> >>> >>>> + >>>> + /* >>>> + * If the page table is zapped by other cpus, let the page >>>> + * fault path to fix it. >>>> + */ >>>> + return 0; >>>> +} >>> >>> I don't understand when would this happen, can you please explain? >>> >> >> The case is above :-) > > No need to jump to page fault handler, can let CPU fault again on non > present spte. > It is a good idea, will do, thanks! > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jun 24, 2011 at 01:55:45AM +0800, Xiao Guangrong wrote: > On 06/23/2011 10:21 PM, Marcelo Tosatti wrote: > > >>> An spte does not have to contain the present bit to generate a valid EPT > >>> misconfiguration (and an spte dump is still required in that case). > >>> Use !is_mmio_spte() instead. > >>> > >> > >> We can not use !is_mmio_spte() here, since the shadow page can be zapped anytime, > >> for example: > >> > >> sp.spt[i] = mmio-spte > >> > >> VCPU 0 VCPU 1 > >> Access sp.spte[i], ept misconfig is occurred > >> delete sp > >> (if the number of shadow page is out of the limit > >> or page shrink is required, and other events...) > >> > >> Walk shadow page out of the lock and get the > >> non-present spte > >> (*the point we are discussing*) > > > > Then is_mmio_spte(non-present spte) == false, right? Point is that it > > only sptes with precise mmio spte pattern should be considered mmio > > sptes, otherwise consider a genuine EPT misconfiguration error (which > > must be reported). > > > > No, not all no mmio spte is considered a genuine EPT misconfig, as the above > case, we can get !is_mmio_spte(), but it is not the genuine EPT misconfig > since it is caused by shadow page zapped I mean it must be if (is_mmio_spte(spte)) handle_mmio if (spte == spte_not_present) /* race, let CPU refault */ return handle EPT misconf > > What about using fault code instead of spte as Avi suggested instead? > > > > Do you mean waking guest page table to get mmio gva/mmio gpa for softmmu instead > of walking shadow page table? > > I think it is unsafe, since guest can change the mapping anytime, we can get the > wrong mmio gva/mmio gpa to mmio emulate, consider follow case: > > gva is mapped to the mmio region, we set the reserved bits in the spte: > > VCPU 0 VCPU 1 > Access gva, reserved page fault is occurred > map gva to the RAM region > Walking guest page table and get the RAM gpa TLB flush > (*the point we are discussing*) > > Then we can get the wrong gpa to mmio emulate, so > - VMM can detected the invalid mmio access > - the event is missed, it neither accesses the mmio region nor the RAM region, > it is not as the real cpu does > > Anyway, mmio spte is needed to detect bugs for hard mmu Actually i was thinking about EPT misconf, but there are no other fields than GPA. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/24/2011 04:13 AM, Marcelo Tosatti wrote:
>> No, not all no mmio spte is considered a genuine EPT misconfig, as the above
>> case, we can get !is_mmio_spte(), but it is not the genuine EPT misconfig
>> since it is caused by shadow page zapped
>
> I mean it must be
>
> if (is_mmio_spte(spte))
>         handle_mmio
> if (spte == spte_not_present) /* race, let CPU refault */
>         return
> handle EPT misconf
>

The patch already does it the way you describe :p

    if (is_mmio_spte(spte))
            return handle_mmio
    if (spte_present(spte))
            return handle EPT misconfig
    return page fault path    /* I will fix this: let the CPU refault instead */
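Put together, the flow the two of them converge on for the direct (EPT/NPT) case is: emulate when the spte carries the MMIO pattern, treat a non-present spte as a benign race and let the CPU refault, and report a genuine EPT misconfiguration only for a present non-MMIO spte. Below is a standalone restatement, with names invented purely for illustration; the posted patch returns plain 1 / 0 / -1 from handle_mmio_page_fault_common() instead.

    #include <stdint.h>

    #define MMIO_MASK ((0xffull << 49) | 1ull)   /* pattern used by the patch */

    enum mmio_pf_result {
            MMIO_PF_EMULATE = 1,    /* cached MMIO spte: emulate the access   */
            MMIO_PF_RETRY   = 0,    /* spte zapped under us: let CPU refault  */
            MMIO_PF_BUG     = -1,   /* present non-MMIO spte: real misconfig  */
    };

    static enum mmio_pf_result classify_mmio_fault(uint64_t spte)
    {
            if ((spte & MMIO_MASK) == MMIO_MASK)    /* is_mmio_spte() */
                    return MMIO_PF_EMULATE;

            if (spte == 0)          /* not present: racing shadow-page zap */
                    return MMIO_PF_RETRY;

            return MMIO_PF_BUG;     /* should be reported, with an spte dump */
    }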
On 06/23/2011 08:55 PM, Xiao Guangrong wrote: > Anyway, mmio spte is needed to detect bugs for hard mmu > Note, we have the equivalent problem with bypass_guest_pf=1 and it works fine. We did have an issue with EPT due to a missing tlb flush, but we don't need to design the whole mmu around it.
On 06/22/2011 05:36 PM, Xiao Guangrong wrote:
> The idea is from Avi:
>
> | We could cache the result of a miss in an spte by using a reserved bit, and
> | checking the page fault error code (or seeing if we get an ept violation or
> | ept misconfiguration), so if we get repeated mmio on a page, we don't need to
> | search the slot list/tree.
> | (https://lkml.org/lkml/2011/2/22/221)
>
> When the page fault is caused by mmio, we cache the info in the shadow page
> table, and also set the reserved bits in the shadow page table, so if the mmio
> is caused again, we can quickly identify it and emulate it directly
>
> Searching mmio gfn in memslots is heavy since we need to walk all memeslots, it
> can be reduced by this feature, and also avoid walking guest page table for
> soft mmu.
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 1319050..e69a47a 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -197,6 +197,41 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
> static u64 __read_mostly shadow_user_mask;
> static u64 __read_mostly shadow_accessed_mask;
> static u64 __read_mostly shadow_dirty_mask;
> +static u64 __read_mostly shadow_mmio_mask = (0xffull<< 49 | 1ULL);

One bit is shifted out. And it will fail with 52-bit MAXPHYADDR.

Please in addition set the xwr bits to an invalid pattern on EPT (there is an MSR which specifies which patterns are valid; for example, execute-only or write-only are invalid). If all patterns are valid AND MAXPHYADDR == 52, then just set the mask to 0 and the optimization will be disabled.
On 06/29/2011 05:22 PM, Avi Kivity wrote: > On 06/22/2011 05:36 PM, Xiao Guangrong wrote: >> The idea is from Avi: >> >> | We could cache the result of a miss in an spte by using a reserved bit, and >> | checking the page fault error code (or seeing if we get an ept violation or >> | ept misconfiguration), so if we get repeated mmio on a page, we don't need to >> | search the slot list/tree. >> | (https://lkml.org/lkml/2011/2/22/221) >> >> When the page fault is caused by mmio, we cache the info in the shadow page >> table, and also set the reserved bits in the shadow page table, so if the mmio >> is caused again, we can quickly identify it and emulate it directly >> >> Searching mmio gfn in memslots is heavy since we need to walk all memeslots, it >> can be reduced by this feature, and also avoid walking guest page table for >> soft mmu. >> >> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c >> index 1319050..e69a47a 100644 >> --- a/arch/x86/kvm/mmu.c >> +++ b/arch/x86/kvm/mmu.c >> @@ -197,6 +197,41 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ >> static u64 __read_mostly shadow_user_mask; >> static u64 __read_mostly shadow_accessed_mask; >> static u64 __read_mostly shadow_dirty_mask; >> +static u64 __read_mostly shadow_mmio_mask = (0xffull<< 49 | 1ULL); > > One bit is shifted out. And it will fail with 52-bit MAXPHYADDR. > > Please in addition, set the xwr bits to an invalid pattern on EPT (there is an MSR which specifies which patterns are valid; for example execute-only or write-only are invalid). If all patterns are valid AND MAXPHYADDR == 52, then just set the mask to 0 and it the optimization will be disabled. > OK, will fix, thanks! -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
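One way to act on Avi's request is to stop hard-coding the mask and let the VMX code pick a pattern the hardware is guaranteed to fault on, falling back to a zero mask (optimization disabled) when neither reserved bits nor an invalid xwr combination are available. This is roughly the shape the fix later took, but the names and the exact bit values below are illustrative rather than taken from this patch:

    /* mmu.c: make the MMIO pattern configurable instead of hard-coded. */
    static u64 __read_mostly shadow_mmio_mask;

    void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
    {
            /* A zero mask disables MMIO caching entirely. */
            shadow_mmio_mask = mmio_mask;
    }

    /* vmx.c: choose the pattern when EPT is enabled. */
    static void ept_set_mmio_spte_mask(void)
    {
            /*
             * xwr = 110b (write/execute, no read) is never a valid EPT
             * permission combination, so the access always faults as a
             * misconfiguration even if MAXPHYADDR == 52 leaves no truly
             * reserved physical-address bits to borrow.
             */
            kvm_mmu_set_mmio_spte_mask((0xffull << 49) | 0x6ull);
    }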
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1319050..e69a47a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -197,6 +197,41 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; +static u64 __read_mostly shadow_mmio_mask = (0xffull << 49 | 1ULL); + +static void mmu_spte_set(u64 *sptep, u64 spte); + +static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) +{ + access &= ACC_WRITE_MASK | ACC_USER_MASK; + + mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); +} + +static bool is_mmio_spte(u64 spte) +{ + return (spte & shadow_mmio_mask) == shadow_mmio_mask; +} + +static gfn_t get_mmio_spte_gfn(u64 spte) +{ + return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; +} + +static unsigned get_mmio_spte_access(u64 spte) +{ + return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; +} + +static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) +{ + if (unlikely(is_mmio_pfn(pfn))) { + mark_mmio_spte(sptep, gfn, access); + return true; + } + + return false; +} static inline u64 rsvd_bits(int s, int e) { @@ -226,7 +261,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return pte != 0ull; + return pte != 0ull && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -2123,6 +2158,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte, entry = *sptep; int ret = 0; + if (set_mmio_spte(sptep, gfn, pfn, pte_access)) + return 0; + /* * We don't set the accessed bit, since we sometimes want to see * whether the guest actually used the pte (in order to detect @@ -2258,6 +2296,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, kvm_mmu_flush_tlb(vcpu); } + if (unlikely(is_mmio_spte(*sptep) && emulate)) + *emulate = 1; + pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? 
"2MB" : "4kB", @@ -2484,7 +2525,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, static bool mmu_invalid_pfn(pfn_t pfn) { - return unlikely(is_invalid_pfn(pfn) || is_mmio_pfn(pfn)); + return unlikely(is_invalid_pfn(pfn)); } static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, @@ -2498,11 +2539,8 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, goto exit; } - if (unlikely(is_mmio_pfn(pfn))) { + if (unlikely(is_mmio_pfn(pfn))) vcpu_cache_mmio_info(vcpu, gva, gfn, access); - *ret_val = 1; - goto exit; - } ret = false; exit: @@ -2816,6 +2854,86 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); } +static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) +{ + if (direct) + return vcpu_match_mmio_gpa(vcpu, addr); + + return vcpu_match_mmio_gva(vcpu, addr); +} + +static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) +{ + struct kvm_shadow_walk_iterator iterator; + u64 spte, ret = 0ull; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + if (iterator.level == 1) { + ret = spte; + break; + } + + if (!is_shadow_present_pte(spte)) + break; + } + walk_shadow_page_lockless_end(vcpu); + + return ret; +} + +/* + * If it is a real mmio page fault, return 1 and emulat the instruction + * directly, return 0 if it needs page fault path to fix it, -1 is + * returned if bug is detected. + */ +int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) +{ + u64 spte; + + if (quickly_check_mmio_pf(vcpu, addr, direct)) + return 1; + + spte = walk_shadow_page_get_mmio_spte(vcpu, addr); + + if (is_mmio_spte(spte)) { + gfn_t gfn = get_mmio_spte_gfn(spte); + unsigned access = get_mmio_spte_access(spte); + + if (direct) + addr = 0; + vcpu_cache_mmio_info(vcpu, addr, gfn, access); + return 1; + } + + /* + * It's ok if the gva is remapped by other cpus on shadow guest, + * it's a BUG if the gfn is not a mmio page. + */ + if (direct && is_shadow_present_pte(spte)) + return -1; + + /* + * If the page table is zapped by other cpus, let the page + * fault path to fix it. 
+ */ + return 0; +} +EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); + +static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, + u32 error_code, bool direct) +{ + int ret; + + if (likely(!(error_code & PFERR_RSVD_MASK))) + return 0; + + ret = handle_mmio_page_fault_common(vcpu, addr, direct); + WARN_ON(ret < 0); + return ret; +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) { @@ -2823,6 +2941,11 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int r; pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); + + r = handle_mmio_page_fault(vcpu, gva, error_code, true); + if (r) + return r; + r = mmu_topup_memory_caches(vcpu); if (r) return r; @@ -2899,6 +3022,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + r = handle_mmio_page_fault(vcpu, gpa, error_code, true); + if (r) + return r; + r = mmu_topup_memory_caches(vcpu); if (r) return r; @@ -2996,6 +3123,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; } +static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, + int *nr_present) +{ + if (unlikely(is_mmio_spte(*sptep))) { + if (gfn != get_mmio_spte_gfn(*sptep)) { + mmu_spte_clear_no_track(sptep); + return true; + } + + (*nr_present)++; + mark_mmio_spte(sptep, gfn, access); + return true; + } + + return false; +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 05310b1..ebf09bf 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -49,6 +49,7 @@ #define PFERR_FETCH_MASK (1U << 4) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); +int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 870be69..cbdc1ae 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -583,6 +583,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); + r = handle_mmio_page_fault(vcpu, addr, error_code, mmu_is_nested(vcpu)); + if (r) + return r; + r = mmu_topup_memory_caches(vcpu); if (r) return r; @@ -786,7 +790,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gpa_t pte_gpa; gfn_t gfn; - if (!is_shadow_present_pte(sp->spt[i])) + if (!sp->spt[i]) continue; pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); @@ -795,13 +799,18 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - gfn = gpte_to_gfn(gpte); - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } + gfn = gpte_to_gfn(gpte); + pte_access = sp->role.access; + pte_access &= FNAME(gpte_access)(vcpu, gpte, true); + + if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) + continue; + if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); vcpu->kvm->tlbs_dirty++; @@ -809,8 +818,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) } nr_present++; - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, - true); + host_writable = sp->spt[i] & 
SPTE_HOST_WRITEABLE; set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a644acb..37a49f1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4671,11 +4671,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { u64 sptes[4]; - int nr_sptes, i; + int nr_sptes, i, ret; gpa_t gpa; gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + ret = handle_mmio_page_fault_common(vcpu, gpa, true); + if (likely(ret == 1)) + return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == + EMULATE_DONE; + if (unlikely(!ret)) + return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); + + /* It is the real ept misconfig */ printk(KERN_ERR "EPT: Misconfiguration.\n"); printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
The idea is from Avi:

| We could cache the result of a miss in an spte by using a reserved bit, and
| checking the page fault error code (or seeing if we get an ept violation or
| ept misconfiguration), so if we get repeated mmio on a page, we don't need to
| search the slot list/tree.
| (https://lkml.org/lkml/2011/2/22/221)

When the page fault is caused by MMIO, we cache the info in the shadow page
table and also set the reserved bits in the spte, so if the MMIO access
happens again we can quickly identify it and emulate it directly.

Searching for an MMIO gfn in the memslots is heavy since we need to walk all
memslots; this feature reduces that cost, and it also avoids walking the guest
page table for soft mmu.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
 arch/x86/kvm/mmu.c         | 156 ++++++++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/mmu.h         |   1 +
 arch/x86/kvm/paging_tmpl.h |  18 ++++--
 arch/x86/kvm/vmx.c         |  10 +++-
 4 files changed, 173 insertions(+), 12 deletions(-)
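For readers skimming the archive, the heart of the patch is the encoding sketched below: a reserved-bit pattern, the gfn, and the access bits packed into an otherwise unused spte, so a repeated MMIO access can be recognised and emulated without a memslot walk. This is a condensed userspace restatement of the patch's helpers with a small round-trip check; the gfn value is only an example.

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_MASK  (~((1ull << PAGE_SHIFT) - 1))

    /* Reserved-bit pattern from the patch: bits 56:49 plus bit 0. */
    static const uint64_t mmio_mask = (0xffull << 49) | 1ull;

    static uint64_t encode_mmio(uint64_t gfn, unsigned int access)
    {
            return mmio_mask | access | (gfn << PAGE_SHIFT);
    }

    int main(void)
    {
            /* 0xfee00 is used here only as an example MMIO gfn (local APIC). */
            uint64_t spte = encode_mmio(0xfee00ull, 0x2 /* write access bit */);

            /* The pattern identifies the spte as cached MMIO...            */
            assert((spte & mmio_mask) == mmio_mask);
            /* ...and gfn + access are recovered without touching memslots. */
            assert(((spte & ~mmio_mask) >> PAGE_SHIFT) == 0xfee00ull);
            assert(((spte & ~mmio_mask) & ~PAGE_MASK) == 0x2);
            return 0;
    }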