| Message ID | 20220915142913.2213336-8-chao.p.peng@linux.intel.com (mailing list archive) |
|---|---|
| State | New |
| Series | KVM: mm: fd-based approach for supporting KVM |
On Thu, Sep 15, 2022, Chao Peng wrote:
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index a0f198cede3d..81ab20003824 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3028,6 +3028,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
>  			break;
>  	}
>  
> +	if (kvm_mem_is_private(kvm, gfn))

Rather than reload the Xarray info, which is unnecessary overhead, pass in
@is_private.  The caller must hold mmu_lock, i.e. invalidations from
private<->shared conversions will be stalled and will zap the new SPTE if the
state is changed.

E.g.

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d68944f07b4b..44eea47697d8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3072,8 +3072,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	 * Enforce the iTLB multihit workaround after capturing the requested
 	 * level, which will be used to do precise, accurate accounting.
 	 */
-	fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
-						     fault->gfn, fault->max_level);
+	fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, fault->gfn,
+						     fault->max_level, fault->is_private);
 	if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
 		return;
 
@@ -6460,7 +6460,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 	 */
 	if (sp->role.direct &&
 	    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
-							       PG_LEVEL_NUM)) {
+							       PG_LEVEL_NUM, false)) {
 		kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
 
 		if (kvm_available_flush_tlb_with_range())
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 7670c13ce251..9acdf72537ce 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -315,6 +315,12 @@ static inline bool is_dirty_spte(u64 spte)
 	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
 }
 
+static inline bool is_private_spte(u64 spte)
+{
+	/* FIXME: Query C-bit/S-bit for SEV/TDX. */
+	return false;
+}
+
 static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
 				int level)
 {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 672f0432d777..69ba00157e90 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1767,8 +1767,9 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
 		if (iter.gfn < start || iter.gfn >= end)
 			continue;
 
-		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
-							      iter.gfn, PG_LEVEL_NUM);
+		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
+							      PG_LEVEL_NUM,
+							      is_private_spte(iter.old_spte));
 		if (max_mapping_level < iter.level)
 			continue;
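For illustration only (this sketch is not part of the thread): with the suggested signature, the callee would trust the caller-supplied flag and skip the host-side hugepage checks for private GFNs. The body below mirrors the upstream kvm_mmu_max_mapping_level() of this era and may differ in minor details; only the @is_private handling is the point.

int kvm_mmu_max_mapping_level(struct kvm *kvm,
			      const struct kvm_memory_slot *slot, gfn_t gfn,
			      int max_level, bool is_private)
{
	struct kvm_lpage_info *linfo;
	int host_level;

	max_level = min(max_level, max_huge_page_level);
	for ( ; max_level > PG_LEVEL_4K; max_level--) {
		linfo = lpage_info_slot(gfn, slot, max_level);
		if (!linfo->disallow_lpage)
			break;
	}

	/*
	 * Private memory has no host userspace mapping, so the host mapping
	 * level cannot constrain it; trust the flag the caller computed
	 * under mmu_lock instead of reloading the Xarray here.
	 */
	if (is_private)
		return max_level;

	if (max_level == PG_LEVEL_4K)
		return PG_LEVEL_4K;

	/* Shared GFNs are still clamped to the host mapping level. */
	host_level = host_pfn_mapping_level(kvm, gfn, slot);
	return min(host_level, max_level);
}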
On Fri, Oct 14, 2022 at 06:57:20PM +0000, Sean Christopherson wrote:
> On Thu, Sep 15, 2022, Chao Peng wrote:
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index a0f198cede3d..81ab20003824 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -3028,6 +3028,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> >  			break;
> >  	}
> >  
> > +	if (kvm_mem_is_private(kvm, gfn))
> 
> Rather than reload the Xarray info, which is unnecessary overhead, pass in
> @is_private.  The caller must hold mmu_lock, i.e. invalidations from
> private<->shared conversions will be stalled and will zap the new SPTE if the
> state is changed.

Makes sense. TDX/SEV should be easy to query that.

Chao
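As a rough illustration of the query Chao alludes to (a sketch, not from the thread): the FIXME in is_private_spte() could be resolved if the platform code publishes how "private" is encoded in the SPTE, e.g. the SEV C-bit or TDX's shared/S-bit. The shadow_private_mask and shadow_private_value names below are made up for this sketch.

/* Hypothetical: filled in by SEV/TDX setup code, zero when unsupported. */
extern u64 shadow_private_mask;
extern u64 shadow_private_value;

static inline bool is_private_spte(u64 spte)
{
	/* No private-memory support configured, so nothing is private. */
	if (!shadow_private_mask)
		return false;

	/*
	 * The mask/value pair hides the per-vendor polarity of the C-bit
	 * (SEV) or shared bit (TDX) behind a single comparison.
	 */
	return (spte & shadow_private_mask) == shadow_private_value;
}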
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a0f198cede3d..81ab20003824 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3028,6 +3028,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
 			break;
 	}
 
+	if (kvm_mem_is_private(kvm, gfn))
+		return max_level;
+
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
 
@@ -4127,6 +4130,32 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
 }
 
+static inline u8 order_to_level(int order)
+{
+	BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
+
+	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
+		return PG_LEVEL_1G;
+
+	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+		return PG_LEVEL_2M;
+
+	return PG_LEVEL_4K;
+}
+
+static int kvm_faultin_pfn_private(struct kvm_page_fault *fault)
+{
+	int order;
+	struct kvm_memory_slot *slot = fault->slot;
+
+	if (kvm_private_mem_get_pfn(slot, fault->gfn, &fault->pfn, &order))
+		return RET_PF_RETRY;
+
+	fault->max_level = min(order_to_level(order), fault->max_level);
+	fault->map_writable = !(slot->flags & KVM_MEM_READONLY);
+	return RET_PF_CONTINUE;
+}
+
 static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_memory_slot *slot = fault->slot;
@@ -4159,6 +4188,22 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 			return RET_PF_EMULATE;
 	}
 
+	if (kvm_slot_can_be_private(slot) &&
+	    fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+		vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
+		if (fault->is_private)
+			vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE;
+		else
+			vcpu->run->memory.flags = 0;
+		vcpu->run->memory.padding = 0;
+		vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT;
+		vcpu->run->memory.size = PAGE_SIZE;
+		return RET_PF_USER;
+	}
+
+	if (fault->is_private)
+		return kvm_faultin_pfn_private(fault);
+
 	async = false;
 	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
 					  fault->write, &fault->map_writable,
@@ -4267,7 +4312,11 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		read_unlock(&vcpu->kvm->mmu_lock);
 	else
 		write_unlock(&vcpu->kvm->mmu_lock);
-	kvm_release_pfn_clean(fault->pfn);
+
+	if (fault->is_private)
+		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
+	else
+		kvm_release_pfn_clean(fault->pfn);
 	return r;
 }
 
@@ -5543,6 +5592,9 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
 		return -EIO;
 	}
 
+	if (r == RET_PF_USER)
+		return 0;
+
 	if (r < 0)
 		return r;
 	if (r != RET_PF_EMULATE)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 582def531d4d..a55e352246a7 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -188,6 +188,7 @@ struct kvm_page_fault {
 
 	/* Derived from mmu and global state.  */
 	const bool is_tdp;
+	const bool is_private;
 	const bool nx_huge_page_workaround_enabled;
 
 	/*
@@ -236,6 +237,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
  * RET_PF_RETRY: let CPU fault again on the address.
  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ * RET_PF_USER: need to exit to userspace to handle this fault.
  * RET_PF_FIXED: The faulting entry has been fixed.
  * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
  *
@@ -252,6 +254,7 @@ enum {
 	RET_PF_RETRY,
 	RET_PF_EMULATE,
 	RET_PF_INVALID,
+	RET_PF_USER,
 	RET_PF_FIXED,
 	RET_PF_SPURIOUS,
 };
@@ -318,4 +321,19 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
+#ifndef CONFIG_HAVE_KVM_PRIVATE_MEM
+static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot,
+					  gfn_t gfn, kvm_pfn_t *pfn, int *order)
+{
+	WARN_ON_ONCE(1);
+	return -EOPNOTSUPP;
+}
+
+static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot,
+					   kvm_pfn_t pfn)
+{
+	WARN_ON_ONCE(1);
+}
+#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index ae86820cef69..2d7555381955 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -58,6 +58,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
 TRACE_DEFINE_ENUM(RET_PF_RETRY);
 TRACE_DEFINE_ENUM(RET_PF_EMULATE);
 TRACE_DEFINE_ENUM(RET_PF_INVALID);
+TRACE_DEFINE_ENUM(RET_PF_USER);
 TRACE_DEFINE_ENUM(RET_PF_FIXED);
 TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fd36ce6597ad..b9906cdf468b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2292,6 +2292,30 @@ static inline void kvm_arch_update_mem_attr(struct kvm *kvm, unsigned int attr,
 }
 #endif
 
+static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot,
+					  gfn_t gfn, kvm_pfn_t *pfn, int *order)
+{
+	int ret;
+	pfn_t pfnt;
+	pgoff_t index = gfn - slot->base_gfn +
+			(slot->private_offset >> PAGE_SHIFT);
+
+	ret = inaccessible_get_pfn(slot->private_file, index, &pfnt, order);
+	*pfn = pfn_t_to_pfn(pfnt);
+	return ret;
+}
+
+static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot,
+					   kvm_pfn_t pfn)
+{
+	inaccessible_put_pfn(slot->private_file, pfn_to_pfn_t(pfn));
+}
+
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+	return !xa_load(&kvm->mem_attr_array, gfn);
+}
+
 #endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
 
 #endif
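For context on the new RET_PF_USER path (illustration only, not part of the patch): when kvm_faultin_pfn() sees a mismatch between the fault's private-ness and the GFN's current attribute, the vCPU exits to userspace with KVM_EXIT_MEMORY_FAULT and the VMM is expected to convert the range and re-enter. A minimal sketch of that handling follows; it assumes the kvm_run "memory" exit fields added by this series, and vmm_convert_memory() is a hypothetical helper standing in for whatever conversion interface the rest of the series exposes.

#include <stdbool.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical VMM helper: flips the private/shared attribute for a GPA
 * range via the conversion uAPI provided elsewhere in the series. */
void vmm_convert_memory(uint64_t gpa, uint64_t size, bool to_private);

static void vcpu_run_once(int vcpu_fd, struct kvm_run *run)
{
	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
		return;

	if (run->exit_reason == KVM_EXIT_MEMORY_FAULT) {
		bool to_private = run->memory.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE;

		/* Convert [gpa, gpa + size), then re-enter the vCPU so the
		 * stage-2 fault that triggered RET_PF_USER is retried. */
		vmm_convert_memory(run->memory.gpa, run->memory.size, to_private);
	}
}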