Message ID | 20221202061347.1070246-9-chao.p.peng@linux.intel.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | KVM: mm: fd-based approach for supporting KVM | expand |
On Fri, Dec 02, 2022 at 02:13:46PM +0800, Chao Peng wrote: > A KVM_MEM_PRIVATE memslot can include both fd-based private memory and > hva-based shared memory. Architecture code (like TDX code) can tell > whether the on-going fault is private or not. This patch adds a > 'is_private' field to kvm_page_fault to indicate this and architecture > code is expected to set it. > > To handle page fault for such memslot, the handling logic is different > depending on whether the fault is private or shared. KVM checks if > 'is_private' matches the host's view of the page (maintained in > mem_attr_array). > - For a successful match, private pfn is obtained with > restrictedmem_get_page() and shared pfn is obtained with existing > get_user_pages(). > - For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to > userspace. Userspace then can convert memory between private/shared > in host's view and retry the fault. > > Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com> > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com> > --- > arch/x86/kvm/mmu/mmu.c | 63 +++++++++++++++++++++++++++++++-- > arch/x86/kvm/mmu/mmu_internal.h | 14 +++++++- > arch/x86/kvm/mmu/mmutrace.h | 1 + > arch/x86/kvm/mmu/tdp_mmu.c | 2 +- > include/linux/kvm_host.h | 30 ++++++++++++++++ > 5 files changed, 105 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 2190fd8c95c0..b1953ebc012e 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -3058,7 +3058,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, > > int kvm_mmu_max_mapping_level(struct kvm *kvm, > const struct kvm_memory_slot *slot, gfn_t gfn, > - int max_level) > + int max_level, bool is_private) > { > struct kvm_lpage_info *linfo; > int host_level; > @@ -3070,6 +3070,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, > break; > } > > + if (is_private) > + return max_level; lpage mixed information already saved, so is that possible to query info->disallow_lpage without care 'is_private' ? > + > if (max_level == PG_LEVEL_4K) > return PG_LEVEL_4K; > > @@ -3098,7 +3101,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > * level, which will be used to do precise, accurate accounting. > */ > fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, > - fault->gfn, fault->max_level); > + fault->gfn, fault->max_level, > + fault->is_private); > if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) > return; > > @@ -4178,6 +4182,49 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) > kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); > } > > +static inline u8 order_to_level(int order) > +{ > + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); > + > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) > + return PG_LEVEL_1G; > + > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) > + return PG_LEVEL_2M; > + > + return PG_LEVEL_4K; > +} > + > +static int kvm_do_memory_fault_exit(struct kvm_vcpu *vcpu, > + struct kvm_page_fault *fault) > +{ > + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; > + if (fault->is_private) > + vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE; > + else > + vcpu->run->memory.flags = 0; > + vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT; > + vcpu->run->memory.size = PAGE_SIZE; > + return RET_PF_USER; > +} > + > +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, > + struct kvm_page_fault *fault) > +{ > + int order; > + struct kvm_memory_slot *slot = fault->slot; > + > + if (!kvm_slot_can_be_private(slot)) > + return kvm_do_memory_fault_exit(vcpu, fault); > + > + if (kvm_restricted_mem_get_pfn(slot, fault->gfn, &fault->pfn, &order)) > + return RET_PF_RETRY; > + > + fault->max_level = min(order_to_level(order), fault->max_level); > + fault->map_writable = !(slot->flags & KVM_MEM_READONLY); > + return RET_PF_CONTINUE; > +} > + > static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > { > struct kvm_memory_slot *slot = fault->slot; > @@ -4210,6 +4257,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > return RET_PF_EMULATE; > } > > + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) > + return kvm_do_memory_fault_exit(vcpu, fault); > + > + if (fault->is_private) > + return kvm_faultin_pfn_private(vcpu, fault); > + > async = false; > fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, > fault->write, &fault->map_writable, > @@ -5599,6 +5652,9 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err > return -EIO; > } > > + if (r == RET_PF_USER) > + return 0; > + > if (r < 0) > return r; > if (r != RET_PF_EMULATE) > @@ -6452,7 +6508,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, > */ > if (sp->role.direct && > sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, > - PG_LEVEL_NUM)) { > + PG_LEVEL_NUM, > + false)) { > kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); > > if (kvm_available_flush_tlb_with_range()) > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h > index dbaf6755c5a7..5ccf08183b00 100644 > --- a/arch/x86/kvm/mmu/mmu_internal.h > +++ b/arch/x86/kvm/mmu/mmu_internal.h > @@ -189,6 +189,7 @@ struct kvm_page_fault { > > /* Derived from mmu and global state. */ > const bool is_tdp; > + const bool is_private; > const bool nx_huge_page_workaround_enabled; > > /* > @@ -237,6 +238,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); > * RET_PF_RETRY: let CPU fault again on the address. > * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. > * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. > + * RET_PF_USER: need to exit to userspace to handle this fault. > * RET_PF_FIXED: The faulting entry has been fixed. > * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. > * > @@ -253,6 +255,7 @@ enum { > RET_PF_RETRY, > RET_PF_EMULATE, > RET_PF_INVALID, > + RET_PF_USER, > RET_PF_FIXED, > RET_PF_SPURIOUS, > }; > @@ -310,7 +313,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, > > int kvm_mmu_max_mapping_level(struct kvm *kvm, > const struct kvm_memory_slot *slot, gfn_t gfn, > - int max_level); > + int max_level, bool is_private); > void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); > void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); > > @@ -319,4 +322,13 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); > void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); > void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); > > +#ifndef CONFIG_HAVE_KVM_RESTRICTED_MEM > +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > +{ > + WARN_ON_ONCE(1); > + return -EOPNOTSUPP; > +} > +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > + > #endif /* __KVM_X86_MMU_INTERNAL_H */ > diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h > index ae86820cef69..2d7555381955 100644 > --- a/arch/x86/kvm/mmu/mmutrace.h > +++ b/arch/x86/kvm/mmu/mmutrace.h > @@ -58,6 +58,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); > TRACE_DEFINE_ENUM(RET_PF_RETRY); > TRACE_DEFINE_ENUM(RET_PF_EMULATE); > TRACE_DEFINE_ENUM(RET_PF_INVALID); > +TRACE_DEFINE_ENUM(RET_PF_USER); > TRACE_DEFINE_ENUM(RET_PF_FIXED); > TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); > > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c > index 771210ce5181..8ba1a4afc546 100644 > --- a/arch/x86/kvm/mmu/tdp_mmu.c > +++ b/arch/x86/kvm/mmu/tdp_mmu.c > @@ -1768,7 +1768,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, > continue; > > max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, > - iter.gfn, PG_LEVEL_NUM); > + iter.gfn, PG_LEVEL_NUM, false); > if (max_mapping_level < iter.level) > continue; > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 25099c94e770..153842bb33df 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -2335,4 +2335,34 @@ static inline void kvm_arch_set_memory_attributes(struct kvm *kvm, > } > #endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */ > > +#ifdef CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > +{ > + return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)) & > + KVM_MEMORY_ATTRIBUTE_PRIVATE; > +} > +#else > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > +{ > + return false; > +} > + > +#endif /* CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES */ > + > +#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM > +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > +{ > + int ret; > + struct page *page; > + pgoff_t index = gfn - slot->base_gfn + > + (slot->restricted_offset >> PAGE_SHIFT); > + > + ret = restrictedmem_get_page(slot->restricted_file, index, > + &page, order); > + *pfn = page_to_pfn(page); > + return ret; > +} > +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > + > #endif > -- > 2.25.1 > >
On Thu, Dec 08, 2022 at 10:29:18AM +0800, Yuan Yao wrote: > On Fri, Dec 02, 2022 at 02:13:46PM +0800, Chao Peng wrote: > > A KVM_MEM_PRIVATE memslot can include both fd-based private memory and > > hva-based shared memory. Architecture code (like TDX code) can tell > > whether the on-going fault is private or not. This patch adds a > > 'is_private' field to kvm_page_fault to indicate this and architecture > > code is expected to set it. > > > > To handle page fault for such memslot, the handling logic is different > > depending on whether the fault is private or shared. KVM checks if > > 'is_private' matches the host's view of the page (maintained in > > mem_attr_array). > > - For a successful match, private pfn is obtained with > > restrictedmem_get_page() and shared pfn is obtained with existing > > get_user_pages(). > > - For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to > > userspace. Userspace then can convert memory between private/shared > > in host's view and retry the fault. > > > > Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com> > > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> > > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com> > > --- > > arch/x86/kvm/mmu/mmu.c | 63 +++++++++++++++++++++++++++++++-- > > arch/x86/kvm/mmu/mmu_internal.h | 14 +++++++- > > arch/x86/kvm/mmu/mmutrace.h | 1 + > > arch/x86/kvm/mmu/tdp_mmu.c | 2 +- > > include/linux/kvm_host.h | 30 ++++++++++++++++ > > 5 files changed, 105 insertions(+), 5 deletions(-) > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > index 2190fd8c95c0..b1953ebc012e 100644 > > --- a/arch/x86/kvm/mmu/mmu.c > > +++ b/arch/x86/kvm/mmu/mmu.c > > @@ -3058,7 +3058,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, > > > > int kvm_mmu_max_mapping_level(struct kvm *kvm, > > const struct kvm_memory_slot *slot, gfn_t gfn, > > - int max_level) > > + int max_level, bool is_private) > > { > > struct kvm_lpage_info *linfo; > > int host_level; > > @@ -3070,6 +3070,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, > > break; > > } > > > > + if (is_private) > > + return max_level; > > lpage mixed information already saved, so is that possible > to query info->disallow_lpage without care 'is_private' ? Actually we already queried info->disallow_lpage just before this sentence. The check is needed because later in the function we call host_pfn_mapping_level() which is shared memory specific. Thanks, Chao > > > + > > if (max_level == PG_LEVEL_4K) > > return PG_LEVEL_4K; > > > > @@ -3098,7 +3101,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > * level, which will be used to do precise, accurate accounting. > > */ > > fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, > > - fault->gfn, fault->max_level); > > + fault->gfn, fault->max_level, > > + fault->is_private); > > if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) > > return; > > > > @@ -4178,6 +4182,49 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) > > kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); > > } > > > > +static inline u8 order_to_level(int order) > > +{ > > + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); > > + > > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) > > + return PG_LEVEL_1G; > > + > > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) > > + return PG_LEVEL_2M; > > + > > + return PG_LEVEL_4K; > > +} > > + > > +static int kvm_do_memory_fault_exit(struct kvm_vcpu *vcpu, > > + struct kvm_page_fault *fault) > > +{ > > + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; > > + if (fault->is_private) > > + vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE; > > + else > > + vcpu->run->memory.flags = 0; > > + vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT; > > + vcpu->run->memory.size = PAGE_SIZE; > > + return RET_PF_USER; > > +} > > + > > +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, > > + struct kvm_page_fault *fault) > > +{ > > + int order; > > + struct kvm_memory_slot *slot = fault->slot; > > + > > + if (!kvm_slot_can_be_private(slot)) > > + return kvm_do_memory_fault_exit(vcpu, fault); > > + > > + if (kvm_restricted_mem_get_pfn(slot, fault->gfn, &fault->pfn, &order)) > > + return RET_PF_RETRY; > > + > > + fault->max_level = min(order_to_level(order), fault->max_level); > > + fault->map_writable = !(slot->flags & KVM_MEM_READONLY); > > + return RET_PF_CONTINUE; > > +} > > + > > static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > > { > > struct kvm_memory_slot *slot = fault->slot; > > @@ -4210,6 +4257,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > > return RET_PF_EMULATE; > > } > > > > + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) > > + return kvm_do_memory_fault_exit(vcpu, fault); > > + > > + if (fault->is_private) > > + return kvm_faultin_pfn_private(vcpu, fault); > > + > > async = false; > > fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, > > fault->write, &fault->map_writable, > > @@ -5599,6 +5652,9 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err > > return -EIO; > > } > > > > + if (r == RET_PF_USER) > > + return 0; > > + > > if (r < 0) > > return r; > > if (r != RET_PF_EMULATE) > > @@ -6452,7 +6508,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, > > */ > > if (sp->role.direct && > > sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, > > - PG_LEVEL_NUM)) { > > + PG_LEVEL_NUM, > > + false)) { > > kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); > > > > if (kvm_available_flush_tlb_with_range()) > > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h > > index dbaf6755c5a7..5ccf08183b00 100644 > > --- a/arch/x86/kvm/mmu/mmu_internal.h > > +++ b/arch/x86/kvm/mmu/mmu_internal.h > > @@ -189,6 +189,7 @@ struct kvm_page_fault { > > > > /* Derived from mmu and global state. */ > > const bool is_tdp; > > + const bool is_private; > > const bool nx_huge_page_workaround_enabled; > > > > /* > > @@ -237,6 +238,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); > > * RET_PF_RETRY: let CPU fault again on the address. > > * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. > > * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. > > + * RET_PF_USER: need to exit to userspace to handle this fault. > > * RET_PF_FIXED: The faulting entry has been fixed. > > * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. > > * > > @@ -253,6 +255,7 @@ enum { > > RET_PF_RETRY, > > RET_PF_EMULATE, > > RET_PF_INVALID, > > + RET_PF_USER, > > RET_PF_FIXED, > > RET_PF_SPURIOUS, > > }; > > @@ -310,7 +313,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, > > > > int kvm_mmu_max_mapping_level(struct kvm *kvm, > > const struct kvm_memory_slot *slot, gfn_t gfn, > > - int max_level); > > + int max_level, bool is_private); > > void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); > > void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); > > > > @@ -319,4 +322,13 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); > > void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); > > void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); > > > > +#ifndef CONFIG_HAVE_KVM_RESTRICTED_MEM > > +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > > +{ > > + WARN_ON_ONCE(1); > > + return -EOPNOTSUPP; > > +} > > +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > > + > > #endif /* __KVM_X86_MMU_INTERNAL_H */ > > diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h > > index ae86820cef69..2d7555381955 100644 > > --- a/arch/x86/kvm/mmu/mmutrace.h > > +++ b/arch/x86/kvm/mmu/mmutrace.h > > @@ -58,6 +58,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); > > TRACE_DEFINE_ENUM(RET_PF_RETRY); > > TRACE_DEFINE_ENUM(RET_PF_EMULATE); > > TRACE_DEFINE_ENUM(RET_PF_INVALID); > > +TRACE_DEFINE_ENUM(RET_PF_USER); > > TRACE_DEFINE_ENUM(RET_PF_FIXED); > > TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); > > > > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c > > index 771210ce5181..8ba1a4afc546 100644 > > --- a/arch/x86/kvm/mmu/tdp_mmu.c > > +++ b/arch/x86/kvm/mmu/tdp_mmu.c > > @@ -1768,7 +1768,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, > > continue; > > > > max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, > > - iter.gfn, PG_LEVEL_NUM); > > + iter.gfn, PG_LEVEL_NUM, false); > > if (max_mapping_level < iter.level) > > continue; > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > index 25099c94e770..153842bb33df 100644 > > --- a/include/linux/kvm_host.h > > +++ b/include/linux/kvm_host.h > > @@ -2335,4 +2335,34 @@ static inline void kvm_arch_set_memory_attributes(struct kvm *kvm, > > } > > #endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */ > > > > +#ifdef CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES > > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > > +{ > > + return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)) & > > + KVM_MEMORY_ATTRIBUTE_PRIVATE; > > +} > > +#else > > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > > +{ > > + return false; > > +} > > + > > +#endif /* CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES */ > > + > > +#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM > > +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > > +{ > > + int ret; > > + struct page *page; > > + pgoff_t index = gfn - slot->base_gfn + > > + (slot->restricted_offset >> PAGE_SHIFT); > > + > > + ret = restrictedmem_get_page(slot->restricted_file, index, > > + &page, order); > > + *pfn = page_to_pfn(page); > > + return ret; > > +} > > +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > > + > > #endif > > -- > > 2.25.1 > > > >
On Thu, Dec 08, 2022 at 07:23:46PM +0800, Chao Peng wrote: > On Thu, Dec 08, 2022 at 10:29:18AM +0800, Yuan Yao wrote: > > On Fri, Dec 02, 2022 at 02:13:46PM +0800, Chao Peng wrote: > > > A KVM_MEM_PRIVATE memslot can include both fd-based private memory and > > > hva-based shared memory. Architecture code (like TDX code) can tell > > > whether the on-going fault is private or not. This patch adds a > > > 'is_private' field to kvm_page_fault to indicate this and architecture > > > code is expected to set it. > > > > > > To handle page fault for such memslot, the handling logic is different > > > depending on whether the fault is private or shared. KVM checks if > > > 'is_private' matches the host's view of the page (maintained in > > > mem_attr_array). > > > - For a successful match, private pfn is obtained with > > > restrictedmem_get_page() and shared pfn is obtained with existing > > > get_user_pages(). > > > - For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to > > > userspace. Userspace then can convert memory between private/shared > > > in host's view and retry the fault. > > > > > > Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com> > > > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> > > > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com> > > > --- > > > arch/x86/kvm/mmu/mmu.c | 63 +++++++++++++++++++++++++++++++-- > > > arch/x86/kvm/mmu/mmu_internal.h | 14 +++++++- > > > arch/x86/kvm/mmu/mmutrace.h | 1 + > > > arch/x86/kvm/mmu/tdp_mmu.c | 2 +- > > > include/linux/kvm_host.h | 30 ++++++++++++++++ > > > 5 files changed, 105 insertions(+), 5 deletions(-) > > > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > > index 2190fd8c95c0..b1953ebc012e 100644 > > > --- a/arch/x86/kvm/mmu/mmu.c > > > +++ b/arch/x86/kvm/mmu/mmu.c > > > @@ -3058,7 +3058,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, > > > > > > int kvm_mmu_max_mapping_level(struct kvm *kvm, > > > const struct kvm_memory_slot *slot, gfn_t gfn, > > > - int max_level) > > > + int max_level, bool is_private) > > > { > > > struct kvm_lpage_info *linfo; > > > int host_level; > > > @@ -3070,6 +3070,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, > > > break; > > > } > > > > > > + if (is_private) > > > + return max_level; > > > > lpage mixed information already saved, so is that possible > > to query info->disallow_lpage without care 'is_private' ? > > Actually we already queried info->disallow_lpage just before this > sentence. The check is needed because later in the function we call > host_pfn_mapping_level() which is shared memory specific. You're right. We can't get mapping level info for private page from host_pfn_mapping_level(). > > Thanks, > Chao > > > > > + > > > if (max_level == PG_LEVEL_4K) > > > return PG_LEVEL_4K; > > > > > > @@ -3098,7 +3101,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > > * level, which will be used to do precise, accurate accounting. > > > */ > > > fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, > > > - fault->gfn, fault->max_level); > > > + fault->gfn, fault->max_level, > > > + fault->is_private); > > > if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) > > > return; > > > > > > @@ -4178,6 +4182,49 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) > > > kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); > > > } > > > > > > +static inline u8 order_to_level(int order) > > > +{ > > > + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); > > > + > > > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) > > > + return PG_LEVEL_1G; > > > + > > > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) > > > + return PG_LEVEL_2M; > > > + > > > + return PG_LEVEL_4K; > > > +} > > > + > > > +static int kvm_do_memory_fault_exit(struct kvm_vcpu *vcpu, > > > + struct kvm_page_fault *fault) > > > +{ > > > + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; > > > + if (fault->is_private) > > > + vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE; > > > + else > > > + vcpu->run->memory.flags = 0; > > > + vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT; > > > + vcpu->run->memory.size = PAGE_SIZE; > > > + return RET_PF_USER; > > > +} > > > + > > > +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, > > > + struct kvm_page_fault *fault) > > > +{ > > > + int order; > > > + struct kvm_memory_slot *slot = fault->slot; > > > + > > > + if (!kvm_slot_can_be_private(slot)) > > > + return kvm_do_memory_fault_exit(vcpu, fault); > > > + > > > + if (kvm_restricted_mem_get_pfn(slot, fault->gfn, &fault->pfn, &order)) > > > + return RET_PF_RETRY; > > > + > > > + fault->max_level = min(order_to_level(order), fault->max_level); > > > + fault->map_writable = !(slot->flags & KVM_MEM_READONLY); > > > + return RET_PF_CONTINUE; > > > +} > > > + > > > static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > > > { > > > struct kvm_memory_slot *slot = fault->slot; > > > @@ -4210,6 +4257,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > > > return RET_PF_EMULATE; > > > } > > > > > > + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) > > > + return kvm_do_memory_fault_exit(vcpu, fault); > > > + > > > + if (fault->is_private) > > > + return kvm_faultin_pfn_private(vcpu, fault); > > > + > > > async = false; > > > fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, > > > fault->write, &fault->map_writable, > > > @@ -5599,6 +5652,9 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err > > > return -EIO; > > > } > > > > > > + if (r == RET_PF_USER) > > > + return 0; > > > + > > > if (r < 0) > > > return r; > > > if (r != RET_PF_EMULATE) > > > @@ -6452,7 +6508,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, > > > */ > > > if (sp->role.direct && > > > sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, > > > - PG_LEVEL_NUM)) { > > > + PG_LEVEL_NUM, > > > + false)) { > > > kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); > > > > > > if (kvm_available_flush_tlb_with_range()) > > > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h > > > index dbaf6755c5a7..5ccf08183b00 100644 > > > --- a/arch/x86/kvm/mmu/mmu_internal.h > > > +++ b/arch/x86/kvm/mmu/mmu_internal.h > > > @@ -189,6 +189,7 @@ struct kvm_page_fault { > > > > > > /* Derived from mmu and global state. */ > > > const bool is_tdp; > > > + const bool is_private; > > > const bool nx_huge_page_workaround_enabled; > > > > > > /* > > > @@ -237,6 +238,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); > > > * RET_PF_RETRY: let CPU fault again on the address. > > > * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. > > > * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. > > > + * RET_PF_USER: need to exit to userspace to handle this fault. > > > * RET_PF_FIXED: The faulting entry has been fixed. > > > * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. > > > * > > > @@ -253,6 +255,7 @@ enum { > > > RET_PF_RETRY, > > > RET_PF_EMULATE, > > > RET_PF_INVALID, > > > + RET_PF_USER, > > > RET_PF_FIXED, > > > RET_PF_SPURIOUS, > > > }; > > > @@ -310,7 +313,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, > > > > > > int kvm_mmu_max_mapping_level(struct kvm *kvm, > > > const struct kvm_memory_slot *slot, gfn_t gfn, > > > - int max_level); > > > + int max_level, bool is_private); > > > void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); > > > void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); > > > > > > @@ -319,4 +322,13 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); > > > void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); > > > void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); > > > > > > +#ifndef CONFIG_HAVE_KVM_RESTRICTED_MEM > > > +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > > > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > > > +{ > > > + WARN_ON_ONCE(1); > > > + return -EOPNOTSUPP; > > > +} > > > +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > > > + > > > #endif /* __KVM_X86_MMU_INTERNAL_H */ > > > diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h > > > index ae86820cef69..2d7555381955 100644 > > > --- a/arch/x86/kvm/mmu/mmutrace.h > > > +++ b/arch/x86/kvm/mmu/mmutrace.h > > > @@ -58,6 +58,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); > > > TRACE_DEFINE_ENUM(RET_PF_RETRY); > > > TRACE_DEFINE_ENUM(RET_PF_EMULATE); > > > TRACE_DEFINE_ENUM(RET_PF_INVALID); > > > +TRACE_DEFINE_ENUM(RET_PF_USER); > > > TRACE_DEFINE_ENUM(RET_PF_FIXED); > > > TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); > > > > > > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c > > > index 771210ce5181..8ba1a4afc546 100644 > > > --- a/arch/x86/kvm/mmu/tdp_mmu.c > > > +++ b/arch/x86/kvm/mmu/tdp_mmu.c > > > @@ -1768,7 +1768,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, > > > continue; > > > > > > max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, > > > - iter.gfn, PG_LEVEL_NUM); > > > + iter.gfn, PG_LEVEL_NUM, false); > > > if (max_mapping_level < iter.level) > > > continue; > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > > index 25099c94e770..153842bb33df 100644 > > > --- a/include/linux/kvm_host.h > > > +++ b/include/linux/kvm_host.h > > > @@ -2335,4 +2335,34 @@ static inline void kvm_arch_set_memory_attributes(struct kvm *kvm, > > > } > > > #endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */ > > > > > > +#ifdef CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES > > > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > > > +{ > > > + return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)) & > > > + KVM_MEMORY_ATTRIBUTE_PRIVATE; > > > +} > > > +#else > > > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > > > +{ > > > + return false; > > > +} > > > + > > > +#endif /* CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES */ > > > + > > > +#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM > > > +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > > > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > > > +{ > > > + int ret; > > > + struct page *page; > > > + pgoff_t index = gfn - slot->base_gfn + > > > + (slot->restricted_offset >> PAGE_SHIFT); > > > + > > > + ret = restrictedmem_get_page(slot->restricted_file, index, > > > + &page, order); > > > + *pfn = page_to_pfn(page); > > > + return ret; > > > +} > > > +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > > > + > > > #endif > > > -- > > > 2.25.1 > > > > > >
Hi, On Fri, Dec 2, 2022 at 6:19 AM Chao Peng <chao.p.peng@linux.intel.com> wrote: > > A KVM_MEM_PRIVATE memslot can include both fd-based private memory and > hva-based shared memory. Architecture code (like TDX code) can tell > whether the on-going fault is private or not. This patch adds a > 'is_private' field to kvm_page_fault to indicate this and architecture > code is expected to set it. > > To handle page fault for such memslot, the handling logic is different > depending on whether the fault is private or shared. KVM checks if > 'is_private' matches the host's view of the page (maintained in > mem_attr_array). > - For a successful match, private pfn is obtained with > restrictedmem_get_page() and shared pfn is obtained with existing > get_user_pages(). > - For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to > userspace. Userspace then can convert memory between private/shared > in host's view and retry the fault. > > Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com> > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com> > --- > arch/x86/kvm/mmu/mmu.c | 63 +++++++++++++++++++++++++++++++-- > arch/x86/kvm/mmu/mmu_internal.h | 14 +++++++- > arch/x86/kvm/mmu/mmutrace.h | 1 + > arch/x86/kvm/mmu/tdp_mmu.c | 2 +- > include/linux/kvm_host.h | 30 ++++++++++++++++ > 5 files changed, 105 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 2190fd8c95c0..b1953ebc012e 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -3058,7 +3058,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, > > int kvm_mmu_max_mapping_level(struct kvm *kvm, > const struct kvm_memory_slot *slot, gfn_t gfn, > - int max_level) > + int max_level, bool is_private) > { > struct kvm_lpage_info *linfo; > int host_level; > @@ -3070,6 +3070,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, > break; > } > > + if (is_private) > + return max_level; > + > if (max_level == PG_LEVEL_4K) > return PG_LEVEL_4K; > > @@ -3098,7 +3101,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > * level, which will be used to do precise, accurate accounting. > */ > fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, > - fault->gfn, fault->max_level); > + fault->gfn, fault->max_level, > + fault->is_private); > if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) > return; > > @@ -4178,6 +4182,49 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) > kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); > } > > +static inline u8 order_to_level(int order) > +{ > + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); > + > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) > + return PG_LEVEL_1G; > + > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) > + return PG_LEVEL_2M; > + > + return PG_LEVEL_4K; > +} > + > +static int kvm_do_memory_fault_exit(struct kvm_vcpu *vcpu, > + struct kvm_page_fault *fault) > +{ > + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; > + if (fault->is_private) > + vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE; > + else > + vcpu->run->memory.flags = 0; > + vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT; nit: As in previous patches, use helpers (for this and other similar shifts in this patch)? > + vcpu->run->memory.size = PAGE_SIZE; > + return RET_PF_USER; > +} > + > +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, > + struct kvm_page_fault *fault) > +{ > + int order; > + struct kvm_memory_slot *slot = fault->slot; > + > + if (!kvm_slot_can_be_private(slot)) > + return kvm_do_memory_fault_exit(vcpu, fault); > + > + if (kvm_restricted_mem_get_pfn(slot, fault->gfn, &fault->pfn, &order)) > + return RET_PF_RETRY; > + > + fault->max_level = min(order_to_level(order), fault->max_level); > + fault->map_writable = !(slot->flags & KVM_MEM_READONLY); > + return RET_PF_CONTINUE; > +} > + > static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > { > struct kvm_memory_slot *slot = fault->slot; > @@ -4210,6 +4257,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > return RET_PF_EMULATE; > } > > + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) > + return kvm_do_memory_fault_exit(vcpu, fault); > + > + if (fault->is_private) > + return kvm_faultin_pfn_private(vcpu, fault); > + > async = false; > fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, > fault->write, &fault->map_writable, > @@ -5599,6 +5652,9 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err > return -EIO; > } > > + if (r == RET_PF_USER) > + return 0; > + > if (r < 0) > return r; > if (r != RET_PF_EMULATE) > @@ -6452,7 +6508,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, > */ > if (sp->role.direct && > sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, > - PG_LEVEL_NUM)) { > + PG_LEVEL_NUM, > + false)) { > kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); > > if (kvm_available_flush_tlb_with_range()) > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h > index dbaf6755c5a7..5ccf08183b00 100644 > --- a/arch/x86/kvm/mmu/mmu_internal.h > +++ b/arch/x86/kvm/mmu/mmu_internal.h > @@ -189,6 +189,7 @@ struct kvm_page_fault { > > /* Derived from mmu and global state. */ > const bool is_tdp; > + const bool is_private; > const bool nx_huge_page_workaround_enabled; > > /* > @@ -237,6 +238,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); > * RET_PF_RETRY: let CPU fault again on the address. > * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. > * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. > + * RET_PF_USER: need to exit to userspace to handle this fault. > * RET_PF_FIXED: The faulting entry has been fixed. > * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. > * > @@ -253,6 +255,7 @@ enum { > RET_PF_RETRY, > RET_PF_EMULATE, > RET_PF_INVALID, > + RET_PF_USER, > RET_PF_FIXED, > RET_PF_SPURIOUS, > }; > @@ -310,7 +313,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, > > int kvm_mmu_max_mapping_level(struct kvm *kvm, > const struct kvm_memory_slot *slot, gfn_t gfn, > - int max_level); > + int max_level, bool is_private); > void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); > void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); > > @@ -319,4 +322,13 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); > void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); > void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); > > +#ifndef CONFIG_HAVE_KVM_RESTRICTED_MEM > +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > +{ > + WARN_ON_ONCE(1); > + return -EOPNOTSUPP; > +} > +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > + > #endif /* __KVM_X86_MMU_INTERNAL_H */ > diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h > index ae86820cef69..2d7555381955 100644 > --- a/arch/x86/kvm/mmu/mmutrace.h > +++ b/arch/x86/kvm/mmu/mmutrace.h > @@ -58,6 +58,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); > TRACE_DEFINE_ENUM(RET_PF_RETRY); > TRACE_DEFINE_ENUM(RET_PF_EMULATE); > TRACE_DEFINE_ENUM(RET_PF_INVALID); > +TRACE_DEFINE_ENUM(RET_PF_USER); > TRACE_DEFINE_ENUM(RET_PF_FIXED); > TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); > > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c > index 771210ce5181..8ba1a4afc546 100644 > --- a/arch/x86/kvm/mmu/tdp_mmu.c > +++ b/arch/x86/kvm/mmu/tdp_mmu.c > @@ -1768,7 +1768,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, > continue; > > max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, > - iter.gfn, PG_LEVEL_NUM); > + iter.gfn, PG_LEVEL_NUM, false); > if (max_mapping_level < iter.level) > continue; > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 25099c94e770..153842bb33df 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -2335,4 +2335,34 @@ static inline void kvm_arch_set_memory_attributes(struct kvm *kvm, > } > #endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */ > > +#ifdef CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > +{ > + return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)) & > + KVM_MEMORY_ATTRIBUTE_PRIVATE; > +} > +#else > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > +{ > + return false; > +} > + > +#endif /* CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES */ > + > +#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM > +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > +{ > + int ret; > + struct page *page; > + pgoff_t index = gfn - slot->base_gfn + > + (slot->restricted_offset >> PAGE_SHIFT); > + > + ret = restrictedmem_get_page(slot->restricted_file, index, > + &page, order); > + *pfn = page_to_pfn(page); > + return ret; > +} > +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > + > #endif > -- > 2.25.1 > With my limited understanding of x86 code: Reviewed-by: Fuad Tabba <tabba@google.com> The common code in kvm_host.h was used in the port to arm64, and the x86 fault handling code was used as a guide to how it should be done in pKVM (with similar code added there). So with these caveats in mind: Tested-by: Fuad Tabba <tabba@google.com> Cheers, /fuad
On Fri, Dec 09, 2022 at 09:01:04AM +0000, Fuad Tabba wrote: > Hi, > > On Fri, Dec 2, 2022 at 6:19 AM Chao Peng <chao.p.peng@linux.intel.com> wrote: > > > > A KVM_MEM_PRIVATE memslot can include both fd-based private memory and > > hva-based shared memory. Architecture code (like TDX code) can tell > > whether the on-going fault is private or not. This patch adds a > > 'is_private' field to kvm_page_fault to indicate this and architecture > > code is expected to set it. > > > > To handle page fault for such memslot, the handling logic is different > > depending on whether the fault is private or shared. KVM checks if > > 'is_private' matches the host's view of the page (maintained in > > mem_attr_array). > > - For a successful match, private pfn is obtained with > > restrictedmem_get_page() and shared pfn is obtained with existing > > get_user_pages(). > > - For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to > > userspace. Userspace then can convert memory between private/shared > > in host's view and retry the fault. > > > > Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com> > > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> > > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com> > > --- > > arch/x86/kvm/mmu/mmu.c | 63 +++++++++++++++++++++++++++++++-- > > arch/x86/kvm/mmu/mmu_internal.h | 14 +++++++- > > arch/x86/kvm/mmu/mmutrace.h | 1 + > > arch/x86/kvm/mmu/tdp_mmu.c | 2 +- > > include/linux/kvm_host.h | 30 ++++++++++++++++ > > 5 files changed, 105 insertions(+), 5 deletions(-) > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > index 2190fd8c95c0..b1953ebc012e 100644 > > --- a/arch/x86/kvm/mmu/mmu.c > > +++ b/arch/x86/kvm/mmu/mmu.c > > @@ -3058,7 +3058,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, > > > > int kvm_mmu_max_mapping_level(struct kvm *kvm, > > const struct kvm_memory_slot *slot, gfn_t gfn, > > - int max_level) > > + int max_level, bool is_private) > > { > > struct kvm_lpage_info *linfo; > > int host_level; > > @@ -3070,6 +3070,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, > > break; > > } > > > > + if (is_private) > > + return max_level; > > + > > if (max_level == PG_LEVEL_4K) > > return PG_LEVEL_4K; > > > > @@ -3098,7 +3101,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > * level, which will be used to do precise, accurate accounting. > > */ > > fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, > > - fault->gfn, fault->max_level); > > + fault->gfn, fault->max_level, > > + fault->is_private); > > if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) > > return; > > > > @@ -4178,6 +4182,49 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) > > kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); > > } > > > > +static inline u8 order_to_level(int order) > > +{ > > + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); > > + > > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) > > + return PG_LEVEL_1G; > > + > > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) > > + return PG_LEVEL_2M; > > + > > + return PG_LEVEL_4K; > > +} > > + > > +static int kvm_do_memory_fault_exit(struct kvm_vcpu *vcpu, > > + struct kvm_page_fault *fault) > > +{ > > + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; > > + if (fault->is_private) > > + vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE; > > + else > > + vcpu->run->memory.flags = 0; > > + vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT; > > nit: As in previous patches, use helpers (for this and other similar > shifts in this patch)? Agreed. > > > + vcpu->run->memory.size = PAGE_SIZE; > > + return RET_PF_USER; > > +} > > + > > +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, > > + struct kvm_page_fault *fault) > > +{ > > + int order; > > + struct kvm_memory_slot *slot = fault->slot; > > + > > + if (!kvm_slot_can_be_private(slot)) > > + return kvm_do_memory_fault_exit(vcpu, fault); > > + > > + if (kvm_restricted_mem_get_pfn(slot, fault->gfn, &fault->pfn, &order)) > > + return RET_PF_RETRY; > > + > > + fault->max_level = min(order_to_level(order), fault->max_level); > > + fault->map_writable = !(slot->flags & KVM_MEM_READONLY); > > + return RET_PF_CONTINUE; > > +} > > + > > static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > > { > > struct kvm_memory_slot *slot = fault->slot; > > @@ -4210,6 +4257,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > > return RET_PF_EMULATE; > > } > > > > + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) > > + return kvm_do_memory_fault_exit(vcpu, fault); > > + > > + if (fault->is_private) > > + return kvm_faultin_pfn_private(vcpu, fault); > > + > > async = false; > > fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, > > fault->write, &fault->map_writable, > > @@ -5599,6 +5652,9 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err > > return -EIO; > > } > > > > + if (r == RET_PF_USER) > > + return 0; > > + > > if (r < 0) > > return r; > > if (r != RET_PF_EMULATE) > > @@ -6452,7 +6508,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, > > */ > > if (sp->role.direct && > > sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, > > - PG_LEVEL_NUM)) { > > + PG_LEVEL_NUM, > > + false)) { > > kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); > > > > if (kvm_available_flush_tlb_with_range()) > > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h > > index dbaf6755c5a7..5ccf08183b00 100644 > > --- a/arch/x86/kvm/mmu/mmu_internal.h > > +++ b/arch/x86/kvm/mmu/mmu_internal.h > > @@ -189,6 +189,7 @@ struct kvm_page_fault { > > > > /* Derived from mmu and global state. */ > > const bool is_tdp; > > + const bool is_private; > > const bool nx_huge_page_workaround_enabled; > > > > /* > > @@ -237,6 +238,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); > > * RET_PF_RETRY: let CPU fault again on the address. > > * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. > > * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. > > + * RET_PF_USER: need to exit to userspace to handle this fault. > > * RET_PF_FIXED: The faulting entry has been fixed. > > * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. > > * > > @@ -253,6 +255,7 @@ enum { > > RET_PF_RETRY, > > RET_PF_EMULATE, > > RET_PF_INVALID, > > + RET_PF_USER, > > RET_PF_FIXED, > > RET_PF_SPURIOUS, > > }; > > @@ -310,7 +313,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, > > > > int kvm_mmu_max_mapping_level(struct kvm *kvm, > > const struct kvm_memory_slot *slot, gfn_t gfn, > > - int max_level); > > + int max_level, bool is_private); > > void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); > > void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); > > > > @@ -319,4 +322,13 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); > > void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); > > void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); > > > > +#ifndef CONFIG_HAVE_KVM_RESTRICTED_MEM > > +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > > +{ > > + WARN_ON_ONCE(1); > > + return -EOPNOTSUPP; > > +} > > +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > > + > > #endif /* __KVM_X86_MMU_INTERNAL_H */ > > diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h > > index ae86820cef69..2d7555381955 100644 > > --- a/arch/x86/kvm/mmu/mmutrace.h > > +++ b/arch/x86/kvm/mmu/mmutrace.h > > @@ -58,6 +58,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); > > TRACE_DEFINE_ENUM(RET_PF_RETRY); > > TRACE_DEFINE_ENUM(RET_PF_EMULATE); > > TRACE_DEFINE_ENUM(RET_PF_INVALID); > > +TRACE_DEFINE_ENUM(RET_PF_USER); > > TRACE_DEFINE_ENUM(RET_PF_FIXED); > > TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); > > > > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c > > index 771210ce5181..8ba1a4afc546 100644 > > --- a/arch/x86/kvm/mmu/tdp_mmu.c > > +++ b/arch/x86/kvm/mmu/tdp_mmu.c > > @@ -1768,7 +1768,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, > > continue; > > > > max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, > > - iter.gfn, PG_LEVEL_NUM); > > + iter.gfn, PG_LEVEL_NUM, false); > > if (max_mapping_level < iter.level) > > continue; > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > index 25099c94e770..153842bb33df 100644 > > --- a/include/linux/kvm_host.h > > +++ b/include/linux/kvm_host.h > > @@ -2335,4 +2335,34 @@ static inline void kvm_arch_set_memory_attributes(struct kvm *kvm, > > } > > #endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */ > > > > +#ifdef CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES > > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > > +{ > > + return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)) & > > + KVM_MEMORY_ATTRIBUTE_PRIVATE; > > +} > > +#else > > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > > +{ > > + return false; > > +} > > + > > +#endif /* CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES */ > > + > > +#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM > > +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > > +{ > > + int ret; > > + struct page *page; > > + pgoff_t index = gfn - slot->base_gfn + > > + (slot->restricted_offset >> PAGE_SHIFT); > > + > > + ret = restrictedmem_get_page(slot->restricted_file, index, > > + &page, order); > > + *pfn = page_to_pfn(page); > > + return ret; > > +} > > +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > > + > > #endif > > -- > > 2.25.1 > > > > With my limited understanding of x86 code: > Reviewed-by: Fuad Tabba <tabba@google.com> > > The common code in kvm_host.h was used in the port to arm64, and the > x86 fault handling code was used as a guide to how it should be done > in pKVM (with similar code added there). So with these caveats in > mind: > Tested-by: Fuad Tabba <tabba@google.com> > > Cheers, > /fuad
On Fri, Dec 02, 2022, Chao Peng wrote: > @@ -5599,6 +5652,9 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err > return -EIO; > } > > + if (r == RET_PF_USER) > + return 0; > + > if (r < 0) > return r; > if (r != RET_PF_EMULATE) > @@ -6452,7 +6508,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, > */ > if (sp->role.direct && > sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, > - PG_LEVEL_NUM)) { > + PG_LEVEL_NUM, > + false)) { Passing %false is incorrect. It might not cause problems because KVM currently doesn't allowing modifying private memslots (that likely needs to change to allow dirty logging), but it's wrong since nothing guarantees KVM is operating on SPTEs for shared memory. One option would be to take the patches from the TDX series that add a "private" flag to the shadow page role, but I'd rather not add the role until it's truly necessary. For now, I think we can do this without impacting performance of guests that don't support private memory. int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, int max_level) { bool is_private = kvm_slot_can_be_private(slot) && kvm_mem_is_private(kvm, gfn); return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private); } > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 25099c94e770..153842bb33df 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -2335,4 +2335,34 @@ static inline void kvm_arch_set_memory_attributes(struct kvm *kvm, > } > #endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */ > > +#ifdef CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > +{ This code, i.e. the generic KVM changes, belongs in a separate patch. It'll be small, but I want to separate x86's page fault changes from the restrictedmem support adding to common KVM. This should also short-circuit based on CONFIG_HAVE_KVM_RESTRICTED_MEM, though I would name that CONFIG_KVM_PRIVATE_MEMORY since in KVM's world, it's all about private vs. shared at this time. > + return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)) & > + KVM_MEMORY_ATTRIBUTE_PRIVATE; > +} > +#else > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > +{ > + return false; > +} > + > +#endif /* CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES */ > + > +#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM > +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > +{ > + int ret; > + struct page *page; > + pgoff_t index = gfn - slot->base_gfn + > + (slot->restricted_offset >> PAGE_SHIFT); > + > + ret = restrictedmem_get_page(slot->restricted_file, index, > + &page, order); This needs handle errors. If "ret" is non-zero, "page" is garbage. > + *pfn = page_to_pfn(page); > + return ret; > +} > +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > + > #endif > -- > 2.25.1 >
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2190fd8c95c0..b1953ebc012e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3058,7 +3058,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level) + int max_level, bool is_private) { struct kvm_lpage_info *linfo; int host_level; @@ -3070,6 +3070,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, break; } + if (is_private) + return max_level; + if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; @@ -3098,7 +3101,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * level, which will be used to do precise, accurate accounting. */ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->max_level); + fault->gfn, fault->max_level, + fault->is_private); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; @@ -4178,6 +4182,49 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); } +static inline u8 order_to_level(int order) +{ + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) + return PG_LEVEL_1G; + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static int kvm_do_memory_fault_exit(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; + if (fault->is_private) + vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE; + else + vcpu->run->memory.flags = 0; + vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT; + vcpu->run->memory.size = PAGE_SIZE; + return RET_PF_USER; +} + +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + int order; + struct kvm_memory_slot *slot = fault->slot; + + if (!kvm_slot_can_be_private(slot)) + return kvm_do_memory_fault_exit(vcpu, fault); + + if (kvm_restricted_mem_get_pfn(slot, fault->gfn, &fault->pfn, &order)) + return RET_PF_RETRY; + + fault->max_level = min(order_to_level(order), fault->max_level); + fault->map_writable = !(slot->flags & KVM_MEM_READONLY); + return RET_PF_CONTINUE; +} + static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; @@ -4210,6 +4257,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return RET_PF_EMULATE; } + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) + return kvm_do_memory_fault_exit(vcpu, fault); + + if (fault->is_private) + return kvm_faultin_pfn_private(vcpu, fault); + async = false; fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, @@ -5599,6 +5652,9 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err return -EIO; } + if (r == RET_PF_USER) + return 0; + if (r < 0) return r; if (r != RET_PF_EMULATE) @@ -6452,7 +6508,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, */ if (sp->role.direct && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - PG_LEVEL_NUM)) { + PG_LEVEL_NUM, + false)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index dbaf6755c5a7..5ccf08183b00 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -189,6 +189,7 @@ struct kvm_page_fault { /* Derived from mmu and global state. */ const bool is_tdp; + const bool is_private; const bool nx_huge_page_workaround_enabled; /* @@ -237,6 +238,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); * RET_PF_RETRY: let CPU fault again on the address. * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + * RET_PF_USER: need to exit to userspace to handle this fault. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. * @@ -253,6 +255,7 @@ enum { RET_PF_RETRY, RET_PF_EMULATE, RET_PF_INVALID, + RET_PF_USER, RET_PF_FIXED, RET_PF_SPURIOUS, }; @@ -310,7 +313,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level); + int max_level, bool is_private); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); @@ -319,4 +322,13 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); +#ifndef CONFIG_HAVE_KVM_RESTRICTED_MEM +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *order) +{ + WARN_ON_ONCE(1); + return -EOPNOTSUPP; +} +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ + #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ae86820cef69..2d7555381955 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -58,6 +58,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); TRACE_DEFINE_ENUM(RET_PF_RETRY); TRACE_DEFINE_ENUM(RET_PF_EMULATE); TRACE_DEFINE_ENUM(RET_PF_INVALID); +TRACE_DEFINE_ENUM(RET_PF_USER); TRACE_DEFINE_ENUM(RET_PF_FIXED); TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 771210ce5181..8ba1a4afc546 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1768,7 +1768,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, continue; max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, PG_LEVEL_NUM); + iter.gfn, PG_LEVEL_NUM, false); if (max_mapping_level < iter.level) continue; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 25099c94e770..153842bb33df 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2335,4 +2335,34 @@ static inline void kvm_arch_set_memory_attributes(struct kvm *kvm, } #endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */ +#ifdef CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) +{ + return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)) & + KVM_MEMORY_ATTRIBUTE_PRIVATE; +} +#else +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) +{ + return false; +} + +#endif /* CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES */ + +#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM +static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *order) +{ + int ret; + struct page *page; + pgoff_t index = gfn - slot->base_gfn + + (slot->restricted_offset >> PAGE_SHIFT); + + ret = restrictedmem_get_page(slot->restricted_file, index, + &page, order); + *pfn = page_to_pfn(page); + return ret; +} +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ + #endif