diff mbox series

[v8,7/8] KVM: Handle page fault for private memory

Message ID 20220915142913.2213336-8-chao.p.peng@linux.intel.com (mailing list archive)
State New
Headers show
Series KVM: mm: fd-based approach for supporting KVM | expand

Commit Message

Chao Peng Sept. 15, 2022, 2:29 p.m. UTC
A memslot with KVM_MEM_PRIVATE being set can include both fd-based
private memory and hva-based shared memory. Architecture code (like TDX
code) can tell whether the on-going fault is private or not. This patch
adds a 'is_private' field to kvm_page_fault to indicate this and
architecture code is expected to set it.

To handle page fault for such memslot, the handling logic is different
depending on whether the fault is private or shared. KVM checks if
'is_private' matches the host's view of the page (this is maintained in
mem_attr_array).
  - For a successful match, private pfn is obtained with
    inaccessible_get_pfn() from private fd and shared pfn is obtained
    with existing get_user_pages().
  - For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to
    userspace. Userspace then can convert memory between private/shared
    in host's view and then retry the access.

Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 arch/x86/kvm/mmu/mmu.c          | 54 ++++++++++++++++++++++++++++++++-
 arch/x86/kvm/mmu/mmu_internal.h | 18 +++++++++++
 arch/x86/kvm/mmu/mmutrace.h     |  1 +
 include/linux/kvm_host.h        | 24 +++++++++++++++
 4 files changed, 96 insertions(+), 1 deletion(-)

Comments

Sean Christopherson Oct. 14, 2022, 6:57 p.m. UTC | #1
On Thu, Sep 15, 2022, Chao Peng wrote:
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index a0f198cede3d..81ab20003824 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3028,6 +3028,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
>  			break;
>  	}
>  
> +	if (kvm_mem_is_private(kvm, gfn))

Rather than reload the Xarray info, which is unnecessary overhead, pass in
@is_private.  The caller must hold mmu_lock, i.e. invalidations from
private<->shared conversions will be stalled and will zap the new SPTE if the
state is changed.

E.g.

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d68944f07b4b..44eea47697d8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3072,8 +3072,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
         * Enforce the iTLB multihit workaround after capturing the requested
         * level, which will be used to do precise, accurate accounting.
         */
-       fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
-                                                    fault->gfn, fault->max_level);
+       fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, fault->gfn,
+                                                    fault->max_level, fault->is_private);
        if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
                return;
 
@@ -6460,7 +6460,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
                 */
                if (sp->role.direct &&
                    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
-                                                              PG_LEVEL_NUM)) {
+                                                              PG_LEVEL_NUM, false)) {
                        kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
 
                        if (kvm_available_flush_tlb_with_range())
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 7670c13ce251..9acdf72537ce 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -315,6 +315,12 @@ static inline bool is_dirty_spte(u64 spte)
        return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
 }
 
+static inline bool is_private_spte(u64 spte)
+{
+       /* FIXME: Query C-bit/S-bit for SEV/TDX. */
+       return false;
+}
+
 static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
                                int level)
 {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 672f0432d777..69ba00157e90 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1767,8 +1767,9 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
                if (iter.gfn < start || iter.gfn >= end)
                        continue;
 
-               max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
-                                                             iter.gfn, PG_LEVEL_NUM);
+               max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
+                                                             PG_LEVEL_NUM,
+                                                             is_private_spte(iter.old_spte));
                if (max_mapping_level < iter.level)
                        continue;
Chao Peng Oct. 17, 2022, 2:48 p.m. UTC | #2
On Fri, Oct 14, 2022 at 06:57:20PM +0000, Sean Christopherson wrote:
> On Thu, Sep 15, 2022, Chao Peng wrote:
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index a0f198cede3d..81ab20003824 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -3028,6 +3028,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> >  			break;
> >  	}
> >  
> > +	if (kvm_mem_is_private(kvm, gfn))
> 
> Rather than reload the Xarray info, which is unnecessary overhead, pass in
> @is_private.  The caller must hold mmu_lock, i.e. invalidations from
> private<->shared conversions will be stalled and will zap the new SPTE if the
> state is changed.

Make sense. TDX/SEV should be easy to query that.

Chao
> 
> E.g.
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index d68944f07b4b..44eea47697d8 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3072,8 +3072,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>          * Enforce the iTLB multihit workaround after capturing the requested
>          * level, which will be used to do precise, accurate accounting.
>          */
> -       fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
> -                                                    fault->gfn, fault->max_level);
> +       fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, fault->gfn,
> +                                                    fault->max_level, fault->is_private);
>         if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
>                 return;
>  
> @@ -6460,7 +6460,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
>                  */
>                 if (sp->role.direct &&
>                     sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
> -                                                              PG_LEVEL_NUM)) {
> +                                                              PG_LEVEL_NUM, false)) {
>                         kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
>  
>                         if (kvm_available_flush_tlb_with_range())
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index 7670c13ce251..9acdf72537ce 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -315,6 +315,12 @@ static inline bool is_dirty_spte(u64 spte)
>         return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
>  }
>  
> +static inline bool is_private_spte(u64 spte)
> +{
> +       /* FIXME: Query C-bit/S-bit for SEV/TDX. */
> +       return false;
> +}
> +
>  static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
>                                 int level)
>  {
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 672f0432d777..69ba00157e90 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1767,8 +1767,9 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>                 if (iter.gfn < start || iter.gfn >= end)
>                         continue;
>  
> -               max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
> -                                                             iter.gfn, PG_LEVEL_NUM);
> +               max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
> +                                                             PG_LEVEL_NUM,
> +                                                             is_private_spte(iter.old_spte));
>                 if (max_mapping_level < iter.level)
>                         continue;
>
diff mbox series

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a0f198cede3d..81ab20003824 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3028,6 +3028,9 @@  int kvm_mmu_max_mapping_level(struct kvm *kvm,
 			break;
 	}
 
+	if (kvm_mem_is_private(kvm, gfn))
+		return max_level;
+
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
 
@@ -4127,6 +4130,32 @@  void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
 }
 
+static inline u8 order_to_level(int order)
+{
+	BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
+
+	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
+		return PG_LEVEL_1G;
+
+	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+		return PG_LEVEL_2M;
+
+	return PG_LEVEL_4K;
+}
+
+static int kvm_faultin_pfn_private(struct kvm_page_fault *fault)
+{
+	int order;
+	struct kvm_memory_slot *slot = fault->slot;
+
+	if (kvm_private_mem_get_pfn(slot, fault->gfn, &fault->pfn, &order))
+		return RET_PF_RETRY;
+
+	fault->max_level = min(order_to_level(order), fault->max_level);
+	fault->map_writable = !(slot->flags & KVM_MEM_READONLY);
+	return RET_PF_CONTINUE;
+}
+
 static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_memory_slot *slot = fault->slot;
@@ -4159,6 +4188,22 @@  static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 			return RET_PF_EMULATE;
 	}
 
+	if (kvm_slot_can_be_private(slot) &&
+	    fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+		vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
+		if (fault->is_private)
+			vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE;
+		else
+			vcpu->run->memory.flags = 0;
+		vcpu->run->memory.padding = 0;
+		vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT;
+		vcpu->run->memory.size = PAGE_SIZE;
+		return RET_PF_USER;
+	}
+
+	if (fault->is_private)
+		return kvm_faultin_pfn_private(fault);
+
 	async = false;
 	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
 					  fault->write, &fault->map_writable,
@@ -4267,7 +4312,11 @@  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		read_unlock(&vcpu->kvm->mmu_lock);
 	else
 		write_unlock(&vcpu->kvm->mmu_lock);
-	kvm_release_pfn_clean(fault->pfn);
+
+	if (fault->is_private)
+		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
+	else
+		kvm_release_pfn_clean(fault->pfn);
 	return r;
 }
 
@@ -5543,6 +5592,9 @@  int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
 			return -EIO;
 	}
 
+	if (r == RET_PF_USER)
+		return 0;
+
 	if (r < 0)
 		return r;
 	if (r != RET_PF_EMULATE)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 582def531d4d..a55e352246a7 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -188,6 +188,7 @@  struct kvm_page_fault {
 
 	/* Derived from mmu and global state.  */
 	const bool is_tdp;
+	const bool is_private;
 	const bool nx_huge_page_workaround_enabled;
 
 	/*
@@ -236,6 +237,7 @@  int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
  * RET_PF_RETRY: let CPU fault again on the address.
  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ * RET_PF_USER: need to exit to userspace to handle this fault.
  * RET_PF_FIXED: The faulting entry has been fixed.
  * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
  *
@@ -252,6 +254,7 @@  enum {
 	RET_PF_RETRY,
 	RET_PF_EMULATE,
 	RET_PF_INVALID,
+	RET_PF_USER,
 	RET_PF_FIXED,
 	RET_PF_SPURIOUS,
 };
@@ -318,4 +321,19 @@  void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
+#ifndef CONFIG_HAVE_KVM_PRIVATE_MEM
+static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot,
+					  gfn_t gfn, kvm_pfn_t *pfn, int *order)
+{
+	WARN_ON_ONCE(1);
+	return -EOPNOTSUPP;
+}
+
+static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot,
+					   kvm_pfn_t pfn)
+{
+	WARN_ON_ONCE(1);
+}
+#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index ae86820cef69..2d7555381955 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -58,6 +58,7 @@  TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
 TRACE_DEFINE_ENUM(RET_PF_RETRY);
 TRACE_DEFINE_ENUM(RET_PF_EMULATE);
 TRACE_DEFINE_ENUM(RET_PF_INVALID);
+TRACE_DEFINE_ENUM(RET_PF_USER);
 TRACE_DEFINE_ENUM(RET_PF_FIXED);
 TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fd36ce6597ad..b9906cdf468b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2292,6 +2292,30 @@  static inline void kvm_arch_update_mem_attr(struct kvm *kvm, unsigned int attr,
 }
 #endif
 
+static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot,
+					  gfn_t gfn, kvm_pfn_t *pfn, int *order)
+{
+	int ret;
+	pfn_t pfnt;
+	pgoff_t index = gfn - slot->base_gfn +
+			(slot->private_offset >> PAGE_SHIFT);
+
+	ret = inaccessible_get_pfn(slot->private_file, index, &pfnt, order);
+	*pfn = pfn_t_to_pfn(pfnt);
+	return ret;
+}
+
+static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot,
+					   kvm_pfn_t pfn)
+{
+	inaccessible_put_pfn(slot->private_file, pfn_to_pfn_t(pfn));
+}
+
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+	return !xa_load(&kvm->mem_attr_array, gfn);
+}
+
 #endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
 
 #endif