Message ID: 20220519153713.819591-7-chao.p.peng@linux.intel.com (mailing list archive)
State: New
Series: KVM: mm: fd-based approach for supporting KVM guest private memory
On Thu, May 19, 2022, Chao Peng wrote: > @@ -4028,8 +4081,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, > if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) > return true; > > - return fault->slot && > - mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); > + if (fault->is_private) > + return mmu_notifier_retry(vcpu->kvm, mmu_seq); Hmm, this is somewhat undesirable, because faulting in private pfns will be blocked by unrelated mmu_notifier updates. The issue is mitigated to some degree by bumping the sequence count if and only if overlap with a memslot is detected, e.g. mapping changes that affects only userspace won't block the guest. It probably won't be an issue, but at the same time it's easy to solve, and I don't like piggybacking mmu_notifier_seq as private mappings shouldn't be subject to the mmu_notifier. That would also fix a theoretical bug in this patch where mmu_notifier_retry() wouldn't be defined if CONFIG_MEMFILE_NOTIFIER=y && CONFIG_MMU_NOTIFIER=n.a --- arch/x86/kvm/mmu/mmu.c | 11 ++++++----- include/linux/kvm_host.h | 16 +++++++++++----- virt/kvm/kvm_main.c | 2 +- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0b455c16ec64..a4cbd29433e7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4100,10 +4100,10 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, return true; if (fault->is_private) - return mmu_notifier_retry(vcpu->kvm, mmu_seq); - else - return fault->slot && - mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); + return memfile_notifier_retry(vcpu->kvm, mmu_seq); + + return fault->slot && + mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -4127,7 +4127,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - mmu_seq = vcpu->kvm->mmu_notifier_seq; + mmu_seq = fault->is_private ? 
vcpu->kvm->memfile_notifier_seq : + vcpu->kvm->mmu_notifier_seq; smp_rmb(); r = kvm_faultin_pfn(vcpu, fault); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 92afa5bddbc5..31f704c83099 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -773,16 +773,15 @@ struct kvm { struct hlist_head irq_ack_notifier_list; #endif -#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) ||\ - defined(CONFIG_MEMFILE_NOTIFIER) +#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) unsigned long mmu_notifier_seq; -#endif - -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) struct mmu_notifier mmu_notifier; long mmu_notifier_count; unsigned long mmu_notifier_range_start; unsigned long mmu_notifier_range_end; +#endif +#ifdef CONFIG_MEMFILE_NOTIFIER + unsigned long memfile_notifier_seq; #endif struct list_head devices; u64 manual_dirty_log_protect; @@ -1964,6 +1963,13 @@ static inline int mmu_notifier_retry_hva(struct kvm *kvm, } #endif +#ifdef CONFIG_MEMFILE_NOTIFIER +static inline bool memfile_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) +{ + return kvm->memfile_notifier_seq != mmu_seq; +} +#endif + #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING #define KVM_MAX_IRQ_ROUTES 4096 /* might need extension/rework in the future */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2b416d3bd60e..e6d34c964d51 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -898,7 +898,7 @@ static void kvm_private_mem_notifier_handler(struct memfile_notifier *notifier, KVM_MMU_LOCK(kvm); if (kvm_unmap_gfn_range(kvm, &gfn_range)) kvm_flush_remote_tlbs(kvm); - kvm->mmu_notifier_seq++; + kvm->memfile_notifier_seq++; KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); } base-commit: 333ef501c7f6c6d4ef2b7678905cad0f8ef3e271 -- > + else > + return fault->slot && > + mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); > } > > static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > @@ -4088,7 +4144,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > read_unlock(&vcpu->kvm->mmu_lock); > else > write_unlock(&vcpu->kvm->mmu_lock); > - kvm_release_pfn_clean(fault->pfn); > + > + if (fault->is_private) > + kvm_private_mem_put_pfn(fault->slot, fault->pfn); Why does the shmem path lock the page, and then unlock it here? Same question for why this path marks it dirty? The guest has the page mapped so the dirty flag is immediately stale. In other words, why does KVM need to do something different for private pfns? > + else > + kvm_release_pfn_clean(fault->pfn); > + > return r; > } > ... > diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h > index 7f8f1c8dbed2..1d857919a947 100644 > --- a/arch/x86/kvm/mmu/paging_tmpl.h > +++ b/arch/x86/kvm/mmu/paging_tmpl.h > @@ -878,7 +878,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > out_unlock: > write_unlock(&vcpu->kvm->mmu_lock); > - kvm_release_pfn_clean(fault->pfn); > + if (fault->is_private) Indirect MMUs can't support private faults, i.e. this is unnecessary. 
> + kvm_private_mem_put_pfn(fault->slot, fault->pfn); > + else > + kvm_release_pfn_clean(fault->pfn); > return r; > } > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 3fd168972ecd..b0a7910505ed 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -2241,4 +2241,26 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) > /* Max number of entries allowed for each kvm dirty ring */ > #define KVM_DIRTY_RING_MAX_ENTRIES 65536 > > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM > +static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot, > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > +{ > + int ret; > + pfn_t pfnt; > + pgoff_t index = gfn - slot->base_gfn + > + (slot->private_offset >> PAGE_SHIFT); > + > + ret = slot->notifier.bs->get_lock_pfn(slot->private_file, index, &pfnt, > + order); > + *pfn = pfn_t_to_pfn(pfnt); > + return ret; > +} > + > +static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot, > + kvm_pfn_t pfn) > +{ > + slot->notifier.bs->put_unlock_pfn(pfn_to_pfn_t(pfn)); > +} > +#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */ > + > #endif > -- > 2.25.1 >
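As an illustration of the suggestion above: shared faults keep retrying on the mmu_notifier sequence count, while private faults retry only on a separate memfile sequence count, so unrelated host-side invalidations no longer force private faults to be retried. Below is a minimal, self-contained C model of that retry pattern; the names and the single-threaded flow are illustrative stand-ins, not the kernel's code.

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model of the two-counter retry scheme: shared faults are retried when
 * mmu_seq changes (mmu_notifier invalidations), private faults only when
 * memfile_seq changes (private-fd truncate / punch hole).
 */
static unsigned long mmu_seq;     /* bumped by mmu_notifier invalidations */
static unsigned long memfile_seq; /* bumped by memfile notifier invalidations */

/* Snapshot the counter that covers this kind of mapping before resolving the pfn. */
static unsigned long fault_begin(bool is_private)
{
        return is_private ? memfile_seq : mmu_seq;
}

/* Retry only if "our" counter moved while the pfn was being resolved. */
static bool fault_is_stale(bool is_private, unsigned long snapshot)
{
        return (is_private ? memfile_seq : mmu_seq) != snapshot;
}

int main(void)
{
        unsigned long snap = fault_begin(true);  /* a private fault starts */

        mmu_seq++;      /* unrelated mmu_notifier event (userspace-only mapping change) */
        printf("stale after mmu_notifier event? %d\n", fault_is_stale(true, snap));  /* 0 */

        memfile_seq++;  /* private fd truncated / hole punched */
        printf("stale after memfile event?      %d\n", fault_is_stale(true, snap));  /* 1 */
        return 0;
}

Splitting the counters this way also sidesteps the CONFIG_MEMFILE_NOTIFIER=y && CONFIG_MMU_NOTIFIER=n build problem noted above, since the private path never touches the mmu_notifier state.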
On Fri, Jun 17, 2022 at 09:30:53PM +0000, Sean Christopherson wrote: > On Thu, May 19, 2022, Chao Peng wrote: > > @@ -4028,8 +4081,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, > > if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) > > return true; > > > > - return fault->slot && > > - mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); > > + if (fault->is_private) > > + return mmu_notifier_retry(vcpu->kvm, mmu_seq); > > Hmm, this is somewhat undesirable, because faulting in private pfns will be blocked > by unrelated mmu_notifier updates. The issue is mitigated to some degree by bumping > the sequence count if and only if overlap with a memslot is detected, e.g. mapping > changes that affects only userspace won't block the guest. > > It probably won't be an issue, but at the same time it's easy to solve, and I don't > like piggybacking mmu_notifier_seq as private mappings shouldn't be subject to the > mmu_notifier. > > That would also fix a theoretical bug in this patch where mmu_notifier_retry() > wouldn't be defined if CONFIG_MEMFILE_NOTIFIER=y && CONFIG_MMU_NOTIFIER=n.a Agreed, Thanks. > > --- > arch/x86/kvm/mmu/mmu.c | 11 ++++++----- > include/linux/kvm_host.h | 16 +++++++++++----- > virt/kvm/kvm_main.c | 2 +- > 3 files changed, 18 insertions(+), 11 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 0b455c16ec64..a4cbd29433e7 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -4100,10 +4100,10 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, > return true; > > if (fault->is_private) > - return mmu_notifier_retry(vcpu->kvm, mmu_seq); > - else > - return fault->slot && > - mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); > + return memfile_notifier_retry(vcpu->kvm, mmu_seq); > + > + return fault->slot && > + mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); > } > > static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > @@ -4127,7 +4127,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > if (r) > return r; > > - mmu_seq = vcpu->kvm->mmu_notifier_seq; > + mmu_seq = fault->is_private ? 
vcpu->kvm->memfile_notifier_seq : > + vcpu->kvm->mmu_notifier_seq; > smp_rmb(); > > r = kvm_faultin_pfn(vcpu, fault); > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 92afa5bddbc5..31f704c83099 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -773,16 +773,15 @@ struct kvm { > struct hlist_head irq_ack_notifier_list; > #endif > > -#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) ||\ > - defined(CONFIG_MEMFILE_NOTIFIER) > +#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) > unsigned long mmu_notifier_seq; > -#endif > - > -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) > struct mmu_notifier mmu_notifier; > long mmu_notifier_count; > unsigned long mmu_notifier_range_start; > unsigned long mmu_notifier_range_end; > +#endif > +#ifdef CONFIG_MEMFILE_NOTIFIER > + unsigned long memfile_notifier_seq; > #endif > struct list_head devices; > u64 manual_dirty_log_protect; > @@ -1964,6 +1963,13 @@ static inline int mmu_notifier_retry_hva(struct kvm *kvm, > } > #endif > > +#ifdef CONFIG_MEMFILE_NOTIFIER > +static inline bool memfile_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) > +{ > + return kvm->memfile_notifier_seq != mmu_seq; > +} > +#endif > + > #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING > > #define KVM_MAX_IRQ_ROUTES 4096 /* might need extension/rework in the future */ > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 2b416d3bd60e..e6d34c964d51 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -898,7 +898,7 @@ static void kvm_private_mem_notifier_handler(struct memfile_notifier *notifier, > KVM_MMU_LOCK(kvm); > if (kvm_unmap_gfn_range(kvm, &gfn_range)) > kvm_flush_remote_tlbs(kvm); > - kvm->mmu_notifier_seq++; > + kvm->memfile_notifier_seq++; > KVM_MMU_UNLOCK(kvm); > srcu_read_unlock(&kvm->srcu, idx); > } > > base-commit: 333ef501c7f6c6d4ef2b7678905cad0f8ef3e271 > -- > > > + else > > + return fault->slot && > > + mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); > > } > > > > static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > > @@ -4088,7 +4144,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > read_unlock(&vcpu->kvm->mmu_lock); > > else > > write_unlock(&vcpu->kvm->mmu_lock); > > - kvm_release_pfn_clean(fault->pfn); > > + > > + if (fault->is_private) > > + kvm_private_mem_put_pfn(fault->slot, fault->pfn); > > Why does the shmem path lock the page, and then unlock it here? Initially this is to prevent race between SLPT population and truncate/punch on the fd. Without this, a gfn may become stale before the page is populated in SLPT. However, with memfile_notifier_retry mechanism, this sounds not needed. > > Same question for why this path marks it dirty? The guest has the page mapped > so the dirty flag is immediately stale. I believe so. > > In other words, why does KVM need to do something different for private pfns? These two are inherited from Kirill's previous code. See if he has any comment. > > > + else > > + kvm_release_pfn_clean(fault->pfn); > > + > > return r; > > } > > > > ... 
> > > diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h > > index 7f8f1c8dbed2..1d857919a947 100644 > > --- a/arch/x86/kvm/mmu/paging_tmpl.h > > +++ b/arch/x86/kvm/mmu/paging_tmpl.h > > @@ -878,7 +878,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > > > out_unlock: > > write_unlock(&vcpu->kvm->mmu_lock); > > - kvm_release_pfn_clean(fault->pfn); > > + if (fault->is_private) > > Indirect MMUs can't support private faults, i.e. this is unnecessary. Okay. > > > + kvm_private_mem_put_pfn(fault->slot, fault->pfn); > > + else > > + kvm_release_pfn_clean(fault->pfn); > > return r; > > } > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > index 3fd168972ecd..b0a7910505ed 100644 > > --- a/include/linux/kvm_host.h > > +++ b/include/linux/kvm_host.h > > @@ -2241,4 +2241,26 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) > > /* Max number of entries allowed for each kvm dirty ring */ > > #define KVM_DIRTY_RING_MAX_ENTRIES 65536 > > > > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM > > +static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot, > > + gfn_t gfn, kvm_pfn_t *pfn, int *order) > > +{ > > + int ret; > > + pfn_t pfnt; > > + pgoff_t index = gfn - slot->base_gfn + > > + (slot->private_offset >> PAGE_SHIFT); > > + > > + ret = slot->notifier.bs->get_lock_pfn(slot->private_file, index, &pfnt, > > + order); > > + *pfn = pfn_t_to_pfn(pfnt); > > + return ret; > > +} > > + > > +static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot, > > + kvm_pfn_t pfn) > > +{ > > + slot->notifier.bs->put_unlock_pfn(pfn_to_pfn_t(pfn)); > > +} > > +#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */ > > + > > #endif > > -- > > 2.25.1 > >
On 5/19/2022 9:07 PM, Chao Peng wrote: > A page fault can carry the information of whether the access if private > or not for KVM_MEM_PRIVATE memslot, this can be filled by architecture > code(like TDX code). To handle page faut for such access, KVM maps the > page only when this private property matches host's view on this page > which can be decided by checking whether the corresponding page is > populated in the private fd or not. A page is considered as private when > the page is populated in the private fd, otherwise it's shared. > > For a successful match, private pfn is obtained with memfile_notifier > callbacks from private fd and shared pfn is obtained with existing > get_user_pages. > > For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to > userspace. Userspace then can convert memory between private/shared from > host's view then retry the access. > > Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com> > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com> > --- > arch/x86/kvm/mmu.h | 1 + > arch/x86/kvm/mmu/mmu.c | 70 +++++++++++++++++++++++++++++++-- > arch/x86/kvm/mmu/mmu_internal.h | 17 ++++++++ > arch/x86/kvm/mmu/mmutrace.h | 1 + > arch/x86/kvm/mmu/paging_tmpl.h | 5 ++- > include/linux/kvm_host.h | 22 +++++++++++ > 6 files changed, 112 insertions(+), 4 deletions(-) > > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h > index 7e258cc94152..c84835762249 100644 > --- a/arch/x86/kvm/mmu.h > +++ b/arch/x86/kvm/mmu.h > @@ -176,6 +176,7 @@ struct kvm_page_fault { > > /* Derived from mmu and global state. */ > const bool is_tdp; > + const bool is_private; > const bool nx_huge_page_workaround_enabled; > > /* > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index afe18d70ece7..e18460e0d743 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -2899,6 +2899,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, > if (max_level == PG_LEVEL_4K) > return PG_LEVEL_4K; > > + if (kvm_slot_is_private(slot)) > + return max_level; Can you explain the rationale behind the above change? AFAIU, this overrides the transparent_hugepage=never setting for both shared and private mappings. > host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); > return min(host_level, max_level); > } Regards Nikunj
On Fri, Jun 24, 2022 at 09:28:23AM +0530, Nikunj A. Dadhania wrote: > On 5/19/2022 9:07 PM, Chao Peng wrote: > > A page fault can carry the information of whether the access if private > > or not for KVM_MEM_PRIVATE memslot, this can be filled by architecture > > code(like TDX code). To handle page faut for such access, KVM maps the > > page only when this private property matches host's view on this page > > which can be decided by checking whether the corresponding page is > > populated in the private fd or not. A page is considered as private when > > the page is populated in the private fd, otherwise it's shared. > > > > For a successful match, private pfn is obtained with memfile_notifier > > callbacks from private fd and shared pfn is obtained with existing > > get_user_pages. > > > > For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to > > userspace. Userspace then can convert memory between private/shared from > > host's view then retry the access. > > > > Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com> > > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> > > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com> > > --- > > arch/x86/kvm/mmu.h | 1 + > > arch/x86/kvm/mmu/mmu.c | 70 +++++++++++++++++++++++++++++++-- > > arch/x86/kvm/mmu/mmu_internal.h | 17 ++++++++ > > arch/x86/kvm/mmu/mmutrace.h | 1 + > > arch/x86/kvm/mmu/paging_tmpl.h | 5 ++- > > include/linux/kvm_host.h | 22 +++++++++++ > > 6 files changed, 112 insertions(+), 4 deletions(-) > > > > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h > > index 7e258cc94152..c84835762249 100644 > > --- a/arch/x86/kvm/mmu.h > > +++ b/arch/x86/kvm/mmu.h > > @@ -176,6 +176,7 @@ struct kvm_page_fault { > > > > /* Derived from mmu and global state. */ > > const bool is_tdp; > > + const bool is_private; > > const bool nx_huge_page_workaround_enabled; > > > > /* > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > index afe18d70ece7..e18460e0d743 100644 > > --- a/arch/x86/kvm/mmu/mmu.c > > +++ b/arch/x86/kvm/mmu/mmu.c > > @@ -2899,6 +2899,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, > > if (max_level == PG_LEVEL_4K) > > return PG_LEVEL_4K; > > > > + if (kvm_slot_is_private(slot)) > > + return max_level; > > Can you explain the rationale behind the above change? > AFAIU, this overrides the transparent_hugepage=never setting for both > shared and private mappings. As Sean pointed out, this should check against fault->is_private instead of the slot. For private fault, the level is retrieved and stored to fault->max_level in kvm_faultin_pfn_private() instead of here. For shared fault, it will continue to query host_level below. For private fault, the host level has already been accounted in kvm_faultin_pfn_private(). Chao > > > host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); > > return min(host_level, max_level); > > } > > Regards > Nikunj
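For reference, the conversion described above is the patch's order_to_level(): pick the largest page-table level whose size fits within the allocation order reported by the private backing store. The standalone sketch below uses x86-style constants (4K/2M/1G levels) assumed here rather than pulled from KVM headers.

#include <stdio.h>

#define PAGE_SHIFT      12
#define PG_LEVEL_4K     1
#define PG_LEVEL_2M     2
#define PG_LEVEL_1G     3
#define MAX_LEVEL       PG_LEVEL_1G

/* Shift of a mapping at the given level: 12, 21, 30 for 4K/2M/1G on x86. */
static int level_shift(int level)
{
        return PAGE_SHIFT + (level - 1) * 9;
}

/* Largest level whose page size fits in a backing allocation of 2^order pages. */
static int order_to_level(int order)
{
        int level;

        for (level = MAX_LEVEL; level > PG_LEVEL_4K; level--)
                if (order >= level_shift(level) - PAGE_SHIFT)
                        return level;
        return PG_LEVEL_4K;
}

int main(void)
{
        /* order 0 -> 4K (1), order 9 -> 2M (2), order 18 -> 1G (3) */
        printf("%d %d %d\n", order_to_level(0), order_to_level(9), order_to_level(18));
        return 0;
}

In the quoted patch, the result is then clamped as fault->max_level = min(order_to_level(order), fault->max_level) inside kvm_faultin_pfn_private(), which is why the private path does not need the host_pfn_mapping_level() query.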
... > > > /* > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > > index afe18d70ece7..e18460e0d743 100644 > > > --- a/arch/x86/kvm/mmu/mmu.c > > > +++ b/arch/x86/kvm/mmu/mmu.c > > > @@ -2899,6 +2899,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, > > > if (max_level == PG_LEVEL_4K) > > > return PG_LEVEL_4K; > > > > > > + if (kvm_slot_is_private(slot)) > > > + return max_level; > > > > Can you explain the rationale behind the above change? > > AFAIU, this overrides the transparent_hugepage=never setting for both > > shared and private mappings. > > As Sean pointed out, this should check against fault->is_private instead > of the slot. For private fault, the level is retrieved and stored to > fault->max_level in kvm_faultin_pfn_private() instead of here. > > For shared fault, it will continue to query host_level below. For > private fault, the host level has already been accounted in > kvm_faultin_pfn_private(). > > Chao > > With transparent_hugepages=always setting I see issues with the current implementation. Scenario: 1) Guest accesses a gfn range 0x800-0xa00 as private 2) Guest calls mapgpa to convert the range 0x84d-0x86e as shared 3) Guest tries to access recently converted memory as shared for the first time Guest VM shutdown is observed after step 3 -> Guest is unable to proceed further since somehow code section is not as expected Corresponding KVM trace logs after step 3: VCPU-0-61883 [078] ..... 72276.115679: kvm_page_fault: address 84d000 error_code 4 VCPU-0-61883 [078] ..... 72276.127005: kvm_mmu_spte_requested: gfn 84d pfn 100b4a4d level 2 VCPU-0-61883 [078] ..... 72276.127008: kvm_tdp_mmu_spte_changed: as id 0 gfn 800 level 2 old_spte 100b1b16827 new_spte 100b4a00ea7 VCPU-0-61883 [078] ..... 72276.127009: kvm_mmu_prepare_zap_page: sp gen 0 gfn 800 l1 8-byte q0 direct wux nxe ad root 0 sync VCPU-0-61883 [078] ..... 72276.127009: kvm_tdp_mmu_spte_changed: as id 0 gfn 800 level 1 old_spte 1003eb27e67 new_spte 5a0 VCPU-0-61883 [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as id 0 gfn 801 level 1 old_spte 10056cc8e67 new_spte 5a0 VCPU-0-61883 [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as id 0 gfn 802 level 1 old_spte 10056fa2e67 new_spte 5a0 VCPU-0-61883 [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as id 0 gfn 803 level 1 old_spte 0 new_spte 5a0 .... VCPU-0-61883 [078] ..... 72276.127089: kvm_tdp_mmu_spte_changed: as id 0 gfn 9ff level 1 old_spte 100a43f4e67 new_spte 5a0 VCPU-0-61883 [078] ..... 72276.127090: kvm_mmu_set_spte: gfn 800 spte 100b4a00ea7 (rwxu) level 2 at 10052fa5020 VCPU-0-61883 [078] ..... 72276.127091: kvm_fpu: unload Looks like with transparent huge pages enabled kvm tried to handle the shared memory fault on 0x84d gfn by coalescing nearby 4K pages to form a contiguous 2MB page mapping at gfn 0x800, since level 2 was requested in kvm_mmu_spte_requested. This caused the private memory contents from regions 0x800-0x84c and 0x86e-0xa00 to get unmapped from the guest leading to guest vm shutdown. Does getting the mapping level as per the fault access type help address the above issue? Any such coalescing should not cross between private to shared or shared to private memory regions. > > > host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); > > > return min(host_level, max_level); > > > } > > Regards, Vishal
On Thu, Jun 30, 2022 at 12:14:13PM -0700, Vishal Annapurve wrote: > With transparent_hugepages=always setting I see issues with the > current implementation. > > Scenario: > 1) Guest accesses a gfn range 0x800-0xa00 as private > 2) Guest calls mapgpa to convert the range 0x84d-0x86e as shared > 3) Guest tries to access recently converted memory as shared for the first time > Guest VM shutdown is observed after step 3 -> Guest is unable to > proceed further since somehow code section is not as expected > > Corresponding KVM trace logs after step 3: > VCPU-0-61883 [078] ..... 72276.115679: kvm_page_fault: address > 84d000 error_code 4 > VCPU-0-61883 [078] ..... 72276.127005: kvm_mmu_spte_requested: gfn > 84d pfn 100b4a4d level 2 > VCPU-0-61883 [078] ..... 72276.127008: kvm_tdp_mmu_spte_changed: as > id 0 gfn 800 level 2 old_spte 100b1b16827 new_spte 100b4a00ea7 > VCPU-0-61883 [078] ..... 72276.127009: kvm_mmu_prepare_zap_page: sp > gen 0 gfn 800 l1 8-byte q0 direct wux nxe ad root 0 sync > VCPU-0-61883 [078] ..... 72276.127009: kvm_tdp_mmu_spte_changed: as > id 0 gfn 800 level 1 old_spte 1003eb27e67 new_spte 5a0 > VCPU-0-61883 [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as > id 0 gfn 801 level 1 old_spte 10056cc8e67 new_spte 5a0 > VCPU-0-61883 [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as > id 0 gfn 802 level 1 old_spte 10056fa2e67 new_spte 5a0 > VCPU-0-61883 [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as > id 0 gfn 803 level 1 old_spte 0 new_spte 5a0 > .... > VCPU-0-61883 [078] ..... 72276.127089: kvm_tdp_mmu_spte_changed: as > id 0 gfn 9ff level 1 old_spte 100a43f4e67 new_spte 5a0 > VCPU-0-61883 [078] ..... 72276.127090: kvm_mmu_set_spte: gfn 800 > spte 100b4a00ea7 (rwxu) level 2 at 10052fa5020 > VCPU-0-61883 [078] ..... 72276.127091: kvm_fpu: unload > > Looks like with transparent huge pages enabled kvm tried to handle the > shared memory fault on 0x84d gfn by coalescing nearby 4K pages > to form a contiguous 2MB page mapping at gfn 0x800, since level 2 was > requested in kvm_mmu_spte_requested. > This caused the private memory contents from regions 0x800-0x84c and > 0x86e-0xa00 to get unmapped from the guest leading to guest vm > shutdown. Interesting... seems like that wouldn't be an issue for non-UPM SEV, since the private pages would still be mapped as part of that 2M mapping, and it's completely up to the guest as to whether it wants to access as private or shared. But for UPM it makes sense this would cause issues. > > Does getting the mapping level as per the fault access type help > address the above issue? Any such coalescing should not cross between > private to > shared or shared to private memory regions. Doesn't seem like changing the check to fault->is_private would help in your particular case, since the subsequent host_pfn_mapping_level() call only seems to limit the mapping level to whatever the mapping level is for the HVA in the host page table. Seems like with UPM we need some additional handling here that also checks that the entire 2M HVA range is backed by non-private memory. Non-UPM SNP hypervisor patches already have a similar hook added to host_pfn_mapping_level() which implements such a check via RMP table, so UPM might need something similar: https://github.com/AMDESE/linux/commit/ae4475bc740eb0b9d031a76412b0117339794139 -Mike > > > > > host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); > > > > return min(host_level, max_level); > > > > } > > > > > Regards, > Vishal
On 7/1/2022 6:21 AM, Michael Roth wrote: > On Thu, Jun 30, 2022 at 12:14:13PM -0700, Vishal Annapurve wrote: >> With transparent_hugepages=always setting I see issues with the >> current implementation. >> >> Scenario: >> 1) Guest accesses a gfn range 0x800-0xa00 as private >> 2) Guest calls mapgpa to convert the range 0x84d-0x86e as shared >> 3) Guest tries to access recently converted memory as shared for the first time >> Guest VM shutdown is observed after step 3 -> Guest is unable to >> proceed further since somehow code section is not as expected >> >> Corresponding KVM trace logs after step 3: >> VCPU-0-61883 [078] ..... 72276.115679: kvm_page_fault: address >> 84d000 error_code 4 >> VCPU-0-61883 [078] ..... 72276.127005: kvm_mmu_spte_requested: gfn >> 84d pfn 100b4a4d level 2 >> VCPU-0-61883 [078] ..... 72276.127008: kvm_tdp_mmu_spte_changed: as >> id 0 gfn 800 level 2 old_spte 100b1b16827 new_spte 100b4a00ea7 >> VCPU-0-61883 [078] ..... 72276.127009: kvm_mmu_prepare_zap_page: sp >> gen 0 gfn 800 l1 8-byte q0 direct wux nxe ad root 0 sync >> VCPU-0-61883 [078] ..... 72276.127009: kvm_tdp_mmu_spte_changed: as >> id 0 gfn 800 level 1 old_spte 1003eb27e67 new_spte 5a0 >> VCPU-0-61883 [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as >> id 0 gfn 801 level 1 old_spte 10056cc8e67 new_spte 5a0 >> VCPU-0-61883 [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as >> id 0 gfn 802 level 1 old_spte 10056fa2e67 new_spte 5a0 >> VCPU-0-61883 [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as >> id 0 gfn 803 level 1 old_spte 0 new_spte 5a0 >> .... >> VCPU-0-61883 [078] ..... 72276.127089: kvm_tdp_mmu_spte_changed: as >> id 0 gfn 9ff level 1 old_spte 100a43f4e67 new_spte 5a0 >> VCPU-0-61883 [078] ..... 72276.127090: kvm_mmu_set_spte: gfn 800 >> spte 100b4a00ea7 (rwxu) level 2 at 10052fa5020 >> VCPU-0-61883 [078] ..... 72276.127091: kvm_fpu: unload >> >> Looks like with transparent huge pages enabled kvm tried to handle the >> shared memory fault on 0x84d gfn by coalescing nearby 4K pages >> to form a contiguous 2MB page mapping at gfn 0x800, since level 2 was >> requested in kvm_mmu_spte_requested. >> This caused the private memory contents from regions 0x800-0x84c and >> 0x86e-0xa00 to get unmapped from the guest leading to guest vm >> shutdown. > > Interesting... seems like that wouldn't be an issue for non-UPM SEV, since > the private pages would still be mapped as part of that 2M mapping, and > it's completely up to the guest as to whether it wants to access as > private or shared. But for UPM it makes sense this would cause issues. > >> >> Does getting the mapping level as per the fault access type help >> address the above issue? Any such coalescing should not cross between >> private to >> shared or shared to private memory regions. > > Doesn't seem like changing the check to fault->is_private would help in > your particular case, since the subsequent host_pfn_mapping_level() call > only seems to limit the mapping level to whatever the mapping level is > for the HVA in the host page table. > > Seems like with UPM we need some additional handling here that also > checks that the entire 2M HVA range is backed by non-private memory. 
> > Non-UPM SNP hypervisor patches already have a similar hook added to > host_pfn_mapping_level() which implements such a check via RMP table, so > UPM might need something similar: > > https://github.com/AMDESE/linux/commit/ae4475bc740eb0b9d031a76412b0117339794139 > > -Mike > For TDX, we try to track the page type (shared, private, mixed) of each gfn at given level. Only when the type is shared/private, can it be mapped at that level. When it's mixed, i.e., it contains both shared pages and private pages at given level, it has to go to next smaller level. https://github.com/intel/tdx/commit/ed97f4042eb69a210d9e972ccca6a84234028cad
On Fri, Jul 01, 2022, Xiaoyao Li wrote: > On 7/1/2022 6:21 AM, Michael Roth wrote: > > On Thu, Jun 30, 2022 at 12:14:13PM -0700, Vishal Annapurve wrote: > > > With transparent_hugepages=always setting I see issues with the > > > current implementation. ... > > > Looks like with transparent huge pages enabled kvm tried to handle the > > > shared memory fault on 0x84d gfn by coalescing nearby 4K pages > > > to form a contiguous 2MB page mapping at gfn 0x800, since level 2 was > > > requested in kvm_mmu_spte_requested. > > > This caused the private memory contents from regions 0x800-0x84c and > > > 0x86e-0xa00 to get unmapped from the guest leading to guest vm > > > shutdown. > > > > Interesting... seems like that wouldn't be an issue for non-UPM SEV, since > > the private pages would still be mapped as part of that 2M mapping, and > > it's completely up to the guest as to whether it wants to access as > > private or shared. But for UPM it makes sense this would cause issues. > > > > > > > > Does getting the mapping level as per the fault access type help > > > address the above issue? Any such coalescing should not cross between > > > private to > > > shared or shared to private memory regions. > > > > Doesn't seem like changing the check to fault->is_private would help in > > your particular case, since the subsequent host_pfn_mapping_level() call > > only seems to limit the mapping level to whatever the mapping level is > > for the HVA in the host page table. > > > > Seems like with UPM we need some additional handling here that also > > checks that the entire 2M HVA range is backed by non-private memory. > > > > Non-UPM SNP hypervisor patches already have a similar hook added to > > host_pfn_mapping_level() which implements such a check via RMP table, so > > UPM might need something similar: > > > > https://github.com/AMDESE/linux/commit/ae4475bc740eb0b9d031a76412b0117339794139 > > > > -Mike > > > > For TDX, we try to track the page type (shared, private, mixed) of each gfn > at given level. Only when the type is shared/private, can it be mapped at > that level. When it's mixed, i.e., it contains both shared pages and private > pages at given level, it has to go to next smaller level. > > https://github.com/intel/tdx/commit/ed97f4042eb69a210d9e972ccca6a84234028cad Hmm, so a new slot->arch.page_attr array shouldn't be necessary, KVM can instead update slot->arch.lpage_info on shared<->private conversions. Detecting whether a given range is partially mapped could get nasty if KVM defers tracking to the backing store, but if KVM itself does the tracking as was previously suggested[*], then updating lpage_info should be relatively straightfoward, e.g. use xa_for_each_range() to see if a given 2mb/1gb range is completely covered (fully shared) or not covered at all (fully private). [*] https://lore.kernel.org/all/YofeZps9YXgtP3f1@google.com
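To make the "completely covered / not covered at all" check concrete, here is a small, self-contained C model that classifies a 2MB-aligned gfn range as fully shared, fully private, or mixed by scanning a list of gfns that have been converted to shared. The plain array is only a stand-in for the xarray mentioned above (the real code would walk it with xa_for_each_range()); all names here are invented for illustration.

#include <stddef.h>
#include <stdio.h>

typedef unsigned long long gfn_t;

enum range_attr { RANGE_PRIVATE, RANGE_SHARED, RANGE_MIXED };

/* Count how many gfns in [start, end) appear in the shared-gfn list. */
static size_t count_shared(const gfn_t *shared, size_t n, gfn_t start, gfn_t end)
{
        size_t i, hits = 0;

        for (i = 0; i < n; i++)
                if (shared[i] >= start && shared[i] < end)
                        hits++;
        return hits;
}

/* Completely covered -> fully shared, not covered at all -> fully private. */
static enum range_attr classify(const gfn_t *shared, size_t n,
                                gfn_t start, gfn_t npages)
{
        size_t hits = count_shared(shared, n, start, start + npages);

        if (hits == npages)
                return RANGE_SHARED;
        if (hits == 0)
                return RANGE_PRIVATE;
        return RANGE_MIXED;
}

int main(void)
{
        /* Vishal's repro: 0x800-0xa00 starts private, 0x84d-0x86e is converted to shared. */
        gfn_t shared[0x40];
        size_t n = 0;
        gfn_t g;

        for (g = 0x84d; g < 0x86e; g++)
                shared[n++] = g;

        /* The 2MB range at gfn 0x800 is mixed, so it must not be mapped with a huge page. */
        printf("2MB range at 0x800 is mixed: %d\n",
               classify(shared, n, 0x800, 512) == RANGE_MIXED);
        return 0;
}

When a range comes back mixed, the fault has to fall back to the next smaller level, which matches the TDX behavior Xiaoyao describes above.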
On 7/8/2022 4:08 AM, Sean Christopherson wrote: > On Fri, Jul 01, 2022, Xiaoyao Li wrote: >> On 7/1/2022 6:21 AM, Michael Roth wrote: >>> On Thu, Jun 30, 2022 at 12:14:13PM -0700, Vishal Annapurve wrote: >>>> With transparent_hugepages=always setting I see issues with the >>>> current implementation. > > ... > >>>> Looks like with transparent huge pages enabled kvm tried to handle the >>>> shared memory fault on 0x84d gfn by coalescing nearby 4K pages >>>> to form a contiguous 2MB page mapping at gfn 0x800, since level 2 was >>>> requested in kvm_mmu_spte_requested. >>>> This caused the private memory contents from regions 0x800-0x84c and >>>> 0x86e-0xa00 to get unmapped from the guest leading to guest vm >>>> shutdown. >>> >>> Interesting... seems like that wouldn't be an issue for non-UPM SEV, since >>> the private pages would still be mapped as part of that 2M mapping, and >>> it's completely up to the guest as to whether it wants to access as >>> private or shared. But for UPM it makes sense this would cause issues. >>> >>>> >>>> Does getting the mapping level as per the fault access type help >>>> address the above issue? Any such coalescing should not cross between >>>> private to >>>> shared or shared to private memory regions. >>> >>> Doesn't seem like changing the check to fault->is_private would help in >>> your particular case, since the subsequent host_pfn_mapping_level() call >>> only seems to limit the mapping level to whatever the mapping level is >>> for the HVA in the host page table. >>> >>> Seems like with UPM we need some additional handling here that also >>> checks that the entire 2M HVA range is backed by non-private memory. >>> >>> Non-UPM SNP hypervisor patches already have a similar hook added to >>> host_pfn_mapping_level() which implements such a check via RMP table, so >>> UPM might need something similar: >>> >>> https://github.com/AMDESE/linux/commit/ae4475bc740eb0b9d031a76412b0117339794139 >>> >>> -Mike >>> >> >> For TDX, we try to track the page type (shared, private, mixed) of each gfn >> at given level. Only when the type is shared/private, can it be mapped at >> that level. When it's mixed, i.e., it contains both shared pages and private >> pages at given level, it has to go to next smaller level. >> >> https://github.com/intel/tdx/commit/ed97f4042eb69a210d9e972ccca6a84234028cad > > Hmm, so a new slot->arch.page_attr array shouldn't be necessary, KVM can instead > update slot->arch.lpage_info on shared<->private conversions. Detecting whether > a given range is partially mapped could get nasty if KVM defers tracking to the > backing store, but if KVM itself does the tracking as was previously suggested[*], > then updating lpage_info should be relatively straightfoward, e.g. use > xa_for_each_range() to see if a given 2mb/1gb range is completely covered (fully > shared) or not covered at all (fully private). > > [*] https://lore.kernel.org/all/YofeZps9YXgtP3f1@google.com Yes, slot->arch.page_attr was introduced to help identify whether a page is completely shared/private at given level. It seems XARRAY can serve the same purpose, though I know nothing about it. Looking forward to seeing the patch of using XARRAY. yes, update slot->arch.lpage_info is good to utilize the existing logic and Isaku has applied it to slot->arch.lpage_info for 2MB support patches.
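An alternative to scanning at fault time, closer in spirit to updating slot->arch.lpage_info at conversion time, is to keep per-2MB-block bookkeeping and consult it when choosing the mapping level: a huge mapping is allowed only while the block is uniformly private or uniformly shared. The counter array and helpers below are invented for illustration and are not the lpage_info code.

#include <stdbool.h>
#include <stdio.h>

#define PAGES_PER_2M    512

/* Per-2MB-block count of private 4K pages, indexed by gfn / 512. */
static unsigned int priv_count[16];

/* Mark [gfn, gfn + npages) private or shared, updating the block counts. */
static void convert_range(unsigned long gfn, unsigned long npages, bool to_private)
{
        unsigned long i;

        for (i = 0; i < npages; i++) {
                unsigned long block = (gfn + i) / PAGES_PER_2M;

                if (to_private)
                        priv_count[block]++;
                else
                        priv_count[block]--;
        }
}

/* A 2MB mapping is allowed only when the block is uniformly private or shared. */
static bool huge_page_allowed(unsigned long gfn)
{
        unsigned int c = priv_count[gfn / PAGES_PER_2M];

        return c == 0 || c == PAGES_PER_2M;
}

int main(void)
{
        convert_range(0x800, 0x200, true);           /* whole 0x800-0xa00 private */
        printf("huge page ok at 0x800: %d\n", huge_page_allowed(0x800));  /* 1 */

        convert_range(0x84d, 0x86e - 0x84d, false);  /* 0x84d-0x86e becomes shared */
        printf("huge page ok at 0x800: %d\n", huge_page_allowed(0x800));  /* 0 */
        return 0;
}

Because the answer only changes at conversion time, this kind of bookkeeping folds naturally into the existing disallow-hugepage logic, which is the appeal of reusing lpage_info.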
> > Hmm, so a new slot->arch.page_attr array shouldn't be necessary, KVM can instead > > update slot->arch.lpage_info on shared<->private conversions. Detecting whether > > a given range is partially mapped could get nasty if KVM defers tracking to the > > backing store, but if KVM itself does the tracking as was previously suggested[*], > > then updating lpage_info should be relatively straightfoward, e.g. use > > xa_for_each_range() to see if a given 2mb/1gb range is completely covered (fully > > shared) or not covered at all (fully private). > > > > [*] https://lore.kernel.org/all/YofeZps9YXgtP3f1@google.com > > Yes, slot->arch.page_attr was introduced to help identify whether a page > is completely shared/private at given level. It seems XARRAY can serve > the same purpose, though I know nothing about it. Looking forward to > seeing the patch of using XARRAY. > > yes, update slot->arch.lpage_info is good to utilize the existing logic > and Isaku has applied it to slot->arch.lpage_info for 2MB support patches. Chao, are you planning to implement these changes to ensure proper handling of hugepages partially mapped as private/shared in subsequent versions of this series? Or is this something left to be handled by the architecture specific code? Regards, Vishal
On Wed, Jul 20, 2022 at 04:08:10PM -0700, Vishal Annapurve wrote: > > > Hmm, so a new slot->arch.page_attr array shouldn't be necessary, KVM can instead > > > update slot->arch.lpage_info on shared<->private conversions. Detecting whether > > > a given range is partially mapped could get nasty if KVM defers tracking to the > > > backing store, but if KVM itself does the tracking as was previously suggested[*], > > > then updating lpage_info should be relatively straightfoward, e.g. use > > > xa_for_each_range() to see if a given 2mb/1gb range is completely covered (fully > > > shared) or not covered at all (fully private). > > > > > > [*] https://lore.kernel.org/all/YofeZps9YXgtP3f1@google.com > > > > Yes, slot->arch.page_attr was introduced to help identify whether a page > > is completely shared/private at given level. It seems XARRAY can serve > > the same purpose, though I know nothing about it. Looking forward to > > seeing the patch of using XARRAY. > > > > yes, update slot->arch.lpage_info is good to utilize the existing logic > > and Isaku has applied it to slot->arch.lpage_info for 2MB support patches. > > Chao, are you planning to implement these changes to ensure proper > handling of hugepages partially mapped as private/shared in subsequent > versions of this series? > Or is this something left to be handled by the architecture specific code? Ah, this topic has moved to a different place; I should give an update here. There were more discussions under the TDX KVM patch series, and I actually just sent out the draft code for this: https://lkml.org/lkml/2022/7/20/610 That patch is based on UPM v7 here. If I can get more feedback there, then I will include an updated version in UPM v8. If you have bandwidth, you can also play with that patch; any feedback is welcome. Chao > > Regards, > Vishal
On Fri, Jun 17, 2022 at 09:30:53PM +0000, Sean Christopherson wrote: > > @@ -4088,7 +4144,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > read_unlock(&vcpu->kvm->mmu_lock); > > else > > write_unlock(&vcpu->kvm->mmu_lock); > > - kvm_release_pfn_clean(fault->pfn); > > + > > + if (fault->is_private) > > + kvm_private_mem_put_pfn(fault->slot, fault->pfn); > > Why does the shmem path lock the page, and then unlock it here? The lock is required to avoid a race with truncate / punch hole: if truncate happens after get_pfn(), but before the page gets into the SEPT, we are screwed. > Same question for why this path marks it dirty? The guest has the page mapped > so the dirty flag is immediately stale. If the page is clean and its refcount is not elevated, vmscan is free to drop the page from the page cache. I don't think we want this. > In other words, why does KVM need to do something different for private pfns? Because in the traditional KVM memslot scheme, core mm takes care of this. The changes in v7 are wrong: the page has to be locked until it lands in the SEPT, and it must be marked dirty before unlocking.
On Fri, Aug 19, 2022, Kirill A. Shutemov wrote: > On Fri, Jun 17, 2022 at 09:30:53PM +0000, Sean Christopherson wrote: > > > @@ -4088,7 +4144,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > > read_unlock(&vcpu->kvm->mmu_lock); > > > else > > > write_unlock(&vcpu->kvm->mmu_lock); > > > - kvm_release_pfn_clean(fault->pfn); > > > + > > > + if (fault->is_private) > > > + kvm_private_mem_put_pfn(fault->slot, fault->pfn); > > > > Why does the shmem path lock the page, and then unlock it here? > > The lock is required to avoid a race with truncate / punch hole: if truncate > happens after get_pfn(), but before the page gets into the SEPT, we are screwed. Getting the PFN into the SPTE doesn't provide protection in and of itself. The protection against truncation and whatnot comes from KVM getting a notification and either retrying the fault (notification acquires mmu_lock before direct_page_fault()), or blocking the notification (truncate / punch hole) until after KVM installs the SPTE. I.e. KVM just needs to ensure it doesn't install a SPTE _after_ getting notified. If the API is similar to gup(), i.e. only elevates the refcount but doesn't lock the page, then there's no need for a separate kvm_private_mem_put_pfn(), and in fact no need for ->put_unlock_pfn() because KVM can do set_page_dirty() and put_page() directly as needed using all of KVM's existing mechanisms.
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 7e258cc94152..c84835762249 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -176,6 +176,7 @@ struct kvm_page_fault { /* Derived from mmu and global state. */ const bool is_tdp; + const bool is_private; const bool nx_huge_page_workaround_enabled; /* diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index afe18d70ece7..e18460e0d743 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2899,6 +2899,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; + if (kvm_slot_is_private(slot)) + return max_level; + host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); return min(host_level, max_level); } @@ -3948,10 +3951,54 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } +static inline u8 order_to_level(int order) +{ + enum pg_level level; + + for (level = KVM_MAX_HUGEPAGE_LEVEL; level > PG_LEVEL_4K; level--) + if (order >= page_level_shift(level) - PAGE_SHIFT) + return level; + return level; +} + +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + int order; + struct kvm_memory_slot *slot = fault->slot; + bool private_exist = !kvm_private_mem_get_pfn(slot, fault->gfn, + &fault->pfn, &order); + + if (fault->is_private != private_exist) { + if (private_exist) + kvm_private_mem_put_pfn(slot, fault->pfn); + + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; + if (fault->is_private) + vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE; + else + vcpu->run->memory.flags = 0; + vcpu->run->memory.padding = 0; + vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT; + vcpu->run->memory.size = PAGE_SIZE; + return RET_PF_USER; + } + + if (fault->is_private) { + fault->max_level = min(order_to_level(order), fault->max_level); + fault->map_writable = !(slot->flags & KVM_MEM_READONLY); + return RET_PF_FIXED; + } + + /* Fault is shared, fallthrough to the standard path. */ + return RET_PF_CONTINUE; +} + static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; bool async; + int r; /* * Retry the page fault if the gfn hit a memslot that is being deleted @@ -3980,6 +4027,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return RET_PF_EMULATE; } + if (kvm_slot_is_private(slot)) { + r = kvm_faultin_pfn_private(vcpu, fault); + if (r != RET_PF_CONTINUE) + return r == RET_PF_FIXED ? 
RET_PF_CONTINUE : r; + } + async = false; fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, fault->write, &fault->map_writable, @@ -4028,8 +4081,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) return true; - return fault->slot && - mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); + if (fault->is_private) + return mmu_notifier_retry(vcpu->kvm, mmu_seq); + else + return fault->slot && + mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -4088,7 +4144,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + + if (fault->is_private) + kvm_private_mem_put_pfn(fault->slot, fault->pfn); + else + kvm_release_pfn_clean(fault->pfn); + return r; } @@ -5372,6 +5433,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, return -EIO; } + if (r == RET_PF_USER) + return 0; + if (r < 0) return r; if (r != RET_PF_EMULATE) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index c0e502b17ef7..14932cf97655 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -147,6 +147,7 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); * RET_PF_RETRY: let CPU fault again on the address. * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + * RET_PF_USER: need to exit to userspace to handle this fault. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. 
* @@ -163,6 +164,7 @@ enum { RET_PF_RETRY, RET_PF_EMULATE, RET_PF_INVALID, + RET_PF_USER, RET_PF_FIXED, RET_PF_SPURIOUS, }; @@ -178,4 +180,19 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); +#ifndef CONFIG_HAVE_KVM_PRIVATE_MEM +static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *order) +{ + WARN_ON_ONCE(1); + return -EOPNOTSUPP; +} + +static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot, + kvm_pfn_t pfn) +{ + WARN_ON_ONCE(1); +} +#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */ + #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ae86820cef69..2d7555381955 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -58,6 +58,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); TRACE_DEFINE_ENUM(RET_PF_RETRY); TRACE_DEFINE_ENUM(RET_PF_EMULATE); TRACE_DEFINE_ENUM(RET_PF_INVALID); +TRACE_DEFINE_ENUM(RET_PF_USER); TRACE_DEFINE_ENUM(RET_PF_FIXED); TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 7f8f1c8dbed2..1d857919a947 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -878,7 +878,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + if (fault->is_private) + kvm_private_mem_put_pfn(fault->slot, fault->pfn); + else + kvm_release_pfn_clean(fault->pfn); return r; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3fd168972ecd..b0a7910505ed 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2241,4 +2241,26 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) /* Max number of entries allowed for each kvm dirty ring */ #define KVM_DIRTY_RING_MAX_ENTRIES 65536 +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM +static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *order) +{ + int ret; + pfn_t pfnt; + pgoff_t index = gfn - slot->base_gfn + + (slot->private_offset >> PAGE_SHIFT); + + ret = slot->notifier.bs->get_lock_pfn(slot->private_file, index, &pfnt, + order); + *pfn = pfn_t_to_pfn(pfnt); + return ret; +} + +static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot, + kvm_pfn_t pfn) +{ + slot->notifier.bs->put_unlock_pfn(pfn_to_pfn_t(pfn)); +} +#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */ + #endif
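Stepping back, the core decision in the quoted kvm_faultin_pfn_private() is a match check: the fault is served only when the guest's access type (private vs. shared) matches the host's view, i.e. whether the page is populated in the private fd; a mismatch becomes a KVM_EXIT_MEMORY_FAULT exit so userspace can convert the page and the guest can retry. A minimal model of just that decision follows; the names are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

enum fault_result {
        FAULT_MAP_PRIVATE,      /* match: map the page from the private fd          */
        FAULT_MAP_SHARED,       /* match: fall through to the shared (gup) path      */
        FAULT_EXIT_USERSPACE,   /* mismatch: KVM_EXIT_MEMORY_FAULT, convert and retry */
};

static enum fault_result handle_fault(bool guest_wants_private,
                                      bool populated_in_private_fd)
{
        if (guest_wants_private != populated_in_private_fd)
                return FAULT_EXIT_USERSPACE;
        return guest_wants_private ? FAULT_MAP_PRIVATE : FAULT_MAP_SHARED;
}

int main(void)
{
        printf("%d %d %d\n",
               handle_fault(true, true),    /* private access, page in fd     -> 0 */
               handle_fault(false, false),  /* shared access, page not in fd  -> 1 */
               handle_fault(true, false));  /* mismatch -> exit to userspace  -> 2 */
        return 0;
}

In the patch, this mismatch path is what the new RET_PF_USER return code and the KVM_EXIT_MEMORY_FAULT exit reason implement.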