diff mbox series

[03/22] KVM: x86/mmu: Trigger unprotect logic only on write-protection page faults

Message ID 20240809190319.1710470-4-seanjc@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86: Fix multiple #PF RO infinite loop bugs | expand

Commit Message

Sean Christopherson Aug. 9, 2024, 7:03 p.m. UTC
Trigger KVM's various "unprotect gfn" paths if and only if the page fault
was a write to a write-protected gfn.  To do so, add a new page fault
return code, RET_PF_WRITE_PROTECTED, to explicitly and precisely track
such page faults.

If a page fault requires emulation for any MMIO (or any reason besides
write-protection), trying to unprotect the gfn is pointless and risks
putting the vCPU into an infinite loop.  E.g. KVM will put the vCPU into
an infinite loop if the vCPU manages to trigger MMIO on a page table walk.

Fixes: 147277540bbc ("kvm: svm: Add support for additional SVM NPF error codes")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c          | 78 +++++++++++++++++++--------------
 arch/x86/kvm/mmu/mmu_internal.h |  3 ++
 arch/x86/kvm/mmu/mmutrace.h     |  1 +
 arch/x86/kvm/mmu/paging_tmpl.h  |  2 +-
 arch/x86/kvm/mmu/tdp_mmu.c      |  6 +--
 5 files changed, 53 insertions(+), 37 deletions(-)

Comments

Yuan Yao Aug. 14, 2024, 11:42 a.m. UTC | #1
On Fri, Aug 09, 2024 at 12:03:00PM -0700, Sean Christopherson wrote:
> Trigger KVM's various "unprotect gfn" paths if and only if the page fault
> was a write to a write-protected gfn.  To do so, add a new page fault
> return code, RET_PF_WRITE_PROTECTED, to explicitly and precisely track
> such page faults.
>
> If a page fault requires emulation for any MMIO (or any reason besides
> write-protection), trying to unprotect the gfn is pointless and risks
> putting the vCPU into an infinite loop.  E.g. KVM will put the vCPU into
> an infinite loop if the vCPU manages to trigger MMIO on a page table walk.
>
> Fixes: 147277540bbc ("kvm: svm: Add support for additional SVM NPF error codes")
> Cc: stable@vger.kernel.org
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/mmu/mmu.c          | 78 +++++++++++++++++++--------------
>  arch/x86/kvm/mmu/mmu_internal.h |  3 ++
>  arch/x86/kvm/mmu/mmutrace.h     |  1 +
>  arch/x86/kvm/mmu/paging_tmpl.h  |  2 +-
>  arch/x86/kvm/mmu/tdp_mmu.c      |  6 +--
>  5 files changed, 53 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 901be9e420a4..e3aa04c498ea 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2914,10 +2914,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
>  		trace_kvm_mmu_set_spte(level, gfn, sptep);
>  	}
>
> -	if (wrprot) {
> -		if (write_fault)
> -			ret = RET_PF_EMULATE;
> -	}
> +	if (wrprot && write_fault)
> +		ret = RET_PF_WRITE_PROTECTED;
>
>  	if (flush)
>  		kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
> @@ -4549,7 +4547,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  		return RET_PF_RETRY;
>
>  	if (page_fault_handle_page_track(vcpu, fault))
> -		return RET_PF_EMULATE;
> +		return RET_PF_WRITE_PROTECTED;
>
>  	r = fast_page_fault(vcpu, fault);
>  	if (r != RET_PF_INVALID)
> @@ -4642,7 +4640,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
>  	int r;
>
>  	if (page_fault_handle_page_track(vcpu, fault))
> -		return RET_PF_EMULATE;
> +		return RET_PF_WRITE_PROTECTED;
>
>  	r = fast_page_fault(vcpu, fault);
>  	if (r != RET_PF_INVALID)
> @@ -4726,6 +4724,9 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
>  	case RET_PF_EMULATE:
>  		return -ENOENT;
>
> +	case RET_PF_WRITE_PROTECTED:
> +		return -EPERM;
> +
>  	case RET_PF_RETRY:
>  	case RET_PF_CONTINUE:
>  	case RET_PF_INVALID:
> @@ -5960,6 +5961,41 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
>  	write_unlock(&vcpu->kvm->mmu_lock);
>  }
>
> +static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> +				       u64 error_code, int *emulation_type)
> +{
> +	bool direct = vcpu->arch.mmu->root_role.direct;
> +
> +	/*
> +	 * Before emulating the instruction, check if the error code
> +	 * was due to a RO violation while translating the guest page.
> +	 * This can occur when using nested virtualization with nested
> +	 * paging in both guests. If true, we simply unprotect the page
> +	 * and resume the guest.
> +	 */
> +	if (direct &&
> +	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
> +		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
> +		return RET_PF_FIXED;
> +	}
> +
> +	/*
> +	 * The gfn is write-protected, but if emulation fails we can still
> +	 * optimistically try to just unprotect the page and let the processor
> +	 * re-execute the instruction that caused the page fault.  Do not allow
> +	 * retrying MMIO emulation, as it's not only pointless but could also
> +	 * cause us to enter an infinite loop because the processor will keep
> +	 * faulting on the non-existent MMIO address.  Retrying an instruction
> +	 * from a nested guest is also pointless and dangerous as we are only
> +	 * explicitly shadowing L1's page tables, i.e. unprotecting something
> +	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
> +	 */
> +	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))

Looks the mmio_info_in_cache() checking can be removed,
emulation should not come here with RET_PF_WRITE_PROTECTED
introduced, may WARN_ON_ONCE() it.


> +		*emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
> +
> +	return RET_PF_EMULATE;
> +}
> +
>  int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
>  		       void *insn, int insn_len)
>  {
> @@ -6005,6 +6041,10 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
>  	if (r < 0)
>  		return r;
>
> +	if (r == RET_PF_WRITE_PROTECTED)
> +		r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
> +						&emulation_type);
> +
>  	if (r == RET_PF_FIXED)
>  		vcpu->stat.pf_fixed++;
>  	else if (r == RET_PF_EMULATE)
> @@ -6015,32 +6055,6 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
>  	if (r != RET_PF_EMULATE)
>  		return 1;
>
> -	/*
> -	 * Before emulating the instruction, check if the error code
> -	 * was due to a RO violation while translating the guest page.
> -	 * This can occur when using nested virtualization with nested
> -	 * paging in both guests. If true, we simply unprotect the page
> -	 * and resume the guest.
> -	 */
> -	if (vcpu->arch.mmu->root_role.direct &&
> -	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
> -		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
> -		return 1;
> -	}
> -
> -	/*
> -	 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
> -	 * optimistically try to just unprotect the page and let the processor
> -	 * re-execute the instruction that caused the page fault.  Do not allow
> -	 * retrying MMIO emulation, as it's not only pointless but could also
> -	 * cause us to enter an infinite loop because the processor will keep
> -	 * faulting on the non-existent MMIO address.  Retrying an instruction
> -	 * from a nested guest is also pointless and dangerous as we are only
> -	 * explicitly shadowing L1's page tables, i.e. unprotecting something
> -	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
> -	 */
> -	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
> -		emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
>  emulate:
>  	return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
>  				       insn_len);
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 1721d97743e9..50d2624111f8 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -258,6 +258,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>   * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
>   * RET_PF_RETRY: let CPU fault again on the address.
>   * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
> + * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the
> + *                         gfn and retry, or emulate the instruction directly.
>   * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
>   * RET_PF_FIXED: The faulting entry has been fixed.
>   * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
> @@ -274,6 +276,7 @@ enum {
>  	RET_PF_CONTINUE = 0,
>  	RET_PF_RETRY,
>  	RET_PF_EMULATE,
> +	RET_PF_WRITE_PROTECTED,
>  	RET_PF_INVALID,
>  	RET_PF_FIXED,
>  	RET_PF_SPURIOUS,
> diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
> index 195d98bc8de8..f35a830ce469 100644
> --- a/arch/x86/kvm/mmu/mmutrace.h
> +++ b/arch/x86/kvm/mmu/mmutrace.h
> @@ -57,6 +57,7 @@
>  TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
>  TRACE_DEFINE_ENUM(RET_PF_RETRY);
>  TRACE_DEFINE_ENUM(RET_PF_EMULATE);
> +TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED);
>  TRACE_DEFINE_ENUM(RET_PF_INVALID);
>  TRACE_DEFINE_ENUM(RET_PF_FIXED);
>  TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 69941cebb3a8..a722a3c96af9 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -805,7 +805,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>
>  	if (page_fault_handle_page_track(vcpu, fault)) {
>  		shadow_page_table_clear_flood(vcpu, fault->addr);
> -		return RET_PF_EMULATE;
> +		return RET_PF_WRITE_PROTECTED;
>  	}
>
>  	r = mmu_topup_memory_caches(vcpu, true);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index c7dc49ee7388..8bf44ac9372f 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1046,10 +1046,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>  	 * protected, emulation is needed. If the emulation was skipped,
>  	 * the vCPU would have the same fault again.
>  	 */
> -	if (wrprot) {
> -		if (fault->write)
> -			ret = RET_PF_EMULATE;
> -	}
> +	if (wrprot && fault->write)
> +		ret = RET_PF_WRITE_PROTECTED;
>
>  	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
>  	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
> --
> 2.46.0.76.ge559c4bf1a-goog
>
>
Sean Christopherson Aug. 14, 2024, 2:21 p.m. UTC | #2
On Wed, Aug 14, 2024, Yuan Yao wrote:
> > @@ -5960,6 +5961,41 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
> >  	write_unlock(&vcpu->kvm->mmu_lock);
> >  }
> >
> > +static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> > +				       u64 error_code, int *emulation_type)
> > +{
> > +	bool direct = vcpu->arch.mmu->root_role.direct;
> > +
> > +	/*
> > +	 * Before emulating the instruction, check if the error code
> > +	 * was due to a RO violation while translating the guest page.
> > +	 * This can occur when using nested virtualization with nested
> > +	 * paging in both guests. If true, we simply unprotect the page
> > +	 * and resume the guest.
> > +	 */
> > +	if (direct &&
> > +	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
> > +		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
> > +		return RET_PF_FIXED;
> > +	}
> > +
> > +	/*
> > +	 * The gfn is write-protected, but if emulation fails we can still
> > +	 * optimistically try to just unprotect the page and let the processor
> > +	 * re-execute the instruction that caused the page fault.  Do not allow
> > +	 * retrying MMIO emulation, as it's not only pointless but could also
> > +	 * cause us to enter an infinite loop because the processor will keep
> > +	 * faulting on the non-existent MMIO address.  Retrying an instruction
> > +	 * from a nested guest is also pointless and dangerous as we are only
> > +	 * explicitly shadowing L1's page tables, i.e. unprotecting something
> > +	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
> > +	 */
> > +	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
> 
> Looks the mmio_info_in_cache() checking can be removed,
> emulation should not come here with RET_PF_WRITE_PROTECTED
> introduced, may WARN_ON_ONCE() it.

Yeah, that was my instinct as well.  I kept it mostly because I liked having the
comment, but also because I was thinking the cache could theoretically get a hit.
But that's not true.  KVM should return RET_PF_WRITE_PROTECTED if and only if
there is a memslot, and creating a memslot is supposed to invalidate the MMIO
cache by virtue of changing the memslot generation.

Unless someone feels strongly that the mmio_info_in_cache() call should be
deleted entirely, I'll tack on this patch.  The cache lookup is cheap, and IMO
it's helpful to explicitly document that it should be impossible to reach this
point with what appears to be MMIO.

---
 arch/x86/kvm/mmu/mmu.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 50695eb2ee22..7f3f57237f23 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5997,6 +5997,18 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	vcpu->arch.last_retry_eip = 0;
 	vcpu->arch.last_retry_addr = 0;
 
+	/*
+	 * It should be impossible to reach this point with an MMIO cache hit,
+	 * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid,
+	 * writable memslot, and creating a memslot should invalidate the MMIO
+	 * cache by way of changing the memslot generation.  WARN and disallow
 +	 * retry if MMIO is detected, as retrying MMIO emulation is pointless and
+	 * could put the vCPU into an infinite loop because the processor will
+	 * keep faulting on the non-existent MMIO address.
+	 */
+	if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
+		return RET_PF_EMULATE;
+
 	/*
 	 * Before emulating the instruction, check to see if the access may be
 	 * due to L1 accessing nested NPT/EPT entries used for L2, i.e. if the
@@ -6029,17 +6041,15 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		return RET_PF_FIXED;
 
 	/*
-	 * The gfn is write-protected, but if emulation fails we can still
-	 * optimistically try to just unprotect the page and let the processor
+	 * The gfn is write-protected, but if KVM detects its emulating an
+	 * instruction that is unlikely to be used to modify page tables, or if
+	 * emulation fails, KVM can try to unprotect the gfn and let the CPU
 	 * re-execute the instruction that caused the page fault.  Do not allow
-	 * retrying MMIO emulation, as it's not only pointless but could also
-	 * cause us to enter an infinite loop because the processor will keep
-	 * faulting on the non-existent MMIO address.  Retrying an instruction
-	 * from a nested guest is also pointless and dangerous as we are only
-	 * explicitly shadowing L1's page tables, i.e. unprotecting something
-	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
+	 * retrying an instruction from a nested guest as KVM is only explicitly
+	 * shadowing L1's page tables, i.e. unprotecting something for L1 isn't
+	 * going to magically fix whatever issue cause L2 to fail.
 	 */
-	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
+	if (!is_guest_mode(vcpu))
 		*emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
 
 	return RET_PF_EMULATE;

base-commit: 7d33880356496eb0640c6c825cc60898063c4902
--
Paolo Bonzini Aug. 14, 2024, 4:40 p.m. UTC | #3
On 8/9/24 21:03, Sean Christopherson wrote:
> Trigger KVM's various "unprotect gfn" paths if and only if the page fault
> was a write to a write-protected gfn.  To do so, add a new page fault
> return code, RET_PF_WRITE_PROTECTED, to explicitly and precisely track
> such page faults.
> 
> If a page fault requires emulation for any MMIO (or any reason besides
> write-protection), trying to unprotect the gfn is pointless and risks
> putting the vCPU into an infinite loop.  E.g. KVM will put the vCPU into
> an infinite loop if the vCPU manages to trigger MMIO on a page table walk.
> 
> Fixes: 147277540bbc ("kvm: svm: Add support for additional SVM NPF error codes")
> Cc: stable@vger.kernel.org

Do we really want Cc: stable@ for all these patches?  Most of them are 
of the "if it hurts, don't do it" kind; as long as there are no infinite 
loops in a non-killable region, I prefer not to complicate our lives 
with cherry picks of unknown quality.

That said, this patch could be interesting for 6.11 because of the 
effect on prefaulting (see below).

> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   arch/x86/kvm/mmu/mmu.c          | 78 +++++++++++++++++++--------------
>   arch/x86/kvm/mmu/mmu_internal.h |  3 ++
>   arch/x86/kvm/mmu/mmutrace.h     |  1 +
>   arch/x86/kvm/mmu/paging_tmpl.h  |  2 +-
>   arch/x86/kvm/mmu/tdp_mmu.c      |  6 +--
>   5 files changed, 53 insertions(+), 37 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 901be9e420a4..e3aa04c498ea 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2914,10 +2914,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
>   		trace_kvm_mmu_set_spte(level, gfn, sptep);
>   	}
>   
> -	if (wrprot) {
> -		if (write_fault)
> -			ret = RET_PF_EMULATE;
> -	}
> +	if (wrprot && write_fault)
> +		ret = RET_PF_WRITE_PROTECTED;
>   
>   	if (flush)
>   		kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
> @@ -4549,7 +4547,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>   		return RET_PF_RETRY;
>   
>   	if (page_fault_handle_page_track(vcpu, fault))
> -		return RET_PF_EMULATE;
> +		return RET_PF_WRITE_PROTECTED;
>   
>   	r = fast_page_fault(vcpu, fault);
>   	if (r != RET_PF_INVALID)
> @@ -4642,7 +4640,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
>   	int r;
>   
>   	if (page_fault_handle_page_track(vcpu, fault))
> -		return RET_PF_EMULATE;
> +		return RET_PF_WRITE_PROTECTED;
>   
>   	r = fast_page_fault(vcpu, fault);
>   	if (r != RET_PF_INVALID)
> @@ -4726,6 +4724,9 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
>   	case RET_PF_EMULATE:
>   		return -ENOENT;
>   
> +	case RET_PF_WRITE_PROTECTED:
> +		return -EPERM;

Shouldn't this be a "return 0"?  Even if kvm_mmu_do_page_fault() cannot 
fully unprotect the page, it was nevertheless prefaulted as much as 
possible.

Paolo
Sean Christopherson Aug. 14, 2024, 7:34 p.m. UTC | #4
On Wed, Aug 14, 2024, Paolo Bonzini wrote:
> On 8/9/24 21:03, Sean Christopherson wrote:
> > Trigger KVM's various "unprotect gfn" paths if and only if the page fault
> > was a write to a write-protected gfn.  To do so, add a new page fault
> > return code, RET_PF_WRITE_PROTECTED, to explicitly and precisely track
> > such page faults.
> > 
> > If a page fault requires emulation for any MMIO (or any reason besides
> > write-protection), trying to unprotect the gfn is pointless and risks
> > putting the vCPU into an infinite loop.  E.g. KVM will put the vCPU into
> > an infinite loop if the vCPU manages to trigger MMIO on a page table walk.
> > 
> > Fixes: 147277540bbc ("kvm: svm: Add support for additional SVM NPF error codes")
> > Cc: stable@vger.kernel.org
> 
> Do we really want Cc: stable@ for all these patches?  Most of them are of
> the "if it hurts, don't do it" kind;

True.  I was thinking that the VMX PFERR_GUEST_{FINAL,PAGE}_MASK bug in particular
was stable-worthy, but until TDX comes along, it's only relevant if guests put
PDPTRs in an MMIO region.  And in that case, the guest is likely hosed anyway,
the only difference is if it gets stuck or killed.

I'll drop the stable@ tags unless someone objects.

> as long as there are no infinite loops in a non-killable region, I prefer not
> to complicate our lives with cherry picks of unknown quality.

Yeah, the RET_PF_WRITE_PROTECTED one in particular has high potential for a bad
cherry-pick.

> That said, this patch could be interesting for 6.11 because of the effect on
> prefaulting (see below).
> 
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
> >   arch/x86/kvm/mmu/mmu.c          | 78 +++++++++++++++++++--------------
> >   arch/x86/kvm/mmu/mmu_internal.h |  3 ++
> >   arch/x86/kvm/mmu/mmutrace.h     |  1 +
> >   arch/x86/kvm/mmu/paging_tmpl.h  |  2 +-
> >   arch/x86/kvm/mmu/tdp_mmu.c      |  6 +--
> >   5 files changed, 53 insertions(+), 37 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 901be9e420a4..e3aa04c498ea 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -2914,10 +2914,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> >   		trace_kvm_mmu_set_spte(level, gfn, sptep);
> >   	}
> > -	if (wrprot) {
> > -		if (write_fault)
> > -			ret = RET_PF_EMULATE;
> > -	}
> > +	if (wrprot && write_fault)
> > +		ret = RET_PF_WRITE_PROTECTED;
> >   	if (flush)
> >   		kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
> > @@ -4549,7 +4547,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> >   		return RET_PF_RETRY;
> >   	if (page_fault_handle_page_track(vcpu, fault))
> > -		return RET_PF_EMULATE;
> > +		return RET_PF_WRITE_PROTECTED;
> >   	r = fast_page_fault(vcpu, fault);
> >   	if (r != RET_PF_INVALID)
> > @@ -4642,7 +4640,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
> >   	int r;
> >   	if (page_fault_handle_page_track(vcpu, fault))
> > -		return RET_PF_EMULATE;
> > +		return RET_PF_WRITE_PROTECTED;
> >   	r = fast_page_fault(vcpu, fault);
> >   	if (r != RET_PF_INVALID)
> > @@ -4726,6 +4724,9 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
> >   	case RET_PF_EMULATE:
> >   		return -ENOENT;
> > +	case RET_PF_WRITE_PROTECTED:
> > +		return -EPERM;
> 
> Shouldn't this be a "return 0"?  Even if kvm_mmu_do_page_fault() cannot
> fully unprotect the page, it was nevertheless prefaulted as much as
> possible.

Hmm, I hadn't thought about it from that perspective.  Ah, right, and the early
check in page_fault_handle_page_track() only handles PRESENT faults, so KVM will
at least install a read-only mapping.

So yeah, agreed this should return 0.
Yuan Yao Aug. 15, 2024, 8:30 a.m. UTC | #5
On Wed, Aug 14, 2024 at 07:21:06AM -0700, Sean Christopherson wrote:
> On Wed, Aug 14, 2024, Yuan Yao wrote:
> > > @@ -5960,6 +5961,41 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
> > >  	write_unlock(&vcpu->kvm->mmu_lock);
> > >  }
> > >
> > > +static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> > > +				       u64 error_code, int *emulation_type)
> > > +{
> > > +	bool direct = vcpu->arch.mmu->root_role.direct;
> > > +
> > > +	/*
> > > +	 * Before emulating the instruction, check if the error code
> > > +	 * was due to a RO violation while translating the guest page.
> > > +	 * This can occur when using nested virtualization with nested
> > > +	 * paging in both guests. If true, we simply unprotect the page
> > > +	 * and resume the guest.
> > > +	 */
> > > +	if (direct &&
> > > +	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
> > > +		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
> > > +		return RET_PF_FIXED;
> > > +	}
> > > +
> > > +	/*
> > > +	 * The gfn is write-protected, but if emulation fails we can still
> > > +	 * optimistically try to just unprotect the page and let the processor
> > > +	 * re-execute the instruction that caused the page fault.  Do not allow
> > > +	 * retrying MMIO emulation, as it's not only pointless but could also
> > > +	 * cause us to enter an infinite loop because the processor will keep
> > > +	 * faulting on the non-existent MMIO address.  Retrying an instruction
> > > +	 * from a nested guest is also pointless and dangerous as we are only
> > > +	 * explicitly shadowing L1's page tables, i.e. unprotecting something
> > > +	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
> > > +	 */
> > > +	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
> >
> > Looks the mmio_info_in_cache() checking can be removed,
> > emulation should not come here with RET_PF_WRITE_PROTECTED
> > introduced, may WARN_ON_ONCE() it.
>
> Yeah, that was my instinct as well.  I kept it mostly because I liked having the
> comment, but also because I was thinking the cache could theoretically get a hit.
> But that's not true.  KVM should return RET_PF_WRITE_PROTECTED if and only if
> there is a memslot, and creating a memslot is supposed to invalidate the MMIO
> cache by virtue of changing the memslot generation.
>
> Unless someone feels strongly that the mmio_info_in_cache() call should be
> deleted entirely, I'll tack on this patch.  The cache lookup is cheap, and IMO
> it's helpful to explicitly document that it should be impossible to reach this
> point with what appears to be MMIO.
>
> ---
>  arch/x86/kvm/mmu/mmu.c | 28 +++++++++++++++++++---------
>  1 file changed, 19 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 50695eb2ee22..7f3f57237f23 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5997,6 +5997,18 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  	vcpu->arch.last_retry_eip = 0;
>  	vcpu->arch.last_retry_addr = 0;
>
> +	/*
> +	 * It should be impossible to reach this point with an MMIO cache hit,
> +	 * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid,
> +	 * writable memslot, and creating a memslot should invalidate the MMIO
> +	 * cache by way of changing the memslot generation.  WARN and disallow
> +	 * retry if MMIO is detected, as retrying MMIO emulation is pointless and
> +	 * could put the vCPU into an infinite loop because the processor will
> +	 * keep faulting on the non-existent MMIO address.
> +	 */
> +	if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
> +		return RET_PF_EMULATE;
> +

LGTM.

Reviewed-by: Yuan Yao <yuan.yao@intel.com>

>  	/*
>  	 * Before emulating the instruction, check to see if the access may be
>  	 * due to L1 accessing nested NPT/EPT entries used for L2, i.e. if the
> @@ -6029,17 +6041,15 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  		return RET_PF_FIXED;
>
>  	/*
> -	 * The gfn is write-protected, but if emulation fails we can still
> -	 * optimistically try to just unprotect the page and let the processor
> +	 * The gfn is write-protected, but if KVM detects its emulating an
> +	 * instruction that is unlikely to be used to modify page tables, or if
> +	 * emulation fails, KVM can try to unprotect the gfn and let the CPU
>  	 * re-execute the instruction that caused the page fault.  Do not allow
> -	 * retrying MMIO emulation, as it's not only pointless but could also
> -	 * cause us to enter an infinite loop because the processor will keep
> -	 * faulting on the non-existent MMIO address.  Retrying an instruction
> -	 * from a nested guest is also pointless and dangerous as we are only
> -	 * explicitly shadowing L1's page tables, i.e. unprotecting something
> -	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
> +	 * retrying an instruction from a nested guest as KVM is only explicitly
> +	 * shadowing L1's page tables, i.e. unprotecting something for L1 isn't
> +	 * going to magically fix whatever issue cause L2 to fail.
>  	 */
> -	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
> +	if (!is_guest_mode(vcpu))
>  		*emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
>
>  	return RET_PF_EMULATE;
>
> base-commit: 7d33880356496eb0640c6c825cc60898063c4902
> --
diff mbox series

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 901be9e420a4..e3aa04c498ea 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2914,10 +2914,8 @@  static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
 		trace_kvm_mmu_set_spte(level, gfn, sptep);
 	}
 
-	if (wrprot) {
-		if (write_fault)
-			ret = RET_PF_EMULATE;
-	}
+	if (wrprot && write_fault)
+		ret = RET_PF_WRITE_PROTECTED;
 
 	if (flush)
 		kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
@@ -4549,7 +4547,7 @@  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		return RET_PF_RETRY;
 
 	if (page_fault_handle_page_track(vcpu, fault))
-		return RET_PF_EMULATE;
+		return RET_PF_WRITE_PROTECTED;
 
 	r = fast_page_fault(vcpu, fault);
 	if (r != RET_PF_INVALID)
@@ -4642,7 +4640,7 @@  static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
 	int r;
 
 	if (page_fault_handle_page_track(vcpu, fault))
-		return RET_PF_EMULATE;
+		return RET_PF_WRITE_PROTECTED;
 
 	r = fast_page_fault(vcpu, fault);
 	if (r != RET_PF_INVALID)
@@ -4726,6 +4724,9 @@  static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
 	case RET_PF_EMULATE:
 		return -ENOENT;
 
+	case RET_PF_WRITE_PROTECTED:
+		return -EPERM;
+
 	case RET_PF_RETRY:
 	case RET_PF_CONTINUE:
 	case RET_PF_INVALID:
@@ -5960,6 +5961,41 @@  void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
 	write_unlock(&vcpu->kvm->mmu_lock);
 }
 
+static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+				       u64 error_code, int *emulation_type)
+{
+	bool direct = vcpu->arch.mmu->root_role.direct;
+
+	/*
+	 * Before emulating the instruction, check if the error code
+	 * was due to a RO violation while translating the guest page.
+	 * This can occur when using nested virtualization with nested
+	 * paging in both guests. If true, we simply unprotect the page
+	 * and resume the guest.
+	 */
+	if (direct &&
+	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
+		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
+		return RET_PF_FIXED;
+	}
+
+	/*
+	 * The gfn is write-protected, but if emulation fails we can still
+	 * optimistically try to just unprotect the page and let the processor
+	 * re-execute the instruction that caused the page fault.  Do not allow
+	 * retrying MMIO emulation, as it's not only pointless but could also
+	 * cause us to enter an infinite loop because the processor will keep
+	 * faulting on the non-existent MMIO address.  Retrying an instruction
+	 * from a nested guest is also pointless and dangerous as we are only
+	 * explicitly shadowing L1's page tables, i.e. unprotecting something
+	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
+	 */
+	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
+		*emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
+
+	return RET_PF_EMULATE;
+}
+
 int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
 		       void *insn, int insn_len)
 {
@@ -6005,6 +6041,10 @@  int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
 	if (r < 0)
 		return r;
 
+	if (r == RET_PF_WRITE_PROTECTED)
+		r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
+						&emulation_type);
+
 	if (r == RET_PF_FIXED)
 		vcpu->stat.pf_fixed++;
 	else if (r == RET_PF_EMULATE)
@@ -6015,32 +6055,6 @@  int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
 	if (r != RET_PF_EMULATE)
 		return 1;
 
-	/*
-	 * Before emulating the instruction, check if the error code
-	 * was due to a RO violation while translating the guest page.
-	 * This can occur when using nested virtualization with nested
-	 * paging in both guests. If true, we simply unprotect the page
-	 * and resume the guest.
-	 */
-	if (vcpu->arch.mmu->root_role.direct &&
-	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
-		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
-		return 1;
-	}
-
-	/*
-	 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
-	 * optimistically try to just unprotect the page and let the processor
-	 * re-execute the instruction that caused the page fault.  Do not allow
-	 * retrying MMIO emulation, as it's not only pointless but could also
-	 * cause us to enter an infinite loop because the processor will keep
-	 * faulting on the non-existent MMIO address.  Retrying an instruction
-	 * from a nested guest is also pointless and dangerous as we are only
-	 * explicitly shadowing L1's page tables, i.e. unprotecting something
-	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
-	 */
-	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
-		emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
 emulate:
 	return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
 				       insn_len);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 1721d97743e9..50d2624111f8 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -258,6 +258,8 @@  int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
  * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
  * RET_PF_RETRY: let CPU fault again on the address.
  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the
+ *                         gfn and retry, or emulate the instruction directly.
  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
  * RET_PF_FIXED: The faulting entry has been fixed.
  * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
@@ -274,6 +276,7 @@  enum {
 	RET_PF_CONTINUE = 0,
 	RET_PF_RETRY,
 	RET_PF_EMULATE,
+	RET_PF_WRITE_PROTECTED,
 	RET_PF_INVALID,
 	RET_PF_FIXED,
 	RET_PF_SPURIOUS,
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 195d98bc8de8..f35a830ce469 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -57,6 +57,7 @@ 
 TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
 TRACE_DEFINE_ENUM(RET_PF_RETRY);
 TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED);
 TRACE_DEFINE_ENUM(RET_PF_INVALID);
 TRACE_DEFINE_ENUM(RET_PF_FIXED);
 TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 69941cebb3a8..a722a3c96af9 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -805,7 +805,7 @@  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 
 	if (page_fault_handle_page_track(vcpu, fault)) {
 		shadow_page_table_clear_flood(vcpu, fault->addr);
-		return RET_PF_EMULATE;
+		return RET_PF_WRITE_PROTECTED;
 	}
 
 	r = mmu_topup_memory_caches(vcpu, true);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c7dc49ee7388..8bf44ac9372f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1046,10 +1046,8 @@  static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 	 * protected, emulation is needed. If the emulation was skipped,
 	 * the vCPU would have the same fault again.
 	 */
-	if (wrprot) {
-		if (fault->write)
-			ret = RET_PF_EMULATE;
-	}
+	if (wrprot && fault->write)
+		ret = RET_PF_WRITE_PROTECTED;
 
 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
 	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {