
[v2,10/11] KVM: SVM: implement support for vNMI

Message ID 20221129193717.513824-11-mlevitsk@redhat.com (mailing list archive)
State New, archived
Series SVM: vNMI (with my fixes)

Commit Message

Maxim Levitsky Nov. 29, 2022, 7:37 p.m. UTC
This patch implements support for injecting pending
NMIs via the .kvm_x86_set_hw_nmi_pending using new AMD's vNMI
feature.

Note that the vNMI can't cause a VM exit, which is needed
when a nested guest intercepts NMIs.

Therefore to avoid breaking nesting, the vNMI is inhibited while
a nested guest is running and instead, the legacy NMI window
detection and delivery method is used.

While it is possible to passthrough the vNMI if a nested guest
doesn't intercept NMIs, such usage is very uncommon, and it's
not worth to optimize for.

Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/svm/nested.c |  42 +++++++++++++++
 arch/x86/kvm/svm/svm.c    | 111 ++++++++++++++++++++++++++++++--------
 arch/x86/kvm/svm/svm.h    |  10 ++++
 3 files changed, 140 insertions(+), 23 deletions(-)

Comments

Maxim Levitsky Dec. 4, 2022, 5:18 p.m. UTC | #1
On Tue, 2022-11-29 at 21:37 +0200, Maxim Levitsky wrote:
> This patch implements support for injecting pending
> NMIs via the .kvm_x86_set_hw_nmi_pending using new AMD's vNMI
> feature.
> 
> Note that the vNMI can't cause a VM exit, which is needed
> when a nested guest intercepts NMIs.
> 
> Therefore to avoid breaking nesting, the vNMI is inhibited while
> a nested guest is running and instead, the legacy NMI window
> detection and delivery method is used.
> 
> While it is possible to passthrough the vNMI if a nested guest
> doesn't intercept NMIs, such usage is very uncommon, and it's
> not worth to optimize for.
> 
> Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  arch/x86/kvm/svm/nested.c |  42 +++++++++++++++
>  arch/x86/kvm/svm/svm.c    | 111 ++++++++++++++++++++++++++++++--------
>  arch/x86/kvm/svm/svm.h    |  10 ++++
>  3 files changed, 140 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index e891318595113e..5bea672bf8b12d 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -623,6 +623,42 @@ static bool is_evtinj_nmi(u32 evtinj)
>  	return type == SVM_EVTINJ_TYPE_NMI;
>  }


A tiny bit of self-review: I noticed some mistakes.

>  
> +static void nested_svm_save_vnmi(struct vcpu_svm *svm)
> +{
> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> +	/*
> +	 * Copy the vNMI state back to software NMI tracking state
> +	 * for the duration of the nested run
> +	 */
> +
> +	svm->nmi_masked = vmcb01->control.int_ctl & V_NMI_MASK;
> +	svm->vcpu.arch.nmi_pending += vmcb01->control.int_ctl & V_NMI_PENDING;
Sorry for the mistake here - this should be converted to a boolean first, something like this:


if (vmcb01->control.int_ctl & V_NMI_PENDING)
	svm->vcpu.arch.nmi_pending++;



> +}
> +
> +static void nested_svm_restore_vnmi(struct vcpu_svm *svm)
> +{
> +	struct kvm_vcpu *vcpu = &svm->vcpu;
> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> +	/*
> +	 * Restore the vNMI state from the software NMI tracking state
> +	 * after a nested run
> +	 */
> +
> +	if (svm->nmi_masked)
> +		vmcb01->control.int_ctl |= V_NMI_MASK;
> +	else
> +		vmcb01->control.int_ctl &= ~V_NMI_MASK;
> +
> +	if (vcpu->arch.nmi_pending) {
> +		vcpu->arch.nmi_pending--;
> +		vmcb01->control.int_ctl |= V_NMI_PENDING;
> +	} else
> +		vmcb01->control.int_ctl &= ~V_NMI_PENDING;
> +}
> +
> +
>  static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
>  					  unsigned long vmcb12_rip,
>  					  unsigned long vmcb12_csbase)
> @@ -646,6 +682,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
>  	else
>  		int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
>  
> +	if (vnmi)
> +		nested_svm_save_vnmi(svm);
> +
>  	/* Copied from vmcb01.  msrpm_base can be overwritten later.  */
>  	vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
>  	vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
> @@ -1049,6 +1088,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>  		svm_update_lbrv(vcpu);
>  	}
>  
> +	if (vnmi)
> +		nested_svm_restore_vnmi(svm);
> +
>  	/*
>  	 * On vmexit the  GIF is set to false and
>  	 * no event can be injected in L1.
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index cfed6ab29c839a..bf10adcf3170a8 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -230,6 +230,8 @@ module_param(dump_invalid_vmcb, bool, 0644);
>  bool intercept_smi = true;
>  module_param(intercept_smi, bool, 0444);
>  
> +bool vnmi = true;
> +module_param(vnmi, bool, 0444);
>  
>  static bool svm_gp_erratum_intercept = true;
>  
> @@ -1299,6 +1301,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
>  	if (kvm_vcpu_apicv_active(vcpu))
>  		avic_init_vmcb(svm, vmcb);
>  
> +	if (vnmi)
> +		svm->vmcb->control.int_ctl |= V_NMI_ENABLE;
> +
>  	if (vgif) {
>  		svm_clr_intercept(svm, INTERCEPT_STGI);
>  		svm_clr_intercept(svm, INTERCEPT_CLGI);
> @@ -3487,6 +3492,39 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
>  	++vcpu->stat.nmi_injections;
>  }
>  
> +
> +static bool svm_get_hw_nmi_pending(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (!is_vnmi_enabled(svm))
> +		return false;
> +
> +	return !!(svm->vmcb->control.int_ctl & V_NMI_MASK);
> +}
> +
> +static bool svm_set_hw_nmi_pending(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (!is_vnmi_enabled(svm))
> +		return false;
> +
> +	if (svm->vmcb->control.int_ctl & V_NMI_PENDING)
> +		return false;
> +
> +	svm->vmcb->control.int_ctl |= V_NMI_PENDING;
> +	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);

I think vmcb_mark_dirty() is not needed now, as KVM always leaves the VMCB_INTR dirty bit
set due to the vTPR update.
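
If that holds, a minimal sketch of the helper with the dirty-bit update dropped
(an illustration only, assuming the vTPR handling really does keep VMCB_INTR dirty):

static bool svm_set_hw_nmi_pending(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!is_vnmi_enabled(svm))
		return false;

	if (svm->vmcb->control.int_ctl & V_NMI_PENDING)
		return false;

	/* Request the NMI via vNMI; hardware delivers it once NMIs are unmasked. */
	svm->vmcb->control.int_ctl |= V_NMI_PENDING;

	/* NMI isn't technically injected yet, but count it as an injection. */
	++vcpu->stat.nmi_injections;

	return true;
}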


Best regards,
	Maxim Levitsky

> +
> +	/*
> +	 * NMI isn't yet technically injected but
> +	 * this rough estimation should be good enough
> +	 */
> +	++vcpu->stat.nmi_injections;
> +
> +	return true;
> +}
> +
>  static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -3582,11 +3620,38 @@ static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
>  }
>  
> +static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (is_vnmi_enabled(svm))
> +		return svm->vmcb->control.int_ctl & V_NMI_MASK;
> +	else
> +		return svm->nmi_masked;
> +}
> +
> +static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (is_vnmi_enabled(svm)) {
> +		if (masked)
> +			svm->vmcb->control.int_ctl |= V_NMI_MASK;
> +		else
> +			svm->vmcb->control.int_ctl &= ~V_NMI_MASK;
> +	} else {
> +		svm->nmi_masked = masked;
> +		if (masked)
> +			svm_enable_iret_interception(svm);
> +		else
> +			svm_disable_iret_interception(svm);
> +	}
> +}
> +
>  bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
>  	struct vmcb *vmcb = svm->vmcb;
> -	bool ret;
>  
>  	if (!gif_set(svm))
>  		return true;
> @@ -3594,10 +3659,10 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
>  	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
>  		return false;
>  
> -	ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
> -	      (svm->nmi_masked);
> +	if (svm_get_nmi_mask(vcpu))
> +		return true;
>  
> -	return ret;
> +	return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
>  }
>  
>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
> @@ -3615,24 +3680,6 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
>  	return 1;
>  }
>  
> -static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
> -{
> -	return to_svm(vcpu)->nmi_masked;
> -}
> -
> -static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
> -{
> -	struct vcpu_svm *svm = to_svm(vcpu);
> -
> -	if (masked) {
> -		svm->nmi_masked = true;
> -		svm_enable_iret_interception(svm);
> -	} else {
> -		svm->nmi_masked = false;
> -		svm_disable_iret_interception(svm);
> -	}
> -}
> -
>  bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -3725,10 +3772,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
>  	/*
>  	 * Something prevents NMI from been injected. Single step over possible
>  	 * problem (IRET or exception injection or interrupt shadow)
> +	 *
> +	 * With vNMI we should never need an NMI window
> +	 * (we can always inject vNMI either by setting VNMI_PENDING or by EVENTINJ)
>  	 */
> +	if (WARN_ON_ONCE(is_vnmi_enabled(svm)))
> +		return;
> +
>  	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
> -	svm->nmi_singlestep = true;
>  	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
> +	svm->nmi_singlestep = true;
>  }
>  
>  static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
> @@ -4770,6 +4823,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>  	.patch_hypercall = svm_patch_hypercall,
>  	.inject_irq = svm_inject_irq,
>  	.inject_nmi = svm_inject_nmi,
> +	.get_hw_nmi_pending = svm_get_hw_nmi_pending,
> +	.set_hw_nmi_pending = svm_set_hw_nmi_pending,
>  	.inject_exception = svm_inject_exception,
>  	.cancel_injection = svm_cancel_injection,
>  	.interrupt_allowed = svm_interrupt_allowed,
> @@ -5058,6 +5113,16 @@ static __init int svm_hardware_setup(void)
>  			pr_info("Virtual GIF supported\n");
>  	}
>  
> +
> +	vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_AMD_VNMI);
> +	if (vnmi)
> +		pr_info("Virtual NMI enabled\n");
> +
> +	if (!vnmi) {
> +		svm_x86_ops.get_hw_nmi_pending = NULL;
> +		svm_x86_ops.set_hw_nmi_pending = NULL;
> +	}
> +
>  	if (lbrv) {
>  		if (!boot_cpu_has(X86_FEATURE_LBRV))
>  			lbrv = false;
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 587ddc150f9f34..0b7e1790fadde1 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -35,6 +35,7 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
>  extern bool npt_enabled;
>  extern int vgif;
>  extern bool intercept_smi;
> +extern bool vnmi;
>  
>  enum avic_modes {
>  	AVIC_MODE_NONE = 0,
> @@ -553,6 +554,15 @@ static inline bool is_x2apic_msrpm_offset(u32 offset)
>  	       (msr < (APIC_BASE_MSR + 0x100));
>  }
>  
> +static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
> +{
> +	/* L1's vNMI is inhibited while nested guest is running */
> +	if (is_guest_mode(&svm->vcpu))
> +		return false;
> +
> +	return !!(svm->vmcb01.ptr->control.int_ctl & V_NMI_ENABLE);
> +}
> +
>  /* svm.c */
>  #define MSR_INVALID				0xffffffffU
>
Shukla, Santosh Dec. 5, 2022, 5:07 p.m. UTC | #2
On 11/30/2022 1:07 AM, Maxim Levitsky wrote:
> This patch implements support for injecting pending
> NMIs via the .kvm_x86_set_hw_nmi_pending using new AMD's vNMI
> feature.
> 
> Note that the vNMI can't cause a VM exit, which is needed
> when a nested guest intercepts NMIs.
> 
> Therefore to avoid breaking nesting, the vNMI is inhibited while
> a nested guest is running and instead, the legacy NMI window
> detection and delivery method is used.
> 
> While it is possible to passthrough the vNMI if a nested guest
> doesn't intercept NMIs, such usage is very uncommon, and it's
> not worth to optimize for.
> 
> Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  arch/x86/kvm/svm/nested.c |  42 +++++++++++++++
>  arch/x86/kvm/svm/svm.c    | 111 ++++++++++++++++++++++++++++++--------
>  arch/x86/kvm/svm/svm.h    |  10 ++++
>  3 files changed, 140 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index e891318595113e..5bea672bf8b12d 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -623,6 +623,42 @@ static bool is_evtinj_nmi(u32 evtinj)
>  	return type == SVM_EVTINJ_TYPE_NMI;
>  }
>  
> +static void nested_svm_save_vnmi(struct vcpu_svm *svm)
> +{
> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> +	/*
> +	 * Copy the vNMI state back to software NMI tracking state
> +	 * for the duration of the nested run
> +	 */
> +
nits - Extra line.

> +	svm->nmi_masked = vmcb01->control.int_ctl & V_NMI_MASK;
> +	svm->vcpu.arch.nmi_pending += vmcb01->control.int_ctl & V_NMI_PENDING;
> +}
> +
> +static void nested_svm_restore_vnmi(struct vcpu_svm *svm)
> +{
> +	struct kvm_vcpu *vcpu = &svm->vcpu;
> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> +	/*
> +	 * Restore the vNMI state from the software NMI tracking state
> +	 * after a nested run
> +	 */
> +
Ditto...

Thanks,
Santosh
> +	if (svm->nmi_masked)
> +		vmcb01->control.int_ctl |= V_NMI_MASK;
> +	else
> +		vmcb01->control.int_ctl &= ~V_NMI_MASK;
> +
> +	if (vcpu->arch.nmi_pending) {
> +		vcpu->arch.nmi_pending--;
> +		vmcb01->control.int_ctl |= V_NMI_PENDING;
> +	} else
> +		vmcb01->control.int_ctl &= ~V_NMI_PENDING;
> +}
> +
> +
>  static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
>  					  unsigned long vmcb12_rip,
>  					  unsigned long vmcb12_csbase)
> @@ -646,6 +682,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
>  	else
>  		int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
>  
> +	if (vnmi)
> +		nested_svm_save_vnmi(svm);
> +
>  	/* Copied from vmcb01.  msrpm_base can be overwritten later.  */
>  	vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
>  	vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
> @@ -1049,6 +1088,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>  		svm_update_lbrv(vcpu);
>  	}
>  
> +	if (vnmi)
> +		nested_svm_restore_vnmi(svm);
> +
>  	/*
>  	 * On vmexit the  GIF is set to false and
>  	 * no event can be injected in L1.
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index cfed6ab29c839a..bf10adcf3170a8 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -230,6 +230,8 @@ module_param(dump_invalid_vmcb, bool, 0644);
>  bool intercept_smi = true;
>  module_param(intercept_smi, bool, 0444);
>  
> +bool vnmi = true;
> +module_param(vnmi, bool, 0444);
>  
>  static bool svm_gp_erratum_intercept = true;
>  
> @@ -1299,6 +1301,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
>  	if (kvm_vcpu_apicv_active(vcpu))
>  		avic_init_vmcb(svm, vmcb);
>  
> +	if (vnmi)
> +		svm->vmcb->control.int_ctl |= V_NMI_ENABLE;
> +
>  	if (vgif) {
>  		svm_clr_intercept(svm, INTERCEPT_STGI);
>  		svm_clr_intercept(svm, INTERCEPT_CLGI);
> @@ -3487,6 +3492,39 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
>  	++vcpu->stat.nmi_injections;
>  }
>  
> +
> +static bool svm_get_hw_nmi_pending(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (!is_vnmi_enabled(svm))
> +		return false;
> +
> +	return !!(svm->vmcb->control.int_ctl & V_NMI_MASK);
> +}
> +
> +static bool svm_set_hw_nmi_pending(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (!is_vnmi_enabled(svm))
> +		return false;
> +
> +	if (svm->vmcb->control.int_ctl & V_NMI_PENDING)
> +		return false;
> +
> +	svm->vmcb->control.int_ctl |= V_NMI_PENDING;
> +	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
> +
> +	/*
> +	 * NMI isn't yet technically injected but
> +	 * this rough estimation should be good enough
> +	 */
> +	++vcpu->stat.nmi_injections;
> +
> +	return true;
> +}
> +
>  static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -3582,11 +3620,38 @@ static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
>  }
>  
> +static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (is_vnmi_enabled(svm))
> +		return svm->vmcb->control.int_ctl & V_NMI_MASK;
> +	else
> +		return svm->nmi_masked;
> +}
> +
> +static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (is_vnmi_enabled(svm)) {
> +		if (masked)
> +			svm->vmcb->control.int_ctl |= V_NMI_MASK;
> +		else
> +			svm->vmcb->control.int_ctl &= ~V_NMI_MASK;
> +	} else {
> +		svm->nmi_masked = masked;
> +		if (masked)
> +			svm_enable_iret_interception(svm);
> +		else
> +			svm_disable_iret_interception(svm);
> +	}
> +}
> +
>  bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
>  	struct vmcb *vmcb = svm->vmcb;
> -	bool ret;
>  
>  	if (!gif_set(svm))
>  		return true;
> @@ -3594,10 +3659,10 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
>  	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
>  		return false;
>  
> -	ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
> -	      (svm->nmi_masked);
> +	if (svm_get_nmi_mask(vcpu))
> +		return true;
>  
> -	return ret;
> +	return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
>  }
>  
>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
> @@ -3615,24 +3680,6 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
>  	return 1;
>  }
>  
> -static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
> -{
> -	return to_svm(vcpu)->nmi_masked;
> -}
> -
> -static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
> -{
> -	struct vcpu_svm *svm = to_svm(vcpu);
> -
> -	if (masked) {
> -		svm->nmi_masked = true;
> -		svm_enable_iret_interception(svm);
> -	} else {
> -		svm->nmi_masked = false;
> -		svm_disable_iret_interception(svm);
> -	}
> -}
> -
>  bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -3725,10 +3772,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
>  	/*
>  	 * Something prevents NMI from been injected. Single step over possible
>  	 * problem (IRET or exception injection or interrupt shadow)
> +	 *
> +	 * With vNMI we should never need an NMI window
> +	 * (we can always inject vNMI either by setting VNMI_PENDING or by EVENTINJ)
>  	 */
> +	if (WARN_ON_ONCE(is_vnmi_enabled(svm)))
> +		return;
> +
>  	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
> -	svm->nmi_singlestep = true;
>  	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
> +	svm->nmi_singlestep = true;
>  }
>  
>  static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
> @@ -4770,6 +4823,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>  	.patch_hypercall = svm_patch_hypercall,
>  	.inject_irq = svm_inject_irq,
>  	.inject_nmi = svm_inject_nmi,
> +	.get_hw_nmi_pending = svm_get_hw_nmi_pending,
> +	.set_hw_nmi_pending = svm_set_hw_nmi_pending,
>  	.inject_exception = svm_inject_exception,
>  	.cancel_injection = svm_cancel_injection,
>  	.interrupt_allowed = svm_interrupt_allowed,
> @@ -5058,6 +5113,16 @@ static __init int svm_hardware_setup(void)
>  			pr_info("Virtual GIF supported\n");
>  	}
>  
> +
> +	vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_AMD_VNMI);
> +	if (vnmi)
> +		pr_info("Virtual NMI enabled\n");
> +
> +	if (!vnmi) {
> +		svm_x86_ops.get_hw_nmi_pending = NULL;
> +		svm_x86_ops.set_hw_nmi_pending = NULL;
> +	}
> +
>  	if (lbrv) {
>  		if (!boot_cpu_has(X86_FEATURE_LBRV))
>  			lbrv = false;
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 587ddc150f9f34..0b7e1790fadde1 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -35,6 +35,7 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
>  extern bool npt_enabled;
>  extern int vgif;
>  extern bool intercept_smi;
> +extern bool vnmi;
>  
>  enum avic_modes {
>  	AVIC_MODE_NONE = 0,
> @@ -553,6 +554,15 @@ static inline bool is_x2apic_msrpm_offset(u32 offset)
>  	       (msr < (APIC_BASE_MSR + 0x100));
>  }
>  
> +static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
> +{
> +	/* L1's vNMI is inhibited while nested guest is running */
> +	if (is_guest_mode(&svm->vcpu))
> +		return false;
> +
> +	return !!(svm->vmcb01.ptr->control.int_ctl & V_NMI_ENABLE);
> +}
> +
>  /* svm.c */
>  #define MSR_INVALID				0xffffffffU
>
Sean Christopherson Jan. 28, 2023, 1:10 a.m. UTC | #3
On Tue, Nov 29, 2022, Maxim Levitsky wrote:
> This patch implements support for injecting pending
> NMIs via the .kvm_x86_set_hw_nmi_pending using new AMD's vNMI
> feature.
> 
> Note that the vNMI can't cause a VM exit, which is needed
> when a nested guest intercepts NMIs.
> 
> Therefore to avoid breaking nesting, the vNMI is inhibited while
> a nested guest is running and instead, the legacy NMI window
> detection and delivery method is used.
> 
> While it is possible to passthrough the vNMI if a nested guest
> doesn't intercept NMIs, such usage is very uncommon, and it's
> not worth to optimize for.
> 
> Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  arch/x86/kvm/svm/nested.c |  42 +++++++++++++++
>  arch/x86/kvm/svm/svm.c    | 111 ++++++++++++++++++++++++++++++--------
>  arch/x86/kvm/svm/svm.h    |  10 ++++
>  3 files changed, 140 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index e891318595113e..5bea672bf8b12d 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -623,6 +623,42 @@ static bool is_evtinj_nmi(u32 evtinj)
>  	return type == SVM_EVTINJ_TYPE_NMI;
>  }
>  
> +static void nested_svm_save_vnmi(struct vcpu_svm *svm)
> +{
> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> +	/*
> +	 * Copy the vNMI state back to software NMI tracking state
> +	 * for the duration of the nested run
> +	 */
> +

Unnecessary newline.

> +	svm->nmi_masked = vmcb01->control.int_ctl & V_NMI_MASK;
> +	svm->vcpu.arch.nmi_pending += vmcb01->control.int_ctl & V_NMI_PENDING;
> +}
> +
> +static void nested_svm_restore_vnmi(struct vcpu_svm *svm)
> +{
> +	struct kvm_vcpu *vcpu = &svm->vcpu;
> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> +	/*
> +	 * Restore the vNMI state from the software NMI tracking state
> +	 * after a nested run
> +	 */
> +

Unnecessary newline.

> +	if (svm->nmi_masked)
> +		vmcb01->control.int_ctl |= V_NMI_MASK;
> +	else
> +		vmcb01->control.int_ctl &= ~V_NMI_MASK;
> +
> +	if (vcpu->arch.nmi_pending) {
> +		vcpu->arch.nmi_pending--;
> +		vmcb01->control.int_ctl |= V_NMI_PENDING;
> +	} else
> +		vmcb01->control.int_ctl &= ~V_NMI_PENDING;

Needs curly braces.
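
E.g. the quoted snippet with braces added on both paths:

	if (vcpu->arch.nmi_pending) {
		vcpu->arch.nmi_pending--;
		vmcb01->control.int_ctl |= V_NMI_PENDING;
	} else {
		vmcb01->control.int_ctl &= ~V_NMI_PENDING;
	}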
Sean Christopherson Feb. 1, 2023, 12:22 a.m. UTC | #4
On Tue, Nov 29, 2022, Maxim Levitsky wrote:
> This patch implements support for injecting pending
> NMIs via the .kvm_x86_set_hw_nmi_pending using new AMD's vNMI

Wrap closer to 75 chars, and please try to be consistent in how your format
things like changelogs and comments.  It's jarring as a reader when the wrap
column is constantly changing.

> feature.

Please combine the introduction, usage, and implementation of the new kvm_x86_ops,
i.e. introduce and use the ops in this patch.  It's extremely difficult to review
the common x86 code that uses the ops without seeing how they're implemented in
SVM.  I believe the overall size/scope of the patch can be kept reasonable by
introducing some of the common changes in advance of the new ops, e.g. tweaking
the KVM_SET_VCPU_EVENTS flow.

> Note that the vNMI can't cause a VM exit, which is needed
> when a nested guest intercepts NMIs.

I can't tell if this is saying "SVM doesn't allow intercepting virtual NMIs", or
"KVM never enables virtual NMI interception".

> Therefore to avoid breaking nesting, the vNMI is inhibited while
> a nested guest is running and instead, the legacy NMI window
> detection and delivery method is used.

State what KVM does, don't describe the effects.  E.g. "Disable vNMI while running
L2".  When a changelog describes the effects, it's unclear whether the effects are
new behavior introduced by the patch, hardware behavior, etc...

> While it is possible to passthrough the vNMI if a nested guest
> doesn't intercept NMIs, such usage is very uncommon, and it's
> not worth to optimize for.

Can you elaborate on why not?  It's not obvious to me that the code will end up
more complex, and oftentimes omitting code increases the cognitive load on readers,
i.e. makes things more complex in a way.  vNMI is mutually exclusive with NMI
passthrough, i.e. KVM doesn't need to handle NMI passthrough and vNMI simultaneously.

> Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>

SoB chain is wrong.  Maxim is credited as the sole Author, i.e. Santosh shouldn't
have a SoB.  Assuming the intent is to attribute both of ya'll this needs to be

 Co-developed-by: Santosh Shukla <santosh.shukla@amd.com>
 Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
 Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>

if Maxim is the primary author, or this if Santosh is the primary author

 From: Santosh Shukla <santosh.shukla@amd.com>

 <blah blah blah>

 Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
 Co-developed-by: Maxim Levitsky <mlevitsk@redhat.com>
 Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>

> ---
>  arch/x86/kvm/svm/nested.c |  42 +++++++++++++++
>  arch/x86/kvm/svm/svm.c    | 111 ++++++++++++++++++++++++++++++--------
>  arch/x86/kvm/svm/svm.h    |  10 ++++
>  3 files changed, 140 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index e891318595113e..5bea672bf8b12d 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -623,6 +623,42 @@ static bool is_evtinj_nmi(u32 evtinj)
>  	return type == SVM_EVTINJ_TYPE_NMI;
>  }
>  
> +static void nested_svm_save_vnmi(struct vcpu_svm *svm)

Please avoid save/restore names.  KVM (selftests in particular) uses save/restore
to refer to a saving and restoring state across a migration.  "sync" is probably
the best option, or just open code the flows.

> +{
> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> +	/*
> +	 * Copy the vNMI state back to software NMI tracking state
> +	 * for the duration of the nested run
> +	 */
> +
> +	svm->nmi_masked = vmcb01->control.int_ctl & V_NMI_MASK;
> +	svm->vcpu.arch.nmi_pending += vmcb01->control.int_ctl & V_NMI_PENDING;

This is wrong.  V_NMI_PENDING is bit 11, i.e. the bitwise-AND does not yield a
boolean value and will increment nmi_pending by 2048 instead of by 1.

	if (vmcb01->control.int_ctl & V_NMI_PENDING)
		svm->vcpu.arch.nmi_pending++;

And this needs a KVM_REQ_EVENT to ensure KVM processes the newly pending NMI.
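
I.e. something along the lines of (a sketch of the suggested fix):

	if (vmcb01->control.int_ctl & V_NMI_PENDING) {
		svm->vcpu.arch.nmi_pending++;
		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
	}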

> +}
> +
> +static void nested_svm_restore_vnmi(struct vcpu_svm *svm)
> +{
> +	struct kvm_vcpu *vcpu = &svm->vcpu;
> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> +	/*
> +	 * Restore the vNMI state from the software NMI tracking state
> +	 * after a nested run
> +	 */
> +
> +	if (svm->nmi_masked)
> +		vmcb01->control.int_ctl |= V_NMI_MASK;
> +	else
> +		vmcb01->control.int_ctl &= ~V_NMI_MASK;

As proposed, this needs to clear nmi_masked to avoid false positives.  The better
approach is to not have any open coded accesses to svm->nmi_masked outside of
flows that specifically need to deal with vNMI logic.

E.g. svm_enable_nmi_window() reads the raw nmi_masked.
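
E.g. a sketch of the restore path that also clears the software flag (assuming the
open coded access stays for now):

	if (svm->nmi_masked) {
		svm->nmi_masked = false;
		vmcb01->control.int_ctl |= V_NMI_MASK;
	} else {
		vmcb01->control.int_ctl &= ~V_NMI_MASK;
	}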

> +
> +	if (vcpu->arch.nmi_pending) {
> +		vcpu->arch.nmi_pending--;
> +		vmcb01->control.int_ctl |= V_NMI_PENDING;
> +	} else

Curly braces on all paths if any path needs 'em.

> +		vmcb01->control.int_ctl &= ~V_NMI_PENDING;
> +}

...

> + static bool svm_set_hw_nmi_pending(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (!is_vnmi_enabled(svm))
> +		return false;
> +
> +	if (svm->vmcb->control.int_ctl & V_NMI_PENDING)
> +		return false;
> +
> +	svm->vmcb->control.int_ctl |= V_NMI_PENDING;
> +	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
> +
> +	/*
> +	 * NMI isn't yet technically injected but
> +	 * this rough estimation should be good enough

Again, wrap at 80 chars, not at random points.

> +	 */
> +	++vcpu->stat.nmi_injections;
> +
> +	return true;
> +}
> +

...

>  bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -3725,10 +3772,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
>  	/*
>  	 * Something prevents NMI from been injected. Single step over possible
>  	 * problem (IRET or exception injection or interrupt shadow)
> +	 *
> +	 * With vNMI we should never need an NMI window
> +	 * (we can always inject vNMI either by setting VNMI_PENDING or by EVENTINJ)

Please honor the soft limit and avoid pronouns.  There's also no need to put the
blurb in parantheses on its own line.

As for the code, I believe there are bugs.  Pulling in the context...

	if (svm->nmi_masked && !svm->awaiting_iret_completion)
		return; /* IRET will cause a vm exit */

Checking nmi_masked is wrong, this should use the helper.  Even if this code can
only be reached on error, it should still try its best to not make things worse.

	if (!gif_set(svm)) {
		if (vgif)
			svm_set_intercept(svm, INTERCEPT_STGI);
		return; /* STGI will cause a vm exit */
	}

	/*
	 * Something prevents NMI from been injected. Single step over possible
	 * problem (IRET or exception injection or interrupt shadow)
	 *
	 * With vNMI we should never need an NMI window
	 * (we can always inject vNMI either by setting VNMI_PENDING or by EVENTINJ)
	 */
	if (WARN_ON_ONCE(is_vnmi_enabled(svm)))
		return;

This is flawed, where "this" means handling of NMI windows when vNMI is enabled.

IIUC, if there are back-to-back NMIs, the intent is to set V_NMI for one and
inject the other.  I believe there are multiple bugs in svm_inject_nmi().  The one
that's definitely a bug is setting svm->nmi_masked.  The other suspected bug,
which is related to the above WARN, is setting the IRET intercept.  The resulting
IRET interception will set awaiting_iret_completion, and then the above WARN is
reachable (even if the masking check is fixed).

I don't think KVM needs to ever intercept IRET.  One NMI gets injected, and the
other is sitting in INT_CTL.V_NMI_PENDING, i.e. there's no need for KVM to regain
control.  If another NMI comes along before V_NMI_PENDING is handled, it will
either get injected or dropped.

So I _think_ KVM can skip the intercept code when injecting an NMI, and this WARN
can be hoisted to the top of svm_enable_nmi_window(), because as stated above, KVM
should never need to request an NMI window.

Last thought, unless there's something that will obviously break, it's probably
better to WARN and continue than to bail.  I.e. do the single-step and hope for
the best.  Bailing at this point doesn't seem like it would help.
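
For example, svm_inject_nmi() might end up looking something like this (a rough
sketch of the idea only, not tested against the rest of the series):

	static void svm_inject_nmi(struct kvm_vcpu *vcpu)
	{
		struct vcpu_svm *svm = to_svm(vcpu);

		svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;

		/* With vNMI, hardware tracks NMI masking; no IRET intercept needed. */
		if (!is_vnmi_enabled(svm)) {
			svm->nmi_masked = true;
			svm_enable_iret_interception(svm);
		}
		++vcpu->stat.nmi_injections;
	}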

>  	 */
> +	if (WARN_ON_ONCE(is_vnmi_enabled(svm)))
> +		return;
> +
>  	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
> -	svm->nmi_singlestep = true;
>  	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
> +	svm->nmi_singlestep = true;
>  }

...

> @@ -553,6 +554,15 @@ static inline bool is_x2apic_msrpm_offset(u32 offset)
>  	       (msr < (APIC_BASE_MSR + 0x100));
>  }
>  
> +static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
> +{
> +	/* L1's vNMI is inhibited while nested guest is running */
> +	if (is_guest_mode(&svm->vcpu))

I would rather check the current VMCB.  I don't see any value in hardcoding the
"KVM doesn't support vNMI in L2" in multiple places.  And I find the above comment
about "L1's vNMI is inhibited" confusing.  vNMI isn't inhibited/blocked, KVM just
doesn't utilize vNMI while L2 is active (IIUC, as proposed).

> +		return false;
> +
> +	return !!(svm->vmcb01.ptr->control.int_ctl & V_NMI_ENABLE);
> +}
> +
>  /* svm.c */
>  #define MSR_INVALID				0xffffffffU
>  
> -- 
> 2.26.3
>
Sean Christopherson Feb. 1, 2023, 12:39 a.m. UTC | #5
On Wed, Feb 01, 2023, Sean Christopherson wrote:
> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
> > @@ -553,6 +554,15 @@ static inline bool is_x2apic_msrpm_offset(u32 offset)
> >  	       (msr < (APIC_BASE_MSR + 0x100));
> >  }
> >  
> > +static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
> > +{
> > +	/* L1's vNMI is inhibited while nested guest is running */
> > +	if (is_guest_mode(&svm->vcpu))
> 
> I would rather check the current VMCB.  I don't see any value in hardcoding the
> "KVM doesn't support vNMI in L2" in multiple places.  And I find the above comment
> about "L1's vNMI is inhibited" confusing.  vNMI isn't inhibited/blocked, KVM just
> doesn't utilize vNMI while L2 is active (IIUC, as proposed).

Oof.  Scratch that, code is correct as proposed, but the comment and function name
are confusing.

After looking at the nested support, is_vnmi_enabled() actually means "is vNMI
currently enabled _for L1_".  And it has less to do with vNMI being "inhibited" and
everything to do with KVM choosing not to utilize vNMI for L1 while running L2.

"inhibited" in quotes because "inhibited" is a synonym of "blocked", i.e. I read
that as L1 NMIs are blocked.

So regardless of whether or not KVM decides to utilize vNMI for L2 if L1's NMIs
are passed through, the function name needs to clarify that it's referring to
L1.  E.g. is_vnmi_enabled_for_l1() or so.

And if KVM decides not to use vNMI for L1 while running L2, state that more
explicitly instead of saying it's inhibited.
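
E.g. something like (a sketch, keeping the proposed logic as-is):

	static inline bool is_vnmi_enabled_for_l1(struct vcpu_svm *svm)
	{
		/* KVM does not utilize vNMI for L1 while L2 is active. */
		if (is_guest_mode(&svm->vcpu))
			return false;

		return !!(svm->vmcb01.ptr->control.int_ctl & V_NMI_ENABLE);
	}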

And if KVM does decide to use vNMI while running L2 if NMIs are passed through,
then the comment goes away and KVM toggles the flag in vmcb01 on nested enter
and exit.

> > +		return false;
> > +
> > +	return !!(svm->vmcb01.ptr->control.int_ctl & V_NMI_ENABLE);
> > +}
> > +
> >  /* svm.c */
> >  #define MSR_INVALID				0xffffffffU
> >  
> > -- 
> > 2.26.3
> >
Shukla, Santosh Feb. 10, 2023, 12:02 p.m. UTC | #6
On 1/28/2023 6:40 AM, Sean Christopherson wrote:
> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
>> This patch implements support for injecting pending
>> NMIs via the .kvm_x86_set_hw_nmi_pending using new AMD's vNMI
>> feature.
>>
>> Note that the vNMI can't cause a VM exit, which is needed
>> when a nested guest intercepts NMIs.
>>
>> Therefore to avoid breaking nesting, the vNMI is inhibited while
>> a nested guest is running and instead, the legacy NMI window
>> detection and delivery method is used.
>>
>> While it is possible to passthrough the vNMI if a nested guest
>> doesn't intercept NMIs, such usage is very uncommon, and it's
>> not worth to optimize for.
>>
>> Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
>> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
>> ---
>>  arch/x86/kvm/svm/nested.c |  42 +++++++++++++++
>>  arch/x86/kvm/svm/svm.c    | 111 ++++++++++++++++++++++++++++++--------
>>  arch/x86/kvm/svm/svm.h    |  10 ++++
>>  3 files changed, 140 insertions(+), 23 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
>> index e891318595113e..5bea672bf8b12d 100644
>> --- a/arch/x86/kvm/svm/nested.c
>> +++ b/arch/x86/kvm/svm/nested.c
>> @@ -623,6 +623,42 @@ static bool is_evtinj_nmi(u32 evtinj)
>>  	return type == SVM_EVTINJ_TYPE_NMI;
>>  }
>>  
>> +static void nested_svm_save_vnmi(struct vcpu_svm *svm)
>> +{
>> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
>> +
>> +	/*
>> +	 * Copy the vNMI state back to software NMI tracking state
>> +	 * for the duration of the nested run
>> +	 */
>> +
> 
> Unecessary newline.
> 
ok.

>> +	svm->nmi_masked = vmcb01->control.int_ctl & V_NMI_MASK;
>> +	svm->vcpu.arch.nmi_pending += vmcb01->control.int_ctl & V_NMI_PENDING;
>> +}
>> +
>> +static void nested_svm_restore_vnmi(struct vcpu_svm *svm)
>> +{
>> +	struct kvm_vcpu *vcpu = &svm->vcpu;
>> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
>> +
>> +	/*
>> +	 * Restore the vNMI state from the software NMI tracking state
>> +	 * after a nested run
>> +	 */
>> +
> 
> Unnecessary newline.
>

ok.
 
>> +	if (svm->nmi_masked)
>> +		vmcb01->control.int_ctl |= V_NMI_MASK;
>> +	else
>> +		vmcb01->control.int_ctl &= ~V_NMI_MASK;
>> +
>> +	if (vcpu->arch.nmi_pending) {
>> +		vcpu->arch.nmi_pending--;
>> +		vmcb01->control.int_ctl |= V_NMI_PENDING;
>> +	} else
>> +		vmcb01->control.int_ctl &= ~V_NMI_PENDING;
> 
> Needs curly braces.
> 
Yes.

Thanks,
Santosh
Shukla, Santosh Feb. 10, 2023, 12:24 p.m. UTC | #7
On 2/1/2023 5:52 AM, Sean Christopherson wrote:
> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
>> This patch implements support for injecting pending
>> NMIs via the .kvm_x86_set_hw_nmi_pending using new AMD's vNMI
> 
> Wrap closer to 75 chars, and please try to be consistent in how your format
> things like changelogs and comments.  It's jarring as a reader when the wrap
> column is constantly changing.
> 
>> feature.
> 
> Please combine the introduction, usage, and implementation of the hew kvm_x86_ops,
> i.e. introduce and use the ops in this patch.  It's extremely difficult to review
> the common x86 code that uses the ops without seeing how they're implemented in
> SVM.  I believe the overall size/scope of the patch can be kept reasonable by
> introducing some of the common changes in advance of the new ops, e.g. tweaking
> the KVM_SET_VCPU_EVENTS flow.
> 
>> Note that the vNMI can't cause a VM exit, which is needed
>> when a nested guest intercepts NMIs.
> 
> I can't tell if this is saying "SVM doesn't allow intercepting virtual NMIs", or
> "KVM never enables virtual NMI interception".
> 

I think it meant to say that vNMI doesn't need the NMI-window-exiting feature to
pend a new virtual NMI. Will reword.

>> Therefore to avoid breaking nesting, the vNMI is inhibited while
>> a nested guest is running and instead, the legacy NMI window
>> detection and delivery method is used.
> 
> State what KVM does, don't describe the effects.  E.g. "Disable vNMI while running
> L2".  When a changelog describes the effects, it's unclear whether the effects are
> new behavior introduced by the patch, hardware behavior, etc...
> 
>> While it is possible to passthrough the vNMI if a nested guest
>> doesn't intercept NMIs, such usage is very uncommon, and it's
>> not worth to optimize for.
> 
> Can you elaborate on why not?  It's not obvious to me that the code will end up
> more complex, and oftentimes omitting code increases the cognitive load on readers,
> i.e. makes things more complex in a way.  vNMI is mutually exclusive with NMI
> passthrough, i.e. KVM doesn't need to handle NMI passthrough and vNMI simultaneously.
> 
>> Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
>> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> 
> SoB chain is wrong.  Maxim is credited as the sole Author, i.e. Santosh shouldn't
> have a SoB.  Assuming the intent is to attribute both of ya'll this needs to be
> 
>  Co-developed-by: Santosh Shukla <santosh.shukla@amd.com>
>  Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
>  Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> 
> if Maxim is the primary author, or this if Santosh is the primary author
> 
>  From: Santosh Shukla <santosh.shukla@amd.com>
> 
>  <blah blah blah>
> 
>  Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
>  Developed-by: Maxim Levitsky <mlevitsk@redhat.com>
>  Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> 

Will sort those in v3.

>> ---
>>  arch/x86/kvm/svm/nested.c |  42 +++++++++++++++
>>  arch/x86/kvm/svm/svm.c    | 111 ++++++++++++++++++++++++++++++--------
>>  arch/x86/kvm/svm/svm.h    |  10 ++++
>>  3 files changed, 140 insertions(+), 23 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
>> index e891318595113e..5bea672bf8b12d 100644
>> --- a/arch/x86/kvm/svm/nested.c
>> +++ b/arch/x86/kvm/svm/nested.c
>> @@ -623,6 +623,42 @@ static bool is_evtinj_nmi(u32 evtinj)
>>  	return type == SVM_EVTINJ_TYPE_NMI;
>>  }
>>  
>> +static void nested_svm_save_vnmi(struct vcpu_svm *svm)
> 
> Please avoid save/restore names.  KVM (selftests in particular) uses save/restore
> to refer to a saving and restoring state across a migration.  "sync" is probably
> the best option, or just open code the flows.
> 

ok.

I chose to open code it; that way I don't need to consider using svm->nmi_masked, which
should only be used for the non-vNMI case.

>> +{
>> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
>> +
>> +	/*
>> +	 * Copy the vNMI state back to software NMI tracking state
>> +	 * for the duration of the nested run
>> +	 */
>> +
>> +	svm->nmi_masked = vmcb01->control.int_ctl & V_NMI_MASK;
>> +	svm->vcpu.arch.nmi_pending += vmcb01->control.int_ctl & V_NMI_PENDING;
> 
> This is wrong.  V_NMI_PENDING is bit 11, i.e. the bitwise-AND does not yield a
> boolean value and will increment nmi_pending by 2048 instead of by 1.
>

Right.
 
> 	if (vmcb01->control.int_ctl & V_NMI_PENDING)
> 		svm->vcpu.arch.nmi_pending++;
> 
> And this needs a KVM_REQ_EVENT to ensure KVM processes the newly pending NMI.
> 

Ok.

>> +}
>> +
>> +static void nested_svm_restore_vnmi(struct vcpu_svm *svm)
>> +{
>> +	struct kvm_vcpu *vcpu = &svm->vcpu;
>> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
>> +
>> +	/*
>> +	 * Restore the vNMI state from the software NMI tracking state
>> +	 * after a nested run
>> +	 */
>> +
>> +	if (svm->nmi_masked)
>> +		vmcb01->control.int_ctl |= V_NMI_MASK;
>> +	else
>> +		vmcb01->control.int_ctl &= ~V_NMI_MASK;
> 
> As proposed, this needs to clear nmi_masked to avoid false positives.  The better
> approach is to not have any open coded accesses to svm->nmi_masked outside of
> flows that specifically need to deal with vNMI logic.
>
ok.
 
> E.g. svm_enable_nmi_window() reads the raw nmi_masked.
> 
>> +
>> +	if (vcpu->arch.nmi_pending) {
>> +		vcpu->arch.nmi_pending--;
>> +		vmcb01->control.int_ctl |= V_NMI_PENDING;
>> +	} else
> 
> Curly braces on all paths if any path needs 'em.
>

ok. 

>> +		vmcb01->control.int_ctl &= ~V_NMI_PENDING;
>> +}
> 
> ...
> 
>> + static bool svm_set_hw_nmi_pending(struct kvm_vcpu *vcpu)
>> +{
>> +	struct vcpu_svm *svm = to_svm(vcpu);
>> +
>> +	if (!is_vnmi_enabled(svm))
>> +		return false;
>> +
>> +	if (svm->vmcb->control.int_ctl & V_NMI_PENDING)
>> +		return false;
>> +
>> +	svm->vmcb->control.int_ctl |= V_NMI_PENDING;
>> +	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
>> +
>> +	/*
>> +	 * NMI isn't yet technically injected but
>> +	 * this rough estimation should be good enough
> 
> Again, wrap at 80 chars, not at random points.
>

ok.
 
>> +	 */
>> +	++vcpu->stat.nmi_injections;
>> +
>> +	return true;
>> +}
>> +
> 
> ...
> 
>>  bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
>>  {
>>  	struct vcpu_svm *svm = to_svm(vcpu);
>> @@ -3725,10 +3772,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
>>  	/*
>>  	 * Something prevents NMI from been injected. Single step over possible
>>  	 * problem (IRET or exception injection or interrupt shadow)
>> +	 *
>> +	 * With vNMI we should never need an NMI window
>> +	 * (we can always inject vNMI either by setting VNMI_PENDING or by EVENTINJ)
> 
> Please honor the soft limit and avoid pronouns.  There's also no need to put the
> blurb in parantheses on its own line.
> 
> As for the code, I believe there are bugs.  Pulling in the context...
> 
> 	if (svm->nmi_masked && !svm->awaiting_iret_completion)
> 		return; /* IRET will cause a vm exit */
> 
> Checking nmi_masked is wrong, this should use the helper.  Even if this code can

Right.

> only be reached on error, it should still try its best to not make things worse.
> 
> 	if (!gif_set(svm)) {
> 		if (vgif)
> 			svm_set_intercept(svm, INTERCEPT_STGI);
> 		return; /* STGI will cause a vm exit */
> 	}
> 
> 	/*
> 	 * Something prevents NMI from been injected. Single step over possible
> 	 * problem (IRET or exception injection or interrupt shadow)
> 	 *
> 	 * With vNMI we should never need an NMI window
> 	 * (we can always inject vNMI either by setting VNMI_PENDING or by EVENTINJ)
> 	 */
> 	if (WARN_ON_ONCE(is_vnmi_enabled(svm)))
> 		return;
> 
> This is flawed, where "this" means handling of NMI windows when vNMI is enabled.
> 
> IIUC, if there are back-to-back NMIs, the intent is to set V_NMI for one and
> inject the other.  I believe there are multiple bugs svm_inject_nmi().  The one
> that's definitely a bug is setting svm->nmi_masked.  The other suspected bug,
> which is related to the above WARN, is setting the IRET intercept.  The resulting
> IRET interception will set awaiting_iret_completion, and then the above WARN is
> reachable (even if the masking check is fixed).
> 
> I don't think KVM needs to ever intercept IRET.  One NMI gets injected, and the
> other is sitting in INT_CTL.V_NMI_PENDING, i.e. there's no need for KVM to regain
> control.  If another NMI comes along before V_NMI_PENDING is handled, it will
> either get injected or dropped.
> 
> So I _think_ KVM can skip the intercept code when injecting an NMI, and this WARN
> can be hoisted to the top of svm_enable_nmi_window(), because as stated above, KVM
> should never need to request an NMI window.
> 
> Last thought, unless there's something that will obviously break, it's probably
> better to WARN and continue than to bail.  I.e. do the single-step and hope for
> the best.  Bailing at this point doesn't seem like it would help.
> 
>>  	 */
>> +	if (WARN_ON_ONCE(is_vnmi_enabled(svm)))
>> +		return;
>> +
>>  	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
>> -	svm->nmi_singlestep = true;
>>  	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
>> +	svm->nmi_singlestep = true;
>>  }
> 

Ok.

So you mean: in vNMI mode, KVM should never need to request an NMI window, and if it
ever does reach the NMI-window code, then WARN_ON and continue to single-step. The
modified code may look something like below:

static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * With vNMI we should never need an NMI window.
         * and if we reach here then better WARN and continue to single step.
         */
        WARN_ON_ONCE(is_vnmi_enabled(svm));

        if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
                return; /* IRET will cause a vm exit */

        if (!gif_set(svm)) {
                if (vgif)
                        svm_set_intercept(svm, INTERCEPT_STGI);
                return; /* STGI will cause a vm exit */
        }

        /*
         * Something prevents NMI from been injected. Single step over possible
         * problem (IRET or exception injection or interrupt shadow)
         */

        svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
        svm->nmi_singlestep = true;
}

Does that make sense?

Thanks,
Santosh

> ...
> 
>> @@ -553,6 +554,15 @@ static inline bool is_x2apic_msrpm_offset(u32 offset)
>>  	       (msr < (APIC_BASE_MSR + 0x100));
>>  }
>>  
>> +static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
>> +{
>> +	/* L1's vNMI is inhibited while nested guest is running */
>> +	if (is_guest_mode(&svm->vcpu))
> 
> I would rather check the current VMCB.  I don't see any value in hardcoding the
> "KVM doesn't support vNMI in L2" in multiple places.  And I find the above comment
> about "L1's vNMI is inhibited" confusing.  vNMI isn't inhibited/blocked, KVM just
> doesn't utilize vNMI while L2 is active (IIUC, as proposed).
> 
>> +		return false;
>> +
>> +	return !!(svm->vmcb01.ptr->control.int_ctl & V_NMI_ENABLE);
>> +}
>> +
>>  /* svm.c */
>>  #define MSR_INVALID				0xffffffffU
>>  
>> -- 
>> 2.26.3
>>
Sean Christopherson Feb. 10, 2023, 4:44 p.m. UTC | #8
On Fri, Feb 10, 2023, Santosh Shukla wrote:
> On 2/1/2023 5:52 AM, Sean Christopherson wrote:
> So you mean.. In vNMI mode, KVM should never need to request NMI window and eventually
> it reaches to NMI window then WARN_ON and cont.. to single step... so modified code change
> may look something like below:
> 
> static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
> {
>         struct vcpu_svm *svm = to_svm(vcpu);
> 
>         /*
>          * With vNMI we should never need an NMI window.
>          * and if we reach here then better WARN and continue to single step.
>          */
>         WARN_ON_ONCE(is_vnmi_enabled(svm));
> 
>         if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
>                 return; /* IRET will cause a vm exit */
> 
>         if (!gif_set(svm)) {
>                 if (vgif)
>                         svm_set_intercept(svm, INTERCEPT_STGI);
>                 return; /* STGI will cause a vm exit */
>         }
> 
>         /*
>          * Something prevents NMI from been injected. Single step over possible
>          * problem (IRET or exception injection or interrupt shadow)
>          */
> 
>         svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
>         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
>         svm->nmi_singlestep = true;
> }
> 
> Does that make sense?

Yep.  Though please avoid "we" and other pronouns in changelogs and comments,
and wrap as close to the boundary as possible.

Patch

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index e891318595113e..5bea672bf8b12d 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -623,6 +623,42 @@  static bool is_evtinj_nmi(u32 evtinj)
 	return type == SVM_EVTINJ_TYPE_NMI;
 }
 
+static void nested_svm_save_vnmi(struct vcpu_svm *svm)
+{
+	struct vmcb *vmcb01 = svm->vmcb01.ptr;
+
+	/*
+	 * Copy the vNMI state back to software NMI tracking state
+	 * for the duration of the nested run
+	 */
+
+	svm->nmi_masked = vmcb01->control.int_ctl & V_NMI_MASK;
+	svm->vcpu.arch.nmi_pending += vmcb01->control.int_ctl & V_NMI_PENDING;
+}
+
+static void nested_svm_restore_vnmi(struct vcpu_svm *svm)
+{
+	struct kvm_vcpu *vcpu = &svm->vcpu;
+	struct vmcb *vmcb01 = svm->vmcb01.ptr;
+
+	/*
+	 * Restore the vNMI state from the software NMI tracking state
+	 * after a nested run
+	 */
+
+	if (svm->nmi_masked)
+		vmcb01->control.int_ctl |= V_NMI_MASK;
+	else
+		vmcb01->control.int_ctl &= ~V_NMI_MASK;
+
+	if (vcpu->arch.nmi_pending) {
+		vcpu->arch.nmi_pending--;
+		vmcb01->control.int_ctl |= V_NMI_PENDING;
+	} else
+		vmcb01->control.int_ctl &= ~V_NMI_PENDING;
+}
+
+
 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 					  unsigned long vmcb12_rip,
 					  unsigned long vmcb12_csbase)
@@ -646,6 +682,9 @@  static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 	else
 		int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
 
+	if (vnmi)
+		nested_svm_save_vnmi(svm);
+
 	/* Copied from vmcb01.  msrpm_base can be overwritten later.  */
 	vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
 	vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
@@ -1049,6 +1088,9 @@  int nested_svm_vmexit(struct vcpu_svm *svm)
 		svm_update_lbrv(vcpu);
 	}
 
+	if (vnmi)
+		nested_svm_restore_vnmi(svm);
+
 	/*
 	 * On vmexit the  GIF is set to false and
 	 * no event can be injected in L1.
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index cfed6ab29c839a..bf10adcf3170a8 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -230,6 +230,8 @@  module_param(dump_invalid_vmcb, bool, 0644);
 bool intercept_smi = true;
 module_param(intercept_smi, bool, 0444);
 
+bool vnmi = true;
+module_param(vnmi, bool, 0444);
 
 static bool svm_gp_erratum_intercept = true;
 
@@ -1299,6 +1301,9 @@  static void init_vmcb(struct kvm_vcpu *vcpu)
 	if (kvm_vcpu_apicv_active(vcpu))
 		avic_init_vmcb(svm, vmcb);
 
+	if (vnmi)
+		svm->vmcb->control.int_ctl |= V_NMI_ENABLE;
+
 	if (vgif) {
 		svm_clr_intercept(svm, INTERCEPT_STGI);
 		svm_clr_intercept(svm, INTERCEPT_CLGI);
@@ -3487,6 +3492,39 @@  static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 	++vcpu->stat.nmi_injections;
 }
 
+
+static bool svm_get_hw_nmi_pending(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!is_vnmi_enabled(svm))
+		return false;
+
+	return !!(svm->vmcb->control.int_ctl & V_NMI_MASK);
+}
+
+static bool svm_set_hw_nmi_pending(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!is_vnmi_enabled(svm))
+		return false;
+
+	if (svm->vmcb->control.int_ctl & V_NMI_PENDING)
+		return false;
+
+	svm->vmcb->control.int_ctl |= V_NMI_PENDING;
+	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+
+	/*
+	 * NMI isn't yet technically injected but
+	 * this rough estimation should be good enough
+	 */
+	++vcpu->stat.nmi_injections;
+
+	return true;
+}
+
 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -3582,11 +3620,38 @@  static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (is_vnmi_enabled(svm))
+		return svm->vmcb->control.int_ctl & V_NMI_MASK;
+	else
+		return svm->nmi_masked;
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (is_vnmi_enabled(svm)) {
+		if (masked)
+			svm->vmcb->control.int_ctl |= V_NMI_MASK;
+		else
+			svm->vmcb->control.int_ctl &= ~V_NMI_MASK;
+	} else {
+		svm->nmi_masked = masked;
+		if (masked)
+			svm_enable_iret_interception(svm);
+		else
+			svm_disable_iret_interception(svm);
+	}
+}
+
 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
-	bool ret;
 
 	if (!gif_set(svm))
 		return true;
@@ -3594,10 +3659,10 @@  bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
 	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
 		return false;
 
-	ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
-	      (svm->nmi_masked);
+	if (svm_get_nmi_mask(vcpu))
+		return true;
 
-	return ret;
+	return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
 }
 
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
@@ -3615,24 +3680,6 @@  static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 	return 1;
 }
 
-static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
-{
-	return to_svm(vcpu)->nmi_masked;
-}
-
-static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	if (masked) {
-		svm->nmi_masked = true;
-		svm_enable_iret_interception(svm);
-	} else {
-		svm->nmi_masked = false;
-		svm_disable_iret_interception(svm);
-	}
-}
-
 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -3725,10 +3772,16 @@  static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
 	/*
 	 * Something prevents NMI from been injected. Single step over possible
 	 * problem (IRET or exception injection or interrupt shadow)
+	 *
+	 * With vNMI we should never need an NMI window
+	 * (we can always inject vNMI either by setting VNMI_PENDING or by EVENTINJ)
 	 */
+	if (WARN_ON_ONCE(is_vnmi_enabled(svm)))
+		return;
+
 	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
-	svm->nmi_singlestep = true;
 	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+	svm->nmi_singlestep = true;
 }
 
 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
@@ -4770,6 +4823,8 @@  static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.patch_hypercall = svm_patch_hypercall,
 	.inject_irq = svm_inject_irq,
 	.inject_nmi = svm_inject_nmi,
+	.get_hw_nmi_pending = svm_get_hw_nmi_pending,
+	.set_hw_nmi_pending = svm_set_hw_nmi_pending,
 	.inject_exception = svm_inject_exception,
 	.cancel_injection = svm_cancel_injection,
 	.interrupt_allowed = svm_interrupt_allowed,
@@ -5058,6 +5113,16 @@  static __init int svm_hardware_setup(void)
 			pr_info("Virtual GIF supported\n");
 	}
 
+
+	vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_AMD_VNMI);
+	if (vnmi)
+		pr_info("Virtual NMI enabled\n");
+
+	if (!vnmi) {
+		svm_x86_ops.get_hw_nmi_pending = NULL;
+		svm_x86_ops.set_hw_nmi_pending = NULL;
+	}
+
 	if (lbrv) {
 		if (!boot_cpu_has(X86_FEATURE_LBRV))
 			lbrv = false;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 587ddc150f9f34..0b7e1790fadde1 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -35,6 +35,7 @@  extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 extern bool npt_enabled;
 extern int vgif;
 extern bool intercept_smi;
+extern bool vnmi;
 
 enum avic_modes {
 	AVIC_MODE_NONE = 0,
@@ -553,6 +554,15 @@  static inline bool is_x2apic_msrpm_offset(u32 offset)
 	       (msr < (APIC_BASE_MSR + 0x100));
 }
 
+static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
+{
+	/* L1's vNMI is inhibited while nested guest is running */
+	if (is_guest_mode(&svm->vcpu))
+		return false;
+
+	return !!(svm->vmcb01.ptr->control.int_ctl & V_NMI_ENABLE);
+}
+
 /* svm.c */
 #define MSR_INVALID				0xffffffffU