
[v2,07/11] KVM: x86: add a delayed hardware NMI injection interface

Message ID 20221129193717.513824-8-mlevitsk@redhat.com (mailing list archive)
State New, archived
Series SVM: vNMI (with my fixes)

Commit Message

Maxim Levitsky Nov. 29, 2022, 7:37 p.m. UTC
This patch adds two new vendor callbacks:

- kvm_x86_get_hw_nmi_pending()
- kvm_x86_set_hw_nmi_pending()

Using these callbacks, KVM can take advantage of the
hardware's accelerated delayed NMI delivery (currently vNMI on SVM).

Once an NMI is made pending via this interface, it is assumed that
the hardware will deliver the NMI to the guest on its own once
all of the x86 conditions for NMI delivery are met.

Note that the 'kvm_x86_set_hw_nmi_pending()' callback is allowed
to fail, in which case a normal NMI injection will be attempted
when the NMI can be delivered (possibly by using an NMI window).

With vNMI, that can happen either if a vNMI is already pending or
if a nested guest is running.

When vNMI injection fails due to the 'vNMI is already pending'
condition, the new NMI will be dropped unless it can be
injected immediately, so no NMI window will be requested.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |  2 ++
 arch/x86/include/asm/kvm_host.h    | 15 ++++++++++++-
 arch/x86/kvm/x86.c                 | 36 ++++++++++++++++++++++++++----
 3 files changed, 48 insertions(+), 5 deletions(-)

Comments

Sean Christopherson Jan. 28, 2023, 1:09 a.m. UTC | #1
On Tue, Nov 29, 2022, Maxim Levitsky wrote:
> This patch adds two new vendor callbacks:

No "this patch" please, just say what it does.

> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 684a5519812fb2..46993ce61c92db 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -871,8 +871,13 @@ struct kvm_vcpu_arch {
>  	u64 tsc_scaling_ratio; /* current scaling ratio */
>  
>  	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
> -	unsigned nmi_pending; /* NMI queued after currently running handler */
> +
> +	unsigned int nmi_pending; /*
> +				   * NMI queued after currently running handler
> +				   * (not including a hardware pending NMI (e.g vNMI))
> +				   */

Put the block comment above.  I'd say collapse all of the comments about NMIs into
a single big block comment.

>  	bool nmi_injected;    /* Trying to inject an NMI this entry */
> +
>  	bool smi_pending;    /* SMI queued after currently running handler */
>  	u8 handling_intr_from_guest;
>  
> @@ -10015,13 +10022,34 @@ static void process_nmi(struct kvm_vcpu *vcpu)
>  	 * Otherwise, allow two (and we'll inject the first one immediately).
>  	 */
>  	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
> -		limit = 1;
> +		limit--;
> +
> +	/* Also if there is already a NMI hardware queued to be injected,
> +	 * decrease the limit again
> +	 */

	/*
	 * Block comment ...
	 */

> +	if (static_call(kvm_x86_get_hw_nmi_pending)(vcpu))

I'd prefer "is_hw_nmi_pending()" over "get", even if it means not pairing with
"set".  Though I think that's a good thing since they aren't perfect pairs.

> +		limit--;
>  
> -	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
> +	if (limit <= 0)
> +		return;
> +
> +	/* Attempt to use hardware NMI queueing */
> +	if (static_call(kvm_x86_set_hw_nmi_pending)(vcpu)) {
> +		limit--;
> +		nmi_to_queue--;
> +	}
> +
> +	vcpu->arch.nmi_pending += nmi_to_queue;
>  	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
>  	kvm_make_request(KVM_REQ_EVENT, vcpu);
>  }
>  
> +/* Return total number of NMIs pending injection to the VM */
> +int kvm_get_total_nmi_pending(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.nmi_pending + static_call(kvm_x86_get_hw_nmi_pending)(vcpu);

Nothing cares about the total count, this can just be;


	bool kvm_is_nmi_pending(struct kvm_vcpu *vcpu)
	{
		return vcpu->arch.nmi_pending ||
		       static_call(kvm_x86_is_hw_nmi_pending)(vcpu);
	}
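
The one caller in this patch that consumes the count (KVM_GET_VCPU_EVENTS) could
then presumably use the boolean directly, e.g.:

	events->nmi.pending = kvm_is_nmi_pending(vcpu);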


> +}
> +
>  void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
>  				       unsigned long *vcpu_bitmap)
>  {
> -- 
> 2.26.3
>
Sean Christopherson Jan. 31, 2023, 9:12 p.m. UTC | #2
On Sat, Jan 28, 2023, Sean Christopherson wrote:
> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
> > @@ -10015,13 +10022,34 @@ static void process_nmi(struct kvm_vcpu *vcpu)
> >  	 * Otherwise, allow two (and we'll inject the first one immediately).
> >  	 */
> >  	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
> > -		limit = 1;
> > +		limit--;
> > +
> > +	/* Also if there is already a NMI hardware queued to be injected,
> > +	 * decrease the limit again
> > +	 */
> 
> 	/*
> 	 * Block comment ...
> 	 */
> 
> > +	if (static_call(kvm_x86_get_hw_nmi_pending)(vcpu))
> 
> I'd prefer "is_hw_nmi_pending()" over "get", even if it means not pairing with
> "set".  Though I think that's a good thing since they aren't perfect pairs.

Thinking more, I vote for s/hw_nmi/vnmi.  "hardware" usually means actual hardware,
i.e. a pending NMI for the host.
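
For reference, a minimal sketch of how the renamed hooks might be declared in
kvm_x86_ops (names assumed from the suggestion above, not taken from the posted
patch), keeping the bool return so that the "set" variant can report failure:

	/* Returns true if a virtual NMI (e.g. vNMI) is already pending in hardware. */
	bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu);

	/* Attempts to pend a virtual NMI in hardware; returns false if that fails. */
	bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu);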
Sean Christopherson Jan. 31, 2023, 10:28 p.m. UTC | #3
On Tue, Nov 29, 2022, Maxim Levitsky wrote:
> @@ -5191,9 +5191,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
>  
>  	vcpu->arch.nmi_injected = events->nmi.injected;
>  	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
> -		vcpu->arch.nmi_pending = events->nmi.pending;
> +		atomic_add(events->nmi.pending, &vcpu->arch.nmi_queued);
> +
>  	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
>  
> +	process_nmi(vcpu);

Argh, having two process_nmi() calls is ugly (not blaming your code, it's KVM's
ABI that's ugly).  E.g. if we collapse this down, it becomes:

	process_nmi(vcpu);

	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
		<blah blah blah>
	}
	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);

	process_nmi(vcpu);

And the second mess is that V_NMI needs to be cleared.

The first process_nmi() effectively exists to (a) purge nmi_queued and (b) keep
nmi_pending if KVM_VCPUEVENT_VALID_NMI_PENDING is not set.  I think we can just
replace that with a set of nmi_queued, i.e.

	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
		vcpu->arch.nmi_pending = 0;
		atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
		process_nmi();
	}

because if nmi_queued is non-zero in the !KVM_VCPUEVENT_VALID_NMI_PENDING, then
there should already be a pending KVM_REQ_NMI.  Alternatively, replace process_nmi()
with a KVM_REQ_NMI request (that probably has my vote?).

If that works, can you do that in a separate patch?  Then this patch can tack on
a call to clear V_NMI.

>  	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
>  	    lapic_in_kernel(vcpu))
>  		vcpu->arch.apic->sipi_vector = events->sipi_vector;
> @@ -10008,6 +10011,10 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
>  static void process_nmi(struct kvm_vcpu *vcpu)
>  {
>  	unsigned limit = 2;
> +	int nmi_to_queue = atomic_xchg(&vcpu->arch.nmi_queued, 0);
> +
> +	if (!nmi_to_queue)
> +		return;
>  
>  	/*
>  	 * x86 is limited to one NMI running, and one NMI pending after it.
> @@ -10015,13 +10022,34 @@ static void process_nmi(struct kvm_vcpu *vcpu)
>  	 * Otherwise, allow two (and we'll inject the first one immediately).
>  	 */
>  	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
> -		limit = 1;
> +		limit--;
> +
> +	/* Also if there is already a NMI hardware queued to be injected,
> +	 * decrease the limit again
> +	 */
> +	if (static_call(kvm_x86_get_hw_nmi_pending)(vcpu))
> +		limit--;

I don't think this is correct.  If a vNMI is pending and NMIs are blocked, then
limit will end up '0' and KVM will fail to pend the additional NMI in software.
After much fiddling, and factoring in the above, how about this?

	unsigned limit = 2;

	/*
	 * x86 is limited to one NMI running, and one NMI pending after it.
	 * If an NMI is already in progress, limit further NMIs to just one.
	 * Otherwise, allow two (and we'll inject the first one immediately).
	 */
	if (vcpu->arch.nmi_injected) {
		/* vNMI counts as the "one pending NMI". */
		if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
			limit = 0;
		else
			limit = 1;
	} else if (static_call(kvm_x86_get_nmi_mask)(vcpu) ||
		   static_call(kvm_x86_is_vnmi_pending)(vcpu)) {
		limit = 1;
	}

	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);

	if (vcpu->arch.nmi_pending &&
	    static_call(kvm_x86_set_vnmi_pending)(vcpu))
		vcpu->arch.nmi_pending--;

	if (vcpu->arch.nmi_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

With the KVM_REQ_EVENT change in a separate prep patch:

--
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 31 Jan 2023 13:33:02 -0800
Subject: [PATCH] KVM: x86: Raise an event request when processing NMIs iff an
 NMI is pending

Don't raise KVM_REQ_EVENT if no NMIs are pending at the end of
process_nmi().  Finishing process_nmi() without a pending NMI will become
much more likely when KVM gains support for AMD's vNMI, which allows
pending vNMIs in hardware, i.e. doesn't require explicit injection.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 508074e47bc0..030136b6ebbd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10134,7 +10134,9 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+	if (vcpu->arch.nmi_pending)
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
 void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,

base-commit: 916b54a7688b0b9a1c48c708b848e4348c3ae2ab
--
Sean Christopherson Feb. 1, 2023, 12:06 a.m. UTC | #4
On Tue, Jan 31, 2023, Sean Christopherson wrote:
> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
> > @@ -10015,13 +10022,34 @@ static void process_nmi(struct kvm_vcpu *vcpu)
> >  	 * Otherwise, allow two (and we'll inject the first one immediately).
> >  	 */
> >  	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
> > -		limit = 1;
> > +		limit--;
> > +
> > +	/* Also if there is already a NMI hardware queued to be injected,
> > +	 * decrease the limit again
> > +	 */
> > +	if (static_call(kvm_x86_get_hw_nmi_pending)(vcpu))
> > +		limit--;
> 
> I don't think this is correct.  If a vNMI is pending and NMIs are blocked, then
> limit will end up '0' and KVM will fail to pend the additional NMI in software.

Scratch that, dropping the second NMI in this case is correct.  The "running" part
of the existing "x86 is limited to one NMI running, and one NMI pending after it"
confused me.  The "running" thing is really just a variant on NMIs being blocked.

I'd also like to avoid the double decrement logic.  Accounting the virtual NMI is
a very different thing than dealing with concurrent NMIs, I'd prefer to reflect
that in the code.

Any objection to folding in the below to end up with:

	unsigned limit;

	/*
	 * x86 is limited to one NMI pending, but because KVM can't react to
	 * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
	 * scheduled out, KVM needs to play nice with two queued NMIs showing
	 * up at the same time.  To handle this scenario, allow two NMIs to be
	 * (temporarily) pending so long as NMIs are not blocked and KVM is not
	 * waiting for a previous NMI injection to complete (which effectively
	 * blocks NMIs).  KVM will immediately inject one of the two NMIs, and
	 * will request an NMI window to handle the second NMI.
	 */
	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
		limit = 1;
	else
		limit = 2;

	/*
	 * Adjust the limit to account for pending virtual NMIs, which aren't
	 * tracked in vcpu->arch.nmi_pending.
	 */
	if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
		limit--;

	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);

	if (vcpu->arch.nmi_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

--
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 31 Jan 2023 16:02:21 -0800
Subject: [PATCH] KVM: x86: Tweak the code and comment related to handling
 concurrent NMIs

Tweak the code and comment that deals with concurrent NMIs to explicitly
call out that x86 allows exactly one pending NMI, but that KVM needs to
temporarily allow two pending NMIs in order to workaround the fact that
the target vCPU cannot immediately recognize an incoming NMI, unlike bare
metal.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 030136b6ebbd..fda09ba48b6b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10122,15 +10122,22 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
 
 static void process_nmi(struct kvm_vcpu *vcpu)
 {
-	unsigned limit = 2;
+	unsigned limit;
 
 	/*
-	 * x86 is limited to one NMI running, and one NMI pending after it.
-	 * If an NMI is already in progress, limit further NMIs to just one.
-	 * Otherwise, allow two (and we'll inject the first one immediately).
+	 * x86 is limited to one NMI pending, but because KVM can't react to
+	 * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
+	 * scheduled out, KVM needs to play nice with two queued NMIs showing
+	 * up at the same time.  To handle this scenario, allow two NMIs to be
+	 * (temporarily) pending so long as NMIs are not blocked and KVM is not
+	 * waiting for a previous NMI injection to complete (which effectively
+	 * blocks NMIs).  KVM will immediately inject one of the two NMIs, and
+	 * will request an NMI window to handle the second NMI.
 	 */
 	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
 		limit = 1;
+	else
+		limit = 2;
 
 	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);

base-commit: 07222b33fd1af78dca77c7c66db31477f1b87f0f
--
Shukla, Santosh Feb. 8, 2023, 9:32 a.m. UTC | #5
On 1/28/2023 6:39 AM, Sean Christopherson wrote:
> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
>> This patch adds two new vendor callbacks:
> 
> No "this patch" please, just say what it does.
> 
Sure.

>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 684a5519812fb2..46993ce61c92db 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -871,8 +871,13 @@ struct kvm_vcpu_arch {
>>  	u64 tsc_scaling_ratio; /* current scaling ratio */
>>  
>>  	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
>> -	unsigned nmi_pending; /* NMI queued after currently running handler */
>> +
>> +	unsigned int nmi_pending; /*
>> +				   * NMI queued after currently running handler
>> +				   * (not including a hardware pending NMI (e.g vNMI))
>> +				   */
> 
> Put the block comment above.  I'd say collapse all of the comments about NMIs into
> a single big block comment.
>

ok.
 
>>  	bool nmi_injected;    /* Trying to inject an NMI this entry */
>> +
>>  	bool smi_pending;    /* SMI queued after currently running handler */
>>  	u8 handling_intr_from_guest;
>>  
>> @@ -10015,13 +10022,34 @@ static void process_nmi(struct kvm_vcpu *vcpu)
>>  	 * Otherwise, allow two (and we'll inject the first one immediately).
>>  	 */
>>  	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
>> -		limit = 1;
>> +		limit--;
>> +
>> +	/* Also if there is already a NMI hardware queued to be injected,
>> +	 * decrease the limit again
>> +	 */
> 
> 	/*
> 	 * Block comment ...
> 	 */
>

ok.
 
>> +	if (static_call(kvm_x86_get_hw_nmi_pending)(vcpu))
> 
> I'd prefer "is_hw_nmi_pending()" over "get", even if it means not pairing with
> "set".  Though I think that's a good thing since they aren't perfect pairs.
>

Sure thing, will spin in v3.
 
>> +		limit--;
>>  
>> -	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
>> +	if (limit <= 0)
>> +		return;
>> +
>> +	/* Attempt to use hardware NMI queueing */
>> +	if (static_call(kvm_x86_set_hw_nmi_pending)(vcpu)) {
>> +		limit--;
>> +		nmi_to_queue--;
>> +	}
>> +
>> +	vcpu->arch.nmi_pending += nmi_to_queue;
>>  	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
>>  	kvm_make_request(KVM_REQ_EVENT, vcpu);
>>  }
>>  
>> +/* Return total number of NMIs pending injection to the VM */
>> +int kvm_get_total_nmi_pending(struct kvm_vcpu *vcpu)
>> +{
>> +	return vcpu->arch.nmi_pending + static_call(kvm_x86_get_hw_nmi_pending)(vcpu);
> 
> Nothing cares about the total count, this can just be;
> 
> 
> 	bool kvm_is_nmi_pending(struct kvm_vcpu *vcpu)
> 	{
> 		return vcpu->arch.nmi_pending ||
> 		       static_call(kvm_x86_is_hw_nmi_pending)(vcpu);
> 	}
>

Yes, this simplifies things.

Thanks,
Santosh
 
> 
>> +}
>> +
>>  void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
>>  				       unsigned long *vcpu_bitmap)
>>  {
>> -- 
>> 2.26.3
>>
Shukla, Santosh Feb. 8, 2023, 9:35 a.m. UTC | #6
On 2/1/2023 2:42 AM, Sean Christopherson wrote:
> On Sat, Jan 28, 2023, Sean Christopherson wrote:
>> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
>>> @@ -10015,13 +10022,34 @@ static void process_nmi(struct kvm_vcpu *vcpu)
>>>  	 * Otherwise, allow two (and we'll inject the first one immediately).
>>>  	 */
>>>  	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
>>> -		limit = 1;
>>> +		limit--;
>>> +
>>> +	/* Also if there is already a NMI hardware queued to be injected,
>>> +	 * decrease the limit again
>>> +	 */
>>
>> 	/*
>> 	 * Block comment ...
>> 	 */
>>
>>> +	if (static_call(kvm_x86_get_hw_nmi_pending)(vcpu))
>>
>> I'd prefer "is_hw_nmi_pending()" over "get", even if it means not pairing with
>> "set".  Though I think that's a good thing since they aren't perfect pairs.
> 
> Thinking more, I vote for s/hw_nmi/vnmi.  "hardware" usually means actual hardware,
> i.e. a pending NMI for the host.

Ah! Better.

Thanks,
Santosh
Shukla, Santosh Feb. 8, 2023, 9:43 a.m. UTC | #7
On 2/1/2023 3:58 AM, Sean Christopherson wrote:
> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
>> @@ -5191,9 +5191,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
>>  
>>  	vcpu->arch.nmi_injected = events->nmi.injected;
>>  	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
>> -		vcpu->arch.nmi_pending = events->nmi.pending;
>> +		atomic_add(events->nmi.pending, &vcpu->arch.nmi_queued);
>> +
>>  	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
>>  
>> +	process_nmi(vcpu);
> 
> Argh, having two process_nmi() calls is ugly (not blaming your code, it's KVM's
> ABI that's ugly).  E.g. if we collapse this down, it becomes:
> 
> 	process_nmi(vcpu);
> 
> 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
> 		<blah blah blah>
> 	}
> 	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
> 
> 	process_nmi(vcpu);
> 
> And the second mess is that V_NMI needs to be cleared.
> 

Can you please elaborate on "V_NMI cleared" scenario? Are you mentioning about V_NMI_MASK or svm->nmi_masked?

> The first process_nmi() effectively exists to (a) purge nmi_queued and (b) keep
> nmi_pending if KVM_VCPUEVENT_VALID_NMI_PENDING is not set.  I think we can just
> replace that with a set of nmi_queued, i.e.
> 
> 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
> 		vcpu->arch-nmi_pending = 0;	
> 		atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
> 		process_nmi();
> 
You mean replace above process_nmi() with kvm_make_request(KVM_REQ_NMI, vcpu), right?
I'll try with above proposal.

>	}
> 
> because if nmi_queued is non-zero in the !KVM_VCPUEVENT_VALID_NMI_PENDING, then
> there should already be a pending KVM_REQ_NMI.  Alternatively, replace process_nmi()
> with a KVM_REQ_NMI request (that probably has my vote?).
> 
> If that works, can you do that in a separate patch?  Then this patch can tack on
> a call to clear V_NMI.
> 
>>  	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
>>  	    lapic_in_kernel(vcpu))
>>  		vcpu->arch.apic->sipi_vector = events->sipi_vector;
>> @@ -10008,6 +10011,10 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
>>  static void process_nmi(struct kvm_vcpu *vcpu)
>>  {
>>  	unsigned limit = 2;
>> +	int nmi_to_queue = atomic_xchg(&vcpu->arch.nmi_queued, 0);
>> +
>> +	if (!nmi_to_queue)
>> +		return;
>>  
>>  	/*
>>  	 * x86 is limited to one NMI running, and one NMI pending after it.
>> @@ -10015,13 +10022,34 @@ static void process_nmi(struct kvm_vcpu *vcpu)
>>  	 * Otherwise, allow two (and we'll inject the first one immediately).
>>  	 */
>>  	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
>> -		limit = 1;
>> +		limit--;
>> +
>> +	/* Also if there is already a NMI hardware queued to be injected,
>> +	 * decrease the limit again
>> +	 */
>> +	if (static_call(kvm_x86_get_hw_nmi_pending)(vcpu))
>> +		limit--;
> 
> I don't think this is correct.  If a vNMI is pending and NMIs are blocked, then
> limit will end up '0' and KVM will fail to pend the additional NMI in software.
> After much fiddling, and factoring in the above, how about this?
> 
> 	unsigned limit = 2;
> 
> 	/*
> 	 * x86 is limited to one NMI running, and one NMI pending after it.
> 	 * If an NMI is already in progress, limit further NMIs to just one.
> 	 * Otherwise, allow two (and we'll inject the first one immediately).
> 	 */
> 	if (vcpu->arch.nmi_injected) {
> 		/* vNMI counts as the "one pending NMI". */
> 		if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
> 			limit = 0;
> 		else
> 			limit = 1;
> 	} else if (static_call(kvm_x86_get_nmi_mask)(vcpu) ||
> 		   static_call(kvm_x86_is_vnmi_pending)(vcpu)) {
> 		limit = 1;
> 	}
> 
> 	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
> 	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
> 
> 	if (vcpu->arch.nmi_pending &&
> 	    static_call(kvm_x86_set_vnmi_pending)(vcpu))
> 		vcpu->arch.nmi_pending--;
> 
> 	if (vcpu->arch.nmi_pending)
> 		kvm_make_request(KVM_REQ_EVENT, vcpu);
> 
> With the KVM_REQ_EVENT change in a separate prep patch:
> 
> --
> From: Sean Christopherson <seanjc@google.com>
> Date: Tue, 31 Jan 2023 13:33:02 -0800
> Subject: [PATCH] KVM: x86: Raise an event request when processing NMIs iff an
>  NMI is pending
> 
> Don't raise KVM_REQ_EVENT if no NMIs are pending at the end of
> process_nmi().  Finishing process_nmi() without a pending NMI will become
> much more likely when KVM gains support for AMD's vNMI, which allows
> pending vNMIs in hardware, i.e. doesn't require explicit injection.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/x86.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 508074e47bc0..030136b6ebbd 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10134,7 +10134,9 @@ static void process_nmi(struct kvm_vcpu *vcpu)
>  
>  	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
>  	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
> -	kvm_make_request(KVM_REQ_EVENT, vcpu);
> +
> +	if (vcpu->arch.nmi_pending)
> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
>  }
>  
>  void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
> 

Looks good to me, will include as a separate patch.

Thanks,
Santosh
Shukla, Santosh Feb. 8, 2023, 9:51 a.m. UTC | #8
On 2/1/2023 5:36 AM, Sean Christopherson wrote:
> On Tue, Jan 31, 2023, Sean Christopherson wrote:
>> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
>>> @@ -10015,13 +10022,34 @@ static void process_nmi(struct kvm_vcpu *vcpu)
>>>  	 * Otherwise, allow two (and we'll inject the first one immediately).
>>>  	 */
>>>  	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
>>> -		limit = 1;
>>> +		limit--;
>>> +
>>> +	/* Also if there is already a NMI hardware queued to be injected,
>>> +	 * decrease the limit again
>>> +	 */
>>> +	if (static_call(kvm_x86_get_hw_nmi_pending)(vcpu))
>>> +		limit--;
>>
>> I don't think this is correct.  If a vNMI is pending and NMIs are blocked, then
>> limit will end up '0' and KVM will fail to pend the additional NMI in software.
> 
> Scratch that, dropping the second NMI in this case is correct.  The "running" part
> of the existing "x86 is limited to one NMI running, and one NMI pending after it"
> confused me.  The "running" thing is really just a variant on NMIs being blocked.
> 
> I'd also like to avoid the double decrement logic.  Accounting the virtual NMI is
> a very different thing than dealing with concurrent NMIs, I'd prefer to reflect
> that in the code.
> 
> Any objection to folding in the below to end up with:
> 
> 	unsigned limit;
> 
> 	/*
> 	 * x86 is limited to one NMI pending, but because KVM can't react to
> 	 * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
> 	 * scheduled out, KVM needs to play nice with two queued NMIs showing
> 	 * up at the same time.  To handle this scenario, allow two NMIs to be
> 	 * (temporarily) pending so long as NMIs are not blocked and KVM is not
> 	 * waiting for a previous NMI injection to complete (which effectively
> 	 * blocks NMIs).  KVM will immediately inject one of the two NMIs, and
> 	 * will request an NMI window to handle the second NMI.
> 	 */
> 	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
> 		limit = 1;
> 	else
> 		limit = 2;
> 
> 	/*
> 	 * Adjust the limit to account for pending virtual NMIs, which aren't
> 	 * tracked in vcpu->arch.nmi_pending.
> 	 */
> 	if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
> 		limit--;
> 
> 	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
> 	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
> 

I believe you missed the below hunk from the function -

	if (vcpu->arch.nmi_pending &&
	    static_call(kvm_x86_set_vnmi_pending)(vcpu))
		vcpu->arch.nmi_pending--;

Or am I missing something? Please suggest.

> 	if (vcpu->arch.nmi_pending)
> 		kvm_make_request(KVM_REQ_EVENT, vcpu);
> 
> --
> From: Sean Christopherson <seanjc@google.com>
> Date: Tue, 31 Jan 2023 16:02:21 -0800
> Subject: [PATCH] KVM: x86: Tweak the code and comment related to handling
>  concurrent NMIs
> 
> Tweak the code and comment that deals with concurrent NMIs to explicitly
> call out that x86 allows exactly one pending NMI, but that KVM needs to
> temporarily allow two pending NMIs in order to workaround the fact that
> the target vCPU cannot immediately recognize an incoming NMI, unlike bare
> metal.
> 
> No functional change intended.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/x86.c | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 030136b6ebbd..fda09ba48b6b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10122,15 +10122,22 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
>  
>  static void process_nmi(struct kvm_vcpu *vcpu)
>  {
> -	unsigned limit = 2;
> +	unsigned limit;
>  
>  	/*
> -	 * x86 is limited to one NMI running, and one NMI pending after it.
> -	 * If an NMI is already in progress, limit further NMIs to just one.
> -	 * Otherwise, allow two (and we'll inject the first one immediately).
> +	 * x86 is limited to one NMI pending, but because KVM can't react to
> +	 * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
> +	 * scheduled out, KVM needs to play nice with two queued NMIs showing
> +	 * up at the same time.  To handle this scenario, allow two NMIs to be
> +	 * (temporarily) pending so long as NMIs are not blocked and KVM is not
> +	 * waiting for a previous NMI injection to complete (which effectively
> +	 * blocks NMIs).  KVM will immediately inject one of the two NMIs, and
> +	 * will request an NMI window to handle the second NMI.
>  	 */
>  	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
>  		limit = 1;
> +	else
> +		limit = 2;
>  
>  	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
>  	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
> 

Looks good to me, will include in v3.

Thanks,
Santosh
Sean Christopherson Feb. 8, 2023, 4:06 p.m. UTC | #9
On Wed, Feb 08, 2023, Santosh Shukla wrote:
> On 2/1/2023 3:58 AM, Sean Christopherson wrote:
> > On Tue, Nov 29, 2022, Maxim Levitsky wrote:
> >> @@ -5191,9 +5191,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
> >>  
> >>  	vcpu->arch.nmi_injected = events->nmi.injected;
> >>  	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
> >> -		vcpu->arch.nmi_pending = events->nmi.pending;
> >> +		atomic_add(events->nmi.pending, &vcpu->arch.nmi_queued);
> >> +
> >>  	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
> >>  
> >> +	process_nmi(vcpu);
> > 
> > Argh, having two process_nmi() calls is ugly (not blaming your code, it's KVM's
> > ABI that's ugly).  E.g. if we collapse this down, it becomes:
> > 
> > 	process_nmi(vcpu);
> > 
> > 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
> > 		<blah blah blah>
> > 	}
> > 	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
> > 
> > 	process_nmi(vcpu);
> > 
> > And the second mess is that V_NMI needs to be cleared.
> > 
> 
> Can you please elaborate on "V_NMI cleared" scenario? Are you mentioning
> about V_NMI_MASK or svm->nmi_masked?

V_NMI_MASK.  KVM needs to purge any pending virtual NMIs when userspace sets
vCPU event state and KVM_VCPUEVENT_VALID_NMI_PENDING is set.
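
For illustration only (the helper and int_ctl bit names below are assumed, not
taken from the posted patch), the SVM side of that purge could look roughly like:

	/* Sketch: drop an NMI that is pending in hardware (vNMI). */
	static void svm_clear_vnmi_pending(struct vcpu_svm *svm)
	{
		if (is_vnmi_enabled(svm))
			svm->vmcb->control.int_ctl &= ~V_NMI_PENDING;
	}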

> > The first process_nmi() effectively exists to (a) purge nmi_queued and (b) keep
> > nmi_pending if KVM_VCPUEVENT_VALID_NMI_PENDING is not set.  I think we can just
> > replace that with a set of nmi_queued, i.e.
> > 
> > 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
> > 		vcpu->arch.nmi_pending = 0;
> > 		atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
> > 		process_nmi();
> > 
> You mean replace above process_nmi() with kvm_make_request(KVM_REQ_NMI, vcpu), right?
> I'll try with above proposal.

Yep, if that works.  Actually, that might be a requirement.  There's a

  static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);

lurking below this.  Invoking process_nmi() before NMI blocking is updated could
result in KVM incorrectly dropping/keeping NMIs.  I don't think it would be a
problem in practice since KVM saves only one NMI, but userspace could stuff NMIs.

Huh.  The existing code is buggy.  events->nmi.pending is a u8, and
arch.nmi_pending is an unsigned int.  KVM doesn't cap the incoming value, so
userspace could set up to 255 pending NMIs.  The extra weird part is that the extra
NMIs will get dropped the next time KVM stumbles through process_nmi().

Amusingly, KVM only saves one pending NMI, i.e. in a true migration scenario KVM
may drop an NMI.

  events->nmi.pending = vcpu->arch.nmi_pending != 0;

The really amusing part is that that code was added by 7460fb4a3400 ("KVM: Fix
simultaneous NMIs").  The only thing I can figure is that KVM_GET_VCPU_EVENTS was
somewhat blindly updated without much thought about what should actually happen.

So, can you slide the below in early in the series?  Then in this series, convert
to the above suggested flow (zero nmi_pending, stuff nmi_queued) in another patch?

From: Sean Christopherson <seanjc@google.com>
Date: Wed, 8 Feb 2023 07:44:16 -0800
Subject: [PATCH] KVM: x86: Save/restore all NMIs when multiple NMIs are
 pending

Save all pending NMIs in KVM_GET_VCPU_EVENTS, and queue KVM_REQ_NMI if one
or more NMIs are pending after KVM_SET_VCPU_EVENTS in order to re-evaluate
pending NMIs with respect to NMI blocking.

KVM allows multiple NMIs to be pending in order to faithfully emulate bare
metal handling of simultaneous NMIs (on bare metal, truly simultaneous
NMIs are impossible, i.e. one will always arrive first and be consumed).
Support for simultaneous NMIs botched the save/restore though.  KVM only
saves one pending NMI, but allows userspace to restore 255 pending NMIs
as kvm_vcpu_events.nmi.pending is a u8, and KVM's internal state is stored
in an unsigned int.

7460fb4a3400 ("KVM: Fix simultaneous NMIs")
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 508074e47bc0..e9339acbf82a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5115,7 +5115,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
 
 	events->nmi.injected = vcpu->arch.nmi_injected;
-	events->nmi.pending = vcpu->arch.nmi_pending != 0;
+	events->nmi.pending = vcpu->arch.nmi_pending;
 	events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
 
 	/* events->sipi_vector is never valid when reporting to user space */
@@ -5202,8 +5202,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 						events->interrupt.shadow);
 
 	vcpu->arch.nmi_injected = events->nmi.injected;
-	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
+	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
 		vcpu->arch.nmi_pending = events->nmi.pending;
+		if (vcpu->arch.nmi_pending)
+			kvm_make_request(KVM_REQ_NMI, vcpu);
+	}
 	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
 
 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&

base-commit: 6c77ae716d546d71b21f0c9ee7d405314a3f3f9e
--
Sean Christopherson Feb. 8, 2023, 4:09 p.m. UTC | #10
On Wed, Feb 08, 2023, Santosh Shukla wrote:
> On 2/1/2023 5:36 AM, Sean Christopherson wrote:
> > On Tue, Jan 31, 2023, Sean Christopherson wrote:
> >> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
> >>> @@ -10015,13 +10022,34 @@ static void process_nmi(struct kvm_vcpu *vcpu)
> >>>  	 * Otherwise, allow two (and we'll inject the first one immediately).
> >>>  	 */
> >>>  	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
> >>> -		limit = 1;
> >>> +		limit--;
> >>> +
> >>> +	/* Also if there is already a NMI hardware queued to be injected,
> >>> +	 * decrease the limit again
> >>> +	 */
> >>> +	if (static_call(kvm_x86_get_hw_nmi_pending)(vcpu))
> >>> +		limit--;
> >>
> >> I don't think this is correct.  If a vNMI is pending and NMIs are blocked, then
> >> limit will end up '0' and KVM will fail to pend the additional NMI in software.
> > 
> > Scratch that, dropping the second NMI in this case is correct.  The "running" part
> > of the existing "x86 is limited to one NMI running, and one NMI pending after it"
> > confused me.  The "running" thing is really just a variant on NMIs being blocked.
> > 
> > I'd also like to avoid the double decrement logic.  Accounting the virtual NMI is
> > a very different thing than dealing with concurrent NMIs, I'd prefer to reflect
> > that in the code.
> > 
> > Any objection to folding in the below to end up with:
> > 
> > 	unsigned limit;
> > 
> > 	/*
> > 	 * x86 is limited to one NMI pending, but because KVM can't react to
> > 	 * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
> > 	 * scheduled out, KVM needs to play nice with two queued NMIs showing
> > 	 * up at the same time.  To handle this scenario, allow two NMIs to be
> > 	 * (temporarily) pending so long as NMIs are not blocked and KVM is not
> > 	 * waiting for a previous NMI injection to complete (which effectively
> > 	 * blocks NMIs).  KVM will immediately inject one of the two NMIs, and
> > 	 * will request an NMI window to handle the second NMI.
> > 	 */
> > 	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
> > 		limit = 1;
> > 	else
> > 		limit = 2;
> > 
> > 	/*
> > 	 * Adjust the limit to account for pending virtual NMIs, which aren't
> > 	 * tracked in vcpu->arch.nmi_pending.
> > 	 */
> > 	if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
> > 		limit--;
> > 
> > 	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
> > 	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
> > 
> 
> I believe you missed the below hunk from the function -
> 
> 	if (vcpu->arch.nmi_pending &&
> 	    static_call(kvm_x86_set_vnmi_pending(vcpu)))
> 		vcpu->arch.nmi_pending--;
> 
> Or am I missing something? Please suggest.

You're not missing anything, I'm pretty sure I just lost track of things.
Shukla, Santosh Feb. 14, 2023, 10:22 a.m. UTC | #11
On 2/8/2023 9:36 PM, Sean Christopherson wrote:
> On Wed, Feb 08, 2023, Santosh Shukla wrote:
>> On 2/1/2023 3:58 AM, Sean Christopherson wrote:
>>> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
>>>> @@ -5191,9 +5191,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
>>>>  
>>>>  	vcpu->arch.nmi_injected = events->nmi.injected;
>>>>  	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
>>>> -		vcpu->arch.nmi_pending = events->nmi.pending;
>>>> +		atomic_add(events->nmi.pending, &vcpu->arch.nmi_queued);
>>>> +
>>>>  	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
>>>>  
>>>> +	process_nmi(vcpu);
>>>
>>> Argh, having two process_nmi() calls is ugly (not blaming your code, it's KVM's
>>> ABI that's ugly).  E.g. if we collapse this down, it becomes:
>>>
>>> 	process_nmi(vcpu);
>>>
>>> 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
>>> 		<blah blah blah>
>>> 	}
>>> 	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
>>>
>>> 	process_nmi(vcpu);
>>>
>>> And the second mess is that V_NMI needs to be cleared.
>>>
>>
>> Can you please elaborate on "V_NMI cleared" scenario? Are you mentioning
>> about V_NMI_MASK or svm->nmi_masked?
> 
> V_NMI_MASK.  KVM needs to purge any pending virtual NMIs when userspace sets
> vCPU event state and KVM_VCPUEVENT_VALID_NMI_PENDING is set.
> 

As per the APM: V_NMI_MASK is managed by the processor
"
V_NMI_MASK: Indicates whether virtual NMIs are masked. The processor will set V_NMI_MASK
once it takes the virtual NMI. V_NMI_MASK is cleared when the guest successfully completes an
IRET instruction or #VMEXIT occurs while delivering the virtual NMI
"

In my initial implementation I had changed V_NMI_MASK for the SMM scenario [1].
This is also not required as HW will save the V_NMI/V_NMI_MASK on 
SMM entry and restore them on RSM.

That said the svm_{get,set}_nmi_mask will look something like:

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a9e9bfbffd72..08911a33cf1e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3635,13 +3635,21 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)

 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
-       return to_svm(vcpu)->nmi_masked;
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (is_vnmi_enabled(svm))
+               return svm->vmcb->control.int_ctl & V_NMI_MASK;
+       else
+               return svm->nmi_masked;
 }

 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 {
        struct vcpu_svm *svm = to_svm(vcpu);

+       if (is_vnmi_enabled(svm))
+               return;
+
        if (masked) {
                svm->nmi_masked = true;
                svm_set_iret_intercept(svm);

Are there any inputs on the above approach?

[1] https://lore.kernel.org/all/20220810061226.1286-4-santosh.shukla@amd.com/

>>> The first process_nmi() effectively exists to (a) purge nmi_queued and (b) keep
>>> nmi_pending if KVM_VCPUEVENT_VALID_NMI_PENDING is not set.  I think we can just
> >>> replace that with a set of nmi_queued, i.e.
>>>
>>> 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
>>> 		vcpu->arch-nmi_pending = 0;	
>>> 		atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
>>> 		process_nmi();
>>>
>> You mean replace above process_nmi() with kvm_make_request(KVM_REQ_NMI, vcpu), right?
>> I'll try with above proposal.
> 
> Yep, if that works.  Actually, that might be a requirement.  There's a
> 
>   static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
> 
> lurking below this.  Invoking process_nmi() before NMI blocking is updated could
> result in KVM incorrectly dropping/keeping NMIs.  I don't think it would be a
> problem in practice since KVM saves only one NMI, but userspace could stuff NMIs.
> 
> Huh.  The existing code is buggy.  events->nmi.pending is a u8, and
> arch.nmi_pending is an unsigned int.  KVM doesn't cap the incoming value, so
> userspace could set up to 255 pending NMIs.  The extra weird part is that the extra
> NMIs will get dropped the next time KVM stumbles through process_nmi().
> 
> Amusingly, KVM only saves one pending NMI, i.e. in a true migration scenario KVM
> may drop an NMI.
> 
>   events->nmi.pending = vcpu->arch.nmi_pending != 0;
> 
> The really amusing part is that that code was added by 7460fb4a3400 ("KVM: Fix
> simultaneous NMIs").  The only thing I can figure is that KVM_GET_VCPU_EVENTS was
> somewhat blindly updated without much thought about what should actually happen.
> 
> So, can you slide the below in early in the series?  Then in this series, convert
> to the above suggested flow (zero nmi_pending, stuff nmi_queued) in another patch?
> 
> From: Sean Christopherson <seanjc@google.com>
> Date: Wed, 8 Feb 2023 07:44:16 -0800
> Subject: [PATCH] KVM: x86: Save/restore all NMIs when multiple NMIs are
>  pending
> 
> Save all pending NMIs in KVM_GET_VCPU_EVENTS, and queue KVM_REQ_NMI if one
> or more NMIs are pending after KVM_SET_VCPU_EVENTS in order to re-evaluate
> pending NMIs with respect to NMI blocking.
> 
> KVM allows multiple NMIs to be pending in order to faithfully emulate bare
> metal handling of simultaneous NMIs (on bare metal, truly simultaneous
> NMIs are impossible, i.e. one will always arrive first and be consumed).
> Support for simultaneous NMIs botched the save/restore though.  KVM only
> saves one pending NMI, but allows userspace to restore 255 pending NMIs
> as kvm_vcpu_events.nmi.pending is a u8, and KVM's internal state is stored
> in an unsigned int.
> 
> 7460fb4a3400 ("KVM: Fix simultaneous NMIs")
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/x86.c | 7 +++++--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 508074e47bc0..e9339acbf82a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5115,7 +5115,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
>  	events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
>  
>  	events->nmi.injected = vcpu->arch.nmi_injected;
> -	events->nmi.pending = vcpu->arch.nmi_pending != 0;
> +	events->nmi.pending = vcpu->arch.nmi_pending;
>  	events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
>  
>  	/* events->sipi_vector is never valid when reporting to user space */
> @@ -5202,8 +5202,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
>  						events->interrupt.shadow);
>  
>  	vcpu->arch.nmi_injected = events->nmi.injected;
> -	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
> +	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
>  		vcpu->arch.nmi_pending = events->nmi.pending;
> +		if (vcpu->arch.nmi_pending)
> +			kvm_make_request(KVM_REQ_NMI, vcpu);
> +	}
>  	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
>  
>  	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
> 

Ok.

On top of the above, I am including your suggested change as below...

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e0855599df65..437a6cea3bc7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5201,9 +5201,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,

        vcpu->arch.nmi_injected = events->nmi.injected;
        if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
-               vcpu->arch.nmi_pending = events->nmi.pending;
-               if (vcpu->arch.nmi_pending)
-                       kvm_make_request(KVM_REQ_NMI, vcpu);
+               vcpu->arch.nmi_pending = 0;
+               atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
+               kvm_make_request(KVM_REQ_NMI, vcpu);
        }
        static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);

does that make sense?

> base-commit: 6c77ae716d546d71b21f0c9ee7d405314a3f3f9e
Sean Christopherson Feb. 15, 2023, 10:43 p.m. UTC | #12
On Tue, Feb 14, 2023, Santosh Shukla wrote:
> On 2/8/2023 9:36 PM, Sean Christopherson wrote:
> > On Wed, Feb 08, 2023, Santosh Shukla wrote:
> >> On 2/1/2023 3:58 AM, Sean Christopherson wrote:
> >>> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
> >>>> @@ -5191,9 +5191,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
> >>>>  
> >>>>  	vcpu->arch.nmi_injected = events->nmi.injected;
> >>>>  	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
> >>>> -		vcpu->arch.nmi_pending = events->nmi.pending;
> >>>> +		atomic_add(events->nmi.pending, &vcpu->arch.nmi_queued);
> >>>> +
> >>>>  	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
> >>>>  
> >>>> +	process_nmi(vcpu);
> >>>
> >>> Argh, having two process_nmi() calls is ugly (not blaming your code, it's KVM's
> >>> ABI that's ugly).  E.g. if we collapse this down, it becomes:
> >>>
> >>> 	process_nmi(vcpu);
> >>>
> >>> 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
> >>> 		<blah blah blah>
> >>> 	}
> >>> 	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
> >>>
> >>> 	process_nmi(vcpu);
> >>>
> >>> And the second mess is that V_NMI needs to be cleared.
> >>>
> >>
> >> Can you please elaborate on "V_NMI cleared" scenario? Are you mentioning
> >> about V_NMI_MASK or svm->nmi_masked?
> > 
> > V_NMI_MASK.  KVM needs to purge any pending virtual NMIs when userspace sets
> > vCPU event state and KVM_VCPUEVENT_VALID_NMI_PENDING is set.
> > 
> 
> As per the APM: V_NMI_MASK is managed by the processor

Heh, we're running afoul of KVM's bad terminology conflicting with the APM's
terminology.  By V_NMI_MASK, I meant "KVM's V_NMI_MASK", a.k.a. the flag that says
whether or not there's a pending NMI.


However...

> "
> V_NMI_MASK: Indicates whether virtual NMIs are masked. The processor will set V_NMI_MASK
> once it takes the virtual NMI. V_NMI_MASK is cleared when the guest successfully completes an
> IRET instruction or #VMEXIT occurs while delivering the virtual NMI
> "
>
> In my initial implementation I had changed V_NMI_MASK for the SMM scenario [1],
> This is also not required as HW will save the V_NMI/V_NMI_MASK on 
> SMM entry and restore them on RSM.
> 
> That said the svm_{get,set}_nmi_mask will look something like:
> 
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index a9e9bfbffd72..08911a33cf1e 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -3635,13 +3635,21 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
> 
>  static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
>  {
> -       return to_svm(vcpu)->nmi_masked;
> +       struct vcpu_svm *svm = to_svm(vcpu);
> +
> +       if (is_vnmi_enabled(svm))
> +               return svm->vmcb->control.int_ctl & V_NMI_MASK;
> +       else
> +               return svm->nmi_masked;
>  }
> 
>  static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
>  {
>         struct vcpu_svm *svm = to_svm(vcpu);
> 
> +       if (is_vnmi_enabled(svm))
> +               return;
> +
>         if (masked) {
>                 svm->nmi_masked = true;
>                 svm_set_iret_intercept(svm);
> 
> is there any inputs on above approach?

What happens if software clears the "NMIs are blocked" flag?  If KVM can't clear
the flag, then we've got problems.  E.g. if KVM emulates IRET or SMI+RSM.  And I
believe there are use cases that use KVM to snapshot and reload vCPU state,
e.g. record+replay?, in which case KVM_SET_VCPU_EVENTS needs to be able to adjust
NMI blocking too.

> On top of the above, I am including your suggested change as below...
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e0855599df65..437a6cea3bc7 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5201,9 +5201,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
> 
>         vcpu->arch.nmi_injected = events->nmi.injected;
>         if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
> -               vcpu->arch.nmi_pending = events->nmi.pending;
> -               if (vcpu->arch.nmi_pending)
> -                       kvm_make_request(KVM_REQ_NMI, vcpu);
> +               vcpu->arch.nmi_pending = 0;
> +               atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
> +               kvm_make_request(KVM_REQ_NMI, vcpu);
>         }
>         static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
> 
> does that make sense?

Yep!
Sean Christopherson Feb. 16, 2023, 12:22 a.m. UTC | #13
On Wed, Feb 15, 2023, Sean Christopherson wrote:
> On Tue, Feb 14, 2023, Santosh Shukla wrote:
> > "
> > V_NMI_MASK: Indicates whether virtual NMIs are masked. The processor will set V_NMI_MASK
> > once it takes the virtual NMI. V_NMI_MASK is cleared when the guest successfully completes an
> > IRET instruction or #VMEXIT occurs while delivering the virtual NMI
> > "
> >
> > In my initial implementation I had changed V_NMI_MASK for the SMM scenario [1],
> > This is also not required as HW will save the V_NMI/V_NMI_MASK on 
> > SMM entry and restore them on RSM.
> > 
> > That said the svm_{get,set}_nmi_mask will look something like:

...

> >  static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
> >  {
> >         struct vcpu_svm *svm = to_svm(vcpu);
> > 
> > +       if (is_vnmi_enabled(svm))
> > +               return;
> > +
> >         if (masked) {
> >                 svm->nmi_masked = true;
> >                 svm_set_iret_intercept(svm);
> > 
> > is there any inputs on above approach?
> 
> What happens if software clears the "NMIs are blocked" flag?  If KVM can't clear
> the flag, then we've got problems.  E.g. if KVM emulates IRET or SMI+RSM.  And I
> believe there are use cases that use KVM to snapshot and reload vCPU state,
> e.g. record+replay?, in which case KVM_SET_VCPU_EVENTS needs to be able to adjust
> NMI blocking too.

Actually, what am I thinking.  Any type of state save/restore will need to stuff
NMI blocking.  E.g. live migration of a VM that is handling an NMI (V_NMI_MASK=1)
_and_ has a pending NMI (V_NMI=1) absolutely needs to set V_NMI_MASK=1 on the dest,
otherwise the pending NMI will get serviced when the guest expects NMIs to be blocked.
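
A minimal sketch of what that implies for the mask setter, assuming the V_NMI_MASK
bit name quoted earlier in the thread (this is not the posted code):

	static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
	{
		struct vcpu_svm *svm = to_svm(vcpu);

		if (is_vnmi_enabled(svm)) {
			/* Propagate the requested blocking state to the vNMI bit. */
			if (masked)
				svm->vmcb->control.int_ctl |= V_NMI_MASK;
			else
				svm->vmcb->control.int_ctl &= ~V_NMI_MASK;
			return;
		}

		/* ... existing svm->nmi_masked / IRET intercept handling ... */
	}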
Shukla, Santosh Feb. 17, 2023, 7:56 a.m. UTC | #14
On 2/16/2023 5:52 AM, Sean Christopherson wrote:
> On Wed, Feb 15, 2023, Sean Christopherson wrote:
>> On Tue, Feb 14, 2023, Santosh Shukla wrote:
>>> "
>>> V_NMI_MASK: Indicates whether virtual NMIs are masked. The processor will set V_NMI_MASK
>>> once it takes the virtual NMI. V_NMI_MASK is cleared when the guest successfully completes an
>>> IRET instruction or #VMEXIT occurs while delivering the virtual NMI
>>> "
>>>
>>> In my initial implementation I had changed V_NMI_MASK for the SMM scenario [1],
>>> This is also not required as HW will save the V_NMI/V_NMI_MASK on 
>>> SMM entry and restore them on RSM.
>>>
>>> That said the svm_{get,set}_nmi_mask will look something like:
> 
> ...
> 
>>>  static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
>>>  {
>>>         struct vcpu_svm *svm = to_svm(vcpu);
>>>
>>> +       if (is_vnmi_enabled(svm))
>>> +               return;
>>> +
>>>         if (masked) {
>>>                 svm->nmi_masked = true;
>>>                 svm_set_iret_intercept(svm);
>>>
>>> is there any inputs on above approach?
>>
>> What happens if software clears the "NMIs are blocked" flag?  If KVM can't clear
>> the flag, then we've got problems.  E.g. if KVM emulates IRET or SMI+RSM.  And I
>> believe there are use cases that use KVM to snapshot and reload vCPU state,
>> e.g. record+replay?, in which case KVM_SET_VCPU_EVENTS needs to be able to adjust
>> NMI blocking too.
> 
> Actually, what am I thinking.  Any type of state save/restore will need to stuff
> NMI blocking.  E.g. live migration of a VM that is handling an NMI (V_NMI_MASK=1)
> _and_ has a pending NMI (V_NMI=1) absolutely needs to set V_NMI_MASK=1 on the dest,
> otherwise the pending NMI will get serviced when the guest expects NMIs to be blocked.

Sure, makes sense. Will include V_NMI_BLOCKING_MASK set/clear in svm_set_nmi_mask() in v3
and will soon share patches for review.

Thanks.
Maxim Levitsky Feb. 24, 2023, 2:39 p.m. UTC | #15
On Sat, 2023-01-28 at 01:09 +0000, Sean Christopherson wrote:
> On Tue, Nov 29, 2022, Maxim Levitsky wrote:
> > This patch adds two new vendor callbacks:
> 
> No "this patch" please, just say what it does.
> 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 684a5519812fb2..46993ce61c92db 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -871,8 +871,13 @@ struct kvm_vcpu_arch {
> >  	u64 tsc_scaling_ratio; /* current scaling ratio */
> >  
> >  	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
> > -	unsigned nmi_pending; /* NMI queued after currently running handler */
> > +
> > +	unsigned int nmi_pending; /*
> > +				   * NMI queued after currently running handler
> > +				   * (not including a hardware pending NMI (e.g vNMI))
> > +				   */
> 
> Put the block comment above.  I'd say collapse all of the comments about NMIs into
> a single big block comment.
> 
> >  	bool nmi_injected;    /* Trying to inject an NMI this entry */
> > +
> >  	bool smi_pending;    /* SMI queued after currently running handler */
> >  	u8 handling_intr_from_guest;
> >  
> > @@ -10015,13 +10022,34 @@ static void process_nmi(struct kvm_vcpu *vcpu)
> >  	 * Otherwise, allow two (and we'll inject the first one immediately).
> >  	 */
> >  	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
> > -		limit = 1;
> > +		limit--;
> > +
> > +	/* Also if there is already a NMI hardware queued to be injected,
> > +	 * decrease the limit again
> > +	 */
> 
> 	/*
> 	 * Block comment ...
> 	 */
> 
> > +	if (static_call(kvm_x86_get_hw_nmi_pending)(vcpu))
> 
> I'd prefer "is_hw_nmi_pending()" over "get", even if it means not pairing with
> "set".  Though I think that's a good thing since they aren't perfect pairs.
> 
> > +		limit--;
> >  
> > -	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
> > +	if (limit <= 0)
> > +		return;
> > +
> > +	/* Attempt to use hardware NMI queueing */
> > +	if (static_call(kvm_x86_set_hw_nmi_pending)(vcpu)) {
> > +		limit--;
> > +		nmi_to_queue--;
> > +	}
> > +
> > +	vcpu->arch.nmi_pending += nmi_to_queue;
> >  	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
> >  	kvm_make_request(KVM_REQ_EVENT, vcpu);
> >  }
> >  
> > +/* Return total number of NMIs pending injection to the VM */
> > +int kvm_get_total_nmi_pending(struct kvm_vcpu *vcpu)
> > +{
> > +	return vcpu->arch.nmi_pending + static_call(kvm_x86_get_hw_nmi_pending)(vcpu);
> 
> Nothing cares about the total count, this can just be;

I wanted to have the interface to be a bit more generic so that in theory you could have
more than one hardware NMI pending. I don't care much about it.


Best regards,
	Maxim Levitsky

> 
> 
> 	bool kvm_is_nmi_pending(struct kvm_vcpu *vcpu)
> 	{
> 		return vcpu->arch.nmi_pending ||
> 		       static_call(kvm_x86_is_hw_nmi_pending)(vcpu);
> 	}
> 
> 
> > +}
> > +
> >  void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
> >  				       unsigned long *vcpu_bitmap)
> >  {
> > -- 
> > 2.26.3
> >

Patch

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index abccd51dcfca1b..9e2db6cf7cc041 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -67,6 +67,8 @@  KVM_X86_OP(get_interrupt_shadow)
 KVM_X86_OP(patch_hypercall)
 KVM_X86_OP(inject_irq)
 KVM_X86_OP(inject_nmi)
+KVM_X86_OP_OPTIONAL_RET0(get_hw_nmi_pending)
+KVM_X86_OP_OPTIONAL_RET0(set_hw_nmi_pending)
 KVM_X86_OP(inject_exception)
 KVM_X86_OP(cancel_injection)
 KVM_X86_OP(interrupt_allowed)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 684a5519812fb2..46993ce61c92db 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -871,8 +871,13 @@  struct kvm_vcpu_arch {
 	u64 tsc_scaling_ratio; /* current scaling ratio */
 
 	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
-	unsigned nmi_pending; /* NMI queued after currently running handler */
+
+	unsigned int nmi_pending; /*
+				   * NMI queued after currently running handler
+				   * (not including a hardware pending NMI (e.g vNMI))
+				   */
 	bool nmi_injected;    /* Trying to inject an NMI this entry */
+
 	bool smi_pending;    /* SMI queued after currently running handler */
 	u8 handling_intr_from_guest;
 
@@ -1602,6 +1607,13 @@  struct kvm_x86_ops {
 	int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
 	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
 	void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
+
+	/* returns true, if a NMI is pending injection on hardware level (e.g vNMI) */
+	bool (*get_hw_nmi_pending)(struct kvm_vcpu *vcpu);
+
+	/* attempts make a NMI pending via hardware interface (e.g vNMI) */
+	bool (*set_hw_nmi_pending)(struct kvm_vcpu *vcpu);
+
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
@@ -1964,6 +1976,7 @@  int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
 void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+int kvm_get_total_nmi_pending(struct kvm_vcpu *vcpu);
 
 void kvm_update_dr7(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 85d2a12c214dda..3c30e3f1106f79 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5103,7 +5103,7 @@  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
 
 	events->nmi.injected = vcpu->arch.nmi_injected;
-	events->nmi.pending = vcpu->arch.nmi_pending != 0;
+	events->nmi.pending = kvm_get_total_nmi_pending(vcpu) != 0;
 	events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
 
 	/* events->sipi_vector is never valid when reporting to user space */
@@ -5191,9 +5191,12 @@  static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 
 	vcpu->arch.nmi_injected = events->nmi.injected;
 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
-		vcpu->arch.nmi_pending = events->nmi.pending;
+		atomic_add(events->nmi.pending, &vcpu->arch.nmi_queued);
+
 	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
 
+	process_nmi(vcpu);
+
 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
 	    lapic_in_kernel(vcpu))
 		vcpu->arch.apic->sipi_vector = events->sipi_vector;
@@ -10008,6 +10011,10 @@  static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
 static void process_nmi(struct kvm_vcpu *vcpu)
 {
 	unsigned limit = 2;
+	int nmi_to_queue = atomic_xchg(&vcpu->arch.nmi_queued, 0);
+
+	if (!nmi_to_queue)
+		return;
 
 	/*
 	 * x86 is limited to one NMI running, and one NMI pending after it.
@@ -10015,13 +10022,34 @@  static void process_nmi(struct kvm_vcpu *vcpu)
 	 * Otherwise, allow two (and we'll inject the first one immediately).
 	 */
 	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
-		limit = 1;
+		limit--;
+
+	/* Also if there is already a NMI hardware queued to be injected,
+	 * decrease the limit again
+	 */
+	if (static_call(kvm_x86_get_hw_nmi_pending)(vcpu))
+		limit--;
 
-	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+	if (limit <= 0)
+		return;
+
+	/* Attempt to use hardware NMI queueing */
+	if (static_call(kvm_x86_set_hw_nmi_pending)(vcpu)) {
+		limit--;
+		nmi_to_queue--;
+	}
+
+	vcpu->arch.nmi_pending += nmi_to_queue;
 	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+/* Return total number of NMIs pending injection to the VM */
+int kvm_get_total_nmi_pending(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.nmi_pending + static_call(kvm_x86_get_hw_nmi_pending)(vcpu);
+}
+
 void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
 				       unsigned long *vcpu_bitmap)
 {