[v19,085/130] KVM: TDX: Complete interrupts after tdexit

Message ID aa6a927214a5d29d5591a0079f4374b05a82a03f.1708933498.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Series [v19,001/130] x86/virt/tdx: Rename _offset to _member for TD_SYSINFO_MAP() macro

Commit Message

Isaku Yamahata Feb. 26, 2024, 8:26 a.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

This corresponds to VMX __vmx_complete_interrupts().  Because TDX
virtualize vAPIC, KVM only needs to care NMI injection.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
---
v19:
- move tdvps_management_check() to this patch
- typo: complete -> Complete in short log
---
 arch/x86/kvm/vmx/tdx.c | 10 ++++++++++
 arch/x86/kvm/vmx/tdx.h |  4 ++++
 2 files changed, 14 insertions(+)

Comments

Reinette Chatre April 16, 2024, 6:23 p.m. UTC | #1
Hi Isaku,

(In shortlog "tdexit" can be "TD exit" to be consistent with
documentation.)

On 2/26/2024 12:26 AM, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> This corresponds to VMX __vmx_complete_interrupts().  Because TDX
> virtualize vAPIC, KVM only needs to care NMI injection.

This seems to be the first appearance of NMI and the changelog
is very brief. How about expanding it with:

"This corresponds to VMX __vmx_complete_interrupts().  Because TDX
 virtualize vAPIC, KVM only needs to care about NMI injection.

 KVM can request TDX to inject an NMI into a guest TD vCPU when the
 vCPU is not active. TDX will attempt to inject an NMI as soon as
 possible on TD entry. NMI injection is managed by writing to (to
 inject NMI) and reading from (to get status of NMI injection)
 the PEND_NMI field within the TDX vCPU scope metadata (Trust
 Domain Virtual Processor State (TDVPS)).

 Update KVM's NMI status on TD exit by checking whether a requested
 NMI has been injected into the TD. Reading the metadata via SEAMCALL
 is expensive so only perform the check if an NMI was injected.

 This is the first need to access vCPU scope metadata in the
 "management" class. Ensure that needed accessor is available. 
"

> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> ---
> v19:
> - move tdvps_management_check() to this patch
> - typo: complete -> Complete in short log
> ---
>  arch/x86/kvm/vmx/tdx.c | 10 ++++++++++
>  arch/x86/kvm/vmx/tdx.h |  4 ++++
>  2 files changed, 14 insertions(+)
> 
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 83dcaf5b6fbd..b8b168f74dfe 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -535,6 +535,14 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>  	 */
>  }
>  
> +static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
> +{
> +	/* Avoid costly SEAMCALL if no nmi was injected */

	/* Avoid costly SEAMCALL if no NMI was injected. */

> +	if (vcpu->arch.nmi_injected)
> +		vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
> +							      TD_VCPU_PEND_NMI);
> +}
> +
>  struct tdx_uret_msr {
>  	u32 msr;
>  	unsigned int slot;
> @@ -663,6 +671,8 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu)
>  	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
>  	trace_kvm_exit(vcpu, KVM_ISA_VMX);
>  
> +	tdx_complete_interrupts(vcpu);
> +
>  	return EXIT_FASTPATH_NONE;
>  }
>  
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 44eab734e702..0d8a98feb58e 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -142,6 +142,8 @@ static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
>  			 "Invalid TD VMCS access for 16-bit field");
>  }
>  
> +static __always_inline void tdvps_management_check(u64 field, u8 bits) {}

Is this intended to be a stub or is it expected to be fleshed out with
some checks?

> +
>  #define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass)				\
>  static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx,	\
>  							u32 field)		\
> @@ -200,6 +202,8 @@ TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs);
>  TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs);
>  TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
>  
> +TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
> +
>  static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
>  {
>  	struct tdx_module_args out;

Reinette
Isaku Yamahata April 17, 2024, 6:56 a.m. UTC | #2
On Tue, Apr 16, 2024 at 11:23:01AM -0700,
Reinette Chatre <reinette.chatre@intel.com> wrote:

> Hi Isaku,
> 
> (In shortlog "tdexit" can be "TD exit" to be consistent with
> documentation.)
> 
> On 2/26/2024 12:26 AM, isaku.yamahata@intel.com wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > 
> > This corresponds to VMX __vmx_complete_interrupts().  Because TDX
> > virtualize vAPIC, KVM only needs to care NMI injection.
> 
> This seems to be the first appearance of NMI and the changelog
> is very brief. How about expanding it with:
> 
> "This corresponds to VMX __vmx_complete_interrupts().  Because TDX
>  virtualize vAPIC, KVM only needs to care about NMI injection.
> 
>  KVM can request TDX to inject an NMI into a guest TD vCPU when the
>  vCPU is not active. TDX will attempt to inject an NMI as soon as
>  possible on TD entry. NMI injection is managed by writing to (to
>  inject NMI) and reading from (to get status of NMI injection)
>  the PEND_NMI field within the TDX vCPU scope metadata (Trust
>  Domain Virtual Processor State (TDVPS)).
> 
>  Update KVM's NMI status on TD exit by checking whether a requested
>  NMI has been injected into the TD. Reading the metadata via SEAMCALL
>  is expensive so only perform the check if an NMI was injected.
> 
>  This is the first need to access vCPU scope metadata in the
>  "management" class. Ensure that needed accessor is available. 
> "
> 
> > 
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> > Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> > ---
> > v19:
> > - move tdvps_management_check() to this patch
> > - typo: complete -> Complete in short log
> > ---
> >  arch/x86/kvm/vmx/tdx.c | 10 ++++++++++
> >  arch/x86/kvm/vmx/tdx.h |  4 ++++
> >  2 files changed, 14 insertions(+)
> > 
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 83dcaf5b6fbd..b8b168f74dfe 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -535,6 +535,14 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> >  	 */
> >  }
> >  
> > +static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
> > +{
> > +	/* Avoid costly SEAMCALL if no nmi was injected */
> 
> 	/* Avoid costly SEAMCALL if no NMI was injected. */
> 
> > +	if (vcpu->arch.nmi_injected)
> > +		vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
> > +							      TD_VCPU_PEND_NMI);
> > +}
> > +
> >  struct tdx_uret_msr {
> >  	u32 msr;
> >  	unsigned int slot;
> > @@ -663,6 +671,8 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu)
> >  	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
> >  	trace_kvm_exit(vcpu, KVM_ISA_VMX);
> >  
> > +	tdx_complete_interrupts(vcpu);
> > +
> >  	return EXIT_FASTPATH_NONE;
> >  }
> >  
> > diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> > index 44eab734e702..0d8a98feb58e 100644
> > --- a/arch/x86/kvm/vmx/tdx.h
> > +++ b/arch/x86/kvm/vmx/tdx.h
> > @@ -142,6 +142,8 @@ static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
> >  			 "Invalid TD VMCS access for 16-bit field");
> >  }
> >  
> > +static __always_inline void tdvps_management_check(u64 field, u8 bits) {}
> 
> Is this intended to be a stub or is it expected to be fleshed out with
> some checks?

It was used to check if the field id matches the access size in bits.
We should make tdvps_vmcs_check() common for vmcs, management and
state_non_arch.
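
A possible shape for that common check, as a sketch only (the
tdvps_field_width() helper below is hypothetical; it would decode the
access size from the metadata field ID, whose real encoding comes from
the TDX module ABI and is not shown in this patch):

	#define TDX_BUILD_TDVPS_CHECK(uclass, lclass)					\
	static __always_inline void tdvps_##lclass##_check(u64 field, u8 bits)		\
	{										\
		BUILD_BUG_ON_MSG(tdvps_field_width(field) &&				\
				 tdvps_field_width(field) != bits,			\
				 "Invalid TD " #uclass " access width");		\
	}

	TDX_BUILD_TDVPS_CHECK(VMCS, vmcs);
	TDX_BUILD_TDVPS_CHECK(MANAGEMENT, management);
	TDX_BUILD_TDVPS_CHECK(STATE_NON_ARCH, state_non_arch);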
Binbin Wu April 23, 2024, 1:15 p.m. UTC | #3
On 4/17/2024 2:23 AM, Reinette Chatre wrote:
> Hi Isaku,
>
> (In shortlog "tdexit" can be "TD exit" to be consistent with
> documentation.)
>
> On 2/26/2024 12:26 AM, isaku.yamahata@intel.com wrote:
>> From: Isaku Yamahata <isaku.yamahata@intel.com>
>>
>> This corresponds to VMX __vmx_complete_interrupts().  Because TDX
>> virtualize vAPIC, KVM only needs to care NMI injection.
> This seems to be the first appearance of NMI and the changelog
> is very brief. How about expanding it with:
>
> "This corresponds to VMX __vmx_complete_interrupts().  Because TDX
>   virtualize vAPIC, KVM only needs to care about NMI injection.
   ^
   virtualizes

Also, does it need to mention that non-NMI interrupts are handled by the
posted-interrupt mechanism?

For example:

"This corresponds to VMX __vmx_complete_interrupts().  Because TDX
  virtualizes vAPIC, and non-NMI interrupts are delivered using the
  posted-interrupt mechanism, KVM only needs to care about NMI injection.
...
"

>
>   KVM can request TDX to inject an NMI into a guest TD vCPU when the
>   vCPU is not active. TDX will attempt to inject an NMI as soon as
>   possible on TD entry. NMI injection is managed by writing to (to
>   inject NMI) and reading from (to get status of NMI injection)
>   the PEND_NMI field within the TDX vCPU scope metadata (Trust
>   Domain Virtual Processor State (TDVPS)).
>
>   Update KVM's NMI status on TD exit by checking whether a requested
>   NMI has been injected into the TD. Reading the metadata via SEAMCALL
>   is expensive so only perform the check if an NMI was injected.
>
>   This is the first need to access vCPU scope metadata in the
>   "management" class. Ensure that needed accessor is available.
> "
>
Reinette Chatre April 23, 2024, 2:48 p.m. UTC | #4
On 4/23/2024 6:15 AM, Binbin Wu wrote:
> 
> 
> On 4/17/2024 2:23 AM, Reinette Chatre wrote:
>> Hi Isaku,
>>
>> (In shortlog "tdexit" can be "TD exit" to be consistent with
>> documentation.)
>>
>> On 2/26/2024 12:26 AM, isaku.yamahata@intel.com wrote:
>>> From: Isaku Yamahata <isaku.yamahata@intel.com>
>>>
>>> This corresponds to VMX __vmx_complete_interrupts().  Because TDX
>>> virtualize vAPIC, KVM only needs to care NMI injection.
>> This seems to be the first appearance of NMI and the changelog
>> is very brief. How about expanding it with:
>>
>> "This corresponds to VMX __vmx_complete_interrupts().  Because TDX
>>   virtualize vAPIC, KVM only needs to care about NMI injection.
>   ^
>   virtualizes
> 
> Also, does it need to mention that non-NMI interrupts are handled by the posted-interrupt mechanism?
> 
> For example:
> 
> "This corresponds to VMX __vmx_complete_interrupts().  Because TDX
>  virtualizes vAPIC, and non-NMI interrupts are delivered using the
>  posted-interrupt mechanism, KVM only needs to care about NMI injection.
> ...
> "
> 

Thank you Binbin. Looks good to me.

Reinette
Yuan Yao June 17, 2024, 8:07 a.m. UTC | #5
On Mon, Feb 26, 2024 at 12:26:27AM -0800, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> This corresponds to VMX __vmx_complete_interrupts().  Because TDX
> virtualize vAPIC, KVM only needs to care NMI injection.
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> ---
> v19:
> - move tdvps_management_check() to this patch
> - typo: complete -> Complete in short log
> ---
>  arch/x86/kvm/vmx/tdx.c | 10 ++++++++++
>  arch/x86/kvm/vmx/tdx.h |  4 ++++
>  2 files changed, 14 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 83dcaf5b6fbd..b8b168f74dfe 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -535,6 +535,14 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>  	 */
>  }
>
> +static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
> +{
> +	/* Avoid costly SEAMCALL if no nmi was injected */
> +	if (vcpu->arch.nmi_injected)
> +		vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
> +							      TD_VCPU_PEND_NMI);
> +}

Looks like this leads to an NMI injection delay, or the NMI even won't be
reinjected if KVM_REQ_EVENT is not set on the target cpu
when more than one NMI is pending there.

On a normal VM, KVM uses the NMI-window vmexit in the
successful-injection case to raise KVM_REQ_EVENT again for the remaining
pending NMIs, see handle_nmi_window(). KVM also checks the
vectoring info after VMEXIT for the case where the NMI was not
injected successfully in this vmentry/vmexit round, and
raises KVM_REQ_EVENT to try again, see __vmx_complete_interrupts().

In TDX, considering there's no way to get the vectoring info or
handle an NMI-window vmexit, the check below should cover both
scenarios for NMI injection:

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index e9c9a185bb7b..9edf446acd3b 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -835,9 +835,12 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
 {
        /* Avoid costly SEAMCALL if no nmi was injected */
-       if (vcpu->arch.nmi_injected)
+       if (vcpu->arch.nmi_injected) {
                vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
                                                              TD_VCPU_PEND_NMI);
+               if (vcpu->arch.nmi_injected || vcpu->arch.nmi_pending)
+                       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       }
 }

> +
>  struct tdx_uret_msr {
>  	u32 msr;
>  	unsigned int slot;
> @@ -663,6 +671,8 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu)
>  	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
>  	trace_kvm_exit(vcpu, KVM_ISA_VMX);
>
> +	tdx_complete_interrupts(vcpu);
> +
>  	return EXIT_FASTPATH_NONE;
>  }
>
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 44eab734e702..0d8a98feb58e 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -142,6 +142,8 @@ static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
>  			 "Invalid TD VMCS access for 16-bit field");
>  }
>
> +static __always_inline void tdvps_management_check(u64 field, u8 bits) {}
> +
>  #define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass)				\
>  static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx,	\
>  							u32 field)		\
> @@ -200,6 +202,8 @@ TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs);
>  TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs);
>  TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
>
> +TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
> +
>  static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
>  {
>  	struct tdx_module_args out;
> --
> 2.25.1
>
>
Binbin Wu June 17, 2024, 9:07 a.m. UTC | #6
On 6/17/2024 4:07 PM, Yuan Yao wrote:
> On Mon, Feb 26, 2024 at 12:26:27AM -0800, isaku.yamahata@intel.com wrote:
>> From: Isaku Yamahata <isaku.yamahata@intel.com>
>>
>> This corresponds to VMX __vmx_complete_interrupts().  Because TDX
>> virtualize vAPIC, KVM only needs to care NMI injection.
>>
>> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
>> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
>> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
>> ---
>> v19:
>> - move tdvps_management_check() to this patch
>> - typo: complete -> Complete in short log
>> ---
>>   arch/x86/kvm/vmx/tdx.c | 10 ++++++++++
>>   arch/x86/kvm/vmx/tdx.h |  4 ++++
>>   2 files changed, 14 insertions(+)
>>
>> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
>> index 83dcaf5b6fbd..b8b168f74dfe 100644
>> --- a/arch/x86/kvm/vmx/tdx.c
>> +++ b/arch/x86/kvm/vmx/tdx.c
>> @@ -535,6 +535,14 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>>   	 */
>>   }
>>
>> +static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
>> +{
>> +	/* Avoid costly SEAMCALL if no nmi was injected */
>> +	if (vcpu->arch.nmi_injected)
>> +		vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
>> +							      TD_VCPU_PEND_NMI);
>> +}
> Looks like this leads to an NMI injection delay, or the NMI even won't be
> reinjected if KVM_REQ_EVENT is not set on the target cpu
> when more than one NMI is pending there.
>
> On a normal VM, KVM uses the NMI-window vmexit in the
> successful-injection case to raise KVM_REQ_EVENT again for the remaining
> pending NMIs, see handle_nmi_window(). KVM also checks the
> vectoring info after VMEXIT for the case where the NMI was not
> injected successfully in this vmentry/vmexit round, and
> raises KVM_REQ_EVENT to try again, see __vmx_complete_interrupts().
>
> In TDX, considering there's no way to get the vectoring info or
> handle an NMI-window vmexit, the check below should cover both
> scenarios for NMI injection:
>
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index e9c9a185bb7b..9edf446acd3b 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -835,9 +835,12 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>   static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
>   {
>          /* Avoid costly SEAMCALL if no nmi was injected */
> -       if (vcpu->arch.nmi_injected)
> +       if (vcpu->arch.nmi_injected) {
>                  vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
>                                                                TD_VCPU_PEND_NMI);
> +               if (vcpu->arch.nmi_injected || vcpu->arch.nmi_pending)
> +                       kvm_make_request(KVM_REQ_EVENT, vcpu);

For nmi_injected, it should be OK because TD_VCPU_PEND_NMI is still set.
But for nmi_pending, it should be checked and the event raised.

I remember there was a discussion in the following link:
https://lore.kernel.org/kvm/20240402065254.GY2444378@ls.amr.corp.intel.com/
It said tdx_vcpu_run() will ignore force_immediate_exit.
If force_immediate_exit is ignored for TDX, then the nmi_pending handling
could still be delayed if the previous NMI was injected successfully.


> +       }
>   }
>
>> +
>>   struct tdx_uret_msr {
>>   	u32 msr;
>>   	unsigned int slot;
>> @@ -663,6 +671,8 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu)
>>   	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
>>   	trace_kvm_exit(vcpu, KVM_ISA_VMX);
>>
>> +	tdx_complete_interrupts(vcpu);
>> +
>>   	return EXIT_FASTPATH_NONE;
>>   }
>>
>> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
>> index 44eab734e702..0d8a98feb58e 100644
>> --- a/arch/x86/kvm/vmx/tdx.h
>> +++ b/arch/x86/kvm/vmx/tdx.h
>> @@ -142,6 +142,8 @@ static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
>>   			 "Invalid TD VMCS access for 16-bit field");
>>   }
>>
>> +static __always_inline void tdvps_management_check(u64 field, u8 bits) {}
>> +
>>   #define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass)				\
>>   static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx,	\
>>   							u32 field)		\
>> @@ -200,6 +202,8 @@ TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs);
>>   TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs);
>>   TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
>>
>> +TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
>> +
>>   static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
>>   {
>>   	struct tdx_module_args out;
>> --
>> 2.25.1
>>
>>
Yuan Yao June 18, 2024, 3:28 a.m. UTC | #7
On Mon, Jun 17, 2024 at 05:07:56PM +0800, Binbin Wu wrote:
>
>
> On 6/17/2024 4:07 PM, Yuan Yao wrote:
> > On Mon, Feb 26, 2024 at 12:26:27AM -0800, isaku.yamahata@intel.com wrote:
> > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > >
> > > This corresponds to VMX __vmx_complete_interrupts().  Because TDX
> > > virtualize vAPIC, KVM only needs to care NMI injection.
> > >
> > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > > Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> > > Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> > > ---
> > > v19:
> > > - move tdvps_management_check() to this patch
> > > - typo: complete -> Complete in short log
> > > ---
> > >   arch/x86/kvm/vmx/tdx.c | 10 ++++++++++
> > >   arch/x86/kvm/vmx/tdx.h |  4 ++++
> > >   2 files changed, 14 insertions(+)
> > >
> > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > index 83dcaf5b6fbd..b8b168f74dfe 100644
> > > --- a/arch/x86/kvm/vmx/tdx.c
> > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > @@ -535,6 +535,14 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> > >   	 */
> > >   }
> > >
> > > +static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
> > > +{
> > > +	/* Avoid costly SEAMCALL if no nmi was injected */
> > > +	if (vcpu->arch.nmi_injected)
> > > +		vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
> > > +							      TD_VCPU_PEND_NMI);
> > > +}
> > Looks like this leads to an NMI injection delay, or the NMI even won't be
> > reinjected if KVM_REQ_EVENT is not set on the target cpu
> > when more than one NMI is pending there.
> >
> > On a normal VM, KVM uses the NMI-window vmexit in the
> > successful-injection case to raise KVM_REQ_EVENT again for the remaining
> > pending NMIs, see handle_nmi_window(). KVM also checks the
> > vectoring info after VMEXIT for the case where the NMI was not
> > injected successfully in this vmentry/vmexit round, and
> > raises KVM_REQ_EVENT to try again, see __vmx_complete_interrupts().
> >
> > In TDX, considering there's no way to get the vectoring info or
> > handle an NMI-window vmexit, the check below should cover both
> > scenarios for NMI injection:
> >
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index e9c9a185bb7b..9edf446acd3b 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -835,9 +835,12 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> >   static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
> >   {
> >          /* Avoid costly SEAMCALL if no nmi was injected */
> > -       if (vcpu->arch.nmi_injected)
> > +       if (vcpu->arch.nmi_injected) {
> >                  vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
> >                                                                TD_VCPU_PEND_NMI);
> > +               if (vcpu->arch.nmi_injected || vcpu->arch.nmi_pending)
> > +                       kvm_make_request(KVM_REQ_EVENT, vcpu);
>
> For nmi_injected, it should be OK because TD_VCPU_PEND_NMI is still set.
> But for nmi_pending, it should be checked and the event raised.

Right, I just forgot the tdx module can do more than "hardware":

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index e9c9a185bb7b..3530a4882efc 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -835,9 +835,16 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
 {
        /* Avoid costly SEAMCALL if no nmi was injected */
-       if (vcpu->arch.nmi_injected)
+       if (vcpu->arch.nmi_injected) {
                vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
                                                              TD_VCPU_PEND_NMI);
+               /*
+                * tdx module will retry injection in case of TD_VCPU_PEND_NMI,
+                * so don't need to set KVM_REQ_EVENT for it again.
+                */
+               if (!vcpu->arch.nmi_injected && vcpu->arch.nmi_pending)
+                       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       }
 }

>
> I remember there was a discussion in the following link:
> https://lore.kernel.org/kvm/20240402065254.GY2444378@ls.amr.corp.intel.com/
> It said tdx_vcpu_run() will ignore force_immediate_exit.
> If force_immediate_exit is ignored for TDX, then the nmi_pending handling
> could still be delayed if the previous NMI was injected successfully.

Yes, not sure about the possibility of meeting this in a real use
case, but I know it happens in some testing, e.g. the kvm
unit test's multiple NMI testing.

>
>
> > +       }
> >   }
> >
> > > +
> > >   struct tdx_uret_msr {
> > >   	u32 msr;
> > >   	unsigned int slot;
> > > @@ -663,6 +671,8 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu)
> > >   	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
> > >   	trace_kvm_exit(vcpu, KVM_ISA_VMX);
> > >
> > > +	tdx_complete_interrupts(vcpu);
> > > +
> > >   	return EXIT_FASTPATH_NONE;
> > >   }
> > >
> > > diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> > > index 44eab734e702..0d8a98feb58e 100644
> > > --- a/arch/x86/kvm/vmx/tdx.h
> > > +++ b/arch/x86/kvm/vmx/tdx.h
> > > @@ -142,6 +142,8 @@ static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
> > >   			 "Invalid TD VMCS access for 16-bit field");
> > >   }
> > >
> > > +static __always_inline void tdvps_management_check(u64 field, u8 bits) {}
> > > +
> > >   #define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass)				\
> > >   static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx,	\
> > >   							u32 field)		\
> > > @@ -200,6 +202,8 @@ TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs);
> > >   TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs);
> > >   TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
> > >
> > > +TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
> > > +
> > >   static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
> > >   {
> > >   	struct tdx_module_args out;
> > > --
> > > 2.25.1
> > >
> > >
>
Binbin Wu July 8, 2024, 6:11 a.m. UTC | #8
On 6/18/2024 11:28 AM, Yuan Yao wrote:
> On Mon, Jun 17, 2024 at 05:07:56PM +0800, Binbin Wu wrote:
>>
>> On 6/17/2024 4:07 PM, Yuan Yao wrote:
>>> On Mon, Feb 26, 2024 at 12:26:27AM -0800, isaku.yamahata@intel.com wrote:
>>>> From: Isaku Yamahata <isaku.yamahata@intel.com>
>>>>
>>>> This corresponds to VMX __vmx_complete_interrupts().  Because TDX
>>>> virtualize vAPIC, KVM only needs to care NMI injection.
>>>>
>>>> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
>>>> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
>>>> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
>>>> ---
>>>> v19:
>>>> - move tdvps_management_check() to this patch
>>>> - typo: complete -> Complete in short log
>>>> ---
>>>>    arch/x86/kvm/vmx/tdx.c | 10 ++++++++++
>>>>    arch/x86/kvm/vmx/tdx.h |  4 ++++
>>>>    2 files changed, 14 insertions(+)
>>>>
>>>> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
>>>> index 83dcaf5b6fbd..b8b168f74dfe 100644
>>>> --- a/arch/x86/kvm/vmx/tdx.c
>>>> +++ b/arch/x86/kvm/vmx/tdx.c
>>>> @@ -535,6 +535,14 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>>>>    	 */
>>>>    }
>>>>
>>>> +static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	/* Avoid costly SEAMCALL if no nmi was injected */
>>>> +	if (vcpu->arch.nmi_injected)
>>>> +		vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
>>>> +							      TD_VCPU_PEND_NMI);
>>>> +}
>>> Looks like this leads to an NMI injection delay, or the NMI even won't be
>>> reinjected if KVM_REQ_EVENT is not set on the target cpu
>>> when more than one NMI is pending there.
>>>
>>> On a normal VM, KVM uses the NMI-window vmexit in the
>>> successful-injection case to raise KVM_REQ_EVENT again for the remaining
>>> pending NMIs, see handle_nmi_window(). KVM also checks the
>>> vectoring info after VMEXIT for the case where the NMI was not
>>> injected successfully in this vmentry/vmexit round, and
>>> raises KVM_REQ_EVENT to try again, see __vmx_complete_interrupts().
>>>
>>> In TDX, considering there's no way to get the vectoring info or
>>> handle an NMI-window vmexit, the check below should cover both
>>> scenarios for NMI injection:
>>>
>>> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
>>> index e9c9a185bb7b..9edf446acd3b 100644
>>> --- a/arch/x86/kvm/vmx/tdx.c
>>> +++ b/arch/x86/kvm/vmx/tdx.c
>>> @@ -835,9 +835,12 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>>>    static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
>>>    {
>>>           /* Avoid costly SEAMCALL if no nmi was injected */
>>> -       if (vcpu->arch.nmi_injected)
>>> +       if (vcpu->arch.nmi_injected) {
>>>                   vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
>>>                                                                 TD_VCPU_PEND_NMI);
>>> +               if (vcpu->arch.nmi_injected || vcpu->arch.nmi_pending)
>>> +                       kvm_make_request(KVM_REQ_EVENT, vcpu);
>> For nmi_injected, it should be OK because TD_VCPU_PEND_NMI is still set.
>> But for nmi_pending, it should be checked and the event raised.
> Right, I just forgot the tdx module can do more than "hardware":
>
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index e9c9a185bb7b..3530a4882efc 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -835,9 +835,16 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>   static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
>   {
>          /* Avoid costly SEAMCALL if no nmi was injected */
> -       if (vcpu->arch.nmi_injected)
> +       if (vcpu->arch.nmi_injected) {
>                  vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
>                                                                TD_VCPU_PEND_NMI);
> +               /*
> +                * tdx module will retry injection in case of TD_VCPU_PEND_NMI,
> +                * so don't need to set KVM_REQ_EVENT for it again.
> +                */
> +               if (!vcpu->arch.nmi_injected && vcpu->arch.nmi_pending)
> +                       kvm_make_request(KVM_REQ_EVENT, vcpu);
> +       }
>   }
>
>> I remember there was a discussion in the following link:
>> https://lore.kernel.org/kvm/20240402065254.GY2444378@ls.amr.corp.intel.com/
>> It said tdx_vcpu_run() will ignore force_immediate_exit.
>> If force_immediate_exit is ignored for TDX, then the nmi_pending handling
>> could still be delayed if the previous NMI was injected successfully.
> Yes, not sure about the possibility of meeting this in a real use
> case, but I know it happens in some testing, e.g. the kvm
> unit test's multiple NMI testing.

Delaying the pending NMI to the next VM exit will cause a problem.
In the current Linux kernel NMI handling code, back-to-back NMIs are
checked when handling an unknown NMI.
Here are the comments in arch/x86/kernel/nmi.c:
         /*
          * Only one NMI can be latched at a time.  To handle
          * this we may process multiple nmi handlers at once to
          * cover the case where an NMI is dropped.  The downside
          * to this approach is we may process an NMI prematurely,
          * while its real NMI is sitting latched.  This will cause
          * an unknown NMI on the next run of the NMI processing.
          *
          * We tried to flag that condition above, by setting the
          * swallow_nmi flag when we process more than one event.
          * This condition is also only present on the second half
          * of a back-to-back NMI, so we flag that condition too.
          *
          * If both are true, we assume we already processed this
          * NMI previously and we swallow it. ...
          */
Assume there are two NMIs pending in KVM, i.e. nmi_pending is 2.
KVM injects one NMI by setting the TD_VCPU_PEND_NMI field and
nmi_pending is decreased to 1.
The remaining pending NMI will be delayed until the next VM exit, so it
will not be detected as the second half of a back-to-back NMI in the guest.
It will then be considered a real unknown NMI, and no one may handle it
(because its work could already have been done in the previous NMI handler).
In the end, the guest kernel will fire an error message for the "unhandled"
unknown NMI, and even panic if unknown_nmi_panic or
panic_on_unrecovered_nmi is set to true.


Since KVM doesn't have the capability to get the NMI blocking status or
request an NMI-window exit for TDX, how about limiting nmi_pending to 1
for TDX?
I.e. if TD_VCPU_PEND_NMI is not set, limit nmi_pending to 1 in
process_nmi();
     if TD_VCPU_PEND_NMI is set, limit nmi_pending to 0 in process_nmi().
A sketch of this idea follows.
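
A rough sketch, modeled on KVM's process_nmi() in arch/x86/kvm/x86.c
(is_td_vcpu() and the tdx_pend_nmi_is_set() helper are assumptions here,
and the non-TDX limit logic is simplified):

	static void process_nmi(struct kvm_vcpu *vcpu)
	{
		unsigned int limit;

		if (is_td_vcpu(vcpu))
			/* PEND_NMI holds at most one virtual NMI at a time. */
			limit = tdx_pend_nmi_is_set(vcpu) ? 0 : 1;
		else
			limit = 2;

		vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
		vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);

		if (vcpu->arch.nmi_pending)
			kvm_make_request(KVM_REQ_EVENT, vcpu);
	}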

I checked some history about when the nmi_pending limit was changed to 2.
The discussion in the
link https://lore.kernel.org/all/4E723A8A.7050405@redhat.com/ said:
" ... the NMI handlers are now being reworked to handle
just one NMI source (hopefully the cheapest) in the handler, and if we
detect a back-to-back NMI, handle all possible NMI sources."
IIUC, the change in NMI handlers described above refers to the
patch set "x86, nmi: new NMI handling routines"
https://lore.kernel.org/all/1317409584-23662-1-git-send-email-dzickus@redhat.com/

I noticed that in v6 of the patch series there was an optimization, but
it was removed in v7.
v6 link: https://lore.kernel.org/all/1316805435-14832-5-git-send-email-dzickus@redhat.com/
v7 link: https://lore.kernel.org/all/1317409584-23662-5-git-send-email-dzickus@redhat.com/
The optimization code in v6, removed in v7:
-static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs)
+static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
 {
 	struct nmi_desc *desc = nmi_to_desc(type);
 	struct nmiaction *a;
@@ -89,6 +89,15 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs)

 		handled += a->handler(type, regs);

+		/*
+		 * Optimization: only loop once if this is not a
+		 * back-to-back NMI.  The idea is nothing is dropped
+		 * on the first NMI, only on the second of a back-to-back
+		 * NMI.  No need to waste cycles going through all the
+		 * handlers.
+		 */
+		if (!b2b && handled)
+			break;
 	}

In the end, the back-to-back NMI optimization is not used in the Linux kernel.
So the kernel is able to handle all NMI sources even if we drop later NMIs
when there is already one virtual NMI pending for TDX.

Patch

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 83dcaf5b6fbd..b8b168f74dfe 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -535,6 +535,14 @@  void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	 */
 }
 
+static void tdx_complete_interrupts(struct kvm_vcpu *vcpu)
+{
+	/* Avoid costly SEAMCALL if no nmi was injected */
+	if (vcpu->arch.nmi_injected)
+		vcpu->arch.nmi_injected = td_management_read8(to_tdx(vcpu),
+							      TD_VCPU_PEND_NMI);
+}
+
 struct tdx_uret_msr {
 	u32 msr;
 	unsigned int slot;
@@ -663,6 +671,8 @@  fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
 	trace_kvm_exit(vcpu, KVM_ISA_VMX);
 
+	tdx_complete_interrupts(vcpu);
+
 	return EXIT_FASTPATH_NONE;
 }
 
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 44eab734e702..0d8a98feb58e 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -142,6 +142,8 @@  static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
 			 "Invalid TD VMCS access for 16-bit field");
 }
 
+static __always_inline void tdvps_management_check(u64 field, u8 bits) {}
+
 #define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass)				\
 static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx,	\
 							u32 field)		\
@@ -200,6 +202,8 @@  TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs);
 TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs);
 TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
 
+TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
+
 static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
 {
 	struct tdx_module_args out;