diff mbox series

[7/7] KVM: TDX: Add TSX_CTRL msr into uret_msrs list

Message ID 20241121201448.36170-8-adrian.hunter@intel.com (mailing list archive)
State New, archived
Headers show
Series KVM: TDX: TD vcpu enter/exit | expand

Commit Message

Adrian Hunter Nov. 21, 2024, 8:14 p.m. UTC
From: Yang Weijiang <weijiang.yang@intel.com>

TDX module resets the TSX_CTRL MSR to 0 at TD exit if TSX is enabled for
TD. Or it preserves the TSX_CTRL MSR if TSX is disabled for TD.  VMM can
rely on uret_msrs mechanism to defer the reload of host value until exiting
to user space.

Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
TD vcpu enter/exit v1:
 - Update from rename in earlier patches (Binbin)

v19:
- fix the type of tdx_uret_tsx_ctrl_slot. unguent int => int.
---
 arch/x86/kvm/vmx/tdx.c | 33 +++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx/tdx.h |  8 ++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

Comments

Chao Gao Nov. 22, 2024, 3:27 a.m. UTC | #1
>+static bool tdparams_tsx_supported(struct kvm_cpuid2 *cpuid)
>+{
>+	const struct kvm_cpuid_entry2 *entry;
>+	u64 mask;
>+	u32 ebx;
>+
>+	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x7, 0);
>+	if (entry)
>+		ebx = entry->ebx;
>+	else
>+		ebx = 0;
>+
>+	mask = __feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM);
>+	return ebx & mask;
>+}
>+
> static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
> 			struct kvm_tdx_init_vm *init_vm)
> {
>@@ -1299,6 +1322,7 @@ static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
> 	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
> 	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
> 
>+	to_kvm_tdx(kvm)->tsx_supported = tdparams_tsx_supported(cpuid);
> 	return 0;
> }
> 
>@@ -2272,6 +2296,11 @@ static int __init __tdx_bringup(void)
> 			return -EIO;
> 		}
> 	}
>+	tdx_uret_tsx_ctrl_slot = kvm_find_user_return_msr(MSR_IA32_TSX_CTRL);
>+	if (tdx_uret_tsx_ctrl_slot == -1 && boot_cpu_has(X86_FEATURE_MSR_TSX_CTRL)) {
>+		pr_err("MSR_IA32_TSX_CTRL isn't included by kvm_find_user_return_msr\n");
>+		return -EIO;
>+	}
> 
> 	/*
> 	 * Enabling TDX requires enabling hardware virtualization first,
>diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
>index 48cf0a1abfcc..815ff6bdbc7e 100644
>--- a/arch/x86/kvm/vmx/tdx.h
>+++ b/arch/x86/kvm/vmx/tdx.h
>@@ -29,6 +29,14 @@ struct kvm_tdx {
> 	u8 nr_tdcs_pages;
> 	u8 nr_vcpu_tdcx_pages;
> 
>+	/*
>+	 * Used on each TD-exit, see tdx_user_return_msr_update_cache().
>+	 * TSX_CTRL value on TD exit
>+	 * - set 0     if guest TSX enabled
>+	 * - preserved if guest TSX disabled
>+	 */
>+	bool tsx_supported;

Is it possible to drop this boolean and tdparams_tsx_supported()? I think we
can use the guest_can_use() framework instead.

>+
> 	u64 tsc_offset;
> 
> 	enum kvm_tdx_state state;
>-- 
>2.43.0
>
Sean Christopherson Nov. 27, 2024, 2 p.m. UTC | #2
On Fri, Nov 22, 2024, Chao Gao wrote:
> >+static bool tdparams_tsx_supported(struct kvm_cpuid2 *cpuid)
> >+{
> >+	const struct kvm_cpuid_entry2 *entry;
> >+	u64 mask;
> >+	u32 ebx;
> >+
> >+	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x7, 0);
> >+	if (entry)
> >+		ebx = entry->ebx;
> >+	else
> >+		ebx = 0;
> >+
> >+	mask = __feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM);
> >+	return ebx & mask;
> >+}
> >+
> > static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
> > 			struct kvm_tdx_init_vm *init_vm)
> > {
> >@@ -1299,6 +1322,7 @@ static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
> > 	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
> > 	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
> > 
> >+	to_kvm_tdx(kvm)->tsx_supported = tdparams_tsx_supported(cpuid);
> > 	return 0;
> > }
> > 
> >@@ -2272,6 +2296,11 @@ static int __init __tdx_bringup(void)
> > 			return -EIO;
> > 		}
> > 	}
> >+	tdx_uret_tsx_ctrl_slot = kvm_find_user_return_msr(MSR_IA32_TSX_CTRL);
> >+	if (tdx_uret_tsx_ctrl_slot == -1 && boot_cpu_has(X86_FEATURE_MSR_TSX_CTRL)) {
> >+		pr_err("MSR_IA32_TSX_CTRL isn't included by kvm_find_user_return_msr\n");
> >+		return -EIO;
> >+	}
> > 
> > 	/*
> > 	 * Enabling TDX requires enabling hardware virtualization first,
> >diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> >index 48cf0a1abfcc..815ff6bdbc7e 100644
> >--- a/arch/x86/kvm/vmx/tdx.h
> >+++ b/arch/x86/kvm/vmx/tdx.h
> >@@ -29,6 +29,14 @@ struct kvm_tdx {
> > 	u8 nr_tdcs_pages;
> > 	u8 nr_vcpu_tdcx_pages;
> > 
> >+	/*
> >+	 * Used on each TD-exit, see tdx_user_return_msr_update_cache().
> >+	 * TSX_CTRL value on TD exit
> >+	 * - set 0     if guest TSX enabled
> >+	 * - preserved if guest TSX disabled
> >+	 */
> >+	bool tsx_supported;
> 
> Is it possible to drop this boolean and tdparams_tsx_supported()? I think we
> can use the guest_can_use() framework instead.

Yeah, though that optimized handling will soon come for free[*], and I plan on
landing that sooner than TDX, so don't fret too much over this.

[*] https://lore.kernel.org/all/20240517173926.965351-1-seanjc@google.com
Adrian Hunter Nov. 29, 2024, 11:39 a.m. UTC | #3
On 27/11/24 16:00, Sean Christopherson wrote:
> On Fri, Nov 22, 2024, Chao Gao wrote:
>>> +static bool tdparams_tsx_supported(struct kvm_cpuid2 *cpuid)
>>> +{
>>> +	const struct kvm_cpuid_entry2 *entry;
>>> +	u64 mask;
>>> +	u32 ebx;
>>> +
>>> +	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x7, 0);
>>> +	if (entry)
>>> +		ebx = entry->ebx;
>>> +	else
>>> +		ebx = 0;
>>> +
>>> +	mask = __feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM);
>>> +	return ebx & mask;
>>> +}
>>> +
>>> static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
>>> 			struct kvm_tdx_init_vm *init_vm)
>>> {
>>> @@ -1299,6 +1322,7 @@ static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
>>> 	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
>>> 	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
>>>
>>> +	to_kvm_tdx(kvm)->tsx_supported = tdparams_tsx_supported(cpuid);
>>> 	return 0;
>>> }
>>>
>>> @@ -2272,6 +2296,11 @@ static int __init __tdx_bringup(void)
>>> 			return -EIO;
>>> 		}
>>> 	}
>>> +	tdx_uret_tsx_ctrl_slot = kvm_find_user_return_msr(MSR_IA32_TSX_CTRL);
>>> +	if (tdx_uret_tsx_ctrl_slot == -1 && boot_cpu_has(X86_FEATURE_MSR_TSX_CTRL)) {
>>> +		pr_err("MSR_IA32_TSX_CTRL isn't included by kvm_find_user_return_msr\n");
>>> +		return -EIO;
>>> +	}
>>>
>>> 	/*
>>> 	 * Enabling TDX requires enabling hardware virtualization first,
>>> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
>>> index 48cf0a1abfcc..815ff6bdbc7e 100644
>>> --- a/arch/x86/kvm/vmx/tdx.h
>>> +++ b/arch/x86/kvm/vmx/tdx.h
>>> @@ -29,6 +29,14 @@ struct kvm_tdx {
>>> 	u8 nr_tdcs_pages;
>>> 	u8 nr_vcpu_tdcx_pages;
>>>
>>> +	/*
>>> +	 * Used on each TD-exit, see tdx_user_return_msr_update_cache().
>>> +	 * TSX_CTRL value on TD exit
>>> +	 * - set 0     if guest TSX enabled
>>> +	 * - preserved if guest TSX disabled
>>> +	 */
>>> +	bool tsx_supported;
>>
>> Is it possible to drop this boolean and tdparams_tsx_supported()? I think we
>> can use the guest_can_use() framework instead.
> 
> Yeah, though that optimized handling will soon come for free[*], and I plan on
> landing that sooner than TDX, so don't fret too much over this.
> 
> [*] https://lore.kernel.org/all/20240517173926.965351-1-seanjc@google.com

guest_can_use() is per-vcpu whereas we are currently using the
CPUID from TD_PARAMS (as per spec) before there are any VCPU's.
It is a bit of a disconnect so let's keep tsx_supported for now.
Sean Christopherson Dec. 2, 2024, 7:07 p.m. UTC | #4
On Fri, Nov 29, 2024, Adrian Hunter wrote:
> On 27/11/24 16:00, Sean Christopherson wrote:
> > On Fri, Nov 22, 2024, Chao Gao wrote:
> >>> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> >>> index 48cf0a1abfcc..815ff6bdbc7e 100644
> >>> --- a/arch/x86/kvm/vmx/tdx.h
> >>> +++ b/arch/x86/kvm/vmx/tdx.h
> >>> @@ -29,6 +29,14 @@ struct kvm_tdx {
> >>> 	u8 nr_tdcs_pages;
> >>> 	u8 nr_vcpu_tdcx_pages;
> >>>
> >>> +	/*
> >>> +	 * Used on each TD-exit, see tdx_user_return_msr_update_cache().
> >>> +	 * TSX_CTRL value on TD exit
> >>> +	 * - set 0     if guest TSX enabled
> >>> +	 * - preserved if guest TSX disabled
> >>> +	 */
> >>> +	bool tsx_supported;
> >>
> >> Is it possible to drop this boolean and tdparams_tsx_supported()? I think we
> >> can use the guest_can_use() framework instead.
> > 
> > Yeah, though that optimized handling will soon come for free[*], and I plan on
> > landing that sooner than TDX, so don't fret too much over this.
> > 
> > [*] https://lore.kernel.org/all/20240517173926.965351-1-seanjc@google.com
> 
> guest_can_use() is per-vcpu whereas we are currently using the
> CPUID from TD_PARAMS (as per spec) before there are any VCPU's.
> It is a bit of a disconnect so let's keep tsx_supported for now.

No, as was agreed upon[*], KVM needs to ensure consistency between what KVM sees
as guest CPUID and what is actually enabled/exposed to the guest.  If there are
no vCPUs, then there's zero reason to snapshot the value in kvm_tdx.  And if there
are vCPUs, then their CPUID info needs to be consistent with respect to TDPARAMS.

 - Don't hardcode fixed/required CPUID values in KVM, use available metadata
   from TDX Module to reject "bad" guest CPUID (or let the TDX module reject?).
   I.e. don't let a guest silently run with a CPUID that diverges from what
   userspace provided.

[*] https://lore.kernel.org/all/20240405165844.1018872-1-seanjc@google.com
Edgecombe, Rick P Dec. 2, 2024, 7:24 p.m. UTC | #5
On Mon, 2024-12-02 at 11:07 -0800, Sean Christopherson wrote:
> > guest_can_use() is per-vcpu whereas we are currently using the
> > CPUID from TD_PARAMS (as per spec) before there are any VCPU's.
> > It is a bit of a disconnect so let's keep tsx_supported for now.
> 
> No, as was agreed upon[*], KVM needs to ensure consistency between what KVM
> sees
> as guest CPUID and what is actually enabled/exposed to the guest.  If there
> are
> no vCPUs, then there's zero reason to snapshot the value in kvm_tdx.  And if
> there
> are vCPUs, then their CPUID info needs to be consistent with respect to
> TDPARAMS.

Small point - the last conversation[0] we had on this was to let *userspace*
ensure consistency between KVM's CPUID (i.e. KVM_SET_CPUID2) and the TDX
Module's view. So the configuration goes:
1. Userspace configures per-VM CPU features
2. Userspace gets TDX Module's final per-vCPU version of CPUID configuration via
KVM API
3. Userspace calls KVM_SET_CPUID2 with the merge of TDX Module's version, and
userspace's desired values for KVM "owned" CPUID leads (pv features, etc)

But KVM's knowledge of CPUID bits still remains per-vcpu for TDX in any case.

> 
>  - Don't hardcode fixed/required CPUID values in KVM, use available metadata
>    from TDX Module to reject "bad" guest CPUID (or let the TDX module
> reject?).
>    I.e. don't let a guest silently run with a CPUID that diverges from what
>    userspace provided.

The latest QEMU patches have this fixed bit data hardcoded in QEMU. Then the
long term solution is to make the TDX module return this data. Xiaoyao will post
a proposal on how the TDX module should expose this soon.

> 
> [*] https://lore.kernel.org/all/20240405165844.1018872-1-seanjc@google.com


[0]https://lore.kernel.org/kvm/CABgObfaobJ=G18JO9Jx6-K2mhZ2saVyLY-tHOgab1cJupOe-0Q@mail.gmail.com/
Sean Christopherson Dec. 3, 2024, 12:34 a.m. UTC | #6
On Mon, Dec 02, 2024, Rick P Edgecombe wrote:
> On Mon, 2024-12-02 at 11:07 -0800, Sean Christopherson wrote:
> > > guest_can_use() is per-vcpu whereas we are currently using the
> > > CPUID from TD_PARAMS (as per spec) before there are any VCPU's.
> > > It is a bit of a disconnect so let's keep tsx_supported for now.
> > 
> > No, as was agreed upon[*], KVM needs to ensure consistency between what KVM
> > sees

Rick, fix your MUA to not wrap already :-)

> > as guest CPUID and what is actually enabled/exposed to the guest.  If there
> > are no vCPUs, then there's zero reason to snapshot the value in kvm_tdx. 
> > And if there are vCPUs, then their CPUID info needs to be consistent with
> > respect to TDPARAMS.
> 
> Small point - the last conversation[0] we had on this was to let *userspace*
> ensure consistency between KVM's CPUID (i.e. KVM_SET_CPUID2) and the TDX
> Module's view.

I'm all for that, right up until KVM needs to protect itself again userspace and
flawed TDX architecture.  A relevant comment I made in that thread:

 : If the upgrade breaks a setup because it confuses _KVM_, then I'll care

As it applies here, letting vCPU CPUID and actual guest functionality diverge for
features that KVM cares about _will_ cause problems.

This will be less ugly to handle once kvm_vcpu_arch.cpu_caps is a thing.  KVM
can simply force set/clear bits to match the actual guest functionality that's
hardcoded by the TDX Module or defined by TDPARAMS.

> So the configuration goes:
> 1. Userspace configures per-VM CPU features
> 2. Userspace gets TDX Module's final per-vCPU version of CPUID configuration via
> KVM API
> 3. Userspace calls KVM_SET_CPUID2 with the merge of TDX Module's version, and
> userspace's desired values for KVM "owned" CPUID leads (pv features, etc)
> 
> But KVM's knowledge of CPUID bits still remains per-vcpu for TDX in any case.
> 
> > 
> >  - Don't hardcode fixed/required CPUID values in KVM, use available metadata
> >    from TDX Module to reject "bad" guest CPUID (or let the TDX module reject?).
> >    I.e. don't let a guest silently run with a CPUID that diverges from what
> >    userspace provided.
> 
> The latest QEMU patches have this fixed bit data hardcoded in QEMU. Then the
> long term solution is to make the TDX module return this data. Xiaoyao will post
> a proposal on how the TDX module should expose this soon.

Punting the "merge" to userspace is fine, but KVM still needs to ensure it doesn't
have holes where userspace can attack the kernel by lying about what features the
guest has access to.  And that means forcing bits in kvm_vcpu_arch.cpu_caps;
anything else is just asking for problems.

> > [*] https://lore.kernel.org/all/20240405165844.1018872-1-seanjc@google.com
> 
> 
> [0]https://lore.kernel.org/kvm/CABgObfaobJ=G18JO9Jx6-K2mhZ2saVyLY-tHOgab1cJupOe-0Q@mail.gmail.com/
Edgecombe, Rick P Dec. 3, 2024, 5:34 p.m. UTC | #7
On Mon, 2024-12-02 at 16:34 -0800, Sean Christopherson wrote:
> > Small point - the last conversation[0] we had on this was to let *userspace*
> > ensure consistency between KVM's CPUID (i.e. KVM_SET_CPUID2) and the TDX
> > Module's view.
> 
> I'm all for that, right up until KVM needs to protect itself again userspace
> and
> flawed TDX architecture.  A relevant comment I made in that thread:
> 
>  : If the upgrade breaks a setup because it confuses _KVM_, then I'll care
> 
> As it applies here, letting vCPU CPUID and actual guest functionality diverge
> for
> features that KVM cares about _will_ cause problems.

Right, just wanted to make sure we don't need to re-open the major design.

> 
> This will be less ugly to handle once kvm_vcpu_arch.cpu_caps is a thing.  KVM
> can simply force set/clear bits to match the actual guest functionality that's
> hardcoded by the TDX Module or defined by TDPARAMS.
> 
> > So the configuration goes:
> > 1. Userspace configures per-VM CPU features
> > 2. Userspace gets TDX Module's final per-vCPU version of CPUID configuration
> > via
> > KVM API
> > 3. Userspace calls KVM_SET_CPUID2 with the merge of TDX Module's version,
> > and
> > userspace's desired values for KVM "owned" CPUID leads (pv features, etc)
> > 
> > But KVM's knowledge of CPUID bits still remains per-vcpu for TDX in any
> > case.
> > 
> > > 
> > >  - Don't hardcode fixed/required CPUID values in KVM, use available
> > > metadata
> > >    from TDX Module to reject "bad" guest CPUID (or let the TDX module
> > > reject?).
> > >    I.e. don't let a guest silently run with a CPUID that diverges from
> > > what
> > >    userspace provided.
> > 
> > The latest QEMU patches have this fixed bit data hardcoded in QEMU. Then the
> > long term solution is to make the TDX module return this data. Xiaoyao will
> > post
> > a proposal on how the TDX module should expose this soon.
> 
> Punting the "merge" to userspace is fine, but KVM still needs to ensure it
> doesn't
> have holes where userspace can attack the kernel by lying about what features
> the
> guest has access to.  And that means forcing bits in kvm_vcpu_arch.cpu_caps;
> anything else is just asking for problems.

Ok, then for now let's just address them on a case-by-case basis for logic that
protects KVM. I'll add to look at using kvm_vcpu_arch.cpu_caps to our future-
things todo list.

I think Adrian is going post a proposal for how to handle this case better.
Adrian Hunter Dec. 3, 2024, 7:17 p.m. UTC | #8
On 3/12/24 19:34, Edgecombe, Rick P wrote:
> On Mon, 2024-12-02 at 16:34 -0800, Sean Christopherson wrote:
>>> Small point - the last conversation[0] we had on this was to let *userspace*
>>> ensure consistency between KVM's CPUID (i.e. KVM_SET_CPUID2) and the TDX
>>> Module's view.
>>
>> I'm all for that, right up until KVM needs to protect itself again userspace
>> and
>> flawed TDX architecture.  A relevant comment I made in that thread:
>>
>>  : If the upgrade breaks a setup because it confuses _KVM_, then I'll care
>>
>> As it applies here, letting vCPU CPUID and actual guest functionality diverge
>> for
>> features that KVM cares about _will_ cause problems.
> 
> Right, just wanted to make sure we don't need to re-open the major design.
> 
>>
>> This will be less ugly to handle once kvm_vcpu_arch.cpu_caps is a thing.  KVM
>> can simply force set/clear bits to match the actual guest functionality that's
>> hardcoded by the TDX Module or defined by TDPARAMS.
>>
>>> So the configuration goes:
>>> 1. Userspace configures per-VM CPU features
>>> 2. Userspace gets TDX Module's final per-vCPU version of CPUID configuration
>>> via
>>> KVM API
>>> 3. Userspace calls KVM_SET_CPUID2 with the merge of TDX Module's version,
>>> and
>>> userspace's desired values for KVM "owned" CPUID leads (pv features, etc)
>>>
>>> But KVM's knowledge of CPUID bits still remains per-vcpu for TDX in any
>>> case.
>>>
>>>>
>>>>  - Don't hardcode fixed/required CPUID values in KVM, use available
>>>> metadata
>>>>    from TDX Module to reject "bad" guest CPUID (or let the TDX module
>>>> reject?).
>>>>    I.e. don't let a guest silently run with a CPUID that diverges from
>>>> what
>>>>    userspace provided.
>>>
>>> The latest QEMU patches have this fixed bit data hardcoded in QEMU. Then the
>>> long term solution is to make the TDX module return this data. Xiaoyao will
>>> post
>>> a proposal on how the TDX module should expose this soon.
>>
>> Punting the "merge" to userspace is fine, but KVM still needs to ensure it
>> doesn't
>> have holes where userspace can attack the kernel by lying about what features
>> the
>> guest has access to.  And that means forcing bits in kvm_vcpu_arch.cpu_caps;
>> anything else is just asking for problems.
> 
> Ok, then for now let's just address them on a case-by-case basis for logic that
> protects KVM. I'll add to look at using kvm_vcpu_arch.cpu_caps to our future-
> things todo list.
> 
> I think Adrian is going post a proposal for how to handle this case better.

Perhaps just do without TSX support to start with e.g. drop
this "KVM: TDX: Add TSX_CTRL msr into uret_msrs list" patch
and instead add the following:


From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 3 Dec 2024 08:20:03 +0200
Subject: [PATCH] KVM: TDX: Disable support for TSX and WAITPKG

Support for restoring IA32_TSX_CTRL MSR and IA32_UMWAIT_CONTROL MSR is not
yet implemented, so disable support for TSX and WAITPKG for now.  Clear the
associated CPUID bits returned by KVM_TDX_CAPABILITIES, and return an error
if those bits are set in KVM_TDX_INIT_VM.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
 arch/x86/kvm/vmx/tdx.c | 43 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 69bb3136076d..947f78dc3429 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -105,6 +105,44 @@ static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
 	return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
 }
 
+#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
+
+static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
+{
+	return entry->function == 7 && entry->index == 0 &&
+	       (entry->ebx & TDX_FEATURE_TSX);
+}
+
+static void clear_tsx(struct kvm_cpuid_entry2 *entry)
+{
+	entry->ebx &= ~TDX_FEATURE_TSX;
+}
+
+static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
+{
+	return entry->function == 7 && entry->index == 0 &&
+	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
+}
+
+static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
+{
+	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
+}
+
+static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
+{
+	if (has_tsx(entry))
+		clear_tsx(entry);
+
+	if (has_waitpkg(entry))
+		clear_waitpkg(entry);
+}
+
+static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
+{
+	return has_tsx(entry) || has_waitpkg(entry);
+}
+
 #define KVM_TDX_CPUID_NO_SUBLEAF	((__u32)-1)
 
 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
@@ -124,6 +162,8 @@ static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char i
 	/* Work around missing support on old TDX modules */
 	if (entry->function == 0x80000008)
 		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
+
+	tdx_clear_unsupported_cpuid(entry);
 }
 
 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
@@ -1235,6 +1275,9 @@ static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
 		if (!entry)
 			continue;
 
+		if (tdx_unsupported_cpuid(entry))
+			return -EINVAL;
+
 		copy_cnt++;
 
 		value = &td_params->cpuid_values[i];
Chao Gao Dec. 4, 2024, 1:25 a.m. UTC | #9
>+#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>+
>+static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>+{
>+	return entry->function == 7 && entry->index == 0 &&
>+	       (entry->ebx & TDX_FEATURE_TSX);
>+}
>+
>+static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>+{
>+	entry->ebx &= ~TDX_FEATURE_TSX;
>+}
>+
>+static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>+{
>+	return entry->function == 7 && entry->index == 0 &&
>+	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>+}
>+
>+static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>+{
>+	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>+}
>+
>+static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>+{
>+	if (has_tsx(entry))
>+		clear_tsx(entry);
>+
>+	if (has_waitpkg(entry))
>+		clear_waitpkg(entry);
>+}
>+
>+static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>+{
>+	return has_tsx(entry) || has_waitpkg(entry);
>+}

No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
ensures that unconfigurable bits are not set by userspace.

>+
> #define KVM_TDX_CPUID_NO_SUBLEAF	((__u32)-1)
> 
> static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
>@@ -124,6 +162,8 @@ static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char i
> 	/* Work around missing support on old TDX modules */
> 	if (entry->function == 0x80000008)
> 		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
>+
>+	tdx_clear_unsupported_cpuid(entry);
> }
> 
> static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
>@@ -1235,6 +1275,9 @@ static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
> 		if (!entry)
> 			continue;
> 
>+		if (tdx_unsupported_cpuid(entry))
>+			return -EINVAL;
>+
> 		copy_cnt++;
> 
> 		value = &td_params->cpuid_values[i];
>-- 
>2.43.0
>
Adrian Hunter Dec. 4, 2024, 6:18 a.m. UTC | #10
On 4/12/24 03:25, Chao Gao wrote:
>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>> +
>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>> +{
>> +	return entry->function == 7 && entry->index == 0 &&
>> +	       (entry->ebx & TDX_FEATURE_TSX);
>> +}
>> +
>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>> +{
>> +	entry->ebx &= ~TDX_FEATURE_TSX;
>> +}
>> +
>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>> +{
>> +	return entry->function == 7 && entry->index == 0 &&
>> +	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>> +}
>> +
>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>> +{
>> +	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>> +}
>> +
>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>> +{
>> +	if (has_tsx(entry))
>> +		clear_tsx(entry);
>> +
>> +	if (has_waitpkg(entry))
>> +		clear_waitpkg(entry);
>> +}
>> +
>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>> +{
>> +	return has_tsx(entry) || has_waitpkg(entry);
>> +}
> 
> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
> ensures that unconfigurable bits are not set by userspace.

Aren't they configurable?

> 
>> +
>> #define KVM_TDX_CPUID_NO_SUBLEAF	((__u32)-1)
>>
>> static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
>> @@ -124,6 +162,8 @@ static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char i
>> 	/* Work around missing support on old TDX modules */
>> 	if (entry->function == 0x80000008)
>> 		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
>> +
>> +	tdx_clear_unsupported_cpuid(entry);
>> }
>>
>> static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
>> @@ -1235,6 +1275,9 @@ static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
>> 		if (!entry)
>> 			continue;
>>
>> +		if (tdx_unsupported_cpuid(entry))
>> +			return -EINVAL;
>> +
>> 		copy_cnt++;
>>
>> 		value = &td_params->cpuid_values[i];
>> -- 
>> 2.43.0
>>
Chao Gao Dec. 4, 2024, 6:37 a.m. UTC | #11
On Wed, Dec 04, 2024 at 08:18:32AM +0200, Adrian Hunter wrote:
>On 4/12/24 03:25, Chao Gao wrote:
>>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>>> +
>>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>>> +{
>>> +	return entry->function == 7 && entry->index == 0 &&
>>> +	       (entry->ebx & TDX_FEATURE_TSX);
>>> +}
>>> +
>>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>>> +{
>>> +	entry->ebx &= ~TDX_FEATURE_TSX;
>>> +}
>>> +
>>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>>> +{
>>> +	return entry->function == 7 && entry->index == 0 &&
>>> +	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>>> +}
>>> +
>>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>>> +{
>>> +	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>>> +}
>>> +
>>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>>> +{
>>> +	if (has_tsx(entry))
>>> +		clear_tsx(entry);
>>> +
>>> +	if (has_waitpkg(entry))
>>> +		clear_waitpkg(entry);
>>> +}
>>> +
>>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>>> +{
>>> +	return has_tsx(entry) || has_waitpkg(entry);
>>> +}
>> 
>> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
>> ensures that unconfigurable bits are not set by userspace.
>
>Aren't they configurable?

They are cleared from the configurable bitmap by tdx_clear_unsupported_cpuid(),
so they are not configurable from a userspace perspective. Did I miss anything?
KVM should check user inputs against its adjusted configurable bitmap, right?

>
>> 
>>> +
>>> #define KVM_TDX_CPUID_NO_SUBLEAF	((__u32)-1)
>>>
>>> static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
>>> @@ -124,6 +162,8 @@ static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char i
>>> 	/* Work around missing support on old TDX modules */
>>> 	if (entry->function == 0x80000008)
>>> 		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
>>> +
>>> +	tdx_clear_unsupported_cpuid(entry);
>>> }
>>>
>>> static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
>>> @@ -1235,6 +1275,9 @@ static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
>>> 		if (!entry)
>>> 			continue;
>>>
>>> +		if (tdx_unsupported_cpuid(entry))
>>> +			return -EINVAL;
>>> +
>>> 		copy_cnt++;
>>>
>>> 		value = &td_params->cpuid_values[i];
>>> -- 
>>> 2.43.0
>>>
>
Adrian Hunter Dec. 4, 2024, 6:57 a.m. UTC | #12
On 4/12/24 08:37, Chao Gao wrote:
> On Wed, Dec 04, 2024 at 08:18:32AM +0200, Adrian Hunter wrote:
>> On 4/12/24 03:25, Chao Gao wrote:
>>>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>>>> +
>>>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>>>> +{
>>>> +	return entry->function == 7 && entry->index == 0 &&
>>>> +	       (entry->ebx & TDX_FEATURE_TSX);
>>>> +}
>>>> +
>>>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>>>> +{
>>>> +	entry->ebx &= ~TDX_FEATURE_TSX;
>>>> +}
>>>> +
>>>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>>>> +{
>>>> +	return entry->function == 7 && entry->index == 0 &&
>>>> +	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>>>> +}
>>>> +
>>>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>>>> +{
>>>> +	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>>>> +}
>>>> +
>>>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>>>> +{
>>>> +	if (has_tsx(entry))
>>>> +		clear_tsx(entry);
>>>> +
>>>> +	if (has_waitpkg(entry))
>>>> +		clear_waitpkg(entry);
>>>> +}
>>>> +
>>>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>>>> +{
>>>> +	return has_tsx(entry) || has_waitpkg(entry);
>>>> +}
>>>
>>> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
>>> ensures that unconfigurable bits are not set by userspace.
>>
>> Aren't they configurable?
> 
> They are cleared from the configurable bitmap by tdx_clear_unsupported_cpuid(),
> so they are not configurable from a userspace perspective. Did I miss anything?
> KVM should check user inputs against its adjusted configurable bitmap, right?

Maybe I misunderstand but we rely on the TDX module to reject
invalid configuration.  We don't check exactly what is configurable
for the TDX Module.

TSX and WAITPKG are not invalid for the TDX Module, but KVM
must either support them by restoring their MSRs, or disallow
them.  This patch disallows them for now.

> 
>>
>>>
>>>> +
>>>> #define KVM_TDX_CPUID_NO_SUBLEAF	((__u32)-1)
>>>>
>>>> static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
>>>> @@ -124,6 +162,8 @@ static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char i
>>>> 	/* Work around missing support on old TDX modules */
>>>> 	if (entry->function == 0x80000008)
>>>> 		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
>>>> +
>>>> +	tdx_clear_unsupported_cpuid(entry);
>>>> }
>>>>
>>>> static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
>>>> @@ -1235,6 +1275,9 @@ static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
>>>> 		if (!entry)
>>>> 			continue;
>>>>
>>>> +		if (tdx_unsupported_cpuid(entry))
>>>> +			return -EINVAL;
>>>> +
>>>> 		copy_cnt++;
>>>>
>>>> 		value = &td_params->cpuid_values[i];
>>>> -- 
>>>> 2.43.0
>>>>
>>
Chao Gao Dec. 4, 2024, 11:13 a.m. UTC | #13
On Wed, Dec 04, 2024 at 08:57:23AM +0200, Adrian Hunter wrote:
>On 4/12/24 08:37, Chao Gao wrote:
>> On Wed, Dec 04, 2024 at 08:18:32AM +0200, Adrian Hunter wrote:
>>> On 4/12/24 03:25, Chao Gao wrote:
>>>>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>>>>> +
>>>>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>>>>> +{
>>>>> +	return entry->function == 7 && entry->index == 0 &&
>>>>> +	       (entry->ebx & TDX_FEATURE_TSX);
>>>>> +}
>>>>> +
>>>>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>>>>> +{
>>>>> +	entry->ebx &= ~TDX_FEATURE_TSX;
>>>>> +}
>>>>> +
>>>>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>>>>> +{
>>>>> +	return entry->function == 7 && entry->index == 0 &&
>>>>> +	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>>>>> +}
>>>>> +
>>>>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>>>>> +{
>>>>> +	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>>>>> +}
>>>>> +
>>>>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>>>>> +{
>>>>> +	if (has_tsx(entry))
>>>>> +		clear_tsx(entry);
>>>>> +
>>>>> +	if (has_waitpkg(entry))
>>>>> +		clear_waitpkg(entry);
>>>>> +}
>>>>> +
>>>>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>>>>> +{
>>>>> +	return has_tsx(entry) || has_waitpkg(entry);
>>>>> +}
>>>>
>>>> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
>>>> ensures that unconfigurable bits are not set by userspace.
>>>
>>> Aren't they configurable?
>> 
>> They are cleared from the configurable bitmap by tdx_clear_unsupported_cpuid(),
>> so they are not configurable from a userspace perspective. Did I miss anything?
>> KVM should check user inputs against its adjusted configurable bitmap, right?
>
>Maybe I misunderstand but we rely on the TDX module to reject
>invalid configuration.  We don't check exactly what is configurable
>for the TDX Module.

Ok, this is what I missed. I thought KVM validated user input and masked
out all unsupported features. sorry for this.

>
>TSX and WAITPKG are not invalid for the TDX Module, but KVM
>must either support them by restoring their MSRs, or disallow
>them.  This patch disallows them for now.

Yes. I agree. what if a new feature (supported by a future TDX module) also
needs KVM to restore some MSRs? current KVM will allow it to be exposed (since
only TSX/WAITPKG are checked); then some MSRs may get corrupted. I may think
this is not a good design. Current KVM should work with future TDX modules.
Adrian Hunter Dec. 4, 2024, 11:55 a.m. UTC | #14
On 4/12/24 13:13, Chao Gao wrote:
> On Wed, Dec 04, 2024 at 08:57:23AM +0200, Adrian Hunter wrote:
>> On 4/12/24 08:37, Chao Gao wrote:
>>> On Wed, Dec 04, 2024 at 08:18:32AM +0200, Adrian Hunter wrote:
>>>> On 4/12/24 03:25, Chao Gao wrote:
>>>>>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>>>>>> +
>>>>>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>>>>>> +{
>>>>>> +	return entry->function == 7 && entry->index == 0 &&
>>>>>> +	       (entry->ebx & TDX_FEATURE_TSX);
>>>>>> +}
>>>>>> +
>>>>>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>>>>>> +{
>>>>>> +	entry->ebx &= ~TDX_FEATURE_TSX;
>>>>>> +}
>>>>>> +
>>>>>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>>>>>> +{
>>>>>> +	return entry->function == 7 && entry->index == 0 &&
>>>>>> +	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>>>>>> +}
>>>>>> +
>>>>>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>>>>>> +{
>>>>>> +	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>>>>>> +}
>>>>>> +
>>>>>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>>>>>> +{
>>>>>> +	if (has_tsx(entry))
>>>>>> +		clear_tsx(entry);
>>>>>> +
>>>>>> +	if (has_waitpkg(entry))
>>>>>> +		clear_waitpkg(entry);
>>>>>> +}
>>>>>> +
>>>>>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>>>>>> +{
>>>>>> +	return has_tsx(entry) || has_waitpkg(entry);
>>>>>> +}
>>>>>
>>>>> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
>>>>> ensures that unconfigurable bits are not set by userspace.
>>>>
>>>> Aren't they configurable?
>>>
>>> They are cleared from the configurable bitmap by tdx_clear_unsupported_cpuid(),
>>> so they are not configurable from a userspace perspective. Did I miss anything?
>>> KVM should check user inputs against its adjusted configurable bitmap, right?
>>
>> Maybe I misunderstand but we rely on the TDX module to reject
>> invalid configuration.  We don't check exactly what is configurable
>> for the TDX Module.
> 
> Ok, this is what I missed. I thought KVM validated user input and masked
> out all unsupported features. sorry for this.
> 
>>
>> TSX and WAITPKG are not invalid for the TDX Module, but KVM
>> must either support them by restoring their MSRs, or disallow
>> them.  This patch disallows them for now.
> 
> Yes. I agree. what if a new feature (supported by a future TDX module) also
> needs KVM to restore some MSRs? current KVM will allow it to be exposed (since
> only TSX/WAITPKG are checked); then some MSRs may get corrupted. I may think
> this is not a good design. Current KVM should work with future TDX modules.

With respect to CPUID, I gather this kind of thing has been
discussed, such as here:

	https://lore.kernel.org/all/ZhVsHVqaff7AKagu@google.com/

and Rick and Xiaoyao are working on something.

In general, I would expect a new TDX Module would advertise support for
new features, but KVM would have to opt in to use them.
Xiaoyao Li Dec. 4, 2024, 3:33 p.m. UTC | #15
On 12/4/2024 7:55 PM, Adrian Hunter wrote:
> On 4/12/24 13:13, Chao Gao wrote:
>> On Wed, Dec 04, 2024 at 08:57:23AM +0200, Adrian Hunter wrote:
>>> On 4/12/24 08:37, Chao Gao wrote:
>>>> On Wed, Dec 04, 2024 at 08:18:32AM +0200, Adrian Hunter wrote:
>>>>> On 4/12/24 03:25, Chao Gao wrote:
>>>>>>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>>>>>>> +
>>>>>>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>>>>>>> +{
>>>>>>> +	return entry->function == 7 && entry->index == 0 &&
>>>>>>> +	       (entry->ebx & TDX_FEATURE_TSX);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>>>>>>> +{
>>>>>>> +	entry->ebx &= ~TDX_FEATURE_TSX;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>>>>>>> +{
>>>>>>> +	return entry->function == 7 && entry->index == 0 &&
>>>>>>> +	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>>>>>>> +{
>>>>>>> +	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>>>>>>> +{
>>>>>>> +	if (has_tsx(entry))
>>>>>>> +		clear_tsx(entry);
>>>>>>> +
>>>>>>> +	if (has_waitpkg(entry))
>>>>>>> +		clear_waitpkg(entry);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>>>>>>> +{
>>>>>>> +	return has_tsx(entry) || has_waitpkg(entry);
>>>>>>> +}
>>>>>>
>>>>>> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
>>>>>> ensures that unconfigurable bits are not set by userspace.
>>>>>
>>>>> Aren't they configurable?
>>>>
>>>> They are cleared from the configurable bitmap by tdx_clear_unsupported_cpuid(),
>>>> so they are not configurable from a userspace perspective. Did I miss anything?
>>>> KVM should check user inputs against its adjusted configurable bitmap, right?
>>>
>>> Maybe I misunderstand but we rely on the TDX module to reject
>>> invalid configuration.  We don't check exactly what is configurable
>>> for the TDX Module.
>>
>> Ok, this is what I missed. I thought KVM validated user input and masked
>> out all unsupported features. sorry for this.
>>
>>>
>>> TSX and WAITPKG are not invalid for the TDX Module, but KVM
>>> must either support them by restoring their MSRs, or disallow
>>> them.  This patch disallows them for now.
>>
>> Yes. I agree. what if a new feature (supported by a future TDX module) also
>> needs KVM to restore some MSRs? current KVM will allow it to be exposed (since
>> only TSX/WAITPKG are checked); then some MSRs may get corrupted. I may think
>> this is not a good design. Current KVM should work with future TDX modules.
> 
> With respect to CPUID, I gather this kind of thing has been
> discussed, such as here:
> 
> 	https://lore.kernel.org/all/ZhVsHVqaff7AKagu@google.com/
> 
> and Rick and Xiaoyao are working on something.
> 
> In general, I would expect a new TDX Module would advertise support for
> new features, but KVM would have to opt in to use them.
> 

There were discussion[1] on whether KVM to gatekeep the 
configurable/supported CPUIDs for TDX. I stand by Sean that KVM needs to 
do so.

Regarding KVM opt in the new feature, KVM gatekeeps the CPUID bit that 
can be set by userspace is exactly the behavior of opt-in. i.e., for a 
given KVM, it only allows a CPUID set {S} to be configured by userspace, 
if new TDX module supports new feature X, it needs KVM to opt-in X by 
adding X to {S} so that X is allowed to be configured by userspace.

Besides, I find current interface between KVM and userspace lacks the 
ability to tell userspace what bits are not supported by KVM. 
KVM_TDX_CAPABILITIES.cpuid doesn't work because it represents the 
configurable CPUIDs, not supported CPUIDs (I think we might rename it to 
configurable_cpuid to better reflect its meaning). So userspace has to 
hardcode that TSX and WAITPKG is not support itself.

[1] https://lore.kernel.org/all/ZuM12EFbOXmpHHVQ@google.com/
Edgecombe, Rick P Dec. 4, 2024, 11:40 p.m. UTC | #16
On Wed, 2024-12-04 at 13:55 +0200, Adrian Hunter wrote:
> > > > 
> > > > They are cleared from the configurable bitmap by
> > > > tdx_clear_unsupported_cpuid(),
> > > > so they are not configurable from a userspace perspective. Did I miss
> > > > anything?
> > > > KVM should check user inputs against its adjusted configurable bitmap,
> > > > right?
> > > 
> > > Maybe I misunderstand but we rely on the TDX module to reject
> > > invalid configuration.  We don't check exactly what is configurable
> > > for the TDX Module.
> > 
> > Ok, this is what I missed. I thought KVM validated user input and masked
> > out all unsupported features. sorry for this.

This used to be how it behaved, but IIRC Paolo had suggested to simplify it by
letting the TDX module do the rejection. But that was under the assumption there
wasn't any TDX supported CPUID configuration that was harmful to KVM.

We also used to filter which CPUID features that weren't supported by KVM, but
this was also dropped to make things simpler.

> > 
> > > 
> > > TSX and WAITPKG are not invalid for the TDX Module, but KVM
> > > must either support them by restoring their MSRs, or disallow
> > > them.  This patch disallows them for now.
> > 
> > Yes. I agree. what if a new feature (supported by a future TDX module) also
> > needs KVM to restore some MSRs? current KVM will allow it to be exposed
> > (since
> > only TSX/WAITPKG are checked); then some MSRs may get corrupted. I may think
> > this is not a good design. Current KVM should work with future TDX modules.
> 
> With respect to CPUID, I gather this kind of thing has been
> discussed, such as here:
> 
> 	https://lore.kernel.org/all/ZhVsHVqaff7AKagu@google.com/
> 
> and Rick and Xiaoyao are working on something.

This is around fixed 0/1 bits, and just to help userspace understand what
configurations are possible. It isn't intended to filter any bits on the KVM
side.

> 
> In general, I would expect a new TDX Module would advertise support for
> new features, but KVM would have to opt in to use them.

This is true for attributes/xfam, but isn't for CPUID leafs. We used to filter
them in various ways but it was messy. The suggestion was to simplify it. The
current approach is to treat any TDX module changes that break the host like
that as TDX module bugs. See this coverletter for more info on the history:

	https://lore.kernel.org/kvm/20240812224820.34826-1-rick.p.edgecombe@intel.com/
Edgecombe, Rick P Dec. 4, 2024, 11:51 p.m. UTC | #17
On Wed, 2024-12-04 at 23:33 +0800, Xiaoyao Li wrote:
> > 
> 
> There were discussion[1] on whether KVM to gatekeep the 
> configurable/supported CPUIDs for TDX. I stand by Sean that KVM needs to 
> do so.
> 
> Regarding KVM opt in the new feature, KVM gatekeeps the CPUID bit that 
> can be set by userspace is exactly the behavior of opt-in. i.e., for a 
> given KVM, it only allows a CPUID set {S} to be configured by userspace, 
> if new TDX module supports new feature X, it needs KVM to opt-in X by 
> adding X to {S} so that X is allowed to be configured by userspace.
> 
> Besides, I find current interface between KVM and userspace lacks the 
> ability to tell userspace what bits are not supported by KVM. 
> KVM_TDX_CAPABILITIES.cpuid doesn't work because it represents the 
> configurable CPUIDs, not supported CPUIDs (I think we might rename it to 
> configurable_cpuid to better reflect its meaning). So userspace has to 
> hardcode that TSX and WAITPKG is not support itself.
> 
> [1] https://lore.kernel.org/all/ZuM12EFbOXmpHHVQ@google.com/

I mean, this is kind of a good example for why we might want to go back to
filtering CPUID bits. It kind of depends on how KVM wants to treat the TDX
module. Hostile like userspace, or more trusting. KVM maintaining code to let
the TDX module evolve unsafely would be unfortunate though.

If we keep the current approach, it probably would be educational to highlight
this example to the TDX module team. "Don't do this or you will have a bug on
your side".
Adrian Hunter Dec. 5, 2024, 5:31 p.m. UTC | #18
On 4/12/24 17:33, Xiaoyao Li wrote:
> On 12/4/2024 7:55 PM, Adrian Hunter wrote:
>> On 4/12/24 13:13, Chao Gao wrote:
>>> On Wed, Dec 04, 2024 at 08:57:23AM +0200, Adrian Hunter wrote:
>>>> On 4/12/24 08:37, Chao Gao wrote:
>>>>> On Wed, Dec 04, 2024 at 08:18:32AM +0200, Adrian Hunter wrote:
>>>>>> On 4/12/24 03:25, Chao Gao wrote:
>>>>>>>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>>>>>>>> +
>>>>>>>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>>>>>>>> +{
>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>> +           (entry->ebx & TDX_FEATURE_TSX);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>>>>>>>> +{
>>>>>>>> +    entry->ebx &= ~TDX_FEATURE_TSX;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>>>>>>>> +{
>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>> +           (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>>>>>>>> +{
>>>>>>>> +    entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>>>>>>>> +{
>>>>>>>> +    if (has_tsx(entry))
>>>>>>>> +        clear_tsx(entry);
>>>>>>>> +
>>>>>>>> +    if (has_waitpkg(entry))
>>>>>>>> +        clear_waitpkg(entry);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>>>>>>>> +{
>>>>>>>> +    return has_tsx(entry) || has_waitpkg(entry);
>>>>>>>> +}
>>>>>>>
>>>>>>> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
>>>>>>> ensures that unconfigurable bits are not set by userspace.
>>>>>>
>>>>>> Aren't they configurable?
>>>>>
>>>>> They are cleared from the configurable bitmap by tdx_clear_unsupported_cpuid(),
>>>>> so they are not configurable from a userspace perspective. Did I miss anything?
>>>>> KVM should check user inputs against its adjusted configurable bitmap, right?
>>>>
>>>> Maybe I misunderstand but we rely on the TDX module to reject
>>>> invalid configuration.  We don't check exactly what is configurable
>>>> for the TDX Module.
>>>
>>> Ok, this is what I missed. I thought KVM validated user input and masked
>>> out all unsupported features. sorry for this.
>>>
>>>>
>>>> TSX and WAITPKG are not invalid for the TDX Module, but KVM
>>>> must either support them by restoring their MSRs, or disallow
>>>> them.  This patch disallows them for now.
>>>
>>> Yes. I agree. what if a new feature (supported by a future TDX module) also
>>> needs KVM to restore some MSRs? current KVM will allow it to be exposed (since
>>> only TSX/WAITPKG are checked); then some MSRs may get corrupted. I may think
>>> this is not a good design. Current KVM should work with future TDX modules.
>>
>> With respect to CPUID, I gather this kind of thing has been
>> discussed, such as here:
>>
>>     https://lore.kernel.org/all/ZhVsHVqaff7AKagu@google.com/
>>
>> and Rick and Xiaoyao are working on something.
>>
>> In general, I would expect a new TDX Module would advertise support for
>> new features, but KVM would have to opt in to use them.
>>
> 
> There were discussion[1] on whether KVM to gatekeep the configurable/supported CPUIDs for TDX. I stand by Sean that KVM needs to do so.
> 
> Regarding KVM opt in the new feature, KVM gatekeeps the CPUID bit that can be set by userspace is exactly the behavior of opt-in. i.e., for a given KVM, it only allows a CPUID set {S} to be configured by userspace, if new TDX module supports new feature X, it needs KVM to opt-in X by adding X to {S} so that X is allowed to be configured by userspace.
> 
> Besides, I find current interface between KVM and userspace lacks the ability to tell userspace what bits are not supported by KVM. KVM_TDX_CAPABILITIES.cpuid doesn't work because it represents the configurable CPUIDs, not supported CPUIDs (I think we might rename it to configurable_cpuid to better reflect its meaning). So userspace has to hardcode that TSX and WAITPKG is not support itself.

I don't follow why hardcoding would be necessary.

If the leaf is represented in KVM_TDX_CAPABILITIES.cpuid, and
the bits are 0 there, why would userspace try to set them to 1?

> 
> [1] https://lore.kernel.org/all/ZuM12EFbOXmpHHVQ@google.com/
>
Xiaoyao Li Dec. 6, 2024, 3:37 a.m. UTC | #19
On 12/6/2024 1:31 AM, Adrian Hunter wrote:
> On 4/12/24 17:33, Xiaoyao Li wrote:
>> On 12/4/2024 7:55 PM, Adrian Hunter wrote:
>>> On 4/12/24 13:13, Chao Gao wrote:
>>>> On Wed, Dec 04, 2024 at 08:57:23AM +0200, Adrian Hunter wrote:
>>>>> On 4/12/24 08:37, Chao Gao wrote:
>>>>>> On Wed, Dec 04, 2024 at 08:18:32AM +0200, Adrian Hunter wrote:
>>>>>>> On 4/12/24 03:25, Chao Gao wrote:
>>>>>>>>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>>>>>>>>> +
>>>>>>>>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>> +{
>>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>>> +           (entry->ebx & TDX_FEATURE_TSX);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>>>>>>>>> +{
>>>>>>>>> +    entry->ebx &= ~TDX_FEATURE_TSX;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>> +{
>>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>>> +           (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>>>>>>>>> +{
>>>>>>>>> +    entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>>>>>>>>> +{
>>>>>>>>> +    if (has_tsx(entry))
>>>>>>>>> +        clear_tsx(entry);
>>>>>>>>> +
>>>>>>>>> +    if (has_waitpkg(entry))
>>>>>>>>> +        clear_waitpkg(entry);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>> +{
>>>>>>>>> +    return has_tsx(entry) || has_waitpkg(entry);
>>>>>>>>> +}
>>>>>>>>
>>>>>>>> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
>>>>>>>> ensures that unconfigurable bits are not set by userspace.
>>>>>>>
>>>>>>> Aren't they configurable?
>>>>>>
>>>>>> They are cleared from the configurable bitmap by tdx_clear_unsupported_cpuid(),
>>>>>> so they are not configurable from a userspace perspective. Did I miss anything?
>>>>>> KVM should check user inputs against its adjusted configurable bitmap, right?
>>>>>
>>>>> Maybe I misunderstand but we rely on the TDX module to reject
>>>>> invalid configuration.  We don't check exactly what is configurable
>>>>> for the TDX Module.
>>>>
>>>> Ok, this is what I missed. I thought KVM validated user input and masked
>>>> out all unsupported features. sorry for this.
>>>>
>>>>>
>>>>> TSX and WAITPKG are not invalid for the TDX Module, but KVM
>>>>> must either support them by restoring their MSRs, or disallow
>>>>> them.  This patch disallows them for now.
>>>>
>>>> Yes. I agree. what if a new feature (supported by a future TDX module) also
>>>> needs KVM to restore some MSRs? current KVM will allow it to be exposed (since
>>>> only TSX/WAITPKG are checked); then some MSRs may get corrupted. I may think
>>>> this is not a good design. Current KVM should work with future TDX modules.
>>>
>>> With respect to CPUID, I gather this kind of thing has been
>>> discussed, such as here:
>>>
>>>      https://lore.kernel.org/all/ZhVsHVqaff7AKagu@google.com/
>>>
>>> and Rick and Xiaoyao are working on something.
>>>
>>> In general, I would expect a new TDX Module would advertise support for
>>> new features, but KVM would have to opt in to use them.
>>>
>>
>> There were discussion[1] on whether KVM to gatekeep the configurable/supported CPUIDs for TDX. I stand by Sean that KVM needs to do so.
>>
>> Regarding KVM opt in the new feature, KVM gatekeeps the CPUID bit that can be set by userspace is exactly the behavior of opt-in. i.e., for a given KVM, it only allows a CPUID set {S} to be configured by userspace, if new TDX module supports new feature X, it needs KVM to opt-in X by adding X to {S} so that X is allowed to be configured by userspace.
>>
>> Besides, I find current interface between KVM and userspace lacks the ability to tell userspace what bits are not supported by KVM. KVM_TDX_CAPABILITIES.cpuid doesn't work because it represents the configurable CPUIDs, not supported CPUIDs (I think we might rename it to configurable_cpuid to better reflect its meaning). So userspace has to hardcode that TSX and WAITPKG is not support itself.
> 
> I don't follow why hardcoding would be necessary.
> 
> If the leaf is represented in KVM_TDX_CAPABILITIES.cpuid, and
> the bits are 0 there, why would userspace try to set them to 1?

Userspace doesn't set the bit to 1 in kvm_tdx_init_vm.cpuid, doesn't 
mean userspace wants the bit to be 0.

Note, KVM_TDX_CAPABILITIES.cpuid reports the configurable bits. The 
value 0 of a bit in KVM_TDX_CAPABILITIES.cpuid means the bit is not 
configurable, not means the bit is unsupported.

For kvm_tdx_init_vm.cpuid,
  - if the corresponding bit is reported as 1 in 
KVM_TDX_CAPABILITIES.cpuid, then a value 0 in kvm_tdx_init_vm.cpuid 
means userspace wants to configure it as 0.
  - if the corresponding bit is reported as 0 in 
KVM_TDX_CAPABILITIES.cpuid, then userspace has to pass a value 0 in 
kvm_tdx_init_vm.cpuid. But it doesn't mean the value of the bit will be 0.

e.g., X2APIC bit is 0 in KVM_TDX_CAPABILITIES.cpuid, and it's also 0 in 
kvm_tdx_init_vm.cpuid, but TD guest sees a value of 1. In the view of 
QEMU, it maintains the bit of X2APIC as 1, and QEMU filters X2APIC bit 
when calling KVM_TDX_INIT_VM because X2APIC is not configurable.

So when it comes to TSX and WAITPKG, QEMU also needs an interface to be 
informed that they are unsupported. Without the interface of fixed0 bits 
reported by KVM, QEMU needs to hardcode itself like [1]. The problem of 
hardcode is that it will conflict when future KVM allows them to be 
configurable.

In the future, if we have interface from KVM to report the fixed0 and 
fixed1 bit (on top of the proposal [2]), userspace can drop the 
hardcoded one it maintains. At that time, KVM can ensure no conflict by 
removing the bits from fixed0/1 array when allowing them to be 
configurable.

[1] 
https://lore.kernel.org/qemu-devel/20241105062408.3533704-49-xiaoyao.li@intel.com/
[2] 
https://lore.kernel.org/all/43b26df1-4c27-41ff-a482-e258f872cc31@intel.com/

>>
>> [1] https://lore.kernel.org/all/ZuM12EFbOXmpHHVQ@google.com/
>>
>
Adrian Hunter Dec. 6, 2024, 2:40 p.m. UTC | #20
On 6/12/24 05:37, Xiaoyao Li wrote:
> On 12/6/2024 1:31 AM, Adrian Hunter wrote:
>> On 4/12/24 17:33, Xiaoyao Li wrote:
>>> On 12/4/2024 7:55 PM, Adrian Hunter wrote:
>>>> On 4/12/24 13:13, Chao Gao wrote:
>>>>> On Wed, Dec 04, 2024 at 08:57:23AM +0200, Adrian Hunter wrote:
>>>>>> On 4/12/24 08:37, Chao Gao wrote:
>>>>>>> On Wed, Dec 04, 2024 at 08:18:32AM +0200, Adrian Hunter wrote:
>>>>>>>> On 4/12/24 03:25, Chao Gao wrote:
>>>>>>>>>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>>>>>>>>>> +
>>>>>>>>>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>> +{
>>>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>>>> +           (entry->ebx & TDX_FEATURE_TSX);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>> +{
>>>>>>>>>> +    entry->ebx &= ~TDX_FEATURE_TSX;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>> +{
>>>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>>>> +           (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>> +{
>>>>>>>>>> +    entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>> +{
>>>>>>>>>> +    if (has_tsx(entry))
>>>>>>>>>> +        clear_tsx(entry);
>>>>>>>>>> +
>>>>>>>>>> +    if (has_waitpkg(entry))
>>>>>>>>>> +        clear_waitpkg(entry);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>> +{
>>>>>>>>>> +    return has_tsx(entry) || has_waitpkg(entry);
>>>>>>>>>> +}
>>>>>>>>>
>>>>>>>>> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
>>>>>>>>> ensures that unconfigurable bits are not set by userspace.
>>>>>>>>
>>>>>>>> Aren't they configurable?
>>>>>>>
>>>>>>> They are cleared from the configurable bitmap by tdx_clear_unsupported_cpuid(),
>>>>>>> so they are not configurable from a userspace perspective. Did I miss anything?
>>>>>>> KVM should check user inputs against its adjusted configurable bitmap, right?
>>>>>>
>>>>>> Maybe I misunderstand but we rely on the TDX module to reject
>>>>>> invalid configuration.  We don't check exactly what is configurable
>>>>>> for the TDX Module.
>>>>>
>>>>> Ok, this is what I missed. I thought KVM validated user input and masked
>>>>> out all unsupported features. sorry for this.
>>>>>
>>>>>>
>>>>>> TSX and WAITPKG are not invalid for the TDX Module, but KVM
>>>>>> must either support them by restoring their MSRs, or disallow
>>>>>> them.  This patch disallows them for now.
>>>>>
>>>>> Yes. I agree. what if a new feature (supported by a future TDX module) also
>>>>> needs KVM to restore some MSRs? current KVM will allow it to be exposed (since
>>>>> only TSX/WAITPKG are checked); then some MSRs may get corrupted. I may think
>>>>> this is not a good design. Current KVM should work with future TDX modules.
>>>>
>>>> With respect to CPUID, I gather this kind of thing has been
>>>> discussed, such as here:
>>>>
>>>>      https://lore.kernel.org/all/ZhVsHVqaff7AKagu@google.com/
>>>>
>>>> and Rick and Xiaoyao are working on something.
>>>>
>>>> In general, I would expect a new TDX Module would advertise support for
>>>> new features, but KVM would have to opt in to use them.
>>>>
>>>
>>> There were discussion[1] on whether KVM to gatekeep the configurable/supported CPUIDs for TDX. I stand by Sean that KVM needs to do so.
>>>
>>> Regarding KVM opt in the new feature, KVM gatekeeps the CPUID bit that can be set by userspace is exactly the behavior of opt-in. i.e., for a given KVM, it only allows a CPUID set {S} to be configured by userspace, if new TDX module supports new feature X, it needs KVM to opt-in X by adding X to {S} so that X is allowed to be configured by userspace.
>>>
>>> Besides, I find current interface between KVM and userspace lacks the ability to tell userspace what bits are not supported by KVM. KVM_TDX_CAPABILITIES.cpuid doesn't work because it represents the configurable CPUIDs, not supported CPUIDs (I think we might rename it to configurable_cpuid to better reflect its meaning). So userspace has to hardcode that TSX and WAITPKG is not support itself.
>>
>> I don't follow why hardcoding would be necessary.
>>
>> If the leaf is represented in KVM_TDX_CAPABILITIES.cpuid, and
>> the bits are 0 there, why would userspace try to set them to 1?
> 
> Userspace doesn't set the bit to 1 in kvm_tdx_init_vm.cpuid, doesn't mean userspace wants the bit to be 0.
> 
> Note, KVM_TDX_CAPABILITIES.cpuid reports the configurable bits. The value 0 of a bit in KVM_TDX_CAPABILITIES.cpuid means the bit is not configurable, not means the bit is unsupported.

For features configurable by CPUID like TSX and WAITPKG,
a value of 0 does mean unsupported, because the value
has to be 1 to enable the feature.

From the TDX Module Base spec:

	11.18. Transactional Synchronization Extensions (TSX)
	The host VMM can enable TSX for a TD by configuring the following CPUID bits as enabled in the TD_PARAMS input to
	TDH.MNG.INIT:
	- CPUID(7,0).EBX[4] (HLE)
	- CPUID(7,0).EBX[11] (RTM)
	etc

	11.19.4. WAITPKG: TPAUSE, UMONITOR and UMWAIT Instructions
	The host VMM may allow guest TDs to use the TPAUSE, UMONITOR and UMWAIT instructions, if the CPU supports them,
	by configuring the virtual value of CPUID(7,0).ECX[5] (WAITPKG) to 1 using the CPUID configuration table which is part
	the TD_PARAMS input to TDH.MNG.INIT. Enabling CPUID(7,0).ECX[5] also enables TD access to IA32_UMWAIT_CONTROL
	(MSR 0xE1).
	If not allowed, then TD execution of TPAUSE, UMONITOR or UMWAIT results in a #UD, and access to
	IA32_UMWAIT_CONTROL results in a #GP(0).

> For kvm_tdx_init_vm.cpuid,
>  - if the corresponding bit is reported as 1 in KVM_TDX_CAPABILITIES.cpuid, then a value 0 in kvm_tdx_init_vm.cpuid means userspace wants to configure it as 0.
>  - if the corresponding bit is reported as 0 in KVM_TDX_CAPABILITIES.cpuid, then userspace has to pass a value 0 in kvm_tdx_init_vm.cpuid. But it doesn't mean the value of the bit will be 0.
> 
> e.g., X2APIC bit is 0 in KVM_TDX_CAPABILITIES.cpuid, and it's also 0 in kvm_tdx_init_vm.cpuid, but TD guest sees a value of 1. In the view of QEMU, it maintains the bit of X2APIC as 1, and QEMU filters X2APIC bit when calling KVM_TDX_INIT_VM because X2APIC is not configurable.
> 
> So when it comes to TSX and WAITPKG, QEMU also needs an interface to be informed that they are unsupported. Without the interface of fixed0 bits reported by KVM, QEMU needs to hardcode itself like [1]. The problem of hardcode is that it will conflict when future KVM allows them to be configurable.

So TSX and WAITPKG support should be based on KVM_TDX_CAPABILITIES.cpuid, and not hardcoded.

> 
> In the future, if we have interface from KVM to report the fixed0 and fixed1 bit (on top of the proposal [2]), userspace can drop the hardcoded one it maintains. At that time, KVM can ensure no conflict by removing the bits from fixed0/1 array when allowing them to be configurable.
> 
> [1] https://lore.kernel.org/qemu-devel/20241105062408.3533704-49-xiaoyao.li@intel.com/
> [2] https://lore.kernel.org/all/43b26df1-4c27-41ff-a482-e258f872cc31@intel.com/
> 
>>>
>>> [1] https://lore.kernel.org/all/ZuM12EFbOXmpHHVQ@google.com/
>>>
>>
>
Xiaoyao Li Dec. 9, 2024, 2:46 a.m. UTC | #21
On 12/6/2024 10:40 PM, Adrian Hunter wrote:
> On 6/12/24 05:37, Xiaoyao Li wrote:
>> On 12/6/2024 1:31 AM, Adrian Hunter wrote:
>>> On 4/12/24 17:33, Xiaoyao Li wrote:
>>>> On 12/4/2024 7:55 PM, Adrian Hunter wrote:
>>>>> On 4/12/24 13:13, Chao Gao wrote:
>>>>>> On Wed, Dec 04, 2024 at 08:57:23AM +0200, Adrian Hunter wrote:
>>>>>>> On 4/12/24 08:37, Chao Gao wrote:
>>>>>>>> On Wed, Dec 04, 2024 at 08:18:32AM +0200, Adrian Hunter wrote:
>>>>>>>>> On 4/12/24 03:25, Chao Gao wrote:
>>>>>>>>>>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>>>>>>>>>>> +
>>>>>>>>>>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>> +{
>>>>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>>>>> +           (entry->ebx & TDX_FEATURE_TSX);
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>> +{
>>>>>>>>>>> +    entry->ebx &= ~TDX_FEATURE_TSX;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>> +{
>>>>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>>>>> +           (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>> +{
>>>>>>>>>>> +    entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>> +{
>>>>>>>>>>> +    if (has_tsx(entry))
>>>>>>>>>>> +        clear_tsx(entry);
>>>>>>>>>>> +
>>>>>>>>>>> +    if (has_waitpkg(entry))
>>>>>>>>>>> +        clear_waitpkg(entry);
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>> +{
>>>>>>>>>>> +    return has_tsx(entry) || has_waitpkg(entry);
>>>>>>>>>>> +}
>>>>>>>>>>
>>>>>>>>>> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
>>>>>>>>>> ensures that unconfigurable bits are not set by userspace.
>>>>>>>>>
>>>>>>>>> Aren't they configurable?
>>>>>>>>
>>>>>>>> They are cleared from the configurable bitmap by tdx_clear_unsupported_cpuid(),
>>>>>>>> so they are not configurable from a userspace perspective. Did I miss anything?
>>>>>>>> KVM should check user inputs against its adjusted configurable bitmap, right?
>>>>>>>
>>>>>>> Maybe I misunderstand but we rely on the TDX module to reject
>>>>>>> invalid configuration.  We don't check exactly what is configurable
>>>>>>> for the TDX Module.
>>>>>>
>>>>>> Ok, this is what I missed. I thought KVM validated user input and masked
>>>>>> out all unsupported features. sorry for this.
>>>>>>
>>>>>>>
>>>>>>> TSX and WAITPKG are not invalid for the TDX Module, but KVM
>>>>>>> must either support them by restoring their MSRs, or disallow
>>>>>>> them.  This patch disallows them for now.
>>>>>>
>>>>>> Yes. I agree. what if a new feature (supported by a future TDX module) also
>>>>>> needs KVM to restore some MSRs? current KVM will allow it to be exposed (since
>>>>>> only TSX/WAITPKG are checked); then some MSRs may get corrupted. I may think
>>>>>> this is not a good design. Current KVM should work with future TDX modules.
>>>>>
>>>>> With respect to CPUID, I gather this kind of thing has been
>>>>> discussed, such as here:
>>>>>
>>>>>       https://lore.kernel.org/all/ZhVsHVqaff7AKagu@google.com/
>>>>>
>>>>> and Rick and Xiaoyao are working on something.
>>>>>
>>>>> In general, I would expect a new TDX Module would advertise support for
>>>>> new features, but KVM would have to opt in to use them.
>>>>>
>>>>
>>>> There were discussion[1] on whether KVM to gatekeep the configurable/supported CPUIDs for TDX. I stand by Sean that KVM needs to do so.
>>>>
>>>> Regarding KVM opt in the new feature, KVM gatekeeps the CPUID bit that can be set by userspace is exactly the behavior of opt-in. i.e., for a given KVM, it only allows a CPUID set {S} to be configured by userspace, if new TDX module supports new feature X, it needs KVM to opt-in X by adding X to {S} so that X is allowed to be configured by userspace.
>>>>
>>>> Besides, I find current interface between KVM and userspace lacks the ability to tell userspace what bits are not supported by KVM. KVM_TDX_CAPABILITIES.cpuid doesn't work because it represents the configurable CPUIDs, not supported CPUIDs (I think we might rename it to configurable_cpuid to better reflect its meaning). So userspace has to hardcode that TSX and WAITPKG is not support itself.
>>>
>>> I don't follow why hardcoding would be necessary.
>>>
>>> If the leaf is represented in KVM_TDX_CAPABILITIES.cpuid, and
>>> the bits are 0 there, why would userspace try to set them to 1?
>>
>> Userspace doesn't set the bit to 1 in kvm_tdx_init_vm.cpuid, doesn't mean userspace wants the bit to be 0.
>>
>> Note, KVM_TDX_CAPABILITIES.cpuid reports the configurable bits. The value 0 of a bit in KVM_TDX_CAPABILITIES.cpuid means the bit is not configurable, not means the bit is unsupported.
> 
> For features configurable by CPUID like TSX and WAITPKG,
> a value of 0 does mean unsupported, because the value
> has to be 1 to enable the feature.
> 
>  From the TDX Module Base spec:
> 
> 	11.18. Transactional Synchronization Extensions (TSX)
> 	The host VMM can enable TSX for a TD by configuring the following CPUID bits as enabled in the TD_PARAMS input to
> 	TDH.MNG.INIT:
> 	- CPUID(7,0).EBX[4] (HLE)
> 	- CPUID(7,0).EBX[11] (RTM)
> 	etc
> 
> 	11.19.4. WAITPKG: TPAUSE, UMONITOR and UMWAIT Instructions
> 	The host VMM may allow guest TDs to use the TPAUSE, UMONITOR and UMWAIT instructions, if the CPU supports them,
> 	by configuring the virtual value of CPUID(7,0).ECX[5] (WAITPKG) to 1 using the CPUID configuration table which is part
> 	the TD_PARAMS input to TDH.MNG.INIT. Enabling CPUID(7,0).ECX[5] also enables TD access to IA32_UMWAIT_CONTROL
> 	(MSR 0xE1).
> 	If not allowed, then TD execution of TPAUSE, UMONITOR or UMWAIT results in a #UD, and access to
> 	IA32_UMWAIT_CONTROL results in a #GP(0).
> 
>> For kvm_tdx_init_vm.cpuid,
>>   - if the corresponding bit is reported as 1 in KVM_TDX_CAPABILITIES.cpuid, then a value 0 in kvm_tdx_init_vm.cpuid means userspace wants to configure it as 0.
>>   - if the corresponding bit is reported as 0 in KVM_TDX_CAPABILITIES.cpuid, then userspace has to pass a value 0 in kvm_tdx_init_vm.cpuid. But it doesn't mean the value of the bit will be 0.
>>
>> e.g., X2APIC bit is 0 in KVM_TDX_CAPABILITIES.cpuid, and it's also 0 in kvm_tdx_init_vm.cpuid, but TD guest sees a value of 1. In the view of QEMU, it maintains the bit of X2APIC as 1, and QEMU filters X2APIC bit when calling KVM_TDX_INIT_VM because X2APIC is not configurable.
>>
>> So when it comes to TSX and WAITPKG, QEMU also needs an interface to be informed that they are unsupported. Without the interface of fixed0 bits reported by KVM, QEMU needs to hardcode itself like [1]. The problem of hardcode is that it will conflict when future KVM allows them to be configurable.
> 
> So TSX and WAITPKG support should be based on KVM_TDX_CAPABILITIES.cpuid, and not hardcoded.

This requires Userspace to have the knowledge that "TSX and WAITPKG is 
supposed to be configurable and reported as 1 in 
KVM_TDX_CAPABILITIES.cpuid in normal case". And if userspace gets a 
value 0 of TSX and Waitpkg in KVM_TDX_CAPABILITIES.cpuid, it means 
either KVM doesn't support/allow it or the underlying TDX module doesn't.

I think it's an implicit form of hardcode. From the perspective of 
userspace, I would like to avoid all the special handling.

>>
>> In the future, if we have interface from KVM to report the fixed0 and fixed1 bit (on top of the proposal [2]), userspace can drop the hardcoded one it maintains. At that time, KVM can ensure no conflict by removing the bits from fixed0/1 array when allowing them to be configurable.
>>
>> [1] https://lore.kernel.org/qemu-devel/20241105062408.3533704-49-xiaoyao.li@intel.com/
>> [2] https://lore.kernel.org/all/43b26df1-4c27-41ff-a482-e258f872cc31@intel.com/
>>
>>>>
>>>> [1] https://lore.kernel.org/all/ZuM12EFbOXmpHHVQ@google.com/
>>>>
>>>
>>
>
Adrian Hunter Dec. 9, 2024, 7:08 a.m. UTC | #22
On 9/12/24 04:46, Xiaoyao Li wrote:
> On 12/6/2024 10:40 PM, Adrian Hunter wrote:
>> On 6/12/24 05:37, Xiaoyao Li wrote:
>>> On 12/6/2024 1:31 AM, Adrian Hunter wrote:
>>>> On 4/12/24 17:33, Xiaoyao Li wrote:
>>>>> On 12/4/2024 7:55 PM, Adrian Hunter wrote:
>>>>>> On 4/12/24 13:13, Chao Gao wrote:
>>>>>>> On Wed, Dec 04, 2024 at 08:57:23AM +0200, Adrian Hunter wrote:
>>>>>>>> On 4/12/24 08:37, Chao Gao wrote:
>>>>>>>>> On Wed, Dec 04, 2024 at 08:18:32AM +0200, Adrian Hunter wrote:
>>>>>>>>>> On 4/12/24 03:25, Chao Gao wrote:
>>>>>>>>>>>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>>>>>>>>>>>> +
>>>>>>>>>>>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>>>>>> +           (entry->ebx & TDX_FEATURE_TSX);
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    entry->ebx &= ~TDX_FEATURE_TSX;
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>>>>>> +           (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    if (has_tsx(entry))
>>>>>>>>>>>> +        clear_tsx(entry);
>>>>>>>>>>>> +
>>>>>>>>>>>> +    if (has_waitpkg(entry))
>>>>>>>>>>>> +        clear_waitpkg(entry);
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    return has_tsx(entry) || has_waitpkg(entry);
>>>>>>>>>>>> +}
>>>>>>>>>>>
>>>>>>>>>>> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
>>>>>>>>>>> ensures that unconfigurable bits are not set by userspace.
>>>>>>>>>>
>>>>>>>>>> Aren't they configurable?
>>>>>>>>>
>>>>>>>>> They are cleared from the configurable bitmap by tdx_clear_unsupported_cpuid(),
>>>>>>>>> so they are not configurable from a userspace perspective. Did I miss anything?
>>>>>>>>> KVM should check user inputs against its adjusted configurable bitmap, right?
>>>>>>>>
>>>>>>>> Maybe I misunderstand but we rely on the TDX module to reject
>>>>>>>> invalid configuration.  We don't check exactly what is configurable
>>>>>>>> for the TDX Module.
>>>>>>>
>>>>>>> Ok, this is what I missed. I thought KVM validated user input and masked
>>>>>>> out all unsupported features. sorry for this.
>>>>>>>
>>>>>>>>
>>>>>>>> TSX and WAITPKG are not invalid for the TDX Module, but KVM
>>>>>>>> must either support them by restoring their MSRs, or disallow
>>>>>>>> them.  This patch disallows them for now.
>>>>>>>
>>>>>>> Yes. I agree. what if a new feature (supported by a future TDX module) also
>>>>>>> needs KVM to restore some MSRs? current KVM will allow it to be exposed (since
>>>>>>> only TSX/WAITPKG are checked); then some MSRs may get corrupted. I may think
>>>>>>> this is not a good design. Current KVM should work with future TDX modules.
>>>>>>
>>>>>> With respect to CPUID, I gather this kind of thing has been
>>>>>> discussed, such as here:
>>>>>>
>>>>>>       https://lore.kernel.org/all/ZhVsHVqaff7AKagu@google.com/
>>>>>>
>>>>>> and Rick and Xiaoyao are working on something.
>>>>>>
>>>>>> In general, I would expect a new TDX Module would advertise support for
>>>>>> new features, but KVM would have to opt in to use them.
>>>>>>
>>>>>
>>>>> There were discussion[1] on whether KVM to gatekeep the configurable/supported CPUIDs for TDX. I stand by Sean that KVM needs to do so.
>>>>>
>>>>> Regarding KVM opt in the new feature, KVM gatekeeps the CPUID bit that can be set by userspace is exactly the behavior of opt-in. i.e., for a given KVM, it only allows a CPUID set {S} to be configured by userspace, if new TDX module supports new feature X, it needs KVM to opt-in X by adding X to {S} so that X is allowed to be configured by userspace.
>>>>>
>>>>> Besides, I find current interface between KVM and userspace lacks the ability to tell userspace what bits are not supported by KVM. KVM_TDX_CAPABILITIES.cpuid doesn't work because it represents the configurable CPUIDs, not supported CPUIDs (I think we might rename it to configurable_cpuid to better reflect its meaning). So userspace has to hardcode that TSX and WAITPKG is not support itself.
>>>>
>>>> I don't follow why hardcoding would be necessary.
>>>>
>>>> If the leaf is represented in KVM_TDX_CAPABILITIES.cpuid, and
>>>> the bits are 0 there, why would userspace try to set them to 1?
>>>
>>> Userspace doesn't set the bit to 1 in kvm_tdx_init_vm.cpuid, doesn't mean userspace wants the bit to be 0.
>>>
>>> Note, KVM_TDX_CAPABILITIES.cpuid reports the configurable bits. The value 0 of a bit in KVM_TDX_CAPABILITIES.cpuid means the bit is not configurable, not means the bit is unsupported.
>>
>> For features configurable by CPUID like TSX and WAITPKG,
>> a value of 0 does mean unsupported, because the value
>> has to be 1 to enable the feature.
>>
>>  From the TDX Module Base spec:
>>
>>     11.18. Transactional Synchronization Extensions (TSX)
>>     The host VMM can enable TSX for a TD by configuring the following CPUID bits as enabled in the TD_PARAMS input to
>>     TDH.MNG.INIT:
>>     - CPUID(7,0).EBX[4] (HLE)
>>     - CPUID(7,0).EBX[11] (RTM)
>>     etc
>>
>>     11.19.4. WAITPKG: TPAUSE, UMONITOR and UMWAIT Instructions
>>     The host VMM may allow guest TDs to use the TPAUSE, UMONITOR and UMWAIT instructions, if the CPU supports them,
>>     by configuring the virtual value of CPUID(7,0).ECX[5] (WAITPKG) to 1 using the CPUID configuration table which is part
>>     the TD_PARAMS input to TDH.MNG.INIT. Enabling CPUID(7,0).ECX[5] also enables TD access to IA32_UMWAIT_CONTROL
>>     (MSR 0xE1).
>>     If not allowed, then TD execution of TPAUSE, UMONITOR or UMWAIT results in a #UD, and access to
>>     IA32_UMWAIT_CONTROL results in a #GP(0).
>>
>>> For kvm_tdx_init_vm.cpuid,
>>>   - if the corresponding bit is reported as 1 in KVM_TDX_CAPABILITIES.cpuid, then a value 0 in kvm_tdx_init_vm.cpuid means userspace wants to configure it as 0.
>>>   - if the corresponding bit is reported as 0 in KVM_TDX_CAPABILITIES.cpuid, then userspace has to pass a value 0 in kvm_tdx_init_vm.cpuid. But it doesn't mean the value of the bit will be 0.
>>>
>>> e.g., X2APIC bit is 0 in KVM_TDX_CAPABILITIES.cpuid, and it's also 0 in kvm_tdx_init_vm.cpuid, but TD guest sees a value of 1. In the view of QEMU, it maintains the bit of X2APIC as 1, and QEMU filters X2APIC bit when calling KVM_TDX_INIT_VM because X2APIC is not configurable.
>>>
>>> So when it comes to TSX and WAITPKG, QEMU also needs an interface to be informed that they are unsupported. Without the interface of fixed0 bits reported by KVM, QEMU needs to hardcode itself like [1]. The problem of hardcode is that it will conflict when future KVM allows them to be configurable.
>>
>> So TSX and WAITPKG support should be based on KVM_TDX_CAPABILITIES.cpuid, and not hardcoded.
> 
> This requires Userspace to have the knowledge that "TSX and WAITPKG is supposed to be configurable and reported as 1 in KVM_TDX_CAPABILITIES.cpuid in normal case". And if userspace gets a value 0 of TSX and Waitpkg in KVM_TDX_CAPABILITIES.cpuid, it means either KVM doesn't support/allow it or the underlying TDX module doesn't.
> 
> I think it's an implicit form of hardcode. From the perspective of userspace, I would like to avoid all the special handling.

It is consistent with what is done for TD attributes and XFAM.  Presumably feature names must be mapped to capability bits and configuration bits, and that information has to be represented somewhere.

> 
>>>
>>> In the future, if we have interface from KVM to report the fixed0 and fixed1 bit (on top of the proposal [2]), userspace can drop the hardcoded one it maintains. At that time, KVM can ensure no conflict by removing the bits from fixed0/1 array when allowing them to be configurable.
>>>
>>> [1] https://lore.kernel.org/qemu-devel/20241105062408.3533704-49-xiaoyao.li@intel.com/
>>> [2] https://lore.kernel.org/all/43b26df1-4c27-41ff-a482-e258f872cc31@intel.com/
>>>
>>>>>
>>>>> [1] https://lore.kernel.org/all/ZuM12EFbOXmpHHVQ@google.com/
>>>>>
>>>>
>>>
>>
>
Xiaoyao Li Dec. 10, 2024, 2:45 a.m. UTC | #23
On 12/9/2024 3:08 PM, Adrian Hunter wrote:
> On 9/12/24 04:46, Xiaoyao Li wrote:
>> On 12/6/2024 10:40 PM, Adrian Hunter wrote:
>>> On 6/12/24 05:37, Xiaoyao Li wrote:
>>>> On 12/6/2024 1:31 AM, Adrian Hunter wrote:
>>>>> On 4/12/24 17:33, Xiaoyao Li wrote:
>>>>>> On 12/4/2024 7:55 PM, Adrian Hunter wrote:
>>>>>>> On 4/12/24 13:13, Chao Gao wrote:
>>>>>>>> On Wed, Dec 04, 2024 at 08:57:23AM +0200, Adrian Hunter wrote:
>>>>>>>>> On 4/12/24 08:37, Chao Gao wrote:
>>>>>>>>>> On Wed, Dec 04, 2024 at 08:18:32AM +0200, Adrian Hunter wrote:
>>>>>>>>>>> On 4/12/24 03:25, Chao Gao wrote:
>>>>>>>>>>>>> +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>>>>>>> +           (entry->ebx & TDX_FEATURE_TSX);
>>>>>>>>>>>>> +}
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +static void clear_tsx(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> +    entry->ebx &= ~TDX_FEATURE_TSX;
>>>>>>>>>>>>> +}
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> +    return entry->function == 7 && entry->index == 0 &&
>>>>>>>>>>>>> +           (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
>>>>>>>>>>>>> +}
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> +    entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
>>>>>>>>>>>>> +}
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> +    if (has_tsx(entry))
>>>>>>>>>>>>> +        clear_tsx(entry);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +    if (has_waitpkg(entry))
>>>>>>>>>>>>> +        clear_waitpkg(entry);
>>>>>>>>>>>>> +}
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> +    return has_tsx(entry) || has_waitpkg(entry);
>>>>>>>>>>>>> +}
>>>>>>>>>>>>
>>>>>>>>>>>> No need to check TSX/WAITPKG explicitly because setup_tdparams_cpuids() already
>>>>>>>>>>>> ensures that unconfigurable bits are not set by userspace.
>>>>>>>>>>>
>>>>>>>>>>> Aren't they configurable?
>>>>>>>>>>
>>>>>>>>>> They are cleared from the configurable bitmap by tdx_clear_unsupported_cpuid(),
>>>>>>>>>> so they are not configurable from a userspace perspective. Did I miss anything?
>>>>>>>>>> KVM should check user inputs against its adjusted configurable bitmap, right?
>>>>>>>>>
>>>>>>>>> Maybe I misunderstand but we rely on the TDX module to reject
>>>>>>>>> invalid configuration.  We don't check exactly what is configurable
>>>>>>>>> for the TDX Module.
>>>>>>>>
>>>>>>>> Ok, this is what I missed. I thought KVM validated user input and masked
>>>>>>>> out all unsupported features. sorry for this.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> TSX and WAITPKG are not invalid for the TDX Module, but KVM
>>>>>>>>> must either support them by restoring their MSRs, or disallow
>>>>>>>>> them.  This patch disallows them for now.
>>>>>>>>
>>>>>>>> Yes. I agree. what if a new feature (supported by a future TDX module) also
>>>>>>>> needs KVM to restore some MSRs? current KVM will allow it to be exposed (since
>>>>>>>> only TSX/WAITPKG are checked); then some MSRs may get corrupted. I may think
>>>>>>>> this is not a good design. Current KVM should work with future TDX modules.
>>>>>>>
>>>>>>> With respect to CPUID, I gather this kind of thing has been
>>>>>>> discussed, such as here:
>>>>>>>
>>>>>>>        https://lore.kernel.org/all/ZhVsHVqaff7AKagu@google.com/
>>>>>>>
>>>>>>> and Rick and Xiaoyao are working on something.
>>>>>>>
>>>>>>> In general, I would expect a new TDX Module would advertise support for
>>>>>>> new features, but KVM would have to opt in to use them.
>>>>>>>
>>>>>>
>>>>>> There were discussion[1] on whether KVM to gatekeep the configurable/supported CPUIDs for TDX. I stand by Sean that KVM needs to do so.
>>>>>>
>>>>>> Regarding KVM opt in the new feature, KVM gatekeeps the CPUID bit that can be set by userspace is exactly the behavior of opt-in. i.e., for a given KVM, it only allows a CPUID set {S} to be configured by userspace, if new TDX module supports new feature X, it needs KVM to opt-in X by adding X to {S} so that X is allowed to be configured by userspace.
>>>>>>
>>>>>> Besides, I find current interface between KVM and userspace lacks the ability to tell userspace what bits are not supported by KVM. KVM_TDX_CAPABILITIES.cpuid doesn't work because it represents the configurable CPUIDs, not supported CPUIDs (I think we might rename it to configurable_cpuid to better reflect its meaning). So userspace has to hardcode that TSX and WAITPKG is not support itself.
>>>>>
>>>>> I don't follow why hardcoding would be necessary.
>>>>>
>>>>> If the leaf is represented in KVM_TDX_CAPABILITIES.cpuid, and
>>>>> the bits are 0 there, why would userspace try to set them to 1?
>>>>
>>>> Userspace doesn't set the bit to 1 in kvm_tdx_init_vm.cpuid, doesn't mean userspace wants the bit to be 0.
>>>>
>>>> Note, KVM_TDX_CAPABILITIES.cpuid reports the configurable bits. The value 0 of a bit in KVM_TDX_CAPABILITIES.cpuid means the bit is not configurable, not means the bit is unsupported.
>>>
>>> For features configurable by CPUID like TSX and WAITPKG,
>>> a value of 0 does mean unsupported, because the value
>>> has to be 1 to enable the feature.
>>>
>>>   From the TDX Module Base spec:
>>>
>>>      11.18. Transactional Synchronization Extensions (TSX)
>>>      The host VMM can enable TSX for a TD by configuring the following CPUID bits as enabled in the TD_PARAMS input to
>>>      TDH.MNG.INIT:
>>>      - CPUID(7,0).EBX[4] (HLE)
>>>      - CPUID(7,0).EBX[11] (RTM)
>>>      etc
>>>
>>>      11.19.4. WAITPKG: TPAUSE, UMONITOR and UMWAIT Instructions
>>>      The host VMM may allow guest TDs to use the TPAUSE, UMONITOR and UMWAIT instructions, if the CPU supports them,
>>>      by configuring the virtual value of CPUID(7,0).ECX[5] (WAITPKG) to 1 using the CPUID configuration table which is part
>>>      the TD_PARAMS input to TDH.MNG.INIT. Enabling CPUID(7,0).ECX[5] also enables TD access to IA32_UMWAIT_CONTROL
>>>      (MSR 0xE1).
>>>      If not allowed, then TD execution of TPAUSE, UMONITOR or UMWAIT results in a #UD, and access to
>>>      IA32_UMWAIT_CONTROL results in a #GP(0).
>>>
>>>> For kvm_tdx_init_vm.cpuid,
>>>>    - if the corresponding bit is reported as 1 in KVM_TDX_CAPABILITIES.cpuid, then a value 0 in kvm_tdx_init_vm.cpuid means userspace wants to configure it as 0.
>>>>    - if the corresponding bit is reported as 0 in KVM_TDX_CAPABILITIES.cpuid, then userspace has to pass a value 0 in kvm_tdx_init_vm.cpuid. But it doesn't mean the value of the bit will be 0.
>>>>
>>>> e.g., X2APIC bit is 0 in KVM_TDX_CAPABILITIES.cpuid, and it's also 0 in kvm_tdx_init_vm.cpuid, but TD guest sees a value of 1. In the view of QEMU, it maintains the bit of X2APIC as 1, and QEMU filters X2APIC bit when calling KVM_TDX_INIT_VM because X2APIC is not configurable.
>>>>
>>>> So when it comes to TSX and WAITPKG, QEMU also needs an interface to be informed that they are unsupported. Without the interface of fixed0 bits reported by KVM, QEMU needs to hardcode itself like [1]. The problem of hardcode is that it will conflict when future KVM allows them to be configurable.
>>>
>>> So TSX and WAITPKG support should be based on KVM_TDX_CAPABILITIES.cpuid, and not hardcoded.
>>
>> This requires Userspace to have the knowledge that "TSX and WAITPKG is supposed to be configurable and reported as 1 in KVM_TDX_CAPABILITIES.cpuid in normal case". And if userspace gets a value 0 of TSX and Waitpkg in KVM_TDX_CAPABILITIES.cpuid, it means either KVM doesn't support/allow it or the underlying TDX module doesn't.
>>
>> I think it's an implicit form of hardcode. From the perspective of userspace, I would like to avoid all the special handling.
> 
> It is consistent with what is done for TD attributes and XFAM.  Presumably feature names must be mapped to capability bits and configuration bits, and that information has to be represented somewhere.

Attrbutes and XFAM are different to cpuid either in KVM_TDX_CAPABILITIES 
or in struct kvm_tdx_init_vm.

KVM_TDX_CAPABILITIES reports the *supported* bits for Attributes and 
XFAM while it reports the configurable bits for CPUID.

The bits passed in struct kvm_tdx_init_vm for Attributes and XFAM are a 
whole of the final value that will be set for TD guest. While the bits 
passed in struct kvm_tdx_init_vm for cpuid are only the value of 
configurable bits, the final value that will be set for TD guest needs 
to be OR'ed with fixed1 bits.

 From the perspective of userspace,the information in 
KVM_TDX_CAPABILITIES.supported_attributes and 
KVM_TDX_CAPABILITIES.supported_xfam are enough, bits reported as 0 in 
them means they are not supported. While for cpuid, in general, the bits 
reported as 0 in KVM_TDX_CAPABILITIES.cpuid doesn't mean they are not 
supported.

Anyway, my point is, in general, value 0 of a bit in 
KVM_TDX_CAPABILITIES.cpuid doesn't mean the bit is not supported for TD 
guest. It's better to have KVM expose an interface to report the 
unsupported bits. Without it, userspace has to hardcode some information 
and maintain it itself.

QEMU already hardcodes the fixed0 and fixed1 bits itself[1] according to 
a specific version of TDX spec, while the tweak to TSX and WAITPKG in 
KVM will require more specific handling in QEMU. It's not the fault of 
the change to TSX/WAITPKG in KVM_TDX_CAPABILITIES.cpuid. It somehow 
reveals it's getting messier when KVM cannot report enough information 
to userspace.

[1] 
https://lore.kernel.org/qemu-devel/20241105062408.3533704-49-xiaoyao.li@intel.com/
>>
>>>>
>>>> In the future, if we have interface from KVM to report the fixed0 and fixed1 bit (on top of the proposal [2]), userspace can drop the hardcoded one it maintains. At that time, KVM can ensure no conflict by removing the bits from fixed0/1 array when allowing them to be configurable.
>>>>
>>>> [1] https://lore.kernel.org/qemu-devel/20241105062408.3533704-49-xiaoyao.li@intel.com/
>>>> [2] https://lore.kernel.org/all/43b26df1-4c27-41ff-a482-e258f872cc31@intel.com/
>>>>
>>>>>>
>>>>>> [1] https://lore.kernel.org/all/ZuM12EFbOXmpHHVQ@google.com/
>>>>>>
>>>>>
>>>>
>>>
>>
>
diff mbox series

Patch

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 4a33ca54c8ba..2c7b6308da73 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -722,14 +722,21 @@  static struct tdx_uret_msr tdx_uret_msrs[] = {
 	{.msr = MSR_LSTAR,},
 	{.msr = MSR_TSC_AUX,},
 };
+static int tdx_uret_tsx_ctrl_slot;
 
-static void tdx_user_return_msr_update_cache(void)
+static void tdx_user_return_msr_update_cache(struct kvm_vcpu *vcpu)
 {
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
 		kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
 						 tdx_uret_msrs[i].defval);
+	/*
+	 * TSX_CTRL is reset to 0 if guest TSX is supported. Otherwise
+	 * preserved.
+	 */
+	if (to_kvm_tdx(vcpu->kvm)->tsx_supported && tdx_uret_tsx_ctrl_slot != -1)
+		kvm_user_return_msr_update_cache(tdx_uret_tsx_ctrl_slot, 0);
 }
 
 static void tdx_restore_host_xsave_state(struct kvm_vcpu *vcpu)
@@ -817,7 +824,7 @@  fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 
 	tdx_vcpu_enter_exit(vcpu);
 
-	tdx_user_return_msr_update_cache();
+	tdx_user_return_msr_update_cache(vcpu);
 	tdx_restore_host_xsave_state(vcpu);
 	tdx->host_state_need_restore = true;
 
@@ -1258,6 +1265,22 @@  static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
 	return 0;
 }
 
+static bool tdparams_tsx_supported(struct kvm_cpuid2 *cpuid)
+{
+	const struct kvm_cpuid_entry2 *entry;
+	u64 mask;
+	u32 ebx;
+
+	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x7, 0);
+	if (entry)
+		ebx = entry->ebx;
+	else
+		ebx = 0;
+
+	mask = __feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM);
+	return ebx & mask;
+}
+
 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
 			struct kvm_tdx_init_vm *init_vm)
 {
@@ -1299,6 +1322,7 @@  static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
 	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
 	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
 
+	to_kvm_tdx(kvm)->tsx_supported = tdparams_tsx_supported(cpuid);
 	return 0;
 }
 
@@ -2272,6 +2296,11 @@  static int __init __tdx_bringup(void)
 			return -EIO;
 		}
 	}
+	tdx_uret_tsx_ctrl_slot = kvm_find_user_return_msr(MSR_IA32_TSX_CTRL);
+	if (tdx_uret_tsx_ctrl_slot == -1 && boot_cpu_has(X86_FEATURE_MSR_TSX_CTRL)) {
+		pr_err("MSR_IA32_TSX_CTRL isn't included by kvm_find_user_return_msr\n");
+		return -EIO;
+	}
 
 	/*
 	 * Enabling TDX requires enabling hardware virtualization first,
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 48cf0a1abfcc..815ff6bdbc7e 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -29,6 +29,14 @@  struct kvm_tdx {
 	u8 nr_tdcs_pages;
 	u8 nr_vcpu_tdcx_pages;
 
+	/*
+	 * Used on each TD-exit, see tdx_user_return_msr_update_cache().
+	 * TSX_CTRL value on TD exit
+	 * - set 0     if guest TSX enabled
+	 * - preserved if guest TSX disabled
+	 */
+	bool tsx_supported;
+
 	u64 tsc_offset;
 
 	enum kvm_tdx_state state;