diff mbox series

[v12,20/22] x86/virt/tdx: Allow SEAMCALL to handle #UD and #GP

Message ID c124550719716f1f7759c2bdea70f4722d8e0167.1687784645.git.kai.huang@intel.com (mailing list archive)
State New
Headers show
Series TDX host kernel support | expand

Commit Message

Huang, Kai June 26, 2023, 2:12 p.m. UTC
On the platform with the "partial write machine check" erratum, a kernel
partial write to TDX private memory may cause unexpected machine check.
It would be nice if the #MC handler could print additional information
to show the #MC was TDX private memory error due to possible kernel bug.

To do that, the machine check handler needs to use SEAMCALL to query
page type of the error memory from the TDX module, because there's no
existing infrastructure to track TDX private pages.

SEAMCALL instruction causes #UD if CPU isn't in VMX operation.  In #MC
handler, it is legal that CPU isn't in VMX operation when making this
SEAMCALL.  Extend the TDX_MODULE_CALL macro to handle #UD so the
SEAMCALL can return error code instead of Oops in the #MC handler.
Opportunistically handles #GP too since they share the same code.

A bonus is when kernel mistakenly calls SEAMCALL when CPU isn't in VMX
operation, or when TDX isn't enabled by the BIOS, or when the BIOS is
buggy, the kernel can get a nicer error message rather than a less
understandable Oops.

Signed-off-by: Kai Huang <kai.huang@intel.com>
---

v11 -> v12 (new patch):
  - Splitted out from "SEAMCALL infrastructure" patch for better review.
  - Provide justification in changelog (Dave/David)

---
 arch/x86/include/asm/tdx.h      |  5 +++++
 arch/x86/virt/vmx/tdx/tdx.c     |  7 +++++++
 arch/x86/virt/vmx/tdx/tdxcall.S | 19 +++++++++++++++++--
 3 files changed, 29 insertions(+), 2 deletions(-)

Comments

Kirill A . Shutemov June 28, 2023, 12:32 p.m. UTC | #1
On Tue, Jun 27, 2023 at 02:12:50AM +1200, Kai Huang wrote:
> On the platform with the "partial write machine check" erratum, a kernel
> partial write to TDX private memory may cause unexpected machine check.
> It would be nice if the #MC handler could print additional information
> to show the #MC was TDX private memory error due to possible kernel bug.
> 
> To do that, the machine check handler needs to use SEAMCALL to query
> page type of the error memory from the TDX module, because there's no
> existing infrastructure to track TDX private pages.
> 
> SEAMCALL instruction causes #UD if CPU isn't in VMX operation.  In #MC
> handler, it is legal that CPU isn't in VMX operation when making this
> SEAMCALL.  Extend the TDX_MODULE_CALL macro to handle #UD so the
> SEAMCALL can return error code instead of Oops in the #MC handler.
> Opportunistically handles #GP too since they share the same code.
> 
> A bonus is when kernel mistakenly calls SEAMCALL when CPU isn't in VMX
> operation, or when TDX isn't enabled by the BIOS, or when the BIOS is
> buggy, the kernel can get a nicer error message rather than a less
> understandable Oops.
> 
> Signed-off-by: Kai Huang <kai.huang@intel.com>

Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Peter Zijlstra June 28, 2023, 3:29 p.m. UTC | #2
On Tue, Jun 27, 2023 at 02:12:50AM +1200, Kai Huang wrote:
> diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
> index 49a54356ae99..757b0c34be10 100644
> --- a/arch/x86/virt/vmx/tdx/tdxcall.S
> +++ b/arch/x86/virt/vmx/tdx/tdxcall.S
> @@ -1,6 +1,7 @@
>  /* SPDX-License-Identifier: GPL-2.0 */
>  #include <asm/asm-offsets.h>
>  #include <asm/tdx.h>
> +#include <asm/asm.h>
>  
>  /*
>   * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
> @@ -45,6 +46,7 @@
>  	/* Leave input param 2 in RDX */
>  
>  	.if \host
> +1:
>  	seamcall

So what registers are actually clobbered by SEAMCALL ? There's a
distinct lack of it in SDM Vol.2 instruction list :-(

>  	/*
>  	 * SEAMCALL instruction is essentially a VMExit from VMX root
> @@ -57,10 +59,23 @@
>  	 * This value will never be used as actual SEAMCALL error code as
>  	 * it is from the Reserved status code class.
>  	 */
> -	jnc .Lno_vmfailinvalid
> +	jnc .Lseamcall_out
>  	mov $TDX_SEAMCALL_VMFAILINVALID, %rax
> -.Lno_vmfailinvalid:
> +	jmp .Lseamcall_out
> +2:
> +	/*
> +	 * SEAMCALL caused #GP or #UD.  By reaching here %eax contains
> +	 * the trap number.  Convert the trap number to the TDX error
> +	 * code by setting TDX_SW_ERROR to the high 32-bits of %rax.
> +	 *
> +	 * Note cannot OR TDX_SW_ERROR directly to %rax as OR instruction
> +	 * only accepts 32-bit immediate at most.
> +	 */
> +	mov $TDX_SW_ERROR, %r12
> +	orq %r12, %rax
>  
> +	_ASM_EXTABLE_FAULT(1b, 2b)
> +.Lseamcall_out:

This is all pretty atrocious code flow... would it at all be possible to
write it like:

SYM_FUNC_START(...)

.if \host
1:	seamcall
	cmovc	%spare, %rax
2:
.else
	tdcall
.endif

	.....
	RET


3:
	mov $TDX_SW_ERROR, %r12
	orq %r12, %rax
	jmp 2b

	_ASM_EXTABLE_FAULT(1b, 3b)

SYM_FUNC_END()

That is, having all that inline in the hotpath is quite horrific.
Peter Zijlstra June 28, 2023, 8:38 p.m. UTC | #3
On Wed, Jun 28, 2023 at 05:29:01PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 27, 2023 at 02:12:50AM +1200, Kai Huang wrote:
> > diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
> > index 49a54356ae99..757b0c34be10 100644
> > --- a/arch/x86/virt/vmx/tdx/tdxcall.S
> > +++ b/arch/x86/virt/vmx/tdx/tdxcall.S
> > @@ -1,6 +1,7 @@
> >  /* SPDX-License-Identifier: GPL-2.0 */
> >  #include <asm/asm-offsets.h>
> >  #include <asm/tdx.h>
> > +#include <asm/asm.h>
> >  
> >  /*
> >   * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
> > @@ -45,6 +46,7 @@
> >  	/* Leave input param 2 in RDX */
> >  
> >  	.if \host
> > +1:
> >  	seamcall
> 
> So what registers are actually clobbered by SEAMCALL ? There's a
> distinct lack of it in SDM Vol.2 instruction list :-(

With the exception of the abomination that is TDH.VP.ENTER all SEAMCALLs
seem to be limited to the set presented here (c,d,8,9,10,11) and all
other registers should be available.

Can we please make that a hard requirement, SEAMCALL must not use
registers outside this? We can hardly program to random future
extentions; we need hard ABI guarantees here.

That also means we should be able to use si,di for the cmovc below.

Kirill, back when we did __tdx_hypercall() we got bp removed as a valid
register, the 1.0 spec still lists that, and it is also listed in
TDH.VP.ENTER, I'm assuming it will be removed there too?

bp must not be used -- it violates the pre-existing calling convention.
Peter Zijlstra June 28, 2023, 9:11 p.m. UTC | #4
On Wed, Jun 28, 2023 at 10:38:23PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 28, 2023 at 05:29:01PM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 27, 2023 at 02:12:50AM +1200, Kai Huang wrote:
> > > diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
> > > index 49a54356ae99..757b0c34be10 100644
> > > --- a/arch/x86/virt/vmx/tdx/tdxcall.S
> > > +++ b/arch/x86/virt/vmx/tdx/tdxcall.S
> > > @@ -1,6 +1,7 @@
> > >  /* SPDX-License-Identifier: GPL-2.0 */
> > >  #include <asm/asm-offsets.h>
> > >  #include <asm/tdx.h>
> > > +#include <asm/asm.h>
> > >  
> > >  /*
> > >   * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
> > > @@ -45,6 +46,7 @@
> > >  	/* Leave input param 2 in RDX */
> > >  
> > >  	.if \host
> > > +1:
> > >  	seamcall
> > 
> > So what registers are actually clobbered by SEAMCALL ? There's a
> > distinct lack of it in SDM Vol.2 instruction list :-(
> 
> With the exception of the abomination that is TDH.VP.ENTER all SEAMCALLs
> seem to be limited to the set presented here (c,d,8,9,10,11) and all
> other registers should be available.
> 
> Can we please make that a hard requirement, SEAMCALL must not use
> registers outside this? We can hardly program to random future
> extentions; we need hard ABI guarantees here.
> 
> That also means we should be able to use si,di for the cmovc below.
> 
> Kirill, back when we did __tdx_hypercall() we got bp removed as a valid
> register, the 1.0 spec still lists that, and it is also listed in
> TDH.VP.ENTER, I'm assuming it will be removed there too?
> 
> bp must not be used -- it violates the pre-existing calling convention.

How's this then? Utterly untested. Not been near a compiler even.

--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -109,10 +109,26 @@ EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
  * should only be used for calls that have no legitimate reason to fail
  * or where the kernel can not survive the call failing.
  */
-static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
-				   struct tdx_module_output *out)
+static inline void _tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9)
 {
-	if (__tdx_module_call(fn, rcx, rdx, r8, r9, out))
+	struct tdx_module_args args = {
+		.rcx = rcx,
+		.rdx = rdx,
+		.r8  = r8,
+		.r9  = r9,
+	};
+	return __tdx_module_call(fn, &args);
+}
+
+static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9)
+{
+	if (_tdx_module_call(fn, rcx, rdx, r8, r9))
+		panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
+}
+
+static inline void tdx_module_call_ret(u64 fn, struct tdx_module_args *args)
+{
+	if (__tdx_module_call(fn, args))
 		panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
 }
 
@@ -134,9 +150,9 @@ int tdx_mcall_get_report0(u8 *reportdata
 {
 	u64 ret;
 
-	ret = __tdx_module_call(TDX_GET_REPORT, virt_to_phys(tdreport),
-				virt_to_phys(reportdata), TDREPORT_SUBTYPE_0,
-				0, NULL);
+	ret = _tdx_module_call(TDX_GET_REPORT,
+			       virt_to_phys(tdreport), virt_to_phys(reportdata),
+			       TDREPORT_SUBTYPE_0, 0);
 	if (ret) {
 		if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
 			return -EINVAL;
@@ -184,7 +200,7 @@ static void __noreturn tdx_panic(const c
 
 static void tdx_parse_tdinfo(u64 *cc_mask)
 {
-	struct tdx_module_output out;
+	struct tdx_module_args args = {};
 	unsigned int gpa_width;
 	u64 td_attr;
 
@@ -195,7 +211,7 @@ static void tdx_parse_tdinfo(u64 *cc_mas
 	 * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
 	 * [TDG.VP.INFO].
 	 */
-	tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);
+	tdx_module_call_ret(TDX_GET_INFO, &args);
 
 	/*
 	 * The highest bit of a guest physical address is the "sharing" bit.
@@ -204,7 +220,7 @@ static void tdx_parse_tdinfo(u64 *cc_mas
 	 * The GPA width that comes out of this call is critical. TDX guests
 	 * can not meaningfully run without it.
 	 */
-	gpa_width = out.rcx & GENMASK(5, 0);
+	gpa_width = args.rcx & GENMASK(5, 0);
 	*cc_mask = BIT_ULL(gpa_width - 1);
 
 	/*
@@ -212,7 +228,7 @@ static void tdx_parse_tdinfo(u64 *cc_mas
 	 * memory.  Ensure that no #VE will be delivered for accesses to
 	 * TD-private memory.  Only VMM-shared memory (MMIO) will #VE.
 	 */
-	td_attr = out.rdx;
+	td_attr = args.rdx;
 	if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
 		const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
 
@@ -620,7 +636,7 @@ __init bool tdx_early_handle_ve(struct p
 
 void tdx_get_ve_info(struct ve_info *ve)
 {
-	struct tdx_module_output out;
+	struct tdx_module_args args = {};
 
 	/*
 	 * Called during #VE handling to retrieve the #VE info from the
@@ -637,15 +653,15 @@ void tdx_get_ve_info(struct ve_info *ve)
 	 * Note, the TDX module treats virtual NMIs as inhibited if the #VE
 	 * valid flag is set. It means that NMI=>#VE will not result in a #DF.
 	 */
-	tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out);
+	tdx_module_call_ret(TDX_GET_VEINFO, &args);
 
 	/* Transfer the output parameters */
-	ve->exit_reason = out.rcx;
-	ve->exit_qual   = out.rdx;
-	ve->gla         = out.r8;
-	ve->gpa         = out.r9;
-	ve->instr_len   = lower_32_bits(out.r10);
-	ve->instr_info  = upper_32_bits(out.r10);
+	ve->exit_reason = args.rcx;
+	ve->exit_qual   = args.rdx;
+	ve->gla         = args.r8;
+	ve->gpa         = args.r9;
+	ve->instr_len   = lower_32_bits(args.r10);
+	ve->instr_info  = upper_32_bits(args.r10);
 }
 
 /*
@@ -779,7 +795,7 @@ static bool try_accept_one(phys_addr_t *
 	}
 
 	tdcall_rcx = *start | page_size;
-	if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL))
+	if (_tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0))
 		return false;
 
 	*start += accept_size;
@@ -857,7 +873,7 @@ void __init tdx_early_init(void)
 	cc_set_mask(cc_mask);
 
 	/* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
-	tdx_module_call(TDX_WR, 0, TDCS_NOTIFY_ENABLES, 0, -1ULL, NULL);
+	tdx_module_call(TDX_WR, 0, TDCS_NOTIFY_ENABLES, 0, -1ULL);
 
 	/*
 	 * All bits above GPA width are reserved and kernel treats shared bit
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -37,7 +37,7 @@
  *
  * This is a software only structure and not part of the TDX module/VMM ABI.
  */
-struct tdx_module_output {
+struct tdx_module_args {
 	u64 rcx;
 	u64 rdx;
 	u64 r8;
@@ -67,8 +67,8 @@ struct ve_info {
 void __init tdx_early_init(void);
 
 /* Used to communicate with the TDX module */
-u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
-		      struct tdx_module_output *out);
+u64 __tdx_module_call(u64 fn, struct tdx_module_args *args);
+u64 __tdx_module_call_ret(u64 fn, struct tdx_module_args *args);
 
 void tdx_get_ve_info(struct ve_info *ve);
 
--- a/arch/x86/virt/vmx/tdx/tdxcall.S
+++ b/arch/x86/virt/vmx/tdx/tdxcall.S
@@ -17,37 +17,44 @@
  *            TDX module and hypercalls to the VMM.
  * SEAMCALL - used by TDX hosts to make requests to the
  *            TDX module.
+ *
+ *-------------------------------------------------------------------------
+ * TDCALL/SEAMCALL ABI:
+ *-------------------------------------------------------------------------
+ * Input Registers:
+ *
+ * RAX                 - TDCALL Leaf number.
+ * RCX,RDX,R8-R9       - TDCALL Leaf specific input registers.
+ *
+ * Output Registers:
+ *
+ * RAX                 - TDCALL instruction error code.
+ * RCX,RDX,R8-R11      - TDCALL Leaf specific output registers.
+ *
+ *-------------------------------------------------------------------------
+ *
+ * __tdx_module_call() function ABI:
+ *
+ * @fn   (RDI)         - TDCALL Leaf ID,    moved to RAX
+ * @regs (RSI)         - struct tdx_regs pointer
+ *
+ * Return status of TDCALL via RAX.
  */
-.macro TDX_MODULE_CALL host:req
-	/*
-	 * R12 will be used as temporary storage for struct tdx_module_output
-	 * pointer. Since R12-R15 registers are not used by TDCALL/SEAMCALL
-	 * services supported by this function, it can be reused.
-	 */
+.macro TDX_MODULE_CALL host:req ret:req
+	FRAME_BEGIN
 
-	/* Callee saved, so preserve it */
-	push %r12
+	mov	%rdi, %rax
+	mov	$TDX_SEAMCALL_VMFAILINVALID, %rdi
 
-	/*
-	 * Push output pointer to stack.
-	 * After the operation, it will be fetched into R12 register.
-	 */
-	push %r9
+	mov	TDX_MODULE_rcx(%rsi), %rcx
+	mov	TDX_MODULE_rdx(%rsi), %rdx
+	mov	TDX_MODULE_r8(%rsi),  %r8
+	mov	TDX_MODULE_r9(%rsi),  %r9
+//	mov	TDX_MODULE_r10(%rsi), %r10
+//	mov	TDX_MODULE_r11(%rsi), %r11
 
-	/* Mangle function call ABI into TDCALL/SEAMCALL ABI: */
-	/* Move Leaf ID to RAX */
-	mov %rdi, %rax
-	/* Move input 4 to R9 */
-	mov %r8,  %r9
-	/* Move input 3 to R8 */
-	mov %rcx, %r8
-	/* Move input 1 to RCX */
-	mov %rsi, %rcx
-	/* Leave input param 2 in RDX */
-
-	.if \host
-1:
-	seamcall
+.if \host
+1:	seamcall
 	/*
 	 * SEAMCALL instruction is essentially a VMExit from VMX root
 	 * mode to SEAM VMX root mode.  VMfailInvalid (CF=1) indicates
@@ -59,53 +66,30 @@
 	 * This value will never be used as actual SEAMCALL error code as
 	 * it is from the Reserved status code class.
 	 */
-	jnc .Lseamcall_out
-	mov $TDX_SEAMCALL_VMFAILINVALID, %rax
-	jmp .Lseamcall_out
+	cmovc	%rdi, %rax
 2:
-	/*
-	 * SEAMCALL caused #GP or #UD.  By reaching here %eax contains
-	 * the trap number.  Convert the trap number to the TDX error
-	 * code by setting TDX_SW_ERROR to the high 32-bits of %rax.
-	 *
-	 * Note cannot OR TDX_SW_ERROR directly to %rax as OR instruction
-	 * only accepts 32-bit immediate at most.
-	 */
-	mov $TDX_SW_ERROR, %r12
-	orq %r12, %rax
-
-	_ASM_EXTABLE_FAULT(1b, 2b)
-.Lseamcall_out:
-	.else
+.else
 	tdcall
-	.endif
-
-	/*
-	 * Fetch output pointer from stack to R12 (It is used
-	 * as temporary storage)
-	 */
-	pop %r12
+.endif
 
-	/*
-	 * Since this macro can be invoked with NULL as an output pointer,
-	 * check if caller provided an output struct before storing output
-	 * registers.
-	 *
-	 * Update output registers, even if the call failed (RAX != 0).
-	 * Other registers may contain details of the failure.
-	 */
-	test %r12, %r12
-	jz .Lno_output_struct
+.if \ret
+	movq %rcx, TDX_MODULE_rcx(%rsi)
+	movq %rdx, TDX_MODULE_rdx(%rsi)
+	movq %r8,  TDX_MODULE_r8(%rsi)
+	movq %r9,  TDX_MODULE_r9(%rsi)
+	movq %r10, TDX_MODULE_r10(%rsi)
+	movq %r11, TDX_MODULE_r11(%rsi)
+.endif
+
+	FRAME_END
+	RET
+
+.if \host
+3:
+	mov	$TDX_SW_ERROR, %rdi
+	or	%rdi, %rax
+	jmp 2b
 
-	/* Copy result registers to output struct: */
-	movq %rcx, TDX_MODULE_rcx(%r12)
-	movq %rdx, TDX_MODULE_rdx(%r12)
-	movq %r8,  TDX_MODULE_r8(%r12)
-	movq %r9,  TDX_MODULE_r9(%r12)
-	movq %r10, TDX_MODULE_r10(%r12)
-	movq %r11, TDX_MODULE_r11(%r12)
-
-.Lno_output_struct:
-	/* Restore the state of R12 register */
-	pop %r12
+	_ASM_EXTABLE_FAULT(1b, 3b)
+.endif
 .endm
Peter Zijlstra June 28, 2023, 9:16 p.m. UTC | #5
On Wed, Jun 28, 2023 at 11:11:32PM +0200, Peter Zijlstra wrote:
> --- a/arch/x86/virt/vmx/tdx/tdxcall.S
> +++ b/arch/x86/virt/vmx/tdx/tdxcall.S
> @@ -17,37 +17,44 @@
>   *            TDX module and hypercalls to the VMM.
>   * SEAMCALL - used by TDX hosts to make requests to the
>   *            TDX module.
> + *
> + *-------------------------------------------------------------------------
> + * TDCALL/SEAMCALL ABI:
> + *-------------------------------------------------------------------------
> + * Input Registers:
> + *
> + * RAX                 - TDCALL Leaf number.
> + * RCX,RDX,R8-R9       - TDCALL Leaf specific input registers.
> + *
> + * Output Registers:
> + *
> + * RAX                 - TDCALL instruction error code.
> + * RCX,RDX,R8-R11      - TDCALL Leaf specific output registers.
> + *
> + *-------------------------------------------------------------------------
> + *
> + * __tdx_module_call() function ABI:
> + *
> + * @fn   (RDI)         - TDCALL Leaf ID,    moved to RAX
> + * @regs (RSI)         - struct tdx_regs pointer
> + *
> + * Return status of TDCALL via RAX.
>   */
> +.macro TDX_MODULE_CALL host:req ret:req
> +	FRAME_BEGIN
>  
> +	mov	%rdi, %rax
> +	mov	$TDX_SEAMCALL_VMFAILINVALID, %rdi
>  
> +	mov	TDX_MODULE_rcx(%rsi), %rcx
> +	mov	TDX_MODULE_rdx(%rsi), %rdx
> +	mov	TDX_MODULE_r8(%rsi),  %r8
> +	mov	TDX_MODULE_r9(%rsi),  %r9
> +//	mov	TDX_MODULE_r10(%rsi), %r10
> +//	mov	TDX_MODULE_r11(%rsi), %r11
>  
> +.if \host
> +1:	seamcall
>  	/*
>  	 * SEAMCALL instruction is essentially a VMExit from VMX root
>  	 * mode to SEAM VMX root mode.  VMfailInvalid (CF=1) indicates
	...
>  	 * This value will never be used as actual SEAMCALL error code as
>  	 * it is from the Reserved status code class.
>  	 */
> +	cmovc	%rdi, %rax
>  2:
> +.else
>  	tdcall
> +.endif
>  
> +.if \ret
> +	movq %rcx, TDX_MODULE_rcx(%rsi)
> +	movq %rdx, TDX_MODULE_rdx(%rsi)
> +	movq %r8,  TDX_MODULE_r8(%rsi)
> +	movq %r9,  TDX_MODULE_r9(%rsi)
> +	movq %r10, TDX_MODULE_r10(%rsi)
> +	movq %r11, TDX_MODULE_r11(%rsi)
> +.endif
> +
> +	FRAME_END
> +	RET
> +
> +.if \host
> +3:
> +	mov	$TDX_SW_ERROR, %rdi
> +	or	%rdi, %rax
> +	jmp 2b
>  
> +	_ASM_EXTABLE_FAULT(1b, 3b)
> +.endif
>  .endm

Isn't that much simpler?
Huang, Kai June 29, 2023, 10 a.m. UTC | #6
On Wed, 2023-06-28 at 17:29 +0200, Peter Zijlstra wrote:
> On Tue, Jun 27, 2023 at 02:12:50AM +1200, Kai Huang wrote:
> > diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
> > index 49a54356ae99..757b0c34be10 100644
> > --- a/arch/x86/virt/vmx/tdx/tdxcall.S
> > +++ b/arch/x86/virt/vmx/tdx/tdxcall.S
> > @@ -1,6 +1,7 @@
> >  /* SPDX-License-Identifier: GPL-2.0 */
> >  #include <asm/asm-offsets.h>
> >  #include <asm/tdx.h>
> > +#include <asm/asm.h>
> >  
> >  /*
> >   * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
> > @@ -45,6 +46,7 @@
> >  	/* Leave input param 2 in RDX */
> >  
> >  	.if \host
> > +1:
> >  	seamcall
> 
> So what registers are actually clobbered by SEAMCALL ? There's a
> distinct lack of it in SDM Vol.2 instruction list :-(
> 
> >  	/*
> >  	 * SEAMCALL instruction is essentially a VMExit from VMX root
> > @@ -57,10 +59,23 @@
> >  	 * This value will never be used as actual SEAMCALL error code as
> >  	 * it is from the Reserved status code class.
> >  	 */
> > -	jnc .Lno_vmfailinvalid
> > +	jnc .Lseamcall_out
> >  	mov $TDX_SEAMCALL_VMFAILINVALID, %rax
> > -.Lno_vmfailinvalid:
> > +	jmp .Lseamcall_out
> > +2:
> > +	/*
> > +	 * SEAMCALL caused #GP or #UD.  By reaching here %eax contains
> > +	 * the trap number.  Convert the trap number to the TDX error
> > +	 * code by setting TDX_SW_ERROR to the high 32-bits of %rax.
> > +	 *
> > +	 * Note cannot OR TDX_SW_ERROR directly to %rax as OR instruction
> > +	 * only accepts 32-bit immediate at most.
> > +	 */
> > +	mov $TDX_SW_ERROR, %r12
> > +	orq %r12, %rax
> >  
> > +	_ASM_EXTABLE_FAULT(1b, 2b)
> > +.Lseamcall_out:
> 
> This is all pretty atrocious code flow... would it at all be possible to
> write it like:
> 
> SYM_FUNC_START(...)
> 
> .if \host
> 1:	seamcall
> 	cmovc	%spare, %rax

Looks using cmovc can remove the using of one additional label.

I guess it can be done in a separate patch.

> 2:
> .else
> 	tdcall
> .endif
> 
> 	.....
> 	RET
> 
> 
> 3:
> 	mov $TDX_SW_ERROR, %r12
> 	orq %r12, %rax
> 	jmp 2b
> 
> 	_ASM_EXTABLE_FAULT(1b, 3b)
> 
> SYM_FUNC_END()
> 
> That is, having all that inline in the hotpath is quite horrific.
> 

The __tdx_module_call() and __seamcall() both has a "RET" at the end of this
macro:

SYM_FUNC_START(__tdx_module_call)       
        FRAME_BEGIN
        TDX_MODULE_CALL host=0
        FRAME_END
        RET                                                       
SYM_FUNC_END(__tdx_module_call)       

In this case, I think we need to remove the "RET" there.

I didn't think I would modify __tdx_module_call() and __seamcall() in this
patch.
Huang, Kai June 29, 2023, 10:33 a.m. UTC | #7
On Wed, 2023-06-28 at 22:38 +0200, Peter Zijlstra wrote:
> On Wed, Jun 28, 2023 at 05:29:01PM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 27, 2023 at 02:12:50AM +1200, Kai Huang wrote:
> > > diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
> > > index 49a54356ae99..757b0c34be10 100644
> > > --- a/arch/x86/virt/vmx/tdx/tdxcall.S
> > > +++ b/arch/x86/virt/vmx/tdx/tdxcall.S
> > > @@ -1,6 +1,7 @@
> > >  /* SPDX-License-Identifier: GPL-2.0 */
> > >  #include <asm/asm-offsets.h>
> > >  #include <asm/tdx.h>
> > > +#include <asm/asm.h>
> > >  
> > >  /*
> > >   * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
> > > @@ -45,6 +46,7 @@
> > >  	/* Leave input param 2 in RDX */
> > >  
> > >  	.if \host
> > > +1:
> > >  	seamcall
> > 
> > So what registers are actually clobbered by SEAMCALL ? There's a
> > distinct lack of it in SDM Vol.2 instruction list :-(
> 
> With the exception of the abomination that is TDH.VP.ENTER all SEAMCALLs
> seem to be limited to the set presented here (c,d,8,9,10,11) and all
> other registers should be available.

RAX is also used as SEAMCALL return code.

Looking at the later versions of TDX spec (with TD live migration, etc), it
seems they are already using R12-R13 as SEAMCALL output:

https://cdrdv2.intel.com/v1/dl/getContent/733579

E.g., 6.3.15. NEW: TDH.IMPORT.MEM Leaf

It uses R12 and R13 as input.

> 
> Can we please make that a hard requirement, SEAMCALL must not use
> registers outside this? We can hardly program to random future
> extentions; we need hard ABI guarantees here.


I believe all other GPRs are just saved/restored in SEAMCALL/SEAMRET, so in
practice all other GPRs not used as input/output should not be clobbered.  But I
will confirm with TDX module guys.  And even it's true in practice it's better
to document it.  

But I think we also want to ask them to stop adding more registers as
input/output.

I'll talk to TDX module team on this.

> 
> That also means we should be able to use si,di for the cmovc below.
> 
> Kirill, back when we did __tdx_hypercall() we got bp removed as a valid
> register, the 1.0 spec still lists that, and it is also listed in
> TDH.VP.ENTER, I'm assuming it will be removed there too?
> 
> bp must not be used -- it violates the pre-existing calling convention.
> 
>
Kirill A . Shutemov June 29, 2023, 11:16 a.m. UTC | #8
On Wed, Jun 28, 2023 at 10:38:23PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 28, 2023 at 05:29:01PM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 27, 2023 at 02:12:50AM +1200, Kai Huang wrote:
> > > diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
> > > index 49a54356ae99..757b0c34be10 100644
> > > --- a/arch/x86/virt/vmx/tdx/tdxcall.S
> > > +++ b/arch/x86/virt/vmx/tdx/tdxcall.S
> > > @@ -1,6 +1,7 @@
> > >  /* SPDX-License-Identifier: GPL-2.0 */
> > >  #include <asm/asm-offsets.h>
> > >  #include <asm/tdx.h>
> > > +#include <asm/asm.h>
> > >  
> > >  /*
> > >   * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
> > > @@ -45,6 +46,7 @@
> > >  	/* Leave input param 2 in RDX */
> > >  
> > >  	.if \host
> > > +1:
> > >  	seamcall
> > 
> > So what registers are actually clobbered by SEAMCALL ? There's a
> > distinct lack of it in SDM Vol.2 instruction list :-(
> 
> With the exception of the abomination that is TDH.VP.ENTER all SEAMCALLs
> seem to be limited to the set presented here (c,d,8,9,10,11) and all
> other registers should be available.
> 
> Can we please make that a hard requirement, SEAMCALL must not use
> registers outside this? We can hardly program to random future
> extentions; we need hard ABI guarantees here.
> 
> That also means we should be able to use si,di for the cmovc below.
> 
> Kirill, back when we did __tdx_hypercall() we got bp removed as a valid
> register, the 1.0 spec still lists that, and it is also listed in
> TDH.VP.ENTER, I'm assuming it will be removed there too?
> 
> bp must not be used -- it violates the pre-existing calling convention.

I've just brought it up again internally. Let's see what will happen.
Kirill A . Shutemov June 30, 2023, 9:03 a.m. UTC | #9
On Wed, Jun 28, 2023 at 11:16:41PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 28, 2023 at 11:11:32PM +0200, Peter Zijlstra wrote:
> > --- a/arch/x86/virt/vmx/tdx/tdxcall.S
> > +++ b/arch/x86/virt/vmx/tdx/tdxcall.S
> > @@ -17,37 +17,44 @@
> >   *            TDX module and hypercalls to the VMM.
> >   * SEAMCALL - used by TDX hosts to make requests to the
> >   *            TDX module.
> > + *
> > + *-------------------------------------------------------------------------
> > + * TDCALL/SEAMCALL ABI:
> > + *-------------------------------------------------------------------------
> > + * Input Registers:
> > + *
> > + * RAX                 - TDCALL Leaf number.
> > + * RCX,RDX,R8-R9       - TDCALL Leaf specific input registers.
> > + *
> > + * Output Registers:
> > + *
> > + * RAX                 - TDCALL instruction error code.
> > + * RCX,RDX,R8-R11      - TDCALL Leaf specific output registers.
> > + *
> > + *-------------------------------------------------------------------------
> > + *
> > + * __tdx_module_call() function ABI:
> > + *
> > + * @fn   (RDI)         - TDCALL Leaf ID,    moved to RAX
> > + * @regs (RSI)         - struct tdx_regs pointer
> > + *
> > + * Return status of TDCALL via RAX.
> >   */
> > +.macro TDX_MODULE_CALL host:req ret:req
> > +	FRAME_BEGIN
> >  
> > +	mov	%rdi, %rax
> > +	mov	$TDX_SEAMCALL_VMFAILINVALID, %rdi
> >  
> > +	mov	TDX_MODULE_rcx(%rsi), %rcx
> > +	mov	TDX_MODULE_rdx(%rsi), %rdx
> > +	mov	TDX_MODULE_r8(%rsi),  %r8
> > +	mov	TDX_MODULE_r9(%rsi),  %r9
> > +//	mov	TDX_MODULE_r10(%rsi), %r10
> > +//	mov	TDX_MODULE_r11(%rsi), %r11
> >  
> > +.if \host
> > +1:	seamcall
> >  	/*
> >  	 * SEAMCALL instruction is essentially a VMExit from VMX root
> >  	 * mode to SEAM VMX root mode.  VMfailInvalid (CF=1) indicates
> 	...
> >  	 * This value will never be used as actual SEAMCALL error code as
> >  	 * it is from the Reserved status code class.
> >  	 */
> > +	cmovc	%rdi, %rax
> >  2:
> > +.else
> >  	tdcall
> > +.endif
> >  
> > +.if \ret
> > +	movq %rcx, TDX_MODULE_rcx(%rsi)
> > +	movq %rdx, TDX_MODULE_rdx(%rsi)
> > +	movq %r8,  TDX_MODULE_r8(%rsi)
> > +	movq %r9,  TDX_MODULE_r9(%rsi)
> > +	movq %r10, TDX_MODULE_r10(%rsi)
> > +	movq %r11, TDX_MODULE_r11(%rsi)
> > +.endif
> > +
> > +	FRAME_END
> > +	RET
> > +
> > +.if \host
> > +3:
> > +	mov	$TDX_SW_ERROR, %rdi
> > +	or	%rdi, %rax
> > +	jmp 2b
> >  
> > +	_ASM_EXTABLE_FAULT(1b, 3b)
> > +.endif
> >  .endm
> 
> Isn't that much simpler?

I'm okay either way.

Obviously, arch/x86/coco/tdx/tdcall.S has to be patched to use the new
TDX_MODULE_CALL macro.
Huang, Kai June 30, 2023, 10:02 a.m. UTC | #10
> 
> I'm okay either way.
> 
> Obviously, arch/x86/coco/tdx/tdcall.S has to be patched to use the new
> TDX_MODULE_CALL macro.
> 

Cool then we have consensus.

Kirill will you do the patch(es), or you want me to do?

Unless Peter is already having this on his hand :)
Peter Zijlstra June 30, 2023, 10:06 a.m. UTC | #11
On Thu, Jun 29, 2023 at 10:33:38AM +0000, Huang, Kai wrote:
> On Wed, 2023-06-28 at 22:38 +0200, Peter Zijlstra wrote:
> > On Wed, Jun 28, 2023 at 05:29:01PM +0200, Peter Zijlstra wrote:
> > > On Tue, Jun 27, 2023 at 02:12:50AM +1200, Kai Huang wrote:
> > > > diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
> > > > index 49a54356ae99..757b0c34be10 100644
> > > > --- a/arch/x86/virt/vmx/tdx/tdxcall.S
> > > > +++ b/arch/x86/virt/vmx/tdx/tdxcall.S
> > > > @@ -1,6 +1,7 @@
> > > >  /* SPDX-License-Identifier: GPL-2.0 */
> > > >  #include <asm/asm-offsets.h>
> > > >  #include <asm/tdx.h>
> > > > +#include <asm/asm.h>
> > > >  
> > > >  /*
> > > >   * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
> > > > @@ -45,6 +46,7 @@
> > > >  	/* Leave input param 2 in RDX */
> > > >  
> > > >  	.if \host
> > > > +1:
> > > >  	seamcall
> > > 
> > > So what registers are actually clobbered by SEAMCALL ? There's a
> > > distinct lack of it in SDM Vol.2 instruction list :-(
> > 
> > With the exception of the abomination that is TDH.VP.ENTER all SEAMCALLs
> > seem to be limited to the set presented here (c,d,8,9,10,11) and all
> > other registers should be available.
> 
> RAX is also used as SEAMCALL return code.
> 
> Looking at the later versions of TDX spec (with TD live migration, etc), it
> seems they are already using R12-R13 as SEAMCALL output:
> 
> https://cdrdv2.intel.com/v1/dl/getContent/733579

Urgh.. I think I read an older versio because I got bleeding eyes from
all this colour coded crap.

All this red is unreadable :-( Have they been told about the glories of
TeX and diff ?

> E.g., 6.3.15. NEW: TDH.IMPORT.MEM Leaf
> 
> It uses R12 and R13 as input.

12 and 14. They skipped 13 for some mysterious raisin.

But also, 10,11 are frequently used as input with this new stuff, which
already suggests the setup from your patches is not tenable.

> > Can we please make that a hard requirement, SEAMCALL must not use
> > registers outside this? We can hardly program to random future
> > extentions; we need hard ABI guarantees here.
> 
> 
> I believe all other GPRs are just saved/restored in SEAMCALL/SEAMRET, so in
> practice all other GPRs not used as input/output should not be clobbered.  But I
> will confirm with TDX module guys.  And even it's true in practice it's better
> to document it.  
> 
> But I think we also want to ask them to stop adding more registers as
> input/output.
> 
> I'll talk to TDX module team on this.

Please, because 12,14 are callee-saved, which means we need to go add
push/pop to preserve them :-(

Then you end up with something like this...

/*
 * TDX_MODULE_CALL - common helper macro for both
 *                 TDCALL and SEAMCALL instructions.
 *
 * TDCALL   - used by TDX guests to make requests to the
 *            TDX module and hypercalls to the VMM.
 * SEAMCALL - used by TDX hosts to make requests to the
 *            TDX module.
 *
 *-------------------------------------------------------------------------
 * TDCALL/SEAMCALL ABI:
 *-------------------------------------------------------------------------
 * Input Registers:
 *
 * RAX                 - TDCALL Leaf number.
 * RCX,RDX,R8-R11      - TDCALL Leaf specific input registers.
 *
 * Output Registers:
 *
 * RAX                 - TDCALL instruction error code.
 * RCX,RDX,R8-R11      - TDCALL Leaf specific output registers.
 * R12-R14	       - extra output registers
 *
 *-------------------------------------------------------------------------
 *
 * __tdx_module_call() function ABI:
 *
 * @fn   (RDI)         - TDCALL Leaf ID,    moved to RAX
 * @regs (RSI)         - struct tdx_regs pointer
 *
 * Return status of TDCALL via RAX.
 */
.macro TDX_MODULE_CALL host:req ret:req extra:0
	FRAME_BEGIN

	movq	%rdi, %rax
	movq	$TDX_SEAMCALL_VMFAILINVALID, %rdi

	movq	TDX_MODULE_rcx(%rsi), %rcx
	movq	TDX_MODULE_rdx(%rsi), %rdx
	movq	TDX_MODULE_r8(%rsi),  %r8
	movq	TDX_MODULE_r9(%rsi),  %r9
	movq	TDX_MODULE_r10(%rsi), %r10
	movq	TDX_MODULE_r11(%rsi), %r11
.if \extra
	pushq	r12
	pushq	r13
	pushq	r14

//	movq	TDX_MODULE_r12(%rsi), %r12
//	movq	TDX_MODULE_r13(%rsi), %r13
//	movq	TDX_MODULE_r14(%rsi), %r14
.endif

.if \host
1:	seamcall
	/*
	 * SEAMCALL instruction is essentially a VMExit from VMX root
	 * mode to SEAM VMX root mode.  VMfailInvalid (CF=1) indicates
	 * that the targeted SEAM firmware is not loaded or disabled,
	 * or P-SEAMLDR is busy with another SEAMCALL.  %rax is not
	 * changed in this case.
	 *
	 * Set %rax to TDX_SEAMCALL_VMFAILINVALID for VMfailInvalid.
	 * This value will never be used as actual SEAMCALL error code as
	 * it is from the Reserved status code class.
	 */
	cmovc	%rdi, %rax
2:
.else
	tdcall
.endif

.if \ret
	movq	%rcx, TDX_MODULE_rcx(%rsi)
	movq	%rdx, TDX_MODULE_rdx(%rsi)
	movq	%r8,  TDX_MODULE_r8(%rsi)
	movq	%r9,  TDX_MODULE_r9(%rsi)
	movq	%r10, TDX_MODULE_r10(%rsi)
	movq	%r11, TDX_MODULE_r11(%rsi)
.endif
.if \extra
	movq	%r12, TDX_MODULE_r12(%rsi)
	movq	%r13, TDX_MODULE_r13(%rsi)
	movq	%r14, TDX_MODULE_r14(%rsi)

	popq	%r14
	popq	%r13
	popq	%r12
.endif

	FRAME_END
	RET

.if \host
3:
	mov	$TDX_SW_ERROR, %rdi
	or	%rdi, %rax
	jmp 2b

	_ASM_EXTABLE_FAULT(1b, 3b)
.endif
.endm
Huang, Kai June 30, 2023, 10:18 a.m. UTC | #12
On Fri, 2023-06-30 at 12:06 +0200, Peter Zijlstra wrote:
> On Thu, Jun 29, 2023 at 10:33:38AM +0000, Huang, Kai wrote:
> > On Wed, 2023-06-28 at 22:38 +0200, Peter Zijlstra wrote:
> > > On Wed, Jun 28, 2023 at 05:29:01PM +0200, Peter Zijlstra wrote:
> > > > On Tue, Jun 27, 2023 at 02:12:50AM +1200, Kai Huang wrote:
> > > > > diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
> > > > > index 49a54356ae99..757b0c34be10 100644
> > > > > --- a/arch/x86/virt/vmx/tdx/tdxcall.S
> > > > > +++ b/arch/x86/virt/vmx/tdx/tdxcall.S
> > > > > @@ -1,6 +1,7 @@
> > > > >  /* SPDX-License-Identifier: GPL-2.0 */
> > > > >  #include <asm/asm-offsets.h>
> > > > >  #include <asm/tdx.h>
> > > > > +#include <asm/asm.h>
> > > > >  
> > > > >  /*
> > > > >   * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
> > > > > @@ -45,6 +46,7 @@
> > > > >  	/* Leave input param 2 in RDX */
> > > > >  
> > > > >  	.if \host
> > > > > +1:
> > > > >  	seamcall
> > > > 
> > > > So what registers are actually clobbered by SEAMCALL ? There's a
> > > > distinct lack of it in SDM Vol.2 instruction list :-(
> > > 
> > > With the exception of the abomination that is TDH.VP.ENTER all SEAMCALLs
> > > seem to be limited to the set presented here (c,d,8,9,10,11) and all
> > > other registers should be available.
> > 
> > RAX is also used as SEAMCALL return code.
> > 
> > Looking at the later versions of TDX spec (with TD live migration, etc), it
> > seems they are already using R12-R13 as SEAMCALL output:
> > 
> > https://cdrdv2.intel.com/v1/dl/getContent/733579
> 
> Urgh.. I think I read an older versio because I got bleeding eyes from
> all this colour coded crap.
> 
> All this red is unreadable :-( Have they been told about the glories of
> TeX and diff ?
> 
> > E.g., 6.3.15. NEW: TDH.IMPORT.MEM Leaf
> > 
> > It uses R12 and R13 as input.
> 
> 12 and 14. They skipped 13 for some mysterious raisin.
> 
> But also, 10,11 are frequently used as input with this new stuff, which
> already suggests the setup from your patches is not tenable.
> 
> > > Can we please make that a hard requirement, SEAMCALL must not use
> > > registers outside this? We can hardly program to random future
> > > extentions; we need hard ABI guarantees here.
> > 
> > 
> > I believe all other GPRs are just saved/restored in SEAMCALL/SEAMRET, so in
> > practice all other GPRs not used as input/output should not be clobbered.  But I
> > will confirm with TDX module guys.  And even it's true in practice it's better
> > to document it.  
> > 
> > But I think we also want to ask them to stop adding more registers as
> > input/output.
> > 
> > I'll talk to TDX module team on this.
> 
> Please, because 12,14 are callee-saved, which means we need to go add
> push/pop to preserve them :-(

Yes.

However those new SEAMCALLs are for TDX guest live migration support,  which is
at a year(s)-later thing from upstreaming's point of view.  My thinking is we
can defer supporting those new SEAMCALls until that phase.  Yes we need to do
some assembly change at that time, but also looks fine to me.

How does this sound?
Peter Zijlstra June 30, 2023, 10:21 a.m. UTC | #13
On Fri, Jun 30, 2023 at 12:07:00PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 29, 2023 at 10:33:38AM +0000, Huang, Kai wrote:

> > Looking at the later versions of TDX spec (with TD live migration, etc), it
> > seems they are already using R12-R13 as SEAMCALL output:
> > 
> > https://cdrdv2.intel.com/v1/dl/getContent/733579
> 
> Urgh.. I think I read an older versio because I got bleeding eyes from
> all this colour coded crap.
> 
> All this red is unreadable :-( Have they been told about the glories of
> TeX and diff ?
> 
> > E.g., 6.3.15. NEW: TDH.IMPORT.MEM Leaf
> > 
> > It uses R12 and R13 as input.
> 
> 12 and 14. They skipped 13 for some mysterious raisin.

Things like TDH.SERVTD.BIND do use R13.

> But also, 10,11 are frequently used as input with this new stuff, which
> already suggests the setup from your patches is not tenable.


TDG.SERVTD.RD *why* can't they pass that TD_UUID as a pointer? Using *4*
registers like that is quite insane.

TDG.VP.ENTER :-(((( that has b,15,si,di as additional output.

That means there's not a single register left unused. Can we still get
this changed, please?!?
Kirill A . Shutemov June 30, 2023, 10:22 a.m. UTC | #14
On Fri, Jun 30, 2023 at 10:02:32AM +0000, Huang, Kai wrote:
> 
> > 
> > I'm okay either way.
> > 
> > Obviously, arch/x86/coco/tdx/tdcall.S has to be patched to use the new
> > TDX_MODULE_CALL macro.
> > 
> 
> Cool then we have consensus.
> 
> Kirill will you do the patch(es), or you want me to do?

Please, do.
Huang, Kai June 30, 2023, 11:05 a.m. UTC | #15
On Fri, 2023-06-30 at 12:21 +0200, Peter Zijlstra wrote:
> On Fri, Jun 30, 2023 at 12:07:00PM +0200, Peter Zijlstra wrote:
> > On Thu, Jun 29, 2023 at 10:33:38AM +0000, Huang, Kai wrote:
> 
> > > Looking at the later versions of TDX spec (with TD live migration, etc), it
> > > seems they are already using R12-R13 as SEAMCALL output:
> > > 
> > > https://cdrdv2.intel.com/v1/dl/getContent/733579
> > 
> > Urgh.. I think I read an older versio because I got bleeding eyes from
> > all this colour coded crap.
> > 
> > All this red is unreadable :-( Have they been told about the glories of
> > TeX and diff ?
> > 
> > > E.g., 6.3.15. NEW: TDH.IMPORT.MEM Leaf
> > > 
> > > It uses R12 and R13 as input.
> > 
> > 12 and 14. They skipped 13 for some mysterious raisin.
> 
> Things like TDH.SERVTD.BIND do use R13.
> 
> > But also, 10,11 are frequently used as input with this new stuff, which
> > already suggests the setup from your patches is not tenable.
> 
> 
> TDG.SERVTD.RD *why* can't they pass that TD_UUID as a pointer? Using *4*
> registers like that is quite insane.

I can ask TDX module team whether they can change to use a buffer instead of 4
registers.

> 
> TDG.VP.ENTER :-(((( that has b,15,si,di as additional output.
> 
> That means there's not a single register left unused. Can we still get
> this changed, please?!?
> 

I assume it's TDH.VP.ENTER.

TDH.VP.ENTER is a special SEAMCALL because it goes into TDX guest to run.  KVM
should be the only place that handles it.  And KVM won't use the __seamcall()
here to handle it but will have it's own assembly.

For normal VMX guests, since most GPRs are not saved/restored by hardware, KVM
already has sufficient code to save/restore relevant GPRs during VMENTER/VMEXIT.
Handling TDX can just follow that pattern.

So looks fine to me.
Huang, Kai June 30, 2023, 11:06 a.m. UTC | #16
On Fri, 2023-06-30 at 13:22 +0300, kirill.shutemov@linux.intel.com wrote:
> On Fri, Jun 30, 2023 at 10:02:32AM +0000, Huang, Kai wrote:
> > 
> > > 
> > > I'm okay either way.
> > > 
> > > Obviously, arch/x86/coco/tdx/tdcall.S has to be patched to use the new
> > > TDX_MODULE_CALL macro.
> > > 
> > 
> > Cool then we have consensus.
> > 
> > Kirill will you do the patch(es), or you want me to do?
> 
> Please, do.
> 

OK I'll do.
Peter Zijlstra June 30, 2023, 12:06 p.m. UTC | #17
On Fri, Jun 30, 2023 at 12:21:41PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 30, 2023 at 12:07:00PM +0200, Peter Zijlstra wrote:
> > On Thu, Jun 29, 2023 at 10:33:38AM +0000, Huang, Kai wrote:
> 
> > > Looking at the later versions of TDX spec (with TD live migration, etc), it
> > > seems they are already using R12-R13 as SEAMCALL output:
> > > 
> > > https://cdrdv2.intel.com/v1/dl/getContent/733579
> > 
> > Urgh.. I think I read an older versio because I got bleeding eyes from
> > all this colour coded crap.
> > 
> > All this red is unreadable :-( Have they been told about the glories of
> > TeX and diff ?
> > 
> > > E.g., 6.3.15. NEW: TDH.IMPORT.MEM Leaf
> > > 
> > > It uses R12 and R13 as input.
> > 
> > 12 and 14. They skipped 13 for some mysterious raisin.
> 
> Things like TDH.SERVTD.BIND do use R13.
> 
> > But also, 10,11 are frequently used as input with this new stuff, which
> > already suggests the setup from your patches is not tenable.
> 
> 
> TDG.SERVTD.RD *why* can't they pass that TD_UUID as a pointer? Using *4*
> registers like that is quite insane.
> 
> TDG.VP.ENTER :-(((( that has b,15,si,di as additional output.
> 
> That means there's not a single register left unused. Can we still get
> this changed, please?!?

Can't :/, VP.ENTER mirrors VP.VMCALL, so we need to deal with both.

So I think the below deals with everything and unifies __tdx_hypercall()
and __tdx_module_call(), since both sides needs to deal with exactly the
same trainwreck.


/*
 * Used for input/output registers values of the TDCALL and SEAMCALL
 * instructions when requesting services from the TDX module.
 *
 * This is a software only structure and not part of the TDX module/VMM ABI.
 */
struct tdx_module_args {
	/* callee-clobbered */
	u64 rdx;
	u64 rcx;
	u64 r8;
	u64 r9;
	/* extra callee-clobbered */
	u64 r10;
	u64 r11;
	/* callee-saved + rdi/rsi */
	u64 rdi;
	u64 rsi;
	u64 rbx;
	u64 r12;
	u64 r13;
	u64 r14;
	u64 r15;
};



/*
 * TDX_MODULE_CALL - common helper macro for both
 *                   TDCALL and SEAMCALL instructions.
 *
 * TDCALL   - used by TDX guests to make requests to the
 *            TDX module and hypercalls to the VMM.
 *
 * SEAMCALL - used by TDX hosts to make requests to the
 *            TDX module.
 *
 *-------------------------------------------------------------------------
 * TDCALL/SEAMCALL ABI:
 *-------------------------------------------------------------------------
 * Input Registers:
 *
 * RAX                 - Leaf number.
 * RCX,RDX,R8-R11      - Leaf specific input registers.
 * RDI,RSI,RBX,R11-R15 - VP.VMCALL VP.ENTER
 *
 * Output Registers:
 *
 * RAX                 - instruction error code.
 * RCX,RDX,R8-R11      - Leaf specific output registers.
 * RDI,RSI,RBX,R12-R15 - VP.VMCALL VP.ENTER
 *
 *-------------------------------------------------------------------------
 *
 * So while the common core (RAX,RCX,RDX,R8-R11) fits nicely in the
 * callee-clobbered registers and even leaves RDI,RSI free to act as a base
 * pointer some rare leafs (VP.VMCALL, VP.ENTER) make a giant mess of things.
 *
 * For simplicity, assume that anything that needs the callee-saved regs also
 * tramples on RDI,RSI. This isn't strictly true, see for example EXPORT.MEM.
 */
.macro TDX_MODULE_CALL host:req ret:req saved:0
	FRAME_BEGIN

	movq	%rdi, %rax

	movq	TDX_MODULE_rcx(%rsi), %rcx
	movq	TDX_MODULE_rdx(%rsi), %rdx
	movq	TDX_MODULE_r8(%rsi),  %r8
	movq	TDX_MODULE_r9(%rsi),  %r9
	movq	TDX_MODULE_r10(%rsi), %r10
	movq	TDX_MODULE_r11(%rsi), %r11

.if \saved
	pushq	rbx
	pushq	r12
	pushq	r13
	pushq	r14
	pushq	r15

	movq	TDX_MODULE_rbx(%rsi), %rbx
	movq	TDX_MODULE_r12(%rsi), %r12
	movq	TDX_MODULE_r13(%rsi), %r13
	movq	TDX_MODULE_r14(%rsi), %r14
	movq	TDX_MODULE_r15(%rsi), %r15

	/* VP.VMCALL and VP.ENTER */
.if \ret
	pushq	%rsi
.endif
	movq	TDX_MODULE_rdi(%rsi), %rdi
	movq	TDX_MODULE_rsi(%rsi), %rsi
.endif

.Lcall:
.if \host
	seamcall
	/*
	 * SEAMCALL instruction is essentially a VMExit from VMX root
	 * mode to SEAM VMX root mode.  VMfailInvalid (CF=1) indicates
	 * that the targeted SEAM firmware is not loaded or disabled,
	 * or P-SEAMLDR is busy with another SEAMCALL. RAX is not
	 * changed in this case.
	 */
	jc	.Lseamfail

.if \saved && \ret
	/*
	 * VP.ENTER clears RSI on output, use it to restore state.
	 */
	popq	%rsi
	xor	%edi,%edi
	movq	%rdi, TDX_MODULE_rdi(%rsi)
	movq	%rdi, TDX_MODULE_rsi(%rsi)
.endif
.else
	tdcall

	/*
	 * RAX!=0 indicates a failure, assume no return values.
	 */
	testq	%rax, %rax
	jne	.Lerror

.if \saved && \ret
	/*
	 * Since RAX==0, it can be used as a scratch register to restore state.
	 *
	 * [ assumes \saved implies \ret ]
	 */
	popq	%rax
	movq	%rdi, TDX_MODULE_rdi(%rax)
	movq	%rsi, TDX_MODULE_rsi(%rax)
	movq	%rax, %rsi
	xor	%eax, %eax;
.endif
.endif // \host

.if \ret
	/* RSI is restored */
	movq	%rcx, TDX_MODULE_rcx(%rsi)
	movq	%rdx, TDX_MODULE_rdx(%rsi)
	movq	%r8,  TDX_MODULE_r8(%rsi)
	movq	%r9,  TDX_MODULE_r9(%rsi)
	movq	%r10, TDX_MODULE_r10(%rsi)
	movq	%r11, TDX_MODULE_r11(%rsi)
.if \saved
	movq	%rbx, TDX_MODULE_rbx(%rsi)
	movq	%r12, TDX_MODULE_r12(%rsi)
	movq	%r13, TDX_MODULE_r13(%rsi)
	movq	%r14, TDX_MODULE_r14(%rsi)
	movq	%r15, TDX_MODULE_r15(%rsi)
.endif
.endif // \ret

.Lout:
.if \saved
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
.endif
	FRAME_END
	RET

	/*
	 * Error and exception handling at .Lcall. Ignore \ret on failure.
	 */
.Lerror:
.if \saved && \ret
	popq	%rsi
.endif
	jmp	.Lout

.if \host
.Lseamfail:
	/*
	 * Set RAX to TDX_SEAMCALL_VMFAILINVALID for VMfailInvalid.
	 * This value will never be used as actual SEAMCALL error code as
	 * it is from the Reserved status code class.
	 */
	movq	$TDX_SEAMCALL_VMFAILINVALID, %rax
	jmp	.Lerror

.Lfault:
	/*
	 * SEAMCALL caused #GP or #UD. Per _ASM_EXTABLE_FAULT() RAX
	 * contains the trap number, convert to a TDX error code by
	 * setting the high word to TDX_SW_ERROR.
	 */
	mov	$TDX_SW_ERROR, %rdi
	or	%rdi, %rax
	jmp	.Lerror

	_ASM_EXTABLE_FAULT(.Lcall, .Lfault)
.endif
.endm
Peter Zijlstra June 30, 2023, 3:14 p.m. UTC | #18
On Fri, Jun 30, 2023 at 02:06:50PM +0200, Peter Zijlstra wrote:
> /*
>  * Used for input/output registers values of the TDCALL and SEAMCALL
>  * instructions when requesting services from the TDX module.
>  *
>  * This is a software only structure and not part of the TDX module/VMM ABI.
>  */
> struct tdx_module_args {
> 	/* callee-clobbered */
> 	u64 rdx;
> 	u64 rcx;
> 	u64 r8;
> 	u64 r9;
> 	/* extra callee-clobbered */
> 	u64 r10;
> 	u64 r11;
> 	/* callee-saved + rdi/rsi */
> 	u64 rdi;
> 	u64 rsi;
> 	u64 rbx;
> 	u64 r12;
> 	u64 r13;
> 	u64 r14;
> 	u64 r15;
> };
> 
> 
> 
> /*
>  * TDX_MODULE_CALL - common helper macro for both
>  *                   TDCALL and SEAMCALL instructions.
>  *
>  * TDCALL   - used by TDX guests to make requests to the
>  *            TDX module and hypercalls to the VMM.
>  *
>  * SEAMCALL - used by TDX hosts to make requests to the
>  *            TDX module.
>  *
>  *-------------------------------------------------------------------------
>  * TDCALL/SEAMCALL ABI:
>  *-------------------------------------------------------------------------
>  * Input Registers:
>  *
>  * RAX                 - Leaf number.
>  * RCX,RDX,R8-R11      - Leaf specific input registers.
>  * RDI,RSI,RBX,R11-R15 - VP.VMCALL VP.ENTER
>  *
>  * Output Registers:
>  *
>  * RAX                 - instruction error code.
>  * RCX,RDX,R8-R11      - Leaf specific output registers.
>  * RDI,RSI,RBX,R12-R15 - VP.VMCALL VP.ENTER
>  *
>  *-------------------------------------------------------------------------
>  *
>  * So while the common core (RAX,RCX,RDX,R8-R11) fits nicely in the
>  * callee-clobbered registers and even leaves RDI,RSI free to act as a base
>  * pointer some rare leafs (VP.VMCALL, VP.ENTER) make a giant mess of things.
>  *
>  * For simplicity, assume that anything that needs the callee-saved regs also
>  * tramples on RDI,RSI. This isn't strictly true, see for example EXPORT.MEM.
>  */
> .macro TDX_MODULE_CALL host:req ret:req saved:0
> 	FRAME_BEGIN
> 
> 	movq	%rdi, %rax
> 
> 	movq	TDX_MODULE_rcx(%rsi), %rcx
> 	movq	TDX_MODULE_rdx(%rsi), %rdx
> 	movq	TDX_MODULE_r8(%rsi),  %r8
> 	movq	TDX_MODULE_r9(%rsi),  %r9
> 	movq	TDX_MODULE_r10(%rsi), %r10
> 	movq	TDX_MODULE_r11(%rsi), %r11
> 
> .if \saved
> 	pushq	rbx
> 	pushq	r12
> 	pushq	r13
> 	pushq	r14
> 	pushq	r15
> 
> 	movq	TDX_MODULE_rbx(%rsi), %rbx
> 	movq	TDX_MODULE_r12(%rsi), %r12
> 	movq	TDX_MODULE_r13(%rsi), %r13
> 	movq	TDX_MODULE_r14(%rsi), %r14
> 	movq	TDX_MODULE_r15(%rsi), %r15
> 
> 	/* VP.VMCALL and VP.ENTER */
> .if \ret
> 	pushq	%rsi
> .endif
> 	movq	TDX_MODULE_rdi(%rsi), %rdi
> 	movq	TDX_MODULE_rsi(%rsi), %rsi
> .endif
> 
> .Lcall:
> .if \host
> 	seamcall
> 	/*
> 	 * SEAMCALL instruction is essentially a VMExit from VMX root
> 	 * mode to SEAM VMX root mode.  VMfailInvalid (CF=1) indicates
> 	 * that the targeted SEAM firmware is not loaded or disabled,
> 	 * or P-SEAMLDR is busy with another SEAMCALL. RAX is not
> 	 * changed in this case.
> 	 */
> 	jc	.Lseamfail
> 
> .if \saved && \ret
> 	/*
> 	 * VP.ENTER clears RSI on output, use it to restore state.
> 	 */
> 	popq	%rsi
> 	xor	%edi,%edi
> 	movq	%rdi, TDX_MODULE_rdi(%rsi)
> 	movq	%rdi, TDX_MODULE_rsi(%rsi)
> .endif
> .else
> 	tdcall
> 
> 	/*
> 	 * RAX!=0 indicates a failure, assume no return values.
> 	 */
> 	testq	%rax, %rax
> 	jne	.Lerror
> 
> .if \saved && \ret
> 	/*
> 	 * Since RAX==0, it can be used as a scratch register to restore state.
> 	 *
> 	 * [ assumes \saved implies \ret ]

This comment is wrong. As should be obvious from the condition above.

> 	 */
> 	popq	%rax
> 	movq	%rdi, TDX_MODULE_rdi(%rax)
> 	movq	%rsi, TDX_MODULE_rsi(%rax)
> 	movq	%rax, %rsi
> 	xor	%eax, %eax;
> .endif
> .endif // \host
> 
> .if \ret
> 	/* RSI is restored */
> 	movq	%rcx, TDX_MODULE_rcx(%rsi)
> 	movq	%rdx, TDX_MODULE_rdx(%rsi)
> 	movq	%r8,  TDX_MODULE_r8(%rsi)
> 	movq	%r9,  TDX_MODULE_r9(%rsi)
> 	movq	%r10, TDX_MODULE_r10(%rsi)
> 	movq	%r11, TDX_MODULE_r11(%rsi)
> .if \saved
> 	movq	%rbx, TDX_MODULE_rbx(%rsi)
> 	movq	%r12, TDX_MODULE_r12(%rsi)
> 	movq	%r13, TDX_MODULE_r13(%rsi)
> 	movq	%r14, TDX_MODULE_r14(%rsi)
> 	movq	%r15, TDX_MODULE_r15(%rsi)
> .endif
> .endif // \ret
> 
> .Lout:
> .if \saved
> 	popq	%r15
> 	popq	%r14
> 	popq	%r13
> 	popq	%r12
> 	popq	%rbx
> .endif
> 	FRAME_END
> 	RET
> 
> 	/*
> 	 * Error and exception handling at .Lcall. Ignore \ret on failure.
> 	 */
> .Lerror:
> .if \saved && \ret
> 	popq	%rsi
> .endif
> 	jmp	.Lout
> 
> .if \host
> .Lseamfail:
> 	/*
> 	 * Set RAX to TDX_SEAMCALL_VMFAILINVALID for VMfailInvalid.
> 	 * This value will never be used as actual SEAMCALL error code as
> 	 * it is from the Reserved status code class.
> 	 */
> 	movq	$TDX_SEAMCALL_VMFAILINVALID, %rax
> 	jmp	.Lerror
> 
> .Lfault:
> 	/*
> 	 * SEAMCALL caused #GP or #UD. Per _ASM_EXTABLE_FAULT() RAX
> 	 * contains the trap number, convert to a TDX error code by
> 	 * setting the high word to TDX_SW_ERROR.
> 	 */
> 	mov	$TDX_SW_ERROR, %rdi
> 	or	%rdi, %rax
> 	jmp	.Lerror
> 
> 	_ASM_EXTABLE_FAULT(.Lcall, .Lfault)
> .endif
> .endm
Dave Hansen June 30, 2023, 3:16 p.m. UTC | #19
On 6/30/23 03:18, Huang, Kai wrote:
>> Please, because 12,14 are callee-saved, which means we need to go add
>> push/pop to preserve them 
Huang, Kai July 1, 2023, 8:16 a.m. UTC | #20
On Fri, 2023-06-30 at 08:16 -0700, Dave Hansen wrote:
> On 6/30/23 03:18, Huang, Kai wrote:
> > > Please, because 12,14 are callee-saved, which means we need to go add
> > > push/pop to preserve them 
Huang, Kai July 3, 2023, 12:15 p.m. UTC | #21
> 
> So I think the below deals with everything and unifies __tdx_hypercall()
> and __tdx_module_call(), since both sides needs to deal with exactly the
> same trainwreck.

Hi Peter,

Just want to make sure I understand you correctly:

You want to make __tdx_module_call() look like __tdx_hypercall(), but not to
unify them into one assembly (at least for now), right?

I am confused you mentioned VP.VMCALL below, which is handled by
__tdx_hypercall().

> 
> 
> /*
>  * Used for input/output registers values of the TDCALL and SEAMCALL
>  * instructions when requesting services from the TDX module.
>  *
>  * This is a software only structure and not part of the TDX module/VMM ABI.
>  */
> struct tdx_module_args {
> 	/* callee-clobbered */
> 	u64 rdx;
> 	u64 rcx;
> 	u64 r8;
> 	u64 r9;
> 	/* extra callee-clobbered */
> 	u64 r10;
> 	u64 r11;
> 	/* callee-saved + rdi/rsi */
> 	u64 rdi;
> 	u64 rsi;
> 	u64 rbx;
> 	u64 r12;
> 	u64 r13;
> 	u64 r14;
> 	u64 r15;
> };
> 
> 
> 
> /*
>  * TDX_MODULE_CALL - common helper macro for both
>  *                   TDCALL and SEAMCALL instructions.
>  *
>  * TDCALL   - used by TDX guests to make requests to the
>  *            TDX module and hypercalls to the VMM.
>  *
>  * SEAMCALL - used by TDX hosts to make requests to the
>  *            TDX module.
>  *
>  *-------------------------------------------------------------------------
>  * TDCALL/SEAMCALL ABI:
>  *-------------------------------------------------------------------------
>  * Input Registers:
>  *
>  * RAX                 - Leaf number.
>  * RCX,RDX,R8-R11      - Leaf specific input registers.
>  * RDI,RSI,RBX,R11-R15 - VP.VMCALL VP.ENTER
>  *
>  * Output Registers:
>  *
>  * RAX                 - instruction error code.
>  * RCX,RDX,R8-R11      - Leaf specific output registers.
>  * RDI,RSI,RBX,R12-R15 - VP.VMCALL VP.ENTER

As mentioned above, VP.VMCALL is handled by __tdx_hypercall().  Also, VP.ENTER
will be handled by KVM's own assembly.  They both are not handled in this
TDX_MODULE_CALL assembly.

>  *
>  *-------------------------------------------------------------------------
>  *
>  * So while the common core (RAX,RCX,RDX,R8-R11) fits nicely in the
>  * callee-clobbered registers and even leaves RDI,RSI free to act as a base
>  * pointer some rare leafs (VP.VMCALL, VP.ENTER) make a giant mess of things.
>  *
>  * For simplicity, assume that anything that needs the callee-saved regs also
>  * tramples on RDI,RSI. This isn't strictly true, see for example EXPORT.MEM.
>  */
> .macro TDX_MODULE_CALL host:req ret:req saved:0
> 	FRAME_BEGIN
> 
> 	movq	%rdi, %rax
> 
> 	movq	TDX_MODULE_rcx(%rsi), %rcx
> 	movq	TDX_MODULE_rdx(%rsi), %rdx
> 	movq	TDX_MODULE_r8(%rsi),  %r8
> 	movq	TDX_MODULE_r9(%rsi),  %r9
> 	movq	TDX_MODULE_r10(%rsi), %r10
> 	movq	TDX_MODULE_r11(%rsi), %r11
> 
> .if \saved
> 	pushq	rbx
> 	pushq	r12
> 	pushq	r13
> 	pushq	r14
> 	pushq	r15
> 
> 	movq	TDX_MODULE_rbx(%rsi), %rbx
> 	movq	TDX_MODULE_r12(%rsi), %r12
> 	movq	TDX_MODULE_r13(%rsi), %r13
> 	movq	TDX_MODULE_r14(%rsi), %r14
> 	movq	TDX_MODULE_r15(%rsi), %r15
> 
> 	/* VP.VMCALL and VP.ENTER */
> .if \ret
> 	pushq	%rsi
> .endif
> 	movq	TDX_MODULE_rdi(%rsi), %rdi
> 	movq	TDX_MODULE_rsi(%rsi), %rsi
> .endif
> 
> .Lcall:
> .if \host
> 	seamcall
> 	/*
> 	 * SEAMCALL instruction is essentially a VMExit from VMX root
> 	 * mode to SEAM VMX root mode.  VMfailInvalid (CF=1) indicates
> 	 * that the targeted SEAM firmware is not loaded or disabled,
> 	 * or P-SEAMLDR is busy with another SEAMCALL. RAX is not
> 	 * changed in this case.
> 	 */
> 	jc	.Lseamfail
> 
> .if \saved && \ret
> 	/*
> 	 * VP.ENTER clears RSI on output, use it to restore state.
> 	 */
> 	popq	%rsi
> 	xor	%edi,%edi
> 	movq	%rdi, TDX_MODULE_rdi(%rsi)
> 	movq	%rdi, TDX_MODULE_rsi(%rsi)
> .endif
> .else
> 	tdcall
> 
> 	/*
> 	 * RAX!=0 indicates a failure, assume no return values.
> 	 */
> 	testq	%rax, %rax
> 	jne	.Lerror

For some SEAMCALL/TDCALL the output registers may contain additional error
information.  We need to jump to a location where whether returning those
additional regs to 'struct tdx_module_args' depends on \ret.

> 
> .if \saved && \ret
> 	/*
> 	 * Since RAX==0, it can be used as a scratch register to restore state.
> 	 *
> 	 * [ assumes \saved implies \ret ]
> 	 */
> 	popq	%rax
> 	movq	%rdi, TDX_MODULE_rdi(%rax)
> 	movq	%rsi, TDX_MODULE_rsi(%rax)
> 	movq	%rax, %rsi
> 	xor	%eax, %eax;
> .endif
> .endif // \host
> 
> .if \ret
> 	/* RSI is restored */
> 	movq	%rcx, TDX_MODULE_rcx(%rsi)
> 	movq	%rdx, TDX_MODULE_rdx(%rsi)
> 	movq	%r8,  TDX_MODULE_r8(%rsi)
> 	movq	%r9,  TDX_MODULE_r9(%rsi)
> 	movq	%r10, TDX_MODULE_r10(%rsi)
> 	movq	%r11, TDX_MODULE_r11(%rsi)
> .if \saved
> 	movq	%rbx, TDX_MODULE_rbx(%rsi)
> 	movq	%r12, TDX_MODULE_r12(%rsi)
> 	movq	%r13, TDX_MODULE_r13(%rsi)
> 	movq	%r14, TDX_MODULE_r14(%rsi)
> 	movq	%r15, TDX_MODULE_r15(%rsi)
> .endif
> .endif // \ret
> 
> .Lout:
> .if \saved
> 	popq	%r15
> 	popq	%r14
> 	popq	%r13
> 	popq	%r12
> 	popq	%rbx
> .endif
> 	FRAME_END
> 	RET
> 
> 	/*
> 	 * Error and exception handling at .Lcall. Ignore \ret on failure.
> 	 */
> .Lerror:
> .if \saved && \ret
> 	popq	%rsi
> .endif
> 	jmp	.Lout
> 
> .if \host
> .Lseamfail:
> 	/*
> 	 * Set RAX to TDX_SEAMCALL_VMFAILINVALID for VMfailInvalid.
> 	 * This value will never be used as actual SEAMCALL error code as
> 	 * it is from the Reserved status code class.
> 	 */
> 	movq	$TDX_SEAMCALL_VMFAILINVALID, %rax
> 	jmp	.Lerror
> 
> .Lfault:
> 	/*
> 	 * SEAMCALL caused #GP or #UD. Per _ASM_EXTABLE_FAULT() RAX
> 	 * contains the trap number, convert to a TDX error code by
> 	 * setting the high word to TDX_SW_ERROR.
> 	 */
> 	mov	$TDX_SW_ERROR, %rdi
> 	or	%rdi, %rax
> 	jmp	.Lerror
> 
> 	_ASM_EXTABLE_FAULT(.Lcall, .Lfault)
> .endif
> .endm
Peter Zijlstra July 5, 2023, 10:21 a.m. UTC | #22
On Mon, Jul 03, 2023 at 12:15:13PM +0000, Huang, Kai wrote:
> 
> > 
> > So I think the below deals with everything and unifies __tdx_hypercall()
> > and __tdx_module_call(), since both sides needs to deal with exactly the
> > same trainwreck.
> 
> Hi Peter,
> 
> Just want to make sure I understand you correctly:
> 
> You want to make __tdx_module_call() look like __tdx_hypercall(), but not to
> unify them into one assembly (at least for now), right?

Well, given the horrendous trainwreck this is all turning into, I
through it prudent to have it all in a single place. The moment you go
play games with callee-saved registers you're really close to what
hypercall does so then they might as well be the same.

> I am confused you mentioned VP.VMCALL below, which is handled by
> __tdx_hypercall().

But why? It really isn't *that* special if you consider the other calls
that are using callee-saved regs, yes it has the rdi/rsi extra, but meh,
it really just is tdcall-0.


> >  *-------------------------------------------------------------------------
> >  * TDCALL/SEAMCALL ABI:
> >  *-------------------------------------------------------------------------
> >  * Input Registers:
> >  *
> >  * RAX                 - Leaf number.
> >  * RCX,RDX,R8-R11      - Leaf specific input registers.
> >  * RDI,RSI,RBX,R11-R15 - VP.VMCALL VP.ENTER
> >  *
> >  * Output Registers:
> >  *
> >  * RAX                 - instruction error code.
> >  * RCX,RDX,R8-R11      - Leaf specific output registers.
> >  * RDI,RSI,RBX,R12-R15 - VP.VMCALL VP.ENTER
> 
> As mentioned above, VP.VMCALL is handled by __tdx_hypercall().  Also, VP.ENTER
> will be handled by KVM's own assembly.  They both are not handled in this
> TDX_MODULE_CALL assembly.

I don't think they should be special, they're really just yet another
leaf call. Yes, they have a shit calling convention, and yes VP.ENTER is
terminally broken for unconditionally clobbering BP :-(

That really *must* be fixed.

> > .Lcall:
> > .if \host
> > 	seamcall
> > 	/*
> > 	 * SEAMCALL instruction is essentially a VMExit from VMX root
> > 	 * mode to SEAM VMX root mode.  VMfailInvalid (CF=1) indicates
> > 	 * that the targeted SEAM firmware is not loaded or disabled,
> > 	 * or P-SEAMLDR is busy with another SEAMCALL. RAX is not
> > 	 * changed in this case.
> > 	 */
> > 	jc	.Lseamfail
> > 
> > .if \saved && \ret
> > 	/*
> > 	 * VP.ENTER clears RSI on output, use it to restore state.
> > 	 */
> > 	popq	%rsi
> > 	xor	%edi,%edi
> > 	movq	%rdi, TDX_MODULE_rdi(%rsi)
> > 	movq	%rdi, TDX_MODULE_rsi(%rsi)
> > .endif
> > .else
> > 	tdcall
> > 
> > 	/*
> > 	 * RAX!=0 indicates a failure, assume no return values.
> > 	 */
> > 	testq	%rax, %rax
> > 	jne	.Lerror
> 
> For some SEAMCALL/TDCALL the output registers may contain additional error
> information.  We need to jump to a location where whether returning those
> additional regs to 'struct tdx_module_args' depends on \ret.

I suppose we can move this into the below conditional :-( The [DS]I
register stuff requires a scratch reg to recover, AX being zero provides
that.

> > .if \saved && \ret
> > 	/*
> > 	 * Since RAX==0, it can be used as a scratch register to restore state.
> > 	 *
> > 	 * [ assumes \saved implies \ret ]
> > 	 */
> > 	popq	%rax
> > 	movq	%rdi, TDX_MODULE_rdi(%rax)
> > 	movq	%rsi, TDX_MODULE_rsi(%rax)
> > 	movq	%rax, %rsi
> > 	xor	%eax, %eax;
> > .endif
> > .endif // \host

So the reason I want this, is that I feel very strongly that if you
cannot write a single coherent wrapper for all this, its calling
convention is fundamentally *too* complex / broken.
Huang, Kai July 5, 2023, 11:34 a.m. UTC | #23
On Wed, 2023-07-05 at 12:21 +0200, Peter Zijlstra wrote:
> On Mon, Jul 03, 2023 at 12:15:13PM +0000, Huang, Kai wrote:
> > 
> > > 
> > > So I think the below deals with everything and unifies __tdx_hypercall()
> > > and __tdx_module_call(), since both sides needs to deal with exactly the
> > > same trainwreck.
> > 
> > Hi Peter,
> > 
> > Just want to make sure I understand you correctly:
> > 
> > You want to make __tdx_module_call() look like __tdx_hypercall(), but not to
> > unify them into one assembly (at least for now), right?
> 
> Well, given the horrendous trainwreck this is all turning into, I
> through it prudent to have it all in a single place. The moment you go
> play games with callee-saved registers you're really close to what
> hypercall does so then they might as well be the same.

OK I understand you now.  Thanks.

Yeah I think from long-term's view, since SEAMCALLs to support live migration
pretty much uses all RCX/RDX/R8-R15 as input/output, it seems reasonable to
unify all of them, although I guess there might be some special handling to
VP.VMCALL and/or VP.ENTER, e.g., below:

        /* TDVMCALL leaf return code is in R10 */                              
        movq %r10, %rax

So long-termly, I don't have objection to that.  But my thinking is for the
first version of TDX host support, we don't have to support all SEAMCALLs but
only those involved in basic TDX support.  Those SEAMCALLs (except VP.ENTER)
only uses RCX/RDX/R8/R9 as input and RCX/RDX/R8-R11 as output, so to me it looks
fine to only make __tdx_module_call() look like __tdx_hypercall() as the first
step.

Also, the new SEAMCALLs to handle live migration all seem to have below
statement:

	AVX, AVX2	May be reset to the architectural INIT state
	and
	AVX512
	state

Which means those SEAMCALLs need to preserve AVX* states too?

And reading the spec, the VP.VMCALL and VP.ENTER also can use XMM0 - XMM15 as
input/output.  Linux VP.VMCALL seems doesn't support using XMM0 - XMM15 as
input/output, but KVM can run other guest OSes too so I think KVM VP.ENTER needs
to handle XMM0-XMM15 as input/output too.

That being said, I think although we can provide a common asm macro to cover
VP.ENTER, I suspect KVM still needs to do additional assembly around the macro
too.  So I am not sure whether we should try to cover VP.ENTER.

And I don't want to speak for KVM maintainers. :)

Hi Sean/Paolo, do you have any comments here?

> 
> > I am confused you mentioned VP.VMCALL below, which is handled by
> > __tdx_hypercall().
> 
> But why? It really isn't *that* special if you consider the other calls
> that are using callee-saved regs, yes it has the rdi/rsi extra, but meh,
> it really just is tdcall-0.

As mentioned above I don't have objection to this :)

> 
> 
> > >  *-------------------------------------------------------------------------
> > >  * TDCALL/SEAMCALL ABI:
> > >  *-------------------------------------------------------------------------
> > >  * Input Registers:
> > >  *
> > >  * RAX                 - Leaf number.
> > >  * RCX,RDX,R8-R11      - Leaf specific input registers.
> > >  * RDI,RSI,RBX,R11-R15 - VP.VMCALL VP.ENTER
> > >  *
> > >  * Output Registers:
> > >  *
> > >  * RAX                 - instruction error code.
> > >  * RCX,RDX,R8-R11      - Leaf specific output registers.
> > >  * RDI,RSI,RBX,R12-R15 - VP.VMCALL VP.ENTER
> > 
> > As mentioned above, VP.VMCALL is handled by __tdx_hypercall().  Also, VP.ENTER
> > will be handled by KVM's own assembly.  They both are not handled in this
> > TDX_MODULE_CALL assembly.
> 
> I don't think they should be special, they're really just yet another
> leaf call. Yes, they have a shit calling convention, and yes VP.ENTER is
> terminally broken for unconditionally clobbering BP :-(
> 
> That really *must* be fixed.

Sure I don't have objection to this, and for VP.ENTER please see above.

But I'd like to say that, generally speaking, from virtualization's point of
view, guest has its own BP and conceptually the hypervisor needs to restore
guest's BP before jumping to the guest.  E.g., for normal VMX guest, KVM always
restores guest's BP before VMENTER (arch/x86/kvm/vmx/vmenter.S):

SYM_FUNC_START(__vmx_vcpu_run)
        push %_ASM_BP
        mov  %_ASM_SP, %_ASM_BP
	
	...
	mov VCPU_RBP(%_ASM_AX), %_ASM_BP
	...
	vmenter/vmresume
	...
SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
	.....
	mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
	...
	pop %_ASM_BP
        RET

> 
> > > .Lcall:
> > > .if \host
> > > 	seamcall
> > > 	/*
> > > 	 * SEAMCALL instruction is essentially a VMExit from VMX root
> > > 	 * mode to SEAM VMX root mode.  VMfailInvalid (CF=1) indicates
> > > 	 * that the targeted SEAM firmware is not loaded or disabled,
> > > 	 * or P-SEAMLDR is busy with another SEAMCALL. RAX is not
> > > 	 * changed in this case.
> > > 	 */
> > > 	jc	.Lseamfail
> > > 
> > > .if \saved && \ret
> > > 	/*
> > > 	 * VP.ENTER clears RSI on output, use it to restore state.
> > > 	 */
> > > 	popq	%rsi
> > > 	xor	%edi,%edi
> > > 	movq	%rdi, TDX_MODULE_rdi(%rsi)
> > > 	movq	%rdi, TDX_MODULE_rsi(%rsi)
> > > .endif
> > > .else
> > > 	tdcall
> > > 
> > > 	/*
> > > 	 * RAX!=0 indicates a failure, assume no return values.
> > > 	 */
> > > 	testq	%rax, %rax
> > > 	jne	.Lerror
> > 
> > For some SEAMCALL/TDCALL the output registers may contain additional error
> > information.  We need to jump to a location where whether returning those
> > additional regs to 'struct tdx_module_args' depends on \ret.
> 
> I suppose we can move this into the below conditional :-( The [DS]I
> register stuff requires a scratch reg to recover, AX being zero provides
> that.

Yeah this can certainly be done in one way or another.

> 
> > > .if \saved && \ret
> > > 	/*
> > > 	 * Since RAX==0, it can be used as a scratch register to restore state.
> > > 	 *
> > > 	 * [ assumes \saved implies \ret ]
> > > 	 */
> > > 	popq	%rax
> > > 	movq	%rdi, TDX_MODULE_rdi(%rax)
> > > 	movq	%rsi, TDX_MODULE_rsi(%rax)
> > > 	movq	%rax, %rsi
> > > 	xor	%eax, %eax;
> > > .endif
> > > .endif // \host
> 
> So the reason I want this, is that I feel very strongly that if you
> cannot write a single coherent wrapper for all this, its calling
> convention is fundamentally *too* complex / broken.

In general I agree, but I am not sure whether there's any detail holding us
back.  :)
Peter Zijlstra July 5, 2023, 12:19 p.m. UTC | #24
On Wed, Jul 05, 2023 at 11:34:53AM +0000, Huang, Kai wrote:

> Yeah I think from long-term's view, since SEAMCALLs to support live migration
> pretty much uses all RCX/RDX/R8-R15 as input/output, it seems reasonable to
> unify all of them, although I guess there might be some special handling to
> VP.VMCALL and/or VP.ENTER, e.g., below:
> 
>         /* TDVMCALL leaf return code is in R10 */                              
>         movq %r10, %rax
> 
> So long-termly, I don't have objection to that.  But my thinking is for the
> first version of TDX host support, we don't have to support all SEAMCALLs but
> only those involved in basic TDX support. 

Since those calls are out now, we should look at them now, there is no
point in delaying the pain. That then gives us two options:

 - we accept them and their wonky calling convention and our code should
   be ready for it.

 - we reject them and send the TDX team a message to please try again
   but with a saner calling convention.

Sticking our head in the sand and pretending like they don't exist isn't
really a viable option at this point.

> Also, the new SEAMCALLs to handle live migration all seem to have below
> statement:
> 
> 	AVX, AVX2	May be reset to the architectural INIT state
> 	and
> 	AVX512
> 	state
> 
> Which means those SEAMCALLs need to preserve AVX* states too?

Yes, we need to ensure the userspace 'FPU' state is saved before
we call them. But I _think_ that KVM already does much of that.

> And reading the spec, the VP.VMCALL and VP.ENTER also can use XMM0 - XMM15 as
> input/output.  Linux VP.VMCALL seems doesn't support using XMM0 - XMM15 as
> input/output, but KVM can run other guest OSes too so I think KVM VP.ENTER needs
> to handle XMM0-XMM15 as input/output too.

Why would KVM accept VMCALLs it doesn't know about? Just trash the
guest and call it a day.

> That being said, I think although we can provide a common asm macro to cover
> VP.ENTER, I suspect KVM still needs to do additional assembly around the macro
> too.  So I am not sure whether we should try to cover VP.ENTER.

Not sure about asm, we have interfaces to save the XMM/AVX regs.
kernel_fpu_begin() comes to mind, but I know there's more of that,
including some for KVM specifically.

> > I don't think they should be special, they're really just yet another
> > leaf call. Yes, they have a shit calling convention, and yes VP.ENTER is
> > terminally broken for unconditionally clobbering BP :-(
> > 
> > That really *must* be fixed.
> 
> Sure I don't have objection to this, and for VP.ENTER please see above.
> 
> But I'd like to say that, generally speaking, from virtualization's point of
> view, guest has its own BP and conceptually the hypervisor needs to restore
> guest's BP before jumping to the guest.  E.g., for normal VMX guest, KVM always
> restores guest's BP before VMENTER (arch/x86/kvm/vmx/vmenter.S):
> 
> SYM_FUNC_START(__vmx_vcpu_run)
>         push %_ASM_BP
>         mov  %_ASM_SP, %_ASM_BP
> 	
> 	...
> 	mov VCPU_RBP(%_ASM_AX), %_ASM_BP
> 	...
> 	vmenter/vmresume
> 	...
> SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
> 	.....
> 	mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
> 	...
> 	pop %_ASM_BP
>         RET

That's disgusting :/ So what happens if we get an NMI after VMENTER and
before POP? Then it sees a garbage BP value.

Why is all this stuff such utter crap?
Peter Zijlstra July 5, 2023, 12:21 p.m. UTC | #25
On Wed, Jul 05, 2023 at 11:34:53AM +0000, Huang, Kai wrote:

> Yeah I think from long-term's view, since SEAMCALLs to support live migration
> pretty much uses all RCX/RDX/R8-R15 as input/output, it seems reasonable to
> unify all of them, although I guess there might be some special handling to
> VP.VMCALL and/or VP.ENTER, e.g., below:
> 
>         /* TDVMCALL leaf return code is in R10 */                              
>         movq %r10, %rax

Well, that's a problem fo whoever does tdcall(0, &args), no?

But did I say it had a crap calling convention already?
Huang, Kai July 5, 2023, 12:53 p.m. UTC | #26
On Wed, 2023-07-05 at 14:19 +0200, Peter Zijlstra wrote:
> On Wed, Jul 05, 2023 at 11:34:53AM +0000, Huang, Kai wrote:
> 
> > Yeah I think from long-term's view, since SEAMCALLs to support live migration
> > pretty much uses all RCX/RDX/R8-R15 as input/output, it seems reasonable to
> > unify all of them, although I guess there might be some special handling to
> > VP.VMCALL and/or VP.ENTER, e.g., below:
> > 
> >         /* TDVMCALL leaf return code is in R10 */                              
> >         movq %r10, %rax
> > 
> > So long-termly, I don't have objection to that.  But my thinking is for the
> > first version of TDX host support, we don't have to support all SEAMCALLs but
> > only those involved in basic TDX support. 
> 
> Since those calls are out now, we should look at them now, there is no
> point in delaying the pain. That then gives us two options:
> 
>  - we accept them and their wonky calling convention and our code should
>    be ready for it.
> 
>  - we reject them and send the TDX team a message to please try again
>    but with a saner calling convention.
> 
> Sticking our head in the sand and pretending like they don't exist isn't
> really a viable option at this point.

OK.  I'll work on this.

But I think even we want to unify __tdx_module_call() and __tdx_hypercall(), the
first step should be making __tdx_module_call() look like __tdx_hypercall()?  I
mean from organizing patchset's point of view, we cannot just do in one big
patch but need to split into small patches with each doing one thing.

By thinking is perhaps we can organize this way:

 1) Patch(es) to make TDX_MODULE_CALL macro / __tdx_module_call() look like
__tdx_hypercall().
 2) Add SEAMCALL support based on TDX_MODULE_CALL, e.g., implement __seamcall().
 3) Unify __tdx_module_call()/__seamcall() with __tdx_hypercall().

Does this look good?

Btw, I've already part 1) based on your code, and sent the patches to Kirill for
review.  Should I sent them out first?

> 
> > Also, the new SEAMCALLs to handle live migration all seem to have below
> > statement:
> > 
> > 	AVX, AVX2	May be reset to the architectural INIT state
> > 	and
> > 	AVX512
> > 	state
> > 
> > Which means those SEAMCALLs need to preserve AVX* states too?
> 
> Yes, we need to ensure the userspace 'FPU' state is saved before
> we call them. But I _think_ that KVM already does much of that.

Let me look into this.

> 
> > And reading the spec, the VP.VMCALL and VP.ENTER also can use XMM0 - XMM15 as
> > input/output.  Linux VP.VMCALL seems doesn't support using XMM0 - XMM15 as
> > input/output, but KVM can run other guest OSes too so I think KVM VP.ENTER needs
> > to handle XMM0-XMM15 as input/output too.
> 
> Why would KVM accept VMCALLs it doesn't know about? Just trash the
> guest and call it a day.
> 
> > That being said, I think although we can provide a common asm macro to cover
> > VP.ENTER, I suspect KVM still needs to do additional assembly around the macro
> > too.  So I am not sure whether we should try to cover VP.ENTER.
> 
> Not sure about asm, we have interfaces to save the XMM/AVX regs.
> kernel_fpu_begin() comes to mind, but I know there's more of that,
> including some for KVM specifically.

Yeah doesn't have to be asm if it can be done in C.

> 
> > > I don't think they should be special, they're really just yet another
> > > leaf call. Yes, they have a shit calling convention, and yes VP.ENTER is
> > > terminally broken for unconditionally clobbering BP :-(
> > > 
> > > That really *must* be fixed.
> > 
> > Sure I don't have objection to this, and for VP.ENTER please see above.
> > 
> > But I'd like to say that, generally speaking, from virtualization's point of
> > view, guest has its own BP and conceptually the hypervisor needs to restore
> > guest's BP before jumping to the guest.  E.g., for normal VMX guest, KVM always
> > restores guest's BP before VMENTER (arch/x86/kvm/vmx/vmenter.S):
> > 
> > SYM_FUNC_START(__vmx_vcpu_run)
> >         push %_ASM_BP
> >         mov  %_ASM_SP, %_ASM_BP
> > 	
> > 	...
> > 	mov VCPU_RBP(%_ASM_AX), %_ASM_BP
> > 	...
> > 	vmenter/vmresume
> > 	...
> > SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
> > 	.....
> > 	mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
> > 	...
> > 	pop %_ASM_BP
> >         RET
> 
> That's disgusting :/ So what happens if we get an NMI after VMENTER and
> before POP? Then it sees a garbage BP value.

Looks so.

> 
> Why is all this stuff such utter crap?
> 

The problem is KVM has to save/restore BP for guest, because VMX hardware
doesn't save/restore BP during VMENTER/VMEXIT.  I am not sure whether there's a
better way to handle.

My brain is getting slow right now as it's 1-hour past midnight already.  I am
hoping Paolo/Sean can jump in here. :)
Isaku Yamahata July 5, 2023, 8:56 p.m. UTC | #27
On Wed, Jul 05, 2023 at 12:53:58PM +0000,
"Huang, Kai" <kai.huang@intel.com> wrote:

> On Wed, 2023-07-05 at 14:19 +0200, Peter Zijlstra wrote:
> > On Wed, Jul 05, 2023 at 11:34:53AM +0000, Huang, Kai wrote:
> > 
> > > Yeah I think from long-term's view, since SEAMCALLs to support live migration
> > > pretty much uses all RCX/RDX/R8-R15 as input/output, it seems reasonable to
> > > unify all of them, although I guess there might be some special handling to
> > > VP.VMCALL and/or VP.ENTER, e.g., below:
> > > 
> > >         /* TDVMCALL leaf return code is in R10 */                              
> > >         movq %r10, %rax
> > > 
> > > So long-termly, I don't have objection to that.  But my thinking is for the
> > > first version of TDX host support, we don't have to support all SEAMCALLs but
> > > only those involved in basic TDX support. 
> > 
> > Since those calls are out now, we should look at them now, there is no
> > point in delaying the pain. That then gives us two options:
> > 
> >  - we accept them and their wonky calling convention and our code should
> >    be ready for it.
> > 
> >  - we reject them and send the TDX team a message to please try again
> >    but with a saner calling convention.
> > 
> > Sticking our head in the sand and pretending like they don't exist isn't
> > really a viable option at this point.
> 
> OK.  I'll work on this.
> 
> But I think even we want to unify __tdx_module_call() and __tdx_hypercall(), the
> first step should be making __tdx_module_call() look like __tdx_hypercall()?  I
> mean from organizing patchset's point of view, we cannot just do in one big
> patch but need to split into small patches with each doing one thing.
> 
> By thinking is perhaps we can organize this way:
> 
>  1) Patch(es) to make TDX_MODULE_CALL macro / __tdx_module_call() look like
> __tdx_hypercall().
>  2) Add SEAMCALL support based on TDX_MODULE_CALL, e.g., implement __seamcall().
>  3) Unify __tdx_module_call()/__seamcall() with __tdx_hypercall().
> 
> Does this look good?
> 
> Btw, I've already part 1) based on your code, and sent the patches to Kirill for
> review.  Should I sent them out first?
> 
> > 
> > > Also, the new SEAMCALLs to handle live migration all seem to have below
> > > statement:
> > > 
> > > 	AVX, AVX2	May be reset to the architectural INIT state
> > > 	and
> > > 	AVX512
> > > 	state
> > > 
> > > Which means those SEAMCALLs need to preserve AVX* states too?
> > 
> > Yes, we need to ensure the userspace 'FPU' state is saved before
> > we call them. But I _think_ that KVM already does much of that.
> 
> Let me look into this.

KVM VCPU_RUN ioctl saves/restores FPU state by kvm_load_guest_fpu() and
kvm_put_guest_fpu() which calls fpu_swap_kvm_fpstate().
Other KVM ioctls doesn't modify FPU.  Because some SEAMCALLs related for live
migration don't preserve FPU state, we need explicit save/restore of FPU state.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index e95c9fbf52e4..8d3f85bcccc1 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -8,6 +8,8 @@ 
 #include <asm/ptrace.h>
 #include <asm/shared/tdx.h>
 
+#include <asm/trapnr.h>
+
 /*
  * SW-defined error codes.
  *
@@ -18,6 +20,9 @@ 
 #define TDX_SW_ERROR			(TDX_ERROR | GENMASK_ULL(47, 40))
 #define TDX_SEAMCALL_VMFAILINVALID	(TDX_SW_ERROR | _UL(0xFFFF0000))
 
+#define TDX_SEAMCALL_GP			(TDX_SW_ERROR | X86_TRAP_GP)
+#define TDX_SEAMCALL_UD			(TDX_SW_ERROR | X86_TRAP_UD)
+
 #ifndef __ASSEMBLY__
 
 /* TDX supported page sizes from the TDX module ABI. */
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 1107f4227568..eba7ff91206d 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -93,6 +93,13 @@  static int __always_unused seamcall(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
 	case TDX_SEAMCALL_VMFAILINVALID:
 		pr_err_once("module is not loaded.\n");
 		return -ENODEV;
+	case TDX_SEAMCALL_GP:
+		pr_err_once("not enabled by BIOS.\n");
+		return -ENODEV;
+	case TDX_SEAMCALL_UD:
+		pr_err_once("SEAMCALL failed: CPU %d is not in VMX operation.\n",
+				cpu);
+		return -EINVAL;
 	default:
 		pr_err_once("SEAMCALL failed: CPU %d: leaf %llu, error 0x%llx.\n",
 				cpu, fn, sret);
diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
index 49a54356ae99..757b0c34be10 100644
--- a/arch/x86/virt/vmx/tdx/tdxcall.S
+++ b/arch/x86/virt/vmx/tdx/tdxcall.S
@@ -1,6 +1,7 @@ 
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <asm/asm-offsets.h>
 #include <asm/tdx.h>
+#include <asm/asm.h>
 
 /*
  * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
@@ -45,6 +46,7 @@ 
 	/* Leave input param 2 in RDX */
 
 	.if \host
+1:
 	seamcall
 	/*
 	 * SEAMCALL instruction is essentially a VMExit from VMX root
@@ -57,10 +59,23 @@ 
 	 * This value will never be used as actual SEAMCALL error code as
 	 * it is from the Reserved status code class.
 	 */
-	jnc .Lno_vmfailinvalid
+	jnc .Lseamcall_out
 	mov $TDX_SEAMCALL_VMFAILINVALID, %rax
-.Lno_vmfailinvalid:
+	jmp .Lseamcall_out
+2:
+	/*
+	 * SEAMCALL caused #GP or #UD.  By reaching here %eax contains
+	 * the trap number.  Convert the trap number to the TDX error
+	 * code by setting TDX_SW_ERROR to the high 32-bits of %rax.
+	 *
+	 * Note cannot OR TDX_SW_ERROR directly to %rax as OR instruction
+	 * only accepts 32-bit immediate at most.
+	 */
+	mov $TDX_SW_ERROR, %r12
+	orq %r12, %rax
 
+	_ASM_EXTABLE_FAULT(1b, 2b)
+.Lseamcall_out:
 	.else
 	tdcall
 	.endif