Message ID | 20191008044613.12350-17-sean.j.christopherson@intel.com (mailing list archive)
---|---
State | New, archived
Series | x86/vdso: sgx: Major vDSO cleanup
On 10/7/2019 9:46 PM, Sean Christopherson wrote:
> Rework __vdso_sgx_enter_enclave() to prioritize the flow where userspace
> is not providing a callback, which is the preferred method of operation.

Processors have branch predictors so your "prioritizing" may not get what
you want.

But if you still insist, a simple ht/hnt prefixing the original branch
instruction would have sufficed. Rearrangement of code blocks is indeed
unnecessary.

A caveat though: for any given process, whether it supplies a callback or
not is usually hard-coded. So either it always takes the callback path, or
it always takes the other. And the branch predictor will do well in both
cases. It's usually unwise to apply ht/hnt prefixes.

> Using a callback requires a retpoline, and the only known motivation for
> employing a callback is to allow the enclave to muck with the stack of
> the untrusted runtime.
>
> Opportunistically replace the majority of the local labels with local
> symbol names to improve the readability of the code.
>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> ---
>  arch/x86/entry/vdso/vsgx_enter_enclave.S | 120 ++++++++++++++---------
>  1 file changed, 71 insertions(+), 49 deletions(-)
>
> diff --git a/arch/x86/entry/vdso/vsgx_enter_enclave.S b/arch/x86/entry/vdso/vsgx_enter_enclave.S
> index de54e47c83f4..fc5622dcd2fa 100644
> --- a/arch/x86/entry/vdso/vsgx_enter_enclave.S
> +++ b/arch/x86/entry/vdso/vsgx_enter_enclave.S
> @@ -85,75 +85,97 @@ ENTRY(__vdso_sgx_enter_enclave)
>          mov %rsp, %rbp
>          .cfi_def_cfa_register %rbp
>
> -1:      /* EENTER <= leaf <= ERESUME */
> +.Lenter_enclave:
> +        /* EENTER <= leaf <= ERESUME */
>          cmp $0x2, %eax
> -        jb 6f
> +        jb .Linvalid_leaf
>          cmp $0x3, %eax
> -        ja 6f
> +        ja .Linvalid_leaf
>
>          /* Load TCS and AEP */
>          mov 0x10(%rbp), %rbx
> -        lea 2f(%rip), %rcx
> +        lea .Lasync_exit_pointer(%rip), %rcx
>
>          /* Single ENCLU serving as both EENTER and AEP (ERESUME) */
> -2:      enclu
> +.Lasync_exit_pointer:
> +.Lenclu_eenter_eresume:
> +        enclu
>
> -        /* EEXIT path */
> +        /* EEXIT jumps here unless the enclave is doing something fancy. */
>          xor %eax, %eax
> -3:      mov %eax, %ecx
> -
> -        /* Call the exit handler if supplied */
> -        mov 0x20(%rbp), %rax
> -        test %rax, %rax
> -        jz 7f
> -        /* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
> -         * restored after the exit handler returns. */
> +
> +        /* Invoke userspace's exit handler if one was provided. */
> +.Lhandle_exit:
> +        cmp $0, 0x20(%rbp)
> +        jne .Linvoke_userspace_handler
> +
> +.Lout:
> +        leave
> +        .cfi_def_cfa %rsp, 8
> +        ret
> +
> +.Linvalid_leaf:

Please set frame pointer back to %rbp here, or stack unwinding will fail.

> +        mov $(-EINVAL), %eax
> +        jmp .Lout
> +
> +.Lhandle_exception:
> +        mov 0x18(%rbp), %rcx
> +        test %rcx, %rcx
> +        je .Lskip_exception_info

A single "jrcxz .Lskip_exception_info" is equivalent to the above 2
instructions combined.

> +
> +        /* Fill optional exception info. */
> +        mov %eax, EX_LEAF(%rcx)
> +        mov %di, EX_TRAPNR(%rcx)
> +        mov %si, EX_ERROR_CODE(%rcx)
> +        mov %rdx, EX_ADDRESS(%rcx)
> +.Lskip_exception_info:
> +        mov $(-EFAULT), %eax
> +        jmp .Lhandle_exit
> +
> +.Linvoke_userspace_handler:
> +        /*
> +         * Align stack per x86_64 ABI. Save the original %rsp in %rbx to be
> +         * restored after the callback returns.
> +         */
>          mov %rsp, %rbx
>          and $-0x10, %rsp
> -        /* Clear RFLAGS.DF per x86_64 ABI */
> -        cld
> -        /* Parameters for the exit handler */
> +
> +        /* Push @e, u_rsp and @tcs as parameters to the callback. */
>          push 0x18(%rbp)
>          push %rbx
>          push 0x10(%rbp)
> -        /* Call *%rax via retpoline */
> -        call 40f
> -        /* Restore %rsp to its original value left off by the enclave from last
> -         * exit */
> +
> +        /* Pass the "return" value to the callback via %rcx. */
> +        mov %eax, %ecx

@e (ex_info) is almost always needed by every callback as it also serves
as the "context pointer". The return value on the other hand is
insignificant because it could be deduced from @e->EX_LEAF anyway. So I'd
retain %rcx and push %rax to the stack instead, given the purpose of this
patch is to squeeze out a bit of performance.

> +
> +        /* Clear RFLAGS.DF per x86_64 ABI */
> +        cld
> +
> +        /* Load the callback pointer to %rax and invoke it via retpoline. */
> +        mov 0x20(%rbp), %rax

Per the x86_64 ABI, %rsp shall be 16-byte aligned before "call". But %rsp
here doesn't look aligned properly.

> +        call .Lretpoline
> +
> +        /* Restore %rsp to its post-exit value. */
>          mov %rbx, %rsp
> -        /* Positive return value from the exit handler will be interpreted as
> -         * an ENCLU leaf, while a non-positive value will be interpreted as the
> -         * return value to be passed back to the caller. */
> -        jmp 1b
> -40:     /* retpoline */
> -        call 42f
> -41:     pause
> -        lfence
> -        jmp 41b
> -42:     mov %rax, (%rsp)
> -        ret
>
> -5:      /* Exception path */
> -        mov 0x18(%rbp), %rcx
> -        jrcxz 52f
> -        mov %eax, EX_LEAF(%rcx)
> -        mov %di, EX_TRAPNR(%rcx)
> -        mov %si, EX_ERROR_CODE(%rcx)
> -        mov %rdx, EX_ADDRESS(%rcx)
> -52:     mov $-EFAULT, %eax
> -        jmp 3b
> -
> -6:      /* Unsupported ENCLU leaf */
> +        /*
> +         * If the return from callback is zero or negative, return immediately,
> +         * else re-execute ENCLU with the positive return value interpreted as
> +         * the requested ENCLU leaf.
> +         */
>          cmp $0, %eax
> -        jle 7f
> -        mov $-EINVAL, %eax
> +        jle .Lout
> +        jmp .Lenter_enclave
>
> -7:      /* Epilog */
> -        leave
> -        .cfi_def_cfa %rsp, 8
> +.Lretpoline:
> +        call 2f
> +1:      pause
> +        lfence
> +        jmp 1b
> +2:      mov %rax, (%rsp)
>          ret
>          .cfi_endproc
>
> -_ASM_VDSO_EXTABLE_HANDLE(2b, 5b)
> +_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception)
>
>  ENDPROC(__vdso_sgx_enter_enclave)
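A note for readers following the thread: the vDSO function under review is
deliberately not a standard C function. As the assembly above shows, it
takes the ENCLU leaf in %eax and @tcs, @e and the callback pointer as the
7th/8th/9th (stack) parameters, which land at 0x10/0x18/0x20(%rbp) after
the prologue. A minimal caller sketch of the no-callback fast path being
debated here follows; the register choices (%r10 = vDSO entry point
located at runtime, %r11 = TCS, %r12 = optional exception-info struct) are
this sketch's own, not anything specified by the patch:

        mov $2, %eax            /* ENCLU leaf: EENTER */
        push $0                 /* no exit-handler callback: fast path */
        push %r12               /* struct sgx_enclave_exception *e, may be 0 */
        push %r11               /* TCS address */
        call *%r10              /* __vdso_sgx_enter_enclave */
        add $0x18, %rsp         /* drop the three stack parameters */

On return, %eax is 0 on a clean EEXIT, -EFAULT if the enclave faulted
(with @e filled in when non-NULL), or -EINVAL for an unsupported leaf,
matching the .Lout, .Lskip_exception_info and .Linvalid_leaf paths above.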
On Wed, Oct 09, 2019 at 11:00:55AM -0700, Xing, Cedric wrote:
> On 10/7/2019 9:46 PM, Sean Christopherson wrote:
> >Rework __vdso_sgx_enter_enclave() to prioritize the flow where userspace
> >is not providing a callback, which is the preferred method of operation.
>
> Processors have branch predictors so your "prioritizing" may not get what
> you want.
>
> But if you still insist, a simple ht/hnt prefixing the original branch
> instruction would have sufficed. Rearrangement of code blocks is indeed
> unnecessary.

Tying into the other thread, it's not a matter of absolute necessity, it's
a matter of us providing the best code possible.

> A caveat though: for any given process, whether it supplies a callback or
> not is usually hard-coded. So either it always takes the callback path, or
> it always takes the other. And the branch predictor will do well in both
> cases. It's usually unwise to apply ht/hnt prefixes.

Readability is the primary concern, performance is a secondary concern.
For a random Joe user, this:

0x0000000000000a20 <+0>: push %rbp
0x0000000000000a21 <+1>: mov %rsp,%rbp
0x0000000000000a24 <+4>: cmp $0x2,%eax
0x0000000000000a27 <+7>: jb 0xa46 <__vdso_sgx_enter_enclave+38>
0x0000000000000a29 <+9>: cmp $0x3,%eax
0x0000000000000a2c <+12>: ja 0xa46 <__vdso_sgx_enter_enclave+38>
0x0000000000000a2e <+14>: mov 0x10(%rbp),%rbx
0x0000000000000a32 <+18>: lea 0x0(%rip),%rcx        # 0xa39 <__vdso_sgx_enter_enclave+25>
0x0000000000000a39 <+25>: enclu
0x0000000000000a3c <+28>: xor %eax,%eax
0x0000000000000a3e <+30>: cmpl $0x0,0x20(%rbp)
0x0000000000000a42 <+34>: jne 0xa6b <__vdso_sgx_enter_enclave+75>
0x0000000000000a44 <+36>: leaveq
0x0000000000000a45 <+37>: retq

is easier to follow for the happy path than this:

0x0000000000000a20 <+0>: push %rbp
0x0000000000000a21 <+1>: mov %rsp,%rbp
0x0000000000000a24 <+4>: cmp $0x2,%eax
0x0000000000000a27 <+7>: jb 0xa8e <__vdso_sgx_enter_enclave+110>
0x0000000000000a29 <+9>: cmp $0x3,%eax
0x0000000000000a2c <+12>: ja 0xa8e <__vdso_sgx_enter_enclave+110>
0x0000000000000a2e <+14>: mov 0x10(%rbp),%rbx
0x0000000000000a32 <+18>: lea 0x0(%rip),%rcx        # 0xa39 <__vdso_sgx_enter_enclave+25>
0x0000000000000a39 <+25>: enclu
0x0000000000000a3c <+28>: xor %eax,%eax
0x0000000000000a3e <+30>: mov %eax,%ecx
0x0000000000000a40 <+32>: mov 0x20(%rbp),%rax
0x0000000000000a44 <+36>: test %rax,%rax
0x0000000000000a47 <+39>: je 0xa98 <__vdso_sgx_enter_enclave+120>
0x0000000000000a49 <+41>: mov %rsp,%rbx
0x0000000000000a4c <+44>: and $0xfffffffffffffff0,%rsp
0x0000000000000a50 <+48>: cld
0x0000000000000a51 <+49>: pushq 0x18(%rbp)
0x0000000000000a54 <+52>: push %rbx
0x0000000000000a55 <+53>: pushq 0x10(%rbp)
0x0000000000000a58 <+56>: callq 0xa62 <__vdso_sgx_enter_enclave+66>
0x0000000000000a5d <+61>: mov %rbx,%rsp
0x0000000000000a60 <+64>: jmp 0xa24 <__vdso_sgx_enter_enclave+4>
0x0000000000000a62 <+66>: callq 0xa6e <__vdso_sgx_enter_enclave+78>
0x0000000000000a67 <+71>: pause
0x0000000000000a69 <+73>: lfence
0x0000000000000a6c <+76>: jmp 0xa67 <__vdso_sgx_enter_enclave+71>
0x0000000000000a6e <+78>: mov %rax,(%rsp)
0x0000000000000a72 <+82>: retq
0x0000000000000a73 <+83>: mov 0x18(%rbp),%rcx
0x0000000000000a77 <+87>: jrcxz 0xa87 <__vdso_sgx_enter_enclave+103>
0x0000000000000a79 <+89>: mov %eax,(%rcx)
0x0000000000000a7b <+91>: mov %di,0x4(%rcx)
0x0000000000000a7f <+95>: mov %si,0x6(%rcx)
0x0000000000000a83 <+99>: mov %rdx,0x8(%rcx)
0x0000000000000a87 <+103>: mov $0xfffffff2,%eax
0x0000000000000a8c <+108>: jmp 0xa3e <__vdso_sgx_enter_enclave+30>
0x0000000000000a8e <+110>: cmp $0x0,%eax
0x0000000000000a91 <+113>: jle 0xa98 <__vdso_sgx_enter_enclave+120>
0x0000000000000a93 <+115>: mov $0xffffffea,%eax
0x0000000000000a98 <+120>: leaveq
0x0000000000000a99 <+121>: retq

and much easier to follow than the version where the exception struct is
filled in on a synchronous EEXIT:

0x0000000000000a20 <+0>: push %rbp
0x0000000000000a21 <+1>: mov %rsp,%rbp
0x0000000000000a24 <+4>: cmp $0x2,%eax
0x0000000000000a27 <+7>: jb 0xa90 <__vdso_sgx_enter_enclave+112>
0x0000000000000a29 <+9>: cmp $0x3,%eax
0x0000000000000a2c <+12>: ja 0xa90 <__vdso_sgx_enter_enclave+112>
0x0000000000000a2e <+14>: mov 0x10(%rbp),%rbx
0x0000000000000a32 <+18>: lea 0x0(%rip),%rcx        # 0xa39 <__vdso_sgx_enter_enclave+25>
0x0000000000000a39 <+25>: enclu
0x0000000000000a3c <+28>: xor %ebx,%ebx
0x0000000000000a3e <+30>: mov 0x18(%rbp),%rcx
0x0000000000000a42 <+34>: jrcxz 0xa54 <__vdso_sgx_enter_enclave+52>
0x0000000000000a44 <+36>: mov %eax,(%rcx)
0x0000000000000a46 <+38>: jae 0xa54 <__vdso_sgx_enter_enclave+52>
0x0000000000000a48 <+40>: mov %di,0x4(%rcx)
0x0000000000000a4c <+44>: mov %si,0x6(%rcx)
0x0000000000000a50 <+48>: mov %rdx,0x8(%rcx)
0x0000000000000a54 <+52>: mov 0x20(%rbp),%rax
0x0000000000000a58 <+56>: test %rax,%rax
0x0000000000000a5b <+59>: cmove %rbx,%rax
0x0000000000000a5f <+63>: je 0xa9a <__vdso_sgx_enter_enclave+122>
...
0x0000000000000a9a <+122>: leaveq
0x0000000000000a9b <+123>: retq

> >Using a callback requires a retpoline, and the only known motivation for
> >employing a callback is to allow the enclave to muck with the stack of
> >the untrusted runtime.
> >
> >Opportunistically replace the majority of the local labels with local
> >symbol names to improve the readability of the code.
> >
> >Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> >---
> > arch/x86/entry/vdso/vsgx_enter_enclave.S | 120 ++++++++++++++---------
> > 1 file changed, 71 insertions(+), 49 deletions(-)
> >
> >diff --git a/arch/x86/entry/vdso/vsgx_enter_enclave.S b/arch/x86/entry/vdso/vsgx_enter_enclave.S
> >index de54e47c83f4..fc5622dcd2fa 100644
> >--- a/arch/x86/entry/vdso/vsgx_enter_enclave.S
> >+++ b/arch/x86/entry/vdso/vsgx_enter_enclave.S
> >@@ -85,75 +85,97 @@ ENTRY(__vdso_sgx_enter_enclave)
> >         mov %rsp, %rbp
> >         .cfi_def_cfa_register %rbp
> >
> >-1:      /* EENTER <= leaf <= ERESUME */
> >+.Lenter_enclave:
> >+        /* EENTER <= leaf <= ERESUME */
> >         cmp $0x2, %eax
> >-        jb 6f
> >+        jb .Linvalid_leaf
> >         cmp $0x3, %eax
> >-        ja 6f
> >+        ja .Linvalid_leaf
> >
> >         /* Load TCS and AEP */
> >         mov 0x10(%rbp), %rbx
> >-        lea 2f(%rip), %rcx
> >+        lea .Lasync_exit_pointer(%rip), %rcx
> >
> >         /* Single ENCLU serving as both EENTER and AEP (ERESUME) */
> >-2:      enclu
> >+.Lasync_exit_pointer:
> >+.Lenclu_eenter_eresume:
> >+        enclu
> >
> >-        /* EEXIT path */
> >+        /* EEXIT jumps here unless the enclave is doing something fancy. */
> >         xor %eax, %eax
> >-3:      mov %eax, %ecx
> >-
> >-        /* Call the exit handler if supplied */
> >-        mov 0x20(%rbp), %rax
> >-        test %rax, %rax
> >-        jz 7f
> >-        /* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
> >-         * restored after the exit handler returns. */
> >+
> >+        /* Invoke userspace's exit handler if one was provided. */
> >+.Lhandle_exit:
> >+        cmp $0, 0x20(%rbp)
> >+        jne .Linvoke_userspace_handler
> >+
> >+.Lout:
> >+        leave
> >+        .cfi_def_cfa %rsp, 8
> >+        ret
> >+
> >+.Linvalid_leaf:
>
> Please set frame pointer back to %rbp here, or stack unwinding will fail.

Sorry, coffee isn't doing its job, what's getting crushed, and where?

> >+        mov $(-EINVAL), %eax
> >+        jmp .Lout
> >+
> >+.Lhandle_exception:
> >+        mov 0x18(%rbp), %rcx
> >+        test %rcx, %rcx
> >+        je .Lskip_exception_info
>
> A single "jrcxz .Lskip_exception_info" is equivalent to the above 2
> instructions combined.

Both implementations take a single uop on CPUs that support SGX. IMO,
using the simpler and more common instructions is more universally
readable.

> >+
> >+        /* Fill optional exception info. */
> >+        mov %eax, EX_LEAF(%rcx)
> >+        mov %di, EX_TRAPNR(%rcx)
> >+        mov %si, EX_ERROR_CODE(%rcx)
> >+        mov %rdx, EX_ADDRESS(%rcx)
> >+.Lskip_exception_info:
> >+        mov $(-EFAULT), %eax
> >+        jmp .Lhandle_exit
> >+
> >+.Linvoke_userspace_handler:
> >+        /*
> >+         * Align stack per x86_64 ABI. Save the original %rsp in %rbx to be
> >+         * restored after the callback returns.
> >+         */
> >         mov %rsp, %rbx
> >         and $-0x10, %rsp
> >-        /* Clear RFLAGS.DF per x86_64 ABI */
> >-        cld
> >-        /* Parameters for the exit handler */
> >+
> >+        /* Push @e, u_rsp and @tcs as parameters to the callback. */
> >         push 0x18(%rbp)
> >         push %rbx
> >         push 0x10(%rbp)
> >-        /* Call *%rax via retpoline */
> >-        call 40f
> >-        /* Restore %rsp to its original value left off by the enclave from last
> >-         * exit */
> >+
> >+        /* Pass the "return" value to the callback via %rcx. */
> >+        mov %eax, %ecx
>
> @e (ex_info) is almost always needed by every callback as it also serves as
> the "context pointer". The return value on the other hand is insignificant
> because it could be deduced from @e->EX_LEAF anyway. So I'd retain %rcx and
> push %rax to the stack instead, given the purpose of this patch is to
> squeeze out a bit of performance.

Please take this up in patch 02/16, which actually introduced this change.

> >+
> >+        /* Clear RFLAGS.DF per x86_64 ABI */
> >+        cld
> >+
> >+        /* Load the callback pointer to %rax and invoke it via retpoline. */
> >+        mov 0x20(%rbp), %rax
>
> Per the x86_64 ABI, %rsp shall be 16-byte aligned before "call". But %rsp
> here doesn't look aligned properly.

Argh, I probably botched it back in patch 02/16 too. I'll see if I can
add a check to verify %rsp alignment in the selftest, verifying via code
inspection is bound to be error prone.

> >+        call .Lretpoline
On Wed, Oct 09, 2019 at 12:10:03PM -0700, Sean Christopherson wrote:
> > >+
> > >+        /* Clear RFLAGS.DF per x86_64 ABI */
> > >+        cld
> > >+
> > >+        /* Load the callback pointer to %rax and invoke it via retpoline. */
> > >+        mov 0x20(%rbp), %rax
> >
> > Per the x86_64 ABI, %rsp shall be 16-byte aligned before "call". But %rsp
> > here doesn't look aligned properly.
>
> Argh, I probably botched it back in patch 02/16 too. I'll see if I can
> add a check to verify %rsp alignment in the selftest, verifying via code
> inspection is bound to be error prone.

Added a selftest, stack is indeed not properly aligned.
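For context, one way such a selftest can detect the bug is with a probe
callback; this is a hypothetical sketch, not the actual selftest code. A
CALL pushes an 8-byte return address, so a callee entered from a correctly
aligned call site must observe %rsp % 16 == 8 at its first instruction:

check_rsp_alignment:
        mov %rsp, %rax
        and $0xf, %rax          /* low four bits of %rsp at entry */
        cmp $0x8, %rax          /* 8 mod 16 iff %rsp was 16-byte aligned at "call" */
        jne 1f
        xor %eax, %eax          /* zero: vDSO returns 0 to its caller */
        ret
1:      mov $-1, %eax           /* negative: propagated back as the return value */
        ret

Registered as the exit-handler callback against the code above, this
handler sees %rsp % 16 == 0 rather than 8, because the vDSO aligns %rsp
and then pushes an odd number of 8-byte parameters before the call, which
is exactly the misalignment Cedric pointed out.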
On 10/9/2019 12:10 PM, Sean Christopherson wrote:
> On Wed, Oct 09, 2019 at 11:00:55AM -0700, Xing, Cedric wrote:
>> On 10/7/2019 9:46 PM, Sean Christopherson wrote:
>>> -        /* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
>>> -         * restored after the exit handler returns. */
>>> +
>>> +        /* Invoke userspace's exit handler if one was provided. */
>>> +.Lhandle_exit:
>>> +        cmp $0, 0x20(%rbp)
>>> +        jne .Linvoke_userspace_handler
>>> +
>>> +.Lout:
>>> +        leave
>>> +        .cfi_def_cfa %rsp, 8
>>> +        ret
>>> +
>>> +.Linvalid_leaf:
>>
>> Please set frame pointer back to %rbp here, or stack unwinding will fail.
>
> Sorry, coffee isn't doing its job, what's getting crushed, and where?

The frame pointer was %rbp but you changed it to %rsp 3 lines ago. That's
correct after "leave" and execution won't pass "ret". But the unwinder
doesn't know. So you have to restore the frame pointer after "ret", by

        .cfi_def_cfa %rbp, 16

As you mentioned in the stack alignment case, we just can't rely on code
review to catch such bugs. We need a test case to make sure all CFI
directives are correct, which was also a request from Andy.

>>> +.Lhandle_exception:
>>> +        mov 0x18(%rbp), %rcx
>>> +        test %rcx, %rcx
>>> +        je .Lskip_exception_info
>>
>> A single "jrcxz .Lskip_exception_info" is equivalent to the above 2
>> instructions combined.
>
> Both implementations take a single uop on CPUs that support SGX. IMO,
> using the simpler and more common instructions is more universally
> readable.

I'm not sure the processor could combine 2 instructions ("test"+"je") into
just 1 uop. And "jrcxz" is also a broadly used instruction.

>>> +        /* Push @e, u_rsp and @tcs as parameters to the callback. */
>>>          push 0x18(%rbp)
>>>          push %rbx
>>>          push 0x10(%rbp)
>>> -        /* Call *%rax via retpoline */
>>> -        call 40f
>>> -        /* Restore %rsp to its original value left off by the enclave from last
>>> -         * exit */
>>> +
>>> +        /* Pass the "return" value to the callback via %rcx. */
>>> +        mov %eax, %ecx
>>
>> @e (ex_info) is almost always needed by every callback as it also serves as
>> the "context pointer". The return value on the other hand is insignificant
>> because it could be deduced from @e->EX_LEAF anyway. So I'd retain %rcx and
>> push %rax to the stack instead, given the purpose of this patch is to
>> squeeze out a bit of performance.
>
> Please take this up in patch 02/16, which actually introduced this change.

My apologies, but I'd like to pull all related discussions into a single
thread.

If you adhere to the convention of "%rcx containing @e", then the code here
could be

        push %rax           // for stack alignment
        push %rax           // return value
        push %rbx           // u_rsp
        push 0x10(%rsp)     // tcs
        // %rcx left unchanged pointing to @e

>>> +        /* Clear RFLAGS.DF per x86_64 ABI */
>>> +        cld
>>> +
>>> +        /* Load the callback pointer to %rax and invoke it via retpoline. */
>>> +        mov 0x20(%rbp), %rax
>>
>> Per the x86_64 ABI, %rsp shall be 16-byte aligned before "call". But %rsp
>> here doesn't look aligned properly.
>
> Argh, I probably botched it back in patch 02/16 too. I'll see if I can
> add a check to verify %rsp alignment in the selftest, verifying via code
> inspection is bound to be error prone.
>
>>> +        call .Lretpoline
On Thu, Oct 10, 2019 at 10:49:59AM -0700, Xing, Cedric wrote:
> On 10/9/2019 12:10 PM, Sean Christopherson wrote:
> >On Wed, Oct 09, 2019 at 11:00:55AM -0700, Xing, Cedric wrote:
> >>On 10/7/2019 9:46 PM, Sean Christopherson wrote:
> >>>-        /* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
> >>>-         * restored after the exit handler returns. */
> >>>+
> >>>+        /* Invoke userspace's exit handler if one was provided. */
> >>>+.Lhandle_exit:
> >>>+        cmp $0, 0x20(%rbp)
> >>>+        jne .Linvoke_userspace_handler
> >>>+
> >>>+.Lout:
> >>>+        leave
> >>>+        .cfi_def_cfa %rsp, 8
> >>>+        ret
> >>>+
> >>>+.Linvalid_leaf:
> >>
> >>Please set frame pointer back to %rbp here, or stack unwinding will fail.
> >
> >Sorry, coffee isn't doing its job, what's getting crushed, and where?
>
> The frame pointer was %rbp but you changed it to %rsp 3 lines ago. That's
> correct after "leave" and execution won't pass "ret". But the unwinder
> doesn't know. So you have to restore the frame pointer after "ret", by
>         .cfi_def_cfa %rbp, 16

Isn't the proper fix to move ".cfi_endproc" here? Which I incorrectly
left after the RET for the retpoline.

> As you mentioned in the stack alignment case, we just can't rely on code
> review to catch such bugs. We need a test case to make sure all CFI
> directives are correct, which was also a request from Andy.

On the todo list...

> >>>+.Lhandle_exception:
> >>>+        mov 0x18(%rbp), %rcx
> >>>+        test %rcx, %rcx
> >>>+        je .Lskip_exception_info
> >>
> >>A single "jrcxz .Lskip_exception_info" is equivalent to the above 2
> >>instructions combined.
> >
> >Both implementations take a single uop on CPUs that support SGX. IMO,
> >using the simpler and more common instructions is more universally
> >readable.
>
> I'm not sure the processor could combine 2 instructions ("test"+"je") into
> just 1 uop. And "jrcxz" is also a broadly used instruction.

TEST+Jcc macrofusion has been supported since Merom (Core 2)[*]. CMP+Jcc
has also been fused since Merom, though not for all Jcc flavors (uarch
specific), whereas TEST can fuse with everything. Sandy Bridge added
fusing of ADD, SUB, INC, DEC, AND and OR, with AND/OR following TEST
in terms of fusing capabilities, the rest following CMP behavior.

[*] https://en.wikichip.org/wiki/macro-operation_fusion

> >>>+        /* Push @e, u_rsp and @tcs as parameters to the callback. */
> >>>         push 0x18(%rbp)
> >>>         push %rbx
> >>>         push 0x10(%rbp)
> >>>-        /* Call *%rax via retpoline */
> >>>-        call 40f
> >>>-        /* Restore %rsp to its original value left off by the enclave from last
> >>>-         * exit */
> >>>+
> >>>+        /* Pass the "return" value to the callback via %rcx. */
> >>>+        mov %eax, %ecx
> >>
> >>@e (ex_info) is almost always needed by every callback as it also serves as
> >>the "context pointer". The return value on the other hand is insignificant
> >>because it could be deduced from @e->EX_LEAF anyway. So I'd retain %rcx and
> >>push %rax to the stack instead, given the purpose of this patch is to
> >>squeeze out a bit of performance.
> >
> >Please take this up in patch 02/16, which actually introduced this change.
>
> My apologies, but I'd like to pull all related discussions into a single
> thread.
>
> If you adhere to the convention of "%rcx containing @e", then the code here
> could be
>         push %rax           // for stack alignment
>         push %rax           // return value
>         push %rbx           // u_rsp
>         push 0x10(%rsp)     // tcs
>         // %rcx left unchanged pointing to @e

Hmm, I still think it makes sense to have @e as the last parameter since
it's the one thing that's optional. What if the callback prototype were
instead:

typedef int (*sgx_enclave_exit_handler_t)(long rdi, long rsi, long rdx,
                                          long ursp, long r8, long r9,
                                          void *tcs, int ret,
                                          struct sgx_enclave_exception *e);

I.e. put @ret and @e next to each other since they go hand-in-hand. For
me, that's visually easier to parse than burying 'int ret' or
'struct ... *e' in the middle of the prototype.

And the relevant asm:

        /* Push @e, "return" value and @tcs as parameters to the callback. */
        push 0x18(%rbp)
        push %eax
        push 0x10(%rbp)

        /* Pass the untrusted RSP (at exit) to the callback via %rcx. */
        mov %ebx, %ecx

> >>>+        /* Clear RFLAGS.DF per x86_64 ABI */
> >>>+        cld
> >>>+
> >>>+        /* Load the callback pointer to %rax and invoke it via retpoline. */
> >>>+        mov 0x20(%rbp), %rax
> >>
> >>Per the x86_64 ABI, %rsp shall be 16-byte aligned before "call". But %rsp
> >>here doesn't look aligned properly.
> >
> >Argh, I probably botched it back in patch 02/16 too. I'll see if I can
> >add a check to verify %rsp alignment in the selftest, verifying via code
> >inspection is bound to be error prone.
> >
> >>>+        call .Lretpoline
On 10/10/2019 4:59 PM, Sean Christopherson wrote:
> On Thu, Oct 10, 2019 at 10:49:59AM -0700, Xing, Cedric wrote:
>> On 10/9/2019 12:10 PM, Sean Christopherson wrote:
>>> On Wed, Oct 09, 2019 at 11:00:55AM -0700, Xing, Cedric wrote:
>>>> On 10/7/2019 9:46 PM, Sean Christopherson wrote:
>>>>> -        /* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
>>>>> -         * restored after the exit handler returns. */
>>>>> +
>>>>> +        /* Invoke userspace's exit handler if one was provided. */
>>>>> +.Lhandle_exit:
>>>>> +        cmp $0, 0x20(%rbp)
>>>>> +        jne .Linvoke_userspace_handler
>>>>> +
>>>>> +.Lout:
>>>>> +        leave
>>>>> +        .cfi_def_cfa %rsp, 8
>>>>> +        ret
>>>>> +
>>>>> +.Linvalid_leaf:
>>>>
>>>> Please set frame pointer back to %rbp here, or stack unwinding will fail.
>>>
>>> Sorry, coffee isn't doing its job, what's getting crushed, and where?
>>
>> The frame pointer was %rbp but you changed it to %rsp 3 lines ago. That's
>> correct after "leave" and execution won't pass "ret". But the unwinder
>> doesn't know. So you have to restore the frame pointer after "ret", by
>>         .cfi_def_cfa %rbp, 16
>
> Isn't the proper fix to move ".cfi_endproc" here? Which I incorrectly
> left after the RET for the retpoline.

No. .cfi_endproc is used by the unwinder to determine if an address falls
within a function. Its location has nothing to do with where RET is but
shall always be at the end of the whole function.

.cfi_def_cfa tells the unwinder where the call frame starts. Here, the
call frame starts at %rbp+16, not %rsp+8, so ".cfi_def_cfa %rbp, 16" is a
must.

>>>>> +.Lhandle_exception:
>>>>> +        mov 0x18(%rbp), %rcx
>>>>> +        test %rcx, %rcx
>>>>> +        je .Lskip_exception_info
>>>>
>>>> A single "jrcxz .Lskip_exception_info" is equivalent to the above 2
>>>> instructions combined.
>>>
>>> Both implementations take a single uop on CPUs that support SGX. IMO,
>>> using the simpler and more common instructions is more universally
>>> readable.
>>
>> I'm not sure the processor could combine 2 instructions ("test"+"je") into
>> just 1 uop. And "jrcxz" is also a broadly used instruction.
>
> TEST+Jcc macrofusion has been supported since Merom (Core 2)[*]. CMP+Jcc
> has also been fused since Merom, though not for all Jcc flavors (uarch
> specific), whereas TEST can fuse with everything. Sandy Bridge added
> fusing of ADD, SUB, INC, DEC, AND and OR, with AND/OR following TEST
> in terms of fusing capabilities, the rest following CMP behavior.
>
> [*] https://en.wikichip.org/wiki/macro-operation_fusion

Good to know. Thanks for the info!

>>>>> +        /* Push @e, u_rsp and @tcs as parameters to the callback. */
>>>>>          push 0x18(%rbp)
>>>>>          push %rbx
>>>>>          push 0x10(%rbp)
>>>>> -        /* Call *%rax via retpoline */
>>>>> -        call 40f
>>>>> -        /* Restore %rsp to its original value left off by the enclave from last
>>>>> -         * exit */
>>>>> +
>>>>> +        /* Pass the "return" value to the callback via %rcx. */
>>>>> +        mov %eax, %ecx
>>>>
>>>> @e (ex_info) is almost always needed by every callback as it also serves as
>>>> the "context pointer". The return value on the other hand is insignificant
>>>> because it could be deduced from @e->EX_LEAF anyway. So I'd retain %rcx and
>>>> push %rax to the stack instead, given the purpose of this patch is to
>>>> squeeze out a bit of performance.
>>>
>>> Please take this up in patch 02/16, which actually introduced this change.
>>
>> My apologies, but I'd like to pull all related discussions into a single
>> thread.
>>
>> If you adhere to the convention of "%rcx containing @e", then the code here
>> could be
>>         push %rax           // for stack alignment
>>         push %rax           // return value
>>         push %rbx           // u_rsp
>>         push 0x10(%rsp)     // tcs
>>         // %rcx left unchanged pointing to @e
>
> Hmm, I still think it makes sense to have @e as the last parameter since
> it's the one thing that's optional. What if the callback prototype were
> instead:
>
> typedef int (*sgx_enclave_exit_handler_t)(long rdi, long rsi, long rdx,
>                                           long ursp, long r8, long r9,
>                                           void *tcs, int ret,
>                                           struct sgx_enclave_exception *e);
>
> I.e. put @ret and @e next to each other since they go hand-in-hand. For
> me, that's visually easier to parse than burying 'int ret' or
> 'struct ... *e' in the middle of the prototype.
>
> And the relevant asm:
>
>         /* Push @e, "return" value and @tcs as parameters to the callback. */
>         push 0x18(%rbp)
>         push %eax
>         push 0x10(%rbp)
>
>         /* Pass the untrusted RSP (at exit) to the callback via %rcx. */
>         mov %ebx, %ecx

Looks good to me. Don't forget to align the stack though, and ursp shall
be 64-bit. That is,

        push %rax           // align stack
        push %rcx           // @e
        push %rax           // @ret
        push 0x10(%rsp)     // @tcs
        mov %rbx, %rcx      // @ursp
On Wed, Oct 16, 2019 at 03:18:05PM -0700, Xing, Cedric wrote:
> On 10/10/2019 4:59 PM, Sean Christopherson wrote:
> >On Thu, Oct 10, 2019 at 10:49:59AM -0700, Xing, Cedric wrote:
> >>On 10/9/2019 12:10 PM, Sean Christopherson wrote:
> >>>On Wed, Oct 09, 2019 at 11:00:55AM -0700, Xing, Cedric wrote:
> >>>>On 10/7/2019 9:46 PM, Sean Christopherson wrote:
> >>>>>-        /* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
> >>>>>-         * restored after the exit handler returns. */
> >>>>>+
> >>>>>+        /* Invoke userspace's exit handler if one was provided. */
> >>>>>+.Lhandle_exit:
> >>>>>+        cmp $0, 0x20(%rbp)
> >>>>>+        jne .Linvoke_userspace_handler
> >>>>>+
> >>>>>+.Lout:
> >>>>>+        leave
> >>>>>+        .cfi_def_cfa %rsp, 8
> >>>>>+        ret
> >>>>>+
> >>>>>+.Linvalid_leaf:
> >>>>
> >>>>Please set frame pointer back to %rbp here, or stack unwinding will fail.
> >>>
> >>>Sorry, coffee isn't doing its job, what's getting crushed, and where?
> >>
> >>The frame pointer was %rbp but you changed it to %rsp 3 lines ago. That's
> >>correct after "leave" and execution won't pass "ret". But the unwinder
> >>doesn't know. So you have to restore the frame pointer after "ret", by
> >>        .cfi_def_cfa %rbp, 16
> >
> >Isn't the proper fix to move ".cfi_endproc" here? Which I incorrectly
> >left after the RET for the retpoline.
>
> No. .cfi_endproc is used by the unwinder to determine if an address falls
> within a function. Its location has nothing to do with where RET is but
> shall always be at the end of the whole function.
>
> .cfi_def_cfa tells the unwinder where the call frame starts. Here, the
> call frame starts at %rbp+16, not %rsp+8, so ".cfi_def_cfa %rbp, 16" is a
> must.

Ahh, I understand now, hopefully. I was thinking the .cfi directives
would magically understand the control flow. Thanks!

> >>>>>+.Lhandle_exception:
> >>>>>+        mov 0x18(%rbp), %rcx
> >>>>>+        test %rcx, %rcx
> >>>>>+        je .Lskip_exception_info
> >>>>
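To make the resolution of this sub-thread concrete, here is a sketch of
the epilog with the annotation Cedric describes; it is assembled from the
discussion above, not the final committed code. The .cfi directives are
matched by instruction address, not by control flow, so the CFA must be
switched back to the %rbp-based frame for the code placed after the
mid-function RET:

.Lout:
        leave
        .cfi_def_cfa %rsp, 8
        ret

        /*
         * Not reachable by fall-through, but the unwinder matches
         * annotations by address: the code from here to .cfi_endproc
         * still runs with the frame pointer intact, so re-establish
         * the %rbp-based CFA.
         */
        .cfi_def_cfa %rbp, 16

.Linvalid_leaf:
        mov $(-EINVAL), %eax
        jmp .Lout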
diff --git a/arch/x86/entry/vdso/vsgx_enter_enclave.S b/arch/x86/entry/vdso/vsgx_enter_enclave.S
index de54e47c83f4..fc5622dcd2fa 100644
--- a/arch/x86/entry/vdso/vsgx_enter_enclave.S
+++ b/arch/x86/entry/vdso/vsgx_enter_enclave.S
@@ -85,75 +85,97 @@ ENTRY(__vdso_sgx_enter_enclave)
         mov %rsp, %rbp
         .cfi_def_cfa_register %rbp
 
-1:      /* EENTER <= leaf <= ERESUME */
+.Lenter_enclave:
+        /* EENTER <= leaf <= ERESUME */
         cmp $0x2, %eax
-        jb 6f
+        jb .Linvalid_leaf
         cmp $0x3, %eax
-        ja 6f
+        ja .Linvalid_leaf
 
         /* Load TCS and AEP */
         mov 0x10(%rbp), %rbx
-        lea 2f(%rip), %rcx
+        lea .Lasync_exit_pointer(%rip), %rcx
 
         /* Single ENCLU serving as both EENTER and AEP (ERESUME) */
-2:      enclu
+.Lasync_exit_pointer:
+.Lenclu_eenter_eresume:
+        enclu
 
-        /* EEXIT path */
+        /* EEXIT jumps here unless the enclave is doing something fancy. */
         xor %eax, %eax
-3:      mov %eax, %ecx
-
-        /* Call the exit handler if supplied */
-        mov 0x20(%rbp), %rax
-        test %rax, %rax
-        jz 7f
-        /* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
-         * restored after the exit handler returns. */
+
+        /* Invoke userspace's exit handler if one was provided. */
+.Lhandle_exit:
+        cmp $0, 0x20(%rbp)
+        jne .Linvoke_userspace_handler
+
+.Lout:
+        leave
+        .cfi_def_cfa %rsp, 8
+        ret
+
+.Linvalid_leaf:
+        mov $(-EINVAL), %eax
+        jmp .Lout
+
+.Lhandle_exception:
+        mov 0x18(%rbp), %rcx
+        test %rcx, %rcx
+        je .Lskip_exception_info
+
+        /* Fill optional exception info. */
+        mov %eax, EX_LEAF(%rcx)
+        mov %di, EX_TRAPNR(%rcx)
+        mov %si, EX_ERROR_CODE(%rcx)
+        mov %rdx, EX_ADDRESS(%rcx)
+.Lskip_exception_info:
+        mov $(-EFAULT), %eax
+        jmp .Lhandle_exit
+
+.Linvoke_userspace_handler:
+        /*
+         * Align stack per x86_64 ABI. Save the original %rsp in %rbx to be
+         * restored after the callback returns.
+         */
         mov %rsp, %rbx
         and $-0x10, %rsp
-        /* Clear RFLAGS.DF per x86_64 ABI */
-        cld
-        /* Parameters for the exit handler */
+
+        /* Push @e, u_rsp and @tcs as parameters to the callback. */
         push 0x18(%rbp)
         push %rbx
         push 0x10(%rbp)
-        /* Call *%rax via retpoline */
-        call 40f
-        /* Restore %rsp to its original value left off by the enclave from last
-         * exit */
+
+        /* Pass the "return" value to the callback via %rcx. */
+        mov %eax, %ecx
+
+        /* Clear RFLAGS.DF per x86_64 ABI */
+        cld
+
+        /* Load the callback pointer to %rax and invoke it via retpoline. */
+        mov 0x20(%rbp), %rax
+        call .Lretpoline
+
+        /* Restore %rsp to its post-exit value. */
         mov %rbx, %rsp
-        /* Positive return value from the exit handler will be interpreted as
-         * an ENCLU leaf, while a non-positive value will be interpreted as the
-         * return value to be passed back to the caller. */
-        jmp 1b
-40:     /* retpoline */
-        call 42f
-41:     pause
-        lfence
-        jmp 41b
-42:     mov %rax, (%rsp)
-        ret
 
-5:      /* Exception path */
-        mov 0x18(%rbp), %rcx
-        jrcxz 52f
-        mov %eax, EX_LEAF(%rcx)
-        mov %di, EX_TRAPNR(%rcx)
-        mov %si, EX_ERROR_CODE(%rcx)
-        mov %rdx, EX_ADDRESS(%rcx)
-52:     mov $-EFAULT, %eax
-        jmp 3b
-
-6:      /* Unsupported ENCLU leaf */
+        /*
+         * If the return from callback is zero or negative, return immediately,
+         * else re-execute ENCLU with the positive return value interpreted as
+         * the requested ENCLU leaf.
+         */
         cmp $0, %eax
-        jle 7f
-        mov $-EINVAL, %eax
+        jle .Lout
+        jmp .Lenter_enclave
 
-7:      /* Epilog */
-        leave
-        .cfi_def_cfa %rsp, 8
+.Lretpoline:
+        call 2f
+1:      pause
+        lfence
+        jmp 1b
+2:      mov %rax, (%rsp)
         ret
         .cfi_endproc
 
-_ASM_VDSO_EXTABLE_HANDLE(2b, 5b)
+_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception)
 
 ENDPROC(__vdso_sgx_enter_enclave)
Rework __vdso_sgx_enter_enclave() to prioritize the flow where userspace
is not providing a callback, which is the preferred method of operation.
Using a callback requires a retpoline, and the only known motivation for
employing a callback is to allow the enclave to muck with the stack of
the untrusted runtime.

Opportunistically replace the majority of the local labels with local
symbol names to improve the readability of the code.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
 arch/x86/entry/vdso/vsgx_enter_enclave.S | 120 ++++++++++++++---------
 1 file changed, 71 insertions(+), 49 deletions(-)
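The retpoline mentioned in the first paragraph is the .Lretpoline
sequence in the diff above. For reference, an annotated copy follows; the
instructions are the patch's, the annotations are not:

.Lretpoline:
        call 2f                 /* pushes the address of 1f, which the
                                 * return stack buffer will predict as
                                 * the target of the RET below */
1:      pause                   /* speculation trap */
        lfence
        jmp 1b                  /* a mispredicted "return" spins here
                                 * harmlessly until speculation resolves */
2:      mov %rax, (%rsp)        /* overwrite the pushed return address
                                 * with the real callback pointer */
        ret                     /* architecturally jumps to *%rax, with no
                                 * indirect branch for Spectre v2 to poison */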