
[v3,5/4] x86: reduce code size of struct cpu_info member accesses

Message ID 56EAE5EE02000078000DDFB5@prv-mh.provo.novell.com (mailing list archive)
State New, archived

Commit Message

Jan Beulich March 17, 2016, 4:14 p.m. UTC
Instead of addressing these fields via the base of the stack (which
uniformly requires 4-byte displacements), address them from the end
(which for everything other than guest_cpu_user_regs requires just
1-byte ones). This yields a code size reduction somewhere between 8k
and 12k in my builds.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Note that just like patch 4 of the series this also isn't directly
related to the SMEP/SMAP issue, but is again just a result of things
realized while doing that work, and again depends on the earlier
patches to apply cleanly.
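
To make the displacement claim concrete, here is a minimal sketch with
made-up numbers (Xen's actual STACK_SIZE and struct cpu_info layout are
assumptions here, not the real values):

    #include <stdio.h>

    #define STACK_SIZE     0x8000L /* assumed 32KiB per-CPU stack */
    #define CPUINFO_sizeof 0xd0L   /* assumed sizeof(struct cpu_info) */
    #define CPUINFO_cr4    0xc8L   /* assumed offset of the cr4 field */

    int main(void)
    {
        /* Old scheme: displacement from the base of the stack. */
        long from_base = STACK_SIZE - CPUINFO_sizeof + CPUINFO_cr4;
        /* New scheme: displacement from the last byte of the stack. */
        long from_end = 1 - CPUINFO_sizeof + CPUINFO_cr4;

        /* 0x7ff8 needs a 4-byte displacement in the x86 instruction
         * encoding; -7 fits the 1-byte sign-extended form. */
        printf("base-relative 0x%lx, end-relative %ld\n",
               (unsigned long)from_base, from_end);
        return 0;
    }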

--- a/xen/arch/x86/hvm/svm/entry.S
+++ b/xen/arch/x86/hvm/svm/entry.S
@@ -31,7 +31,7 @@
 #define CLGI   .byte 0x0F,0x01,0xDD
 
 ENTRY(svm_asm_do_resume)
-        GET_CURRENT(%rbx)
+        GET_CURRENT(bx)
 .Lsvm_do_resume:
         call svm_intr_assist
         mov  %rsp,%rdi
@@ -97,7 +97,7 @@ UNLIKELY_END(svm_trace)
 
         VMRUN
 
-        GET_CURRENT(%rax)
+        GET_CURRENT(ax)
         push %rdi
         push %rsi
         push %rdx
--- a/xen/arch/x86/hvm/vmx/entry.S
+++ b/xen/arch/x86/hvm/vmx/entry.S
@@ -40,7 +40,7 @@ ENTRY(vmx_asm_vmexit_handler)
         push %r10
         push %r11
         push %rbx
-        GET_CURRENT(%rbx)
+        GET_CURRENT(bx)
         push %rbp
         push %r12
         push %r13
@@ -113,7 +113,7 @@ UNLIKELY_END(realmode)
         BUG  /* vmx_vmentry_failure() shouldn't return. */
 
 ENTRY(vmx_asm_do_vmentry)
-        GET_CURRENT(%rbx)
+        GET_CURRENT(bx)
         jmp  .Lvmx_do_vmentry
 
 .Lvmx_goto_emulator:
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -26,7 +26,7 @@ UNLIKELY_START(ne, msi_check)
 UNLIKELY_END(msi_check)
 
         movl  UREGS_rax(%rsp),%eax
-        GET_CURRENT(%rbx)
+        GET_CURRENT(bx)
 
         cmpl  $NR_hypercalls,%eax
         jae   compat_bad_hypercall
@@ -202,7 +202,7 @@ ENTRY(compat_restore_all_guest)
 /* This mustn't modify registers other than %rax. */
 ENTRY(cr4_pv32_restore)
         push  %rdx
-        GET_CPUINFO_FIELD(cr4, %rdx)
+        GET_CPUINFO_FIELD(cr4, dx)
         mov   (%rdx), %rax
         test  $X86_CR4_SMEP|X86_CR4_SMAP,%eax
         jnz   0f
@@ -245,7 +245,7 @@ ENTRY(cstar_enter)
         pushq %rcx
         pushq $0
         SAVE_VOLATILE TRAP_syscall
-        GET_CURRENT(%rbx)
+        GET_CURRENT(bx)
         movq  VCPU_domain(%rbx),%rcx
         cmpb  $0,DOMAIN_is_32bit_pv(%rcx)
         je    switch_to_kernel
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -97,7 +97,7 @@ ENTRY(lstar_enter)
         pushq %rcx
         pushq $0
         SAVE_VOLATILE TRAP_syscall
-        GET_CURRENT(%rbx)
+        GET_CURRENT(bx)
         testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
         jz    switch_to_kernel
 
@@ -246,7 +246,7 @@ GLOBAL(sysenter_eflags_saved)
         pushq $0 /* null rip */
         pushq $0
         SAVE_VOLATILE TRAP_syscall
-        GET_CURRENT(%rbx)
+        GET_CURRENT(bx)
         cmpb  $0,VCPU_sysenter_disables_events(%rbx)
         movq  VCPU_sysenter_addr(%rbx),%rax
         setne %cl
@@ -288,7 +288,7 @@ UNLIKELY_START(ne, msi_check)
         call  check_for_unexpected_msi
 UNLIKELY_END(msi_check)
 
-        GET_CURRENT(%rbx)
+        GET_CURRENT(bx)
 
         /* Check that the callback is non-null. */
         leaq  VCPU_int80_bounce(%rbx),%rdx
@@ -420,10 +420,10 @@ domain_crash_page_fault:
         call  show_page_walk
 ENTRY(dom_crash_sync_extable)
         # Get out of the guest-save area of the stack.
-        GET_STACK_BASE(%rax)
+        GET_STACK_END(ax)
         leaq  STACK_CPUINFO_FIELD(guest_cpu_user_regs)(%rax),%rsp
         # create_bounce_frame() temporarily clobbers CS.RPL. Fix up.
-        __GET_CURRENT(%rax)
+        __GET_CURRENT(ax)
         movq  VCPU_domain(%rax),%rax
         testb $1,DOMAIN_is_32bit_pv(%rax)
         setz  %al
@@ -441,7 +441,7 @@ ENTRY(common_interrupt)
 
 /* No special register assumptions. */
 ENTRY(ret_from_intr)
-        GET_CURRENT(%rbx)
+        GET_CURRENT(bx)
         testb $3,UREGS_cs(%rsp)
         jz    restore_all_xen
         movq  VCPU_domain(%rbx),%rax
@@ -455,7 +455,7 @@ ENTRY(page_fault)
 GLOBAL(handle_exception)
         SAVE_ALL CLAC
 handle_exception_saved:
-        GET_CURRENT(%rbx)
+        GET_CURRENT(bx)
         testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
         jz    exception_with_ints_disabled
 
@@ -649,7 +649,7 @@ handle_ist_exception:
         testb $3,UREGS_cs(%rsp)
         jz    1f
         /* Interrupted guest context. Copy the context to stack bottom. */
-        GET_CPUINFO_FIELD(guest_cpu_user_regs,%rdi)
+        GET_CPUINFO_FIELD(guest_cpu_user_regs,di)
         movq  %rsp,%rsi
         movl  $UREGS_kernel_sizeof/8,%ecx
         movq  %rdi,%rsp
@@ -664,7 +664,7 @@ handle_ist_exception:
         /* We want to get straight to the IRET on the NMI exit path. */
         testb $3,UREGS_cs(%rsp)
         jz    restore_all_xen
-        GET_CURRENT(%rbx)
+        GET_CURRENT(bx)
         /* Send an IPI to ourselves to cover for the lack of event checking. */
         movl  VCPU_processor(%rbx),%eax
         shll  $IRQSTAT_shift,%eax
--- a/xen/include/asm-x86/asm_defns.h
+++ b/xen/include/asm-x86/asm_defns.h
@@ -127,19 +127,19 @@ void ret_from_intr(void);
         UNLIKELY_DONE(mp, tag);   \
         __UNLIKELY_END(tag)
 
-#define STACK_CPUINFO_FIELD(field) (STACK_SIZE-CPUINFO_sizeof+CPUINFO_##field)
-#define GET_STACK_BASE(reg)                       \
-        movq $~(STACK_SIZE-1),reg;                \
-        andq %rsp,reg
+#define STACK_CPUINFO_FIELD(field) (1 - CPUINFO_sizeof + CPUINFO_##field)
+#define GET_STACK_END(reg)                        \
+        movl $STACK_SIZE-1, %e##reg;              \
+        orq  %rsp, %r##reg
 
 #define GET_CPUINFO_FIELD(field, reg)             \
-        GET_STACK_BASE(reg);                      \
-        addq $STACK_CPUINFO_FIELD(field),reg
+        GET_STACK_END(reg);                       \
+        addq $STACK_CPUINFO_FIELD(field), %r##reg
 
 #define __GET_CURRENT(reg)                        \
-        movq STACK_CPUINFO_FIELD(current_vcpu)(reg),reg
+        movq STACK_CPUINFO_FIELD(current_vcpu)(%r##reg), %r##reg
 #define GET_CURRENT(reg)                          \
-        GET_STACK_BASE(reg);                      \
+        GET_STACK_END(reg);                       \
         __GET_CURRENT(reg)
 
 #ifndef NDEBUG
--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -55,7 +55,7 @@ static inline struct cpu_info *get_cpu_i
     register unsigned long sp asm("rsp");
 #endif
 
-    return (struct cpu_info *)((sp & ~(STACK_SIZE-1)) + STACK_SIZE) - 1;
+    return (struct cpu_info *)((sp | (STACK_SIZE - 1)) + 1) - 1;
 }
 
 #define get_current()         (get_cpu_info()->current_vcpu)
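
The equivalence of the old and new get_cpu_info() arithmetic is easy to
check with a throwaway program (a sketch, assuming only that STACK_SIZE
is a power of two):

    #include <assert.h>
    #include <stdint.h>

    #define STACK_SIZE 0x8000UL /* any power of two */

    int main(void)
    {
        /* For every rsp within a stack, masking down to the base and
         * adding STACK_SIZE matches rounding up via OR-and-increment. */
        for (uintptr_t base = 0; base < 4 * STACK_SIZE; base += STACK_SIZE)
            for (uintptr_t sp = base; sp < base + STACK_SIZE; sp += 8)
                assert((sp & ~(STACK_SIZE - 1)) + STACK_SIZE ==
                       (sp | (STACK_SIZE - 1)) + 1);
        return 0;
    }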

Comments

Konrad Rzeszutek Wilk March 25, 2016, 6:47 p.m. UTC | #1
On Thu, Mar 17, 2016 at 10:14:22AM -0600, Jan Beulich wrote:

Something is off with your patch. This is 5/4 :-)

> Instead of addressing these fields via the base of the stack (which
> uniformly requires 4-byte displacements), address them from the end
> (which for everything other than guest_cpu_user_regs requires just
> 1-byte ones). This yields a code size reduction somewhere between 8k
> and 12k in my builds.

Also you made the macro a bit different - the %r is removed.

Particular reason? 

> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Note that just like patch 4 of the series this also isn't directly
> related to the SMEP/SMAP issue, but is again just a result of things
> realized while doing that work, and again depends on the earlier
> patches to apply cleanly.
> 

.. snip..
> --- a/xen/include/asm-x86/asm_defns.h
> +++ b/xen/include/asm-x86/asm_defns.h
> @@ -127,19 +127,19 @@ void ret_from_intr(void);
>          UNLIKELY_DONE(mp, tag);   \
>          __UNLIKELY_END(tag)
>  
> -#define STACK_CPUINFO_FIELD(field) (STACK_SIZE-CPUINFO_sizeof+CPUINFO_##field)
> -#define GET_STACK_BASE(reg)                       \
> -        movq $~(STACK_SIZE-1),reg;                \
> -        andq %rsp,reg
> +#define STACK_CPUINFO_FIELD(field) (1 - CPUINFO_sizeof + CPUINFO_##field)
> +#define GET_STACK_END(reg)                        \
> +        movl $STACK_SIZE-1, %e##reg;              \
> +        orq  %rsp, %r##reg
>  
>  #define GET_CPUINFO_FIELD(field, reg)             \
> -        GET_STACK_BASE(reg);                      \
> -        addq $STACK_CPUINFO_FIELD(field),reg
> +        GET_STACK_END(reg);                       \
> +        addq $STACK_CPUINFO_FIELD(field), %r##reg

Not subq? The GET_STACK_END gets us .. [edit: I missed the change to
STACK_CPUINFO_FIELD the first time].


Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
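
For readers puzzling over the same point: GET_STACK_END leaves the
address of the stack's last byte in the register, and
STACK_CPUINFO_FIELD() now expands to a small negative constant, so addq
is indeed correct. A sketch of the arithmetic (same assumed constants as
in the sketch above, not Xen's real values):

    #define STACK_SIZE     0x8000UL /* assumed */
    #define CPUINFO_sizeof 0xd0UL   /* assumed */

    unsigned long field_addr(unsigned long rsp, unsigned long field_offset)
    {
        unsigned long end = rsp | (STACK_SIZE - 1); /* last byte of this stack */
        /* 1 - CPUINFO_sizeof + field_offset is negative, so the add
         * steps back into the cpu_info at the top of the stack:
         *   end + 1 - CPUINFO_sizeof + offset
         *     == stack_base + STACK_SIZE - CPUINFO_sizeof + offset,
         * exactly the old base-relative address. */
        return end + 1 - CPUINFO_sizeof + field_offset;
    }
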
Jan Beulich March 29, 2016, 6:59 a.m. UTC | #2
>>> On 25.03.16 at 19:47, <konrad.wilk@oracle.com> wrote:
> On Thu, Mar 17, 2016 at 10:14:22AM -0600, Jan Beulich wrote:
> 
> Something is off with your patch. This is 5/4 :-)

Well, yes - this got added later on top of the previously sent series,
to make the dependency obvious.

>> Instead of addressing these fields via the base of the stack (which
>> uniformly requires 4-byte displacements), address them from the end
>> (which for everything other than guest_cpu_user_regs requires just
>> 1-byte ones). This yields a code size reduction somewhere between 8k
>> and 12k in my builds.
> 
> Also you made the macro a bit different - the %r is removed.
> 
> Particular reason? 

This is an integral part of the change, so the macro can derive
e.g. both %eax and %rax from the passed argument

Jan
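
The mechanism is plain token pasting: the caller passes a bare register
suffix such as "bx", and the macro forms both register widths from it,
as in this excerpt from the patch:

    #define GET_STACK_END(reg)                        \
            movl $STACK_SIZE-1, %e##reg;              \
            orq  %rsp, %r##reg

    /* GET_STACK_END(bx) expands to:
     *         movl $STACK_SIZE-1, %ebx
     *         orq  %rsp, %rbx
     * The 32-bit movl zero-extends into %rbx; the orq then merges in the
     * high bits from %rsp, yielding the address of the stack's last byte. */
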
Konrad Rzeszutek Wilk March 30, 2016, 2:28 p.m. UTC | #3
On Tue, Mar 29, 2016 at 12:59:24AM -0600, Jan Beulich wrote:
> >>> On 25.03.16 at 19:47, <konrad.wilk@oracle.com> wrote:
> > On Thu, Mar 17, 2016 at 10:14:22AM -0600, Jan Beulich wrote:
> > 
> > Something is off with your patch. This is 5/4 :-)
> 
> Well, yes - this got added later on top of the previously sent series,
> to make the dependency obvious.
> 
> >> Instead of addressing these fields via the base of the stack (which
> >> uniformly requires 4-byte displacements), address them from the end
> >> (which for everything other than guest_cpu_user_regs requires just
> >> 1-byte ones). This yields a code size reduction somewhere between 8k
> >> and 12k in my builds.
> > 
> > Also you made the macro a bit different - the %r is removed.
> > 
> > Particular reason? 
> 
> This is an integral part of the change, so the macro can derive
> e.g. both %eax and %rax from the passed argument.

Could you please include that explanation in the commit description.

And with that you can put Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

on the patch.
Jan Beulich March 30, 2016, 2:42 p.m. UTC | #4
>>> On 30.03.16 at 16:28, <konrad.wilk@oracle.com> wrote:
> On Tue, Mar 29, 2016 at 12:59:24AM -0600, Jan Beulich wrote:
>> >>> On 25.03.16 at 19:47, <konrad.wilk@oracle.com> wrote:
>> > On Thu, Mar 17, 2016 at 10:14:22AM -0600, Jan Beulich wrote:
>> > 
>> > Something is off with your patch. This is 5/4 :-)
>> 
>> Well, yes - this got added later on top of the previously sent series,
>> to make the dependency obvious.
>> 
>> >> Instead of addressing these fields via the base of the stack (which
>> >> uniformly requires 4-byte displacements), address them from the end
>> >> (which for everything other than guest_cpu_user_regs requires just
>> >> 1-byte ones). This yields a code size reduction somewhere between 8k
>> >> and 12k in my builds.
>> > 
>> > Also you made the macro a bit different - the %r is removed.
>> > 
>> > Particular reason? 
>> 
>> This is an integral part of the change, so the macro can derive
>> e.g. both %eax and %rax from the passed argument
> 
> Could you please include that explanation in the commit description.
> 
> And with that you can put Reviewed-by: Konrad Rzeszutek Wilk 
> <konrad.wilk@oracle.com>
> 
> on the patch.

Well, I've got one of these already, without having been asked to
do any adjustments. Did I misinterpret
<20160325184701.GE20741@char.us.oracle.com>?
As to adding something like this to the commit message: I have
a hard time seeing how this would belong there. The fact _that_
this is being done is very obvious from the patch itself, and once
the reader has spotted this, the reason _why_ it is needed
should become immediately obvious too. I'm all for explaining
technically difficult or hard-to-see things, but I'm against
needless bloat.

Jan
Andrew Cooper May 13, 2016, 4:11 p.m. UTC | #5
On 17/03/16 16:14, Jan Beulich wrote:
> Instead of addressing these fields via the base of the stack (which
> uniformly requires 4-byte displacements), address them from the end
> (which for everything other than guest_cpu_user_regs requires just
> 1-byte ones). This yields a code size reduction somewhere between 8k
> and 12k in my builds.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper@citrix.com>