Message ID | 1460723596-13261-3-git-send-email-daniel.kiper@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Apr 15, 2016 at 02:33:02PM +0200, Daniel Kiper wrote:
> Speedup BSS initialization by using stosl instead of stosb.
>
> Some may argue that Intel Ivy Bridge and later provide ERMSB feature.
> This means that "rep stosb" gives better throughput than "rep stosl" on
> above mentioned CPUs. However, this feature is only available on newer
> Intel processors and e.g. AMD does not provide it at all. So, stosb will
> just give real benefits and even beat stosl only on limited number of
> machines. On the other hand stosl will speedup BSS initialization on
> all x86 platforms. Hence, use stosl instead of stosb.
>
> Additionally, align relevant comment to coding style.
>
> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>

Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

> ---
> v3 - suggestions/fixes:
>    - improve comments
>      (suggested by Konrad Rzeszutek Wilk),
>    - improve commit message
>      (suggested by Jan Beulich).
> ---
>  xen/arch/x86/boot/head.S | 5 +++--
>  xen/arch/x86/xen.lds.S   | 3 +++
>  2 files changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
> index f3501fd..32a54a0 100644
> --- a/xen/arch/x86/boot/head.S
> +++ b/xen/arch/x86/boot/head.S
> @@ -123,12 +123,13 @@ __start:
>          call    reloc
>          mov     %eax,sym_phys(multiboot_ptr)
>
> -        /* Initialize BSS (no nasty surprises!) */
> +        /* Initialize BSS (no nasty surprises!). */
>          mov     $sym_phys(__bss_start),%edi
>          mov     $sym_phys(__bss_end),%ecx
>          sub     %edi,%ecx
> +        shr     $2,%ecx
>          xor     %eax,%eax
> -        rep     stosb
> +        rep     stosl
>
>          /* Interrogate CPU extended features via CPUID. */
>          mov     $0x80000000,%eax
> diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
> index 961f48f..6802da1 100644
> --- a/xen/arch/x86/xen.lds.S
> +++ b/xen/arch/x86/xen.lds.S
> @@ -191,6 +191,8 @@ SECTIONS
>         CONSTRUCTORS
>    } :text
>
> +  /* Align BSS to speedup its initialization. */
> +  . = ALIGN(4);
>    .bss : { /* BSS */
>         . = ALIGN(STACK_SIZE);
>         __bss_start = .;
> @@ -205,6 +207,7 @@ SECTIONS
>         *(.bss.percpu.read_mostly)
>         . = ALIGN(SMP_CACHE_BYTES);
>         __per_cpu_data_end = .;
> +       . = ALIGN(4);
>         __bss_end = .;
>    } :text
>    _end = . ;
> --
> 1.7.10.4
>
On 15/04/16 13:33, Daniel Kiper wrote:
> Speedup BSS initialization by using stosl instead of stosb.
>
> Some may argue that Intel Ivy Bridge and later provide ERMSB feature.
> This means that "rep stosb" gives better throughput than "rep stosl" on
> above mentioned CPUs. However, this feature is only available on newer
> Intel processors and e.g. AMD does not provide it at all. So, stosb will
> just give real benefits and even beat stosl only on limited number of
> machines. On the other hand stosl will speedup BSS initialization on
> all x86 platforms. Hence, use stosl instead of stosb.
>
> Additionally, align relevant comment to coding style.
>
> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
> ---
> v3 - suggestions/fixes:
>    - improve comments
>      (suggested by Konrad Rzeszutek Wilk),
>    - improve commit message
>      (suggested by Jan Beulich).
> ---
>  xen/arch/x86/boot/head.S | 5 +++--
>  xen/arch/x86/xen.lds.S   | 3 +++
>  2 files changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
> index f3501fd..32a54a0 100644
> --- a/xen/arch/x86/boot/head.S
> +++ b/xen/arch/x86/boot/head.S
> @@ -123,12 +123,13 @@ __start:
>          call    reloc
>          mov     %eax,sym_phys(multiboot_ptr)
>
> -        /* Initialize BSS (no nasty surprises!) */
> +        /* Initialize BSS (no nasty surprises!). */
>          mov     $sym_phys(__bss_start),%edi
>          mov     $sym_phys(__bss_end),%ecx
>          sub     %edi,%ecx
> +        shr     $2,%ecx
>          xor     %eax,%eax
> -        rep     stosb
> +        rep     stosl
>
>          /* Interrogate CPU extended features via CPUID. */
>          mov     $0x80000000,%eax
> diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
> index 961f48f..6802da1 100644
> --- a/xen/arch/x86/xen.lds.S
> +++ b/xen/arch/x86/xen.lds.S
> @@ -191,6 +191,8 @@ SECTIONS
>         CONSTRUCTORS
>    } :text
>
> +  /* Align BSS to speedup its initialization. */
> +  . = ALIGN(4);

This is not needed. There is already appropriate alignment before __bss_start.

Also, you need to rebase this series onto staging - there are a lot of changes you are missing.

~Andrew

>    .bss : { /* BSS */
>         . = ALIGN(STACK_SIZE);
>         __bss_start = .;
> @@ -205,6 +207,7 @@ SECTIONS
>         *(.bss.percpu.read_mostly)
>         . = ALIGN(SMP_CACHE_BYTES);
>         __per_cpu_data_end = .;
> +       . = ALIGN(4);
>         __bss_end = .;
>    } :text
>    _end = . ;
diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
index f3501fd..32a54a0 100644
--- a/xen/arch/x86/boot/head.S
+++ b/xen/arch/x86/boot/head.S
@@ -123,12 +123,13 @@ __start:
         call    reloc
         mov     %eax,sym_phys(multiboot_ptr)
 
-        /* Initialize BSS (no nasty surprises!) */
+        /* Initialize BSS (no nasty surprises!). */
         mov     $sym_phys(__bss_start),%edi
         mov     $sym_phys(__bss_end),%ecx
         sub     %edi,%ecx
+        shr     $2,%ecx
         xor     %eax,%eax
-        rep     stosb
+        rep     stosl
 
         /* Interrogate CPU extended features via CPUID. */
         mov     $0x80000000,%eax
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index 961f48f..6802da1 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -191,6 +191,8 @@ SECTIONS
        CONSTRUCTORS
   } :text
 
+  /* Align BSS to speedup its initialization. */
+  . = ALIGN(4);
   .bss : { /* BSS */
        . = ALIGN(STACK_SIZE);
        __bss_start = .;
@@ -205,6 +207,7 @@ SECTIONS
        *(.bss.percpu.read_mostly)
        . = ALIGN(SMP_CACHE_BYTES);
        __per_cpu_data_end = .;
+       . = ALIGN(4);
        __bss_end = .;
   } :text
   _end = . ;
Speed up BSS initialization by using stosl instead of stosb.

Some may argue that Intel Ivy Bridge and later provide the ERMSB feature.
This means that "rep stosb" gives better throughput than "rep stosl" on
the above-mentioned CPUs. However, this feature is only available on newer
Intel processors and, e.g., AMD does not provide it at all. So, stosb will
give real benefits, and even beat stosl, only on a limited number of
machines. On the other hand, stosl will speed up BSS initialization on
all x86 platforms. Hence, use stosl instead of stosb.

Additionally, align the relevant comment with the coding style.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
v3 - suggestions/fixes:
   - improve comments
     (suggested by Konrad Rzeszutek Wilk),
   - improve commit message
     (suggested by Jan Beulich).
---
 xen/arch/x86/boot/head.S | 5 +++--
 xen/arch/x86/xen.lds.S   | 3 +++
 2 files changed, 6 insertions(+), 2 deletions(-)