Message ID | 51E42E11.1010903@dawncrow.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi André, On 15/07/13 18:14, André Hentschel wrote: > From: André Hentschel <nerv@dawncrow.de> > > This patch intents to reduce loading instructions when the resulting value is not used. > It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760 > Have you done any benchmarking to see that this has any real impact? Or tested on a !Vv6k system? It looks possible that the only case where this will perform better is where we're using switch_tls_none or switch_tls_software (both rare cases, I think) and there's some change it will make things worse in other cases? One of the reasons for Russell's suggestion of placing the ldrd (which became the two ldr instructions you've removed from __switch_to, in order to maintain building for older cores) where it is was in order to reduce the chance of pipeline stalls. As I've pointed out below, there is some risk that changing that has implications for the v6 only case below (and the v6k case is now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer cores should have more advanced scheduling to avoid such issues anyway...) > Signed-off-by: André Hentschel <nerv@dawncrow.de> > > --- > This patch is against Linux 3.11-rc1 (ad81f0545ef01ea651886dddac4bef6cec930092) > > Thanks to everyone who helped me with a4780adeefd042482f624f5e0d577bf9cdcbb760 > > diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h > index 83259b8..3742722 100644 > --- a/arch/arm/include/asm/tls.h > +++ b/arch/arm/include/asm/tls.h > @@ -3,29 +3,32 @@ > > #ifdef __ASSEMBLY__ > #include <asm/asm-offsets.h> > - .macro switch_tls_none, base, tp, tpuser, tmp1, tmp2 > + .macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2 > .endm > > - .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2 > + .macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2 > + ldrd \tp, \tpuser, [\next, #TI_TP_VALUE] @ get the next TLS and user r/w register > mrc p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register > mcr p15, 0, \tp, c13, c0, 3 @ set TLS register > mcr p15, 0, \tpuser, c13, c0, 2 @ and the user r/w register > - str \tmp2, [\base, #TI_TP_VALUE + 4] @ save it > + str \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it > .endm > > - .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2 > + .macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2 > ldr \tmp1, =elf_hwcap > ldr \tmp1, [\tmp1, #0] > mov \tmp2, #0xffff0fff > + ldr \tp, [\next, #TI_TP_VALUE] @ get the next TLS register > tst \tmp1, #HWCAP_TLS @ hardware TLS available? > streq \tp, [\tmp2, #-15] @ set TLS value at 0xffff0ff0 > - mrcne p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register > + mrcne p15, 0, \tmp2, c13, c0, 2 @ get the previous user r/w register > + ldrne \tpuser, [\next, #TI_TP_VALUE + 4] @ get the next user r/w register > mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register > mcrne p15, 0, \tpuser, c13, c0, 2 @ set user r/w register Now we've only got one instruction between the store and the load and risk stalling the pipeline... Dave M cautiously says "The ancient advice was that one instruction was enough" but this is very core dependent... I wonder if anyone has a good idea about whether this is an issue here...? > - strne \tmp2, [\base, #TI_TP_VALUE + 4] @ save it > + strne \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it > .endm > > - .macro switch_tls_software, base, tp, tpuser, tmp1, tmp2 > + .macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2 > mov \tmp1, #0xffff0fff > str \tp, [\tmp1, #-15] @ set TLS value at 0xffff0ff0 > .endm > diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S > index a39cfc2a1..1484b59 100644 > --- a/arch/arm/kernel/entry-armv.S > +++ b/arch/arm/kernel/entry-armv.S > @@ -689,12 +689,10 @@ ENTRY(__switch_to) > THUMB( stmia ip!, {r4 - sl, fp} ) @ Store most regs on stack > THUMB( str sp, [ip], #4 ) > THUMB( str lr, [ip], #4 ) > - ldr r4, [r2, #TI_TP_VALUE] > - ldr r5, [r2, #TI_TP_VALUE + 4] > #ifdef CONFIG_CPU_USE_DOMAINS > ldr r6, [r2, #TI_CPU_DOMAIN] > #endif > - switch_tls r1, r4, r5, r3, r7 > + switch_tls r1, r2, r4, r5, r3, r7 > #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP) > ldr r7, [r2, #TI_TASK] > ldr r8, =__stack_chk_guard > Jonny
Hi Jonathan, First, thank you for your review. Am 16.07.2013 19:31, schrieb Jonathan Austin: > Hi André, > > On 15/07/13 18:14, André Hentschel wrote: >> From: André Hentschel <nerv@dawncrow.de> >> >> This patch intents to reduce loading instructions when the resulting value is not used. >> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760 >> > > Have you done any benchmarking to see that this has any real impact? Or tested on a !Vv6k system? It looks possible that the only case where this will perform better is where we're using switch_tls_none or switch_tls_software (both rare cases, I think) and there's some change it will make things worse in other cases? I have to admit that i only tested it on v6k and did no benchmark. > One of the reasons for Russell's suggestion of placing the ldrd (which became the two ldr instructions you've removed from __switch_to, in order to maintain building for older cores) where it is was in order to reduce the chance of pipeline stalls. > > As I've pointed out below, there is some risk that changing that has implications for the v6 only case below (and the v6k case is now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer cores should have more advanced scheduling to avoid such issues anyway...) I'm not sure how this could make things worse on v6k, could you elaborate please? Besides of the ldr and str being too close to each other i thought this patch is a good idea, because it removes two ldr which are always executed. (Continuing below...) >> Signed-off-by: André Hentschel <nerv@dawncrow.de> >> >> --- >> This patch is against Linux 3.11-rc1 (ad81f0545ef01ea651886dddac4bef6cec930092) >> >> Thanks to everyone who helped me with a4780adeefd042482f624f5e0d577bf9cdcbb760 >> >> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h >> index 83259b8..3742722 100644 >> --- a/arch/arm/include/asm/tls.h >> +++ b/arch/arm/include/asm/tls.h >> @@ -3,29 +3,32 @@ >> >> #ifdef __ASSEMBLY__ >> #include <asm/asm-offsets.h> >> - .macro switch_tls_none, base, tp, tpuser, tmp1, tmp2 >> + .macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2 >> .endm >> >> - .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2 >> + .macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2 >> + ldrd \tp, \tpuser, [\next, #TI_TP_VALUE] @ get the next TLS and user r/w register >> mrc p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register >> mcr p15, 0, \tp, c13, c0, 3 @ set TLS register >> mcr p15, 0, \tpuser, c13, c0, 2 @ and the user r/w register >> - str \tmp2, [\base, #TI_TP_VALUE + 4] @ save it >> + str \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it >> .endm >> >> - .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2 >> + .macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2 >> ldr \tmp1, =elf_hwcap >> ldr \tmp1, [\tmp1, #0] >> mov \tmp2, #0xffff0fff >> + ldr \tp, [\next, #TI_TP_VALUE] @ get the next TLS register >> tst \tmp1, #HWCAP_TLS @ hardware TLS available? >> streq \tp, [\tmp2, #-15] @ set TLS value at 0xffff0ff0 >> - mrcne p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register >> + mrcne p15, 0, \tmp2, c13, c0, 2 @ get the previous user r/w register >> + ldrne \tpuser, [\next, #TI_TP_VALUE + 4] @ get the next user r/w register >> mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register >> mcrne p15, 0, \tpuser, c13, c0, 2 @ set user r/w register > > Now we've only got one instruction between the store and the load and risk stalling the pipeline... > > Dave M cautiously says "The ancient advice was that one instruction was enough" but this is very core dependent... I wonder if anyone has a good idea about whether this is an issue here...? We could use a ldrd at the top, that'd be nearly what we have right now, don't we?
Hi André, On 16/07/13 20:27, André Hentschel wrote: > Hi Jonathan, First, thank you for your review. > > Am 16.07.2013 19:31, schrieb Jonathan Austin: >> Hi André, >> >> On 15/07/13 18:14, André Hentschel wrote: >>> From: André Hentschel <nerv@dawncrow.de> >>> >>> This patch intents to reduce loading instructions when the >>> resulting value is not used. It's a follow up on >>> a4780adeefd042482f624f5e0d577bf9cdcbb760 >>> >> >> Have you done any benchmarking to see that this has any real >> impact? Or tested on a !Vv6k system? It looks possible that the >> only case where this will perform better is where we're using >> switch_tls_none or switch_tls_software (both rare cases, I think) >> and there's some change it will make things worse in other cases? > > I have to admit that i only tested it on v6k and did no benchmark. > Do you have access to anything v6-NOT-k-ish? If not I can try and test this on something appropriate. How does your test-case access tpidrurw? If it uses inline asm then it won't work on v6-not-k, as those instructions aren't defined... >> One of the reasons for Russell's suggestion of placing the ldrd >> (which became the two ldr instructions you've removed from >> __switch_to, in order to maintain building for older cores) where >> it is was in order to reduce the chance of pipeline stalls. >> >> As I've pointed out below, there is some risk that changing that >> has implications for the v6 only case below (and the v6k case is >> now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer >> cores should have more advanced scheduling to avoid such issues >> anyway...) > > I'm not sure how this could make things worse on v6k, could you > elaborate please? Besides of the ldr and str being too close to each > other Yea, that's the only issue, and in the !CONFIG_CPU_USE_DOMAINS case things are slightly worse than they were before > i thought this patch is a good idea, because it removes two ldr > which are always executed. (Continuing below...) Indeed, as long as it doesn't cause pipeline stalls then that's a gain for some cases :) [...] >> Now we've only got one instruction between the store and the load >> and risk stalling the pipeline... >> >> Dave M cautiously says "The ancient advice was that one instruction >> was enough" but this is very core dependent... I wonder if anyone >> has a good idea about whether this is an issue here...? > > We could use a ldrd at the top, that'd be nearly what we have right > now, don't we? Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;) Jonny > > >
Am 17.07.2013 13:10, schrieb Jonathan Austin: > Hi André, > > On 16/07/13 20:27, André Hentschel wrote: >> Hi Jonathan, First, thank you for your review. >> >> Am 16.07.2013 19:31, schrieb Jonathan Austin: >>> Hi André, >>> >>> On 15/07/13 18:14, André Hentschel wrote: >>>> From: André Hentschel <nerv@dawncrow.de> >>>> >>>> This patch intents to reduce loading instructions when the >>>> resulting value is not used. It's a follow up on >>>> a4780adeefd042482f624f5e0d577bf9cdcbb760 >>>> >>> >>> Have you done any benchmarking to see that this has any real >>> impact? Or tested on a !Vv6k system? It looks possible that the >>> only case where this will perform better is where we're using >>> switch_tls_none or switch_tls_software (both rare cases, I think) >>> and there's some change it will make things worse in other cases? >> >> I have to admit that i only tested it on v6k and did no benchmark. >> > Do you have access to anything v6-NOT-k-ish? If not I can try and test this on something appropriate. How does your test-case access tpidrurw? If it uses inline asm then it won't work on v6-not-k, as those instructions aren't defined... I don't, so it'd be nice if you could do that. I could imagine you have a good choice of devices at ARM :) In my crappy test application i do it similar to Wine: https://github.com/AndreRH/tpidrurw-test/blob/master/main.c#L29 but Wine code won't work out of the box on v6: http://source.winehq.org/git/wine.git/blob/HEAD:/dlls/ntdll/signal_arm.c#l851 >>> One of the reasons for Russell's suggestion of placing the ldrd >>> (which became the two ldr instructions you've removed from >>> __switch_to, in order to maintain building for older cores) where >>> it is was in order to reduce the chance of pipeline stalls. >>> >>> As I've pointed out below, there is some risk that changing that >>> has implications for the v6 only case below (and the v6k case is >>> now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer >>> cores should have more advanced scheduling to avoid such issues >>> anyway...) >> >> I'm not sure how this could make things worse on v6k, could you >> elaborate please? Besides of the ldr and str being too close to each >> other > > Yea, that's the only issue, and in the !CONFIG_CPU_USE_DOMAINS case things are slightly worse than they were before > >> i thought this patch is a good idea, because it removes two ldr >> which are always executed. (Continuing below...) > > Indeed, as long as it doesn't cause pipeline stalls then that's a gain for some cases :) > > [...] >>> Now we've only got one instruction between the store and the load >>> and risk stalling the pipeline... >>> >>> Dave M cautiously says "The ancient advice was that one instruction >>> was enough" but this is very core dependent... I wonder if anyone >>> has a good idea about whether this is an issue here...? >> >> We could use a ldrd at the top, that'd be nearly what we have right >> now, don't we? > > Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)
diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h index 83259b8..3742722 100644 --- a/arch/arm/include/asm/tls.h +++ b/arch/arm/include/asm/tls.h @@ -3,29 +3,32 @@ #ifdef __ASSEMBLY__ #include <asm/asm-offsets.h> - .macro switch_tls_none, base, tp, tpuser, tmp1, tmp2 + .macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2 .endm - .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2 + .macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2 + ldrd \tp, \tpuser, [\next, #TI_TP_VALUE] @ get the next TLS and user r/w register mrc p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register mcr p15, 0, \tp, c13, c0, 3 @ set TLS register mcr p15, 0, \tpuser, c13, c0, 2 @ and the user r/w register - str \tmp2, [\base, #TI_TP_VALUE + 4] @ save it + str \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it .endm - .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2 + .macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2 ldr \tmp1, =elf_hwcap ldr \tmp1, [\tmp1, #0] mov \tmp2, #0xffff0fff + ldr \tp, [\next, #TI_TP_VALUE] @ get the next TLS register tst \tmp1, #HWCAP_TLS @ hardware TLS available? streq \tp, [\tmp2, #-15] @ set TLS value at 0xffff0ff0 - mrcne p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register + mrcne p15, 0, \tmp2, c13, c0, 2 @ get the previous user r/w register + ldrne \tpuser, [\next, #TI_TP_VALUE + 4] @ get the next user r/w register mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register mcrne p15, 0, \tpuser, c13, c0, 2 @ set user r/w register - strne \tmp2, [\base, #TI_TP_VALUE + 4] @ save it + strne \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it .endm - .macro switch_tls_software, base, tp, tpuser, tmp1, tmp2 + .macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2 mov \tmp1, #0xffff0fff str \tp, [\tmp1, #-15] @ set TLS value at 0xffff0ff0 .endm diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index a39cfc2a1..1484b59 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -689,12 +689,10 @@ ENTRY(__switch_to) THUMB( stmia ip!, {r4 - sl, fp} ) @ Store most regs on stack THUMB( str sp, [ip], #4 ) THUMB( str lr, [ip], #4 ) - ldr r4, [r2, #TI_TP_VALUE] - ldr r5, [r2, #TI_TP_VALUE + 4] #ifdef CONFIG_CPU_USE_DOMAINS ldr r6, [r2, #TI_CPU_DOMAIN] #endif - switch_tls r1, r4, r5, r3, r7 + switch_tls r1, r2, r4, r5, r3, r7 #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP) ldr r7, [r2, #TI_TASK] ldr r8, =__stack_chk_guard