Message ID | a12f7c71-553c-1e4d-d986-8a966f6e83dc@bell.net (mailing list archive)
---|---
State | Accepted, archived
Series | parisc: Use ldcw instruction for SMP spinlock release barrier
On 15.04.19 01:20, John David Anglin wrote:
> There are only a couple of instructions that can function as a memory
> barrier on parisc. Currently, we use the sync instruction as a memory
> barrier when releasing a spinlock. However, the ldcw instruction is a
> better barrier when we have a handy memory location, since it operates
> in the cache on coherent machines.
>
> This patch updates the spinlock release code to use ldcw.

Just out of curiosity: did you maybe do any timing with that patch?

Secondly:

> 	/* Release pa_tlb_lock lock without reloading lock address. */
> -	.macro	tlb_unlock0	spc,tmp
> +	.macro	tlb_unlock0	spc,tmp,tmp1

Above you add the tmp1 variable...

> #ifdef CONFIG_SMP
> 98:	or,COND(=)	%r0,\spc,%r0
> -	stw,ma	\spc,0(\tmp)
> +	LDCW	0(\tmp),\tmp1

Couldn't %r0 instead be used as the register target in all the ldcw()
accesses you added as barriers? Or would the processor "optimize" the
access away (which I doubt)?

Helge

> I also changed the "stw,ma" instructions to "stw" instructions, as
> "stw,ma" is not an adequate barrier.
>
> Signed-off-by: John David Anglin <dave.anglin@bell.net>
>
> diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
> index 8a63515f03bf..197d2247e4db 100644
> --- a/arch/parisc/include/asm/spinlock.h
> +++ b/arch/parisc/include/asm/spinlock.h
> @@ -37,7 +37,11 @@ static inline void arch_spin_unlock(arch_spinlock_t *x)
>  	volatile unsigned int *a;
>
>  	a = __ldcw_align(x);
> +#ifdef CONFIG_SMP
> +	(void) __ldcw(a);
> +#else
>  	mb();
> +#endif
>  	*a = 1;
>  }
>
> diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
> index d5eb19efa65b..a1fc04570ade 100644
> --- a/arch/parisc/kernel/entry.S
> +++ b/arch/parisc/kernel/entry.S
> @@ -471,8 +467,9 @@
>  	nop
>  	LDREG	0(\ptp),\pte
>  	bb,<,n	\pte,_PAGE_PRESENT_BIT,3f
> +	LDCW	0(\tmp),\tmp1
>  	b	\fault
> -	stw,ma	\spc,0(\tmp)
> +	stw	\spc,0(\tmp)
> 99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)
> #endif
> 2:	LDREG	0(\ptp),\pte
> @@ -481,20 +478,22 @@
>  	.endm
>
>  	/* Release pa_tlb_lock lock without reloading lock address. */
> -	.macro	tlb_unlock0	spc,tmp
> +	.macro	tlb_unlock0	spc,tmp,tmp1
> #ifdef CONFIG_SMP
> 98:	or,COND(=)	%r0,\spc,%r0
> -	stw,ma	\spc,0(\tmp)
> +	LDCW	0(\tmp),\tmp1
> +	or,COND(=)	%r0,\spc,%r0
> +	stw	\spc,0(\tmp)
> 99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)
> #endif
>  	.endm
>
>  	/* Release pa_tlb_lock lock. */
> -	.macro	tlb_unlock1	spc,tmp
> +	.macro	tlb_unlock1	spc,tmp,tmp1
> #ifdef CONFIG_SMP
> 98:	load_pa_tlb_lock \tmp
> 99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)
> -	tlb_unlock0	\spc,\tmp
> +	tlb_unlock0	\spc,\tmp,\tmp1
> #endif
>  	.endm
>
> @@ -1177,7 +1176,7 @@ dtlb_miss_20w:
>
>  	idtlbt	pte,prot
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1203,7 +1202,7 @@ nadtlb_miss_20w:
>
>  	idtlbt	pte,prot
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1237,7 +1236,7 @@ dtlb_miss_11:
>
>  	mtsp	t1, %sr1	/* Restore sr1 */
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1270,7 +1269,7 @@ nadtlb_miss_11:
>
>  	mtsp	t1, %sr1	/* Restore sr1 */
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1299,7 +1298,7 @@ dtlb_miss_20:
>
>  	idtlbt	pte,prot
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1327,7 +1326,7 @@ nadtlb_miss_20:
>
>  	idtlbt	pte,prot
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1434,7 +1433,7 @@ itlb_miss_20w:
>
>  	iitlbt	pte,prot
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1458,7 +1457,7 @@ naitlb_miss_20w:
>
>  	iitlbt	pte,prot
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1492,7 +1491,7 @@ itlb_miss_11:
>
>  	mtsp	t1, %sr1	/* Restore sr1 */
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1516,7 +1515,7 @@ naitlb_miss_11:
>
>  	mtsp	t1, %sr1	/* Restore sr1 */
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1546,7 +1545,7 @@ itlb_miss_20:
>
>  	iitlbt	pte,prot
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1566,7 +1565,7 @@ naitlb_miss_20:
>
>  	iitlbt	pte,prot
>
> -	tlb_unlock1	spc,t0
> +	tlb_unlock1	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1596,7 +1595,7 @@ dbit_trap_20w:
>
>  	idtlbt	pte,prot
>
> -	tlb_unlock0	spc,t0
> +	tlb_unlock0	spc,t0,t1
>  	rfir
>  	nop
> #else
> @@ -1622,7 +1621,7 @@ dbit_trap_11:
>
>  	mtsp	t1, %sr1	/* Restore sr1 */
>
> -	tlb_unlock0	spc,t0
> +	tlb_unlock0	spc,t0,t1
>  	rfir
>  	nop
>
> @@ -1642,7 +1641,7 @@ dbit_trap_20:
>
>  	idtlbt	pte,prot
>
> -	tlb_unlock0	spc,t0
> +	tlb_unlock0	spc,t0,t1
>  	rfir
>  	nop
> #endif
> diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
> index 4f77bd9be66b..e2b4c8d81275 100644
> --- a/arch/parisc/kernel/syscall.S
> +++ b/arch/parisc/kernel/syscall.S
> @@ -640,7 +640,9 @@ cas_action:
>  	sub,<>	%r28, %r25, %r0
> 2:	stw	%r24, 0(%r26)
>  	/* Free lock */
> -	sync
> +#ifdef CONFIG_SMP
> +	LDCW	0(%sr2,%r20), %r1	/* Barrier */
> +#endif
>  	stw	%r20, 0(%sr2,%r20)
> #if ENABLE_LWS_DEBUG
>  	/* Clear thread register indicator */
> @@ -655,7 +657,9 @@ cas_action:
> 3:
>  	/* Error occurred on load or store */
>  	/* Free lock */
> -	sync
> +#ifdef CONFIG_SMP
> +	LDCW	0(%sr2,%r20), %r1	/* Barrier */
> +#endif
>  	stw	%r20, 0(%sr2,%r20)
> #if ENABLE_LWS_DEBUG
>  	stw	%r0, 4(%sr2,%r20)
> @@ -857,7 +861,9 @@ cas2_action:
>
> cas2_end:
>  	/* Free lock */
> -	sync
> +#ifdef CONFIG_SMP
> +	LDCW	0(%sr2,%r20), %r1	/* Barrier */
> +#endif
>  	stw	%r20, 0(%sr2,%r20)
>  	/* Enable interrupts */
>  	ssm	PSW_SM_I, %r0
> @@ -868,7 +874,9 @@ cas2_end:
> 22:
>  	/* Error occurred on load or store */
>  	/* Free lock */
> -	sync
> +#ifdef CONFIG_SMP
> +	LDCW	0(%sr2,%r20), %r1	/* Barrier */
> +#endif
>  	stw	%r20, 0(%sr2,%r20)
>  	ssm	PSW_SM_I, %r0
>  	ldo	1(%r0),%r28
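For readers following along, the idea under discussion reduces to a small
amount of C: ldcw ("load and clear word") atomically returns the old value
of a word and zeroes it, and that ordered access can stand in for a full
sync when releasing a lock. Below is a minimal sketch, not the kernel's
actual macros: __ldcw_sketch() and unlock_sketch() are hypothetical names,
the real wrapper lives in the kernel's ldcw header and differs in detail,
and the lock word is assumed to have the 16-byte alignment that ldcw
requires on older processors.

/* Hypothetical wrapper for the ldcw instruction: atomically reads the
 * old value of *a and clears *a to zero. The access is strongly ordered
 * and, on coherent machines, completes in the cache. */
static inline unsigned int __ldcw_sketch(volatile unsigned int *a)
{
	unsigned int ret;
	asm volatile("ldcw 0(%1),%0" : "=r" (ret) : "r" (a) : "memory");
	return ret;
}

/* Release path: the ldcw result is discarded; only its ordering side
 * effect matters. The final store marks the lock free again (for
 * ldcw-style locks, nonzero means unlocked). */
static inline void unlock_sketch(volatile unsigned int *a)
{
	(void) __ldcw_sketch(a);
	*a = 1;
}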
On 2019-04-15 8:43 a.m., Helge Deller wrote:
> On 15.04.19 01:20, John David Anglin wrote:
>> There are only a couple of instructions that can function as a memory
>> barrier on parisc. Currently, we use the sync instruction as a memory
>> barrier when releasing a spinlock. However, the ldcw instruction is a
>> better barrier when we have a handy memory location, since it operates
>> in the cache on coherent machines.
>>
>> This patch updates the spinlock release code to use ldcw.
> Just out of curiosity: did you maybe do any timing with that patch?

No, but I think it made the 800 MHz rp3440 nearly as fast as the 1 GHz
c8000, based on buildd times.

> Secondly:
>
>> 	/* Release pa_tlb_lock lock without reloading lock address. */
>> -	.macro	tlb_unlock0	spc,tmp
>> +	.macro	tlb_unlock0	spc,tmp,tmp1
> Above you add the tmp1 variable...
>
>> #ifdef CONFIG_SMP
>> 98:	or,COND(=)	%r0,\spc,%r0
>> -	stw,ma	\spc,0(\tmp)
>> +	LDCW	0(\tmp),\tmp1
> Couldn't %r0 instead be used as the register target in all the ldcw()
> accesses you added as barriers? Or would the processor "optimize" the
> access away (which I doubt)?

No. See page 6-12 in the PA 2.0 architecture manual. I had wondered
about it when I wrote the code.

Dave
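The upshot of that answer: in PA 2.0, a load whose destination is %r0 acts
as a cache prefetch hint that the processor is allowed to drop, and a
dropped access is no barrier at all, so the ldcw needs a real scratch
register as its target, which is what the new tmp1 argument provides. The
resulting SMP unlock sequence from the patch, annotated here with one
reading of the conditional nullification (the comments are editorial, not
from the patch):

98:	or,COND(=)	%r0,\spc,%r0	/* nullify next insn when \spc == 0,
					   i.e. the lock was never taken */
	LDCW		0(\tmp),\tmp1	/* ordered in-cache access; \tmp1
					   exists only to receive the result */
	or,COND(=)	%r0,\spc,%r0	/* likewise nullify the release store */
	stw		\spc,0(\tmp)	/* store nonzero: lock is free again */
99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)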
On 15.04.19 15:44, John David Anglin wrote:
> On 2019-04-15 8:43 a.m., Helge Deller wrote:
>> On 15.04.19 01:20, John David Anglin wrote:
>>> There are only a couple of instructions that can function as a memory
>>> barrier on parisc. Currently, we use the sync instruction as a memory
>>> barrier when releasing a spinlock. However, the ldcw instruction is a
>>> better barrier when we have a handy memory location, since it operates
>>> in the cache on coherent machines.
>>>
>>> This patch updates the spinlock release code to use ldcw.
>> Just out of curiosity: did you maybe do any timing with that patch?
> No, but I think it made the 800 MHz rp3440 nearly as fast as the 1 GHz
> c8000, based on buildd times.
>>
>> Secondly:
>>
>>> 	/* Release pa_tlb_lock lock without reloading lock address. */
>>> -	.macro	tlb_unlock0	spc,tmp
>>> +	.macro	tlb_unlock0	spc,tmp,tmp1
>> Above you add the tmp1 variable...
>>
>>> #ifdef CONFIG_SMP
>>> 98:	or,COND(=)	%r0,\spc,%r0
>>> -	stw,ma	\spc,0(\tmp)
>>> +	LDCW	0(\tmp),\tmp1
>> Couldn't %r0 instead be used as the register target in all the ldcw()
>> accesses you added as barriers? Or would the processor "optimize" the
>> access away (which I doubt)?
> No. See page 6-12 in the PA 2.0 architecture manual. I had wondered
> about it when I wrote the code.

Ah, OK! I've applied it to the for-next tree.
Thanks!

Helge
diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
index 8a63515f03bf..197d2247e4db 100644
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -37,7 +37,11 @@ static inline void arch_spin_unlock(arch_spinlock_t *x)
 	volatile unsigned int *a;

 	a = __ldcw_align(x);
+#ifdef CONFIG_SMP
+	(void) __ldcw(a);
+#else
 	mb();
+#endif
 	*a = 1;
 }

diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index d5eb19efa65b..a1fc04570ade 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -471,8 +467,9 @@
 	nop
 	LDREG	0(\ptp),\pte
 	bb,<,n	\pte,_PAGE_PRESENT_BIT,3f
+	LDCW	0(\tmp),\tmp1
 	b	\fault
-	stw,ma	\spc,0(\tmp)
+	stw	\spc,0(\tmp)
 99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)
 #endif
 2:	LDREG	0(\ptp),\pte
@@ -481,20 +478,22 @@
 	.endm

 	/* Release pa_tlb_lock lock without reloading lock address. */
-	.macro	tlb_unlock0	spc,tmp
+	.macro	tlb_unlock0	spc,tmp,tmp1
 #ifdef CONFIG_SMP
 98:	or,COND(=)	%r0,\spc,%r0
-	stw,ma	\spc,0(\tmp)
+	LDCW	0(\tmp),\tmp1
+	or,COND(=)	%r0,\spc,%r0
+	stw	\spc,0(\tmp)
 99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)
 #endif
 	.endm

 	/* Release pa_tlb_lock lock. */
-	.macro	tlb_unlock1	spc,tmp
+	.macro	tlb_unlock1	spc,tmp,tmp1
 #ifdef CONFIG_SMP
 98:	load_pa_tlb_lock \tmp
 99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)
-	tlb_unlock0	\spc,\tmp
+	tlb_unlock0	\spc,\tmp,\tmp1
 #endif
 	.endm

@@ -1177,7 +1176,7 @@ dtlb_miss_20w:

 	idtlbt	pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1203,7 +1202,7 @@ nadtlb_miss_20w:

 	idtlbt	pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1237,7 +1236,7 @@ dtlb_miss_11:

 	mtsp	t1, %sr1	/* Restore sr1 */

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1270,7 +1269,7 @@ nadtlb_miss_11:

 	mtsp	t1, %sr1	/* Restore sr1 */

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1299,7 +1298,7 @@ dtlb_miss_20:

 	idtlbt	pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1327,7 +1326,7 @@ nadtlb_miss_20:

 	idtlbt	pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1434,7 +1433,7 @@ itlb_miss_20w:

 	iitlbt	pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1458,7 +1457,7 @@ naitlb_miss_20w:

 	iitlbt	pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1492,7 +1491,7 @@ itlb_miss_11:

 	mtsp	t1, %sr1	/* Restore sr1 */

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1516,7 +1515,7 @@ naitlb_miss_11:

 	mtsp	t1, %sr1	/* Restore sr1 */

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1546,7 +1545,7 @@ itlb_miss_20:

 	iitlbt	pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1566,7 +1565,7 @@ naitlb_miss_20:

 	iitlbt	pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1596,7 +1595,7 @@ dbit_trap_20w:

 	idtlbt	pte,prot

-	tlb_unlock0	spc,t0
+	tlb_unlock0	spc,t0,t1
 	rfir
 	nop
 #else
@@ -1622,7 +1621,7 @@ dbit_trap_11:

 	mtsp	t1, %sr1	/* Restore sr1 */

-	tlb_unlock0	spc,t0
+	tlb_unlock0	spc,t0,t1
 	rfir
 	nop

@@ -1642,7 +1641,7 @@ dbit_trap_20:

 	idtlbt	pte,prot

-	tlb_unlock0	spc,t0
+	tlb_unlock0	spc,t0,t1
 	rfir
 	nop
 #endif
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 4f77bd9be66b..e2b4c8d81275 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -640,7 +640,9 @@ cas_action:
 	sub,<>	%r28, %r25, %r0
 2:	stw	%r24, 0(%r26)
 	/* Free lock */
-	sync
+#ifdef CONFIG_SMP
+	LDCW	0(%sr2,%r20), %r1	/* Barrier */
+#endif
 	stw	%r20, 0(%sr2,%r20)
 #if ENABLE_LWS_DEBUG
 	/* Clear thread register indicator */
@@ -655,7 +657,9 @@ cas_action:
 3:
 	/* Error occurred on load or store */
 	/* Free lock */
-	sync
+#ifdef CONFIG_SMP
+	LDCW	0(%sr2,%r20), %r1	/* Barrier */
+#endif
 	stw	%r20, 0(%sr2,%r20)
 #if ENABLE_LWS_DEBUG
 	stw	%r0, 4(%sr2,%r20)
@@ -857,7 +861,9 @@ cas2_action:

 cas2_end:
 	/* Free lock */
-	sync
+#ifdef CONFIG_SMP
+	LDCW	0(%sr2,%r20), %r1	/* Barrier */
+#endif
 	stw	%r20, 0(%sr2,%r20)
 	/* Enable interrupts */
 	ssm	PSW_SM_I, %r0
@@ -868,7 +874,9 @@ cas2_end:
 22:
 	/* Error occurred on load or store */
 	/* Free lock */
-	sync
+#ifdef CONFIG_SMP
+	LDCW	0(%sr2,%r20), %r1	/* Barrier */
+#endif
 	stw	%r20, 0(%sr2,%r20)
 	ssm	PSW_SM_I, %r0
 	ldo	1(%r0),%r28
There are only a couple of instructions that can function as a memory
barrier on parisc. Currently, we use the sync instruction as a memory
barrier when releasing a spinlock. However, the ldcw instruction is a
better barrier when we have a handy memory location, since it operates
in the cache on coherent machines.

This patch updates the spinlock release code to use ldcw. I also changed
the "stw,ma" instructions to plain "stw" instructions, since "stw,ma" is
not an adequate barrier.

Signed-off-by: John David Anglin <dave.anglin@bell.net>
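At the C level the change is small; the arch_spin_unlock() hunk above
boils down to the following (the code is from the patch, the comments are
added here for illustration):

static inline void arch_spin_unlock(arch_spinlock_t *x)
{
	volatile unsigned int *a;

	a = __ldcw_align(x);	/* point at the aligned lock word */
#ifdef CONFIG_SMP
	(void) __ldcw(a);	/* in-cache barrier; value discarded */
#else
	mb();			/* UP build keeps the generic barrier */
#endif
	*a = 1;			/* nonzero == unlocked */
}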