parisc: Use ldcw instruction for SMP spinlock release barrier

Message ID a12f7c71-553c-1e4d-d986-8a966f6e83dc@bell.net (mailing list archive)
State Accepted, archived
Series: parisc: Use ldcw instruction for SMP spinlock release barrier

Commit Message

John David Anglin April 14, 2019, 11:20 p.m. UTC
There are only a couple of instructions that can function as a memory barrier on
parisc.  Currently, we use the sync instruction as a memory barrier when releasing
a spinlock.  However, the ldcw instruction is a better barrier when we have a
handy memory location since it operates in the cache on coherent machines.

This patch updates the spinlock release code to use ldcw.  I also changed the "stw,ma"
instructions to plain "stw" instructions, since "stw,ma" is not an adequate barrier.

Signed-off-by: John David Anglin <dave.anglin@bell.net>

Comments

Helge Deller April 15, 2019, 12:43 p.m. UTC | #1
On 15.04.19 01:20, John David Anglin wrote:
> There are only a couple of instructions that can function as a memory barrier on
> parisc.  Currently, we use the sync instruction as a memory barrier when releasing
> a spinlock.  However, the ldcw instruction is a better barrier when we have a
> handy memory location since it operates in the cache on coherent machines.
>
> This patch updates the spinlock release code to use ldcw.

Just out of curiosity: Did you maybe do any timing with that patch?

Secondly:

>  	/* Release pa_tlb_lock lock without reloading lock address. */
> -	.macro		tlb_unlock0	spc,tmp
> +	.macro		tlb_unlock0	spc,tmp,tmp1

Above you add the tmp1 variable...

>  #ifdef CONFIG_SMP
>  98:	or,COND(=)	%r0,\spc,%r0
> -	stw,ma		\spc,0(\tmp)
> +	LDCW		0(\tmp),\tmp1

couldn't %r0 be used instead as the register target in all the ldcw()
accesses you added as barriers?
Or would the processor "optimize" the access away (which I doubt)?

Helge


John David Anglin April 15, 2019, 1:44 p.m. UTC | #2
On 2019-04-15 8:43 a.m., Helge Deller wrote:
> On 15.04.19 01:20, John David Anglin wrote:
>> There are only a couple of instructions that can function as a memory barrier on
>> parisc.  Currently, we use the sync instruction as a memory barrier when releasing
>> a spinlock.  However, the ldcw instruction is a better barrier when we have a
>> handy memory location since it operates in the cache on coherent machines.
>>
>> This patch updates the spinlock release code to use ldcw.
> Just out of curiosity: Did you maybe do any timing with that patch?
No, but based on buildd times, I think it made the 800 MHz rp3440 nearly as fast as the 1 GHz c8000.
>
> Secondly:
>
>>  	/* Release pa_tlb_lock lock without reloading lock address. */
>> -	.macro		tlb_unlock0	spc,tmp
>> +	.macro		tlb_unlock0	spc,tmp,tmp1
> Above you add the tmp1 variable...
>
>>  #ifdef CONFIG_SMP
>>  98:	or,COND(=)	%r0,\spc,%r0
>> -	stw,ma		\spc,0(\tmp)
>> +	LDCW		0(\tmp),\tmp1
> couldn't %r0 be used instead as the register target in all the ldcw()
> accesses you added as barriers?
> Or would the processor "optimize" the access away (which I doubt)?
No.  See page 6-12 in the PA 2.0 architecture manual.  I had wondered about it when I wrote the code.
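
To spell out the distinction (a sketch, assuming the PA 2.0 rule that a
load whose target is %r0 is executed as a prefetch hint rather than a
guaranteed access):

	LDCW	0(\tmp),\tmp1	/* real ldcw: atomically reads and clears
				   the word, ordering prior accesses */
	LDCW	0(\tmp),%r0	/* %r0 target: only a prefetch hint, so the
				   access may never happen - no barrier */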

Dave
Helge Deller April 15, 2019, 3:54 p.m. UTC | #3
On 15.04.19 15:44, John David Anglin wrote:
> On 2019-04-15 8:43 a.m., Helge Deller wrote:
>> On 15.04.19 01:20, John David Anglin wrote:
>>> There are only a couple of instructions that can function as a memory barrier on
>>> parisc.  Currently, we use the sync instruction as a memory barrier when releasing
>>> a spinlock.  However, the ldcw instruction is a better barrier when we have a
>>> handy memory location since it operates in the cache on coherent machines.
>>>
>>> This patch updates the spinlock release code to use ldcw.
>> Just out of curiosity: Did you maybe do any timing with that patch?
> No, but based on buildd times, I think it made the 800 MHz rp3440 nearly as fast as the 1 GHz c8000.
>>
>> Secondly:
>>
>>>  	/* Release pa_tlb_lock lock without reloading lock address. */
>>> -	.macro		tlb_unlock0	spc,tmp
>>> +	.macro		tlb_unlock0	spc,tmp,tmp1
>> Above you add the tmp1 variable...
>>
>>>  #ifdef CONFIG_SMP
>>>  98:	or,COND(=)	%r0,\spc,%r0
>>> -	stw,ma		\spc,0(\tmp)
>>> +	LDCW		0(\tmp),\tmp1
>> couldn't %r0 be used instead as the register target in all the ldcw()
>> accesses you added as barriers?
>> Or would the processor "optimize" the access away (which I doubt)?

> No.  See page 6-12 in the PA 2.0 architecture manual.  I had wondered
> about it when I wrote the code.
Ah, Ok!

I've applied it to the for-next tree.
Thanks!

Helge

Patch

diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
index 8a63515f03bf..197d2247e4db 100644
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -37,7 +37,11 @@  static inline void arch_spin_unlock(arch_spinlock_t *x)
 	volatile unsigned int *a;

 	a = __ldcw_align(x);
+#ifdef CONFIG_SMP
+	(void) __ldcw(a);
+#else
 	mb();
+#endif
 	*a = 1;
 }

diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index d5eb19efa65b..a1fc04570ade 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -471,8 +467,9 @@ 
 	nop
 	LDREG		0(\ptp),\pte
 	bb,<,n		\pte,_PAGE_PRESENT_BIT,3f
+	LDCW		0(\tmp),\tmp1
 	b		\fault
-	stw,ma		\spc,0(\tmp)
+	stw		\spc,0(\tmp)
 99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)
 #endif
 2:	LDREG		0(\ptp),\pte
@@ -481,20 +478,22 @@ 
 	.endm

 	/* Release pa_tlb_lock lock without reloading lock address. */
-	.macro		tlb_unlock0	spc,tmp
+	.macro		tlb_unlock0	spc,tmp,tmp1
 #ifdef CONFIG_SMP
 98:	or,COND(=)	%r0,\spc,%r0
-	stw,ma		\spc,0(\tmp)
+	LDCW		0(\tmp),\tmp1
+	or,COND(=)	%r0,\spc,%r0
+	stw		\spc,0(\tmp)
 99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)
 #endif
 	.endm

 	/* Release pa_tlb_lock lock. */
-	.macro		tlb_unlock1	spc,tmp
+	.macro		tlb_unlock1	spc,tmp,tmp1
 #ifdef CONFIG_SMP
 98:	load_pa_tlb_lock \tmp
 99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)
-	tlb_unlock0	\spc,\tmp
+	tlb_unlock0	\spc,\tmp,\tmp1
 #endif
 	.endm

@@ -1177,7 +1176,7 @@  dtlb_miss_20w:
 	
 	idtlbt          pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1203,7 +1202,7 @@  nadtlb_miss_20w:

 	idtlbt          pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1237,7 +1236,7 @@  dtlb_miss_11:

 	mtsp		t1, %sr1	/* Restore sr1 */

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1270,7 +1269,7 @@  nadtlb_miss_11:

 	mtsp		t1, %sr1	/* Restore sr1 */

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1299,7 +1298,7 @@  dtlb_miss_20:

 	idtlbt          pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1327,7 +1326,7 @@  nadtlb_miss_20:
 	
 	idtlbt		pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1434,7 +1433,7 @@  itlb_miss_20w:
 	
 	iitlbt          pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1458,7 +1457,7 @@  naitlb_miss_20w:

 	iitlbt          pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1492,7 +1491,7 @@  itlb_miss_11:

 	mtsp		t1, %sr1	/* Restore sr1 */

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1516,7 +1515,7 @@  naitlb_miss_11:

 	mtsp		t1, %sr1	/* Restore sr1 */

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1546,7 +1545,7 @@  itlb_miss_20:

 	iitlbt          pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1566,7 +1565,7 @@  naitlb_miss_20:

 	iitlbt          pte,prot

-	tlb_unlock1	spc,t0
+	tlb_unlock1	spc,t0,t1
 	rfir
 	nop

@@ -1596,7 +1595,7 @@  dbit_trap_20w:
 		
 	idtlbt          pte,prot

-	tlb_unlock0	spc,t0
+	tlb_unlock0	spc,t0,t1
 	rfir
 	nop
 #else
@@ -1622,7 +1621,7 @@  dbit_trap_11:

 	mtsp            t1, %sr1     /* Restore sr1 */

-	tlb_unlock0	spc,t0
+	tlb_unlock0	spc,t0,t1
 	rfir
 	nop

@@ -1642,7 +1641,7 @@  dbit_trap_20:
 	
 	idtlbt		pte,prot

-	tlb_unlock0	spc,t0
+	tlb_unlock0	spc,t0,t1
 	rfir
 	nop
 #endif
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 4f77bd9be66b..e2b4c8d81275 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -640,7 +640,9 @@  cas_action:
 	sub,<>	%r28, %r25, %r0
 2:	stw	%r24, 0(%r26)
 	/* Free lock */
-	sync
+#ifdef CONFIG_SMP
+	LDCW	0(%sr2,%r20), %r1			/* Barrier */
+#endif
 	stw	%r20, 0(%sr2,%r20)
 #if ENABLE_LWS_DEBUG
 	/* Clear thread register indicator */
@@ -655,7 +657,9 @@  cas_action:
 3:		
 	/* Error occurred on load or store */
 	/* Free lock */
-	sync
+#ifdef CONFIG_SMP
+	LDCW	0(%sr2,%r20), %r1			/* Barrier */
+#endif
 	stw	%r20, 0(%sr2,%r20)
 #if ENABLE_LWS_DEBUG
 	stw	%r0, 4(%sr2,%r20)
@@ -857,7 +861,9 @@  cas2_action:

 cas2_end:
 	/* Free lock */
-	sync
+#ifdef CONFIG_SMP
+	LDCW	0(%sr2,%r20), %r1			/* Barrier */
+#endif
 	stw	%r20, 0(%sr2,%r20)
 	/* Enable interrupts */
 	ssm	PSW_SM_I, %r0
@@ -868,7 +874,9 @@  cas2_end:
 22:
 	/* Error occurred on load or store */
 	/* Free lock */
-	sync
+#ifdef CONFIG_SMP
+	LDCW	0(%sr2,%r20), %r1			/* Barrier */
+#endif
 	stw	%r20, 0(%sr2,%r20)
 	ssm	PSW_SM_I, %r0
 	ldo	1(%r0),%r28