[RFC] 64bit LWS CAS

Message ID 20140717220000.555b2bec@dellete.lux.tuxicoman.be (mailing list archive)
State Superseded

Commit Message

Guy Martin July 17, 2014, 8 p.m. UTC
Hi,

I've attached the gcc and kernel patches for 64-bit CAS. So far I've
implemented the easiest use case, which is for a 64-bit kernel.

I'll investigate using the FPU registers for 64-bit operations on
32-bit kernels.

I feel like there is a lot of code duplication in my patches; this can
probably be optimized, although it might reduce readability.

Any comments?


Thanks,
  Guy

Comments

John David Anglin July 17, 2014, 9:12 p.m. UTC | #1
On 7/17/2014 4:00 PM, Guy Martin wrote:
> +#include <stdint.h>
> +
I would kill the above include and use long long instead of int64_t.
Generally, we don't want libgcc routines to depend on system includes.

SYNC_LOCK_RELEASE needs to be fixed to use __kernel_cmpxchg and
__kernel_cmpxchg_dword32.  The current implementation won't be atomic
for long long, and in glibc, a similar implementation caused problems
for the int type.  I've been testing a patch for the latter and will
apply it soon.
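
Concretely, the int case would become a CAS loop along these lines (a
minimal sketch of the direction described above, not the exact patch
being tested; it reuses the __kernel_cmpxchg helper already in
linux-atomic.c, and the 8-byte case would loop over
__kernel_cmpxchg_dword32 the same way):

/* Sketch only: release via the kernel LWS CAS so the store is
   serialized with every other LWS operation on this word.  */
void HIDDEN
__sync_lock_release_4 (int *ptr)
{
  int oldval, failure;

  do
    {
      oldval = *ptr;
      /* Swap in 0; retry if another LWS user got there first.  */
      failure = __kernel_cmpxchg (oldval, 0, ptr);
    }
  while (failure != 0);
}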

Dave
Helge Deller July 17, 2014, 9:30 p.m. UTC | #2
On 07/17/2014 10:00 PM, Guy Martin wrote:
> +/* Kernel helper for compare-and-exchange a 64-bit value from ELF32.  */
> +static inline long
> +__kernel_cmpxchg_dword32 (int64_t oldval, int64_t newval, int64_t *mem)
> +{
> +  register unsigned long lws_mem asm("r26") = (unsigned long) (mem);
> +  register long lws_ret_h   asm("r28");
> +  register long lws_ret_l   asm("r29");
> +  register long lws_errno   asm("r21");
> +  register int lws_old_h    asm("r25") = oldval >> 32;
> +  register int lws_old_l    asm("r24") = oldval & 0xffffffff;
> +  register int lws_new_h    asm("r23") = newval >> 32;
> +  register int lws_new_l    asm("r22") = newval & 0xffffffff;
> +  asm volatile (	"ble	0xb0(%%sr2, %%r0)	\n\t"
> +			"ldi	%8, %%r20		\n\t"
> +	: "=r" (lws_ret_h), "=r" (lws_ret_l), "=r" (lws_errno), "=r" (lws_mem),
> +	  "=r" (lws_old_h), "=r" (lws_old_l), "=r" (lws_new_h), "=r" (lws_new_l)
> +	: "i" (2), "3" (lws_mem), "4" (lws_old_h), "5" (lws_old_l), "6" (lws_new_h), "7" (lws_new_l)
> +	: "r1", "r20", "r31", "memory"
> +  );

Just a thought:
I'm not sure how well gcc optimizes the assignment of the 64-bit parameters to their final destination registers (r22-r25) with regard to the shifting and masking, but it might be worth checking whether gcc's built-in "%R" operand modifier (sorry, I don't know the official name of this feature!) can help here.

As an example, see the __put_kernel_asm64() macro in the kernel header arch/parisc/include/asm/uaccess.h:

#define __put_kernel_asm64(__val,ptr) do {                  \
        __asm__ __volatile__ (                              \
                "\n1:\tstw %2,0(%1)"                        \
                "\n2:\tstw %R2,4(%1)\n\t"                   \
                : "=r"(__pu_err)                            \
                : "r"(ptr), "r"(__val), "0"(__pu_err)       \
                : "r1");                                    \
} while (0)

Helge
John David Anglin July 17, 2014, 10:51 p.m. UTC | #3
On 17-Jul-14, at 5:30 PM, Helge Deller wrote:

> On 07/17/2014 10:00 PM, Guy Martin wrote:
>> +/* Kernel helper for compare-and-exchange a 64-bit value from ELF32.  */
>> +static inline long
>> +__kernel_cmpxchg_dword32 (int64_t oldval, int64_t newval, int64_t *mem)
>> +{
>> +  register unsigned long lws_mem asm("r26") = (unsigned long) (mem);
>> +  register long lws_ret_h   asm("r28");
>> +  register long lws_ret_l   asm("r29");
>> +  register long lws_errno   asm("r21");
>> +  register int lws_old_h    asm("r25") = oldval >> 32;
>> +  register int lws_old_l    asm("r24") = oldval & 0xffffffff;
>> +  register int lws_new_h    asm("r23") = newval >> 32;
>> +  register int lws_new_l    asm("r22") = newval & 0xffffffff;
>> +  asm volatile (	"ble	0xb0(%%sr2, %%r0)	\n\t"
>> +			"ldi	%8, %%r20		\n\t"
>> +	: "=r" (lws_ret_h), "=r" (lws_ret_l), "=r" (lws_errno), "=r" (lws_mem),
>> +	  "=r" (lws_old_h), "=r" (lws_old_l), "=r" (lws_new_h), "=r" (lws_new_l)
>> +	: "i" (2), "3" (lws_mem), "4" (lws_old_h), "5" (lws_old_l), "6" (lws_new_h), "7" (lws_new_l)
>> +	: "r1", "r20", "r31", "memory"
>> +  );
>
> Just a thought:
> I'm not sure how well gcc optimizes the assignment of the 64-bit parameters to their final destination registers (r22-r25) with regard to the shifting and masking, but it might be worth checking whether gcc's built-in "%R" operand modifier (sorry, I don't know the official name of this feature!) can help here.
>
> As an example, see the __put_kernel_asm64() macro in the kernel header arch/parisc/include/asm/uaccess.h:
>
> #define __put_kernel_asm64(__val,ptr) do {                  \
>         __asm__ __volatile__ (                              \
>                 "\n1:\tstw %2,0(%1)"                        \
>                 "\n2:\tstw %R2,4(%1)\n\t"                   \
>                 : "=r"(__pu_err)                            \
>                 : "r"(ptr), "r"(__val), "0"(__pu_err)       \
>                 : "r1");                                    \
> } while (0)


Maybe.  On the other hand, if mem were the third argument in the ble
call, it would likely be possible to write:

register long long lws_ret asm("r28");
register long long lws_old asm("r25");
register long long lws_new asm("r23");
register unsigned long lws_mem asm("r22");

This is consistent with the parisc calling conventions for 64-bit
objects.  If it works, the low part of lws_ret, etc., should
automatically get allocated to the correct register and the copy done
efficiently.  I have to say I have not tried it.
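
Fleshed out, that might look something like this (untested sketch; it
assumes a hypothetical LWS entry that really does take mem as the third
argument, which the current kernel patch does not provide, so the
register layout below is speculative):

/* Untested sketch: 64-bit values in register pairs, mem moved to r22.  */
static inline long
__kernel_cmpxchg_dword (long long oldval, long long newval, long long *mem)
{
  register long long lws_ret asm("r28");
  register long lws_errno asm("r21");
  register long long lws_old asm("r25") = oldval;
  register long long lws_new asm("r23") = newval;
  register unsigned long lws_mem asm("r22") = (unsigned long) mem;

  asm volatile (	"ble	0xb0(%%sr2, %%r0)	\n\t"
			"ldi	%5, %%r20		\n\t"
	: "=r" (lws_ret), "=r" (lws_errno),
	  "+r" (lws_old), "+r" (lws_new), "+r" (lws_mem)
	: "i" (2)
	: "r1", "r20", "r31", "memory"
  );

  if (__builtin_expect (lws_errno == -EFAULT || lws_errno == -ENOSYS, 0))
    ABORT_INSTRUCTION;

  /* As in the dword32 helper: on success lws_ret holds the old value
     from memory; report -EBUSY if it didn't match OLDVAL.  */
  if (!lws_errno && lws_ret != oldval)
    lws_errno = -EBUSY;

  return lws_errno;
}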

Dave
--
John David Anglin	dave.anglin@bell.net



John David Anglin July 17, 2014, 11:27 p.m. UTC | #4
On 17-Jul-14, at 5:12 PM, John David Anglin wrote:

> SYNC_LOCK_RELEASE needs to be fixed to use __kernel_cmpxchg and __kernel_cmpxchg_dword32.  The current implementation won't be atomic for long long, and in glibc, a similar implementation caused problems for the int type.  I've been testing a patch for the latter and will apply it soon.

The change is now committed to GCC 4.10.

Dave
--
John David Anglin	dave.anglin@bell.net



John David Anglin July 17, 2014, 11:57 p.m. UTC | #5
On 17-Jul-14, at 7:27 PM, John David Anglin wrote:

> On 17-Jul-14, at 5:12 PM, John David Anglin wrote:
>
>> SYNC_LOCK_RELEASE needs to be fixed to use __kernel_cmpxchg and __kernel_cmpxchg_dword32.  The current implementation won't be atomic for long long, and in glibc, a similar implementation caused problems for the int type.  I've been testing a patch for the latter and will apply it soon.
>
> The change is now committed to GCC 4.10.

Also, see this change:
https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01242.html

We will need a DI mode define for the 64-bit sync operation.

This enables "future" support in libstdc++.
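
For illustration, what that buys for 32-bit user code (sketch only;
nothing parisc-specific in the source, just the generic __sync
builtins):

/* With the __sync_*_8 routines from this patch plus a DImode define
   in the backend, GCC can expand this on ELF32 parisc instead of
   leaving an unresolved reference (here it becomes a call to
   __sync_add_and_fetch_8 in libgcc).  */
long long counter;

long long
bump (long long n)
{
  return __sync_add_and_fetch (&counter, n);
}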

Dave
--
John David Anglin	dave.anglin@bell.net



John David Anglin July 18, 2014, 2:49 p.m. UTC | #6
On 7/17/2014 4:00 PM, Guy Martin wrote:
> +/* Kernel helper for compare-and-exchange a 64-bit value from ELF32.  */
> +static inline long
> +__kernel_cmpxchg_dword32 (int64_t oldval, int64_t newval, int64_t *mem)
I'm thinking we don't need the suffix "32".  If a 64-bit runtime is
ever developed, there would probably be a separate file for it, as on
arm.

This discussion has got me thinking that the char and short
implementations may be broken (i.e., the CAS operation may clobber data
adjacent to the location being operated on).  I'm thinking we need
separate LWS calls for them.  Thoughts?

Dave
Carlos O'Donell July 18, 2014, 9:55 p.m. UTC | #7
On Fri, Jul 18, 2014 at 10:49 AM, John David Anglin
<dave.anglin@bell.net> wrote:
> On 7/17/2014 4:00 PM, Guy Martin wrote:
>>
>> +/* Kernel helper for compare-and-exchange a 64-bit value from ELF32.  */
>> +static inline long
>> +__kernel_cmpxchg_dword32 (int64_t oldval, int64_t newval, int64_t *mem)
>
> I'm thinking we don't need the suffix "32".  If a 64-bit runtime is ever developed, there would probably be a separate file for it, as on arm.
>
> This discussion has got me thinking that the char and short implementations may be broken (i.e., the CAS operation may clobber data adjacent to the location being operated on).  I'm thinking we need separate LWS calls for them.  Thoughts?

Yes, you need masking or short load variants. How did you test the
char and short implementations? ;-)

Cheers,
Carlos.
John David Anglin July 18, 2014, 10:12 p.m. UTC | #8
On 18-Jul-14, at 5:55 PM, Carlos O'Donell wrote:

>> This discussion has got me thinking that the char and short implementations may be broken (i.e., the CAS operation may clobber data adjacent to the location being operated on).  I'm thinking we need separate LWS calls for them.  Thoughts?
>
> Yes, you need masking or short load variants. How did you test the
> char and short implementations? ;-)

There is no testing beyond the general testing of atomics in GCC.
There is masking in the GCC code, but I think this is racy.  Another
thread might modify the masked data during the CAS operation, and that
modification might be lost.  I've never seen it in the real world,
though.

The implementation was originally copied from arm.
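
For reference, the pattern in question, condensed from the
SUBWORD_SYNC_OP macro in linux-atomic.c (simplified sketch: byte case,
fetch-and-add only; MASK_1 and INVERT_MASK_1 are the big-endian masks
defined in that file):

unsigned char
subword_fetch_and_add_1 (unsigned char *ptr, unsigned char val)
{
  int *wordptr = (int *) ((unsigned long) ptr & ~3);
  unsigned int mask, shift, oldval, newval;
  int failure;

  /* Big endian: byte 0 lives in the most significant bits.  */
  shift = (((unsigned long) ptr & 3) << 3) ^ INVERT_MASK_1;
  mask = MASK_1 << shift;

  do
    {
      oldval = *wordptr;	/* Reads the three adjacent bytes too.  */
      newval = ((((oldval & mask) >> shift) + val) << shift) & mask;
      newval |= oldval & ~mask;
      /* The CAS writes back the whole word.  A plain (non-LWS) store
	 to an adjacent byte that lands between the kernel's internal
	 load and store is silently overwritten here, since only LWS
	 users are serialized by the kernel.  */
      failure = __kernel_cmpxchg (oldval, newval, wordptr);
    }
  while (failure != 0);

  return (oldval & mask) >> shift;
}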

Dave
--
John David Anglin	dave.anglin@bell.net



Carlos O'Donell July 19, 2014, 12:38 a.m. UTC | #9
On Fri, Jul 18, 2014 at 6:12 PM, John David Anglin <dave.anglin@bell.net> wrote:
> There is no testing beyond the general testing of atomics in GCC.  There is masking in the GCC code, but I think this is racy.  Another thread might modify the masked data during the CAS operation, and that modification might be lost.  I've never seen it in the real world, though.

The race is real IMO, but you're right that it might be hard to trigger.

My opinion is that the only way you can do this is to write new LWS
CAS variants that do half-word or byte loads, compares and stores. You
can't touch the surrounding data safely.
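
Interface-wise, something like this (hypothetical sketch only: no such
LWS entries exist today and the names are made up; the essential
property is that the kernel side loads, compares and stores exactly the
operand width, never the containing word):

/* Hypothetical width-specific kernel helpers, mirroring the shape of
   __kernel_cmpxchg.  */
static inline long
__kernel_cmpxchg_half (unsigned short oldval, unsigned short newval,
		       unsigned short *mem);
static inline long
__kernel_cmpxchg_byte (unsigned char oldval, unsigned char newval,
		       unsigned char *mem);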

Cheers,
Carlos.
John David Anglin July 19, 2014, 12:42 a.m. UTC | #10
On 18-Jul-14, at 8:38 PM, Carlos O'Donell wrote:

> On Fri, Jul 18, 2014 at 6:12 PM, John David Anglin <dave.anglin@bell.net> wrote:
>> There is no testing beyond the general testing of atomics in GCC.  There is masking in the GCC code, but I think this is racy.  Another thread might modify the masked data during the CAS operation, and that modification might be lost.  I've never seen it in the real world, though.
>
> The race is real IMO, but you're right that it might be hard to trigger.
>
> My opinion is that the only way you can do this is to write new LWS
> CAS variants that do half-word or byte loads, compares and stores. You
> can't touch the surrounding data safely.

I totally agree.

Dave
--
John David Anglin	dave.anglin@bell.net




Patch

--- libgcc/config/pa/linux-atomic.c.orig	2014-07-16 19:29:28.670595484 +0000
+++ libgcc/config/pa/linux-atomic.c	2014-07-16 19:31:32.754003341 +0000
@@ -24,6 +24,8 @@ 
 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 <http://www.gnu.org/licenses/>.  */
 
+#include <stdint.h>
+
 #define EFAULT  14 
 #define EBUSY   16
 #define ENOSYS 251 
@@ -75,6 +77,39 @@ 
   return lws_errno;
 }
 
+/* Kernel helper for compare-and-exchange a 64-bit value from ELF32.  */
+static inline long
+__kernel_cmpxchg_dword32 (int64_t oldval, int64_t newval, int64_t *mem)
+{
+  register unsigned long lws_mem asm("r26") = (unsigned long) (mem);
+  register long lws_ret_h   asm("r28");
+  register long lws_ret_l   asm("r29");
+  register long lws_errno   asm("r21");
+  register int lws_old_h    asm("r25") = oldval >> 32;
+  register int lws_old_l    asm("r24") = oldval & 0xffffffff;
+  register int lws_new_h    asm("r23") = newval >> 32;
+  register int lws_new_l    asm("r22") = newval & 0xffffffff;
+  asm volatile (	"ble	0xb0(%%sr2, %%r0)	\n\t"
+			"ldi	%8, %%r20		\n\t"
+	: "=r" (lws_ret_h), "=r" (lws_ret_l), "=r" (lws_errno), "=r" (lws_mem),
+	  "=r" (lws_old_h), "=r" (lws_old_l), "=r" (lws_new_h), "=r" (lws_new_l)
+	: "i" (2), "3" (lws_mem), "4" (lws_old_h), "5" (lws_old_l), "6" (lws_new_h), "7" (lws_new_l)
+	: "r1", "r20", "r31", "memory"
+  );
+  if (__builtin_expect (lws_errno == -EFAULT || lws_errno == -ENOSYS, 0))
+    ABORT_INSTRUCTION;
+
+  int64_t lws_ret = ((int64_t)lws_ret_h << 32) | (int64_t)lws_ret_l;
+
+  /* If the kernel LWS call succeeded (lws_errno == 0), lws_ret contains
+     the old value from memory.  If this value is equal to OLDVAL, the
+     new value was written to memory.  If not, return -EBUSY.  */
+  if (!lws_errno && lws_ret != oldval)
+    lws_errno = -EBUSY;
+
+  return lws_errno;
+}
+
 #define HIDDEN __attribute__ ((visibility ("hidden")))
 
 /* Big endian masks  */
@@ -84,6 +119,28 @@ 
 #define MASK_1 0xffu
 #define MASK_2 0xffffu
 
+#define FETCH_AND_OP_DWORD(OP, PFX_OP, INF_OP)					\
+  int64_t HIDDEN								\
+  __sync_fetch_and_##OP##_8 (int64_t *ptr, int64_t val)				\
+  {										\
+    int64_t tmp;								\
+    int failure;								\
+										\
+    do {									\
+      tmp = *ptr;								\
+      failure = __kernel_cmpxchg_dword32 (tmp, PFX_OP (tmp INF_OP val), ptr);	\
+    } while (failure != 0);							\
+										\
+    return tmp;									\
+  }
+
+FETCH_AND_OP_DWORD (add,   , +)
+FETCH_AND_OP_DWORD (sub,   , -)
+FETCH_AND_OP_DWORD (or,    , |)
+FETCH_AND_OP_DWORD (and,   , &)
+FETCH_AND_OP_DWORD (xor,   , ^)
+FETCH_AND_OP_DWORD (nand, ~, &)
+
 #define FETCH_AND_OP_WORD(OP, PFX_OP, INF_OP)				\
   int HIDDEN								\
   __sync_fetch_and_##OP##_4 (int *ptr, int val)				\
@@ -147,6 +204,28 @@ 
 SUBWORD_SYNC_OP (xor,   , ^, unsigned char, 1, oldval)
 SUBWORD_SYNC_OP (nand, ~, &, unsigned char, 1, oldval)
 
+#define OP_AND_FETCH_DWORD(OP, PFX_OP, INF_OP)					\
+  int64_t HIDDEN								\
+  __sync_##OP##_and_fetch_8 (int64_t *ptr, int64_t val)				\
+  {										\
+    int64_t tmp;								\
+    int failure;								\
+										\
+    do {									\
+      tmp = *ptr;								\
+      failure = __kernel_cmpxchg_dword32 (tmp, PFX_OP (tmp INF_OP val), ptr);	\
+    } while (failure != 0);							\
+										\
+    return PFX_OP (tmp INF_OP val);						\
+  }
+
+OP_AND_FETCH_DWORD (add,   , +)
+OP_AND_FETCH_DWORD (sub,   , -)
+OP_AND_FETCH_DWORD (or,    , |)
+OP_AND_FETCH_DWORD (and,   , &)
+OP_AND_FETCH_DWORD (xor,   , ^)
+OP_AND_FETCH_DWORD (nand, ~, &)
+
 #define OP_AND_FETCH_WORD(OP, PFX_OP, INF_OP)				\
   int HIDDEN								\
   __sync_##OP##_and_fetch_4 (int *ptr, int val)				\
@@ -182,6 +261,26 @@ 
 SUBWORD_SYNC_OP (xor,   , ^, unsigned char, 1, newval)
 SUBWORD_SYNC_OP (nand, ~, &, unsigned char, 1, newval)
 
+int64_t HIDDEN
+__sync_val_compare_and_swap_8 (int64_t *ptr, int64_t oldval, int64_t newval)
+{
+  int64_t actual_oldval;
+  int fail;
+    
+  while (1)
+    {
+      actual_oldval = *ptr;
+
+      if (__builtin_expect (oldval != actual_oldval, 0))
+	return actual_oldval;
+
+      fail = __kernel_cmpxchg_dword32 (actual_oldval, newval, ptr);
+  
+      if (__builtin_expect (!fail, 1))
+	return actual_oldval;
+    }
+}
+
 int HIDDEN
 __sync_val_compare_and_swap_4 (int *ptr, int oldval, int newval)
 {
@@ -256,6 +355,20 @@ 
 SUBWORD_BOOL_CAS (unsigned short, 2)
 SUBWORD_BOOL_CAS (unsigned char,  1)
 
+int64_t HIDDEN
+__sync_lock_test_and_set_8 (int64_t *ptr, int64_t val)
+{
+  int64_t oldval;
+  int failure;
+
+  do {
+    oldval = *ptr;
+    failure = __kernel_cmpxchg_dword32 (oldval, val, ptr);
+  } while (failure != 0);
+
+  return oldval;
+}
+
 int HIDDEN
 __sync_lock_test_and_set_4 (int *ptr, int val)
 {
@@ -300,6 +413,7 @@ 
     *ptr = 0;								\
   }
 
-SYNC_LOCK_RELEASE (int,   4)
-SYNC_LOCK_RELEASE (short, 2)
-SYNC_LOCK_RELEASE (char,  1)
+SYNC_LOCK_RELEASE (int64_t, 8)
+SYNC_LOCK_RELEASE (int,     4)
+SYNC_LOCK_RELEASE (short,   2)
+SYNC_LOCK_RELEASE (char,    1)