[RFC,v1,1/3] target/ppc: Emulate LL/SC using cmpxchg helpers

Message ID 20170406102249.20383-2-nikunj@linux.vnet.ibm.com (mailing list archive)
State New, archived

Commit Message

Nikunj A. Dadhania April 6, 2017, 10:22 a.m. UTC
Emulating LL/SC with cmpxchg is not correct, since it can suffer from
the ABA problem. However, portable parallel code is written assuming
only cmpxchg which means that in practice this is a viable alternative.

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
---
 target/ppc/translate.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)
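
To illustrate the ABA hazard described in the commit message, here is a small, self-contained C sketch (illustrative only, not part of the patch). A cmpxchg-based store-conditional succeeds whenever the word holds the originally loaded value again, even if another thread changed it and changed it back in the meantime; a real lwarx/stwcx. pair would fail, because any store to the reserved granule clears the reservation.

/* Illustrative only: the ABA problem with a cmpxchg-emulated stwcx.
 * The interleaving of a second thread is simulated inline for brevity. */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned word = 0xA;

int main(void)
{
    /* "lwarx": load and remember the reservation value. */
    unsigned reserve_val = atomic_load(&word);

    /* Another thread modifies the word and then restores it: A -> B -> A. */
    atomic_store(&word, 0xB);
    atomic_store(&word, 0xA);

    /* "stwcx." emulated with cmpxchg: succeeds although the word was
     * written after the reservation was taken, which real hardware
     * would report as a failed store-conditional. */
    unsigned expected = reserve_val;
    int ok = atomic_compare_exchange_strong(&word, &expected, 0xC);
    printf("store-conditional %s\n", ok ? "succeeded (ABA)" : "failed");
    return 0;
}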

Comments

Richard Henderson April 6, 2017, 3:51 p.m. UTC | #1
On 04/06/2017 03:22 AM, Nikunj A Dadhania wrote:
> +    TCGv_i32 tmp = tcg_temp_local_new_i32();
> +    TCGv t0;
>
> +    tcg_gen_movi_i32(tmp, 0);
>      tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
>      l1 = gen_new_label();
>      tcg_gen_brcond_tl(TCG_COND_NE, EA, cpu_reserve, l1);
> -    tcg_gen_ori_i32(cpu_crf[0], cpu_crf[0], CRF_EQ);
> -    tcg_gen_qemu_st_tl(cpu_gpr[reg], EA, ctx->mem_idx, memop);
> +
> +    t0 = tcg_temp_new();
> +    tcg_gen_atomic_cmpxchg_tl(t0, EA, cpu_reserve_val, cpu_gpr[reg],
> +                              ctx->mem_idx, DEF_MEMOP(memop));
> +    tcg_gen_setcond_tl(TCG_COND_EQ, t0, t0, cpu_reserve_val);
> +    tcg_gen_trunc_tl_i32(tmp, t0);
> +
>      gen_set_label(l1);
> +    tcg_gen_shli_i32(tmp, tmp, CRF_EQ_BIT);
> +    tcg_gen_or_i32(cpu_crf[0], cpu_crf[0], tmp);

I encourage you to move these two lines up beside the setcond.
That way you don't need to use a local tmp, which implies a
spill/restore from the stack.


r~
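
A rough sketch of that reordering (a reconstruction of the suggestion, not the actual respin): with the shift and or emitted right after the setcond, the i32 temporary lives entirely between the brcond and the label, so a plain tcg_temp_new_i32() suffices, the initial tcg_gen_movi_i32(tmp, 0) is no longer needed, and no spill/restore is generated.

    /* Sketch of the suggested reordering (not the actual respin): the CR0.EQ
     * update is folded in right after the cmpxchg result is known, so "tmp"
     * never has to survive a branch and need not be a local temp. */
    t0 = tcg_temp_new();
    tmp = tcg_temp_new_i32();              /* instead of tcg_temp_local_new_i32() */
    tcg_gen_atomic_cmpxchg_tl(t0, EA, cpu_reserve_val, cpu_gpr[reg],
                              ctx->mem_idx, DEF_MEMOP(memop));
    tcg_gen_setcond_tl(TCG_COND_EQ, t0, t0, cpu_reserve_val);
    tcg_gen_trunc_tl_i32(tmp, t0);
    tcg_gen_shli_i32(tmp, tmp, CRF_EQ_BIT); /* moved up from below the label */
    tcg_gen_or_i32(cpu_crf[0], cpu_crf[0], tmp);
    tcg_temp_free_i32(tmp);
    tcg_temp_free(t0);

    gen_set_label(l1);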
Richard Henderson April 6, 2017, 3:53 p.m. UTC | #2
On 04/06/2017 03:22 AM, Nikunj A Dadhania wrote:
>      tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
>      l1 = gen_new_label();
>      tcg_gen_brcond_tl(TCG_COND_NE, EA, cpu_reserve, l1);
> -    tcg_gen_ori_i32(cpu_crf[0], cpu_crf[0], CRF_EQ);
> -    tcg_gen_qemu_st_tl(cpu_gpr[reg], EA, ctx->mem_idx, memop);
> +
> +    t0 = tcg_temp_new();
> +    tcg_gen_atomic_cmpxchg_tl(t0, EA, cpu_reserve_val, cpu_gpr[reg],
> +                              ctx->mem_idx, DEF_MEMOP(memop));

Actually, I noticed another, pre-existing problem.

This code changes CRF[0] before the user memory write, which might fault. Any
change to the architecture-visible state needs to be delayed until after the
point where an exception may be triggered.


r~
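
One way to arrange that, sketched below (an illustration of the idea, not the eventual fix): compute CR0 in a temporary and copy it into cpu_crf[0] only after the cmpxchg has completed, with a second label joining the success and failure paths, so a faulting access leaves CR0 and the rest of the visible state unchanged.

    /* Sketch only: defer every architecture-visible update until after the
     * memory access, so that a faulting store leaves CR0 untouched. */
    TCGLabel *l1 = gen_new_label();
    TCGLabel *l2 = gen_new_label();
    TCGv t0;

    tcg_gen_brcond_tl(TCG_COND_NE, EA, cpu_reserve, l1);

    /* Reservation address matches: attempt the store, then fold SO and EQ. */
    t0 = tcg_temp_new();
    tcg_gen_atomic_cmpxchg_tl(t0, EA, cpu_reserve_val, cpu_gpr[reg],
                              ctx->mem_idx, DEF_MEMOP(memop));
    tcg_gen_setcond_tl(TCG_COND_EQ, t0, t0, cpu_reserve_val);
    tcg_gen_shli_tl(t0, t0, CRF_EQ_BIT);
    tcg_gen_or_tl(t0, t0, cpu_so);
    tcg_gen_trunc_tl_i32(cpu_crf[0], t0);   /* CR0 written only after the access */
    tcg_temp_free(t0);
    tcg_gen_br(l2);

    /* Reservation address mismatch: the store-conditional fails, CR0 = SO. */
    gen_set_label(l1);
    tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);

    gen_set_label(l2);
    tcg_gen_movi_tl(cpu_reserve, -1);
    tcg_gen_movi_tl(cpu_reserve_val, 0);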
Nikunj A. Dadhania April 7, 2017, 5:12 a.m. UTC | #3
Richard Henderson <rth@twiddle.net> writes:

> On 04/06/2017 03:22 AM, Nikunj A Dadhania wrote:
>> +    TCGv_i32 tmp = tcg_temp_local_new_i32();
>> +    TCGv t0;
>>
>> +    tcg_gen_movi_i32(tmp, 0);
>>      tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
>>      l1 = gen_new_label();
>>      tcg_gen_brcond_tl(TCG_COND_NE, EA, cpu_reserve, l1);
>> -    tcg_gen_ori_i32(cpu_crf[0], cpu_crf[0], CRF_EQ);
>> -    tcg_gen_qemu_st_tl(cpu_gpr[reg], EA, ctx->mem_idx, memop);
>> +
>> +    t0 = tcg_temp_new();
>> +    tcg_gen_atomic_cmpxchg_tl(t0, EA, cpu_reserve_val, cpu_gpr[reg],
>> +                              ctx->mem_idx, DEF_MEMOP(memop));
>> +    tcg_gen_setcond_tl(TCG_COND_EQ, t0, t0, cpu_reserve_val);
>> +    tcg_gen_trunc_tl_i32(tmp, t0);
>> +
>>      gen_set_label(l1);
>> +    tcg_gen_shli_i32(tmp, tmp, CRF_EQ_BIT);
>> +    tcg_gen_or_i32(cpu_crf[0], cpu_crf[0], tmp);
>
> I encourage you to move these two lines up beside the setcond.
> That way you don't need to use a local tmp, which implies a
> spill/restore from the stack.

Sure.

Regards
Nikunj
Nikunj A. Dadhania April 7, 2017, 5:14 a.m. UTC | #4
Richard Henderson <rth@twiddle.net> writes:

> On 04/06/2017 03:22 AM, Nikunj A Dadhania wrote:
>>      tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
>>      l1 = gen_new_label();
>>      tcg_gen_brcond_tl(TCG_COND_NE, EA, cpu_reserve, l1);
>> -    tcg_gen_ori_i32(cpu_crf[0], cpu_crf[0], CRF_EQ);
>> -    tcg_gen_qemu_st_tl(cpu_gpr[reg], EA, ctx->mem_idx, memop);
>> +
>> +    t0 = tcg_temp_new();
>> +    tcg_gen_atomic_cmpxchg_tl(t0, EA, cpu_reserve_val, cpu_gpr[reg],
>> +                              ctx->mem_idx, DEF_MEMOP(memop));
>
> Actually, I noticed another, existing, problem.
>
> This code changes CRF[0] before the user memory write, which might fault.  This 
> needs to delay any changes to the architecture visible state until after any 
> exception may be triggered.

Sure. Here you are referring to cpu_so being moved into CRF[0].

Regards
Nikunj
David Gibson April 7, 2017, 5:23 a.m. UTC | #5
On Thu, Apr 06, 2017 at 03:52:47PM +0530, Nikunj A Dadhania wrote:
> Emulating LL/SC with cmpxchg is not correct, since it can suffer from
> the ABA problem. However, portable parallel code is written assuming
> only cmpxchg which means that in practice this is a viable alternative.
> 
> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> ---
>  target/ppc/translate.c | 24 +++++++++++++++++++++---
>  1 file changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index b6abc60..a9c733d 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -73,6 +73,7 @@ static TCGv cpu_cfar;
>  #endif
>  static TCGv cpu_xer, cpu_so, cpu_ov, cpu_ca, cpu_ov32, cpu_ca32;
>  static TCGv cpu_reserve;
> +static TCGv cpu_reserve_val;
>  static TCGv cpu_fpscr;
>  static TCGv_i32 cpu_access_type;
>  
> @@ -181,6 +182,9 @@ void ppc_translate_init(void)
>      cpu_reserve = tcg_global_mem_new(cpu_env,
>                                       offsetof(CPUPPCState, reserve_addr),
>                                       "reserve_addr");
> +    cpu_reserve_val = tcg_global_mem_new(cpu_env,
> +                                     offsetof(CPUPPCState, reserve_val),
> +                                     "reserve_val");

I notice that lqarx is not updated.  Does that matter?

>      cpu_fpscr = tcg_global_mem_new(cpu_env,
>                                     offsetof(CPUPPCState, fpscr), "fpscr");
> @@ -3023,7 +3027,7 @@ static void gen_##name(DisasContext *ctx)                            \
>      }                                                                \
>      tcg_gen_qemu_ld_tl(gpr, t0, ctx->mem_idx, memop);                \
>      tcg_gen_mov_tl(cpu_reserve, t0);                                 \
> -    tcg_gen_st_tl(gpr, cpu_env, offsetof(CPUPPCState, reserve_val)); \
> +    tcg_gen_mov_tl(cpu_reserve_val, gpr);                            \
>      tcg_temp_free(t0);                                               \
>  }
>  
> @@ -3156,14 +3160,28 @@ static void gen_conditional_store(DisasContext *ctx, TCGv EA,
>                                    int reg, int memop)
>  {
>      TCGLabel *l1;
> +    TCGv_i32 tmp = tcg_temp_local_new_i32();
> +    TCGv t0;
>  
> +    tcg_gen_movi_i32(tmp, 0);
>      tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
>      l1 = gen_new_label();
>      tcg_gen_brcond_tl(TCG_COND_NE, EA, cpu_reserve, l1);
> -    tcg_gen_ori_i32(cpu_crf[0], cpu_crf[0], CRF_EQ);
> -    tcg_gen_qemu_st_tl(cpu_gpr[reg], EA, ctx->mem_idx, memop);
> +
> +    t0 = tcg_temp_new();
> +    tcg_gen_atomic_cmpxchg_tl(t0, EA, cpu_reserve_val, cpu_gpr[reg],
> +                              ctx->mem_idx, DEF_MEMOP(memop));
> +    tcg_gen_setcond_tl(TCG_COND_EQ, t0, t0, cpu_reserve_val);
> +    tcg_gen_trunc_tl_i32(tmp, t0);
> +
>      gen_set_label(l1);
> +    tcg_gen_shli_i32(tmp, tmp, CRF_EQ_BIT);
> +    tcg_gen_or_i32(cpu_crf[0], cpu_crf[0], tmp);
>      tcg_gen_movi_tl(cpu_reserve, -1);
> +    tcg_gen_movi_tl(cpu_reserve_val, 0);
> +
> +    tcg_temp_free(t0);
> +    tcg_temp_free_i32(tmp);
>  }
>  #endif
>
Nikunj A. Dadhania April 7, 2017, 5:42 a.m. UTC | #6
David Gibson <david@gibson.dropbear.id.au> writes:

> [ Unknown signature status ]
> On Thu, Apr 06, 2017 at 03:52:47PM +0530, Nikunj A Dadhania wrote:
>> Emulating LL/SC with cmpxchg is not correct, since it can suffer from
>> the ABA problem. However, portable parallel code is written assuming
>> only cmpxchg which means that in practice this is a viable alternative.
>> 
>> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
>> ---
>>  target/ppc/translate.c | 24 +++++++++++++++++++++---
>>  1 file changed, 21 insertions(+), 3 deletions(-)
>> 
>> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
>> index b6abc60..a9c733d 100644
>> --- a/target/ppc/translate.c
>> +++ b/target/ppc/translate.c
>> @@ -73,6 +73,7 @@ static TCGv cpu_cfar;
>>  #endif
>>  static TCGv cpu_xer, cpu_so, cpu_ov, cpu_ca, cpu_ov32, cpu_ca32;
>>  static TCGv cpu_reserve;
>> +static TCGv cpu_reserve_val;
>>  static TCGv cpu_fpscr;
>>  static TCGv_i32 cpu_access_type;
>>  
>> @@ -181,6 +182,9 @@ void ppc_translate_init(void)
>>      cpu_reserve = tcg_global_mem_new(cpu_env,
>>                                       offsetof(CPUPPCState, reserve_addr),
>>                                       "reserve_addr");
>> +    cpu_reserve_val = tcg_global_mem_new(cpu_env,
>> +                                     offsetof(CPUPPCState, reserve_val),
>> +                                     "reserve_val");
>
> I notice that lqarx is not updated.  Does that matter?

That's correct, I haven't touched that yet. Most of the locks are
implemented using lwarx/stwcx.

Regards
Nikunj
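
For context, an illustrative example (not from the thread): portable code usually reaches lwarx/stwcx. through compiler atomics rather than hand-written assembly. On ppc64, GCC and Clang lower the 32-bit compare-and-swap loop below to an lwarx/stwcx. retry loop, which is why that pair covers the common locking paths.

/* Illustrative only: a typical portable CAS-based spinlock.  On ppc64,
 * a C11 compare-exchange on a 32-bit word compiles to an lwarx/stwcx.
 * retry loop. */
#include <stdatomic.h>

static atomic_int lock;

void spin_lock(void)
{
    int expected = 0;
    while (!atomic_compare_exchange_weak(&lock, &expected, 1)) {
        expected = 0;       /* CAS failed: someone holds the lock, retry */
    }
}

void spin_unlock(void)
{
    atomic_store(&lock, 0);
}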

Patch

diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index b6abc60..a9c733d 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -73,6 +73,7 @@  static TCGv cpu_cfar;
 #endif
 static TCGv cpu_xer, cpu_so, cpu_ov, cpu_ca, cpu_ov32, cpu_ca32;
 static TCGv cpu_reserve;
+static TCGv cpu_reserve_val;
 static TCGv cpu_fpscr;
 static TCGv_i32 cpu_access_type;
 
@@ -181,6 +182,9 @@  void ppc_translate_init(void)
     cpu_reserve = tcg_global_mem_new(cpu_env,
                                      offsetof(CPUPPCState, reserve_addr),
                                      "reserve_addr");
+    cpu_reserve_val = tcg_global_mem_new(cpu_env,
+                                     offsetof(CPUPPCState, reserve_val),
+                                     "reserve_val");
 
     cpu_fpscr = tcg_global_mem_new(cpu_env,
                                    offsetof(CPUPPCState, fpscr), "fpscr");
@@ -3023,7 +3027,7 @@  static void gen_##name(DisasContext *ctx)                            \
     }                                                                \
     tcg_gen_qemu_ld_tl(gpr, t0, ctx->mem_idx, memop);                \
     tcg_gen_mov_tl(cpu_reserve, t0);                                 \
-    tcg_gen_st_tl(gpr, cpu_env, offsetof(CPUPPCState, reserve_val)); \
+    tcg_gen_mov_tl(cpu_reserve_val, gpr);                            \
     tcg_temp_free(t0);                                               \
 }
 
@@ -3156,14 +3160,28 @@  static void gen_conditional_store(DisasContext *ctx, TCGv EA,
                                   int reg, int memop)
 {
     TCGLabel *l1;
+    TCGv_i32 tmp = tcg_temp_local_new_i32();
+    TCGv t0;
 
+    tcg_gen_movi_i32(tmp, 0);
     tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
     l1 = gen_new_label();
     tcg_gen_brcond_tl(TCG_COND_NE, EA, cpu_reserve, l1);
-    tcg_gen_ori_i32(cpu_crf[0], cpu_crf[0], CRF_EQ);
-    tcg_gen_qemu_st_tl(cpu_gpr[reg], EA, ctx->mem_idx, memop);
+
+    t0 = tcg_temp_new();
+    tcg_gen_atomic_cmpxchg_tl(t0, EA, cpu_reserve_val, cpu_gpr[reg],
+                              ctx->mem_idx, DEF_MEMOP(memop));
+    tcg_gen_setcond_tl(TCG_COND_EQ, t0, t0, cpu_reserve_val);
+    tcg_gen_trunc_tl_i32(tmp, t0);
+
     gen_set_label(l1);
+    tcg_gen_shli_i32(tmp, tmp, CRF_EQ_BIT);
+    tcg_gen_or_i32(cpu_crf[0], cpu_crf[0], tmp);
     tcg_gen_movi_tl(cpu_reserve, -1);
+    tcg_gen_movi_tl(cpu_reserve_val, 0);
+
+    tcg_temp_free(t0);
+    tcg_temp_free_i32(tmp);
 }
 #endif