
[RFC,v7,16/16] target-arm: aarch64: add atomic instructions

Message ID 1454059965-23402-17-git-send-email-a.rigo@virtualopensystems.com (mailing list archive)
State New, archived

Commit Message

Alvise Rigo Jan. 29, 2016, 9:32 a.m. UTC
Use the new LL/SC runtime helpers in softmmu_llsc_template.h to handle the
aarch64 atomic instructions.

The STXP emulation required a dedicated helper to handle the paired
doubleword case.

Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
---
 configure                  |   6 +-
 target-arm/helper-a64.c    |  55 +++++++++++++++++++
 target-arm/helper-a64.h    |   4 ++
 target-arm/op_helper.c     |   8 +++
 target-arm/translate-a64.c | 134 ++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 204 insertions(+), 3 deletions(-)

Comments

Alex Bennée Feb. 19, 2016, 11:34 a.m. UTC | #1
Alvise Rigo <a.rigo@virtualopensystems.com> writes:

> Use the new LL/SC runtime helpers in softmmu_llsc_template.h to handle the
> aarch64 atomic instructions.
>
> The STXP emulation required a dedicated helper to handle the paired
> doubleword case.
>
> Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
> Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
> Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
> ---
>  configure                  |   6 +-
>  target-arm/helper-a64.c    |  55 +++++++++++++++++++
>  target-arm/helper-a64.h    |   4 ++
>  target-arm/op_helper.c     |   8 +++
>  target-arm/translate-a64.c | 134 ++++++++++++++++++++++++++++++++++++++++++++-
>  5 files changed, 204 insertions(+), 3 deletions(-)
>
> diff --git a/configure b/configure
> index 915efcc..38121ff 100755
> --- a/configure
> +++ b/configure
> @@ -5873,9 +5873,11 @@ echo "QEMU_CFLAGS+=$cflags" >> $config_target_mak
>  # Use tcg LL/SC tcg backend for exclusive instruction is arm/aarch64
>  # softmmus targets
>  if test "$arm_tcg_use_llsc" = "yes" ; then
> -  if test "$target" = "arm-softmmu" ; then
> +  case "$target" in
> +    arm-softmmu | aarch64-softmmu)
>      echo "CONFIG_ARM_USE_LDST_EXCL=y" >> $config_target_mak
> -  fi
> +    ;;
> +  esac
>  fi

See previous comments about configure code.

>  done # for target in $targets
>
> diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
> index c7bfb4d..dcee66f 100644
> --- a/target-arm/helper-a64.c
> +++ b/target-arm/helper-a64.c
> @@ -26,6 +26,7 @@
>  #include "qemu/bitops.h"
>  #include "internals.h"
>  #include "qemu/crc32c.h"
> +#include "tcg/tcg.h"
>  #include <zlib.h> /* For crc32 */
>
>  /* C2.4.7 Multiply and divide */
> @@ -443,3 +444,57 @@ uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
>      /* Linux crc32c converts the output to one's complement.  */
>      return crc32c(acc, buf, bytes) ^ 0xffffffff;
>  }
> +
> +#ifdef CONFIG_ARM_USE_LDST_EXCL
> +/* STXP emulation for a pair of 64-bit doublewords. We can't directly use two
> + * stcond_i64 accesses, otherwise the first one would conclude the LL/SC pair.
> + * Instead, two normal 64-bit accesses are used and the CPUState is
> + * updated accordingly. */
> +target_ulong HELPER(stxp_i128)(CPUArchState *env, target_ulong addr,
> +                               uint64_t vall, uint64_t valh,
> +                               uint32_t mmu_idx)
> +{
> +    CPUState *cpu = ENV_GET_CPU(env);
> +    TCGMemOpIdx op;
> +    target_ulong ret = 0;
> +
> +    if (!cpu->ll_sc_context) {
> +        cpu->excl_succeeded = false;
> +        ret = 1;
> +        goto out;
> +    }
> +
> +    op = make_memop_idx(MO_BEQ, mmu_idx);
> +
> +    /* According to section C6.6.191 of ARM ARM DDI 0487A.h, the access has to
> +     * be quadword aligned.  For the time being we do not support paired STXPs
> +     * to MMIO memory; this will become trivial once the softmmu supports
> +     * 128-bit memory accesses. */
> +    if (addr & 0xf) {
> +        /* TODO: Do unaligned access */

This should probably log UNIMP if you don't implement it now.
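
A minimal sketch of that suggestion (illustrative only; assumes "qemu/log.h"
is already pulled in here, directly or via an existing include):

    if (addr & 0xf) {
        /* TODO: unaligned paired STXP is not implemented yet */
        qemu_log_mask(LOG_UNIMP,
                      "stxp_i128: unaligned access at 0x" TARGET_FMT_lx "\n",
                      addr);
    }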

> +    }
> +
> +    /* Setting excl_succeeded to true will make the store exclusive. */
> +    cpu->excl_succeeded = true;
> +    helper_ret_stq_mmu(env, addr, vall, op, GETRA());
> +
> +    if (!cpu->excl_succeeded) {
> +        ret = 1;
> +        goto out;
> +    }
> +
> +    helper_ret_stq_mmu(env, addr + 8, valh, op, GETRA());
> +    if (!cpu->excl_succeeded) {
> +        ret = 1;
> +    } else {
> +        cpu->excl_succeeded = false;
> +    }
> +
> +out:
> +    /* Unset LL/SC context */
> +    cpu->ll_sc_context = false;
> +    cpu->excl_protected_range.begin = EXCLUSIVE_RESET_ADDR;
> +
> +    return ret;
> +}
> +#endif
> diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
> index 1d3d10f..c416a83 100644
> --- a/target-arm/helper-a64.h
> +++ b/target-arm/helper-a64.h
> @@ -46,3 +46,7 @@ DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
>  DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env)
>  DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
>  DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
> +#ifdef CONFIG_ARM_USE_LDST_EXCL
> +/* STXP helper */
> +DEF_HELPER_5(stxp_i128, i64, env, i64, i64, i64, i32)
> +#endif
> diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
> index 404c13b..146fc9a 100644
> --- a/target-arm/op_helper.c
> +++ b/target-arm/op_helper.c
> @@ -34,6 +34,14 @@ static void raise_exception(CPUARMState *env, uint32_t excp,
>      cs->exception_index = excp;
>      env->exception.syndrome = syndrome;
>      env->exception.target_el = target_el;
> +#ifdef CONFIG_ARM_USE_LDST_EXCL
> +    HELPER(atomic_clear)(env);
> +    /* If the exception happens in the middle of an LL/SC sequence, we need to
> +     * clear excl_succeeded so that the normal store following the exception is
> +     * not wrongly interpreted as exclusive.
> +     */
> +    cs->excl_succeeded = 0;
> +#endif
>      cpu_loop_exit(cs);
>  }
>
> diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
> index 80f6c20..f34e957 100644
> --- a/target-arm/translate-a64.c
> +++ b/target-arm/translate-a64.c
> @@ -37,8 +37,10 @@
>  static TCGv_i64 cpu_X[32];
>  static TCGv_i64 cpu_pc;
>
> +#if !defined(CONFIG_ARM_USE_LDST_EXCL)
>  /* Load/store exclusive handling */
>  static TCGv_i64 cpu_exclusive_high;
> +#endif
>
>  static const char *regnames[] = {
>      "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
> @@ -94,8 +96,10 @@ void a64_translate_init(void)
>                                            regnames[i]);
>      }
>
> +#if !defined(CONFIG_ARM_USE_LDST_EXCL)
>      cpu_exclusive_high = tcg_global_mem_new_i64(TCG_AREG0,
>          offsetof(CPUARMState, exclusive_high), "exclusive_high");
> +#endif
>  }
>
>  static inline ARMMMUIdx get_a64_user_mem_index(DisasContext *s)
> @@ -1219,7 +1223,11 @@ static void handle_hint(DisasContext *s, uint32_t insn,
>
>  static void gen_clrex(DisasContext *s, uint32_t insn)
>  {
> +#ifndef CONFIG_ARM_USE_LDST_EXCL
>      tcg_gen_movi_i64(cpu_exclusive_addr, -1);
> +#else
> +    gen_helper_atomic_clear(cpu_env);
> +#endif
>  }
>
>  /* CLREX, DSB, DMB, ISB */
> @@ -1685,7 +1693,11 @@ static void disas_b_exc_sys(DisasContext *s, uint32_t insn)
>  }
>
>  /*
> - * Load/Store exclusive instructions are implemented by remembering
> + * If the softmmu is enabled, the translation of Load/Store exclusive
> + * instructions will rely on the gen_helper_{ldlink,stcond} helpers,
> + * offloading most of the work to the softmmu_llsc_template.h functions.
> + *
> + * Otherwise, instructions are implemented by remembering
>   * the value/address loaded, and seeing if these are the same
>   * when the store is performed. This is not actually the architecturally
>   * mandated semantics, but it works for typical guest code sequences
> @@ -1695,6 +1707,66 @@ static void disas_b_exc_sys(DisasContext *s, uint32_t insn)
>   * this sequence is effectively atomic.  In user emulation mode we
>   * throw an exception and handle the atomic operation elsewhere.
>   */
> +#ifdef CONFIG_ARM_USE_LDST_EXCL
> +static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
> +                               TCGv_i64 addr, int size, bool is_pair)
> +{
> +    /* When @is_pair is set, we have to guarantee that at least the 128 bits
> +     * accessed by a Load Exclusive Pair (64-bit variant) are protected. Since
> +     * we do not have 128-bit helpers, we split the access into two halves; the
> +     * first of them sets the exclusive region to cover at least 128 bits
> +     * (this is why aarch64 has a custom cc->cpu_set_excl_protected_range which
> +     * covers 128 bits).
> +     */
> +    TCGv_i32 mem_idx = tcg_temp_new_i32();
> +
> +    tcg_gen_movi_i32(mem_idx, get_mem_index(s));
> +
> +    g_assert(size <= 3);
> +
> +    if (size < 3) {
> +        TCGv_i32 tmp = tcg_temp_new_i32();
> +
> +        switch (size) {
> +        case 0:
> +            gen_helper_ldlink_i8(tmp, cpu_env, addr, mem_idx);
> +            break;
> +        case 1:
> +            gen_helper_ldlink_i16(tmp, cpu_env, addr, mem_idx);
> +            break;
> +        case 2:
> +            gen_helper_ldlink_i32(tmp, cpu_env, addr, mem_idx);
> +            break;
> +        default:
> +            abort();
> +        }
> +
> +        TCGv_i64 tmp64 = tcg_temp_new_i64();
> +        tcg_gen_extu_i32_i64(tmp64, tmp);
> +        tcg_gen_mov_i64(cpu_reg(s, rt), tmp64);
> +
> +        tcg_temp_free_i32(tmp);
> +        tcg_temp_free_i64(tmp64);
> +    } else {
> +        gen_helper_ldlink_i64(cpu_reg(s, rt), cpu_env, addr, mem_idx);
> +    }
> +
> +    if (is_pair) {
> +        TCGMemOp memop = MO_TE + size;
> +        TCGv_i64 addr2 = tcg_temp_new_i64();
> +        TCGv_i64 hitmp = tcg_temp_new_i64();
> +
> +        g_assert(size >= 2);
> +        tcg_gen_addi_i64(addr2, addr, 1 << size);
> +        tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s), memop);
> +        tcg_temp_free_i64(addr2);
> +        tcg_gen_mov_i64(cpu_reg(s, rt2), hitmp);
> +        tcg_temp_free_i64(hitmp);
> +    }
> +
> +    tcg_temp_free_i32(mem_idx);
> +}
> +#else
>  static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>                                 TCGv_i64 addr, int size, bool is_pair)
>  {
> @@ -1723,6 +1795,7 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>      tcg_temp_free_i64(tmp);
>      tcg_gen_mov_i64(cpu_exclusive_addr, addr);
>  }
> +#endif
>
>  #ifdef CONFIG_USER_ONLY
>  static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
> @@ -1733,6 +1806,65 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
>                       size | is_pair << 2 | (rd << 4) | (rt << 9) | (rt2 << 14));
>      gen_exception_internal_insn(s, 4, EXCP_STREX);
>  }
> +#elif defined(CONFIG_ARM_USE_LDST_EXCL)
> +static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
> +                                TCGv_i64 addr, int size, int is_pair)
> +{
> +    /* Don't bother to check whether we are actually in an exclusive context,
> +     * since the helpers take care of it. */
> +    TCGv_i32 mem_idx = tcg_temp_new_i32();
> +
> +    tcg_gen_movi_i32(mem_idx, get_mem_index(s));
> +
> +    g_assert(size <= 3);
> +    if (is_pair) {
> +        if (size == 3) {
> +            gen_helper_stxp_i128(cpu_reg(s, rd), cpu_env, addr, cpu_reg(s, rt),
> +                                 cpu_reg(s, rt2), mem_idx);
> +        } else if (size == 2) {
> +            /* Paired single word case. After merging the two registers into
> +             * one, we use one stcond_i64 to store the value to memory. */
> +            TCGv_i64 val = tcg_temp_new_i64();
> +            TCGv_i64 valh = tcg_temp_new_i64();
> +            tcg_gen_shli_i64(valh, cpu_reg(s, rt2), 32);
> +            tcg_gen_or_i64(val, valh, cpu_reg(s, rt));
> +            gen_helper_stcond_i64(cpu_reg(s, rd), cpu_env, addr, val, mem_idx);
> +            tcg_temp_free_i64(valh);
> +            tcg_temp_free_i64(val);
> +        } else {
> +            abort();
> +        }
> +    } else {
> +        if (size < 3) {
> +            TCGv_i32 val = tcg_temp_new_i32();
> +
> +            tcg_gen_extrl_i64_i32(val, cpu_reg(s, rt));
> +
> +            switch (size) {
> +            case 0:
> +                gen_helper_stcond_i8(cpu_reg(s, rd), cpu_env, addr, val,
> +                                     mem_idx);
> +                break;
> +            case 1:
> +                gen_helper_stcond_i16(cpu_reg(s, rd), cpu_env, addr, val,
> +                                      mem_idx);
> +                break;
> +            case 2:
> +                gen_helper_stcond_i32(cpu_reg(s, rd), cpu_env, addr, val,
> +                                      mem_idx);
> +                break;
> +            default:
> +                abort();
> +            }
> +            tcg_temp_free_i32(val);
> +        } else {
> +            gen_helper_stcond_i64(cpu_reg(s, rd), cpu_env, addr, cpu_reg(s, rt),
> +                                  mem_idx);
> +        }
> +    }
> +
> +    tcg_temp_free_i32(mem_idx);
> +}
>  #else
>  static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
>                                  TCGv_i64 inaddr, int size, int is_pair)


--
Alex Bennée

Patch

diff --git a/configure b/configure
index 915efcc..38121ff 100755
--- a/configure
+++ b/configure
@@ -5873,9 +5873,11 @@  echo "QEMU_CFLAGS+=$cflags" >> $config_target_mak
 # Use tcg LL/SC tcg backend for exclusive instruction is arm/aarch64
 # softmmus targets
 if test "$arm_tcg_use_llsc" = "yes" ; then
-  if test "$target" = "arm-softmmu" ; then
+  case "$target" in
+    arm-softmmu | aarch64-softmmu)
     echo "CONFIG_ARM_USE_LDST_EXCL=y" >> $config_target_mak
-  fi
+    ;;
+  esac
 fi
 done # for target in $targets
 
diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index c7bfb4d..dcee66f 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -26,6 +26,7 @@ 
 #include "qemu/bitops.h"
 #include "internals.h"
 #include "qemu/crc32c.h"
+#include "tcg/tcg.h"
 #include <zlib.h> /* For crc32 */
 
 /* C2.4.7 Multiply and divide */
@@ -443,3 +444,57 @@  uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
     /* Linux crc32c converts the output to one's complement.  */
     return crc32c(acc, buf, bytes) ^ 0xffffffff;
 }
+
+#ifdef CONFIG_ARM_USE_LDST_EXCL
+/* STXP emulation for a pair of 64-bit doublewords. We can't directly use two
+ * stcond_i64 accesses, otherwise the first one would conclude the LL/SC pair.
+ * Instead, two normal 64-bit accesses are used and the CPUState is
+ * updated accordingly. */
+target_ulong HELPER(stxp_i128)(CPUArchState *env, target_ulong addr,
+                               uint64_t vall, uint64_t valh,
+                               uint32_t mmu_idx)
+{
+    CPUState *cpu = ENV_GET_CPU(env);
+    TCGMemOpIdx op;
+    target_ulong ret = 0;
+
+    if (!cpu->ll_sc_context) {
+        cpu->excl_succeeded = false;
+        ret = 1;
+        goto out;
+    }
+
+    op = make_memop_idx(MO_BEQ, mmu_idx);
+
+    /* According to section C6.6.191 of ARM ARM DDI 0487A.h, the access has to
+     * be quadword aligned.  For the time being we do not support paired STXPs
+     * to MMIO memory; this will become trivial once the softmmu supports
+     * 128-bit memory accesses. */
+    if (addr & 0xf) {
+        /* TODO: Do unaligned access */
+    }
+
+    /* Setting excl_succeeded to true will make the store exclusive. */
+    cpu->excl_succeeded = true;
+    helper_ret_stq_mmu(env, addr, vall, op, GETRA());
+
+    if (!cpu->excl_succeeded) {
+        ret = 1;
+        goto out;
+    }
+
+    helper_ret_stq_mmu(env, addr + 8, valh, op, GETRA());
+    if (!cpu->excl_succeeded) {
+        ret = 1;
+    } else {
+        cpu->excl_succeeded = false;
+    }
+
+out:
+    /* Unset LL/SC context */
+    cpu->ll_sc_context = false;
+    cpu->excl_protected_range.begin = EXCLUSIVE_RESET_ADDR;
+
+    return ret;
+}
+#endif
diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
index 1d3d10f..c416a83 100644
--- a/target-arm/helper-a64.h
+++ b/target-arm/helper-a64.h
@@ -46,3 +46,7 @@  DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
 DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env)
 DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
 DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
+#ifdef CONFIG_ARM_USE_LDST_EXCL
+/* STXP helper */
+DEF_HELPER_5(stxp_i128, i64, env, i64, i64, i64, i32)
+#endif
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 404c13b..146fc9a 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -34,6 +34,14 @@  static void raise_exception(CPUARMState *env, uint32_t excp,
     cs->exception_index = excp;
     env->exception.syndrome = syndrome;
     env->exception.target_el = target_el;
+#ifdef CONFIG_ARM_USE_LDST_EXCL
+    HELPER(atomic_clear)(env);
+    /* If the exception happens in the middle of an LL/SC sequence, we need to
+     * clear excl_succeeded so that the normal store following the exception is
+     * not wrongly interpreted as exclusive.
+     */
+    cs->excl_succeeded = 0;
+#endif
     cpu_loop_exit(cs);
 }
 
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 80f6c20..f34e957 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -37,8 +37,10 @@ 
 static TCGv_i64 cpu_X[32];
 static TCGv_i64 cpu_pc;
 
+#if !defined(CONFIG_ARM_USE_LDST_EXCL)
 /* Load/store exclusive handling */
 static TCGv_i64 cpu_exclusive_high;
+#endif
 
 static const char *regnames[] = {
     "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
@@ -94,8 +96,10 @@  void a64_translate_init(void)
                                           regnames[i]);
     }
 
+#if !defined(CONFIG_ARM_USE_LDST_EXCL)
     cpu_exclusive_high = tcg_global_mem_new_i64(TCG_AREG0,
         offsetof(CPUARMState, exclusive_high), "exclusive_high");
+#endif
 }
 
 static inline ARMMMUIdx get_a64_user_mem_index(DisasContext *s)
@@ -1219,7 +1223,11 @@  static void handle_hint(DisasContext *s, uint32_t insn,
 
 static void gen_clrex(DisasContext *s, uint32_t insn)
 {
+#ifndef CONFIG_ARM_USE_LDST_EXCL
     tcg_gen_movi_i64(cpu_exclusive_addr, -1);
+#else
+    gen_helper_atomic_clear(cpu_env);
+#endif
 }
 
 /* CLREX, DSB, DMB, ISB */
@@ -1685,7 +1693,11 @@  static void disas_b_exc_sys(DisasContext *s, uint32_t insn)
 }
 
 /*
- * Load/Store exclusive instructions are implemented by remembering
+ * If the softmmu is enabled, the translation of Load/Store exclusive
+ * instructions will rely on the gen_helper_{ldlink,stcond} helpers,
+ * offloading most of the work to the softmmu_llsc_template.h functions.
+ *
+ * Otherwise, instructions are implemented by remembering
  * the value/address loaded, and seeing if these are the same
  * when the store is performed. This is not actually the architecturally
  * mandated semantics, but it works for typical guest code sequences
@@ -1695,6 +1707,66 @@  static void disas_b_exc_sys(DisasContext *s, uint32_t insn)
  * this sequence is effectively atomic.  In user emulation mode we
  * throw an exception and handle the atomic operation elsewhere.
  */
+#ifdef CONFIG_ARM_USE_LDST_EXCL
+static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
+                               TCGv_i64 addr, int size, bool is_pair)
+{
+    /* When @is_pair is set, we have to guarantee that at least the 128 bits
+     * accessed by a Load Exclusive Pair (64-bit variant) are protected. Since
+     * we do not have 128-bit helpers, we split the access into two halves; the
+     * first of them sets the exclusive region to cover at least 128 bits
+     * (this is why aarch64 has a custom cc->cpu_set_excl_protected_range which
+     * covers 128 bits).
+     */
+    TCGv_i32 mem_idx = tcg_temp_new_i32();
+
+    tcg_gen_movi_i32(mem_idx, get_mem_index(s));
+
+    g_assert(size <= 3);
+
+    if (size < 3) {
+        TCGv_i32 tmp = tcg_temp_new_i32();
+
+        switch (size) {
+        case 0:
+            gen_helper_ldlink_i8(tmp, cpu_env, addr, mem_idx);
+            break;
+        case 1:
+            gen_helper_ldlink_i16(tmp, cpu_env, addr, mem_idx);
+            break;
+        case 2:
+            gen_helper_ldlink_i32(tmp, cpu_env, addr, mem_idx);
+            break;
+        default:
+            abort();
+        }
+
+        TCGv_i64 tmp64 = tcg_temp_new_i64();
+        tcg_gen_extu_i32_i64(tmp64, tmp);
+        tcg_gen_mov_i64(cpu_reg(s, rt), tmp64);
+
+        tcg_temp_free_i32(tmp);
+        tcg_temp_free_i64(tmp64);
+    } else {
+        gen_helper_ldlink_i64(cpu_reg(s, rt), cpu_env, addr, mem_idx);
+    }
+
+    if (is_pair) {
+        TCGMemOp memop = MO_TE + size;
+        TCGv_i64 addr2 = tcg_temp_new_i64();
+        TCGv_i64 hitmp = tcg_temp_new_i64();
+
+        g_assert(size >= 2);
+        tcg_gen_addi_i64(addr2, addr, 1 << size);
+        tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s), memop);
+        tcg_temp_free_i64(addr2);
+        tcg_gen_mov_i64(cpu_reg(s, rt2), hitmp);
+        tcg_temp_free_i64(hitmp);
+    }
+
+    tcg_temp_free_i32(mem_idx);
+}
+#else
 static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
                                TCGv_i64 addr, int size, bool is_pair)
 {
@@ -1723,6 +1795,7 @@  static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
     tcg_temp_free_i64(tmp);
     tcg_gen_mov_i64(cpu_exclusive_addr, addr);
 }
+#endif
 
 #ifdef CONFIG_USER_ONLY
 static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
@@ -1733,6 +1806,65 @@  static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
                      size | is_pair << 2 | (rd << 4) | (rt << 9) | (rt2 << 14));
     gen_exception_internal_insn(s, 4, EXCP_STREX);
 }
+#elif defined(CONFIG_ARM_USE_LDST_EXCL)
+static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
+                                TCGv_i64 addr, int size, int is_pair)
+{
+    /* Don't bother to check whether we are actually in an exclusive context,
+     * since the helpers take care of it. */
+    TCGv_i32 mem_idx = tcg_temp_new_i32();
+
+    tcg_gen_movi_i32(mem_idx, get_mem_index(s));
+
+    g_assert(size <= 3);
+    if (is_pair) {
+        if (size == 3) {
+            gen_helper_stxp_i128(cpu_reg(s, rd), cpu_env, addr, cpu_reg(s, rt),
+                                 cpu_reg(s, rt2), mem_idx);
+        } else if (size == 2) {
+            /* Paired single word case. After merging the two registers into
+             * one, we use one stcond_i64 to store the value to memory. */
+            TCGv_i64 val = tcg_temp_new_i64();
+            TCGv_i64 valh = tcg_temp_new_i64();
+            tcg_gen_shli_i64(valh, cpu_reg(s, rt2), 32);
+            tcg_gen_or_i64(val, valh, cpu_reg(s, rt));
+            gen_helper_stcond_i64(cpu_reg(s, rd), cpu_env, addr, val, mem_idx);
+            tcg_temp_free_i64(valh);
+            tcg_temp_free_i64(val);
+        } else {
+            abort();
+        }
+    } else {
+        if (size < 3) {
+            TCGv_i32 val = tcg_temp_new_i32();
+
+            tcg_gen_extrl_i64_i32(val, cpu_reg(s, rt));
+
+            switch (size) {
+            case 0:
+                gen_helper_stcond_i8(cpu_reg(s, rd), cpu_env, addr, val,
+                                     mem_idx);
+                break;
+            case 1:
+                gen_helper_stcond_i16(cpu_reg(s, rd), cpu_env, addr, val,
+                                      mem_idx);
+                break;
+            case 2:
+                gen_helper_stcond_i32(cpu_reg(s, rd), cpu_env, addr, val,
+                                      mem_idx);
+                break;
+            default:
+                abort();
+            }
+            tcg_temp_free_i32(val);
+        } else {
+            gen_helper_stcond_i64(cpu_reg(s, rd), cpu_env, addr, cpu_reg(s, rt),
+                                  mem_idx);
+        }
+    }
+
+    tcg_temp_free_i32(mem_idx);
+}
 #else
 static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
                                 TCGv_i64 inaddr, int size, int is_pair)