diff mbox series

[v4,12/17] target/riscv: Add Zvksh ISA extension support

Message ID 20230622161646.32005-13-max.chou@sifive.com (mailing list archive)
State New, archived
Headers show
Series Add RISC-V vector cryptographic instruction set support | expand

Commit Message

Max Chou June 22, 2023, 4:16 p.m. UTC
From: Lawrence Hunter <lawrence.hunter@codethink.co.uk>

This commit adds support for the Zvksh vector-crypto extension, which
consists of the following instructions:

* vsm3me.vv
* vsm3c.vi

Translation functions are defined in
`target/riscv/insn_trans/trans_rvvk.c.inc` and helpers are defined in
`target/riscv/vcrypto_helper.c`.

Co-authored-by: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>
[max.chou@sifive.com: Replaced vstart checking by TCG op]
Signed-off-by: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>
Signed-off-by: Lawrence Hunter <lawrence.hunter@codethink.co.uk>
Signed-off-by: Max Chou <max.chou@sifive.com>
---
 target/riscv/cpu.c                       |   5 +-
 target/riscv/cpu_cfg.h                   |   1 +
 target/riscv/helper.h                    |   3 +
 target/riscv/insn32.decode               |   4 +
 target/riscv/insn_trans/trans_rvvk.c.inc |  31 ++++++
 target/riscv/vcrypto_helper.c            | 134 +++++++++++++++++++++++
 6 files changed, 176 insertions(+), 2 deletions(-)

Comments

Daniel Henrique Barboza June 22, 2023, 6:09 p.m. UTC | #1
On 6/22/23 13:16, Max Chou wrote:
> From: Lawrence Hunter <lawrence.hunter@codethink.co.uk>
> 
> This commit adds support for the Zvksh vector-crypto extension, which
> consists of the following instructions:
> 
> * vsm3me.vv
> * vsm3c.vi
> 
> Translation functions are defined in
> `target/riscv/insn_trans/trans_rvvk.c.inc` and helpers are defined in
> `target/riscv/vcrypto_helper.c`.
> 
> Co-authored-by: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>
> [max.chou@sifive.com: Replaced vstart checking by TCG op]
> Signed-off-by: Kiran Ostrolenk <kiran.ostrolenk@codethink.co.uk>
> Signed-off-by: Lawrence Hunter <lawrence.hunter@codethink.co.uk>
> Signed-off-by: Max Chou <max.chou@sifive.com>
> ---

Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>

>   target/riscv/cpu.c                       |   5 +-
>   target/riscv/cpu_cfg.h                   |   1 +
>   target/riscv/helper.h                    |   3 +
>   target/riscv/insn32.decode               |   4 +
>   target/riscv/insn_trans/trans_rvvk.c.inc |  31 ++++++
>   target/riscv/vcrypto_helper.c            | 134 +++++++++++++++++++++++
>   6 files changed, 176 insertions(+), 2 deletions(-)
> 
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 6bba8ba8c9..c9a9ff80cd 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -121,6 +121,7 @@ static const struct isa_ext_data isa_edata_arr[] = {
>       ISA_EXT_DATA_ENTRY(zvkned, PRIV_VERSION_1_12_0, ext_zvkned),
>       ISA_EXT_DATA_ENTRY(zvknha, PRIV_VERSION_1_12_0, ext_zvknha),
>       ISA_EXT_DATA_ENTRY(zvknhb, PRIV_VERSION_1_12_0, ext_zvknhb),
> +    ISA_EXT_DATA_ENTRY(zvksh, PRIV_VERSION_1_12_0, ext_zvksh),
>       ISA_EXT_DATA_ENTRY(zhinx, PRIV_VERSION_1_12_0, ext_zhinx),
>       ISA_EXT_DATA_ENTRY(zhinxmin, PRIV_VERSION_1_12_0, ext_zhinxmin),
>       ISA_EXT_DATA_ENTRY(smaia, PRIV_VERSION_1_12_0, ext_smaia),
> @@ -1197,8 +1198,8 @@ void riscv_cpu_validate_set_extensions(RISCVCPU *cpu, Error **errp)
>        * In principle Zve*x would also suffice here, were they supported
>        * in qemu
>        */
> -    if ((cpu->cfg.ext_zvbb || cpu->cfg.ext_zvkned || cpu->cfg.ext_zvknha) &&
> -        !cpu->cfg.ext_zve32f) {
> +    if ((cpu->cfg.ext_zvbb || cpu->cfg.ext_zvkned || cpu->cfg.ext_zvknha ||
> +         cpu->cfg.ext_zvksh) && !cpu->cfg.ext_zve32f) {
>           error_setg(errp,
>                      "Vector crypto extensions require V or Zve* extensions");
>           return;
> diff --git a/target/riscv/cpu_cfg.h b/target/riscv/cpu_cfg.h
> index 41cce87ffc..f859d9e2f5 100644
> --- a/target/riscv/cpu_cfg.h
> +++ b/target/riscv/cpu_cfg.h
> @@ -88,6 +88,7 @@ struct RISCVCPUConfig {
>       bool ext_zvkned;
>       bool ext_zvknha;
>       bool ext_zvknhb;
> +    bool ext_zvksh;
>       bool ext_zmmul;
>       bool ext_zvfh;
>       bool ext_zvfhmin;
> diff --git a/target/riscv/helper.h b/target/riscv/helper.h
> index 19f5a8a28d..9220af18e6 100644
> --- a/target/riscv/helper.h
> +++ b/target/riscv/helper.h
> @@ -1238,3 +1238,6 @@ DEF_HELPER_5(vaeskf2_vi, void, ptr, ptr, i32, env, i32)
>   DEF_HELPER_5(vsha2ms_vv, void, ptr, ptr, ptr, env, i32)
>   DEF_HELPER_5(vsha2ch_vv, void, ptr, ptr, ptr, env, i32)
>   DEF_HELPER_5(vsha2cl_vv, void, ptr, ptr, ptr, env, i32)
> +
> +DEF_HELPER_5(vsm3me_vv, void, ptr, ptr, ptr, env, i32)
> +DEF_HELPER_5(vsm3c_vi, void, ptr, ptr, i32, env, i32)
> diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
> index d2cfb2729c..5ca83e8462 100644
> --- a/target/riscv/insn32.decode
> +++ b/target/riscv/insn32.decode
> @@ -953,3 +953,7 @@ vaeskf2_vi  101010 1 ..... ..... 010 ..... 1110111 @r_vm_1
>   vsha2ms_vv  101101 1 ..... ..... 010 ..... 1110111 @r_vm_1
>   vsha2ch_vv  101110 1 ..... ..... 010 ..... 1110111 @r_vm_1
>   vsha2cl_vv  101111 1 ..... ..... 010 ..... 1110111 @r_vm_1
> +
> +# *** Zvksh vector crypto extension ***
> +vsm3me_vv   100000 1 ..... ..... 010 ..... 1110111 @r_vm_1
> +vsm3c_vi    101011 1 ..... ..... 010 ..... 1110111 @r_vm_1
> diff --git a/target/riscv/insn_trans/trans_rvvk.c.inc b/target/riscv/insn_trans/trans_rvvk.c.inc
> index 528a0d3b32..af1fb74c38 100644
> --- a/target/riscv/insn_trans/trans_rvvk.c.inc
> +++ b/target/riscv/insn_trans/trans_rvvk.c.inc
> @@ -479,3 +479,34 @@ static bool vsha_check(DisasContext *s, arg_rmrr *a)
>   GEN_VV_UNMASKED_TRANS(vsha2ms_vv, vsha_check, ZVKNH_EGS)
>   GEN_VV_UNMASKED_TRANS(vsha2cl_vv, vsha_check, ZVKNH_EGS)
>   GEN_VV_UNMASKED_TRANS(vsha2ch_vv, vsha_check, ZVKNH_EGS)
> +
> +/*
> + * Zvksh
> + */
> +
> +#define ZVKSH_EGS 8
> +
> +static inline bool vsm3_check(DisasContext *s, arg_rmrr *a)
> +{
> +    int egw_bytes = ZVKSH_EGS << s->sew;
> +    int mult = 1 << MAX(s->lmul, 0);
> +    return s->cfg_ptr->ext_zvksh == true &&
> +           require_rvv(s) &&
> +           vext_check_isa_ill(s) &&
> +           !is_overlapped(a->rd, mult, a->rs2, mult) &&
> +           MAXSZ(s) >= egw_bytes &&
> +           s->sew == MO_32;
> +}
> +
> +static inline bool vsm3me_check(DisasContext *s, arg_rmrr *a)
> +{
> +    return vsm3_check(s, a) && vext_check_sss(s, a->rd, a->rs1, a->rs2, a->vm);
> +}
> +
> +static inline bool vsm3c_check(DisasContext *s, arg_rmrr *a)
> +{
> +    return vsm3_check(s, a) && vext_check_ss(s, a->rd, a->rs2, a->vm);
> +}
> +
> +GEN_VV_UNMASKED_TRANS(vsm3me_vv, vsm3me_check, ZVKSH_EGS)
> +GEN_VI_UNMASKED_TRANS(vsm3c_vi, vsm3c_check, ZVKSH_EGS)
> diff --git a/target/riscv/vcrypto_helper.c b/target/riscv/vcrypto_helper.c
> index ca09062c6c..06c8f4adc7 100644
> --- a/target/riscv/vcrypto_helper.c
> +++ b/target/riscv/vcrypto_helper.c
> @@ -717,3 +717,137 @@ void HELPER(vsha2cl_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
>       vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
>       env->vstart = 0;
>   }
> +
> +static inline uint32_t p1(uint32_t x)
> +{
> +    return x ^ rol32(x, 15) ^ rol32(x, 23);
> +}
> +
> +static inline uint32_t zvksh_w(uint32_t m16, uint32_t m9, uint32_t m3,
> +                               uint32_t m13, uint32_t m6)
> +{
> +    return p1(m16 ^ m9 ^ rol32(m3, 15)) ^ rol32(m13, 7) ^ m6;
> +}
> +
> +void HELPER(vsm3me_vv)(void *vd_vptr, void *vs1_vptr, void *vs2_vptr,
> +                       CPURISCVState *env, uint32_t desc)
> +{
> +    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
> +    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
> +    uint32_t vta = vext_vta(desc);
> +    uint32_t *vd = vd_vptr;
> +    uint32_t *vs1 = vs1_vptr;
> +    uint32_t *vs2 = vs2_vptr;
> +
> +    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
> +        uint32_t w[24];
> +        for (int j = 0; j < 8; j++) {
> +            w[j] = bswap32(vs1[H4((i * 8) + j)]);
> +            w[j + 8] = bswap32(vs2[H4((i * 8) + j)]);
> +        }
> +        for (int j = 0; j < 8; j++) {
> +            w[j + 16] =
> +                zvksh_w(w[j], w[j + 7], w[j + 13], w[j + 3], w[j + 10]);
> +        }
> +        for (int j = 0; j < 8; j++) {
> +            vd[(i * 8) + j] = bswap32(w[H4(j + 16)]);
> +        }
> +    }
> +    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
> +    env->vstart = 0;
> +}
> +
> +static inline uint32_t ff1(uint32_t x, uint32_t y, uint32_t z)
> +{
> +    return x ^ y ^ z;
> +}
> +
> +static inline uint32_t ff2(uint32_t x, uint32_t y, uint32_t z)
> +{
> +    return (x & y) | (x & z) | (y & z);
> +}
> +
> +static inline uint32_t ff_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
> +{
> +    return (j <= 15) ? ff1(x, y, z) : ff2(x, y, z);
> +}
> +
> +static inline uint32_t gg1(uint32_t x, uint32_t y, uint32_t z)
> +{
> +    return x ^ y ^ z;
> +}
> +
> +static inline uint32_t gg2(uint32_t x, uint32_t y, uint32_t z)
> +{
> +    return (x & y) | (~x & z);
> +}
> +
> +static inline uint32_t gg_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
> +{
> +    return (j <= 15) ? gg1(x, y, z) : gg2(x, y, z);
> +}
> +
> +static inline uint32_t t_j(uint32_t j)
> +{
> +    return (j <= 15) ? 0x79cc4519 : 0x7a879d8a;
> +}
> +
> +static inline uint32_t p_0(uint32_t x)
> +{
> +    return x ^ rol32(x, 9) ^ rol32(x, 17);
> +}
> +
> +static void sm3c(uint32_t *vd, uint32_t *vs1, uint32_t *vs2, uint32_t uimm)
> +{
> +    uint32_t x0, x1;
> +    uint32_t j;
> +    uint32_t ss1, ss2, tt1, tt2;
> +    x0 = vs2[0] ^ vs2[4];
> +    x1 = vs2[1] ^ vs2[5];
> +    j = 2 * uimm;
> +    ss1 = rol32(rol32(vs1[0], 12) + vs1[4] + rol32(t_j(j), j % 32), 7);
> +    ss2 = ss1 ^ rol32(vs1[0], 12);
> +    tt1 = ff_j(vs1[0], vs1[1], vs1[2], j) + vs1[3] + ss2 + x0;
> +    tt2 = gg_j(vs1[4], vs1[5], vs1[6], j) + vs1[7] + ss1 + vs2[0];
> +    vs1[3] = vs1[2];
> +    vd[3] = rol32(vs1[1], 9);
> +    vs1[1] = vs1[0];
> +    vd[1] = tt1;
> +    vs1[7] = vs1[6];
> +    vd[7] = rol32(vs1[5], 19);
> +    vs1[5] = vs1[4];
> +    vd[5] = p_0(tt2);
> +    j = 2 * uimm + 1;
> +    ss1 = rol32(rol32(vd[1], 12) + vd[5] + rol32(t_j(j), j % 32), 7);
> +    ss2 = ss1 ^ rol32(vd[1], 12);
> +    tt1 = ff_j(vd[1], vs1[1], vd[3], j) + vs1[3] + ss2 + x1;
> +    tt2 = gg_j(vd[5], vs1[5], vd[7], j) + vs1[7] + ss1 + vs2[1];
> +    vd[2] = rol32(vs1[1], 9);
> +    vd[0] = tt1;
> +    vd[6] = rol32(vs1[5], 19);
> +    vd[4] = p_0(tt2);
> +}
> +
> +void HELPER(vsm3c_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
> +                      CPURISCVState *env, uint32_t desc)
> +{
> +    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
> +    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
> +    uint32_t vta = vext_vta(desc);
> +    uint32_t *vd = vd_vptr;
> +    uint32_t *vs2 = vs2_vptr;
> +    uint32_t v1[8], v2[8], v3[8];
> +
> +    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
> +        for (int k = 0; k < 8; k++) {
> +            v2[k] = bswap32(vd[H4(i * 8 + k)]);
> +            v3[k] = bswap32(vs2[H4(i * 8 + k)]);
> +        }
> +        sm3c(v1, v2, v3, uimm);
> +        for (int k = 0; k < 8; k++) {
> +            vd[i * 8 + k] = bswap32(v1[H4(k)]);
> +        }
> +    }
> +    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
> +    env->vstart = 0;
> +}
diff mbox series

Patch

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 6bba8ba8c9..c9a9ff80cd 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -121,6 +121,7 @@  static const struct isa_ext_data isa_edata_arr[] = {
     ISA_EXT_DATA_ENTRY(zvkned, PRIV_VERSION_1_12_0, ext_zvkned),
     ISA_EXT_DATA_ENTRY(zvknha, PRIV_VERSION_1_12_0, ext_zvknha),
     ISA_EXT_DATA_ENTRY(zvknhb, PRIV_VERSION_1_12_0, ext_zvknhb),
+    ISA_EXT_DATA_ENTRY(zvksh, PRIV_VERSION_1_12_0, ext_zvksh),
     ISA_EXT_DATA_ENTRY(zhinx, PRIV_VERSION_1_12_0, ext_zhinx),
     ISA_EXT_DATA_ENTRY(zhinxmin, PRIV_VERSION_1_12_0, ext_zhinxmin),
     ISA_EXT_DATA_ENTRY(smaia, PRIV_VERSION_1_12_0, ext_smaia),
@@ -1197,8 +1198,8 @@  void riscv_cpu_validate_set_extensions(RISCVCPU *cpu, Error **errp)
      * In principle Zve*x would also suffice here, were they supported
      * in qemu
      */
-    if ((cpu->cfg.ext_zvbb || cpu->cfg.ext_zvkned || cpu->cfg.ext_zvknha) &&
-        !cpu->cfg.ext_zve32f) {
+    if ((cpu->cfg.ext_zvbb || cpu->cfg.ext_zvkned || cpu->cfg.ext_zvknha ||
+         cpu->cfg.ext_zvksh) && !cpu->cfg.ext_zve32f) {
         error_setg(errp,
                    "Vector crypto extensions require V or Zve* extensions");
         return;
diff --git a/target/riscv/cpu_cfg.h b/target/riscv/cpu_cfg.h
index 41cce87ffc..f859d9e2f5 100644
--- a/target/riscv/cpu_cfg.h
+++ b/target/riscv/cpu_cfg.h
@@ -88,6 +88,7 @@  struct RISCVCPUConfig {
     bool ext_zvkned;
     bool ext_zvknha;
     bool ext_zvknhb;
+    bool ext_zvksh;
     bool ext_zmmul;
     bool ext_zvfh;
     bool ext_zvfhmin;
diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index 19f5a8a28d..9220af18e6 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -1238,3 +1238,6 @@  DEF_HELPER_5(vaeskf2_vi, void, ptr, ptr, i32, env, i32)
 DEF_HELPER_5(vsha2ms_vv, void, ptr, ptr, ptr, env, i32)
 DEF_HELPER_5(vsha2ch_vv, void, ptr, ptr, ptr, env, i32)
 DEF_HELPER_5(vsha2cl_vv, void, ptr, ptr, ptr, env, i32)
+
+DEF_HELPER_5(vsm3me_vv, void, ptr, ptr, ptr, env, i32)
+DEF_HELPER_5(vsm3c_vi, void, ptr, ptr, i32, env, i32)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index d2cfb2729c..5ca83e8462 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -953,3 +953,7 @@  vaeskf2_vi  101010 1 ..... ..... 010 ..... 1110111 @r_vm_1
 vsha2ms_vv  101101 1 ..... ..... 010 ..... 1110111 @r_vm_1
 vsha2ch_vv  101110 1 ..... ..... 010 ..... 1110111 @r_vm_1
 vsha2cl_vv  101111 1 ..... ..... 010 ..... 1110111 @r_vm_1
+
+# *** Zvksh vector crypto extension ***
+vsm3me_vv   100000 1 ..... ..... 010 ..... 1110111 @r_vm_1
+vsm3c_vi    101011 1 ..... ..... 010 ..... 1110111 @r_vm_1
diff --git a/target/riscv/insn_trans/trans_rvvk.c.inc b/target/riscv/insn_trans/trans_rvvk.c.inc
index 528a0d3b32..af1fb74c38 100644
--- a/target/riscv/insn_trans/trans_rvvk.c.inc
+++ b/target/riscv/insn_trans/trans_rvvk.c.inc
@@ -479,3 +479,34 @@  static bool vsha_check(DisasContext *s, arg_rmrr *a)
 GEN_VV_UNMASKED_TRANS(vsha2ms_vv, vsha_check, ZVKNH_EGS)
 GEN_VV_UNMASKED_TRANS(vsha2cl_vv, vsha_check, ZVKNH_EGS)
 GEN_VV_UNMASKED_TRANS(vsha2ch_vv, vsha_check, ZVKNH_EGS)
+
+/*
+ * Zvksh
+ */
+
+#define ZVKSH_EGS 8
+
+static inline bool vsm3_check(DisasContext *s, arg_rmrr *a)
+{
+    int egw_bytes = ZVKSH_EGS << s->sew;
+    int mult = 1 << MAX(s->lmul, 0);
+    return s->cfg_ptr->ext_zvksh == true &&
+           require_rvv(s) &&
+           vext_check_isa_ill(s) &&
+           !is_overlapped(a->rd, mult, a->rs2, mult) &&
+           MAXSZ(s) >= egw_bytes &&
+           s->sew == MO_32;
+}
+
+static inline bool vsm3me_check(DisasContext *s, arg_rmrr *a)
+{
+    return vsm3_check(s, a) && vext_check_sss(s, a->rd, a->rs1, a->rs2, a->vm);
+}
+
+static inline bool vsm3c_check(DisasContext *s, arg_rmrr *a)
+{
+    return vsm3_check(s, a) && vext_check_ss(s, a->rd, a->rs2, a->vm);
+}
+
+GEN_VV_UNMASKED_TRANS(vsm3me_vv, vsm3me_check, ZVKSH_EGS)
+GEN_VI_UNMASKED_TRANS(vsm3c_vi, vsm3c_check, ZVKSH_EGS)
diff --git a/target/riscv/vcrypto_helper.c b/target/riscv/vcrypto_helper.c
index ca09062c6c..06c8f4adc7 100644
--- a/target/riscv/vcrypto_helper.c
+++ b/target/riscv/vcrypto_helper.c
@@ -717,3 +717,137 @@  void HELPER(vsha2cl_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
     vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
     env->vstart = 0;
 }
+
+static inline uint32_t p1(uint32_t x)
+{
+    return x ^ rol32(x, 15) ^ rol32(x, 23);
+}
+
+static inline uint32_t zvksh_w(uint32_t m16, uint32_t m9, uint32_t m3,
+                               uint32_t m13, uint32_t m6)
+{
+    return p1(m16 ^ m9 ^ rol32(m3, 15)) ^ rol32(m13, 7) ^ m6;
+}
+
+void HELPER(vsm3me_vv)(void *vd_vptr, void *vs1_vptr, void *vs2_vptr,
+                       CPURISCVState *env, uint32_t desc)
+{
+    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
+    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
+    uint32_t vta = vext_vta(desc);
+    uint32_t *vd = vd_vptr;
+    uint32_t *vs1 = vs1_vptr;
+    uint32_t *vs2 = vs2_vptr;
+
+    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
+        uint32_t w[24];
+        for (int j = 0; j < 8; j++) {
+            w[j] = bswap32(vs1[H4((i * 8) + j)]);
+            w[j + 8] = bswap32(vs2[H4((i * 8) + j)]);
+        }
+        for (int j = 0; j < 8; j++) {
+            w[j + 16] =
+                zvksh_w(w[j], w[j + 7], w[j + 13], w[j + 3], w[j + 10]);
+        }
+        for (int j = 0; j < 8; j++) {
+            vd[(i * 8) + j] = bswap32(w[H4(j + 16)]);
+        }
+    }
+    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
+    env->vstart = 0;
+}
+
+static inline uint32_t ff1(uint32_t x, uint32_t y, uint32_t z)
+{
+    return x ^ y ^ z;
+}
+
+static inline uint32_t ff2(uint32_t x, uint32_t y, uint32_t z)
+{
+    return (x & y) | (x & z) | (y & z);
+}
+
+static inline uint32_t ff_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
+{
+    return (j <= 15) ? ff1(x, y, z) : ff2(x, y, z);
+}
+
+static inline uint32_t gg1(uint32_t x, uint32_t y, uint32_t z)
+{
+    return x ^ y ^ z;
+}
+
+static inline uint32_t gg2(uint32_t x, uint32_t y, uint32_t z)
+{
+    return (x & y) | (~x & z);
+}
+
+static inline uint32_t gg_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
+{
+    return (j <= 15) ? gg1(x, y, z) : gg2(x, y, z);
+}
+
+static inline uint32_t t_j(uint32_t j)
+{
+    return (j <= 15) ? 0x79cc4519 : 0x7a879d8a;
+}
+
+static inline uint32_t p_0(uint32_t x)
+{
+    return x ^ rol32(x, 9) ^ rol32(x, 17);
+}
+
+static void sm3c(uint32_t *vd, uint32_t *vs1, uint32_t *vs2, uint32_t uimm)
+{
+    uint32_t x0, x1;
+    uint32_t j;
+    uint32_t ss1, ss2, tt1, tt2;
+    x0 = vs2[0] ^ vs2[4];
+    x1 = vs2[1] ^ vs2[5];
+    j = 2 * uimm;
+    ss1 = rol32(rol32(vs1[0], 12) + vs1[4] + rol32(t_j(j), j % 32), 7);
+    ss2 = ss1 ^ rol32(vs1[0], 12);
+    tt1 = ff_j(vs1[0], vs1[1], vs1[2], j) + vs1[3] + ss2 + x0;
+    tt2 = gg_j(vs1[4], vs1[5], vs1[6], j) + vs1[7] + ss1 + vs2[0];
+    vs1[3] = vs1[2];
+    vd[3] = rol32(vs1[1], 9);
+    vs1[1] = vs1[0];
+    vd[1] = tt1;
+    vs1[7] = vs1[6];
+    vd[7] = rol32(vs1[5], 19);
+    vs1[5] = vs1[4];
+    vd[5] = p_0(tt2);
+    j = 2 * uimm + 1;
+    ss1 = rol32(rol32(vd[1], 12) + vd[5] + rol32(t_j(j), j % 32), 7);
+    ss2 = ss1 ^ rol32(vd[1], 12);
+    tt1 = ff_j(vd[1], vs1[1], vd[3], j) + vs1[3] + ss2 + x1;
+    tt2 = gg_j(vd[5], vs1[5], vd[7], j) + vs1[7] + ss1 + vs2[1];
+    vd[2] = rol32(vs1[1], 9);
+    vd[0] = tt1;
+    vd[6] = rol32(vs1[5], 19);
+    vd[4] = p_0(tt2);
+}
+
+void HELPER(vsm3c_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
+                      CPURISCVState *env, uint32_t desc)
+{
+    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
+    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
+    uint32_t vta = vext_vta(desc);
+    uint32_t *vd = vd_vptr;
+    uint32_t *vs2 = vs2_vptr;
+    uint32_t v1[8], v2[8], v3[8];
+
+    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
+        for (int k = 0; k < 8; k++) {
+            v2[k] = bswap32(vd[H4(i * 8 + k)]);
+            v3[k] = bswap32(vs2[H4(i * 8 + k)]);
+        }
+        sm3c(v1, v2, v3, uimm);
+        for (int k = 0; k < 8; k++) {
+            vd[i * 8 + k] = bswap32(v1[H4(k)]);
+        }
+    }
+    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
+    env->vstart = 0;
+}