
[RFC,v3,26/71] target/riscv: rvv-1.0: update vext_max_elems() for load/store insns

Message ID 20200806104709.13235-27-frank.chang@sifive.com (mailing list archive)
State New, archived
Series target/riscv: support vector extension v1.0

Commit Message

Frank Chang Aug. 6, 2020, 10:46 a.m. UTC
From: Frank Chang <frank.chang@sifive.com>

Unlike other vector instructions, vector load/store instructions compute the
maximum number of elements from EMUL (EMUL = EEW / SEW * LMUL in the v1.0 spec).
For all other vector instructions, return VLMAX as the maximum number of elements.

Signed-off-by: Frank Chang <frank.chang@sifive.com>
---
 target/riscv/insn_trans/trans_rvv.inc.c | 10 +++-
 target/riscv/internals.h                |  5 +-
 target/riscv/vector_helper.c            | 61 ++++++++++++++++++-------
 3 files changed, 56 insertions(+), 20 deletions(-)

Comments

Richard Henderson Aug. 7, 2020, 12:03 a.m. UTC | #1
On 8/6/20 3:46 AM, frank.chang@sifive.com wrote:
> +static inline uint32_t vext_max_elems(uint32_t desc, uint32_t esz, bool is_ldst)
>  {
> -    return simd_maxsz(desc) << vext_lmul(desc);
> +    /*
> +     * As simd_desc support at most 256 bytes, the max vlen is 256 bits.
> +     * so vlen in bytes (vlenb) is encoded as maxsz.
> +     */
> +    uint32_t vlenb = simd_maxsz(desc);
> +
> +    if (is_ldst) {
> +        /*
> +         * Vector load/store instructions have the EEW encoded
> +         * directly in the instructions. The maximum vector size is
> +         * calculated with EMUL rather than LMUL.
> +         */
> +        uint32_t eew = ctzl(esz);
> +        uint32_t sew = vext_sew(desc);
> +        uint32_t lmul = vext_lmul(desc);
> +        int32_t emul = eew - sew + lmul;
> +        uint32_t emul_r = emul < 0 ? 0 : emul;
> +        return 1 << (ctzl(vlenb) + emul_r - ctzl(esz));

As I said before, the is_ldst instructions should put the EEW and EMUL values
into the SEW and LMUL desc fields, so that this does not need to be
special-cased at all.

> +        /* Return VLMAX */
> +        return 1 << (ctzl(vlenb) + vext_lmul(desc) - ctzl(esz));

This is overly complicated.

(1) 1 << ctzl(vlenb) == vlenb.
(2) I'm not sure why esz is not already a log2 number.

This ought to look more like

  int scale = lmul - esz;
  return (scale < 0
          ? vlenb >> -scale
          : vlenb << scale);


r~
Frank Chang Aug. 14, 2020, 2:48 a.m. UTC | #2
On Fri, Aug 7, 2020 at 8:04 AM Richard Henderson <
richard.henderson@linaro.org> wrote:

> On 8/6/20 3:46 AM, frank.chang@sifive.com wrote:
> > +static inline uint32_t vext_max_elems(uint32_t desc, uint32_t esz, bool is_ldst)
> >  {
> > -    return simd_maxsz(desc) << vext_lmul(desc);
> > +    /*
> > +     * As simd_desc support at most 256 bytes, the max vlen is 256 bits.
> > +     * so vlen in bytes (vlenb) is encoded as maxsz.
> > +     */
> > +    uint32_t vlenb = simd_maxsz(desc);
> > +
> > +    if (is_ldst) {
> > +        /*
> > +         * Vector load/store instructions have the EEW encoded
> > +         * directly in the instructions. The maximum vector size is
> > +         * calculated with EMUL rather than LMUL.
> > +         */
> > +        uint32_t eew = ctzl(esz);
> > +        uint32_t sew = vext_sew(desc);
> > +        uint32_t lmul = vext_lmul(desc);
> > +        int32_t emul = eew - sew + lmul;
> > +        uint32_t emul_r = emul < 0 ? 0 : emul;
> > +        return 1 << (ctzl(vlenb) + emul_r - ctzl(esz));
>
> As I said before, the is_ldst instructions should put the EEW and EMUL values
> into the SEW and LMUL desc fields, so that this does not need to be
> special-cased at all.
>

I add a vext_get_emul() helper function in trans_rvv.inc.c:

> static uint8_t vext_get_emul(DisasContext *s, uint8_t eew)
> {
>     int8_t lmul = sextract32(s->lmul, 0, 3);
>     int8_t emul = ctzl(eew) - (s->sew + 3) + lmul;  // may remove ctzl() if eew is already log2(eew)
>     return emul < 0 ? 0 : emul;
> }

and pass emul as the LMUL field in VDATA so that it can be
reused in vector_helper.c's vext_max_elems():

> uint8_t emul = vext_get_emul(s, eew);
> data = FIELD_DP32(data, VDATA, LMUL, emul);
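
(A quick sanity check with illustrative numbers, reading eew here as the EEW in
bits: EEW = 16 gives ctzl(16) = 4; SEW = 32 gives s->sew + 3 = 5; LMUL = 4 gives
lmul = 2; so emul = 4 - 5 + 2 = 1, i.e. EMUL = 2, which matches
EEW / SEW * LMUL = 16/32 * 4 = 2.)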

I also removed the code that passes the SEW field in VDATA, as I think SEW
may no longer be required in the updated vext_max_elems() (see below).


>
> > +        /* Return VLMAX */
> > +        return 1 << (ctzl(vlenb) + vext_lmul(desc) - ctzl(esz));
>
> This is overly complicated.
>
> (1) 1 << ctzl(vlenb) == vlenb.
> (2) I'm not sure why esz is not already a log2 number.
>

esz is passed from e.g. GEN_VEXT_LD_STRIDE() macro:

> #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)        \
> void HELPER(NAME)(void *vd, void * v0, target_ulong base,  \
>                   target_ulong stride, CPURISCVState *env, \
>                   uint32_t desc)                           \
> {                                                          \
>     uint32_t vm = vext_vm(desc);                           \
>     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN, \
>                      sizeof(ETYPE), GETPC(), MMU_DATA_LOAD);       \
> }
>
> GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)

which is calculated by sizeof(ETYPE), so the results would be: 1, 2, 4, 8.
and vext_max_elems() is called by e.g. vext_ldst_stride():

> uint32_t max_elems = vext_max_elems(desc, esz);

I can add another parameter to the macro and pass the hard-coded log2(esz) number,
if that is a better way than using ctzl().
Or is there another approach to get the log2(esz) number more elegantly?


>
> This ought to look more like
>
>   int scale = lmul - esz;
>   return (scale < 0
>           ? vlenb >> -scale
>           : vlenb << scale);
>
>
Thanks for the detailed pointers.
I've changed the code as below, following your suggestion.

> static inline uint32_t vext_max_elems(uint32_t desc, uint32_t esz)
> {
>     /*
>      * As simd_desc support at most 256 bytes, the max vlen is 256 bits.
>      * so vlen in bytes (vlenb) is encoded as maxsz.
>      */
>     uint32_t vlenb = simd_maxsz(desc);
>
>     /* Return VLMAX */
>     int scale = vext_lmul(desc) - ctzl(esz);  // may remove ctzl() if esz is already log2(esz)
>     return scale < 0 ? vlenb >> -scale : vlenb << scale;
> }
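
For reference, a minimal standalone sketch (illustrative values only, not part of
the patch) that evaluates the same scale-based computation for VLEN = 128 bits:

  /*
   * Illustrative only: evaluate the scale-based VLMAX computation for
   * VLEN = 128 bits, i.e. vlenb = 16 (assumed values, not from the patch).
   */
  #include <stdint.h>
  #include <stdio.h>

  static uint32_t max_elems(uint32_t vlenb, int32_t lmul, uint32_t log2_esz)
  {
      int32_t scale = lmul - (int32_t)log2_esz;
      return scale < 0 ? vlenb >> -scale : vlenb << scale;
  }

  int main(void)
  {
      uint32_t vlenb = 16;                      /* VLEN = 128 bits */
      printf("%u\n", max_elems(vlenb,  0, 0));  /* LMUL = 1,   SEW = 8  -> 16 */
      printf("%u\n", max_elems(vlenb,  3, 2));  /* LMUL = 8,   SEW = 32 -> 32 */
      printf("%u\n", max_elems(vlenb, -1, 3));  /* LMUL = 1/2, SEW = 64 -> 1  */
      return 0;
  }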


>
> r~
>

Thanks for the review.
Frank Chang
Richard Henderson Aug. 14, 2020, 6:36 p.m. UTC | #3
On 8/13/20 7:48 PM, Frank Chang wrote:
> esz is passed from e.g. GEN_VEXT_LD_STRIDE() macro:
> 
>> #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)        \
>> void HELPER(NAME)(void *vd, void * v0, target_ulong base,  \
>>                   target_ulong stride, CPURISCVState *env, \
>>                   uint32_t desc)                           \
>> {                                                          \
>>     uint32_t vm = vext_vm(desc);                           \
>>     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN, \
>>                      sizeof(ETYPE), GETPC(), MMU_DATA_LOAD);       \
>> }
>>
>> GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
> 
> which is calculated by sizeof(ETYPE), so the results would be: 1, 2, 4, 8.
> and vext_max_elems() is called by e.g. vext_ldst_stride():

Ah, yes.

>> uint32_t max_elems = vext_max_elems(desc, esz);
> 
> I can add another parameter to the macro and pass the hard-coded log2(esz) number
> if it's the better way instead of using ctzl().
> Or if there's another approach to get the log2(esz) number more elegantly?

Using ctzl(sizeof(type)) in the GEN_VEXT_LD_STRIDE macro will work well.  This
will be constant folded by the compiler.
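
A minimal sketch of what that might look like (the names and the log2-based
helper parameter are assumed here, not copied from a later revision):

  /*
   * Sketch only: ctzl(sizeof(ETYPE)) is a compile-time constant, so the
   * helper receives log2 of the element size with no runtime cost.
   */
  #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                   \
  void HELPER(NAME)(void *vd, void *v0, target_ulong base,           \
                    target_ulong stride, CPURISCVState *env,         \
                    uint32_t desc)                                    \
  {                                                                   \
      uint32_t vm = vext_vm(desc);                                    \
      vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,  \
                       ctzl(sizeof(ETYPE)), GETPC(), MMU_DATA_LOAD);  \
  }

  GEN_VEXT_LD_STRIDE(vlse8_v, int8_t, lde_b)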


r~
Frank Chang Aug. 15, 2020, 2:25 a.m. UTC | #4
On Sat, Aug 15, 2020 at 2:36 AM Richard Henderson <
richard.henderson@linaro.org> wrote:

> On 8/13/20 7:48 PM, Frank Chang wrote:
> > esz is passed from e.g. GEN_VEXT_LD_STRIDE() macro:
> >
> >> #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)        \
> >> void HELPER(NAME)(void *vd, void * v0, target_ulong base,  \
> >>                   target_ulong stride, CPURISCVState *env, \
> >>                   uint32_t desc)                           \
> >> {                                                          \
> >>     uint32_t vm = vext_vm(desc);                           \
> >>     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN, \
> >>                      sizeof(ETYPE), GETPC(), MMU_DATA_LOAD);       \
> >> }
> >>
> >> GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
> >
> > which is calculated by sizeof(ETYPE), so the results would be: 1, 2, 4,
> 8.
> > and vext_max_elems() is called by e.g. vext_ldst_stride():
>
> Ah, yes.
>
> >> uint32_t max_elems = vext_max_elems(desc, esz);
> >
> > I can add another parameter to the macro and pass the hard-coded
> log2(esz) number
> > if it's the better way instead of using ctzl().
> > Or if there's another approach to get the log2(esz) number more
> elegantly?
>
> Using ctzl(sizeof(type)) in the GEN_VEXT_LD_STRIDE macro will work well.
> This
> will be constant folded by the compiler.
>
>
> r~
>

Nice, I hadn't thought of the compiler optimization.
I'll fix the code and send out a new version of the patchset.
Thanks for the tips.

Frank Chang
Frank Chang Aug. 15, 2020, 2:52 a.m. UTC | #5
On Sat, Aug 15, 2020 at 2:36 AM Richard Henderson <
richard.henderson@linaro.org> wrote:

> On 8/13/20 7:48 PM, Frank Chang wrote:
> > esz is passed from e.g. GEN_VEXT_LD_STRIDE() macro:
> >
> >> #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)        \
> >> void HELPER(NAME)(void *vd, void * v0, target_ulong base,  \
> >>                   target_ulong stride, CPURISCVState *env, \
> >>                   uint32_t desc)                           \
> >> {                                                          \
> >>     uint32_t vm = vext_vm(desc);                           \
> >>     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN, \
> >>                      sizeof(ETYPE), GETPC(), MMU_DATA_LOAD);       \
> >> }
> >>
> >> GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
> >
> > which is calculated by sizeof(ETYPE), so the results would be: 1, 2, 4,
> 8.
> > and vext_max_elems() is called by e.g. vext_ldst_stride():
>
> Ah, yes.
>
> >> uint32_t max_elems = vext_max_elems(desc, esz);
> >
> > I can add another parameter to the macro and pass the hard-coded
> log2(esz) number
> > if it's the better way instead of using ctzl().
> > Or if there's another approach to get the log2(esz) number more
> elegantly?
>
> Using ctzl(sizeof(type)) in the GEN_VEXT_LD_STRIDE macro will work well.
> This
> will be constant folded by the compiler.
>
>
> r~
>

I checked the code again:
GEN_VEXT_LD_STRIDE() will eventually call vext_ldst_stride() and pass esz
as the parameter.
However, esz is not only used in vext_max_elems(); it is also used in other
calculations, e.g.:

    probe_pages(env, base + stride * i, nf * esz, ra, access_type);
and
    target_ulong addr = base + stride * i + k * esz;

If we pass ctzl(sizeof(type)) in GEN_VEXT_LD_STRIDE(),
I would still have to compute (1 << esz) to get the element size back in the
calculations above.
Wouldn't that eliminate the performance gain we get in vext_max_elems()?

Frank Chang
Richard Henderson Aug. 15, 2020, 5:29 a.m. UTC | #6
On 8/14/20 7:52 PM, Frank Chang wrote:
>     probe_pages(env, base + stride * i, nf * esz, ra, access_type);
> and
>     target_ulong addr = base + stride * i + k * esz;
> 
> If we pass ctzl(sizeof(type)) in GEN_VEXT_LD_STRIDE(),
> I would still have to do: (1 << esz) to get the correct element size in the
> above calculations.
> Would it eliminate the performance gain we have in vext_max_elems() instead?

Well, no, it will improve performance, because you'll write

  addr = base + stride * i + (k << esz)

I.e. strength-reduce the multiply to a shift.
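
As a sketch of both uses after the change (assuming esz is now the log2 element
size; illustrative only, not the final patch):

  /* Sketch only: the former multiplies by the element size become shifts. */
  probe_pages(env, base + stride * i, nf << esz, ra, access_type);

  target_ulong addr = base + stride * i + (k << esz);
  ldst_elem(env, addr, i + k * max_elems, vd, ra);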


r~
Frank Chang Aug. 15, 2020, 9:59 p.m. UTC | #7
On Sat, Aug 15, 2020 at 1:29 PM Richard Henderson <
richard.henderson@linaro.org> wrote:

> On 8/14/20 7:52 PM, Frank Chang wrote:
> >     probe_pages(env, base + stride * i, nf * esz, ra, access_type);
> > and
> >     target_ulong addr = base + stride * i + k * esz;
> >
> > If we pass ctzl(sizeof(type)) in GEN_VEXT_LD_STRIDE(),
> > I would still have to do: (1 << esz) to get the correct element size in
> the
> > above calculations.
> > Would it eliminate the performance gain we have in vext_max_elems()
> instead?
>
> Well, no, it will improve performance, because you'll write
>
>   addr = base + stride * i + (k << esz)
>
> I.e. strength-reduce the multiply to a shift.
>
>
This works like a charm.
Thanks for the advice.

Frank Chang



> r~
>
>

Patch

diff --git a/target/riscv/insn_trans/trans_rvv.inc.c b/target/riscv/insn_trans/trans_rvv.inc.c
index c7e094b6e5b..725f36fcfcc 100644
--- a/target/riscv/insn_trans/trans_rvv.inc.c
+++ b/target/riscv/insn_trans/trans_rvv.inc.c
@@ -609,7 +609,7 @@  static bool ldst_us_trans(uint32_t vd, uint32_t rs1, uint32_t data,
 
     /*
      * As simd_desc supports at most 256 bytes, and in this implementation,
-     * the max vector group length is 2048 bytes. So split it into two parts.
+     * the max vector group length is 1024 bytes. So split it into two parts.
      *
      * The first part is vlen in bytes, encoded in maxsz of simd_desc.
      * The second part is lmul, encoded in data of simd_desc.
@@ -653,6 +653,7 @@  static bool ld_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldst_us_trans(a->rd, a->rs1, data, fn, s, false);
 }
@@ -689,6 +690,7 @@  static bool st_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldst_us_trans(a->rd, a->rs1, data, fn, s, true);
 }
@@ -763,6 +765,7 @@  static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, false);
 }
@@ -791,6 +794,7 @@  static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     fn = fns[seq];
     if (fn == NULL) {
@@ -889,6 +893,7 @@  static bool ld_index_op(DisasContext *s, arg_rnfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, false);
 }
@@ -940,6 +945,7 @@  static bool st_index_op(DisasContext *s, arg_rnfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, true);
 }
@@ -1005,6 +1011,7 @@  static bool ldff_op(DisasContext *s, arg_r2nfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldff_trans(a->rd, a->rs1, data, fn, s);
 }
@@ -1189,6 +1196,7 @@  static bool amo_op(DisasContext *s, arg_rwdvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, WD, a->wd);
     return amo_trans(a->rd, a->rs1, a->rs2, data, fn, s);
 }
diff --git a/target/riscv/internals.h b/target/riscv/internals.h
index bca48297dab..4fb683a7399 100644
--- a/target/riscv/internals.h
+++ b/target/riscv/internals.h
@@ -24,8 +24,9 @@ 
 /* share data between vector helpers and decode code */
 FIELD(VDATA, VM, 0, 1)
 FIELD(VDATA, LMUL, 1, 3)
-FIELD(VDATA, NF, 4, 4)
-FIELD(VDATA, WD, 4, 1)
+FIELD(VDATA, SEW, 4, 3)
+FIELD(VDATA, NF, 7, 4)
+FIELD(VDATA, WD, 7, 1)
 
 /* float point classify helpers */
 target_ulong fclass_h(uint64_t frs1);
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index a58051ded93..77f62c86e02 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -17,6 +17,7 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "qemu/host-utils.h"
 #include "cpu.h"
 #include "exec/memop.h"
 #include "exec/exec-all.h"
@@ -98,6 +99,11 @@  static inline uint32_t vext_vm(uint32_t desc)
     return FIELD_EX32(simd_data(desc), VDATA, VM);
 }
 
+static inline uint32_t vext_sew(uint32_t desc)
+{
+    return FIELD_EX32(simd_data(desc), VDATA, SEW);
+}
+
 /*
  * Encode LMUL to lmul as following:
  *     LMUL    vlmul    lmul
@@ -122,14 +128,35 @@  static uint32_t vext_wd(uint32_t desc)
 }
 
 /*
- * Get vector group length in bytes. Its range is [64, 2048].
+ * Get the maximum number of elements can be operated.
  *
- * As simd_desc support at most 256, the max vlen is 512 bits.
- * So vlen in bytes is encoded as maxsz.
+ * Use ctzl() to get log2(esz) and log2(vlenb)
+ * so that we can use shifts for all arithmetics.
  */
-static inline uint32_t vext_maxsz(uint32_t desc)
+static inline uint32_t vext_max_elems(uint32_t desc, uint32_t esz, bool is_ldst)
 {
-    return simd_maxsz(desc) << vext_lmul(desc);
+    /*
+     * As simd_desc support at most 256 bytes, the max vlen is 256 bits.
+     * so vlen in bytes (vlenb) is encoded as maxsz.
+     */
+    uint32_t vlenb = simd_maxsz(desc);
+
+    if (is_ldst) {
+        /*
+         * Vector load/store instructions have the EEW encoded
+         * directly in the instructions. The maximum vector size is
+         * calculated with EMUL rather than LMUL.
+         */
+        uint32_t eew = ctzl(esz);
+        uint32_t sew = vext_sew(desc);
+        uint32_t lmul = vext_lmul(desc);
+        int32_t emul = eew - sew + lmul;
+        uint32_t emul_r = emul < 0 ? 0 : emul;
+        return 1 << (ctzl(vlenb) + emul_r - ctzl(esz));
+    } else {
+        /* Return VLMAX */
+        return 1 << (ctzl(vlenb) + vext_lmul(desc) - ctzl(esz));
+    }
 }
 
 /*
@@ -224,7 +251,7 @@  vext_ldst_stride(void *vd, void *v0, target_ulong base,
 {
     uint32_t i, k;
     uint32_t nf = vext_nf(desc);
-    uint32_t vlmax = vext_maxsz(desc) / esz;
+    uint32_t max_elems = vext_max_elems(desc, esz, true);
 
     /* probe every access*/
     for (i = 0; i < env->vl; i++) {
@@ -241,7 +268,7 @@  vext_ldst_stride(void *vd, void *v0, target_ulong base,
         }
         while (k < nf) {
             target_ulong addr = base + stride * i + k * esz;
-            ldst_elem(env, addr, i + k * vlmax, vd, ra);
+            ldst_elem(env, addr, i + k * max_elems, vd, ra);
             k++;
         }
     }
@@ -289,7 +316,7 @@  vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
 {
     uint32_t i, k;
     uint32_t nf = vext_nf(desc);
-    uint32_t vlmax = vext_maxsz(desc) / esz;
+    uint32_t max_elems = vext_max_elems(desc, esz, true);
 
     /* probe every access */
     probe_pages(env, base, env->vl * nf * esz, ra, access_type);
@@ -298,7 +325,7 @@  vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
         k = 0;
         while (k < nf) {
             target_ulong addr = base + (i * nf + k) * esz;
-            ldst_elem(env, addr, i + k * vlmax, vd, ra);
+            ldst_elem(env, addr, i + k * max_elems, vd, ra);
             k++;
         }
     }
@@ -379,7 +406,7 @@  vext_ldst_index(void *vd, void *v0, target_ulong base,
     uint32_t i, k;
     uint32_t nf = vext_nf(desc);
     uint32_t vm = vext_vm(desc);
-    uint32_t vlmax = vext_maxsz(desc) / esz;
+    uint32_t max_elems = vext_max_elems(desc, esz, true);
 
     /* probe every access*/
     for (i = 0; i < env->vl; i++) {
@@ -397,7 +424,7 @@  vext_ldst_index(void *vd, void *v0, target_ulong base,
         }
         while (k < nf) {
             abi_ptr addr = get_index_addr(base, i, vs2) + k * esz;
-            ldst_elem(env, addr, i + k * vlmax, vd, ra);
+            ldst_elem(env, addr, i + k * max_elems, vd, ra);
             k++;
         }
     }
@@ -467,7 +494,7 @@  vext_ldff(void *vd, void *v0, target_ulong base,
     uint32_t i, k, vl = 0;
     uint32_t nf = vext_nf(desc);
     uint32_t vm = vext_vm(desc);
-    uint32_t vlmax = vext_maxsz(desc) / esz;
+    uint32_t max_elems = vext_max_elems(desc, esz, true);
     target_ulong addr, offset, remain;
 
     /* probe every access*/
@@ -518,7 +545,7 @@  ProbeSuccess:
         }
         while (k < nf) {
             target_ulong addr = base + (i * nf + k) * esz;
-            ldst_elem(env, addr, i + k * vlmax, vd, ra);
+            ldst_elem(env, addr, i + k * max_elems, vd, ra);
             k++;
         }
     }
@@ -1226,7 +1253,7 @@  void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
                   void *vs2, CPURISCVState *env, uint32_t desc) \
 {                                                               \
     uint32_t vl = env->vl;                                      \
-    uint32_t vlmax = vext_maxsz(desc) / sizeof(ETYPE);          \
+    uint32_t vlmax = vext_max_elems(desc, sizeof(ETYPE), false);\
     uint32_t i;                                                 \
                                                                 \
     for (i = 0; i < vl; i++) {                                  \
@@ -3887,7 +3914,7 @@  void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
 {                                                                   \
     uint32_t vm = vext_vm(desc);                                    \
     uint32_t vl = env->vl;                                          \
-    uint32_t vlmax = vext_maxsz(desc) / sizeof(ETYPE);              \
+    uint32_t vlmax = vext_max_elems(desc, sizeof(ETYPE), false);    \
     uint32_t i;                                                     \
                                                                     \
     for (i = 0; i < vl; i++) {                                      \
@@ -4692,7 +4719,7 @@  GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, uint64_t, H8)
 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
                   CPURISCVState *env, uint32_t desc)                      \
 {                                                                         \
-    uint32_t vlmax = env_archcpu(env)->cfg.vlen;                          \
+    uint32_t vlmax = vext_max_elems(desc, sizeof(ETYPE), false);          \
     uint32_t vm = vext_vm(desc);                                          \
     uint32_t vl = env->vl;                                                \
     uint32_t index, i;                                                    \
@@ -4720,7 +4747,7 @@  GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, H8)
 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
                   CPURISCVState *env, uint32_t desc)                      \
 {                                                                         \
-    uint32_t vlmax = env_archcpu(env)->cfg.vlen;                          \
+    uint32_t vlmax = vext_max_elems(desc, sizeof(ETYPE), false);          \
     uint32_t vm = vext_vm(desc);                                          \
     uint32_t vl = env->vl;                                                \
     uint32_t index = s1, i;                                               \