Message ID | 20200806104709.13235-27-frank.chang@sifive.com |
---|---|
State | New, archived |
Series | target/riscv: support vector extension v1.0 |
On 8/6/20 3:46 AM, frank.chang@sifive.com wrote:
> +static inline uint32_t vext_max_elems(uint32_t desc, uint32_t esz, bool is_ldst)
> {
> -    return simd_maxsz(desc) << vext_lmul(desc);
> +    /*
> +     * As simd_desc support at most 256 bytes, the max vlen is 256 bits.
> +     * so vlen in bytes (vlenb) is encoded as maxsz.
> +     */
> +    uint32_t vlenb = simd_maxsz(desc);
> +
> +    if (is_ldst) {
> +        /*
> +         * Vector load/store instructions have the EEW encoded
> +         * directly in the instructions. The maximum vector size is
> +         * calculated with EMUL rather than LMUL.
> +         */
> +        uint32_t eew = ctzl(esz);
> +        uint32_t sew = vext_sew(desc);
> +        uint32_t lmul = vext_lmul(desc);
> +        int32_t emul = eew - sew + lmul;
> +        uint32_t emul_r = emul < 0 ? 0 : emul;
> +        return 1 << (ctzl(vlenb) + emul_r - ctzl(esz));

As I said before, the is_ldst instructions should put the EEW and EMUL values
into the SEW and LMUL desc fields, so that this does not need to be
special-cased at all.

> +    /* Return VLMAX */
> +    return 1 << (ctzl(vlenb) + vext_lmul(desc) - ctzl(esz));

This is overly complicated.

(1) 1 << ctzl(vlenb) == vlenb.
(2) I'm not sure why esz is not already a log2 number.

This ought to look more like

    int scale = lmul - esz;
    return (scale < 0
            ? vlenb >> -scale
            : vlenb << scale);

r~
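For concreteness, here is a standalone sketch (not part of the patch; vlmax_sketch and its arguments are made-up names) of the shift-based VLMAX computation suggested above, checked against a few VLEN = 128 configurations, including a fractional LMUL where the scale goes negative:

    #include <assert.h>
    #include <stdint.h>

    /*
     * Hypothetical standalone model of the suggested computation:
     * vlenb is VLEN in bytes, lmul and log2_esz are log2 values
     * (lmul may be negative for fractional LMUL).
     */
    static uint32_t vlmax_sketch(uint32_t vlenb, int lmul, int log2_esz)
    {
        int scale = lmul - log2_esz;
        return scale < 0 ? vlenb >> -scale : vlenb << scale;
    }

    int main(void)
    {
        /* VLEN = 128 bits -> vlenb = 16 bytes. */
        assert(vlmax_sketch(16,  0, 0) == 16); /* LMUL = 1,   SEW = 8  -> VLMAX = 16 */
        assert(vlmax_sketch(16,  1, 1) == 16); /* LMUL = 2,   SEW = 16 -> VLMAX = 16 */
        assert(vlmax_sketch(16, -1, 2) ==  2); /* LMUL = 1/2, SEW = 32 -> VLMAX = 2  */
        assert(vlmax_sketch(16,  3, 3) == 16); /* LMUL = 8,   SEW = 64 -> VLMAX = 16 */
        return 0;
    }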
On Fri, Aug 7, 2020 at 8:04 AM Richard Henderson <richard.henderson@linaro.org> wrote:

> On 8/6/20 3:46 AM, frank.chang@sifive.com wrote:
> > +static inline uint32_t vext_max_elems(uint32_t desc, uint32_t esz, bool is_ldst)
> > {
> > -    return simd_maxsz(desc) << vext_lmul(desc);
> > +    /*
> > +     * As simd_desc support at most 256 bytes, the max vlen is 256 bits.
> > +     * so vlen in bytes (vlenb) is encoded as maxsz.
> > +     */
> > +    uint32_t vlenb = simd_maxsz(desc);
> > +
> > +    if (is_ldst) {
> > +        /*
> > +         * Vector load/store instructions have the EEW encoded
> > +         * directly in the instructions. The maximum vector size is
> > +         * calculated with EMUL rather than LMUL.
> > +         */
> > +        uint32_t eew = ctzl(esz);
> > +        uint32_t sew = vext_sew(desc);
> > +        uint32_t lmul = vext_lmul(desc);
> > +        int32_t emul = eew - sew + lmul;
> > +        uint32_t emul_r = emul < 0 ? 0 : emul;
> > +        return 1 << (ctzl(vlenb) + emul_r - ctzl(esz));
>
> As I said before, the is_ldst instructions should put the EEW and EMUL values
> into the SEW and LMUL desc fields, so that this does not need to be
> special-cased at all.

I added a vext_get_emul() helper function in trans_rvv.inc.c:

> static uint8_t vext_get_emul(DisasContext *s, uint8_t eew)
> {
>     int8_t lmul = sextract32(s->lmul, 0, 3);
>     int8_t emul = ctzl(eew) - (s->sew + 3) + lmul;  // may remove ctzl() if eew is already log2(eew)
>     return emul < 0 ? 0 : emul;
> }

and pass emul as the LMUL field in VDATA so that it can be reused in
vector_helper.c's vext_max_elems():

> uint8_t emul = vext_get_emul(s, eew);
> data = FIELD_DP32(data, VDATA, LMUL, emul);

I also removed the code that passes the SEW field in VDATA, as I think SEW
might not be required in the updated vext_max_elems() (see below).

> > +    /* Return VLMAX */
> > +    return 1 << (ctzl(vlenb) + vext_lmul(desc) - ctzl(esz));
>
> This is overly complicated.
>
> (1) 1 << ctzl(vlenb) == vlenb.
> (2) I'm not sure why esz is not already a log2 number.

esz is passed from e.g. the GEN_VEXT_LD_STRIDE() macro:

> #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                  \
> void HELPER(NAME)(void *vd, void *v0, target_ulong base,          \
>                   target_ulong stride, CPURISCVState *env,        \
>                   uint32_t desc)                                   \
> {                                                                  \
>     uint32_t vm = vext_vm(desc);                                   \
>     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN, \
>                      sizeof(ETYPE), GETPC(), MMU_DATA_LOAD);       \
> }
>
> GEN_VEXT_LD_STRIDE(vlse8_v, int8_t, lde_b)

which is calculated with sizeof(ETYPE), so the results would be 1, 2, 4, or 8.
vext_max_elems() is then called by e.g. vext_ldst_stride():

> uint32_t max_elems = vext_max_elems(desc, esz);

I can add another parameter to the macro and pass the hard-coded log2(esz)
number if that is better than using ctzl().
Or is there another approach to get the log2(esz) number more elegantly?

> This ought to look more like
>
>     int scale = lmul - esz;
>     return (scale < 0
>             ? vlenb >> -scale
>             : vlenb << scale);

Thanks for the detailed pointers. I've changed the code as below, following
your suggestion:

> static inline uint32_t vext_max_elems(uint32_t desc, uint32_t esz)
> {
>     /*
>      * As simd_desc support at most 256 bytes, the max vlen is 256 bits.
>      * so vlen in bytes (vlenb) is encoded as maxsz.
>      */
>     uint32_t vlenb = simd_maxsz(desc);
>
>     /* Return VLMAX */
>     int scale = vext_lmul(desc) - ctzl(esz);  // may remove ctzl() if esz is already log2(esz)
>     return scale < 0 ? vlenb >> -scale : vlenb << scale;
> }

> r~

Thanks for the review.

Frank Chang
On 8/13/20 7:48 PM, Frank Chang wrote:
> esz is passed from e.g. the GEN_VEXT_LD_STRIDE() macro:
>
>> #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                  \
>> void HELPER(NAME)(void *vd, void *v0, target_ulong base,          \
>>                   target_ulong stride, CPURISCVState *env,        \
>>                   uint32_t desc)                                   \
>> {                                                                  \
>>     uint32_t vm = vext_vm(desc);                                   \
>>     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN, \
>>                      sizeof(ETYPE), GETPC(), MMU_DATA_LOAD);       \
>> }
>>
>> GEN_VEXT_LD_STRIDE(vlse8_v, int8_t, lde_b)
>
> which is calculated with sizeof(ETYPE), so the results would be 1, 2, 4, or 8.
> vext_max_elems() is then called by e.g. vext_ldst_stride():

Ah, yes.

>> uint32_t max_elems = vext_max_elems(desc, esz);
>
> I can add another parameter to the macro and pass the hard-coded log2(esz)
> number if that is better than using ctzl().
> Or is there another approach to get the log2(esz) number more elegantly?

Using ctzl(sizeof(type)) in the GEN_VEXT_LD_STRIDE macro will work well.  This
will be constant folded by the compiler.

r~
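A minimal sketch of what that suggestion could look like, assuming the helper's esz parameter is repurposed to carry log2(element size); the exact shape of the macro in the eventual patch may differ:

    /*
     * Sketch only: pass log2(element size) to the helper so that all
     * element-size arithmetic can be done with shifts.  ctzl(sizeof(ETYPE))
     * is a compile-time constant, so the compiler folds it away.
     */
    #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                  \
    void HELPER(NAME)(void *vd, void *v0, target_ulong base,          \
                      target_ulong stride, CPURISCVState *env,        \
                      uint32_t desc)                                   \
    {                                                                  \
        uint32_t vm = vext_vm(desc);                                   \
        vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN, \
                         ctzl(sizeof(ETYPE)), GETPC(), MMU_DATA_LOAD); \
    }

    GEN_VEXT_LD_STRIDE(vlse8_v, int8_t, lde_b)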
On Sat, Aug 15, 2020 at 2:36 AM Richard Henderson <richard.henderson@linaro.org> wrote:

> On 8/13/20 7:48 PM, Frank Chang wrote:
> > [...]
> > Or is there another approach to get the log2(esz) number more elegantly?
>
> Using ctzl(sizeof(type)) in the GEN_VEXT_LD_STRIDE macro will work well.
> This will be constant folded by the compiler.
>
> r~

Nice, I hadn't thought of the compiler optimization.
Will fix the code and send out a new version of the patchset.
Thanks for the tips.

Frank Chang
On Sat, Aug 15, 2020 at 2:36 AM Richard Henderson <richard.henderson@linaro.org> wrote:

> On 8/13/20 7:48 PM, Frank Chang wrote:
> > [...]
> > I can add another parameter to the macro and pass the hard-coded log2(esz)
> > number if that is better than using ctzl().
> > Or is there another approach to get the log2(esz) number more elegantly?
>
> Using ctzl(sizeof(type)) in the GEN_VEXT_LD_STRIDE macro will work well.
> This will be constant folded by the compiler.
>
> r~

Checked the code again: GEN_VEXT_LD_STRIDE() will eventually call
vext_ldst_stride() and pass esz as the parameter.
However, esz is not only used in vext_max_elems() but also in other
calculations, e.g.:

    probe_pages(env, base + stride * i, nf * esz, ra, access_type);

and

    target_ulong addr = base + stride * i + k * esz;

If we pass ctzl(sizeof(type)) in GEN_VEXT_LD_STRIDE(),
I would still have to do: (1 << esz) to get the correct element size in the
above calculations.
Would it eliminate the performance gain we have in vext_max_elems() instead?

Frank Chang
On 8/14/20 7:52 PM, Frank Chang wrote:
> probe_pages(env, base + stride * i, nf * esz, ra, access_type);
> and
> target_ulong addr = base + stride * i + k * esz;
>
> If we pass ctzl(sizeof(type)) in GEN_VEXT_LD_STRIDE(),
> I would still have to do: (1 << esz) to get the correct element size in the
> above calculations.
> Would it eliminate the performance gain we have in vext_max_elems() instead?

Well, no, it will improve performance, because you'll write

    addr = base + stride * i + (k << esz)

I.e. strength-reduce the multiply to a shift.

r~
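A small self-contained check (hypothetical names, not part of the patch) that the shift form computes the same address as the multiply form once the parameter carries log2 of the element size:

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t target_ulong;   /* stand-in for QEMU's target_ulong */

    /* Address with esz as a byte count (multiply form). */
    static target_ulong addr_mul(target_ulong base, target_ulong stride,
                                 uint32_t i, uint32_t k, uint32_t esz)
    {
        return base + stride * i + k * esz;
    }

    /* Address with esz as log2(bytes): the multiply becomes a shift. */
    static target_ulong addr_shift(target_ulong base, target_ulong stride,
                                   uint32_t i, uint32_t k, uint32_t log2_esz)
    {
        return base + stride * i + ((target_ulong)k << log2_esz);
    }

    int main(void)
    {
        for (uint32_t k = 0; k < 8; k++) {
            /* 4-byte elements: log2(4) == 2. */
            assert(addr_mul(0x1000, 64, 3, k, 4) == addr_shift(0x1000, 64, 3, k, 2));
        }
        return 0;
    }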
On Sat, Aug 15, 2020 at 1:29 PM Richard Henderson <richard.henderson@linaro.org> wrote:

> On 8/14/20 7:52 PM, Frank Chang wrote:
> > [...]
> > Would it eliminate the performance gain we have in vext_max_elems() instead?
>
> Well, no, it will improve performance, because you'll write
>
>     addr = base + stride * i + (k << esz)
>
> I.e. strength-reduce the multiply to a shift.
>
> r~

This works like a charm. Thanks for the advice.

Frank Chang
diff --git a/target/riscv/insn_trans/trans_rvv.inc.c b/target/riscv/insn_trans/trans_rvv.inc.c
index c7e094b6e5b..725f36fcfcc 100644
--- a/target/riscv/insn_trans/trans_rvv.inc.c
+++ b/target/riscv/insn_trans/trans_rvv.inc.c
@@ -609,7 +609,7 @@ static bool ldst_us_trans(uint32_t vd, uint32_t rs1, uint32_t data,
 
 /*
  * As simd_desc supports at most 256 bytes, and in this implementation,
- * the max vector group length is 2048 bytes. So split it into two parts.
+ * the max vector group length is 1024 bytes. So split it into two parts.
  *
  * The first part is vlen in bytes, encoded in maxsz of simd_desc.
  * The second part is lmul, encoded in data of simd_desc.
@@ -653,6 +653,7 @@ static bool ld_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldst_us_trans(a->rd, a->rs1, data, fn, s, false);
 }
@@ -689,6 +690,7 @@ static bool st_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldst_us_trans(a->rd, a->rs1, data, fn, s, true);
 }
@@ -763,6 +765,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, false);
 }
@@ -791,6 +794,7 @@ static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     fn = fns[seq];
     if (fn == NULL) {
@@ -889,6 +893,7 @@ static bool ld_index_op(DisasContext *s, arg_rnfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, false);
 }
@@ -940,6 +945,7 @@ static bool st_index_op(DisasContext *s, arg_rnfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, true);
 }
@@ -1005,6 +1011,7 @@ static bool ldff_op(DisasContext *s, arg_r2nfvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, NF, a->nf);
     return ldff_trans(a->rd, a->rs1, data, fn, s);
 }
@@ -1189,6 +1196,7 @@ static bool amo_op(DisasContext *s, arg_rwdvm *a, uint8_t seq)
 
     data = FIELD_DP32(data, VDATA, VM, a->vm);
     data = FIELD_DP32(data, VDATA, LMUL, s->lmul);
+    data = FIELD_DP32(data, VDATA, SEW, s->sew);
     data = FIELD_DP32(data, VDATA, WD, a->wd);
     return amo_trans(a->rd, a->rs1, a->rs2, data, fn, s);
 }
diff --git a/target/riscv/internals.h b/target/riscv/internals.h
index bca48297dab..4fb683a7399 100644
--- a/target/riscv/internals.h
+++ b/target/riscv/internals.h
@@ -24,8 +24,9 @@
 /* share data between vector helpers and decode code */
 FIELD(VDATA, VM, 0, 1)
 FIELD(VDATA, LMUL, 1, 3)
-FIELD(VDATA, NF, 4, 4)
-FIELD(VDATA, WD, 4, 1)
+FIELD(VDATA, SEW, 4, 3)
+FIELD(VDATA, NF, 7, 4)
+FIELD(VDATA, WD, 7, 1)
 
 /* float point classify helpers */
 target_ulong fclass_h(uint64_t frs1);
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index a58051ded93..77f62c86e02 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -17,6 +17,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/host-utils.h"
 #include "cpu.h"
 #include "exec/memop.h"
 #include "exec/exec-all.h"
@@ -98,6 +99,11 @@
     return FIELD_EX32(simd_data(desc), VDATA, VM);
 }
 
+static inline uint32_t vext_sew(uint32_t desc)
+{
+    return FIELD_EX32(simd_data(desc), VDATA, SEW);
+}
+
 /*
  * Encode LMUL to lmul as following:
  *     LMUL    vlmul    lmul
@@ -122,14 +128,35 @@ static uint32_t vext_wd(uint32_t desc)
 }
 
 /*
- * Get vector group length in bytes. Its range is [64, 2048].
+ * Get the maximum number of elements can be operated.
  *
- * As simd_desc support at most 256, the max vlen is 512 bits.
- * So vlen in bytes is encoded as maxsz.
+ * Use ctzl() to get log2(esz) and log2(vlenb)
+ * so that we can use shifts for all arithmetics.
  */
-static inline uint32_t vext_maxsz(uint32_t desc)
+static inline uint32_t vext_max_elems(uint32_t desc, uint32_t esz, bool is_ldst)
 {
-    return simd_maxsz(desc) << vext_lmul(desc);
+    /*
+     * As simd_desc support at most 256 bytes, the max vlen is 256 bits.
+     * so vlen in bytes (vlenb) is encoded as maxsz.
+     */
+    uint32_t vlenb = simd_maxsz(desc);
+
+    if (is_ldst) {
+        /*
+         * Vector load/store instructions have the EEW encoded
+         * directly in the instructions. The maximum vector size is
+         * calculated with EMUL rather than LMUL.
+         */
+        uint32_t eew = ctzl(esz);
+        uint32_t sew = vext_sew(desc);
+        uint32_t lmul = vext_lmul(desc);
+        int32_t emul = eew - sew + lmul;
+        uint32_t emul_r = emul < 0 ? 0 : emul;
+        return 1 << (ctzl(vlenb) + emul_r - ctzl(esz));
+    } else {
+        /* Return VLMAX */
+        return 1 << (ctzl(vlenb) + vext_lmul(desc) - ctzl(esz));
+    }
 }
 
 /*
@@ -224,7 +251,7 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base,
 {
     uint32_t i, k;
     uint32_t nf = vext_nf(desc);
-    uint32_t vlmax = vext_maxsz(desc) / esz;
+    uint32_t max_elems = vext_max_elems(desc, esz, true);
 
     /* probe every access*/
     for (i = 0; i < env->vl; i++) {
@@ -241,7 +268,7 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base,
         }
         while (k < nf) {
             target_ulong addr = base + stride * i + k * esz;
-            ldst_elem(env, addr, i + k * vlmax, vd, ra);
+            ldst_elem(env, addr, i + k * max_elems, vd, ra);
             k++;
         }
     }
@@ -289,7 +316,7 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
 {
     uint32_t i, k;
     uint32_t nf = vext_nf(desc);
-    uint32_t vlmax = vext_maxsz(desc) / esz;
+    uint32_t max_elems = vext_max_elems(desc, esz, true);
 
     /* probe every access */
     probe_pages(env, base, env->vl * nf * esz, ra, access_type);
@@ -298,7 +325,7 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
         k = 0;
         while (k < nf) {
             target_ulong addr = base + (i * nf + k) * esz;
-            ldst_elem(env, addr, i + k * vlmax, vd, ra);
+            ldst_elem(env, addr, i + k * max_elems, vd, ra);
             k++;
         }
     }
@@ -379,7 +406,7 @@ vext_ldst_index(void *vd, void *v0, target_ulong base,
     uint32_t i, k;
     uint32_t nf = vext_nf(desc);
     uint32_t vm = vext_vm(desc);
-    uint32_t vlmax = vext_maxsz(desc) / esz;
+    uint32_t max_elems = vext_max_elems(desc, esz, true);
 
     /* probe every access*/
     for (i = 0; i < env->vl; i++) {
@@ -397,7 +424,7 @@ vext_ldst_index(void *vd, void *v0, target_ulong base,
         }
         while (k < nf) {
             abi_ptr addr = get_index_addr(base, i, vs2) + k * esz;
-            ldst_elem(env, addr, i + k * vlmax, vd, ra);
+            ldst_elem(env, addr, i + k * max_elems, vd, ra);
             k++;
         }
     }
@@ -467,7 +494,7 @@ vext_ldff(void *vd, void *v0, target_ulong base,
     uint32_t i, k, vl = 0;
     uint32_t nf = vext_nf(desc);
     uint32_t vm = vext_vm(desc);
-    uint32_t vlmax = vext_maxsz(desc) / esz;
+    uint32_t max_elems = vext_max_elems(desc, esz, true);
     target_ulong addr, offset, remain;
 
     /* probe every access*/
@@ -518,7 +545,7 @@ ProbeSuccess:
         }
         while (k < nf) {
             target_ulong addr = base + (i * nf + k) * esz;
-            ldst_elem(env, addr, i + k * vlmax, vd, ra);
+            ldst_elem(env, addr, i + k * max_elems, vd, ra);
             k++;
         }
     }
@@ -1226,7 +1253,7 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1,           \
                   void *vs2, CPURISCVState *env, uint32_t desc)  \
 {                                                                 \
     uint32_t vl = env->vl;                                        \
-    uint32_t vlmax = vext_maxsz(desc) / sizeof(ETYPE);            \
+    uint32_t vlmax = vext_max_elems(desc, sizeof(ETYPE), false);  \
     uint32_t i;                                                   \
                                                                   \
     for (i = 0; i < vl; i++) {                                    \
@@ -3887,7 +3914,7 @@ void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,     \
 {                                                                 \
     uint32_t vm = vext_vm(desc);                                  \
     uint32_t vl = env->vl;                                        \
-    uint32_t vlmax = vext_maxsz(desc) / sizeof(ETYPE);            \
+    uint32_t vlmax = vext_max_elems(desc, sizeof(ETYPE), false);  \
     uint32_t i;                                                   \
                                                                   \
     for (i = 0; i < vl; i++) {                                    \
@@ -4692,7 +4719,7 @@ GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, uint64_t, H8)
 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,       \
                   CPURISCVState *env, uint32_t desc)              \
 {                                                                 \
-    uint32_t vlmax = env_archcpu(env)->cfg.vlen;                  \
+    uint32_t vlmax = vext_max_elems(desc, sizeof(ETYPE), false);  \
     uint32_t vm = vext_vm(desc);                                  \
     uint32_t vl = env->vl;                                        \
     uint32_t index, i;                                            \
@@ -4720,7 +4747,7 @@ GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, H8)
 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                   CPURISCVState *env, uint32_t desc)              \
 {                                                                 \
-    uint32_t vlmax = env_archcpu(env)->cfg.vlen;                  \
+    uint32_t vlmax = vext_max_elems(desc, sizeof(ETYPE), false);  \
     uint32_t vm = vext_vm(desc);                                  \
     uint32_t vl = env->vl;                                        \
     uint32_t index = s1, i;                                       \
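For reference, a self-contained sketch of the new VDATA layout from the internals.h hunk above (VM at bit 0, LMUL at bits 1-3, SEW at bits 4-6, NF or WD from bit 7); deposit32() and extract32() below are simplified stand-ins for QEMU's helpers, written here only to illustrate the packing:

    #include <assert.h>
    #include <stdint.h>

    /* Simplified stand-ins for QEMU's deposit32()/extract32(). */
    static uint32_t deposit32(uint32_t v, int start, int len, uint32_t field)
    {
        uint32_t mask = ((1u << len) - 1) << start;
        return (v & ~mask) | ((field << start) & mask);
    }

    static uint32_t extract32(uint32_t v, int start, int len)
    {
        return (v >> start) & ((1u << len) - 1);
    }

    int main(void)
    {
        uint32_t data = 0;
        data = deposit32(data, 0, 1, 1);   /* VDATA.VM,   1 bit  at bit 0 */
        data = deposit32(data, 1, 3, 2);   /* VDATA.LMUL, 3 bits at bit 1 */
        data = deposit32(data, 4, 3, 1);   /* VDATA.SEW,  3 bits at bit 4 */
        data = deposit32(data, 7, 4, 3);   /* VDATA.NF,   4 bits at bit 7 */

        assert(extract32(data, 0, 1) == 1);
        assert(extract32(data, 1, 3) == 2);
        assert(extract32(data, 4, 3) == 1);
        assert(extract32(data, 7, 4) == 3);
        return 0;
    }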