Message ID | 20240830061607.1940-5-zhiwei_liu@linux.alibaba.com
---|---
State | New, archived
Series | tcg/riscv: Add support for vector
On 8/30/24 16:15, LIU Zhiwei wrote:
> From: TANG Tiancheng <tangtiancheng.ttc@alibaba-inc.com>
>
> In RISC-V, vector operations require initial configuration using
> the vset{i}vl{i} instruction.
>
> This instruction:
>   1. Sets the vector length (vl) in bytes
>   2. Configures the vtype register, which includes:
>        SEW (Single Element Width)
>        LMUL (vector register group multiplier)
>        Other vector operation parameters
>
> This configuration is crucial for defining subsequent vector
> operation behavior. To optimize performance, the configuration
> process is managed dynamically:
>   1. Reconfiguration using vset{i}vl{i} is necessary when SEW
>      or vector register group width changes.
>   2. The vset instruction can be omitted when configuration
>      remains unchanged.
>
> This optimization is only effective within a single TB.
> Each TB requires reconfiguration at its start, as the current
> state cannot be obtained from hardware.
>
> Signed-off-by: TANG Tiancheng <tangtiancheng.ttc@alibaba-inc.com>
> Signed-off-by: Weiwei Li <liwei1518@gmail.com>
> Reviewed-by: Liu Zhiwei <zhiwei_liu@linux.alibaba.com>
> ---
>  tcg/riscv/tcg-target.c.inc | 104 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 104 insertions(+)
>
> diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
> index 5ef1538aed..49d01b8775 100644
> --- a/tcg/riscv/tcg-target.c.inc
> +++ b/tcg/riscv/tcg-target.c.inc
> @@ -119,6 +119,7 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
>  #define GET_VREG_SET(vlen) (vlen == 64 ? ALL_QVECTOR_REG_GROUPS : \
>                             (vlen == 128 ? ALL_DVECTOR_REG_GROUPS : \
>                              ALL_VECTOR_REGS))
> +#define riscv_vlenb (riscv_vlen / 8)
>
>  #define sextreg sextract64
>
> @@ -168,6 +169,18 @@ static bool tcg_target_const_match(int64_t val, int ct,
>   * RISC-V Base ISA opcodes (IM)
>   */
>
> +#define V_OPIVV (0x0 << 12)
> +#define V_OPFVV (0x1 << 12)
> +#define V_OPMVV (0x2 << 12)
> +#define V_OPIVI (0x3 << 12)
> +#define V_OPIVX (0x4 << 12)
> +#define V_OPFVF (0x5 << 12)
> +#define V_OPMVX (0x6 << 12)
> +#define V_OPCFG (0x7 << 12)
> +
> +#define V_SUMOP (0x0 << 20)
> +#define V_LUMOP (0x0 << 20)
> +
>  typedef enum {
>      OPC_ADD = 0x33,
>      OPC_ADDI = 0x13,
> @@ -263,6 +276,11 @@ typedef enum {
>      /* Zicond: integer conditional operations */
>      OPC_CZERO_EQZ = 0x0e005033,
>      OPC_CZERO_NEZ = 0x0e007033,
> +
> +    /* V: Vector extension 1.0 */
> +    OPC_VSETVLI = 0x57 | V_OPCFG,
> +    OPC_VSETIVLI = 0xc0000057 | V_OPCFG,
> +    OPC_VSETVL = 0x80000057 | V_OPCFG,
>  } RISCVInsn;
>
>  /*
> @@ -355,6 +373,35 @@ static int32_t encode_uj(RISCVInsn opc, TCGReg rd, uint32_t imm)
>      return opc | (rd & 0x1f) << 7 | encode_ujimm20(imm);
>  }
>
> +typedef enum {
> +    VTA_TU = 0,
> +    VTA_TA,
> +} RISCVVta;
> +
> +typedef enum {
> +    VMA_MU = 0,
> +    VMA_MA,
> +} RISCVVma;

Do these really need enumerators, or would 'bool' be sufficient?

> +static int32_t encode_vtypei(RISCVVta vta, RISCVVma vma,
> +                             unsigned vsew, RISCVVlmul vlmul)
> +{
> +    return (vma & 0x1) << 7 | (vta & 0x1) << 6 | (vsew & 0x7) << 3 |
> +           (vlmul & 0x7);
> +}

s/vtypei/vtype/g?  vtype is only an immediate in specific contexts, and you'll
match the manual better if you talk about vtype the CSR rather than the
vset*vli argument.

Assert that values are in range rather than masking them.

Use MemOp for vsew, since you're using MO_64, etc.
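For concreteness, the combination of those three suggestions might look
something like this (an untested sketch; the exact signature is an
assumption, not taken from a later revision of the patch):

    static int32_t encode_vtype(bool vta, bool vma,
                                MemOp vsew, RISCVVlmul vlmul)
    {
        /* Assert ranges instead of silently masking out-of-range values. */
        tcg_debug_assert(vsew <= MO_64);
        tcg_debug_assert(vlmul != VLMUL_RESERVED);
        return vma << 7 | vta << 6 | vsew << 3 | vlmul;
    }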
> @@ -498,12 +551,62 @@ static void tcg_out_opc_reg_vec_i(TCGContext *s, RISCVInsn opc,
>  #define tcg_out_opc_vi(s, opc, vd, vs2, imm, vm) \
>      tcg_out_opc_reg_vec_i(s, opc, vd, imm, vs2, vm);
>
> +#define tcg_out_opc_vconfig(s, opc, rd, avl, vtypei) \
> +    tcg_out_opc_vec_config(s, opc, rd, avl, vtypei);

Why the extra define?

> +
> +/*
> + * TODO: If the vtype value is not supported by the implementation,
> + * then the vill bit is set in vtype, the remaining bits in
> + * vtype are set to zero, and the vl register is also set to zero
> + */

Why is this a TODO?  Are you suggesting that we might need to probe *all* of
the cases at startup?

> +static __thread int prev_vtypei;

I think we should put this into TCGContext.  We don't currently have any
host-specific values there, but there's no reason we can't have any.

> +#define get_vlmax(vsew) (riscv_vlen / (8 << vsew) * (LMUL_MAX))

Given that we know LMUL_MAX is 8, doesn't this cancel out?

> +#define get_vec_type_bytes(type) (type >= TCG_TYPE_V64 ? \
> +                                  (8 << (type - TCG_TYPE_V64)) : 0)

Again, assert rather than produce nonsense results.  And this doesn't need
hiding in a macro.

> +#define calc_vlmul(oprsz) (ctzl(oprsz / riscv_vlenb))

I think it's clearer to do this inline, where we can see that oprsz > vlenb.

> +
> +static void tcg_target_set_vec_config(TCGContext *s, TCGType type,
> +                                      unsigned vece)
> +{
> +    unsigned vsew, oprsz, avl;
> +    int vtypei;
> +    RISCVVlmul vlmul;
> +
> +    vsew = vece;

You can just name the argument vsew...

> +    oprsz = get_vec_type_bytes(type);
> +    avl = oprsz / (1 << vece);
> +    vlmul = oprsz > riscv_vlenb ?
> +            calc_vlmul(oprsz) : VLMUL_M1;

I guess it is always the case that full register operations are preferred over
fractional?

> +    vtypei = encode_vtypei(VTA_TA, VMA_MA, vsew, vlmul);
> +
> +    tcg_debug_assert(avl <= get_vlmax(vsew));
> +    tcg_debug_assert(vlmul <= VLMUL_RESERVED);
> +    tcg_debug_assert(vsew <= MO_64);

These asserts should be moved higher, above their first uses.

r~
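To spell out the cancellation called out above: with LMUL_MAX fixed at 8,
get_vlmax() reduces to a single shift. A minimal sketch, reusing the patch's
riscv_vlen and assuming the MemOp-typed vsew suggested earlier:

    static unsigned get_vlmax(MemOp vsew)
    {
        /*
         * riscv_vlen / (8 << vsew) * LMUL_MAX, with LMUL_MAX == 8,
         * simplifies to riscv_vlen >> vsew.
         * e.g. VLEN = 128, vsew = MO_32: 128 / 32 * 8 == 32 == 128 >> 2.
         */
        return riscv_vlen >> vsew;
    }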
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index 5ef1538aed..49d01b8775 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -119,6 +119,7 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
 #define GET_VREG_SET(vlen) (vlen == 64 ? ALL_QVECTOR_REG_GROUPS : \
                            (vlen == 128 ? ALL_DVECTOR_REG_GROUPS : \
                             ALL_VECTOR_REGS))
+#define riscv_vlenb (riscv_vlen / 8)
 
 #define sextreg sextract64
 
@@ -168,6 +169,18 @@ static bool tcg_target_const_match(int64_t val, int ct,
  * RISC-V Base ISA opcodes (IM)
  */
 
+#define V_OPIVV (0x0 << 12)
+#define V_OPFVV (0x1 << 12)
+#define V_OPMVV (0x2 << 12)
+#define V_OPIVI (0x3 << 12)
+#define V_OPIVX (0x4 << 12)
+#define V_OPFVF (0x5 << 12)
+#define V_OPMVX (0x6 << 12)
+#define V_OPCFG (0x7 << 12)
+
+#define V_SUMOP (0x0 << 20)
+#define V_LUMOP (0x0 << 20)
+
 typedef enum {
     OPC_ADD = 0x33,
     OPC_ADDI = 0x13,
@@ -263,6 +276,11 @@ typedef enum {
     /* Zicond: integer conditional operations */
     OPC_CZERO_EQZ = 0x0e005033,
     OPC_CZERO_NEZ = 0x0e007033,
+
+    /* V: Vector extension 1.0 */
+    OPC_VSETVLI = 0x57 | V_OPCFG,
+    OPC_VSETIVLI = 0xc0000057 | V_OPCFG,
+    OPC_VSETVL = 0x80000057 | V_OPCFG,
 } RISCVInsn;
 
 /*
@@ -355,6 +373,35 @@ static int32_t encode_uj(RISCVInsn opc, TCGReg rd, uint32_t imm)
     return opc | (rd & 0x1f) << 7 | encode_ujimm20(imm);
 }
 
+typedef enum {
+    VTA_TU = 0,
+    VTA_TA,
+} RISCVVta;
+
+typedef enum {
+    VMA_MU = 0,
+    VMA_MA,
+} RISCVVma;
+
+typedef enum {
+    VLMUL_M1 = 0, /* LMUL=1 */
+    VLMUL_M2,     /* LMUL=2 */
+    VLMUL_M4,     /* LMUL=4 */
+    VLMUL_M8,     /* LMUL=8 */
+    VLMUL_RESERVED,
+    VLMUL_MF8,    /* LMUL=1/8 */
+    VLMUL_MF4,    /* LMUL=1/4 */
+    VLMUL_MF2,    /* LMUL=1/2 */
+} RISCVVlmul;
+#define LMUL_MAX 8
+
+static int32_t encode_vtypei(RISCVVta vta, RISCVVma vma,
+                             unsigned vsew, RISCVVlmul vlmul)
+{
+    return (vma & 0x1) << 7 | (vta & 0x1) << 6 | (vsew & 0x7) << 3 |
+           (vlmul & 0x7);
+}
+
 /*
  * RISC-V instruction emitters
  */
@@ -484,6 +531,12 @@ static void tcg_out_opc_reg_vec_i(TCGContext *s, RISCVInsn opc,
     tcg_out32(s, encode_r(opc, rd, (imm & 0x1f), vs2) | (vm << 25));
 }
 
+static void tcg_out_opc_vec_config(TCGContext *s, RISCVInsn opc,
+                                   TCGReg rd, uint32_t avl, int32_t vtypei)
+{
+    tcg_out32(s, encode_i(opc, rd, avl, vtypei));
+}
+
 /* vm=0 (vm = false) means vector masking ENABLED. */
 #define tcg_out_opc_vv(s, opc, vd, vs2, vs1, vm) \
     tcg_out_opc_reg_vec(s, opc, vd, vs1, vs2, vm);
@@ -498,12 +551,62 @@ static void tcg_out_opc_reg_vec_i(TCGContext *s, RISCVInsn opc,
 #define tcg_out_opc_vi(s, opc, vd, vs2, imm, vm) \
     tcg_out_opc_reg_vec_i(s, opc, vd, imm, vs2, vm);
 
+#define tcg_out_opc_vconfig(s, opc, rd, avl, vtypei) \
+    tcg_out_opc_vec_config(s, opc, rd, avl, vtypei);
+
 /*
  * Only unit-stride addressing implemented; may extend in future.
  */
 #define tcg_out_opc_ldst_vec(s, opc, vs3_vd, rs1, vm) \
     tcg_out_opc_reg_vec(s, opc, vs3_vd, rs1, 0, vm);
 
+static void tcg_out_vsetvl(TCGContext *s, uint32_t avl, int vtypei)
+{
+    if (avl < 32) {
+        tcg_out_opc_vconfig(s, OPC_VSETIVLI, TCG_REG_ZERO, avl, vtypei);
+    } else {
+        tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, TCG_REG_ZERO, avl);
+        tcg_out_opc_vconfig(s, OPC_VSETVLI, TCG_REG_ZERO, TCG_REG_TMP0, vtypei);
+    }
+}
+
+/*
+ * TODO: If the vtype value is not supported by the implementation,
+ * then the vill bit is set in vtype, the remaining bits in
+ * vtype are set to zero, and the vl register is also set to zero
+ */
+
+static __thread int prev_vtypei;
+
+#define get_vlmax(vsew) (riscv_vlen / (8 << vsew) * (LMUL_MAX))
+#define get_vec_type_bytes(type) (type >= TCG_TYPE_V64 ? \
+                                  (8 << (type - TCG_TYPE_V64)) : 0)
+#define calc_vlmul(oprsz) (ctzl(oprsz / riscv_vlenb))
+
+static void tcg_target_set_vec_config(TCGContext *s, TCGType type,
+                                      unsigned vece)
+{
+    unsigned vsew, oprsz, avl;
+    int vtypei;
+    RISCVVlmul vlmul;
+
+    vsew = vece;
+    oprsz = get_vec_type_bytes(type);
+    avl = oprsz / (1 << vece);
+    vlmul = oprsz > riscv_vlenb ?
+            calc_vlmul(oprsz) : VLMUL_M1;
+    vtypei = encode_vtypei(VTA_TA, VMA_MA, vsew, vlmul);
+
+    tcg_debug_assert(avl <= get_vlmax(vsew));
+    tcg_debug_assert(vlmul <= VLMUL_RESERVED);
+    tcg_debug_assert(vsew <= MO_64);
+
+    if (vtypei != prev_vtypei) {
+        prev_vtypei = vtypei;
+        tcg_out_vsetvl(s, avl, vtypei);
+    }
+}
+
 /*
  * TCG intrinsics
  */
@@ -2152,6 +2255,7 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 static void tcg_out_tb_start(TCGContext *s)
 {
+    prev_vtypei = -1;
     /* nothing to do */
 }
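To trace the configuration path in the patch end to end, a worked example;
the host VLEN of 128 and the operand type are assumed for illustration, not
taken from the thread:

    /*
     * Assume VLEN = 128 (riscv_vlenb = 16), and a TCG_TYPE_V256
     * operation on 32-bit elements (vece = MO_32):
     *
     *   oprsz  = 8 << (TCG_TYPE_V256 - TCG_TYPE_V64)      = 32 bytes
     *   avl    = oprsz / (1 << vece) = 32 / 4             = 8 elements
     *   vlmul  = ctzl(oprsz / riscv_vlenb) = ctzl(2)      = VLMUL_M2
     *   vtypei = encode_vtypei(VTA_TA, VMA_MA, MO_32, VLMUL_M2)
     *          = 1 << 7 | 1 << 6 | 2 << 3 | 1             = 0xd1
     *
     * Since avl (8) < 32, tcg_out_vsetvl() emits a single
     *   vsetivli zero, 8, e32, m2, ta, ma
     * and subsequent vector ops in the same TB skip the vset while
     * prev_vtypei remains 0xd1.
     */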