| Message ID | 20230919035839.3297328-7-pulehui@huaweicloud.com (mailing list archive) |
|---|---|
| State | Handled Elsewhere, archived |
| Series | Zbb support and code simplification for RV64 JIT |
| Context | Check | Description |
|---|---|---|
| conchuod/cover_letter | success | Series has a cover letter |
| conchuod/tree_selection | success | Guessed tree name to be for-next at HEAD 0bb80ecc33a8 |
| conchuod/fixes_present | success | Fixes tag not required for -next series |
| conchuod/maintainers_pattern | success | MAINTAINERS pattern errors before the patch: 5 and now 5 |
| conchuod/verify_signedoff | success | Signed-off-by tag matches author and committer |
| conchuod/kdoc | success | Errors and warnings before: 0 this patch: 0 |
| conchuod/build_rv64_clang_allmodconfig | success | Errors and warnings before: 9 this patch: 9 |
| conchuod/module_param | success | Was 0 now: 0 |
| conchuod/build_rv64_gcc_allmodconfig | success | Errors and warnings before: 9 this patch: 9 |
| conchuod/build_rv32_defconfig | success | Build OK |
| conchuod/dtb_warn_rv64 | success | Errors and warnings before: 25 this patch: 25 |
| conchuod/header_inline | success | No static functions without inline keyword in header files |
| conchuod/checkpatch | success | total: 0 errors, 0 warnings, 0 checks, 143 lines checked |
| conchuod/build_rv64_nommu_k210_defconfig | success | Build OK |
| conchuod/verify_fixes | success | No Fixes tag |
| conchuod/build_rv64_nommu_virt_defconfig | success | Build OK |
Pu Lehui <pulehui@huaweicloud.com> writes:

> From: Pu Lehui <pulehui@huawei.com>
>
> Optimize bswap instructions by rev8 Zbb instruction combined with srli
> instruction. And optimize 16-bit zero-extension with Zbb support.
>
> Signed-off-by: Pu Lehui <pulehui@huawei.com>
> ---
>  arch/riscv/net/bpf_jit.h        | 67 +++++++++++++++++++++++++++++++++
>  arch/riscv/net/bpf_jit_comp64.c | 50 +-----------------------
>  2 files changed, 69 insertions(+), 48 deletions(-)
>
> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
> index 944bdd6e4..a04eed672 100644
> --- a/arch/riscv/net/bpf_jit.h
> +++ b/arch/riscv/net/bpf_jit.h
> @@ -1135,12 +1135,79 @@ static inline void emit_sextw(u8 rd, u8 rs, struct rv_jit_context *ctx)
>      emit_addiw(rd, rs, 0, ctx);
>  }
>
> +static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx)
> +{
> +    if (rvzbb_enabled()) {
> +        emit(rvzbb_zexth(rd, rs), ctx);
> +    } else {
> +        emit_slli(rd, rs, 48, ctx);
> +        emit_srli(rd, rd, 48, ctx);
> +    }
> +}
> +

Prefer early-exit.

>  static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx)
>  {
>      emit_slli(rd, rs, 32, ctx);
>      emit_srli(rd, rd, 32, ctx);
>  }
>
> +static inline void emit_bswap(u8 rd, s32 imm, struct rv_jit_context *ctx)
> +{
> +    if (rvzbb_enabled()) {
> +        int bits = 64 - imm;
> +
> +        emit(rvzbb_rev8(rd, rd), ctx);
> +        if (bits)
> +            emit_srli(rd, rd, bits, ctx);
> +    } else {
> +        emit_li(RV_REG_T2, 0, ctx);
> +
> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +        emit_srli(rd, rd, 8, ctx);
> +        if (imm == 16)
> +            goto out_be;
> +
> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +        emit_srli(rd, rd, 8, ctx);
> +
> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +        emit_srli(rd, rd, 8, ctx);
> +        if (imm == 32)
> +            goto out_be;
> +
> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +        emit_srli(rd, rd, 8, ctx);
> +
> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +        emit_srli(rd, rd, 8, ctx);
> +
> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +        emit_srli(rd, rd, 8, ctx);
> +
> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
> +        emit_srli(rd, rd, 8, ctx);
> +out_be:
> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
> +
> +        emit_mv(rd, RV_REG_T2, ctx);
> +    }
> +}

Definitely early-exit for this one!

This function really showcases why Zbb is nice! ;-)

I'll take the next rev of the series for a test!


Björn
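For readers skimming the thread, here is a minimal sketch of the early-exit shape Björn is asking for, applied to emit_zexth. The helper names are taken from the patch above; this illustrates the style only and is not the posted v3 code.

```c
/* Early-exit variant: the Zbb path returns as soon as it has emitted
 * its single instruction, so the non-Zbb fallback needs no else branch
 * or extra indentation. Helper names are from the patch under review. */
static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx)
{
	if (rvzbb_enabled()) {
		emit(rvzbb_zexth(rd, rs), ctx);
		return;
	}

	/* Fallback: shift out and back to clear the upper 48 bits. */
	emit_slli(rd, rs, 48, ctx);
	emit_srli(rd, rd, 48, ctx);
}
```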
On 2023/9/28 19:08, Björn Töpel wrote:
> Pu Lehui <pulehui@huaweicloud.com> writes:
>
>> From: Pu Lehui <pulehui@huawei.com>
>>
>> Optimize bswap instructions by rev8 Zbb instruction combined with srli
>> instruction. And optimize 16-bit zero-extension with Zbb support.
>>
>> Signed-off-by: Pu Lehui <pulehui@huawei.com>
>> ---
>>  arch/riscv/net/bpf_jit.h        | 67 +++++++++++++++++++++++++++++++++
>>  arch/riscv/net/bpf_jit_comp64.c | 50 +-----------------------
>>  2 files changed, 69 insertions(+), 48 deletions(-)
>>
>> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
>> index 944bdd6e4..a04eed672 100644
>> --- a/arch/riscv/net/bpf_jit.h
>> +++ b/arch/riscv/net/bpf_jit.h
>> @@ -1135,12 +1135,79 @@ static inline void emit_sextw(u8 rd, u8 rs, struct rv_jit_context *ctx)
>>      emit_addiw(rd, rs, 0, ctx);
>>  }
>>
>> +static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx)
>> +{
>> +    if (rvzbb_enabled()) {
>> +        emit(rvzbb_zexth(rd, rs), ctx);
>> +    } else {
>> +        emit_slli(rd, rs, 48, ctx);
>> +        emit_srli(rd, rd, 48, ctx);
>> +    }
>> +}
>> +
>
> Prefer early-exit.
>
>>  static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx)
>>  {
>>      emit_slli(rd, rs, 32, ctx);
>>      emit_srli(rd, rd, 32, ctx);
>>  }
>>
>> +static inline void emit_bswap(u8 rd, s32 imm, struct rv_jit_context *ctx)
>> +{
>> +    if (rvzbb_enabled()) {
>> +        int bits = 64 - imm;
>> +
>> +        emit(rvzbb_rev8(rd, rd), ctx);
>> +        if (bits)
>> +            emit_srli(rd, rd, bits, ctx);
>> +    } else {
>> +        emit_li(RV_REG_T2, 0, ctx);
>> +
>> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
>> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
>> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
>> +        emit_srli(rd, rd, 8, ctx);
>> +        if (imm == 16)
>> +            goto out_be;
>> +
>> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
>> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
>> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
>> +        emit_srli(rd, rd, 8, ctx);
>> +
>> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
>> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
>> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
>> +        emit_srli(rd, rd, 8, ctx);
>> +        if (imm == 32)
>> +            goto out_be;
>> +
>> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
>> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
>> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
>> +        emit_srli(rd, rd, 8, ctx);
>> +
>> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
>> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
>> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
>> +        emit_srli(rd, rd, 8, ctx);
>> +
>> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
>> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
>> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
>> +        emit_srli(rd, rd, 8, ctx);
>> +
>> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
>> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
>> +        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
>> +        emit_srli(rd, rd, 8, ctx);
>> +out_be:
>> +        emit_andi(RV_REG_T1, rd, 0xff, ctx);
>> +        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
>> +
>> +        emit_mv(rd, RV_REG_T2, ctx);
>> +    }
>> +}
>
> Definitely early-exit for this one!
>
> This function really showcases why Zbb is nice! ;-)
>
> I'll take the next rev of the series for a test!
>

Okay, the relevant modifications will be presented in v3 and will be sent soon.

> Björn
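The v3 Pu refers to would presumably apply the same restructuring to emit_bswap. A hedged sketch of what its Zbb path could look like with an early exit (again reusing the patch's helper names; this anticipates v3 and is not the posted code):

```c
/* Sketch of an early-exit emit_bswap: rev8 byte-reverses the full
 * 64-bit register, then srli drops the (64 - imm) low-order bits that
 * rev8 moved up, leaving the swapped low imm bits. For imm == 64 no
 * shift is needed (bits == 0). The long non-Zbb fallback from the
 * patch would follow at this indentation level instead of sitting in
 * an else branch. */
static inline void emit_bswap(u8 rd, s32 imm, struct rv_jit_context *ctx)
{
	if (rvzbb_enabled()) {
		int bits = 64 - imm;

		emit(rvzbb_rev8(rd, rd), ctx);
		if (bits)
			emit_srli(rd, rd, bits, ctx);
		return;
	}

	/* ... non-Zbb byte-by-byte fallback from the patch ... */
}
```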
diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
index 944bdd6e4..a04eed672 100644
--- a/arch/riscv/net/bpf_jit.h
+++ b/arch/riscv/net/bpf_jit.h
@@ -1135,12 +1135,79 @@ static inline void emit_sextw(u8 rd, u8 rs, struct rv_jit_context *ctx)
     emit_addiw(rd, rs, 0, ctx);
 }
 
+static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx)
+{
+    if (rvzbb_enabled()) {
+        emit(rvzbb_zexth(rd, rs), ctx);
+    } else {
+        emit_slli(rd, rs, 48, ctx);
+        emit_srli(rd, rd, 48, ctx);
+    }
+}
+
 static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx)
 {
     emit_slli(rd, rs, 32, ctx);
     emit_srli(rd, rd, 32, ctx);
 }
 
+static inline void emit_bswap(u8 rd, s32 imm, struct rv_jit_context *ctx)
+{
+    if (rvzbb_enabled()) {
+        int bits = 64 - imm;
+
+        emit(rvzbb_rev8(rd, rd), ctx);
+        if (bits)
+            emit_srli(rd, rd, bits, ctx);
+    } else {
+        emit_li(RV_REG_T2, 0, ctx);
+
+        emit_andi(RV_REG_T1, rd, 0xff, ctx);
+        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+        emit_srli(rd, rd, 8, ctx);
+        if (imm == 16)
+            goto out_be;
+
+        emit_andi(RV_REG_T1, rd, 0xff, ctx);
+        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+        emit_srli(rd, rd, 8, ctx);
+
+        emit_andi(RV_REG_T1, rd, 0xff, ctx);
+        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+        emit_srli(rd, rd, 8, ctx);
+        if (imm == 32)
+            goto out_be;
+
+        emit_andi(RV_REG_T1, rd, 0xff, ctx);
+        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+        emit_srli(rd, rd, 8, ctx);
+
+        emit_andi(RV_REG_T1, rd, 0xff, ctx);
+        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+        emit_srli(rd, rd, 8, ctx);
+
+        emit_andi(RV_REG_T1, rd, 0xff, ctx);
+        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+        emit_srli(rd, rd, 8, ctx);
+
+        emit_andi(RV_REG_T1, rd, 0xff, ctx);
+        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+        emit_srli(rd, rd, 8, ctx);
+out_be:
+        emit_andi(RV_REG_T1, rd, 0xff, ctx);
+        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+
+        emit_mv(rd, RV_REG_T2, ctx);
+    }
+}
+
 #endif /* __riscv_xlen == 64 */
 
 void bpf_jit_build_prologue(struct rv_jit_context *ctx);
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index f4ca6b787..35753b142 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -1130,8 +1130,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
     case BPF_ALU | BPF_END | BPF_FROM_LE:
         switch (imm) {
         case 16:
-            emit_slli(rd, rd, 48, ctx);
-            emit_srli(rd, rd, 48, ctx);
+            emit_zexth(rd, rd, ctx);
             break;
         case 32:
             if (!aux->verifier_zext)
@@ -1142,54 +1141,9 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
             break;
         }
         break;
-
     case BPF_ALU | BPF_END | BPF_FROM_BE:
     case BPF_ALU64 | BPF_END | BPF_FROM_LE:
-        emit_li(RV_REG_T2, 0, ctx);
-
-        emit_andi(RV_REG_T1, rd, 0xff, ctx);
-        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
-        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
-        emit_srli(rd, rd, 8, ctx);
-        if (imm == 16)
-            goto out_be;
-
-        emit_andi(RV_REG_T1, rd, 0xff, ctx);
-        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
-        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
-        emit_srli(rd, rd, 8, ctx);
-
-        emit_andi(RV_REG_T1, rd, 0xff, ctx);
-        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
-        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
-        emit_srli(rd, rd, 8, ctx);
-        if (imm == 32)
-            goto out_be;
-
-        emit_andi(RV_REG_T1, rd, 0xff, ctx);
-        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
-        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
-        emit_srli(rd, rd, 8, ctx);
-
-        emit_andi(RV_REG_T1, rd, 0xff, ctx);
-        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
-        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
-        emit_srli(rd, rd, 8, ctx);
-
-        emit_andi(RV_REG_T1, rd, 0xff, ctx);
-        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
-        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
-        emit_srli(rd, rd, 8, ctx);
-
-        emit_andi(RV_REG_T1, rd, 0xff, ctx);
-        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
-        emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
-        emit_srli(rd, rd, 8, ctx);
-out_be:
-        emit_andi(RV_REG_T1, rd, 0xff, ctx);
-        emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
-
-        emit_mv(rd, RV_REG_T2, ctx);
+        emit_bswap(rd, imm, ctx);
         break;
 
     /* dst = imm */
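As a sanity check on the Zbb sequence above, here is a plain-C model (an illustration only, not kernel or JIT code) of what rev8 followed by srli computes for BPF_FROM_BE with imm of 16, 32, or 64:

```c
#include <stdint.h>

/* Model of the emitted pair: rev8 reverses all eight bytes of rd, and
 * srli by (64 - imm) keeps only the byte-swapped low imm bits. For
 * imm == 64 the JIT emits no shift at all (bits == 0). */
static uint64_t bswap_model(uint64_t rd, int imm)
{
	uint64_t rev = 0;
	int i;

	for (i = 0; i < 8; i++)	/* rev8: byte 0 <-> 7, 1 <-> 6, ... */
		rev |= ((rd >> (i * 8)) & 0xff) << ((7 - i) * 8);

	return imm < 64 ? rev >> (64 - imm) : rev;	/* srli rd, rd, 64 - imm */
}

/* e.g. bswap_model(0x1122334455667788, 16) == 0x8877 and
 *      bswap_model(0x1122334455667788, 32) == 0x88776655,
 * matching the byte-by-byte fallback's out_be results. */
```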