@@ -200,6 +200,8 @@ enum aarch64_insn_size_type {
enum aarch64_insn_ldst_type {
AARCH64_INSN_LDST_LOAD_REG_OFFSET,
AARCH64_INSN_LDST_STORE_REG_OFFSET,
+ AARCH64_INSN_LDST_LOAD_IMM_OFFSET,
+ AARCH64_INSN_LDST_STORE_IMM_OFFSET,
AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX,
AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX,
AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX,
@@ -334,6 +336,7 @@ __AARCH64_INSN_FUNCS(load_pre, 0x3FE00C00, 0x38400C00)
__AARCH64_INSN_FUNCS(store_post, 0x3FE00C00, 0x38000400)
__AARCH64_INSN_FUNCS(load_post, 0x3FE00C00, 0x38400400)
__AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800)
+__AARCH64_INSN_FUNCS(str_imm, 0x3FC00000, 0x39000000)
__AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0x38200000)
__AARCH64_INSN_FUNCS(ldclr, 0x3F20FC00, 0x38201000)
__AARCH64_INSN_FUNCS(ldeor, 0x3F20FC00, 0x38202000)
@@ -341,6 +344,7 @@ __AARCH64_INSN_FUNCS(ldset, 0x3F20FC00, 0x38203000)
__AARCH64_INSN_FUNCS(swp, 0x3F20FC00, 0x38208000)
__AARCH64_INSN_FUNCS(cas, 0x3FA07C00, 0x08A07C00)
__AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800)
+__AARCH64_INSN_FUNCS(ldr_imm, 0x3FC00000, 0x39400000)
__AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000)
__AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000)
__AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000)
@@ -500,6 +504,11 @@ u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
enum aarch64_insn_register offset,
enum aarch64_insn_size_type size,
enum aarch64_insn_ldst_type type);
+u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg,
+ enum aarch64_insn_register base,
+ unsigned int imm,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_ldst_type type);
u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1,
enum aarch64_insn_register reg2,
enum aarch64_insn_register base,
@@ -299,29 +299,24 @@ static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type,
return insn;
}
+static const u32 aarch64_insn_ldst_size[] = {
+ [AARCH64_INSN_SIZE_8] = 0,
+ [AARCH64_INSN_SIZE_16] = 1,
+ [AARCH64_INSN_SIZE_32] = 2,
+ [AARCH64_INSN_SIZE_64] = 3,
+};
+
static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type,
u32 insn)
{
u32 size;
- switch (type) {
- case AARCH64_INSN_SIZE_8:
- size = 0;
- break;
- case AARCH64_INSN_SIZE_16:
- size = 1;
- break;
- case AARCH64_INSN_SIZE_32:
- size = 2;
- break;
- case AARCH64_INSN_SIZE_64:
- size = 3;
- break;
- default:
+ if (type < AARCH64_INSN_SIZE_8 || type > AARCH64_INSN_SIZE_64) {
pr_err("%s: unknown size encoding %d\n", __func__, type);
return AARCH64_BREAK_FAULT;
}
+ size = aarch64_insn_ldst_size[type];
insn &= ~GENMASK(31, 30);
insn |= size << 30;
@@ -504,6 +499,50 @@ u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
offset);
}
+u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg,
+ enum aarch64_insn_register base,
+ unsigned int imm,
+ enum aarch64_insn_size_type size,
+ enum aarch64_insn_ldst_type type)
+{
+ u32 insn;
+ u32 shift;
+
+ if (size < AARCH64_INSN_SIZE_8 || size > AARCH64_INSN_SIZE_64) {
+ pr_err("%s: unknown size encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ shift = aarch64_insn_ldst_size[size];
+ if (imm & ~(BIT(12 + shift) - BIT(shift))) {
+ pr_err("%s: invalid imm: %d\n", __func__, imm);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ imm >>= shift;
+
+ switch (type) {
+ case AARCH64_INSN_LDST_LOAD_IMM_OFFSET:
+ insn = aarch64_insn_get_ldr_imm_value();
+ break;
+ case AARCH64_INSN_LDST_STORE_IMM_OFFSET:
+ insn = aarch64_insn_get_str_imm_value();
+ break;
+ default:
+ pr_err("%s: unknown load/store encoding %d\n", __func__, type);
+ return AARCH64_BREAK_FAULT;
+ }
+
+ insn = aarch64_insn_encode_ldst_size(size, insn);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg);
+
+ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+ base);
+
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm);
+}
+
u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1,
enum aarch64_insn_register reg2,
enum aarch64_insn_register base,
@@ -66,6 +66,20 @@
#define A64_STR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, STORE)
#define A64_LDR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, LOAD)
+/* Load/store register (immediate offset) */
+#define A64_LS_IMM(Rt, Rn, imm, size, type) \
+ aarch64_insn_gen_load_store_imm(Rt, Rn, imm, \
+ AARCH64_INSN_SIZE_##size, \
+ AARCH64_INSN_LDST_##type##_IMM_OFFSET)
+#define A64_STRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, STORE)
+#define A64_LDRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, LOAD)
+#define A64_STRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, STORE)
+#define A64_LDRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, LOAD)
+#define A64_STR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, STORE)
+#define A64_LDR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, LOAD)
+#define A64_STR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, STORE)
+#define A64_LDR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, LOAD)
+
/* Load/store register pair */
#define A64_LS_PAIR(Rt, Rt2, Rn, offset, ls, type) \
aarch64_insn_gen_load_store_pair(Rt, Rt2, Rn, offset, \
@@ -191,6 +191,47 @@ static bool is_addsub_imm(u32 imm)
return !(imm & ~0xfff) || !(imm & ~0xfff000);
}
+/*
+ * There are 3 types of AArch64 LDR/STR (immediate) instruction:
+ * Post-index, Pre-index, Unsigned offset.
+ *
+ * For BPF ldr/str, the "unsigned offset" type is sufficient.
+ *
+ * "Unsigned offset" type LDR(immediate) format:
+ *
+ * 3 2 1 0
+ * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |x x|1 1 1 0 0 1 0 1| imm12 | Rn | Rt |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * scale
+ *
+ * "Unsigned offset" type STR(immediate) format:
+ * 3 2 1 0
+ * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |x x|1 1 1 0 0 1 0 0| imm12 | Rn | Rt |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * scale
+ *
+ * The offset is calculated from imm12 and scale in the following way:
+ *
+ * offset = (u64)imm12 << scale
+ */
+static inline bool is_lsi_offset(s16 offset, int scale)
+{
+ if (offset < 0)
+ return false;
+
+ if (offset > (0xFFF << scale))
+ return false;
+
+ if (offset & ((1 << scale) - 1))
+ return false;
+
+ return true;
+}
+
/* Tail call offset to jump into */
#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)
#define PROLOGUE_OFFSET 8
@@ -971,19 +1012,38 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
case BPF_LDX | BPF_PROBE_MEM | BPF_H:
case BPF_LDX | BPF_PROBE_MEM | BPF_B:
- emit_a64_mov_i(1, tmp, off, ctx);
switch (BPF_SIZE(code)) {
case BPF_W:
- emit(A64_LDR32(dst, src, tmp), ctx);
+ if (is_lsi_offset(off, 2)) {
+ emit(A64_LDR32I(dst, src, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp, off, ctx);
+ emit(A64_LDR32(dst, src, tmp), ctx);
+ }
break;
case BPF_H:
- emit(A64_LDRH(dst, src, tmp), ctx);
+ if (is_lsi_offset(off, 1)) {
+ emit(A64_LDRHI(dst, src, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp, off, ctx);
+ emit(A64_LDRH(dst, src, tmp), ctx);
+ }
break;
case BPF_B:
- emit(A64_LDRB(dst, src, tmp), ctx);
+ if (is_lsi_offset(off, 0)) {
+ emit(A64_LDRBI(dst, src, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp, off, ctx);
+ emit(A64_LDRB(dst, src, tmp), ctx);
+ }
break;
case BPF_DW:
- emit(A64_LDR64(dst, src, tmp), ctx);
+ if (is_lsi_offset(off, 3)) {
+ emit(A64_LDR64I(dst, src, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp, off, ctx);
+ emit(A64_LDR64(dst, src, tmp), ctx);
+ }
break;
}
@@ -1011,20 +1071,39 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
case BPF_ST | BPF_MEM | BPF_B:
case BPF_ST | BPF_MEM | BPF_DW:
/* Load imm to a register then store it */
- emit_a64_mov_i(1, tmp2, off, ctx);
emit_a64_mov_i(1, tmp, imm, ctx);
switch (BPF_SIZE(code)) {
case BPF_W:
- emit(A64_STR32(tmp, dst, tmp2), ctx);
+ if (is_lsi_offset(off, 2)) {
+ emit(A64_STR32I(tmp, dst, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp2, off, ctx);
+ emit(A64_STR32(tmp, dst, tmp2), ctx);
+ }
break;
case BPF_H:
- emit(A64_STRH(tmp, dst, tmp2), ctx);
+ if (is_lsi_offset(off, 1)) {
+ emit(A64_STRHI(tmp, dst, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp2, off, ctx);
+ emit(A64_STRH(tmp, dst, tmp2), ctx);
+ }
break;
case BPF_B:
- emit(A64_STRB(tmp, dst, tmp2), ctx);
+ if (is_lsi_offset(off, 0)) {
+ emit(A64_STRBI(tmp, dst, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp2, off, ctx);
+ emit(A64_STRB(tmp, dst, tmp2), ctx);
+ }
break;
case BPF_DW:
- emit(A64_STR64(tmp, dst, tmp2), ctx);
+ if (is_lsi_offset(off, 3)) {
+ emit(A64_STR64I(tmp, dst, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp2, off, ctx);
+ emit(A64_STR64(tmp, dst, tmp2), ctx);
+ }
break;
}
break;
@@ -1034,19 +1113,38 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
case BPF_STX | BPF_MEM | BPF_H:
case BPF_STX | BPF_MEM | BPF_B:
case BPF_STX | BPF_MEM | BPF_DW:
- emit_a64_mov_i(1, tmp, off, ctx);
switch (BPF_SIZE(code)) {
case BPF_W:
- emit(A64_STR32(src, dst, tmp), ctx);
+ if (is_lsi_offset(off, 2)) {
+ emit(A64_STR32I(src, dst, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp, off, ctx);
+ emit(A64_STR32(src, dst, tmp), ctx);
+ }
break;
case BPF_H:
- emit(A64_STRH(src, dst, tmp), ctx);
+ if (is_lsi_offset(off, 1)) {
+ emit(A64_STRHI(src, dst, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp, off, ctx);
+ emit(A64_STRH(src, dst, tmp), ctx);
+ }
break;
case BPF_B:
- emit(A64_STRB(src, dst, tmp), ctx);
+ if (is_lsi_offset(off, 0)) {
+ emit(A64_STRBI(src, dst, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp, off, ctx);
+ emit(A64_STRB(src, dst, tmp), ctx);
+ }
break;
case BPF_DW:
- emit(A64_STR64(src, dst, tmp), ctx);
+ if (is_lsi_offset(off, 3)) {
+ emit(A64_STR64I(src, dst, off), ctx);
+ } else {
+ emit_a64_mov_i(1, tmp, off, ctx);
+ emit(A64_STR64(src, dst, tmp), ctx);
+ }
break;
}
break;
The current BPF store/load instruction is translated by the JIT into two instructions. The first instruction moves the immediate offset into a temporary register. The second instruction uses this temporary register to do the real store/load. In fact, arm64 supports addressing with immediate offsets. So This patch introduces optimization that uses arm64 str/ldr instruction with immediate offset when the offset fits. Example of generated instuction for r2 = *(u64 *)(r1 + 0): without optimization: mov x10, 0 ldr x1, [x0, x10] with optimization: ldr x1, [x0, 0] If the offset is negative, or is not aligned correctly, or exceeds max value, rollback to the use of temporary register. Result for test_bpf: # dmesg -D # insmod test_bpf.ko # dmesg | grep Summary test_bpf: Summary: 1009 PASSED, 0 FAILED, [997/997 JIT'ed] test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [8/8 JIT'ed] test_bpf: test_skb_segment: Summary: 2 PASSED, 0 FAILED Signed-off-by: Xu Kuohai <xukuohai@huawei.com> --- v1->v2: 1. Remove macro definition that causes checkpatch to fail 2. Append result to commit message --- arch/arm64/include/asm/insn.h | 9 +++ arch/arm64/lib/insn.c | 67 ++++++++++++++---- arch/arm64/net/bpf_jit.h | 14 ++++ arch/arm64/net/bpf_jit_comp.c | 128 ++++++++++++++++++++++++++++++---- 4 files changed, 189 insertions(+), 29 deletions(-)