diff mbox series

[bpf-next,v4] bpf: Add kernel function call support in 32-bit ARM for EABI

Message ID 20221220115313.29949-1-yangjihong1@huawei.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series [bpf-next,v4] bpf: Add kernel function call support in 32-bit ARM for EABI | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 15 of 15 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 fail Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 fail Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 fail Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-2 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 fail Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix

Commit Message

Yang Jihong Dec. 20, 2022, 11:53 a.m. UTC
This patch adds kernel function call support to 32-bit ARM bpf jit for
EABI.

Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
---

Changes since v3:
  - Submit patches related to the ARM32 architecture separately.

Changes since v2:
  - Remove patches to adjust sk size check for CO_RE in 32-bit arch.
  - Add check of kfunc's return value in insn_def_regno.
  - Adjust is_reg64 for insn_def_regno.
  - The check of CONFIG_AEABI is moved from emit_kfunc_call to
    bpf_jit_supports_kfunc_call.
  - Fix a comment error in fixup_kfunc_call.

 arch/arm/net/bpf_jit_32.c | 137 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)

Comments

Yang Jihong Jan. 6, 2023, 1:22 a.m. UTC | #1
Hello,

PING.

Thanks,
Yang

On 2022/12/20 19:53, Yang Jihong wrote:
> This patch adds kernel function call support to 32-bit ARM bpf jit for
> EABI.
> 
> Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
> ---
> 
> Changes since v3:
>    - Submit patches related to the ARM32 architecture separately.
> 
> Changes since v2:
>    - Remove patches to adjust sk size check for CO_RE in 32-bit arch.
>    - Add check of kfunc's return value in insn_def_regno.
>    - Adjust is_reg64 for insn_def_regno.
>    - The check of CONFIG_AEABI is moved from emit_kfunc_call to
>      bpf_jit_supports_kfunc_call.
>    - Fix a comment error in fixup_kfunc_call.
> 
>   arch/arm/net/bpf_jit_32.c | 137 ++++++++++++++++++++++++++++++++++++++
>   1 file changed, 137 insertions(+)
> 
> diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
> index 6a1c9fca5260..ae3a36d909f4 100644
> --- a/arch/arm/net/bpf_jit_32.c
> +++ b/arch/arm/net/bpf_jit_32.c
> @@ -1337,6 +1337,125 @@ static void build_epilogue(struct jit_ctx *ctx)
>   #endif
>   }
>   
> +/*
> + * Input parameters of function in 32-bit ARM architecture:
> + * The first four word-sized parameters passed to a function will be
> + * transferred in registers R0-R3. Sub-word sized arguments, for example,
> + * char, will still use a whole register.
> + * Arguments larger than a word will be passed in multiple registers.
> + * If more arguments are passed, the fifth and subsequent words will be passed
> + * on the stack.
> + *
> + * The first for args of a function will be considered for
> + * putting into the 32bit register R1, R2, R3 and R4.
> + *
> + * Two 32bit registers are used to pass a 64bit arg.
> + *
> + * For example,
> + * void foo(u32 a, u32 b, u32 c, u32 d, u32 e):
> + *      u32 a: R0
> + *      u32 b: R1
> + *      u32 c: R2
> + *      u32 d: R3
> + *      u32 e: stack
> + *
> + * void foo(u64 a, u32 b, u32 c, u32 d):
> + *      u64 a: R0 (lo32) R1 (hi32)
> + *      u32 b: R2
> + *      u32 c: R3
> + *      u32 d: stack
> + *
> + * void foo(u32 a, u64 b, u32 c, u32 d):
> + *       u32 a: R0
> + *       u64 b: R2 (lo32) R3 (hi32)
> + *       u32 c: stack
> + *       u32 d: stack
> + *
> + * void foo(u32 a, u32 b, u64 c, u32 d):
> + *       u32 a: R0
> + *       u32 b: R1
> + *       u64 c: R2 (lo32) R3 (hi32)
> + *       u32 d: stack
> + *
> + * void foo(u64 a, u64 b):
> + *       u64 a: R0 (lo32) R1 (hi32)
> + *       u64 b: R2 (lo32) R3 (hi32)
> + *
> + * The return value will be stored in the R0 (and R1 for 64bit value).
> + *
> + * For example,
> + * u32 foo(u32 a, u32 b, u32 c):
> + *      return value: R0
> + *
> + * u64 foo(u32 a, u32 b, u32 c):
> + *      return value: R0 (lo32) R1 (hi32)
> + *
> + * The above is for AEABI only, OABI does not support this function.
> + */
> +static int emit_kfunc_call(const struct bpf_insn *insn, struct jit_ctx *ctx, const u32 func)
> +{
> +	int i;
> +	const struct btf_func_model *fm;
> +	const s8 *tmp = bpf2a32[TMP_REG_1];
> +	const u8 arg_regs[] = { ARM_R0, ARM_R1, ARM_R2, ARM_R3 };
> +	int nr_arg_regs = ARRAY_SIZE(arg_regs);
> +	int arg_regs_idx = 0, stack_off = 0;
> +	const s8 *rd;
> +	s8 rt;
> +
> +	fm = bpf_jit_find_kfunc_model(ctx->prog, insn);
> +	if (!fm)
> +		return -EINVAL;
> +
> +	for (i = 0; i < fm->nr_args; i++) {
> +		if (fm->arg_size[i] > sizeof(u32)) {
> +			rd = arm_bpf_get_reg64(bpf2a32[BPF_REG_1 + i], tmp, ctx);
> +
> +			if (arg_regs_idx + 1 < nr_arg_regs) {
> +				/*
> +				 * AAPCS states:
> +				 * A double-word sized type is passed in two
> +				 * consecutive registers (e.g., r0 and r1, or
> +				 * r2 and r3). The content of the registers is
> +				 * as if the value had been loaded from memory
> +				 * representation with a single LDM instruction.
> +				 */
> +				if (arg_regs_idx & 1)
> +					arg_regs_idx++;
> +
> +				emit(ARM_MOV_R(arg_regs[arg_regs_idx++], rd[1]), ctx);
> +				emit(ARM_MOV_R(arg_regs[arg_regs_idx++], rd[0]), ctx);
> +			} else {
> +				stack_off = ALIGN(stack_off, STACK_ALIGNMENT);
> +
> +				if (__LINUX_ARM_ARCH__ >= 6 ||
> +				    ctx->cpu_architecture >= CPU_ARCH_ARMv5TE) {
> +					emit(ARM_STRD_I(rd[1], ARM_SP, stack_off), ctx);
> +				} else {
> +					emit(ARM_STR_I(rd[1], ARM_SP, stack_off), ctx);
> +					emit(ARM_STR_I(rd[0], ARM_SP, stack_off), ctx);
> +				}
> +
> +				stack_off += 8;
> +			}
> +		} else {
> +			rt = arm_bpf_get_reg32(bpf2a32[BPF_REG_1 + i][1], tmp[1], ctx);
> +
> +			if (arg_regs_idx  < nr_arg_regs) {
> +				emit(ARM_MOV_R(arg_regs[arg_regs_idx++], rt), ctx);
> +			} else {
> +				emit(ARM_STR_I(rt, ARM_SP, stack_off), ctx);
> +				stack_off += 4;
> +			}
> +		}
> +	}
> +
> +	emit_a32_mov_i(tmp[1], func, ctx);
> +	emit_blx_r(tmp[1], ctx);
> +
> +	return 0;
> +}
> +
>   /*
>    * Convert an eBPF instruction to native instruction, i.e
>    * JITs an eBPF instruction.
> @@ -1603,6 +1722,10 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
>   	case BPF_LDX | BPF_MEM | BPF_H:
>   	case BPF_LDX | BPF_MEM | BPF_B:
>   	case BPF_LDX | BPF_MEM | BPF_DW:
> +	case BPF_LDX | BPF_PROBE_MEM | BPF_W:
> +	case BPF_LDX | BPF_PROBE_MEM | BPF_H:
> +	case BPF_LDX | BPF_PROBE_MEM | BPF_B:
> +	case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
>   		rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
>   		emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code));
>   		break;
> @@ -1785,6 +1908,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
>   		const s8 *r5 = bpf2a32[BPF_REG_5];
>   		const u32 func = (u32)__bpf_call_base + (u32)imm;
>   
> +		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
> +			int err;
> +
> +			err = emit_kfunc_call(insn, ctx, func);
> +
> +			if (err)
> +				return err;
> +			break;
> +		}
> +
>   		emit_a32_mov_r64(true, r0, r1, ctx);
>   		emit_a32_mov_r64(true, r1, r2, ctx);
>   		emit_push_r64(r5, ctx);
> @@ -2022,3 +2155,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>   	return prog;
>   }
>   
> +bool bpf_jit_supports_kfunc_call(void)
> +{
> +	return IS_ENABLED(CONFIG_AEABI);
> +}
>
Daniel Borkmann Jan. 6, 2023, 2:46 p.m. UTC | #2
On 12/20/22 12:53 PM, Yang Jihong wrote:
> This patch adds kernel function call support to 32-bit ARM bpf jit for
> EABI.
> 
> Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
> ---
> 
> Changes since v3:
>    - Submit patches related to the ARM32 architecture separately.
> 
> Changes since v2:
>    - Remove patches to adjust sk size check for CO_RE in 32-bit arch.
>    - Add check of kfunc's return value in insn_def_regno.
>    - Adjust is_reg64 for insn_def_regno.
>    - The check of CONFIG_AEABI is moved from emit_kfunc_call to
>      bpf_jit_supports_kfunc_call.
>    - Fix a comment error in fixup_kfunc_call.
> 
>   arch/arm/net/bpf_jit_32.c | 137 ++++++++++++++++++++++++++++++++++++++
>   1 file changed, 137 insertions(+)
> 
> diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
> index 6a1c9fca5260..ae3a36d909f4 100644
> --- a/arch/arm/net/bpf_jit_32.c
> +++ b/arch/arm/net/bpf_jit_32.c
> @@ -1337,6 +1337,125 @@ static void build_epilogue(struct jit_ctx *ctx)
>   #endif
>   }
>   
> +/*
> + * Input parameters of function in 32-bit ARM architecture:
> + * The first four word-sized parameters passed to a function will be
> + * transferred in registers R0-R3. Sub-word sized arguments, for example,
> + * char, will still use a whole register.
> + * Arguments larger than a word will be passed in multiple registers.
> + * If more arguments are passed, the fifth and subsequent words will be passed
> + * on the stack.
> + *
> + * The first for args of a function will be considered for
> + * putting into the 32bit register R1, R2, R3 and R4.
> + *
> + * Two 32bit registers are used to pass a 64bit arg.
> + *
> + * For example,
> + * void foo(u32 a, u32 b, u32 c, u32 d, u32 e):
> + *      u32 a: R0
> + *      u32 b: R1
> + *      u32 c: R2
> + *      u32 d: R3
> + *      u32 e: stack
> + *
> + * void foo(u64 a, u32 b, u32 c, u32 d):
> + *      u64 a: R0 (lo32) R1 (hi32)
> + *      u32 b: R2
> + *      u32 c: R3
> + *      u32 d: stack
> + *
> + * void foo(u32 a, u64 b, u32 c, u32 d):
> + *       u32 a: R0
> + *       u64 b: R2 (lo32) R3 (hi32)
> + *       u32 c: stack
> + *       u32 d: stack
> + *
> + * void foo(u32 a, u32 b, u64 c, u32 d):
> + *       u32 a: R0
> + *       u32 b: R1
> + *       u64 c: R2 (lo32) R3 (hi32)
> + *       u32 d: stack
> + *
> + * void foo(u64 a, u64 b):
> + *       u64 a: R0 (lo32) R1 (hi32)
> + *       u64 b: R2 (lo32) R3 (hi32)
> + *
> + * The return value will be stored in the R0 (and R1 for 64bit value).
> + *
> + * For example,
> + * u32 foo(u32 a, u32 b, u32 c):
> + *      return value: R0
> + *
> + * u64 foo(u32 a, u32 b, u32 c):
> + *      return value: R0 (lo32) R1 (hi32)
> + *
> + * The above is for AEABI only, OABI does not support this function.
> + */
> +static int emit_kfunc_call(const struct bpf_insn *insn, struct jit_ctx *ctx, const u32 func)
> +{
> +	int i;
> +	const struct btf_func_model *fm;
> +	const s8 *tmp = bpf2a32[TMP_REG_1];
> +	const u8 arg_regs[] = { ARM_R0, ARM_R1, ARM_R2, ARM_R3 };
> +	int nr_arg_regs = ARRAY_SIZE(arg_regs);
> +	int arg_regs_idx = 0, stack_off = 0;
> +	const s8 *rd;
> +	s8 rt;
> +
> +	fm = bpf_jit_find_kfunc_model(ctx->prog, insn);
> +	if (!fm)
> +		return -EINVAL;
> +
> +	for (i = 0; i < fm->nr_args; i++) {
> +		if (fm->arg_size[i] > sizeof(u32)) {
> +			rd = arm_bpf_get_reg64(bpf2a32[BPF_REG_1 + i], tmp, ctx);
> +
> +			if (arg_regs_idx + 1 < nr_arg_regs) {
> +				/*
> +				 * AAPCS states:
> +				 * A double-word sized type is passed in two
> +				 * consecutive registers (e.g., r0 and r1, or
> +				 * r2 and r3). The content of the registers is
> +				 * as if the value had been loaded from memory
> +				 * representation with a single LDM instruction.
> +				 */
> +				if (arg_regs_idx & 1)
> +					arg_regs_idx++;
> +
> +				emit(ARM_MOV_R(arg_regs[arg_regs_idx++], rd[1]), ctx);
> +				emit(ARM_MOV_R(arg_regs[arg_regs_idx++], rd[0]), ctx);
> +			} else {
> +				stack_off = ALIGN(stack_off, STACK_ALIGNMENT);
> +
> +				if (__LINUX_ARM_ARCH__ >= 6 ||
> +				    ctx->cpu_architecture >= CPU_ARCH_ARMv5TE) {
> +					emit(ARM_STRD_I(rd[1], ARM_SP, stack_off), ctx);
> +				} else {
> +					emit(ARM_STR_I(rd[1], ARM_SP, stack_off), ctx);
> +					emit(ARM_STR_I(rd[0], ARM_SP, stack_off), ctx);
> +				}
> +
> +				stack_off += 8;
> +			}
> +		} else {
> +			rt = arm_bpf_get_reg32(bpf2a32[BPF_REG_1 + i][1], tmp[1], ctx);
> +
> +			if (arg_regs_idx  < nr_arg_regs) {
> +				emit(ARM_MOV_R(arg_regs[arg_regs_idx++], rt), ctx);
> +			} else {
> +				emit(ARM_STR_I(rt, ARM_SP, stack_off), ctx);
> +				stack_off += 4;
> +			}
> +		}
> +	}
> +
> +	emit_a32_mov_i(tmp[1], func, ctx);
> +	emit_blx_r(tmp[1], ctx);
> +
> +	return 0;
> +}
> +
>   /*
>    * Convert an eBPF instruction to native instruction, i.e
>    * JITs an eBPF instruction.
> @@ -1603,6 +1722,10 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
>   	case BPF_LDX | BPF_MEM | BPF_H:
>   	case BPF_LDX | BPF_MEM | BPF_B:
>   	case BPF_LDX | BPF_MEM | BPF_DW:
> +	case BPF_LDX | BPF_PROBE_MEM | BPF_W:
> +	case BPF_LDX | BPF_PROBE_MEM | BPF_H:
> +	case BPF_LDX | BPF_PROBE_MEM | BPF_B:
> +	case BPF_LDX | BPF_PROBE_MEM | BPF_DW:

This doesn't look right, why is this part of the patch? It's not kfunc related
and if you plan to add support for ldx_probe_mem then it should be separated from
this set. Check out 800834285361 ("bpf, arm64: Add BPF exception tables"), why is
this not needed for arm32?

>   		rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
>   		emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code));
>   		break;
> @@ -1785,6 +1908,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
>   		const s8 *r5 = bpf2a32[BPF_REG_5];
>   		const u32 func = (u32)__bpf_call_base + (u32)imm;
>   
> +		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
> +			int err;
> +
> +			err = emit_kfunc_call(insn, ctx, func);
> +
> +			if (err)
> +				return err;
> +			break;
> +		}
> +
>   		emit_a32_mov_r64(true, r0, r1, ctx);
>   		emit_a32_mov_r64(true, r1, r2, ctx);
>   		emit_push_r64(r5, ctx);
> @@ -2022,3 +2155,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>   	return prog;
>   }
>   
> +bool bpf_jit_supports_kfunc_call(void)
> +{
> +	return IS_ENABLED(CONFIG_AEABI);
> +}
>
diff mbox series

Patch

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 6a1c9fca5260..ae3a36d909f4 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -1337,6 +1337,125 @@  static void build_epilogue(struct jit_ctx *ctx)
 #endif
 }
 
+/*
+ * Input parameters of function in 32-bit ARM architecture:
+ * The first four word-sized parameters passed to a function will be
+ * transferred in registers R0-R3. Sub-word sized arguments, for example,
+ * char, will still use a whole register.
+ * Arguments larger than a word will be passed in multiple registers.
+ * If more arguments are passed, the fifth and subsequent words will be passed
+ * on the stack.
+ *
+ * The first four argument words of a function are considered for
+ * the 32bit registers R0, R1, R2 and R3.
+ *
+ * Two 32bit registers are used to pass a 64bit arg.
+ *
+ * For example,
+ * void foo(u32 a, u32 b, u32 c, u32 d, u32 e):
+ *      u32 a: R0
+ *      u32 b: R1
+ *      u32 c: R2
+ *      u32 d: R3
+ *      u32 e: stack
+ *
+ * void foo(u64 a, u32 b, u32 c, u32 d):
+ *      u64 a: R0 (lo32) R1 (hi32)
+ *      u32 b: R2
+ *      u32 c: R3
+ *      u32 d: stack
+ *
+ * void foo(u32 a, u64 b, u32 c, u32 d):
+ *       u32 a: R0
+ *       u64 b: R2 (lo32) R3 (hi32)
+ *       u32 c: stack
+ *       u32 d: stack
+ *
+ * void foo(u32 a, u32 b, u64 c, u32 d):
+ *       u32 a: R0
+ *       u32 b: R1
+ *       u64 c: R2 (lo32) R3 (hi32)
+ *       u32 d: stack
+ *
+ * void foo(u64 a, u64 b):
+ *       u64 a: R0 (lo32) R1 (hi32)
+ *       u64 b: R2 (lo32) R3 (hi32)
+ *
+ * The return value will be stored in the R0 (and R1 for 64bit value).
+ *
+ * For example,
+ * u32 foo(u32 a, u32 b, u32 c):
+ *      return value: R0
+ *
+ * u64 foo(u32 a, u32 b, u32 c):
+ *      return value: R0 (lo32) R1 (hi32)
+ *
+ * The above is for AEABI only, OABI does not support this function.
+ */
+static int emit_kfunc_call(const struct bpf_insn *insn, struct jit_ctx *ctx, const u32 func)
+{
+	int i;
+	const struct btf_func_model *fm;
+	const s8 *tmp = bpf2a32[TMP_REG_1];
+	const u8 arg_regs[] = { ARM_R0, ARM_R1, ARM_R2, ARM_R3 };
+	int nr_arg_regs = ARRAY_SIZE(arg_regs);
+	int arg_regs_idx = 0, stack_off = 0;
+	const s8 *rd;
+	s8 rt;
+
+	fm = bpf_jit_find_kfunc_model(ctx->prog, insn);
+	if (!fm)
+		return -EINVAL;
+
+	/* NOTE(review): SP is not moved here; confirm the arg slots are free */
+	for (i = 0; i < fm->nr_args; i++) {
+		if (fm->arg_size[i] > sizeof(u32)) {
+			rd = arm_bpf_get_reg64(bpf2a32[BPF_REG_1 + i], tmp, ctx);
+			if (arg_regs_idx + 1 < nr_arg_regs) {
+				/*
+				 * AAPCS: a double-word argument occupies an
+				 * even/odd register pair (r0/r1 or r2/r3), so
+				 * skip an odd next register.
+				 */
+				if (arg_regs_idx & 1)
+					arg_regs_idx++;
+
+				emit(ARM_MOV_R(arg_regs[arg_regs_idx++], rd[1]), ctx);
+				emit(ARM_MOV_R(arg_regs[arg_regs_idx++], rd[0]), ctx);
+			} else {
+				/*
+				 * AAPCS: once an argument is placed on the
+				 * stack, no later argument may use r0-r3.
+				 */
+				arg_regs_idx = nr_arg_regs;
+				stack_off = ALIGN(stack_off, STACK_ALIGNMENT);
+				if (__LINUX_ARM_ARCH__ >= 6 ||
+				    ctx->cpu_architecture >= CPU_ARCH_ARMv5TE) {
+					emit(ARM_STRD_I(rd[1], ARM_SP, stack_off), ctx);
+				} else {
+					/* rd[1] first, rd[0] in the next word */
+					emit(ARM_STR_I(rd[1], ARM_SP, stack_off), ctx);
+					emit(ARM_STR_I(rd[0], ARM_SP, stack_off + 4), ctx);
+				}
+				stack_off += 8;
+			}
+		} else {
+			rt = arm_bpf_get_reg32(bpf2a32[BPF_REG_1 + i][1], tmp[1], ctx);
+			if (arg_regs_idx < nr_arg_regs) {
+				emit(ARM_MOV_R(arg_regs[arg_regs_idx++], rt), ctx);
+			} else {
+				emit(ARM_STR_I(rt, ARM_SP, stack_off), ctx);
+				stack_off += 4;
+			}
+		}
+	}
+
+	emit_a32_mov_i(tmp[1], func, ctx);
+	emit_blx_r(tmp[1], ctx);
+
+	return 0;
+}
+
 /*
  * Convert an eBPF instruction to native instruction, i.e
  * JITs an eBPF instruction.
@@ -1603,6 +1722,10 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	case BPF_LDX | BPF_MEM | BPF_H:
 	case BPF_LDX | BPF_MEM | BPF_B:
 	case BPF_LDX | BPF_MEM | BPF_DW:
+	case BPF_LDX | BPF_PROBE_MEM | BPF_W:
+	case BPF_LDX | BPF_PROBE_MEM | BPF_H:
+	case BPF_LDX | BPF_PROBE_MEM | BPF_B:
+	case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
 		rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
 		emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code));
 		break;
@@ -1785,6 +1908,16 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 		const s8 *r5 = bpf2a32[BPF_REG_5];
 		const u32 func = (u32)__bpf_call_base + (u32)imm;
 
+		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+			int err;
+
+			err = emit_kfunc_call(insn, ctx, func);
+
+			if (err)
+				return err;
+			break;
+		}
+
 		emit_a32_mov_r64(true, r0, r1, ctx);
 		emit_a32_mov_r64(true, r1, r2, ctx);
 		emit_push_r64(r5, ctx);
@@ -2022,3 +2155,7 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	return prog;
 }
 
+bool bpf_jit_supports_kfunc_call(void)
+{
+	return IS_ENABLED(CONFIG_AEABI); /* kfunc arg passing is AEABI-only */
+}