diff mbox series

bpf, arm64: inline bpf_get_current_task/_btf() helpers

Message ID 20240619131334.4297-1-puranjay@kernel.org (mailing list archive)
State New, archived
Headers show
Series bpf, arm64: inline bpf_get_current_task/_btf() helpers | expand

Commit Message

Puranjay Mohan June 19, 2024, 1:13 p.m. UTC
On ARM64, the pointer to task_struct is always available in the sp_el0
register and therefore the calls to bpf_get_current_task() and
bpf_get_current_task_btf() can be inlined into a single MRS instruction.

Here is the difference before and after this change:

Before:

; struct task_struct *task = bpf_get_current_task_btf();
  54:   mov     x10, #0xffffffffffff7978        // #-34440
  58:   movk    x10, #0x802b, lsl #16
  5c:   movk    x10, #0x8000, lsl #32
  60:   blr     x10          -------------->    0xffff8000802b7978 <+0>:     mrs     x0, sp_el0
  64:   add     x7, x0, #0x0 <--------------    0xffff8000802b797c <+4>:     ret

After:

; struct task_struct *task = bpf_get_current_task_btf();
  54:   mrs     x7, sp_el0

This shows around 1% performance improvement in artificial microbenchmark.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c | 9 +++++++++
 1 file changed, 9 insertions(+)

Comments

Xu Kuohai June 20, 2024, 2:41 a.m. UTC | #1
On 6/19/2024 9:13 PM, Puranjay Mohan wrote:
> On ARM64, the pointer to task_struct is always available in the sp_el0
> register and therefore the calls to bpf_get_current_task() and
> bpf_get_current_task_btf() can be inlined into a single MRS instruction.
> 
> Here is the difference before and after this change:
> 
> Before:
> 
> ; struct task_struct *task = bpf_get_current_task_btf();
>    54:   mov     x10, #0xffffffffffff7978        // #-34440
>    58:   movk    x10, #0x802b, lsl #16
>    5c:   movk    x10, #0x8000, lsl #32
>    60:   blr     x10          -------------->    0xffff8000802b7978 <+0>:     mrs     x0, sp_el0
>    64:   add     x7, x0, #0x0 <--------------    0xffff8000802b797c <+4>:     ret
> 
> After:
> 
> ; struct task_struct *task = bpf_get_current_task_btf();
>    54:   mrs     x7, sp_el0
> 
> This shows around 1% performance improvement in artificial microbenchmark.
>

I think it would be better if more detailed data could be provided.

> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
>   arch/arm64/net/bpf_jit_comp.c | 9 +++++++++
>   1 file changed, 9 insertions(+)
> 
> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 720336d28856..b838dab3bd26 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -1244,6 +1244,13 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
>   			break;
>   		}
>   
> +		/* Implement helper call to bpf_get_current_task/_btf() inline */
> +		if (insn->src_reg == 0 && (insn->imm == BPF_FUNC_get_current_task ||
> +					   insn->imm == BPF_FUNC_get_current_task_btf)) {
> +			emit(A64_MRS_SP_EL0(r0), ctx);
> +			break;
> +		}
> +
>   		ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass,
>   					    &func_addr, &func_addr_fixed);
>   		if (ret < 0)
> @@ -2581,6 +2588,8 @@ bool bpf_jit_inlines_helper_call(s32 imm)
>   {
>   	switch (imm) {
>   	case BPF_FUNC_get_smp_processor_id:
> +	case BPF_FUNC_get_current_task:
> +	case BPF_FUNC_get_current_task_btf:
>   		return true;
>   	default:
>   		return false;

Acked-by: Xu Kuohai <xukuohai@huawei.com>
Andrii Nakryiko June 20, 2024, 7:07 p.m. UTC | #2
On Wed, Jun 19, 2024 at 6:25 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> On ARM64, the pointer to task_struct is always available in the sp_el0
> register and therefore the calls to bpf_get_current_task() and
> bpf_get_current_task_btf() can be inlined into a single MRS instruction.
>
> Here is the difference before and after this change:
>
> Before:
>
> ; struct task_struct *task = bpf_get_current_task_btf();
>   54:   mov     x10, #0xffffffffffff7978        // #-34440
>   58:   movk    x10, #0x802b, lsl #16
>   5c:   movk    x10, #0x8000, lsl #32
>   60:   blr     x10          -------------->    0xffff8000802b7978 <+0>:     mrs     x0, sp_el0
>   64:   add     x7, x0, #0x0 <--------------    0xffff8000802b797c <+4>:     ret
>
> After:
>
> ; struct task_struct *task = bpf_get_current_task_btf();
>   54:   mrs     x7, sp_el0
>
> This shows around 1% performance improvement in artificial microbenchmark.
>
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
>  arch/arm64/net/bpf_jit_comp.c | 9 +++++++++
>  1 file changed, 9 insertions(+)
>

These are frequently used helpers, similarly to
get_smp_processor_id(), so I think it makes sense to optimize them.

Acked-by: Andrii Nakryiko <andrii@kernel.org>

> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 720336d28856..b838dab3bd26 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -1244,6 +1244,13 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
>                         break;
>                 }
>
> +               /* Implement helper call to bpf_get_current_task/_btf() inline */
> +               if (insn->src_reg == 0 && (insn->imm == BPF_FUNC_get_current_task ||
> +                                          insn->imm == BPF_FUNC_get_current_task_btf)) {
> +                       emit(A64_MRS_SP_EL0(r0), ctx);
> +                       break;
> +               }
> +
>                 ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass,
>                                             &func_addr, &func_addr_fixed);
>                 if (ret < 0)
> @@ -2581,6 +2588,8 @@ bool bpf_jit_inlines_helper_call(s32 imm)
>  {
>         switch (imm) {
>         case BPF_FUNC_get_smp_processor_id:
> +       case BPF_FUNC_get_current_task:
> +       case BPF_FUNC_get_current_task_btf:
>                 return true;
>         default:
>                 return false;
> --
> 2.40.1
>
patchwork-bot+netdevbpf@kernel.org June 21, 2024, 9:30 p.m. UTC | #3
Hello:

This patch was applied to bpf/bpf-next.git (master)
by Andrii Nakryiko <andrii@kernel.org>:

On Wed, 19 Jun 2024 13:13:34 +0000 you wrote:
> On ARM64, the pointer to task_struct is always available in the sp_el0
> register and therefore the calls to bpf_get_current_task() and
> bpf_get_current_task_btf() can be inlined into a single MRS instruction.
> 
> Here is the difference before and after this change:
> 
> Before:
> 
> [...]

Here is the summary with links:
  - bpf, arm64: inline bpf_get_current_task/_btf() helpers
    https://git.kernel.org/bpf/bpf-next/c/2bb138cb20a6

You are awesome, thank you!
diff mbox series

Patch

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 720336d28856..b838dab3bd26 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1244,6 +1244,13 @@  static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 			break;
 		}
 
+		/* Implement helper call to bpf_get_current_task/_btf() inline */
+		if (insn->src_reg == 0 && (insn->imm == BPF_FUNC_get_current_task ||
+					   insn->imm == BPF_FUNC_get_current_task_btf)) {
+			emit(A64_MRS_SP_EL0(r0), ctx);
+			break;
+		}
+
 		ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass,
 					    &func_addr, &func_addr_fixed);
 		if (ret < 0)
@@ -2581,6 +2588,8 @@  bool bpf_jit_inlines_helper_call(s32 imm)
 {
 	switch (imm) {
 	case BPF_FUNC_get_smp_processor_id:
+	case BPF_FUNC_get_current_task:
+	case BPF_FUNC_get_current_task_btf:
 		return true;
 	default:
 		return false;