
[bpf-next,v4,3/5] bpf, x64: Fix tailcall hierarchy

Message ID 20240509150541.81799-4-hffilwlqm@gmail.com (mailing list archive)
State New, archived
Delegated to: BPF
Series bpf: Fix tailcall hierarchy

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-PR success PR summary

Commit Message

Leon Hwang May 9, 2024, 3:05 p.m. UTC
This patch fixes a tailcall issue caused by abusing the tailcall-in-bpf2bpf
feature.

As we know, tail_call_cnt propagates in rax from caller to callee when a
subprog is called in tailcall context. But, as the following example
shows, MAX_TAIL_CALL_CNT won't work because of missing tail_call_cnt
back-propagation from callee to caller.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "bpf_legacy.h"

struct {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__uint(max_entries, 1);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");

int count = 0;

static __noinline
int subprog_tail1(struct __sk_buff *skb)
{
	bpf_tail_call_static(skb, &jmp_table, 0);
	return 0;
}

static __noinline
int subprog_tail2(struct __sk_buff *skb)
{
	bpf_tail_call_static(skb, &jmp_table, 0);
	return 0;
}

SEC("tc")
int entry(struct __sk_buff *skb)
{
	volatile int ret = 1;

	count++;
	subprog_tail1(skb);
	subprog_tail2(skb);

	return ret;
}

char __license[] SEC("license") = "GPL";

At run time, the tail_call_cnt in entry() is propagated to
subprog_tail1() and subprog_tail2(). But when the tail_call_cnt in
subprog_tail1() is updated by bpf_tail_call_static(), the tail_call_cnt
in entry() is not updated at the same time. As a result, when the
tail_call_cnt in entry() is less than MAX_TAIL_CALL_CNT and
subprog_tail1() returns because of the MAX_TAIL_CALL_CNT limit,
bpf_tail_call_static() in subprog_tail2() is still able to run, because
the tail_call_cnt in subprog_tail2() propagated from entry() is less
than MAX_TAIL_CALL_CNT.

So, how many tailcalls will run in this case if no error happens?

From a top-down view, the tailcalls form a hierarchy, layer by layer.

With this view, there will be 2+4+8+...+2^33 = 2^34 - 2 = 17,179,869,182
tailcalls for this case.

What if there are N subprog_tail() calls in entry()? There will be
almost N^34 tailcalls.

This patch resolves this case on x86_64.

Instead of propagating tail_call_cnt from caller to callee, it
propagates a pointer to tail_call_cnt, tail_call_cnt_ptr (tcc_ptr for
short).

However, where is tail_call_cnt stored?

It is stored on the stack of the bpf prog's caller, as arranged by the
previous patch "bpf: Introduce bpf_jit_supports_tail_call_cnt_ptr()".
Then, in the bpf prog's prologue, tcc_ptr is loaded from
bpf_tail_call_run_ctx, and the original ctx is restored from
bpf_tail_call_run_ctx at the same time.
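
For reference, the run context assumed throughout this patch looks
roughly like the following; it is introduced by the previous patch in
this series, and the name of the ctx field here is inferred from the
prologue hunk below, so treat this as an illustrative sketch rather
than the exact definition:

struct bpf_tail_call_run_ctx {
	const void *ctx;	/* original bpf prog ctx, restored in the prologue */
	u32 *tail_call_cnt_ptr;	/* tcc_ptr, points at tail_call_cnt on the caller's stack */
};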

Then, when a tailcall runs, the tail_call_cnt accessed through tcc_ptr
is compared with MAX_TAIL_CALL_CNT, and the tail_call_cnt at tcc_ptr is
incremented.

Furthermore, when a trampoline is the caller of the bpf prog, the
trampoline is required to prepare tail_call_cnt and the tail call run
ctx on its own stack.
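
In C, the sequence that invoke_bpf_prog() emits as x86 instructions (see
the hunk below) is roughly equivalent to the following sketch; the
helper name and the ctx field are assumptions, the other fields follow
the include/linux/bpf.h hunk:

/* Hypothetical helper, for illustration only; the real logic is emitted
 * as instructions when p->aux->use_tail_call_run_ctx is set.
 */
static const void *
prepare_tail_call_run_ctx(struct bpf_tramp_run_ctx *run_ctx, const void *ctx)
{
	run_ctx->tail_call_run_ctx.ctx = ctx;	/* cache the original ctx */
	run_ctx->tail_call_cnt = 0;		/* fresh counter for this invocation */
	run_ctx->tail_call_run_ctx.tail_call_cnt_ptr = &run_ctx->tail_call_cnt;
	return &run_ctx->tail_call_run_ctx;	/* what the prog receives as ctx */
}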

Finally, enable bpf_jit_supports_tail_call_cnt_ptr() so that
__bpf_prog_run() uses bpf_tail_call_run_ctx.
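
Conceptually, the dispatch on the kernel side (wired up by the earlier
patches of this series, which are not part of this diff) then behaves
like the sketch below; the helper name and the ctx field of
bpf_tail_call_run_ctx are assumptions:

static u32 run_prog_with_tcc_ptr(const struct bpf_prog *prog, const void *ctx)
{
	struct bpf_tail_call_run_ctx run_ctx = {};
	u32 tail_call_cnt = 0;

	if (!bpf_jit_supports_tail_call_cnt_ptr() ||
	    !prog->aux->tail_call_reachable)
		return prog->bpf_func(ctx, prog->insnsi);

	run_ctx.ctx = ctx;				/* restored by the prologue */
	run_ctx.tail_call_cnt_ptr = &tail_call_cnt;	/* on this caller's stack */
	return prog->bpf_func(&run_ctx, prog->insnsi);
}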

Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT")
Signed-off-by: Leon Hwang <hffilwlqm@gmail.com>
---
 arch/x86/net/bpf_jit_comp.c | 101 ++++++++++++++++++++++++------------
 include/linux/bpf.h         |   2 +
 2 files changed, 70 insertions(+), 33 deletions(-)

Comments

Leon Hwang May 16, 2024, 3:28 p.m. UTC | #1
On 2024/5/9 23:05, Leon Hwang wrote:
> This patch fixes a tailcall issue caused by abusing the tailcall-in-bpf2bpf
> feature.
> 
> As we know, tail_call_cnt propagates in rax from caller to callee when a
> subprog is called in tailcall context. But, as the following example
> shows, MAX_TAIL_CALL_CNT won't work because of missing tail_call_cnt
> back-propagation from callee to caller.
> 
> #include <linux/bpf.h>
> #include <bpf/bpf_helpers.h>
> #include "bpf_legacy.h"
> 
> struct {
> 	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
> 	__uint(max_entries, 1);
> 	__uint(key_size, sizeof(__u32));
> 	__uint(value_size, sizeof(__u32));
> } jmp_table SEC(".maps");
> 
> int count = 0;
> 
> static __noinline
> int subprog_tail1(struct __sk_buff *skb)
> {
> 	bpf_tail_call_static(skb, &jmp_table, 0);
> 	return 0;
> }
> 
> static __noinline
> int subprog_tail2(struct __sk_buff *skb)
> {
> 	bpf_tail_call_static(skb, &jmp_table, 0);
> 	return 0;
> }
> 
> SEC("tc")
> int entry(struct __sk_buff *skb)
> {
> 	volatile int ret = 1;
> 
> 	count++;
> 	subprog_tail1(skb);
> 	subprog_tail2(skb);
> 
> 	return ret;
> }
> 
> char __license[] SEC("license") = "GPL";
> 
> At run time, the tail_call_cnt in entry() is propagated to
> subprog_tail1() and subprog_tail2(). But when the tail_call_cnt in
> subprog_tail1() is updated by bpf_tail_call_static(), the tail_call_cnt
> in entry() is not updated at the same time. As a result, when the
> tail_call_cnt in entry() is less than MAX_TAIL_CALL_CNT and
> subprog_tail1() returns because of the MAX_TAIL_CALL_CNT limit,
> bpf_tail_call_static() in subprog_tail2() is still able to run, because
> the tail_call_cnt in subprog_tail2() propagated from entry() is less
> than MAX_TAIL_CALL_CNT.
> 
> So, how many tailcalls will run in this case if no error happens?
> 
> From a top-down view, the tailcalls form a hierarchy, layer by layer.
> 
> With this view, there will be 2+4+8+...+2^33 = 2^34 - 2 = 17,179,869,182
> tailcalls for this case.
> 
> What if there are N subprog_tail() calls in entry()? There will be
> almost N^34 tailcalls.
> 
> This patch resolves this case on x86_64.
> 
> Instead of propagating tail_call_cnt from caller to callee, it
> propagates a pointer to tail_call_cnt, tail_call_cnt_ptr (tcc_ptr for
> short).
> 
> However, where is tail_call_cnt stored?
> 
> It is stored on the stack of the bpf prog's caller, as arranged by the
> previous patch "bpf: Introduce bpf_jit_supports_tail_call_cnt_ptr()".
> Then, in the bpf prog's prologue, tcc_ptr is loaded from
> bpf_tail_call_run_ctx, and the original ctx is restored from
> bpf_tail_call_run_ctx at the same time.
> 
> Then, when a tailcall runs, the tail_call_cnt accessed through tcc_ptr
> is compared with MAX_TAIL_CALL_CNT, and the tail_call_cnt at tcc_ptr is
> incremented.
> 
> Furthermore, when a trampoline is the caller of the bpf prog, the
> trampoline is required to prepare tail_call_cnt and the tail call run
> ctx on its own stack.
> 

Oh, I missed a case here.

This patch set is unable to provide tcc_ptr for freplace programs that
use tail calls in bpf2bpf.

How can this approach provide tcc_ptr for freplace programs?

Achieving this is not straightforward. However, it is simpler to disable
the use of tail calls in bpf2bpf for freplace programs, even though this
is a desired feature for my project.

Therefore, I will disable it in the v5 patch set.

Thanks,
Leon
Zvi Effron May 16, 2024, 6:56 p.m. UTC | #2
On Thu, May 16, 2024 at 8:28 AM Leon Hwang <hffilwlqm@gmail.com> wrote:
>
>
>
> On 2024/5/9 23:05, Leon Hwang wrote:
> > This patch fixes a tailcall issue caused by abusing the tailcall-in-bpf2bpf
> > feature.
> >
> > As we know, tail_call_cnt propagates in rax from caller to callee when a
> > subprog is called in tailcall context. But, as the following example
> > shows, MAX_TAIL_CALL_CNT won't work because of missing tail_call_cnt
> > back-propagation from callee to caller.
> >
> > #include <linux/bpf.h>
> > #include <bpf/bpf_helpers.h>
> > #include "bpf_legacy.h"
> >
> > struct {
> > __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
> > __uint(max_entries, 1);
> > __uint(key_size, sizeof(__u32));
> > __uint(value_size, sizeof(__u32));
> > } jmp_table SEC(".maps");
> >
> > int count = 0;
> >
> > static __noinline
> > int subprog_tail1(struct __sk_buff *skb)
> > {
> > bpf_tail_call_static(skb, &jmp_table, 0);
> > return 0;
> > }
> >
> > static __noinline
> > int subprog_tail2(struct __sk_buff *skb)
> > {
> > bpf_tail_call_static(skb, &jmp_table, 0);
> > return 0;
> > }
> >
> > SEC("tc")
> > int entry(struct __sk_buff *skb)
> > {
> > volatile int ret = 1;
> >
> > count++;
> > subprog_tail1(skb);
> > subprog_tail2(skb);
> >
> > return ret;
> > }
> >
> > char __license[] SEC("license") = "GPL";
> >
> > At run time, the tail_call_cnt in entry() is propagated to
> > subprog_tail1() and subprog_tail2(). But when the tail_call_cnt in
> > subprog_tail1() is updated by bpf_tail_call_static(), the tail_call_cnt
> > in entry() is not updated at the same time. As a result, when the
> > tail_call_cnt in entry() is less than MAX_TAIL_CALL_CNT and
> > subprog_tail1() returns because of the MAX_TAIL_CALL_CNT limit,
> > bpf_tail_call_static() in subprog_tail2() is still able to run, because
> > the tail_call_cnt in subprog_tail2() propagated from entry() is less
> > than MAX_TAIL_CALL_CNT.
> >
> > So, how many tailcalls will run in this case if no error happens?
> >
> > From a top-down view, the tailcalls form a hierarchy, layer by layer.
> >
> > With this view, there will be 2+4+8+...+2^33 = 2^34 - 2 = 17,179,869,182
> > tailcalls for this case.
> >
> > What if there are N subprog_tail() calls in entry()? There will be
> > almost N^34 tailcalls.
> >
> > This patch resolves this case on x86_64.
> >
> > Instead of propagating tail_call_cnt from caller to callee, it
> > propagates a pointer to tail_call_cnt, tail_call_cnt_ptr (tcc_ptr for
> > short).
> >
> > However, where is tail_call_cnt stored?
> >
> > It is stored on the stack of the bpf prog's caller, as arranged by the
> > previous patch "bpf: Introduce bpf_jit_supports_tail_call_cnt_ptr()".
> > Then, in the bpf prog's prologue, tcc_ptr is loaded from
> > bpf_tail_call_run_ctx, and the original ctx is restored from
> > bpf_tail_call_run_ctx at the same time.
> >
> > Then, when a tailcall runs, the tail_call_cnt accessed through tcc_ptr
> > is compared with MAX_TAIL_CALL_CNT, and the tail_call_cnt at tcc_ptr is
> > incremented.
> >
> > Furthermore, when a trampoline is the caller of the bpf prog, the
> > trampoline is required to prepare tail_call_cnt and the tail call run
> > ctx on its own stack.
> >
>
> Oh, I missed a case here.
>
> This patch set is unable to provide tcc_ptr for freplace programs that
> use tail calls in bpf2bpf.
>
> How can this approach provide tcc_ptr for freplace programs?
>
> Achieving this is not straightforward. However, it is simpler to disable
> the use of tail calls in bpf2bpf for freplace programs, even though this
> is a desired feature for my project.
>
> Therefore, I will disable it in the v5 patch set.
>

Isn't this a breaking change such that it would effectively be a regression for
any users already using tail_calls in bpf2bpf for freplace programs? And,
correct me if I'm wrong, but aren't those pieces of eBPF essentially considered
UAPI stable (unlike kfuncs)?

I appreciate that this is an esoteric use of eBPF, but as you said, you have a
use case for it, as does my team (although we haven't had a chance to implement
it yet), and if the two of us have use cases for it, I imagine others may have
as well, and some of them might already have done their implementation.

> Thanks,
> Leon
>
Leon Hwang May 17, 2024, 3:05 p.m. UTC | #3
On 2024/5/17 02:56, Zvi Effron wrote:
> On Thu, May 16, 2024 at 8:28 AM Leon Hwang <hffilwlqm@gmail.com> wrote:
>>
>>
>>
>> On 2024/5/9 23:05, Leon Hwang wrote:
>>> This patch fixes a tailcall issue caused by abusing the tailcall in
>>> bpf2bpf feature.
>>>

[SNIP]

>>>
>>
>> Oh, I missed a case here.
>>
>> This patch set is unable to provide tcc_ptr for freplace programs that
>> use tail calls in bpf2bpf.
>>
>> How can this approach provide tcc_ptr for freplace programs?
>>
>> Achieving this is not straightforward. However, it is simpler to disable
>> the use of tail calls in bpf2bpf for freplace programs, even though this
>> is a desired feature for my project.
>>
>> Therefore, I will disable it in the v5 patch set.
>>
> 
> Isn't this a breaking change such that it would effectively be a regression for
> any users already using tail_calls in bpf2bpf for freplace programs? And,
> correct me if I'm wrong, but aren't those pieces of eBPF essentially considered
> UAPI stable (unlike kfuncs)?

Yeah, this is a breaking change. However, I think it's acceptable, as
disabling tail_calls in subprogs has been considered before[0].

[0]
https://lore.kernel.org/bpf/CAADnVQLOswL3BY1s0B28wRZH1PU675S6_2=XknjZKNgyJ=yDxw@mail.gmail.com/

> 
> I appreciate that this is an esoteric use of eBPF, but as you said, you have a
> use case for it, as does my team (although we haven't had a chance to implement
> it yet), and if the two of us have use cases for it, I imagine other may have
> as well, and some of them might already have done their implementation.
> 


It seems it is a useful feature for us. I haven't used it either, because
of an old kernel version.

So, I figured out another approach to resolve this issue.

Here's the diff just for idea discussion:

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5159c7a229229..b0b6c84874e54 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -273,7 +273,7 @@ struct jit_context {
 /* Number of bytes emit_patch() needs to generate instructions */
 #define X86_PATCH_SIZE		5
 /* Number of bytes that will be skipped on tailcall */
-#define X86_TAIL_CALL_OFFSET	(11 + ENDBR_INSN_SIZE)
+#define X86_TAIL_CALL_OFFSET	(22 + ENDBR_INSN_SIZE)

 static void push_r12(u8 **pprog)
 {
@@ -403,6 +403,22 @@ static void emit_cfi(u8 **pprog, u32 hash)
 	*pprog = prog;
 }

+static notrace void bpf_prepare_tail_call_cnt_ptr()
+{
+	/* %rax stores the position to call the original prog. */
+
+	asm (
+	    "pushq %r9\n\t"       /* Push %r9. */
+	    "movq %rax, %r9\n\t"  /* Cache calling position. */
+	    "xor %eax, %eax\n\t"  /* Initialise tail_call_cnt. */
+	    "pushq %rax\n\t"      /* Push tail_call_cnt. */
+	    "movq %rsp, %rax\n\t" /* Make %rax as tcc_ptr. */
+	    "callq *%r9\n\t"      /* Call the original prog. */
+	    "popq %r9\n\t"        /* Pop tail_call_cnt. */
+	    "popq %r9\n\t"        /* Pop %r9. */
+	);
+}
+
 /*
  * Emit x86-64 prologue code for BPF program.
  * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
@@ -410,9 +426,9 @@ static void emit_cfi(u8 **pprog, u32 hash)
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 			  bool tail_call_reachable, bool is_subprog,
-			  bool is_exception_cb)
+			  bool is_exception_cb, u8 *image)
 {
-	u8 *prog = *pprog;
+	u8 *prog = *pprog, *start = *pprog;

 	emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash);
 	/* BPF trampoline can be made to work without these nops,
@@ -420,14 +436,16 @@ static void emit_prologue(u8 **pprog, u32
stack_depth, bool ebpf_from_cbpf,
 	 */
 	emit_nops(&prog, X86_PATCH_SIZE);
 	if (!ebpf_from_cbpf) {
-		if (tail_call_reachable && !is_subprog)
-			/* When it's the entry of the whole tailcall context,
-			 * zeroing rax means initialising tail_call_cnt.
-			 */
-			EMIT2(0x31, 0xC0); /* xor eax, eax */
-		else
+		if (tail_call_reachable && !is_subprog) {
+			/* mov rax, offset */
+			u32 offset = image + (prog - start) + 13;
+			EMIT4_off32(0x48, 0x8B, 0x04, 0x25, offset);
+			/* call bpf_prepare_tail_call_cnt_ptr */
+			emit_call(&prog, bpf_prepare_tail_call_cnt_ptr, offset-5);
+		 } else {
 			/* Keep the same instruction layout. */
-			EMIT2(0x66, 0x90); /* nop2 */
+			emit_nops(&prog, 13);
+		 }
 	}
 	/* Exception callback receives FP as third parameter */
 	if (is_exception_cb) {
@@ -1344,7 +1362,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int
*addrs, u8 *image, u8 *rw_image

 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
-		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
+		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb,
+		      image);
 	/* Exception callback will clobber callee regs for its own use, and
 	 * restore the original callee regs from main prog's stack frame.
 	 */


Unlike the current patch set, which has the bpf prog's caller prepare
tcc_ptr, this approach prepares tcc_ptr by calling a function in the
prologue that reserves tail_call_cnt memory on the stack and then calls
the remaining part of the bpf prog. At the end of the prologue, rax is
tcc_ptr, too.

This is inspired by the original RFC PATCH[0], and it avoids the
unwind-breaking issue by using a real function call.

[0] https://lore.kernel.org/bpf/20240104142226.87869-3-hffilwlqm@gmail.com/

However, it introduces an indirect call in
bpf_prepare_tail_call_cnt_ptr(), which costs performance because of
retpoline. To improve performance here, a bpf dispatcher should be
considered, like XDP does.

Thanks,
Leon

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ff217cc35ce92..43dc628e66222 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -273,7 +273,7 @@  struct jit_context {
 /* Number of bytes emit_patch() needs to generate instructions */
 #define X86_PATCH_SIZE		5
 /* Number of bytes that will be skipped on tailcall */
-#define X86_TAIL_CALL_OFFSET	(11 + ENDBR_INSN_SIZE)
+#define X86_TAIL_CALL_OFFSET	(16 + ENDBR_INSN_SIZE)
 
 static void push_r12(u8 **pprog)
 {
@@ -420,14 +420,17 @@  static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 	 */
 	emit_nops(&prog, X86_PATCH_SIZE);
 	if (!ebpf_from_cbpf) {
-		if (tail_call_reachable && !is_subprog)
-			/* When it's the entry of the whole tailcall context,
-			 * zeroing rax means initialising tail_call_cnt.
-			 */
-			EMIT2(0x31, 0xC0); /* xor eax, eax */
-		else
+		if (tail_call_reachable && !is_subprog) {
+			/* Store tcc_ptr to rax. */
+			/* mov rax, qword ptr [rdi + 8] */
+			EMIT4(0x48, 0x8B, 0x47, 0x08);
+			/* Restore the original ctx. */
+			/* mov rdi, qword ptr [rdi] */
+			EMIT3(0x48, 0x8B, 0x3F);
+		} else {
 			/* Keep the same instruction layout. */
-			EMIT2(0x66, 0x90); /* nop2 */
+			emit_nops(&prog, 7);
+		}
 	}
 	/* Exception callback receives FP as third parameter */
 	if (is_exception_cb) {
@@ -453,6 +456,7 @@  static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 	if (stack_depth)
 		EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8));
 	if (tail_call_reachable)
+		/* Here, rax is tail_call_cnt_ptr. */
 		EMIT1(0x50);         /* push rax */
 	*pprog = prog;
 }
@@ -589,13 +593,15 @@  static void emit_return(u8 **pprog, u8 *ip)
 	*pprog = prog;
 }
 
+#define BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack)	(-8 - round_up(stack, 8))
+
 /*
  * Generate the following code:
  *
  * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
  *   if (index >= array->map.max_entries)
  *     goto out;
- *   if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
+ *   if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
  *     goto out;
  *   prog = array->ptrs[index];
  *   if (prog == NULL)
@@ -608,7 +614,7 @@  static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
 					u32 stack_depth, u8 *ip,
 					struct jit_context *ctx)
 {
-	int tcc_off = -4 - round_up(stack_depth, 8);
+	int tcc_ptr_off = BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth);
 	u8 *prog = *pprog, *start = *pprog;
 	int offset;
 
@@ -630,16 +636,15 @@  static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
 	EMIT2(X86_JBE, offset);                   /* jbe out */
 
 	/*
-	 * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
+	 * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
 	 *	goto out;
 	 */
-	EMIT2_off32(0x8B, 0x85, tcc_off);         /* mov eax, dword ptr [rbp - tcc_off] */
-	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
+	EMIT3_off32(0x48, 0x8B, 0x85, tcc_ptr_off); /* mov rax, qword ptr [rbp - tcc_ptr_off] */
+	EMIT3(0x83, 0x38, MAX_TAIL_CALL_CNT);     /* cmp dword ptr [rax], MAX_TAIL_CALL_CNT */
 
 	offset = ctx->tail_call_indirect_label - (prog + 2 - start);
 	EMIT2(X86_JAE, offset);                   /* jae out */
-	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
-	EMIT2_off32(0x89, 0x85, tcc_off);         /* mov dword ptr [rbp - tcc_off], eax */
+	EMIT3(0x83, 0x00, 0x01);                  /* add dword ptr [rax], 1 */
 
 	/* prog = array->ptrs[index]; */
 	EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6,       /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */
@@ -663,6 +668,7 @@  static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
 			pop_r12(&prog);
 	}
 
+	/* pop tail_call_cnt_ptr */
 	EMIT1(0x58);                              /* pop rax */
 	if (stack_depth)
 		EMIT3_off32(0x48, 0x81, 0xC4,     /* add rsp, sd */
@@ -691,21 +697,20 @@  static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
 				      bool *callee_regs_used, u32 stack_depth,
 				      struct jit_context *ctx)
 {
-	int tcc_off = -4 - round_up(stack_depth, 8);
+	int tcc_ptr_off = BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth);
 	u8 *prog = *pprog, *start = *pprog;
 	int offset;
 
 	/*
-	 * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
+	 * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
 	 *	goto out;
 	 */
-	EMIT2_off32(0x8B, 0x85, tcc_off);             /* mov eax, dword ptr [rbp - tcc_off] */
-	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);         /* cmp eax, MAX_TAIL_CALL_CNT */
+	EMIT3_off32(0x48, 0x8B, 0x85, tcc_ptr_off);   /* mov rax, qword ptr [rbp - tcc_ptr_off] */
+	EMIT3(0x83, 0x38, MAX_TAIL_CALL_CNT);         /* cmp dword ptr [rax], MAX_TAIL_CALL_CNT */
 
 	offset = ctx->tail_call_direct_label - (prog + 2 - start);
 	EMIT2(X86_JAE, offset);                       /* jae out */
-	EMIT3(0x83, 0xC0, 0x01);                      /* add eax, 1 */
-	EMIT2_off32(0x89, 0x85, tcc_off);             /* mov dword ptr [rbp - tcc_off], eax */
+	EMIT3(0x83, 0x00, 0x01);                      /* add dword ptr [rax], 1 */
 
 	poke->tailcall_bypass = ip + (prog - start);
 	poke->adj_off = X86_TAIL_CALL_OFFSET;
@@ -724,6 +729,7 @@  static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
 			pop_r12(&prog);
 	}
 
+	/* pop tail_call_cnt_ptr */
 	EMIT1(0x58);                                  /* pop rax */
 	if (stack_depth)
 		EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
@@ -1314,8 +1320,8 @@  static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
-#define RESTORE_TAIL_CALL_CNT(stack)				\
-	EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8)
+#define LOAD_TAIL_CALL_CNT_PTR(stack)						\
+	EMIT3_off32(0x48, 0x8B, 0x85, BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
 
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
 		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
@@ -2045,7 +2051,7 @@  st:			if (is_imm8(insn->off))
 
 			func = (u8 *) __bpf_call_base + imm32;
 			if (tail_call_reachable) {
-				RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth);
+				LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
 				ip += 7;
 			}
 			if (!imm32)
@@ -2555,11 +2561,17 @@  static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 			   int run_ctx_off, bool save_ret,
 			   void *image, void *rw_image)
 {
-	u8 *prog = *pprog;
-	u8 *jmp_insn;
+	int ctx_tail_call_run_ctx_off = -run_ctx_off + offsetof(struct bpf_tramp_run_ctx,
+								tail_call_run_ctx);
+	int ctx_tcc_ptr_off = ctx_tail_call_run_ctx_off + offsetof(struct bpf_tail_call_run_ctx,
+								   tail_call_cnt_ptr);
+	int ctx_tail_call_cnt_off = -run_ctx_off + offsetof(struct bpf_tramp_run_ctx,
+							    tail_call_cnt);
 	int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
 	struct bpf_prog *p = l->link.prog;
 	u64 cookie = l->cookie;
+	u8 *prog = *pprog;
+	u8 *jmp_insn;
 
 	/* mov rdi, cookie */
 	emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie);
@@ -2604,6 +2616,23 @@  static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 		emit_mov_imm64(&prog, BPF_REG_2,
 			       (long) p->insnsi >> 32,
 			       (u32) (long) p->insnsi);
+	if (p->aux->use_tail_call_run_ctx) {
+		/* Cache the original ctx */
+		/* mov qword ptr [rbp - ctx_tail_call_run_ctx_off], rdi */
+		EMIT3_off32(0x48, 0x89, 0xBD, ctx_tail_call_run_ctx_off);
+		/* Make rdi as tcc_ptr */
+		/* lea rdi, [rbp - ctx_tail_call_cnt_off] */
+		EMIT3_off32(0x48, 0x8D, 0xBD, ctx_tail_call_cnt_off);
+		/* Clear tail_call_cnt */
+		/* mov dword ptr [rdi], 0 */
+		EMIT2_off32(0xC7, 0x07, 0x00);
+		/* Cache tcc_ptr */
+		/* mov qword ptr [rbp - ctx_tcc_ptr_off], rdi */
+		EMIT3_off32(0x48, 0x89, 0xBD, ctx_tcc_ptr_off);
+		/* Update rdi as tail call run ctx */
+		/* lea rdi, [rbp - ctx_tail_call_run_ctx_off] */
+		EMIT3_off32(0x48, 0x8D, 0xBD, ctx_tail_call_run_ctx_off);
+	}
 	/* call JITed bpf program or interpreter */
 	if (emit_rsb_call(&prog, p->bpf_func, image + (prog - (u8 *)rw_image)))
 		return -EINVAL;
@@ -2840,7 +2869,7 @@  static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 	 *                     [ ...        ]
 	 *                     [ stack_arg2 ]
 	 * RBP - arg_stack_off [ stack_arg1 ]
-	 * RSP                 [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX
+	 * RSP                 [ tail_call_cnt_ptr ] BPF_TRAMP_F_TAIL_CALL_CTX
 	 */
 
 	/* room for return value of orig_call or fentry prog */
@@ -2969,10 +2998,10 @@  static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		save_args(m, &prog, arg_stack_off, true);
 
 		if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
-			/* Before calling the original function, restore the
-			 * tail_call_cnt from stack to rax.
+			/* Before calling the original function, load the
+			 * tail_call_cnt_ptr from stack to rax.
 			 */
-			RESTORE_TAIL_CALL_CNT(stack_size);
+			LOAD_TAIL_CALL_CNT_PTR(stack_size);
 		}
 
 		if (flags & BPF_TRAMP_F_ORIG_STACK) {
@@ -3031,10 +3060,10 @@  static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 			goto cleanup;
 		}
 	} else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
-		/* Before running the original function, restore the
-		 * tail_call_cnt from stack to rax.
+		/* Before running the original function, load the
+		 * tail_call_cnt_ptr from stack to rax.
 		 */
-		RESTORE_TAIL_CALL_CNT(stack_size);
+		LOAD_TAIL_CALL_CNT_PTR(stack_size);
 	}
 
 	/* restore return value of orig_call or fentry prog back into RAX */
@@ -3432,6 +3461,12 @@  bool bpf_jit_supports_subprog_tailcalls(void)
 	return true;
 }
 
+/* Indicate the JIT backend supports tail call count pointer in tailcall context. */
+bool bpf_jit_supports_tail_call_cnt_ptr(void)
+{
+	return true;
+}
+
 bool bpf_jit_supports_percpu_insn(void)
 {
 	return true;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 95888700966f7..94f994204acea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2057,6 +2057,8 @@  struct bpf_tramp_run_ctx {
 	struct bpf_run_ctx run_ctx;
 	u64 bpf_cookie;
 	struct bpf_run_ctx *saved_run_ctx;
+	struct bpf_tail_call_run_ctx tail_call_run_ctx;
+	u32 tail_call_cnt;
 };
 
 static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)