
[bpf-next,v5,1/6] arm64: ftrace: Add ftrace direct call support

Message ID 20220518131638.3401509-2-xukuohai@huawei.com (mailing list archive)
State New, archived
Series bpf trampoline for arm64

Commit Message

Xu Kuohai May 18, 2022, 1:16 p.m. UTC
Add ftrace direct call support for arm64.

1. When there is only a custom trampoline, replace the fentry nop with a
   jump instruction that jumps directly to the custom trampoline.

2. When an ftrace trampoline and a custom trampoline coexist, jump from
   fentry to the ftrace trampoline first, then jump to the custom
   trampoline when the ftrace trampoline exits. The currently unused
   register pt_regs->orig_x0 is used as an intermediary for jumping from
   the ftrace trampoline to the custom trampoline.

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Acked-by: Song Liu <songliubraving@fb.com>
---
 arch/arm64/Kconfig               |  2 ++
 arch/arm64/include/asm/ftrace.h  | 12 ++++++++++++
 arch/arm64/kernel/asm-offsets.c  |  1 +
 arch/arm64/kernel/entry-ftrace.S | 18 +++++++++++++++---
 4 files changed, 30 insertions(+), 3 deletions(-)
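
For readers new to the ftrace direct-call API, here is a minimal sketch of
how a caller would attach a custom trampoline to a patch site. It assumes
the register_ftrace_direct() interface and a hypothetical my_tramp
trampoline; it is illustrative only, not code from this series.

#include <linux/ftrace.h>

/* Hypothetical custom trampoline, e.g. a JITed BPF trampoline. */
extern void my_tramp(void);

/* Attach my_tramp as a direct call at the patch site 'ip'. */
static int attach_direct_call(unsigned long ip)
{
	/*
	 * Case 1 above: if nothing else traces this function, the fentry
	 * nop becomes a branch straight to my_tramp.
	 * Case 2: if an ftrace ops is also attached, the patch site
	 * branches to the ftrace trampoline, which then jumps to my_tramp
	 * on exit (via pt_regs->orig_x0 in this patch).
	 */
	return register_ftrace_direct(ip, (unsigned long)my_tramp);
}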

Comments

KP Singh May 23, 2022, 1:39 a.m. UTC | #1
On Wed, May 18, 2022 at 3:53 PM Xu Kuohai <xukuohai@huawei.com> wrote:
>
> Add ftrace direct support for arm64.
>
> 1. When there is custom trampoline only, replace the fentry nop to a
>    jump instruction that jumps directly to the custom trampoline.
>
> 2. When ftrace trampoline and custom trampoline coexist, jump from
>    fentry to ftrace trampoline first, then jump to custom trampoline
>    when ftrace trampoline exits. The current unused register
>    pt_regs->orig_x0 is used as an intermediary for jumping from ftrace
>    trampoline to custom trampoline.
>
> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> Acked-by: Song Liu <songliubraving@fb.com>

Acked-by: KP Singh <kpsingh@kernel.org>
Mark Rutland May 25, 2022, 1:38 p.m. UTC | #2
On Wed, May 18, 2022 at 09:16:33AM -0400, Xu Kuohai wrote:
> Add ftrace direct support for arm64.
> 
> 1. When there is custom trampoline only, replace the fentry nop to a
>    jump instruction that jumps directly to the custom trampoline.
> 
> 2. When ftrace trampoline and custom trampoline coexist, jump from
>    fentry to ftrace trampoline first, then jump to custom trampoline
>    when ftrace trampoline exits. The current unused register
>    pt_regs->orig_x0 is used as an intermediary for jumping from ftrace
>    trampoline to custom trampoline.

For those of us not all that familiar with BPF, can you explain *why* you want
this? The above explains what the patch implements, but not why that's useful.

e.g. is this just to avoid the overhead of the ops list processing in the
regular ftrace code, or is the custom trampoline there to allow you to do
something special?

There is another patch series on the list from some of your colleagues which
uses dynamic trampolines to try to avoid that ops list overhead, and it's not
clear to me whether these are trying to solve the largely same problem or
something different. That other thread is at:

  https://lore.kernel.org/linux-arm-kernel/20220316100132.244849-1-bobo.shaobowang@huawei.com/

... and I've added the relevant parties to CC here, since there doesn't seem to
be any overlap in the CC lists of the two threads.

In that other thread I've suggested a general approach we could follow at:
  
  https://lore.kernel.org/linux-arm-kernel/YmGF%2FOpIhAF8YeVq@lakrids/

As noted in that thread, I have a few concerns which equally apply here:

* Due to the limited range of BL instructions, it's not always possible to
  patch an ftrace call-site to branch to an arbitrary trampoline. The way this
  works for ftrace today relies upon knowing the set of trampolines at
  compile-time, and allocating module PLTs for those, and that approach cannot
  work reliably for dynamically allocated trampolines.

  I'd strongly prefer to avoid custom trampolines unless they're strictly
  necessary for functional reasons, so that we can have this work reliably and
  consistently.

* If this is mostly about avoiding the ops list processing overhead, I believe
  we can implement some custom ops support more generally in ftrace which would
  still use a common trampoline but could directly call into those custom ops.
  I would strongly prefer this over custom trampolines.

* I'm looking to minimize the set of regs ftrace saves, and never save a full
  pt_regs, since today we (incompletely) fill that with bogus values and cannot
  acquire some state reliably (e.g. PSTATE). I'd like to avoid usage of pt_regs
  unless necessary, and I don't want to add additional reliance upon that
  structure.
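
To make the last point concrete, here is a sketch of the kind of minimal,
ftrace-specific register set being alluded to. This is one possible shape,
assumed purely for illustration, not an agreed interface:

/* Sketch only: save just what tracers actually need, not a full pt_regs. */
struct ftrace_regs {
	unsigned long regs[9];	/* x0-x8: argument and indirect-result registers */
	unsigned long fp;	/* x29 of the instrumented function */
	unsigned long lr;	/* x30: return address into the caller */
	unsigned long sp;	/* stack pointer at the callsite */
	unsigned long pc;	/* address of the patched instruction */
};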

> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> Acked-by: Song Liu <songliubraving@fb.com>
> ---
>  arch/arm64/Kconfig               |  2 ++
>  arch/arm64/include/asm/ftrace.h  | 12 ++++++++++++
>  arch/arm64/kernel/asm-offsets.c  |  1 +
>  arch/arm64/kernel/entry-ftrace.S | 18 +++++++++++++++---
>  4 files changed, 30 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 57c4c995965f..81cc330daafc 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -177,6 +177,8 @@ config ARM64
>  	select HAVE_DYNAMIC_FTRACE
>  	select HAVE_DYNAMIC_FTRACE_WITH_REGS \
>  		if $(cc-option,-fpatchable-function-entry=2)
> +	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS \
> +		if DYNAMIC_FTRACE_WITH_REGS
>  	select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \
>  		if DYNAMIC_FTRACE_WITH_REGS
>  	select HAVE_EFFICIENT_UNALIGNED_ACCESS
> diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
> index 1494cfa8639b..14a35a5df0a1 100644
> --- a/arch/arm64/include/asm/ftrace.h
> +++ b/arch/arm64/include/asm/ftrace.h
> @@ -78,6 +78,18 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
>  	return addr;
>  }
>  
> +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs,
> +						 unsigned long addr)
> +{
> +	/*
> +	 * Place custom trampoline address in regs->orig_x0 to let ftrace
> +	 * trampoline jump to it.
> +	 */
> +	regs->orig_x0 = addr;
> +}
> +#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */

Please, let's not abuse pt_regs::orig_x0 for this. That's at best unnecessarily
confusing, and if we really need a field to place a value like this it implies
we should add an ftrace-specific structure to hold the ftrace-specific context
information.
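
As an illustration of that suggestion (purely a sketch, not something this
series or the current kernel defines), the direct-call target could live in
a dedicated field of such an ftrace-specific structure, zeroed by the
trampoline on entry and checked on exit, instead of borrowing
pt_regs::orig_x0:

/* Sketch: an ftrace-specific context with a direct-call slot. */
struct ftrace_tramp_ctx {
	unsigned long args[9];		/* or whatever reduced register set is saved */
	unsigned long direct_call;	/* custom trampoline address, 0 if none */
};

static inline void set_direct_caller(struct ftrace_tramp_ctx *ctx,
				     unsigned long addr)
{
	ctx->direct_call = addr;
}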

Thanks,
Mark.

> +
>  #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
>  struct dyn_ftrace;
>  int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> index 1197e7679882..b1ed0bf01c59 100644
> --- a/arch/arm64/kernel/asm-offsets.c
> +++ b/arch/arm64/kernel/asm-offsets.c
> @@ -80,6 +80,7 @@ int main(void)
>    DEFINE(S_SDEI_TTBR1,		offsetof(struct pt_regs, sdei_ttbr1));
>    DEFINE(S_PMR_SAVE,		offsetof(struct pt_regs, pmr_save));
>    DEFINE(S_STACKFRAME,		offsetof(struct pt_regs, stackframe));
> +  DEFINE(S_ORIG_X0,		offsetof(struct pt_regs, orig_x0));
>    DEFINE(PT_REGS_SIZE,		sizeof(struct pt_regs));
>    BLANK();
>  #ifdef CONFIG_COMPAT
> diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
> index e535480a4069..dfe62c55e3a2 100644
> --- a/arch/arm64/kernel/entry-ftrace.S
> +++ b/arch/arm64/kernel/entry-ftrace.S
> @@ -60,6 +60,9 @@
>  	str	x29, [sp, #S_FP]
>  	.endif
>  
> +	/* Set orig_x0 to zero  */
> +	str     xzr, [sp, #S_ORIG_X0]
> +
>  	/* Save the callsite's SP and LR */
>  	add	x10, sp, #(PT_REGS_SIZE + 16)
>  	stp	x9, x10, [sp, #S_LR]
> @@ -119,12 +122,21 @@ ftrace_common_return:
>  	/* Restore the callsite's FP, LR, PC */
>  	ldr	x29, [sp, #S_FP]
>  	ldr	x30, [sp, #S_LR]
> -	ldr	x9, [sp, #S_PC]
> -
> +	ldr	x10, [sp, #S_PC]
> +
> +	ldr	x11, [sp, #S_ORIG_X0]
> +	cbz	x11, 1f
> +	/* Set x9 to parent ip before jump to custom trampoline */
> +	mov	x9,  x30
> +	/* Set lr to self ip */
> +	ldr	x30, [sp, #S_PC]
> +	/* Set x10 (used for return address) to custom trampoline */
> +	mov	x10, x11
> +1:
>  	/* Restore the callsite's SP */
>  	add	sp, sp, #PT_REGS_SIZE + 16
>  
> -	ret	x9
> +	ret	x10
>  SYM_CODE_END(ftrace_common)
>  
>  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
> -- 
> 2.30.2
>
Xu Kuohai May 26, 2022, 9:45 a.m. UTC | #3
On 5/25/2022 9:38 PM, Mark Rutland wrote:
> On Wed, May 18, 2022 at 09:16:33AM -0400, Xu Kuohai wrote:
>> Add ftrace direct support for arm64.
>>
>> 1. When there is custom trampoline only, replace the fentry nop to a
>>    jump instruction that jumps directly to the custom trampoline.
>>
>> 2. When ftrace trampoline and custom trampoline coexist, jump from
>>    fentry to ftrace trampoline first, then jump to custom trampoline
>>    when ftrace trampoline exits. The current unused register
>>    pt_regs->orig_x0 is used as an intermediary for jumping from ftrace
>>    trampoline to custom trampoline.
> 
> For those of us not all that familiar with BPF, can you explain *why* you want
> this? The above explains what the patch implements, but not why that's useful.
> 
> e.g. is this just to avoid the overhead of the ops list processing in the
> regular ftrace code, or is the custom trampoline there to allow you to do
> something special?

IIUC, ftrace direct call was designed to *remove* the unnecessary
overhead of saving regs completely [1][2].

[1]
https://lore.kernel.org/all/20191022175052.frjzlnjjfwwfov64@ast-mbp.dhcp.thefacebook.com/
[2] https://lore.kernel.org/all/20191108212834.594904349@goodmis.org/

This patch itself is just a variant of [3].

[3] https://lore.kernel.org/all/20191108213450.891579507@goodmis.org/

> 
> There is another patch series on the list from some of your colleagues which
> uses dynamic trampolines to try to avoid that ops list overhead, and it's not
> clear to me whether these are trying to solve the largely same problem or
> something different. That other thread is at:
> 
>   https://lore.kernel.org/linux-arm-kernel/20220316100132.244849-1-bobo.shaobowang@huawei.com/
> 
> ... and I've added the relevant parties to CC here, since there doesn't seem to
> be any overlap in the CC lists of the two threads.

We're not working to solve the same problem. The trampoline introduced
in this series helps us to monitor kernel function or another bpf prog
with bpf, and also helps us to use bpf prog like a normal kernel
function pointer.

> 
> In that other thread I've suggested a general approach we could follow at:
>   
>   https://lore.kernel.org/linux-arm-kernel/YmGF%2FOpIhAF8YeVq@lakrids/
>

Is it possible for a kernel function to take a long jump to common
trampoline when we get a huge kernel image?

> As noted in that thread, I have a few concerns which equally apply here:
> 
> * Due to the limited range of BL instructions, it's not always possible to
>   patch an ftrace call-site to branch to an arbitrary trampoline. The way this
>   works for ftrace today relies upon knowing the set of trampolines at
>   compile-time, and allocating module PLTs for those, and that approach cannot
>   work reliably for dynamically allocated trampolines.

Currently patch 5 returns -ENOTSUPP when long jump is detected, so no
bpf trampoline is constructed for out of range patch-site:

if (is_long_jump(orig_call, image))
	return -ENOTSUPP;
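
For reference, the limit behind that check: BL encodes a 26-bit signed
immediate in units of 4 bytes, so a patch site can only reach targets
within +/-128 MiB of itself. A sketch of such a range check follows; the
helper below is illustrative, not the actual code from patch 5.

#include <linux/sizes.h>

/* Can the instruction at 'pc' reach 'target' with a direct BL/B? */
static bool bl_in_range(void *pc, void *target)
{
	long offset = (long)target - (long)pc;

	/* imm26 * 4 gives [-128 MiB, +128 MiB) from the branch itself */
	return offset >= -(long)SZ_128M && offset < (long)SZ_128M;
}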

> 
>   I'd strongly prefer to avoid custom trampolines unless they're strictly
>   necessary for functional reasons, so that we can have this work reliably and
>   consistently.

bpf trampoline is needed by bpf itself, not to replace ftrace trampolines.

> * If this is mostly about avoiding the ops list processing overhead, I believe
>   we can implement some custom ops support more generally in ftrace which would
>   still use a common trampoline but could directly call into those custom ops.
>   I would strongly prefer this over custom trampolines.
> 
> * I'm looking to minimize the set of regs ftrace saves, and never save a full
>   pt_regs, since today we (incompletely) fill that with bogus values and cannot
>   acquire some state reliably (e.g. PSTATE). I'd like to avoid usage of pt_regs
>   unless necessary, and I don't want to add additional reliance upon that
>   structure.

Even if such a common trampoline is used, bpf trampoline is still
necessary since we need to construct custom instructions to implement
bpf functions, for example, to implement kernel function pointer with a
bpf prog.

> 
>> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
>> Acked-by: Song Liu <songliubraving@fb.com>
>> ---
>>  arch/arm64/Kconfig               |  2 ++
>>  arch/arm64/include/asm/ftrace.h  | 12 ++++++++++++
>>  arch/arm64/kernel/asm-offsets.c  |  1 +
>>  arch/arm64/kernel/entry-ftrace.S | 18 +++++++++++++++---
>>  4 files changed, 30 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index 57c4c995965f..81cc330daafc 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -177,6 +177,8 @@ config ARM64
>>  	select HAVE_DYNAMIC_FTRACE
>>  	select HAVE_DYNAMIC_FTRACE_WITH_REGS \
>>  		if $(cc-option,-fpatchable-function-entry=2)
>> +	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS \
>> +		if DYNAMIC_FTRACE_WITH_REGS
>>  	select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \
>>  		if DYNAMIC_FTRACE_WITH_REGS
>>  	select HAVE_EFFICIENT_UNALIGNED_ACCESS
>> diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
>> index 1494cfa8639b..14a35a5df0a1 100644
>> --- a/arch/arm64/include/asm/ftrace.h
>> +++ b/arch/arm64/include/asm/ftrace.h
>> @@ -78,6 +78,18 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
>>  	return addr;
>>  }
>>  
>> +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
>> +static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs,
>> +						 unsigned long addr)
>> +{
>> +	/*
>> +	 * Place custom trampoline address in regs->orig_x0 to let ftrace
>> +	 * trampoline jump to it.
>> +	 */
>> +	regs->orig_x0 = addr;
>> +}
>> +#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
> 
> Please, let's not abuse pt_regs::orig_x0 for this. That's at best unnecessarily
> confusing, and if we really need a field to place a value like this it implies
> we should add an ftrace-specific structure to hold the ftrace-specific context
> information.
> 

Sorry for the confusion, this was modeled on the x86 approach:

https://lore.kernel.org/all/20191108213450.891579507@goodmis.org/

> Thanks,
> Mark.
> 
>> +
>>  #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
>>  struct dyn_ftrace;
>>  int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
>> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
>> index 1197e7679882..b1ed0bf01c59 100644
>> --- a/arch/arm64/kernel/asm-offsets.c
>> +++ b/arch/arm64/kernel/asm-offsets.c
>> @@ -80,6 +80,7 @@ int main(void)
>>    DEFINE(S_SDEI_TTBR1,		offsetof(struct pt_regs, sdei_ttbr1));
>>    DEFINE(S_PMR_SAVE,		offsetof(struct pt_regs, pmr_save));
>>    DEFINE(S_STACKFRAME,		offsetof(struct pt_regs, stackframe));
>> +  DEFINE(S_ORIG_X0,		offsetof(struct pt_regs, orig_x0));
>>    DEFINE(PT_REGS_SIZE,		sizeof(struct pt_regs));
>>    BLANK();
>>  #ifdef CONFIG_COMPAT
>> diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
>> index e535480a4069..dfe62c55e3a2 100644
>> --- a/arch/arm64/kernel/entry-ftrace.S
>> +++ b/arch/arm64/kernel/entry-ftrace.S
>> @@ -60,6 +60,9 @@
>>  	str	x29, [sp, #S_FP]
>>  	.endif
>>  
>> +	/* Set orig_x0 to zero  */
>> +	str     xzr, [sp, #S_ORIG_X0]
>> +
>>  	/* Save the callsite's SP and LR */
>>  	add	x10, sp, #(PT_REGS_SIZE + 16)
>>  	stp	x9, x10, [sp, #S_LR]
>> @@ -119,12 +122,21 @@ ftrace_common_return:
>>  	/* Restore the callsite's FP, LR, PC */
>>  	ldr	x29, [sp, #S_FP]
>>  	ldr	x30, [sp, #S_LR]
>> -	ldr	x9, [sp, #S_PC]
>> -
>> +	ldr	x10, [sp, #S_PC]
>> +
>> +	ldr	x11, [sp, #S_ORIG_X0]
>> +	cbz	x11, 1f
>> +	/* Set x9 to parent ip before jump to custom trampoline */
>> +	mov	x9,  x30
>> +	/* Set lr to self ip */
>> +	ldr	x30, [sp, #S_PC]
>> +	/* Set x10 (used for return address) to custom trampoline */
>> +	mov	x10, x11
>> +1:
>>  	/* Restore the callsite's SP */
>>  	add	sp, sp, #PT_REGS_SIZE + 16
>>  
>> -	ret	x9
>> +	ret	x10
>>  SYM_CODE_END(ftrace_common)
>>  
>>  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
>> -- 
>> 2.30.2
>>
> .
Mark Rutland May 26, 2022, 10:06 a.m. UTC | #4
On Thu, May 26, 2022 at 05:45:03PM +0800, Xu Kuohai wrote:
> On 5/25/2022 9:38 PM, Mark Rutland wrote:
> > On Wed, May 18, 2022 at 09:16:33AM -0400, Xu Kuohai wrote:
> >> Add ftrace direct support for arm64.
> >>
> >> 1. When there is custom trampoline only, replace the fentry nop to a
> >>    jump instruction that jumps directly to the custom trampoline.
> >>
> >> 2. When ftrace trampoline and custom trampoline coexist, jump from
> >>    fentry to ftrace trampoline first, then jump to custom trampoline
> >>    when ftrace trampoline exits. The current unused register
> >>    pt_regs->orig_x0 is used as an intermediary for jumping from ftrace
> >>    trampoline to custom trampoline.
> > 
> > For those of us not all that familiar with BPF, can you explain *why* you want
> > this? The above explains what the patch implements, but not why that's useful.
> > 
> > e.g. is this just to avoid the overhead of the ops list processing in the
> > regular ftrace code, or is the custom trampoline there to allow you to do
> > something special?
> 
> IIUC, ftrace direct call was designed to *remove* the unnecessary
> overhead of saving regs completely [1][2].

Ok. My plan is to get rid of most of the register saving generally, so I think
that aspect can be solved without direct calls.

> [1]
> https://lore.kernel.org/all/20191022175052.frjzlnjjfwwfov64@ast-mbp.dhcp.thefacebook.com/
> [2] https://lore.kernel.org/all/20191108212834.594904349@goodmis.org/
> 
> This patch itself is just a variant of [3].
> 
> [3] https://lore.kernel.org/all/20191108213450.891579507@goodmis.org/
> 
> > 
> > There is another patch series on the list from some of your colleagues which
> > uses dynamic trampolines to try to avoid that ops list overhead, and it's not
> > clear to me whether these are trying to solve the largely same problem or
> > something different. That other thread is at:
> > 
> >   https://lore.kernel.org/linux-arm-kernel/20220316100132.244849-1-bobo.shaobowang@huawei.com/
> > 
> > ... and I've added the relevant parties to CC here, since there doesn't seem to
> > be any overlap in the CC lists of the two threads.
> 
> We're not working to solve the same problem. The trampoline introduced
> in this series helps us to monitor kernel function or another bpf prog
> with bpf, and also helps us to use bpf prog like a normal kernel
> function pointer.

Ok, but why is it necessary to have a special trampoline?

Is that *just* to avoid overhead, or do you need to do something special that
the regular trampoline won't do?

> > 
> > In that other thread I've suggested a general approach we could follow at:
> >   
> >   https://lore.kernel.org/linux-arm-kernel/YmGF%2FOpIhAF8YeVq@lakrids/
> >
> 
> Is it possible for a kernel function to take a long jump to common
> trampoline when we get a huge kernel image?

It is possible, but only where the kernel Image itself is massive and the .text
section exceeds 128MiB, at which point other things break anyway. Practically
speaking, this doesn't happen for production kernels, or reasonable test
kernels.

I've been meaning to add some logic to detect this at boot time and disable
ftrace (or at build time), since live patching would also be broken in that
case.

> > As noted in that thread, I have a few concerns which equally apply here:
> > 
> > * Due to the limited range of BL instructions, it's not always possible to
> >   patch an ftrace call-site to branch to an arbitrary trampoline. The way this
> >   works for ftrace today relies upon knowing the set of trampolines at
> >   compile-time, and allocating module PLTs for those, and that approach cannot
> >   work reliably for dynamically allocated trampolines.
> 
> Currently patch 5 returns -ENOTSUPP when long jump is detected, so no
> bpf trampoline is constructed for out of range patch-site:
> 
> if (is_long_jump(orig_call, image))
> 	return -ENOTSUPP;

Sure, my point is that in practice that means that (from the user's PoV) this
may randomly fail to work, and I'd like something that we can ensure works
consistently.

> >   I'd strongly prefer to avoid custom trampolines unless they're strictly
> >   necessary for functional reasons, so that we can have this work reliably and
> >   consistently.
> 
> bpf trampoline is needed by bpf itself, not to replace ftrace trampolines.

As above, can you please let me know *why* specifically it is needed? Why can't
we invoke the BPF code through the usual ops mechanism?

Is that to avoid overhead, or are there other functional reasons you need a
special trampoline?

> > * If this is mostly about avoiding the ops list processing overhead, I believe
> >   we can implement some custom ops support more generally in ftrace which would
> >   still use a common trampoline but could directly call into those custom ops.
> >   I would strongly prefer this over custom trampolines.
> > 
> > * I'm looking to minimize the set of regs ftrace saves, and never save a full
> >   pt_regs, since today we (incompletely) fill that with bogus values and cannot
> >   acquire some state reliably (e.g. PSTATE). I'd like to avoid usage of pt_regs
> >   unless necessary, and I don't want to add additional reliance upon that
> >   structure.
> 
> Even if such a common trampoline is used, bpf trampoline is still
> necessary since we need to construct custom instructions to implement
> bpf functions, for example, to implement kernel function pointer with a
> bpf prog.

Sorry, but I'm struggling to understand this. What specifically do you need to
do that means this can't use the same calling convention as the regular ops
function pointers?

Thanks,
Mark.
Xu Kuohai May 26, 2022, 2:48 p.m. UTC | #5
On 5/26/2022 6:06 PM, Mark Rutland wrote:
> On Thu, May 26, 2022 at 05:45:03PM +0800, Xu Kuohai wrote:
>> On 5/25/2022 9:38 PM, Mark Rutland wrote:
>>> On Wed, May 18, 2022 at 09:16:33AM -0400, Xu Kuohai wrote:
>>>> Add ftrace direct support for arm64.
>>>>
>>>> 1. When there is custom trampoline only, replace the fentry nop to a
>>>>    jump instruction that jumps directly to the custom trampoline.
>>>>
>>>> 2. When ftrace trampoline and custom trampoline coexist, jump from
>>>>    fentry to ftrace trampoline first, then jump to custom trampoline
>>>>    when ftrace trampoline exits. The current unused register
>>>>    pt_regs->orig_x0 is used as an intermediary for jumping from ftrace
>>>>    trampoline to custom trampoline.
>>>
>>> For those of us not all that familiar with BPF, can you explain *why* you want
>>> this? The above explains what the patch implements, but not why that's useful.
>>>
>>> e.g. is this just to avoid the overhead of the ops list processing in the
>>> regular ftrace code, or is the custom trampoline there to allow you to do
>>> something special?
>>
>> IIUC, ftrace direct call was designed to *remove* the unnecessary
>> overhead of saving regs completely [1][2].
> 
> Ok. My plan is to get rid of most of the register saving generally, so I think
> that aspect can be solved without direct calls.
Looking forward to your new solution.

> 
>> [1]
>> https://lore.kernel.org/all/20191022175052.frjzlnjjfwwfov64@ast-mbp.dhcp.thefacebook.com/
>> [2] https://lore.kernel.org/all/20191108212834.594904349@goodmis.org/
>>
>> This patch itself is just a variant of [3].
>>
>> [3] https://lore.kernel.org/all/20191108213450.891579507@goodmis.org/
>>
>>>
>>> There is another patch series on the list from some of your colleagues which
>>> uses dynamic trampolines to try to avoid that ops list overhead, and it's not
>>> clear to me whether these are trying to solve the largely same problem or
>>> something different. That other thread is at:
>>>
>>>   https://lore.kernel.org/linux-arm-kernel/20220316100132.244849-1-bobo.shaobowang@huawei.com/
>>>
>>> ... and I've added the relevant parties to CC here, since there doesn't seem to
>>> be any overlap in the CC lists of the two threads.
>>
>> We're not working to solve the same problem. The trampoline introduced
>> in this series helps us to monitor kernel function or another bpf prog
>> with bpf, and also helps us to use bpf prog like a normal kernel
>> function pointer.
> 
> Ok, but why is it necessary to have a special trampoline?
> 
> Is that *just* to avoid overhead, or do you need to do something special that
> the regular trampoline won't do?
> 

Sorry for not explaining the problem. The main bpf prog accepts only a
single argument 'ctx' in r1, so to allow kernel code to call bpf prog
transparently, we need a trampoline to convert native calling convention
into BPF calling convention [1].

[1] https://lore.kernel.org/bpf/20191114185720.1641606-5-ast@kernel.org/

For example,

SEC("struct_ops/dctcp_state")
void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state)
{
    // do something
}

The above bpf prog will be compiled to something like this:

dctcp_state:
    r2 = *(u64 *)(r1 + 8)  // new_state
    r1 = *(u64 *)(r1 + 0)  // sk
    ...

It accepts only one argument 'ctx' in r1, and loads the actual arguments
'sk' and 'new_state' from r1 + 0 and r1 + 8, respectively. So before
calling this prog, we need to construct 'ctx' and store its address to r1.
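
Conceptually, the trampoline's job for this example is roughly the
following. This is a C sketch of what the JITed trampoline does; the
trampoline and prog entry names are illustrative.

/* The JITed BPF program entry point (illustrative name). */
extern void bpf_prog_dctcp_state(u64 *ctx);

/* Present dctcp_state()'s native arguments to the BPF prog as 'ctx'. */
static void dctcp_state_tramp(struct sock *sk, __u8 new_state)
{
	u64 ctx[2];

	ctx[0] = (u64)sk;		/* read by the prog from r1 + 0 */
	ctx[1] = (u64)new_state;	/* read by the prog from r1 + 8 */

	bpf_prog_dctcp_state(ctx);	/* address of ctx is passed in r1 */
}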

>>>
>>> In that other thread I've suggested a general approach we could follow at:
>>>   
>>>   https://lore.kernel.org/linux-arm-kernel/YmGF%2FOpIhAF8YeVq@lakrids/
>>>
>>
>> Is it possible for a kernel function to take a long jump to common
>> trampoline when we get a huge kernel image?
> 
> It is possible, but only where the kernel Image itself is massive and the .text
> section exceeds 128MiB, at which point other things break anyway. Practically
> speaking, this doesn't happen for production kernels, or reasonable test
> kernels.
> 

So even for normal kernel functions, we need some way to construct and
destruct long jumps atomically and safely.

> I've been meaning to add some logic to detect this at boot time and disable
> ftrace (or at build time), since live patching would also be broken in that
> case.
>>> As noted in that thread, I have a few concerns which equally apply here:
>>>
>>> * Due to the limited range of BL instructions, it's not always possible to
>>>   patch an ftrace call-site to branch to an arbitrary trampoline. The way this
>>>   works for ftrace today relies upon knowing the set of trampolines at
>>>   compile-time, and allocating module PLTs for those, and that approach cannot
>>>   work reliably for dynamically allocated trampolines.
>>
>> Currently patch 5 returns -ENOTSUPP when long jump is detected, so no
>> bpf trampoline is constructed for out of range patch-site:
>>
>> if (is_long_jump(orig_call, image))
>> 	return -ENOTSUPP;
> 
> Sure, my point is that in practice that means that (from the user's PoV) this
> may randomly fail to work, and I'd like something that we can ensure works
> consistently.
> 

OK, should I suspend this work until you finish refactoring ftrace?

>>>   I'd strongly prefer to avoid custom trampolines unless they're strictly
>>>   necessary for functional reasons, so that we can have this work reliably and
>>>   consistently.
>>
>> bpf trampoline is needed by bpf itself, not to replace ftrace trampolines.
> 
> As above, can you please let me know *why* specifically it is needed? Why can't
> we invoke the BPF code through the usual ops mechanism?
> 
> Is that to avoid overhead, or are there other functional reasons you need a
> special trampoline?
> 
>>> * If this is mostly about avoiding the ops list processing overhead, I believe
>>>   we can implement some custom ops support more generally in ftrace which would
>>>   still use a common trampoline but could directly call into those custom ops.
>>>   I would strongly prefer this over custom trampolines.
>>>
>>> * I'm looking to minimize the set of regs ftrace saves, and never save a full
>>>   pt_regs, since today we (incompletely) fill that with bogus values and cannot
>>>   acquire some state reliably (e.g. PSTATE). I'd like to avoid usage of pt_regs
>>>   unless necessary, and I don't want to add additional reliance upon that
>>>   structure.
>>
>> Even if such a common trampoline is used, bpf trampoline is still
>> necessary since we need to construct custom instructions to implement
>> bpf functions, for example, to implement kernel function pointer with a
>> bpf prog.
> 
> Sorry, but I'm struggling to understand this. What specifically do you need to
> do that means this can't use the same calling convention as the regular ops
> function pointers?
>
> Thanks,
> Mark.
> .
Mark Rutland June 6, 2022, 4:35 p.m. UTC | #6
On Thu, May 26, 2022 at 10:48:05PM +0800, Xu Kuohai wrote:
> On 5/26/2022 6:06 PM, Mark Rutland wrote:
> > On Thu, May 26, 2022 at 05:45:03PM +0800, Xu Kuohai wrote:
> >> On 5/25/2022 9:38 PM, Mark Rutland wrote:
> >>> On Wed, May 18, 2022 at 09:16:33AM -0400, Xu Kuohai wrote:
> >>>> Add ftrace direct support for arm64.
> >>>>
> >>>> 1. When there is custom trampoline only, replace the fentry nop to a
> >>>>    jump instruction that jumps directly to the custom trampoline.
> >>>>
> >>>> 2. When ftrace trampoline and custom trampoline coexist, jump from
> >>>>    fentry to ftrace trampoline first, then jump to custom trampoline
> >>>>    when ftrace trampoline exits. The current unused register
> >>>>    pt_regs->orig_x0 is used as an intermediary for jumping from ftrace
> >>>>    trampoline to custom trampoline.
> >>>
> >>> For those of us not all that familiar with BPF, can you explain *why* you want
> >>> this? The above explains what the patch implements, but not why that's useful.
> >>>
> >>> e.g. is this just to avoid the overhead of the ops list processing in the
> >>> regular ftrace code, or is the custom trampoline there to allow you to do
> >>> something special?
> >>
> >> IIUC, ftrace direct call was designed to *remove* the unnecessary
> >> overhead of saving regs completely [1][2].
> > 
> > Ok. My plan is to get rid of most of the register saving generally, so I think
> > that aspect can be solved without direct calls.
> Looking forward to your new solution.

For the register saving rework, I have a WIP branch on my kernel.org repo:

  https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm64/ftrace/minimal-regs
  git://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git arm64/ftrace/minimal-regs

I'm working on that at the moment along with a per-callsite ops implementation
that would avoid most of the need for custom trampolines (and work with branch
range limitations):

  https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm64/ftrace/per-callsite-ops
  git://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git arm64/ftrace/per-callsite-ops

> >> [1]
> >> https://lore.kernel.org/all/20191022175052.frjzlnjjfwwfov64@ast-mbp.dhcp.thefacebook.com/
> >> [2] https://lore.kernel.org/all/20191108212834.594904349@goodmis.org/
> >>
> >> This patch itself is just a variant of [3].
> >>
> >> [3] https://lore.kernel.org/all/20191108213450.891579507@goodmis.org/
> >>
> >>>
> >>> There is another patch series on the list from some of your colleagues which
> >>> uses dynamic trampolines to try to avoid that ops list overhead, and it's not
> >>> clear to me whether these are trying to solve the largely same problem or
> >>> something different. That other thread is at:
> >>>
> >>>   https://lore.kernel.org/linux-arm-kernel/20220316100132.244849-1-bobo.shaobowang@huawei.com/
> >>>
> >>> ... and I've added the relevant parties to CC here, since there doesn't seem to
> >>> be any overlap in the CC lists of the two threads.
> >>
> >> We're not working to solve the same problem. The trampoline introduced
> >> in this series helps us to monitor kernel function or another bpf prog
> >> with bpf, and also helps us to use bpf prog like a normal kernel
> >> function pointer.
> > 
> > Ok, but why is it necessary to have a special trampoline?
> > 
> > Is that *just* to avoid overhead, or do you need to do something special that
> > the regular trampoline won't do?
> > 
> 
> Sorry for not explaining the problem. The main bpf prog accepts only a
> single argument 'ctx' in r1, so to allow kernel code to call bpf prog
> transparently, we need a trampoline to convert native calling convention
> into BPF calling convention [1].
> 
> [1] https://lore.kernel.org/bpf/20191114185720.1641606-5-ast@kernel.org/

Thanks for the pointer; I'll go page that in.

> For example,
> 
> SEC("struct_ops/dctcp_state")
> void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state)
> {
>     // do something
> }
> 
> The above bpf prog will be compiled to something like this:
> 
> dctcp_state:
>     r2 = *(u64 *)(r1 + 8)  // new_state
>     r1 = *(u64 *)(r1 + 0)  // sk
>     ...
> 
> It accepts only one argument 'ctx' in r1, and loads the actual arguments
> 'sk' and 'new_state' from r1 + 0 and r1 + 8, respectively. So before
> calling this prog, we need to construct 'ctx' and store its address to r1.
> 
> >>>
> >>> In that other thread I've suggested a general approach we could follow at:
> >>>   
> >>>   https://lore.kernel.org/linux-arm-kernel/YmGF%2FOpIhAF8YeVq@lakrids/
> >>
> >> Is it possible for a kernel function to take a long jump to common
> >> trampoline when we get a huge kernel image?
> > 
> > It is possible, but only where the kernel Image itself is massive and the .text
> > section exceeds 128MiB, at which point other things break anyway. Practically
> > speaking, this doesn't happen for production kernels, or reasonable test
> > kernels.
> 
> So even for normal kernel functions, we need some way to construct and
> destruct long jumps atomically and safely.

My point was that case is unrealistic for production kernels, and utterly
broken anyway (and as below I intend to make ftrace detect this and mark itself
as broken).

FWIW, an allmodconfig kernel built with GCC 12.1.0 has a ~30MB .text segment,
so for realistic kernels we have plenty of headroom for normal functions to
reach the in-kernel trampoline.

> > I've been meaning to add some logic to detect this at boot time and disable
> > ftrace (or at build time), since live patching would also be broken in that
> > case.
> >>> As noted in that thread, I have a few concerns which equally apply here:
> >>>
> >>> * Due to the limited range of BL instructions, it's not always possible to
> >>>   patch an ftrace call-site to branch to an arbitrary trampoline. The way this
> >>>   works for ftrace today relies upon knowing the set of trampolines at
> >>>   compile-time, and allocating module PLTs for those, and that approach cannot
> >>>   work reliably for dynamically allocated trampolines.
> >>
> >> Currently patch 5 returns -ENOTSUPP when long jump is detected, so no
> >> bpf trampoline is constructed for out of range patch-site:
> >>
> >> if (is_long_jump(orig_call, image))
> >> 	return -ENOTSUPP;
> > 
> > Sure, my point is that in practice that means that (from the user's PoV) this
> > may randomly fail to work, and I'd like something that we can ensure works
> > consistently.
> > 
> 
> OK, should I suspend this work until you finish refactoring ftrace?

Yes; I'd appreciate if we could hold on this for a bit.

I think with some ground work we can avoid most of the painful edge cases and
might be able to avoid the need for custom trampolines.

Thanks,
Mark.
Xu Kuohai June 9, 2022, 4:27 a.m. UTC | #7
On 6/7/2022 12:35 AM, Mark Rutland wrote:
> On Thu, May 26, 2022 at 10:48:05PM +0800, Xu Kuohai wrote:
>> On 5/26/2022 6:06 PM, Mark Rutland wrote:
>>> On Thu, May 26, 2022 at 05:45:03PM +0800, Xu Kuohai wrote:
>>>> On 5/25/2022 9:38 PM, Mark Rutland wrote:
>>>>> On Wed, May 18, 2022 at 09:16:33AM -0400, Xu Kuohai wrote:
>>>>>> Add ftrace direct support for arm64.
>>>>>>
>>>>>> 1. When there is custom trampoline only, replace the fentry nop to a
>>>>>>    jump instruction that jumps directly to the custom trampoline.
>>>>>>
>>>>>> 2. When ftrace trampoline and custom trampoline coexist, jump from
>>>>>>    fentry to ftrace trampoline first, then jump to custom trampoline
>>>>>>    when ftrace trampoline exits. The current unused register
>>>>>>    pt_regs->orig_x0 is used as an intermediary for jumping from ftrace
>>>>>>    trampoline to custom trampoline.
>>>>>
>>>>> For those of us not all that familiar with BPF, can you explain *why* you want
>>>>> this? The above explains what the patch implements, but not why that's useful.
>>>>>
>>>>> e.g. is this just to avoid the overhead of the ops list processing in the
>>>>> regular ftrace code, or is the custom trampoline there to allow you to do
>>>>> something special?
>>>>
>>>> IIUC, ftrace direct call was designed to *remove* the unnecessary
>>>> overhead of saving regs completely [1][2].
>>>
>>> Ok. My plan is to get rid of most of the register saving generally, so I think
>>> that aspect can be solved without direct calls.
>> Looking forward to your new solution.
> 
> For the register saving rework, I have a WIP branch on my kernel.org repo:
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm64/ftrace/minimal-regs
>   git://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git arm64/ftrace/minimal-regs
> 
> I'm working on that at the moment along with a per-callsite ops implementation
> that would avoid most of the need for custom trampolines (and work with branch
> range limitations):
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm64/ftrace/per-callsite-ops
>   git://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git arm64/ftrace/per-callsite-ops
> 
>>>> [1]
>>>> https://lore.kernel.org/all/20191022175052.frjzlnjjfwwfov64@ast-mbp.dhcp.thefacebook.com/
>>>> [2] https://lore.kernel.org/all/20191108212834.594904349@goodmis.org/
>>>>
>>>> This patch itself is just a variant of [3].
>>>>
>>>> [3] https://lore.kernel.org/all/20191108213450.891579507@goodmis.org/
>>>>
>>>>>
>>>>> There is another patch series on the list from some of your colleagues which
>>>>> uses dynamic trampolines to try to avoid that ops list overhead, and it's not
>>>>> clear to me whether these are trying to solve the largely same problem or
>>>>> something different. That other thread is at:
>>>>>
>>>>>   https://lore.kernel.org/linux-arm-kernel/20220316100132.244849-1-bobo.shaobowang@huawei.com/
>>>>>
>>>>> ... and I've added the relevant parties to CC here, since there doesn't seem to
>>>>> be any overlap in the CC lists of the two threads.
>>>>
>>>> We're not working to solve the same problem. The trampoline introduced
>>>> in this series helps us to monitor kernel function or another bpf prog
>>>> with bpf, and also helps us to use bpf prog like a normal kernel
>>>> function pointer.
>>>
>>> Ok, but why is it necessary to have a special trampoline?
>>>
>>> Is that *just* to avoid overhead, or do you need to do something special that
>>> the regular trampoline won't do?
>>>
>>
>> Sorry for not explaining the problem. The main bpf prog accepts only a
>> single argument 'ctx' in r1, so to allow kernel code to call bpf prog
>> transparently, we need a trampoline to convert native calling convention
>> into BPF calling convention [1].
>>
>> [1] https://lore.kernel.org/bpf/20191114185720.1641606-5-ast@kernel.org/
> 
> Thanks for the pointer; I'll go page that in.
> 
>> For example,
>>
>> SEC("struct_ops/dctcp_state")
>> void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state)
>> {
>>     // do something
>> }
>>
>> The above bpf prog will be compiled to something like this:
>>
>> dctcp_state:
>>     r2 = *(u64 *)(r1 + 8)  // new_state
>>     r1 = *(u64 *)(r1 + 0)  // sk
>>     ...
>>
>> It accepts only one argument 'ctx' in r1, and loads the actual arguments
>> 'sk' and 'new_state' from r1 + 0 and r1 + 8, respectively. So before
>> calling this prog, we need to construct 'ctx' and store its address to r1.
>>
>>>>>
>>>>> In that other thread I've suggested a general approach we could follow at:
>>>>>   
>>>>>   https://lore.kernel.org/linux-arm-kernel/YmGF%2FOpIhAF8YeVq@lakrids/
>>>>
>>>> Is it possible for a kernel function to take a long jump to common
>>>> trampoline when we get a huge kernel image?
>>>
>>> It is possible, but only where the kernel Image itself is massive and the .text
>>> section exceeds 128MiB, at which point other things break anyway. Practically
>>> speaking, this doesn't happen for production kernels, or reasonable test
>>> kernels.
>>
>> So even for normal kernel functions, we need some way to construct and
>> destruct long jumps atomically and safely.
> 
> My point was that case is unrealistic for production kernels, and utterly
> broken anyway (and as below I intend to make ftrace detect this and mark itself
> as broken).
> 
> FWIW, an allmodconfig kernel built with GCC 12.1.0 has a ~30MB .text segment,
> so for realistic kernels we have plenty of headroom for normal functions to
> reach the in-kernel trampoline.
> 
>>> I've been meaning to add some logic to detect this at boot time and disable
>>> ftrace (or at build time), since live patching would also be broken in that
>>> case.
>>>>> As noted in that thread, I have a few concerns which equally apply here:
>>>>>
>>>>> * Due to the limited range of BL instructions, it's not always possible to
>>>>>   patch an ftrace call-site to branch to an arbitrary trampoline. The way this
>>>>>   works for ftrace today relies upon knowing the set of trampolines at
>>>>>   compile-time, and allocating module PLTs for those, and that approach cannot
>>>>>   work reliably for dynamically allocated trampolines.
>>>>
>>>> Currently patch 5 returns -ENOTSUPP when long jump is detected, so no
>>>> bpf trampoline is constructed for out of range patch-site:
>>>>
>>>> if (is_long_jump(orig_call, image))
>>>> 	return -ENOTSUPP;
>>>
>>> Sure, my point is that in practice that means that (from the user's PoV) this
>>> may randomly fail to work, and I'd like something that we can ensure works
>>> consistently.
>>>
>>
>> OK, should I suspend this work until you finish refactoring ftrace?
> 
> Yes; I'd appreciate if we could hold on this for a bit.
> 
> I think with some ground work we can avoid most of the painful edge cases and
> might be able to avoid the need for custom trampolines.
> 

I've read your WIP code, but unfortunately I didn't find any mechanism to
replace bpf trampoline in your code, sorry.

It looks like the bpf trampoline and ftrace work can be done at the same
time. I think for now we can just attach the bpf trampoline to bpf progs.
Once your ftrace work is done, we can add support for attaching bpf
trampolines to regular kernel functions. Is this OK?

> Thanks,
> Mark.
> .
Florent Revest Aug. 9, 2022, 5:03 p.m. UTC | #8
On Thu, Jun 9, 2022 at 6:27 AM Xu Kuohai <xukuohai@huawei.com> wrote:
> On 6/7/2022 12:35 AM, Mark Rutland wrote:
> > On Thu, May 26, 2022 at 10:48:05PM +0800, Xu Kuohai wrote:
> >> On 5/26/2022 6:06 PM, Mark Rutland wrote:
> >>> On Thu, May 26, 2022 at 05:45:03PM +0800, Xu Kuohai wrote:
> >>>> On 5/25/2022 9:38 PM, Mark Rutland wrote:
> >>>>> On Wed, May 18, 2022 at 09:16:33AM -0400, Xu Kuohai wrote:
> >>>>> As noted in that thread, I have a few concerns which equally apply here:
> >>>>>
> >>>>> * Due to the limited range of BL instructions, it's not always possible to
> >>>>>   patch an ftrace call-site to branch to an arbitrary trampoline. The way this
> >>>>>   works for ftrace today relies upon knowing the set of trampolines at
> >>>>>   compile-time, and allocating module PLTs for those, and that approach cannot
> >>>>>   work reliably for dynamically allocated trampolines.
> >>>>
> >>>> Currently patch 5 returns -ENOTSUPP when long jump is detected, so no
> >>>> bpf trampoline is constructed for out of range patch-site:
> >>>>
> >>>> if (is_long_jump(orig_call, image))
> >>>>    return -ENOTSUPP;
> >>>
> >>> Sure, my point is that in practice that means that (from the user's PoV) this
> >>> may randomly fail to work, and I'd like something that we can ensure works
> >>> consistently.
> >>>
> >>
> >> OK, should I suspend this work until you finish refactoring ftrace?
> >
> > Yes; I'd appreciate if we could hold on this for a bit.
> >
> > I think with some ground work we can avoid most of the painful edge cases and
> > might be able to avoid the need for custom trampolines.
> >
>
> I've read your WIP code, but unfortunately I didn't find any mechanism to
> replace bpf trampoline in your code, sorry.
>
> It looks like bpf trampoline and ftrace works can be done at the same
> time. I think for now we can just attach bpf trampoline to bpf prog.
> Once your ftrace work is done, we can add support for attaching bpf
> trampoline to regular kernel function. Is this OK?

Hey Mark and Xu! :)

I'm interested in this feature too and would be happy to help.

I've been trying to understand what you both have in mind to figure out a way
forward, please correct me if I got anything wrong! :)


It looks like, currently, there are three places where an indirection to BPF is
technically possible. Chronologically these are:

- the function's patchsite (currently there are 2 nops, this could become 4
  nops with Mark's series on per call-site ops)

- the ftrace ops (currently called by iterating over a global list but could be
  called more directly with Mark's series on per-call-site ops or by
  dynamically generated branches with Wang's series on dynamic trampolines)

- a ftrace trampoline tail call (currently, this is after restoring a full
  pt_regs but this could become an args only restoration with Mark's series on
  DYNAMIC_FTRACE_WITH_ARGS)


If we first consider the situation when only a BPF program is attached to a
kernel function:
- Using the patchsite for indirection (proposed by Xu, same as on x86)
   Pros:
   - We have BPF trampolines anyway because they are required for orthogonal
     features such as calling BPF programs as functions, so jumping into that
     existing JITed code is straightforward
   - This has the minimum overhead (eg: these trampolines only save the actual
     number of args used by the function in ctx and avoid indirect calls)
   Cons:
   - If the BPF trampoline is JITed outside BL's limits, attachment can
     randomly fail

- Using a ftrace op for indirection (proposed by Mark)
  Pros:
  - BPF doesn't need to care about BL's range, ftrace_caller will be in range
  Cons:
  - The ftrace trampoline would first save all args in an ftrace_regs only for
    the BPF op to then re-save them in a BPF ctx array (as per BPF calling
    convention) so we'd effectively have to do the work of saving args twice
  - BPF currently uses DYNAMIC_FTRACE_WITH_DIRECT_CALLS APIs. Either arm64
    should implement DIRECT_CALLS with... an indirect call :) (that is, the
    arch_ftrace_set_direct_caller op would turn back its ftrace_regs into
    arguments for the BPF trampoline) or BPF would need to use a different
    ftrace API just on arm64 (to define new ops, which, unless if they would be
    dynamically JITed, wouldn't be as performant as the existing BPF
    trampolines)

- Using a ftrace trampoline tail call for indirection (not discussed yet iiuc)
  Pros:
  - BPF also doesn't need to care about BL's range
  - This also leverages the existing BPF trampolines
  Cons:
  - This also does the work of saving/restoring arguments twice
  - DYNAMIC_FTRACE_WITH_DIRECT_CALLS depends on DYNAMIC_FTRACE_WITH_REGS now
    although in practice the registers kept by DYNAMIC_FTRACE_WITH_ARGS
    should be enough to call BPF trampolines

If we consider the situation when both ftrace ops and BPF programs are attached
to a kernel function:
- Using the patchsite for indirection can't solve this

- Using a ftrace op for indirection (proposed by Mark) or using a ftrace
  trampoline tail call as an indirection (proposed by Xu, same as on x86) have
  the same pros & cons as in the BPF only situation except that this time we
  pay the cost of registers saving twice for good reasons (we need args in both
  ftrace_regs and the BPF ctx array formats anyway)


Unless I'm missing something, it sounds like the following approach would work:
- Always patch patchsites with calls to ftrace trampolines (within BL ranges)
- Always go through ops and have arch_ftrace_set_direct_caller set
  ftrace_regs->direct_call (instead of pt_regs->orig_x0 in this patch)
- If ftrace_regs->direct_call != 0 at the end of the ftrace trampoline, tail
  call it
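
Put together, the exit path of the common trampoline would conceptually do
the following. This is a C sketch for readability; 'direct_call' is the
assumed field set by arch_ftrace_set_direct_caller(), and the real
implementation would be a few instructions in entry-ftrace.S, much like the
hunk in this patch.

/* Sketch only: decide where the ftrace trampoline exits to. */
struct tramp_exit_state {
	unsigned long pc;		/* the traced function's patched entry */
	unsigned long direct_call;	/* custom trampoline address, 0 if none */
};

static unsigned long pick_exit_target(const struct tramp_exit_state *st)
{
	/*
	 * Tail-call the custom (BPF) trampoline if one was registered,
	 * otherwise return into the traced function as usual.
	 */
	return st->direct_call ? st->direct_call : st->pc;
}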

Once Mark's series on DYNAMIC_FTRACE_WITH_ARGS is merged, we would need to have
DYNAMIC_FTRACE_WITH_DIRECT_CALLS
  depend on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
BPF trampolines (the only users of this API now) only care about args to the
attachment point anyway so I think this would work transparently?

Once Mark's series on per-callsite ops is merged, the second step (going
through ops) would be significantly faster in the situation where only one
program is used, therefore one arch_ftrace_set_direct_caller op.

Once Wang's series on dynamic trampolines is merged, the second step (going
through ops) would also be significantly faster in the case when multiple ops
are attached.


What are your thoughts? If this sounds somewhat sane, I'm happy to help out
with the implementation as well :)

Thanks!
Florent
Xu Kuohai Aug. 10, 2022, 8:10 a.m. UTC | #9
On 8/10/2022 1:03 AM, Florent Revest wrote:
> On Thu, Jun 9, 2022 at 6:27 AM Xu Kuohai <xukuohai@huawei.com> wrote:
>> On 6/7/2022 12:35 AM, Mark Rutland wrote:
>>> On Thu, May 26, 2022 at 10:48:05PM +0800, Xu Kuohai wrote:
>>>> On 5/26/2022 6:06 PM, Mark Rutland wrote:
>>>>> On Thu, May 26, 2022 at 05:45:03PM +0800, Xu Kuohai wrote:
>>>>>> On 5/25/2022 9:38 PM, Mark Rutland wrote:
>>>>>>> On Wed, May 18, 2022 at 09:16:33AM -0400, Xu Kuohai wrote:
> >>>>>>> As noted in that thread, I have a few concerns which equally apply here:
>>>>>>>
>>>>>>> * Due to the limited range of BL instructions, it's not always possible to
>>>>>>>    patch an ftrace call-site to branch to an arbitrary trampoline. The way this
>>>>>>>    works for ftrace today relies upon knowingthe set of trampolines at
>>>>>>>    compile-time, and allocating module PLTs for those, and that approach cannot
>>>>>>>    work reliably for dynanically allocated trampolines.
>>>>>>
>>>>>> Currently patch 5 returns -ENOTSUPP when long jump is detected, so no
>>>>>> bpf trampoline is constructed for out of range patch-site:
>>>>>>
>>>>>> if (is_long_jump(orig_call, image))
>>>>>>     return -ENOTSUPP;
>>>>>
>>>>> Sure, my point is that in practice that means that (from the user's PoV) this
>>>>> may randomly fail to work, and I'd like something that we can ensure works
>>>>> consistently.
>>>>>
>>>>
>>>> OK, should I suspend this work until you finish refactoring ftrace?
>>>
>>> Yes; I'd appreciate if we could hold on this for a bit.
>>>
>>> I think with some ground work we can avoid most of the painful edge cases and
>>> might be able to avoid the need for custom trampolines.
>>>
>>
>> I've read your WIP code, but unfortunately I didn't find any mechanism to
>> replace bpf trampoline in your code, sorry.
>>
>> It looks like bpf trampoline and ftrace works can be done at the same
>> time. I think for now we can just attach bpf trampoline to bpf prog.
>> Once your ftrace work is done, we can add support for attaching bpf
>> trampoline to regular kernel function. Is this OK?
> 
> Hey Mark and Xu! :)
> 
> I'm interested in this feature too and would be happy to help.
> 
> I've been trying to understand what you both have in mind to figure out a way
> forward, please correct me if I got anything wrong! :)
> 
> 
> It looks like, currently, there are three places where an indirection to BPF is
> technically possible. Chronologically these are:
> 
> - the function's patchsite (currently there are 2 nops, this could become 4
>    nops with Mark's series on per call-site ops)
> 
> - the ftrace ops (currently called by iterating over a global list but could be
>    called more directly with Mark's series on per-call-site ops or by
>    dynamically generated branches with Wang's series on dynamic trampolines)
> 
> - a ftrace trampoline tail call (currently, this is after restoring a full
>    pt_regs but this could become an args only restoration with Mark's series on
>    DYNAMIC_FTRACE_WITH_ARGS)
> 
> 
> If we first consider the situation when only a BPF program is attached to a
> kernel function:
> - Using the patchsite for indirection (proposed by Xu, same as on x86)
>     Pros:
>     - We have BPF trampolines anyway because they are required for orthogonal
>       features such as calling BPF programs as functions, so jumping into that
>       existing JITed code is straightforward
>     - This has the minimum overhead (eg: these trampolines only save the actual
>       number of args used by the function in ctx and avoid indirect calls)
>     Cons:
>     - If the BPF trampoline is JITed outside BL's limits, attachment can
>       randomly fail
> 
> - Using a ftrace op for indirection (proposed by Mark)
>    Pros:
>    - BPF doesn't need to care about BL's range, ftrace_caller will be in range
>    Cons:
>    - The ftrace trampoline would first save all args in an ftrace_regs only for
>      the BPF op to then re-save them in a BPF ctx array (as per BPF calling
>      convention) so we'd effectively have to do the work of saving args twice
>    - BPF currently uses DYNAMIC_FTRACE_WITH_DIRECT_CALLS APIs. Either arm64
>      should implement DIRECT_CALLS with... an indirect call :) (that is, the
>      arch_ftrace_set_direct_caller op would turn back its ftrace_regs into
>      arguments for the BPF trampoline) or BPF would need to use a different
>      ftrace API just on arm64 (to define new ops, which, unless if they would be
>      dynamically JITed, wouldn't be as performant as the existing BPF
>      trampolines)
> 
> - Using a ftrace trampoline tail call for indirection (not discussed yet iiuc)
>    Pros:
>    - BPF also doesn't need to care about BL's range
>    - This also leverages the existing BPF trampolines
>    Cons:
>    - This also does the work of saving/restoring arguments twice
>    - DYNAMIC_FTRACE_WITH_DIRECT_CALLS depends on DYNAMIC_FTRACE_WITH_REGS now
>      although in practice the registers kept by DYNAMIC_FTRACE_WITH_ARGS
>      should be enough to call BPF trampolines
> 
> If we consider the situation when both ftrace ops and BPF programs are attached
> to a kernel function:
> - Using the patchsite for indirection can't solve this
> 
> - Using a ftrace op for indirection (proposed by Mark) or using a ftrace
>    trampoline tail call as an indirection (proposed by Xu, same as on x86) have
>    the same pros & cons as in the BPF only situation except that this time we
>    pay the cost of registers saving twice for good reasons (we need args in both
>    ftrace_regs and the BPF ctx array formats anyway)
> 
> 
> Unless I'm missing something, it sounds like the following approach would work:
> - Always patch patchsites with calls to ftrace trampolines (within BL ranges)
> - Always go through ops and have arch_ftrace_set_direct_caller set
>    ftrace_regs->direct_call (instead of pt_regs->orig_x0 in this patch)
> - If ftrace_regs->direct_call != 0 at the end of the ftrace trampoline, tail
>    call it
> 
> Once Mark's series on DYNAMIC_FTRACE_WITH_ARGS is merged, we would need to have
> DYNAMIC_FTRACE_WITH_DIRECT_CALLS
>    depend on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
> BPF trampolines (the only users of this API now) only care about args to the
> attachment point anyway so I think this would work transparently?
> 
> Once Mark's series on per-callsite ops is merged, the second step (going
> through ops) would be significantly faster in the situation where only one
> program is used, therefore one arch_ftrace_set_direct_caller op.
> 
> Once Wang's series on dynamic trampolines is merged, the second step (going
> through ops) would also be significantly faster in the case when multiple ops
> are attached.
> 
> 
> What are your thoughts? If this sounds somewhat sane, I'm happy to help out
> with the implementation as well :)
> 

Hi Florent,

I'm struggling with how to attach bpf trampoline to regular kernel functions. I
think your suggestion is fine. Thanks for the help!

> Thanks!
> Florent
> .

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 57c4c995965f..81cc330daafc 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -177,6 +177,8 @@  config ARM64
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS \
 		if $(cc-option,-fpatchable-function-entry=2)
+	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS \
+		if DYNAMIC_FTRACE_WITH_REGS
 	select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \
 		if DYNAMIC_FTRACE_WITH_REGS
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index 1494cfa8639b..14a35a5df0a1 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -78,6 +78,18 @@  static inline unsigned long ftrace_call_adjust(unsigned long addr)
 	return addr;
 }
 
+#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs,
+						 unsigned long addr)
+{
+	/*
+	 * Place custom trampoline address in regs->orig_x0 to let ftrace
+	 * trampoline jump to it.
+	 */
+	regs->orig_x0 = addr;
+}
+#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
 struct dyn_ftrace;
 int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 1197e7679882..b1ed0bf01c59 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -80,6 +80,7 @@  int main(void)
   DEFINE(S_SDEI_TTBR1,		offsetof(struct pt_regs, sdei_ttbr1));
   DEFINE(S_PMR_SAVE,		offsetof(struct pt_regs, pmr_save));
   DEFINE(S_STACKFRAME,		offsetof(struct pt_regs, stackframe));
+  DEFINE(S_ORIG_X0,		offsetof(struct pt_regs, orig_x0));
   DEFINE(PT_REGS_SIZE,		sizeof(struct pt_regs));
   BLANK();
 #ifdef CONFIG_COMPAT
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index e535480a4069..dfe62c55e3a2 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -60,6 +60,9 @@ 
 	str	x29, [sp, #S_FP]
 	.endif
 
+	/* Set orig_x0 to zero  */
+	str     xzr, [sp, #S_ORIG_X0]
+
 	/* Save the callsite's SP and LR */
 	add	x10, sp, #(PT_REGS_SIZE + 16)
 	stp	x9, x10, [sp, #S_LR]
@@ -119,12 +122,21 @@  ftrace_common_return:
 	/* Restore the callsite's FP, LR, PC */
 	ldr	x29, [sp, #S_FP]
 	ldr	x30, [sp, #S_LR]
-	ldr	x9, [sp, #S_PC]
-
+	ldr	x10, [sp, #S_PC]
+
+	ldr	x11, [sp, #S_ORIG_X0]
+	cbz	x11, 1f
+	/* Set x9 to parent ip before jump to custom trampoline */
+	mov	x9,  x30
+	/* Set lr to self ip */
+	ldr	x30, [sp, #S_PC]
+	/* Set x10 (used for return address) to custom trampoline */
+	mov	x10, x11
+1:
 	/* Restore the callsite's SP */
 	add	sp, sp, #PT_REGS_SIZE + 16
 
-	ret	x9
+	ret	x10
 SYM_CODE_END(ftrace_common)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER