
[bpf-next,22/24] s390/bpf: Implement arch_prepare_bpf_trampoline()

Message ID 20230125213817.1424447-23-iii@linux.ibm.com (mailing list archive)
State Superseded
Delegated to: BPF
Series Support bpf trampoline for s390x

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count fail Series longer than 15 patches (and no cover letter)
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1420 this patch: 1420
netdev/cc_maintainers warning 12 maintainers not CCed: sdf@google.com kpsingh@kernel.org jolsa@kernel.org borntraeger@linux.ibm.com martin.lau@linux.dev svens@linux.ibm.com song@kernel.org john.fastabend@gmail.com linux-s390@vger.kernel.org haoluo@google.com agordeev@linux.ibm.com yhs@fb.com
netdev/build_clang success Errors and warnings before: 149 this patch: 149
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1415 this patch: 1415
netdev/checkpatch warning CHECK: architecture specific defines should be avoided; WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16

Commit Message

Ilya Leoshkevich Jan. 25, 2023, 9:38 p.m. UTC
arch_prepare_bpf_trampoline() is used for direct attachment of eBPF
programs to various places, bypassing kprobes. It's responsible for
calling a number of eBPF programs before, instead of, and/or after
whatever they are attached to.

Add an s390x implementation, paying attention to the following:

- Reuse the existing JIT infrastructure, where possible.
- Like the existing JIT, prefer making multiple passes instead of
  backpatching. Currently 2 passes are enough. If a literal pool is
  introduced, this needs to be raised to 3. However, at the moment
  adding a literal pool only makes the code larger. If branch
  shortening is introduced, the number of passes needs to be
  increased even further.
- Support both regular and ftrace calling conventions, depending on
  the trampoline flags.
- Use expolines for indirect calls.
- Handle the mismatch between the eBPF and the s390x ABIs.
- Sign-extend fmod_ret return values.

invoke_bpf_prog() produces about 120 bytes; it might be possible to
slightly optimize this, but reaching 50 bytes, like on x86_64, looks
unrealistic: just loading cookie, __bpf_prog_enter, bpf_func, insnsi
and __bpf_prog_exit as literals already takes at least 5 * 12 = 60
bytes, and we can't use relative addressing for most of them.
Therefore, lower BPF_MAX_TRAMP_LINKS on s390x.
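
For reference, the 12 bytes per literal correspond to the load_imm64()
helper introduced by this patch, which materializes a 64-bit immediate
with two 6-byte instructions:

static void load_imm64(struct bpf_jit *jit, int dst_reg, u64 val)
{
	/* llihf %dst_reg,val_hi - 6 bytes */
	EMIT6_IMM(0xc00e0000, dst_reg, (val >> 32));
	/* oilf %dst_reg,val_lo - 6 bytes */
	EMIT6_IMM(0xc00d0000, dst_reg, val);
}

Five such literals per invoke_bpf_prog() account for the
5 * 12 = 60 bytes above.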

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
---
 arch/s390/net/bpf_jit_comp.c | 535 +++++++++++++++++++++++++++++++++--
 include/linux/bpf.h          |   4 +
 2 files changed, 517 insertions(+), 22 deletions(-)

Comments

Andrii Nakryiko Jan. 26, 2023, 1:15 a.m. UTC | #1
On Wed, Jan 25, 2023 at 1:39 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>
> arch_prepare_bpf_trampoline() is used for direct attachment of eBPF
> programs to various places, bypassing kprobes. It's responsible for
> calling a number of eBPF programs before, instead and/or after
> whatever they are attached to.
>
> Add a s390x implementation, paying attention to the following:
>
> - Reuse the existing JIT infrastructure, where possible.
> - Like the existing JIT, prefer making multiple passes instead of
>   backpatching. Currently 2 passes is enough. If literal pool is
>   introduced, this needs to be raised to 3. However, at the moment
>   adding literal pool only makes the code larger. If branch
>   shortening is introduced, the number of passes needs to be
>   increased even further.
> - Support both regular and ftrace calling conventions, depending on
>   the trampoline flags.
> - Use expolines for indirect calls.
> - Handle the mismatch between the eBPF and the s390x ABIs.
> - Sign-extend fmod_ret return values.
>
> invoke_bpf_prog() produces about 120 bytes; it might be possible to
> slightly optimize this, but reaching 50 bytes, like on x86_64, looks
> unrealistic: just loading cookie, __bpf_prog_enter, bpf_func, insnsi
> and __bpf_prog_exit as literals already takes at least 5 * 12 = 60
> bytes, and we can't use relative addressing for most of them.
> Therefore, lower BPF_MAX_TRAMP_LINKS on s390x.
>
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
>  arch/s390/net/bpf_jit_comp.c | 535 +++++++++++++++++++++++++++++++++--
>  include/linux/bpf.h          |   4 +
>  2 files changed, 517 insertions(+), 22 deletions(-)
>

[...]

> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index cf89504c8dda..52ff43bbf996 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -943,7 +943,11 @@ struct btf_func_model {
>  /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
>   * bytes on x86.
>   */
> +#if defined(__s390x__)
> +#define BPF_MAX_TRAMP_LINKS 27
> +#else
>  #define BPF_MAX_TRAMP_LINKS 38
> +#endif

if we turn this into enum definition, then on selftests side we can
just discover this from vmlinux BTF, instead of hard-coding
arch-specific constants. Thoughts?

>
>  struct bpf_tramp_links {
>         struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS];
> --
> 2.39.1
>
Ilya Leoshkevich Jan. 26, 2023, 2:30 p.m. UTC | #2
On Wed, 2023-01-25 at 17:15 -0800, Andrii Nakryiko wrote:
> On Wed, Jan 25, 2023 at 1:39 PM Ilya Leoshkevich <iii@linux.ibm.com>
> wrote:
> > 
> > arch_prepare_bpf_trampoline() is used for direct attachment of eBPF
> > programs to various places, bypassing kprobes. It's responsible for
> > calling a number of eBPF programs before, instead and/or after
> > whatever they are attached to.
> > 
> > Add a s390x implementation, paying attention to the following:
> > 
> > - Reuse the existing JIT infrastructure, where possible.
> > - Like the existing JIT, prefer making multiple passes instead of
> >   backpatching. Currently 2 passes is enough. If literal pool is
> >   introduced, this needs to be raised to 3. However, at the moment
> >   adding literal pool only makes the code larger. If branch
> >   shortening is introduced, the number of passes needs to be
> >   increased even further.
> > - Support both regular and ftrace calling conventions, depending on
> >   the trampoline flags.
> > - Use expolines for indirect calls.
> > - Handle the mismatch between the eBPF and the s390x ABIs.
> > - Sign-extend fmod_ret return values.
> > 
> > invoke_bpf_prog() produces about 120 bytes; it might be possible to
> > slightly optimize this, but reaching 50 bytes, like on x86_64,
> > looks
> > unrealistic: just loading cookie, __bpf_prog_enter, bpf_func,
> > insnsi
> > and __bpf_prog_exit as literals already takes at least 5 * 12 = 60
> > bytes, and we can't use relative addressing for most of them.
> > Therefore, lower BPF_MAX_TRAMP_LINKS on s390x.
> > 
> > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > ---
> >  arch/s390/net/bpf_jit_comp.c | 535
> > +++++++++++++++++++++++++++++++++--
> >  include/linux/bpf.h          |   4 +
> >  2 files changed, 517 insertions(+), 22 deletions(-)
> > 
> 
> [...]
> 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index cf89504c8dda..52ff43bbf996 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -943,7 +943,11 @@ struct btf_func_model {
> >  /* Each call __bpf_prog_enter + call bpf_func + call
> > __bpf_prog_exit is ~50
> >   * bytes on x86.
> >   */
> > +#if defined(__s390x__)
> > +#define BPF_MAX_TRAMP_LINKS 27
> > +#else
> >  #define BPF_MAX_TRAMP_LINKS 38
> > +#endif
> 
> if we turn this into enum definition, then on selftests side we can
> just discover this from vmlinux BTF, instead of hard-coding
> arch-specific constants. Thoughts?

This seems to work. I can replace 3/24 and 4/24 with that in v2.
Some random notes:

- It doesn't seem to be possible to #include "vmlinux.h" into tests,
  so one has to go through the btf__load_vmlinux_btf() dance and
  allocate the fd arrays dynamically (a minimal sketch of that dance
  follows these notes).

- One has to give this enum an otherwise unnecessary name, so that
  it's easy to find. This doesn't seem like a big deal though:

enum bpf_max_tramp_links {
#if defined(__s390x__)
	BPF_MAX_TRAMP_LINKS = 27,
#else
	BPF_MAX_TRAMP_LINKS = 38,
#endif
};

- An alternative might be to expose this via /proc, since the users
  might be interested in it too.
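
For illustration, a minimal sketch of the btf__load_vmlinux_btf()
dance from the first note (libbpf calls; error handling abbreviated,
and get_bpf_max_tramp_links_from() is the helper sketched later in
this thread):

	struct btf *btf;
	int max_links;

	btf = btf__load_vmlinux_btf();
	if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf"))
		return;
	max_links = get_bpf_max_tramp_links_from(btf);
	btf__free(btf);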

> > 
> >  struct bpf_tramp_links {
> >         struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS];
> > --
> > 2.39.1
> >
Andrii Nakryiko Jan. 26, 2023, 7:06 p.m. UTC | #3
On Thu, Jan 26, 2023 at 6:30 AM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>
> On Wed, 2023-01-25 at 17:15 -0800, Andrii Nakryiko wrote:
> > On Wed, Jan 25, 2023 at 1:39 PM Ilya Leoshkevich <iii@linux.ibm.com>
> > wrote:
> > >
> > > arch_prepare_bpf_trampoline() is used for direct attachment of eBPF
> > > programs to various places, bypassing kprobes. It's responsible for
> > > calling a number of eBPF programs before, instead and/or after
> > > whatever they are attached to.
> > >
> > > Add a s390x implementation, paying attention to the following:
> > >
> > > - Reuse the existing JIT infrastructure, where possible.
> > > - Like the existing JIT, prefer making multiple passes instead of
> > >   backpatching. Currently 2 passes is enough. If literal pool is
> > >   introduced, this needs to be raised to 3. However, at the moment
> > >   adding literal pool only makes the code larger. If branch
> > >   shortening is introduced, the number of passes needs to be
> > >   increased even further.
> > > - Support both regular and ftrace calling conventions, depending on
> > >   the trampoline flags.
> > > - Use expolines for indirect calls.
> > > - Handle the mismatch between the eBPF and the s390x ABIs.
> > > - Sign-extend fmod_ret return values.
> > >
> > > invoke_bpf_prog() produces about 120 bytes; it might be possible to
> > > slightly optimize this, but reaching 50 bytes, like on x86_64,
> > > looks
> > > unrealistic: just loading cookie, __bpf_prog_enter, bpf_func,
> > > insnsi
> > > and __bpf_prog_exit as literals already takes at least 5 * 12 = 60
> > > bytes, and we can't use relative addressing for most of them.
> > > Therefore, lower BPF_MAX_TRAMP_LINKS on s390x.
> > >
> > > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > > ---
> > >  arch/s390/net/bpf_jit_comp.c | 535
> > > +++++++++++++++++++++++++++++++++--
> > >  include/linux/bpf.h          |   4 +
> > >  2 files changed, 517 insertions(+), 22 deletions(-)
> > >
> >
> > [...]
> >
> > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > index cf89504c8dda..52ff43bbf996 100644
> > > --- a/include/linux/bpf.h
> > > +++ b/include/linux/bpf.h
> > > @@ -943,7 +943,11 @@ struct btf_func_model {
> > >  /* Each call __bpf_prog_enter + call bpf_func + call
> > > __bpf_prog_exit is ~50
> > >   * bytes on x86.
> > >   */
> > > +#if defined(__s390x__)
> > > +#define BPF_MAX_TRAMP_LINKS 27
> > > +#else
> > >  #define BPF_MAX_TRAMP_LINKS 38
> > > +#endif
> >
> > if we turn this into enum definition, then on selftests side we can
> > just discover this from vmlinux BTF, instead of hard-coding
> > arch-specific constants. Thoughts?
>
> This seems to work. I can replace 3/24 and 4/24 with that in v2.
> Some random notes:
>
> - It doesn't seem to be possible to #include "vlinux.h" into tests,
>   so one has to go through the btf__load_vmlinux_btf() dance and
>   allocate the fd arrays dynamically.

yes, you can't include vmlinux.h into user-space code, of course. And
yes it's true about needing to use btf__load_vmlinux_btf().

But I didn't get what you are saying about fd arrays, tbh. Can you
please elaborate?

>
> - One has to give this enum an otherwise unnecessary name, so that
>   it's easy to find. This doesn't seem like a big deal though:
>
> enum bpf_max_tramp_links {

not really, you can keep it an anonymous enum. We do that in
include/uapi/linux/bpf.h for a lot of constants
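
For example, the anonymous form of the snippet below would be (a
sketch; only the enum name is dropped):

enum {
#if defined(__s390x__)
	BPF_MAX_TRAMP_LINKS = 27,
#else
	BPF_MAX_TRAMP_LINKS = 38,
#endif
};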

> #if defined(__s390x__)
>         BPF_MAX_TRAMP_LINKS = 27,
> #else
>         BPF_MAX_TRAMP_LINKS = 38,
> #endif
> };
>
> - An alternative might be to expose this via /proc, since the users
>   might be interested in it too.

I'd say let's not, there is no need, having it in BTF is more than
enough for testing purposes

>
> > >
> > >  struct bpf_tramp_links {
> > >         struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS];
> > > --
> > > 2.39.1
> > >
>
Ilya Leoshkevich Jan. 27, 2023, 11:15 a.m. UTC | #4
On Thu, 2023-01-26 at 11:06 -0800, Andrii Nakryiko wrote:
> On Thu, Jan 26, 2023 at 6:30 AM Ilya Leoshkevich <iii@linux.ibm.com>
> wrote:
> > 
> > On Wed, 2023-01-25 at 17:15 -0800, Andrii Nakryiko wrote:
> > > On Wed, Jan 25, 2023 at 1:39 PM Ilya Leoshkevich
> > > <iii@linux.ibm.com>
> > > wrote:
> > > > 
> > > > arch_prepare_bpf_trampoline() is used for direct attachment of
> > > > eBPF
> > > > programs to various places, bypassing kprobes. It's responsible
> > > > for
> > > > calling a number of eBPF programs before, instead and/or after
> > > > whatever they are attached to.
> > > > 
> > > > Add a s390x implementation, paying attention to the following:
> > > > 
> > > > - Reuse the existing JIT infrastructure, where possible.
> > > > - Like the existing JIT, prefer making multiple passes instead
> > > > of
> > > >   backpatching. Currently 2 passes is enough. If literal pool
> > > > is
> > > >   introduced, this needs to be raised to 3. However, at the
> > > > moment
> > > >   adding literal pool only makes the code larger. If branch
> > > >   shortening is introduced, the number of passes needs to be
> > > >   increased even further.
> > > > - Support both regular and ftrace calling conventions,
> > > > depending on
> > > >   the trampoline flags.
> > > > - Use expolines for indirect calls.
> > > > - Handle the mismatch between the eBPF and the s390x ABIs.
> > > > - Sign-extend fmod_ret return values.
> > > > 
> > > > invoke_bpf_prog() produces about 120 bytes; it might be
> > > > possible to
> > > > slightly optimize this, but reaching 50 bytes, like on x86_64,
> > > > looks
> > > > unrealistic: just loading cookie, __bpf_prog_enter, bpf_func,
> > > > insnsi
> > > > and __bpf_prog_exit as literals already takes at least 5 * 12 =
> > > > 60
> > > > bytes, and we can't use relative addressing for most of them.
> > > > Therefore, lower BPF_MAX_TRAMP_LINKS on s390x.
> > > > 
> > > > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > > > ---
> > > >  arch/s390/net/bpf_jit_comp.c | 535
> > > > +++++++++++++++++++++++++++++++++--
> > > >  include/linux/bpf.h          |   4 +
> > > >  2 files changed, 517 insertions(+), 22 deletions(-)
> > > > 
> > > 
> > > [...]
> > > 
> > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > index cf89504c8dda..52ff43bbf996 100644
> > > > --- a/include/linux/bpf.h
> > > > +++ b/include/linux/bpf.h
> > > > @@ -943,7 +943,11 @@ struct btf_func_model {
> > > >  /* Each call __bpf_prog_enter + call bpf_func + call
> > > > __bpf_prog_exit is ~50
> > > >   * bytes on x86.
> > > >   */
> > > > +#if defined(__s390x__)
> > > > +#define BPF_MAX_TRAMP_LINKS 27
> > > > +#else
> > > >  #define BPF_MAX_TRAMP_LINKS 38
> > > > +#endif
> > > 
> > > if we turn this into enum definition, then on selftests side we
> > > can
> > > just discover this from vmlinux BTF, instead of hard-coding
> > > arch-specific constants. Thoughts?
> > 
> > This seems to work. I can replace 3/24 and 4/24 with that in v2.
> > Some random notes:
> > 
> > - It doesn't seem to be possible to #include "vlinux.h" into tests,
> >   so one has to go through the btf__load_vmlinux_btf() dance and
> >   allocate the fd arrays dynamically.
> 
> yes, you can't include vmlinux.h into user-space code, of course. And
> yes it's true about needing to use btf__load_vmlinux_btf().
> 
> But I didn't get what you are saying about fd arrays, tbh. Can you
> please elaborate?

That's a really minor thing; fexit_fd and link_fd in fexit_stress
now need to be allocated dynamically.
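
For illustration, a sketch of that change in fexit_stress, sizing the
arrays with the value read from vmlinux BTF via the helper shown below
(error paths abbreviated):

	int nr = get_bpf_max_tramp_links_from(btf);
	int *fexit_fd = calloc(nr, sizeof(*fexit_fd));
	int *link_fd = calloc(nr, sizeof(*link_fd));

	if (!ASSERT_OK_PTR(fexit_fd, "fexit_fd") ||
	    !ASSERT_OK_PTR(link_fd, "link_fd"))
		return;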

> > - One has to give this enum an otherwise unnecessary name, so that
> >   it's easy to find. This doesn't seem like a big deal though:
> > 
> > enum bpf_max_tramp_links {
> 
> not really, you can keep it anonymous enum. We do that in
> include/uapi/linux/bpf.h for a lot of constants

How would you find it then? My current code is:

int get_bpf_max_tramp_links_from(struct btf *btf)
{
        const struct btf_enum *e;
        const struct btf_type *t;
        const char *name;
        int id;

        id = btf__find_by_name_kind(btf, "bpf_max_tramp_links",
                                    BTF_KIND_ENUM);
        if (!ASSERT_GT(id, 0, "bpf_max_tramp_links id"))
                return -1;
        t = btf__type_by_id(btf, id);
        if (!ASSERT_OK_PTR(t, "bpf_max_tramp_links type"))
                return -1;
        if (!ASSERT_EQ(btf_vlen(t), 1, "bpf_max_tramp_links vlen"))
                return -1;
        e = btf_enum(t);
        if (!ASSERT_OK_PTR(e, "bpf_max_tramp_links[0]"))
                return -1;
        name = btf__name_by_offset(btf, e->name_off);
        if (!ASSERT_OK_PTR(name, "bpf_max_tramp_links[0].name_off") ||
            !ASSERT_STREQ(name, "BPF_MAX_TRAMP_LINKS",
                          "BPF_MAX_TRAMP_LINKS"))
                return -1;

        return e->val;
}

Is there a way to bypass looking up the enum, and go straight for the
named member?

> > #if defined(__s390x__)
> >         BPF_MAX_TRAMP_LINKS = 27,
> > #else
> >         BPF_MAX_TRAMP_LINKS = 38,
> > #endif
> > };
> > 
> > - An alternative might be to expose this via /proc, since the users
> >   might be interested in it too.
> 
> I'd say let's not, there is no need, having it in BTF is more than
> enough for testing purposes

Fair enough.
>
Andrii Nakryiko Jan. 27, 2023, 5:30 p.m. UTC | #5
On Fri, Jan 27, 2023 at 3:15 AM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>
> On Thu, 2023-01-26 at 11:06 -0800, Andrii Nakryiko wrote:
> > On Thu, Jan 26, 2023 at 6:30 AM Ilya Leoshkevich <iii@linux.ibm.com>
> > wrote:
> > >
> > > On Wed, 2023-01-25 at 17:15 -0800, Andrii Nakryiko wrote:
> > > > On Wed, Jan 25, 2023 at 1:39 PM Ilya Leoshkevich
> > > > <iii@linux.ibm.com>
> > > > wrote:
> > > > >
> > > > > arch_prepare_bpf_trampoline() is used for direct attachment of
> > > > > eBPF
> > > > > programs to various places, bypassing kprobes. It's responsible
> > > > > for
> > > > > calling a number of eBPF programs before, instead and/or after
> > > > > whatever they are attached to.
> > > > >
> > > > > Add a s390x implementation, paying attention to the following:
> > > > >
> > > > > - Reuse the existing JIT infrastructure, where possible.
> > > > > - Like the existing JIT, prefer making multiple passes instead
> > > > > of
> > > > >   backpatching. Currently 2 passes is enough. If literal pool
> > > > > is
> > > > >   introduced, this needs to be raised to 3. However, at the
> > > > > moment
> > > > >   adding literal pool only makes the code larger. If branch
> > > > >   shortening is introduced, the number of passes needs to be
> > > > >   increased even further.
> > > > > - Support both regular and ftrace calling conventions,
> > > > > depending on
> > > > >   the trampoline flags.
> > > > > - Use expolines for indirect calls.
> > > > > - Handle the mismatch between the eBPF and the s390x ABIs.
> > > > > - Sign-extend fmod_ret return values.
> > > > >
> > > > > invoke_bpf_prog() produces about 120 bytes; it might be
> > > > > possible to
> > > > > slightly optimize this, but reaching 50 bytes, like on x86_64,
> > > > > looks
> > > > > unrealistic: just loading cookie, __bpf_prog_enter, bpf_func,
> > > > > insnsi
> > > > > and __bpf_prog_exit as literals already takes at least 5 * 12 =
> > > > > 60
> > > > > bytes, and we can't use relative addressing for most of them.
> > > > > Therefore, lower BPF_MAX_TRAMP_LINKS on s390x.
> > > > >
> > > > > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > > > > ---
> > > > >  arch/s390/net/bpf_jit_comp.c | 535
> > > > > +++++++++++++++++++++++++++++++++--
> > > > >  include/linux/bpf.h          |   4 +
> > > > >  2 files changed, 517 insertions(+), 22 deletions(-)
> > > > >
> > > >
> > > > [...]
> > > >
> > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > index cf89504c8dda..52ff43bbf996 100644
> > > > > --- a/include/linux/bpf.h
> > > > > +++ b/include/linux/bpf.h
> > > > > @@ -943,7 +943,11 @@ struct btf_func_model {
> > > > >  /* Each call __bpf_prog_enter + call bpf_func + call
> > > > > __bpf_prog_exit is ~50
> > > > >   * bytes on x86.
> > > > >   */
> > > > > +#if defined(__s390x__)
> > > > > +#define BPF_MAX_TRAMP_LINKS 27
> > > > > +#else
> > > > >  #define BPF_MAX_TRAMP_LINKS 38
> > > > > +#endif
> > > >
> > > > if we turn this into enum definition, then on selftests side we
> > > > can
> > > > just discover this from vmlinux BTF, instead of hard-coding
> > > > arch-specific constants. Thoughts?
> > >
> > > This seems to work. I can replace 3/24 and 4/24 with that in v2.
> > > Some random notes:
> > >
> > > - It doesn't seem to be possible to #include "vlinux.h" into tests,
> > >   so one has to go through the btf__load_vmlinux_btf() dance and
> > >   allocate the fd arrays dynamically.
> >
> > yes, you can't include vmlinux.h into user-space code, of course. And
> > yes it's true about needing to use btf__load_vmlinux_btf().
> >
> > But I didn't get what you are saying about fd arrays, tbh. Can you
> > please elaborate?
>
> That's a really minor thing; fexit_fd and and link_fd in fexit_stress
> now need to be allocated dynamically.
>
> > > - One has to give this enum an otherwise unnecessary name, so that
> > >   it's easy to find. This doesn't seem like a big deal though:
> > >
> > > enum bpf_max_tramp_links {
> >
> > not really, you can keep it anonymous enum. We do that in
> > include/uapi/linux/bpf.h for a lot of constants
>
> How would you find it then? My current code is:
>
> int get_bpf_max_tramp_links_from(struct btf *btf)
> {
>         const struct btf_enum *e;
>         const struct btf_type *t;
>         const char *name;
>         int id;
>
>         id = btf__find_by_name_kind(btf, "bpf_max_tramp_links",
> BTF_KIND_ENUM);
>         if (!ASSERT_GT(id, 0, "bpf_max_tramp_links id"))
>                 return -1;
>         t = btf__type_by_id(btf, id);
>         if (!ASSERT_OK_PTR(t, "bpf_max_tramp_links type"))
>                 return -1;
>         if (!ASSERT_EQ(btf_vlen(t), 1, "bpf_max_tramp_links vlen"))
>                 return -1;
>         e = btf_enum(t);
>         if (!ASSERT_OK_PTR(e, "bpf_max_tramp_links[0]"))
>                 return -1;
>         name = btf__name_by_offset(btf, e->name_off);
>         if (!ASSERT_OK_PTR(name, "bpf_max_tramp_links[0].name_off") &&
>             !ASSERT_STREQ(name, "BPF_MAX_TRAMP_LINKS",
> "BPF_MAX_TRAMP_LINKS"))
>                 return -1;
>
>         return e->val;
> }
>
> Is there a way to bypass looking up the enum, and go straight for the
> named member?


don't use btf__find_by_name_kind, just iterate all types and look at
all anonymous enums and their values, roughly

for (i = 1; i < btf__type_cnt(btf); i++) {
    const struct btf_type *t = btf__type_by_id(btf, i);

    if (!btf_is_enum(t) || t->name_off)
        continue;
    for (j = 0; j < btf_vlen(t); j++) {
        if (strcmp(btf__str_by_offset(btf, btf_enum(t)[j].name_off),
                   "BPF_MAX_TRAMP_LINKS") != 0)
            continue;
        /* found it */
    }
}

but cleaner :)


>
> > > #if defined(__s390x__)
> > >         BPF_MAX_TRAMP_LINKS = 27,
> > > #else
> > >         BPF_MAX_TRAMP_LINKS = 38,
> > > #endif
> > > };
> > >
> > > - An alternative might be to expose this via /proc, since the users
> > >   might be interested in it too.
> >
> > I'd say let's not, there is no need, having it in BTF is more than
> > enough for testing purposes
>
> Fair enough.
> >

Patch

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index c72eb3fc1f98..ea8203bd4112 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -71,6 +71,10 @@  struct bpf_jit {
 #define REG_0		REG_W0			/* Register 0 */
 #define REG_1		REG_W1			/* Register 1 */
 #define REG_2		BPF_REG_1		/* Register 2 */
+#define REG_3		BPF_REG_2		/* Register 3 */
+#define REG_4		BPF_REG_3		/* Register 4 */
+#define REG_7		BPF_REG_6		/* Register 7 */
+#define REG_8		BPF_REG_7		/* Register 8 */
 #define REG_14		BPF_REG_0		/* Register 14 */
 
 /*
@@ -595,6 +599,43 @@  static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
 	}
 }
 
+/*
+ * Emit an expoline for a jump that follows
+ */
+static void emit_expoline(struct bpf_jit *jit)
+{
+	/* exrl %r0,.+10 */
+	EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
+	/* j . */
+	EMIT4_PCREL(0xa7f40000, 0);
+}
+
+/*
+ * Emit __s390_indirect_jump_r1 thunk if necessary
+ */
+static void emit_r1_thunk(struct bpf_jit *jit)
+{
+	if (nospec_uses_trampoline()) {
+		jit->r1_thunk_ip = jit->prg;
+		emit_expoline(jit);
+		/* br %r1 */
+		_EMIT2(0x07f1);
+	}
+}
+
+/*
+ * Call r1 either directly or via __s390_indirect_jump_r1 thunk
+ */
+static void call_r1(struct bpf_jit *jit)
+{
+	if (nospec_uses_trampoline())
+		/* brasl %r14,__s390_indirect_jump_r1 */
+		EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip);
+	else
+		/* basr %r14,%r1 */
+		EMIT2(0x0d00, REG_14, REG_1);
+}
+
 /*
  * Function epilogue
  */
@@ -608,25 +649,13 @@  static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
 	if (nospec_uses_trampoline()) {
 		jit->r14_thunk_ip = jit->prg;
 		/* Generate __s390_indirect_jump_r14 thunk */
-		/* exrl %r0,.+10 */
-		EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
-		/* j . */
-		EMIT4_PCREL(0xa7f40000, 0);
+		emit_expoline(jit);
 	}
 	/* br %r14 */
 	_EMIT2(0x07fe);
 
-	if ((nospec_uses_trampoline()) &&
-	    (is_first_pass(jit) || (jit->seen & SEEN_FUNC))) {
-		jit->r1_thunk_ip = jit->prg;
-		/* Generate __s390_indirect_jump_r1 thunk */
-		/* exrl %r0,.+10 */
-		EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
-		/* j . */
-		EMIT4_PCREL(0xa7f40000, 0);
-		/* br %r1 */
-		_EMIT2(0x07f1);
-	}
+	if (is_first_pass(jit) || (jit->seen & SEEN_FUNC))
+		emit_r1_thunk(jit);
 
 	jit->prg = ALIGN(jit->prg, 8);
 	jit->prologue_plt = jit->prg;
@@ -707,6 +736,34 @@  static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp,
 	return 0;
 }
 
+/*
+ * Sign-extend the register if necessary
+ */
+static int sign_extend(struct bpf_jit *jit, int r, u8 size, u8 flags)
+{
+	if (!(flags & BTF_FMODEL_SIGNED_ARG))
+		return 0;
+
+	switch (size) {
+	case 1:
+		/* lgbr %r,%r */
+		EMIT4(0xb9060000, r, r);
+		return 0;
+	case 2:
+		/* lghr %r,%r */
+		EMIT4(0xb9070000, r, r);
+		return 0;
+	case 4:
+		/* lgfr %r,%r */
+		EMIT4(0xb9140000, r, r);
+		return 0;
+	case 8:
+		return 0;
+	default:
+		return -1;
+	}
+}
+
 /*
  * Compile one eBPF instruction into s390x code
  *
@@ -1355,13 +1412,8 @@  static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
 		jit->seen |= SEEN_FUNC;
 		/* lgrl %w1,func */
 		EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func));
-		if (nospec_uses_trampoline()) {
-			/* brasl %r14,__s390_indirect_jump_r1 */
-			EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip);
-		} else {
-			/* basr %r14,%w1 */
-			EMIT2(0x0d00, REG_14, REG_W1);
-		}
+		/* %r1() */
+		call_r1(jit);
 		/* lgr %b0,%r2: load return value into %b0 */
 		EMIT4(0xb9040000, BPF_REG_0, REG_2);
 		break;
@@ -1964,3 +2016,442 @@  int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 
 	return 0;
 }
+
+struct bpf_tramp_jit {
+	struct bpf_jit common;
+	int orig_stack_args_off;/* Offset of arguments placed on stack by the
+				 * func_addr's original caller
+				 */
+	int stack_size;		/* Trampoline stack size */
+	int stack_args_off;	/* Offset of stack arguments for calling
+				 * func_addr, has to be at the top
+				 */
+	int reg_args_off;	/* Offset of register arguments for calling
+				 * func_addr
+				 */
+	int ip_off;		/* For bpf_get_func_ip(), has to be at
+				 * (ctx - 16)
+				 */
+	int arg_cnt_off;	/* For bpf_get_func_arg_cnt(), has to be at
+				 * (ctx - 8)
+				 */
+	int bpf_args_off;	/* Offset of BPF_PROG context, which consists
+				 * of BPF arguments followed by return value
+				 */
+	int retval_off;		/* Offset of return value (see above) */
+	int r7_r8_off;		/* Offset of saved %r7 and %r8, which are used
+				 * for __bpf_prog_enter() return value and
+				 * func_addr respectively
+				 */
+	int r14_off;		/* Offset of saved %r14 */
+	int run_ctx_off;	/* Offset of struct bpf_tramp_run_ctx */
+	int do_fexit;		/* do_fexit: label */
+};
+
+static void load_imm64(struct bpf_jit *jit, int dst_reg, u64 val)
+{
+	/* llihf %dst_reg,val_hi */
+	EMIT6_IMM(0xc00e0000, dst_reg, (val >> 32));
+	/* oilf %dst_reg,val_lo */
+	EMIT6_IMM(0xc00d0000, dst_reg, val);
+}
+
+static void invoke_bpf_prog(struct bpf_tramp_jit *tjit,
+			    const struct btf_func_model *m,
+			    struct bpf_tramp_link *tlink, bool save_ret)
+{
+	struct bpf_jit *jit = &tjit->common;
+	int cookie_off = tjit->run_ctx_off +
+			 offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
+	struct bpf_prog *p = tlink->link.prog;
+	int patch;
+
+	/*
+	 * run_ctx.cookie = tlink->cookie;
+	 */
+
+	/* %r0 = tlink->cookie */
+	load_imm64(jit, REG_W0, tlink->cookie);
+	/* stg %r0,cookie_off(%r15) */
+	EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W0, REG_0, REG_15, cookie_off);
+
+	/*
+	 * if ((start = __bpf_prog_enter(p, &run_ctx)) == 0)
+	 *         goto skip;
+	 */
+
+	/* %r1 = __bpf_prog_enter */
+	load_imm64(jit, REG_1, (u64)bpf_trampoline_enter(p));
+	/* %r2 = p */
+	load_imm64(jit, REG_2, (u64)p);
+	/* la %r3,run_ctx_off(%r15) */
+	EMIT4_DISP(0x41000000, REG_3, REG_15, tjit->run_ctx_off);
+	/* %r1() */
+	call_r1(jit);
+	/* ltgr %r7,%r2 */
+	EMIT4(0xb9020000, REG_7, REG_2);
+	/* brcl 8,skip */
+	patch = jit->prg;
+	EMIT6_PCREL_RILC(0xc0040000, 8, 0);
+
+	/*
+	 * retval = bpf_func(args, p->insnsi);
+	 */
+
+	/* %r1 = p->bpf_func */
+	load_imm64(jit, REG_1, (u64)p->bpf_func);
+	/* la %r2,bpf_args_off(%r15) */
+	EMIT4_DISP(0x41000000, REG_2, REG_15, tjit->bpf_args_off);
+	/* %r3 = p->insnsi */
+	if (!p->jited)
+		load_imm64(jit, REG_3, (u64)p->insnsi);
+	/* %r1() */
+	call_r1(jit);
+	/* stg %r2,retval_off(%r15) */
+	if (save_ret) {
+		sign_extend(jit, REG_2, m->ret_size, m->ret_flags);
+		EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15,
+			      tjit->retval_off);
+	}
+
+	/* skip: */
+	if (jit->prg_buf)
+		*(u32 *)&jit->prg_buf[patch + 2] = (jit->prg - patch) >> 1;
+
+	/*
+	 * __bpf_prog_exit(p, start, &run_ctx);
+	 */
+
+	/* %r1 = __bpf_prog_exit */
+	load_imm64(jit, REG_1, (u64)bpf_trampoline_exit(p));
+	/* %r2 = p */
+	load_imm64(jit, REG_2, (u64)p);
+	/* lgr %r3,%r7 */
+	EMIT4(0xb9040000, REG_3, REG_7);
+	/* la %r4,run_ctx_off(%r15) */
+	EMIT4_DISP(0x41000000, REG_4, REG_15, tjit->run_ctx_off);
+	/* %r1() */
+	call_r1(jit);
+}
+
+static int alloc_stack(struct bpf_tramp_jit *tjit, size_t size)
+{
+	int stack_offset = tjit->stack_size;
+
+	tjit->stack_size += size;
+	return stack_offset;
+}
+
+/* ABI uses %r2 - %r6 for parameter passing. */
+#define MAX_NR_REG_ARGS 5
+
+/* The "L" field of the "mvc" instruction is 8 bits. */
+#define MAX_MVC_SIZE 256
+#define MAX_NR_STACK_ARGS (MAX_MVC_SIZE / sizeof(u64))
+
+/* -mfentry generates a 6-byte nop on s390x. */
+#define S390X_PATCH_SIZE 6
+
+int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
+				  struct bpf_tramp_jit *tjit,
+				  const struct btf_func_model *m,
+				  u32 flags, struct bpf_tramp_links *tlinks,
+				  void *func_addr)
+{
+	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
+	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
+	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
+	int nr_bpf_args, nr_reg_args, nr_stack_args;
+	struct bpf_jit *jit = &tjit->common;
+	int arg, bpf_arg_off;
+	int i, j;
+
+	/* Support as many stack arguments as "mvc" instruction can handle. */
+	nr_reg_args = min_t(int, m->nr_args, MAX_NR_REG_ARGS);
+	nr_stack_args = m->nr_args - nr_reg_args;
+	if (nr_stack_args > MAX_NR_STACK_ARGS)
+		return -ENOTSUPP;
+
+	/* Return to %r14, since func_addr and %r0 are not available. */
+	if (!func_addr && !(flags & BPF_TRAMP_F_ORIG_STACK))
+		flags |= BPF_TRAMP_F_SKIP_FRAME;
+
+	/*
+	 * Compute how many arguments we need to pass to BPF programs.
+	 * BPF ABI mirrors that of x86_64: arguments that are 16 bytes or
+	 * smaller are packed into 1 or 2 registers; larger arguments are
+	 * passed via pointers.
+	 * In s390x ABI, arguments that are 8 bytes or smaller are packed into
+	 * a register; larger arguments are passed via pointers.
+	 * We need to deal with this difference.
+	 */
+	nr_bpf_args = 0;
+	for (i = 0; i < m->nr_args; i++) {
+		if (m->arg_size[i] <= 8)
+			nr_bpf_args += 1;
+		else if (m->arg_size[i] <= 16)
+			nr_bpf_args += 2;
+		else
+			return -ENOTSUPP;
+	}
+
+	/*
+	 * Calculate the stack layout.
+	 */
+
+	/* Reserve STACK_FRAME_OVERHEAD bytes for the callees. */
+	tjit->stack_size = STACK_FRAME_OVERHEAD;
+	tjit->stack_args_off = alloc_stack(tjit, nr_stack_args * sizeof(u64));
+	tjit->reg_args_off = alloc_stack(tjit, nr_reg_args * sizeof(u64));
+	tjit->ip_off = alloc_stack(tjit, sizeof(u64));
+	tjit->arg_cnt_off = alloc_stack(tjit, sizeof(u64));
+	tjit->bpf_args_off = alloc_stack(tjit, nr_bpf_args * sizeof(u64));
+	tjit->retval_off = alloc_stack(tjit, sizeof(u64));
+	tjit->r7_r8_off = alloc_stack(tjit, 2 * sizeof(u64));
+	tjit->r14_off = alloc_stack(tjit, sizeof(u64));
+	tjit->run_ctx_off = alloc_stack(tjit,
+					sizeof(struct bpf_tramp_run_ctx));
+	/* The caller has already reserved STACK_FRAME_OVERHEAD bytes. */
+	tjit->stack_size -= STACK_FRAME_OVERHEAD;
+	tjit->orig_stack_args_off = tjit->stack_size + STACK_FRAME_OVERHEAD;
+
+	/* aghi %r15,-stack_size */
+	EMIT4_IMM(0xa70b0000, REG_15, -tjit->stack_size);
+	/* stmg %r2,%rN,fwd_reg_args_off(%r15) */
+	if (nr_reg_args)
+		EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2,
+			      REG_2 + (nr_reg_args - 1), REG_15,
+			      tjit->reg_args_off);
+	for (i = 0, j = 0; i < m->nr_args; i++) {
+		if (i < MAX_NR_REG_ARGS)
+			arg = REG_2 + i;
+		else
+			arg = tjit->orig_stack_args_off +
+			      (i - MAX_NR_REG_ARGS) * sizeof(u64);
+		bpf_arg_off = tjit->bpf_args_off + j * sizeof(u64);
+		if (m->arg_size[i] <= 8) {
+			if (i < MAX_NR_REG_ARGS)
+				/* stg %arg,bpf_arg_off(%r15) */
+				EMIT6_DISP_LH(0xe3000000, 0x0024, arg,
+					      REG_0, REG_15, bpf_arg_off);
+			else
+				/* mvc bpf_arg_off(8,%r15),arg(%r15) */
+				_EMIT6(0xd207f000 | bpf_arg_off,
+				       0xf000 | arg);
+			j += 1;
+		} else {
+			if (i < MAX_NR_REG_ARGS) {
+				/* mvc bpf_arg_off(16,%r15),0(%arg) */
+				_EMIT6(0xd20ff000 | bpf_arg_off,
+				       reg2hex[arg] << 12);
+			} else {
+				/* lg %r1,arg(%r15) */
+				EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_0,
+					      REG_15, arg);
+				/* mvc bpf_arg_off(16,%r15),0(%r1) */
+				_EMIT6(0xd20ff000 | bpf_arg_off, 0x1000);
+			}
+			j += 2;
+		}
+	}
+	/* stmg %r7,%r8,r7_r8_off(%r15) */
+	EMIT6_DISP_LH(0xeb000000, 0x0024, REG_7, REG_8, REG_15,
+		      tjit->r7_r8_off);
+	/* stg %r14,r14_off(%r15) */
+	EMIT6_DISP_LH(0xe3000000, 0x0024, REG_14, REG_0, REG_15, tjit->r14_off);
+
+	if (flags & BPF_TRAMP_F_ORIG_STACK) {
+		/*
+		 * The ftrace trampoline puts the return address (which is the
+		 * address of the original function + S390X_PATCH_SIZE) into
+		 * %r0; see ftrace_shared_hotpatch_trampoline_br and
+		 * ftrace_init_nop() for details.
+		 */
+
+		/* lgr %r8,%r0 */
+		EMIT4(0xb9040000, REG_8, REG_0);
+	} else {
+		/* %r8 = func_addr + S390X_PATCH_SIZE */
+		load_imm64(jit, REG_8, (u64)func_addr + S390X_PATCH_SIZE);
+	}
+
+	/*
+	 * ip = func_addr;
+	 * arg_cnt = m->nr_args;
+	 */
+
+	if (flags & BPF_TRAMP_F_IP_ARG) {
+		/* %r0 = func_addr */
+		load_imm64(jit, REG_0, (u64)func_addr);
+		/* stg %r0,ip_off(%r15) */
+		EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15,
+			      tjit->ip_off);
+	}
+	/* lghi %r0,nr_bpf_args */
+	EMIT4_IMM(0xa7090000, REG_0, nr_bpf_args);
+	/* stg %r0,arg_cnt_off(%r15) */
+	EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15,
+		      tjit->arg_cnt_off);
+
+	if (flags & BPF_TRAMP_F_CALL_ORIG) {
+		/*
+		 * __bpf_tramp_enter(im);
+		 */
+
+		/* %r1 = __bpf_tramp_enter */
+		load_imm64(jit, REG_1, (u64)__bpf_tramp_enter);
+		/* %r2 = im */
+		load_imm64(jit, REG_2, (u64)im);
+		/* %r1() */
+		call_r1(jit);
+	}
+
+	for (i = 0; i < fentry->nr_links; i++)
+		invoke_bpf_prog(tjit, m, fentry->links[i],
+				flags & BPF_TRAMP_F_RET_FENTRY_RET);
+
+	if (fmod_ret->nr_links) {
+		/*
+		 * retval = 0;
+		 */
+
+		/* xc retval_off(8,%r15),retval_off(%r15) */
+		_EMIT6(0xd707f000 | tjit->retval_off,
+		       0xf000 | tjit->retval_off);
+
+		for (i = 0; i < fmod_ret->nr_links; i++) {
+			invoke_bpf_prog(tjit, m, fmod_ret->links[i], true);
+
+			/*
+			 * if (retval)
+			 *         goto do_fexit;
+			 */
+
+			/* ltg %r0,retval_off(%r15) */
+			EMIT6_DISP_LH(0xe3000000, 0x0002, REG_0, REG_0, REG_15,
+				      tjit->retval_off);
+			/* brcl 7,do_fexit */
+			EMIT6_PCREL_RILC(0xc0040000, 7, tjit->do_fexit);
+		}
+	}
+
+	if (flags & BPF_TRAMP_F_CALL_ORIG) {
+		/*
+		 * retval = func_addr(args);
+		 */
+
+		/* lmg %r2,%rN,reg_args_off(%r15) */
+		if (nr_reg_args)
+			EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2,
+				      REG_2 + (nr_reg_args - 1), REG_15,
+				      tjit->reg_args_off);
+		/* mvc stack_args_off(N,%r15),orig_stack_args_off(%r15) */
+		if (nr_stack_args)
+			_EMIT6(0xd200f000 |
+				       (nr_stack_args * sizeof(u64) - 1) << 16 |
+				       tjit->stack_args_off,
+			       0xf000 | tjit->orig_stack_args_off);
+		/* lgr %r1,%r8 */
+		EMIT4(0xb9040000, REG_1, REG_8);
+		/* %r1() */
+		call_r1(jit);
+		/* stg %r2,retval_off(%r15) */
+		EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15,
+			      tjit->retval_off);
+
+		im->ip_after_call = jit->prg_buf + jit->prg;
+
+		/*
+		 * The following nop will be patched by bpf_tramp_image_put().
+		 */
+
+		/* brcl 0,im->ip_epilogue */
+		EMIT6_PCREL_RILC(0xc0040000, 0, (u64)im->ip_epilogue);
+	}
+
+	/* do_fexit: */
+	tjit->do_fexit = jit->prg;
+	for (i = 0; i < fexit->nr_links; i++)
+		invoke_bpf_prog(tjit, m, fexit->links[i], false);
+
+	if (flags & BPF_TRAMP_F_CALL_ORIG) {
+		im->ip_epilogue = jit->prg_buf + jit->prg;
+
+		/*
+		 * __bpf_tramp_exit(im);
+		 */
+
+		/* %r1 = __bpf_tramp_exit */
+		load_imm64(jit, REG_1, (u64)__bpf_tramp_exit);
+		/* %r2 = im */
+		load_imm64(jit, REG_2, (u64)im);
+		/* %r1() */
+		call_r1(jit);
+	}
+
+	/* lmg %r2,%rN,reg_args_off(%r15) */
+	if ((flags & BPF_TRAMP_F_RESTORE_REGS) && nr_reg_args)
+		EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2,
+			      REG_2 + (nr_reg_args - 1), REG_15,
+			      tjit->reg_args_off);
+	/* lgr %r1,%r8 */
+	if (!(flags & BPF_TRAMP_F_SKIP_FRAME))
+		EMIT4(0xb9040000, REG_1, REG_8);
+	/* lmg %r7,%r8,r7_r8_off(%r15) */
+	EMIT6_DISP_LH(0xeb000000, 0x0004, REG_7, REG_8, REG_15,
+		      tjit->r7_r8_off);
+	/* lg %r14,r14_off(%r15) */
+	EMIT6_DISP_LH(0xe3000000, 0x0004, REG_14, REG_0, REG_15, tjit->r14_off);
+	/* lg %r2,retval_off(%r15) */
+	if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET))
+		EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15,
+			      tjit->retval_off);
+	/* aghi %r15,stack_size */
+	EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size);
+	/* Emit an expoline for the following indirect jump. */
+	if (nospec_uses_trampoline())
+		emit_expoline(jit);
+	if (flags & BPF_TRAMP_F_SKIP_FRAME)
+		/* br %r14 */
+		_EMIT2(0x07fe);
+	else
+		/* br %r1 */
+		_EMIT2(0x07f1);
+
+	emit_r1_thunk(jit);
+
+	return 0;
+}
+
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
+				void *image_end, const struct btf_func_model *m,
+				u32 flags, struct bpf_tramp_links *tlinks,
+				void *func_addr)
+{
+	struct bpf_tramp_jit tjit;
+	int ret;
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		if (i == 0) {
+			/* Compute offsets, check whether the code fits. */
+			memset(&tjit, 0, sizeof(tjit));
+		} else {
+			/* Generate the code. */
+			tjit.common.prg = 0;
+			tjit.common.prg_buf = image;
+		}
+		ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags,
+						    tlinks, func_addr);
+		if (ret < 0)
+			return ret;
+		if (tjit.common.prg > (char *)image_end - (char *)image)
+			/*
+			 * Use the same error code as for exceeding
+			 * BPF_MAX_TRAMP_LINKS.
+			 */
+			return -E2BIG;
+	}
+
+	return ret;
+}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cf89504c8dda..52ff43bbf996 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -943,7 +943,11 @@  struct btf_func_model {
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.
  */
+#if defined(__s390x__)
+#define BPF_MAX_TRAMP_LINKS 27
+#else
 #define BPF_MAX_TRAMP_LINKS 38
+#endif
 
 struct bpf_tramp_links {
 	struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS];