diff mbox series

[RFCv2,08/18] uprobes/x86: Add uprobe syscall to speed up uprobe

Message ID 20250224140151.667679-9-jolsa@kernel.org (mailing list archive)
State New
Headers show
Series uprobes: Add support to optimize usdt probes on x86_64 | expand

Commit Message

Jiri Olsa Feb. 24, 2025, 2:01 p.m. UTC
Add a new uprobe syscall that calls uprobe handlers for a given
'breakpoint' address.

The idea is that the 'breakpoint' address calls the user space
trampoline which executes the uprobe syscall.

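The syscall handler reads the return address of the initial call
from the user stack (it sits just above the three registers the
trampoline pushes) and subtracts the 5-byte length of the call
instruction to retrieve the original 'breakpoint' address. With this
address we find the related uprobe object and call its consumers.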

Add the arch_uprobe_trampoline_mapping function, which provides the
uprobe trampoline mapping. This mapping is backed by one global
page initialized at __init time and shared by all the mapping
instances.

We do not allow the uprobe syscall to be executed if the caller is
not in a uprobe trampoline mapping.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 arch/x86/kernel/uprobes.c              | 87 ++++++++++++++++++++++++++
 include/linux/syscalls.h               |  2 +
 include/linux/uprobes.h                |  1 +
 kernel/events/uprobes.c                | 22 +++++++
 kernel/sys_ni.c                        |  1 +
 6 files changed, 114 insertions(+)

Comments

Alexei Starovoitov Feb. 24, 2025, 7:22 p.m. UTC | #1
On Mon, Feb 24, 2025 at 6:08 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> +SYSCALL_DEFINE0(uprobe)
> +{
> +       struct pt_regs *regs = task_pt_regs(current);
> +       unsigned long bp_vaddr;
> +       int err;
> +
> +       err = copy_from_user(&bp_vaddr, (void __user *)regs->sp + 3*8, sizeof(bp_vaddr));
> +       if (err) {
> +               force_sig(SIGILL);
> +               return -1;
> +       }
> +
> +       /* Allow execution only from uprobe trampolines. */
> +       if (!in_uprobe_trampoline(regs->ip)) {
> +               force_sig(SIGILL);
> +               return -1;
> +       }
> +
> +       handle_syscall_uprobe(regs, bp_vaddr - 5);
> +       return 0;
> +}
> +
> +asm (
> +       ".pushsection .rodata\n"
> +       ".balign " __stringify(PAGE_SIZE) "\n"
> +       "uprobe_trampoline_entry:\n"
> +       "endbr64\n"

why endbr is there?
The trampoline is called with a direct call.

> +       "push %rcx\n"
> +       "push %r11\n"
> +       "push %rax\n"
> +       "movq $" __stringify(__NR_uprobe) ", %rax\n"

To avoid introducing a new syscall for a very similar operation
can we disambiguate uprobe vs uretprobe via %rdi or
some other way?
imo not too late to change uretprobe api.
Maybe it was discussed already.

> +       "syscall\n"
> +       "pop %rax\n"
> +       "pop %r11\n"
> +       "pop %rcx\n"
> +       "ret\n"

In later patches I see nop5 is replaced with a call to
uprobe_trampoline_entry, but which part saves
rdi and other regs?
Compiler doesn't automatically spill/fill around USDT's nop/nop5.
Selftest is doing:
+__naked noinline void uprobe_test(void)
so just lucky ?
Jiri Olsa Feb. 25, 2025, 1:35 p.m. UTC | #2
On Mon, Feb 24, 2025 at 11:22:42AM -0800, Alexei Starovoitov wrote:
> On Mon, Feb 24, 2025 at 6:08 AM Jiri Olsa <jolsa@kernel.org> wrote:
> >
> > +SYSCALL_DEFINE0(uprobe)
> > +{
> > +       struct pt_regs *regs = task_pt_regs(current);
> > +       unsigned long bp_vaddr;
> > +       int err;
> > +
> > +       err = copy_from_user(&bp_vaddr, (void __user *)regs->sp + 3*8, sizeof(bp_vaddr));
> > +       if (err) {
> > +               force_sig(SIGILL);
> > +               return -1;
> > +       }
> > +
> > +       /* Allow execution only from uprobe trampolines. */
> > +       if (!in_uprobe_trampoline(regs->ip)) {
> > +               force_sig(SIGILL);
> > +               return -1;
> > +       }
> > +
> > +       handle_syscall_uprobe(regs, bp_vaddr - 5);
> > +       return 0;
> > +}
> > +
> > +asm (
> > +       ".pushsection .rodata\n"
> > +       ".balign " __stringify(PAGE_SIZE) "\n"
> > +       "uprobe_trampoline_entry:\n"
> > +       "endbr64\n"
> 
> why endbr is there?
> The trampoline is called with a direct call.

ok, that's wrong, will remove that

> 
> > +       "push %rcx\n"
> > +       "push %r11\n"
> > +       "push %rax\n"
> > +       "movq $" __stringify(__NR_uprobe) ", %rax\n"
> 
> To avoid introducing a new syscall for a very similar operation
> can we disambiguate uprobe vs uretprobe via %rdi or
> some other way?
> imo not too late to change uretprobe api.
> Maybe it was discussed already.

yes, I recall discussing that early during uretprobe work with the decision to
have separate syscalls for each uprobe and uretprobe.. however wrt recent seccomp
changes, it might be easier just to add argument to uretprobe syscall to handle
uprobe

too bad it's not the other way around.. uprobe syscall with argument to do uretprobe
would sound better

> 
> > +       "syscall\n"
> > +       "pop %rax\n"
> > +       "pop %r11\n"
> > +       "pop %rcx\n"
> > +       "ret\n"
> 
> In later patches I see nop5 is replaced with a call to
> uprobe_trampoline_entry, but which part saves
> rdi and other regs?
> Compiler doesn't automatically spill/fill around USDT's nop/nop5.
> Selftest is doing:
> +__naked noinline void uprobe_test(void)
> so just lucky ?

if you mean registers that would carry usdt arguments, ebpf programs
access those based on assembler operand string stored in usdt record:

  stapsdt              0x00000048       NT_STAPSDT (SystemTap probe descriptors)
    Provider: test
    Name: usdt3
    Location: 0x0000000000712f2f, Base: 0x0000000002f516b0, Semaphore: 0x0000000003348ec2
->  Arguments: -4@-1192(%rbp) -8@-1200(%rbp) 8@%rax

it's up to the bpf program to know which register (+offset) to access; libbpf has all
this logic hidden behind usdt_manager_attach_usdt and the bpf_usdt_arg bpf call

the trampoline only saves rcx/r11/rax, because they are changed by syscall instruction

but actually I forgot to load these saved values (of rcx/r11/rax) and rsp into regs that
are passed to ebpf program, (like we do in uretprobe syscall) will fix that in next version

I'll add tests for optimized usdt with more arguments

thanks,
jirka
Andrii Nakryiko Feb. 25, 2025, 5:10 p.m. UTC | #3
On Tue, Feb 25, 2025 at 5:35 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Mon, Feb 24, 2025 at 11:22:42AM -0800, Alexei Starovoitov wrote:
> > On Mon, Feb 24, 2025 at 6:08 AM Jiri Olsa <jolsa@kernel.org> wrote:
> > >
> > > +SYSCALL_DEFINE0(uprobe)
> > > +{
> > > +       struct pt_regs *regs = task_pt_regs(current);
> > > +       unsigned long bp_vaddr;
> > > +       int err;
> > > +
> > > +       err = copy_from_user(&bp_vaddr, (void __user *)regs->sp + 3*8, sizeof(bp_vaddr));
> > > +       if (err) {
> > > +               force_sig(SIGILL);
> > > +               return -1;
> > > +       }
> > > +
> > > +       /* Allow execution only from uprobe trampolines. */
> > > +       if (!in_uprobe_trampoline(regs->ip)) {
> > > +               force_sig(SIGILL);
> > > +               return -1;
> > > +       }
> > > +
> > > +       handle_syscall_uprobe(regs, bp_vaddr - 5);
> > > +       return 0;
> > > +}
> > > +
> > > +asm (
> > > +       ".pushsection .rodata\n"
> > > +       ".balign " __stringify(PAGE_SIZE) "\n"
> > > +       "uprobe_trampoline_entry:\n"
> > > +       "endbr64\n"
> >
> > why endbr is there?
> > The trampoline is called with a direct call.
>
> ok, that's wrong, will remove that
>
> >
> > > +       "push %rcx\n"
> > > +       "push %r11\n"
> > > +       "push %rax\n"
> > > +       "movq $" __stringify(__NR_uprobe) ", %rax\n"
> >
> > To avoid introducing a new syscall for a very similar operation
> > can we disambiguate uprobe vs uretprobe via %rdi or
> > some other way?
> > imo not too late to change uretprobe api.
> > Maybe it was discussed already.
>
> yes, I recall discussing that early during uretprobe work with the decision to
> have separate syscalls for each uprobe and uretprobe.. however wrt recent seccomp
> changes, it might be easier just to add argument to uretprobe syscall to handle
> uprobe
>
> too bad it's not the other way around.. uprobe syscall with argument to do uretprobe
> would sound better

It's an "internal" syscall, why can't we rename it, if we want to?

Though I'm not sure I see the problem having both sys_uprobe and
sys_uretprobe, tbh. We just add sys_uprobe to the same list(s) that
sys_uretprobe is in for seccomp.

>
> >
> > > +       "syscall\n"
> > > +       "pop %rax\n"
> > > +       "pop %r11\n"
> > > +       "pop %rcx\n"
> > > +       "ret\n"
> >
> > In later patches I see nop5 is replaced with a call to
> > uprobe_trampoline_entry, but which part saves
> > rdi and other regs?
> > Compiler doesn't automatically spill/fill around USDT's nop/nop5.
> > Selftest is doing:
> > +__naked noinline void uprobe_test(void)
> > so just lucky ?
>
> if you mean registers that would carry usdt arguments, ebpf programs
> access those based on assembler operand string stored in usdt record:
>
>   stapsdt              0x00000048       NT_STAPSDT (SystemTap probe descriptors)
>     Provider: test
>     Name: usdt3
>     Location: 0x0000000000712f2f, Base: 0x0000000002f516b0, Semaphore: 0x0000000003348ec2
> ->  Arguments: -4@-1192(%rbp) -8@-1200(%rbp) 8@%rax
>
> it's up to bpf program to know which register(+offset) to access, libbpf have all
> this logic hidden behind usdt_manager_attach_usdt and bpf_usdt_arg bpf call
>
> the trampoline only saves rcx/r11/rax, because they are changed by syscall instruction
>
> but actually I forgot to load these saved values (of rcx/r11/rax) and rsp into regs that
> are passed to ebpf program, (like we do in uretprobe syscall) will fix that in next version
>
> I'll add tests for optimized usdt with more arguments
>
> thanks,
> jirka
Alexei Starovoitov Feb. 25, 2025, 6:06 p.m. UTC | #4
On Tue, Feb 25, 2025 at 5:35 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> > In later patches I see nop5 is replaced with a call to
> > uprobe_trampoline_entry, but which part saves
> > rdi and other regs?
> > Compiler doesn't automatically spill/fill around USDT's nop/nop5.
> > Selftest is doing:
> > +__naked noinline void uprobe_test(void)
> > so just lucky ?
>
> if you mean registers that would carry usdt arguments, ebpf programs
> access those based on assembler operand string stored in usdt record:

No. I'm talking about all normal registers that trap-style uprobe
preserves, but this nop5->call will scratch.
Instead of void uprobe_test(void)
add some arguments to it, and read them before and after nop5 uprobe.
They must remain the same.
Alexei Starovoitov Feb. 26, 2025, 2:36 a.m. UTC | #5
On Tue, Feb 25, 2025 at 10:06 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Feb 25, 2025 at 5:35 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > > In later patches I see nop5 is replaced with a call to
> > > uprobe_trampoline_entry, but which part saves
> > > rdi and other regs?
> > > Compiler doesn't automatically spill/fill around USDT's nop/nop5.
> > > Selftest is doing:
> > > +__naked noinline void uprobe_test(void)
> > > so just lucky ?
> >
> > if you mean registers that would carry usdt arguments, ebpf programs
> > access those based on assembler operand string stored in usdt record:
>
> No. I'm talking about all normal registers that trap-style uprobe
> preserves, but this nop5->call will scratch.
> Instead of void uprobe_test(void)
> add some arguments to it, and read them before and after nop5 uprobe.
> They must remain the same.

Ignore me. It's a syscall insn. All fine.
diff mbox series

Patch

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5eb708bff1c7..88e388c7675b 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@ 
 333	common	io_pgetevents		sys_io_pgetevents
 334	common	rseq			sys_rseq
 335	common	uretprobe		sys_uretprobe
+336	common	uprobe			sys_uprobe
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424	common	pidfd_send_signal	sys_pidfd_send_signal
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index b06f3cd7551a..3ea682dbeb39 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -425,6 +425,93 @@  SYSCALL_DEFINE0(uretprobe)
 	return -1;
 }
 
+static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+	return -EPERM;
+}
+
+static struct page *tramp_mapping_pages[2] __ro_after_init;
+
+static struct vm_special_mapping tramp_mapping = {
+	.name   = "[uprobes-trampoline]",
+	.mremap = tramp_mremap,
+	.pages  = tramp_mapping_pages,
+};
+
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	bool found, retry = true;
+	unsigned int seq;
+
+	rcu_read_lock();
+	if (mmap_lock_speculate_try_begin(mm, &seq)) {
+		vma = vma_lookup(current->mm, ip);
+		found = vma && vma_is_special_mapping(vma, &tramp_mapping);
+		retry = mmap_lock_speculate_retry(mm, seq);
+	}
+	rcu_read_unlock();
+
+	if (retry) {
+		mmap_read_lock(mm);
+		vma = vma_lookup(current->mm, ip);
+		found = vma && vma_is_special_mapping(vma, &tramp_mapping);
+		mmap_read_unlock(mm);
+	}
+	return found;
+}
+
+SYSCALL_DEFINE0(uprobe)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	unsigned long bp_vaddr;
+	int err;
+
+	err = copy_from_user(&bp_vaddr, (void __user *)regs->sp + 3*8, sizeof(bp_vaddr));
+	if (err) {
+		force_sig(SIGILL);
+		return -1;
+	}
+
+	/* Allow execution only from uprobe trampolines. */
+	if (!in_uprobe_trampoline(regs->ip)) {
+		force_sig(SIGILL);
+		return -1;
+	}
+
+	handle_syscall_uprobe(regs, bp_vaddr - 5);
+	return 0;
+}
+
+asm (
+	".pushsection .rodata\n"
+	".balign " __stringify(PAGE_SIZE) "\n"
+	"uprobe_trampoline_entry:\n"
+	"endbr64\n"
+	"push %rcx\n"
+	"push %r11\n"
+	"push %rax\n"
+	"movq $" __stringify(__NR_uprobe) ", %rax\n"
+	"syscall\n"
+	"pop %rax\n"
+	"pop %r11\n"
+	"pop %rcx\n"
+	"ret\n"
+	".balign " __stringify(PAGE_SIZE) "\n"
+	".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+	tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+	return 0;
+}
+
+late_initcall(arch_uprobes_init);
+
 /*
  * If arch_uprobe->insn doesn't use rip-relative addressing, return
  * immediately.  Otherwise, rewrite the instruction so that it accesses
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c6333204d451..002f4e1debe5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -994,6 +994,8 @@  asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
 
 asmlinkage long sys_uretprobe(void);
 
+asmlinkage long sys_uprobe(void);
+
 /* pciconfig: alpha, arm, arm64, ia64, sparc */
 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
 				unsigned long off, unsigned long len,
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 6c3c90a0d110..de3631ae1746 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -231,6 +231,7 @@  extern void uprobe_handle_trampoline(struct pt_regs *regs);
 extern void *arch_uretprobe_trampoline(unsigned long *psize);
 extern unsigned long uprobe_get_trampoline_vaddr(void);
 extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
+extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cfcde7295e15..6ac691fe5682 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2733,6 +2733,28 @@  static void handle_swbp(struct pt_regs *regs)
 	rcu_read_unlock_trace();
 }
 
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+	struct uprobe *uprobe;
+	int is_swbp;
+
+	rcu_read_lock_trace();
+	uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+	if (!uprobe)
+		goto unlock;
+
+	if (!get_utask())
+		goto unlock;
+
+	if (arch_uprobe_ignore(&uprobe->arch, regs))
+		goto unlock;
+
+	handler_chain(uprobe, regs, false);
+
+ unlock:
+	rcu_read_unlock_trace();
+}
+
 /*
  * Perform required fix-ups and disable singlestep.
  * Allow pending signals to take effect.
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a86931f8c..bf5d05c635ff 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,4 @@  COND_SYSCALL(setuid16);
 COND_SYSCALL(rseq);
 
 COND_SYSCALL(uretprobe);
+COND_SYSCALL(uprobe);