diff mbox

[v4,0/9] uprobes: misc cleanups/simplifications

Message ID 20240801132638.GA8759@redhat.com (mailing list archive)
State Accepted
Delegated to: Masami Hiramatsu
Headers show

Commit Message

Oleg Nesterov Aug. 1, 2024, 1:26 p.m. UTC
(Andrii, I'll try to look at your new series on Weekend).

Changes:

	- added the acks I got from Andrii, Masami, and Jiri
	- new 4/9 patch from Jiri, fixes the unrelated bug in bpf_testmod
	- adapt tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
	  to the API changes in 5/9 an 6/9

see the interdiff below.

Oleg.
---

 include/linux/uprobes.h                            |  22 ++--
 kernel/events/uprobes.c                            | 137 +++++++++------------
 kernel/trace/bpf_trace.c                           |  25 ++--
 kernel/trace/trace_uprobe.c                        |  31 ++---
 .../selftests/bpf/bpf_testmod/bpf_testmod.c        |  26 ++--
 5 files changed, 103 insertions(+), 138 deletions(-)

-------------------------------------------------------------------------------

Comments

Peter Zijlstra Aug. 1, 2024, 1:36 p.m. UTC | #1
On Thu, Aug 01, 2024 at 03:26:38PM +0200, Oleg Nesterov wrote:
> (Andrii, I'll try to look at your new series on Weekend).

OK, I dropped all your previous patches and stuffed these in.

They should all be visible in queue/perf/core, and provided the robot
doesn't scream, I'll push them into tip/perf/core soonish.
Andrii Nakryiko Aug. 1, 2024, 6:58 p.m. UTC | #2
+ bpf

On Thu, Aug 1, 2024 at 6:36 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Aug 01, 2024 at 03:26:38PM +0200, Oleg Nesterov wrote:
> > (Andrii, I'll try to look at your new series on Weekend).
>
> OK, I dropped all your previous patches and stuffed these in.
>
> They should all be visible in queue/perf/core, and provided the robot
> doesn't scream, I'll push them into tip/perf/core soonish.

Just FYI, it seems like tip/perf/core is currently broken for uprobes
(and by implication also queue/perf/core). Also torvalds/linux/master
master is broken. See what I'm getting when running BPF selftests
dealing with uprobes. Sometimes I only get that WARNING and nothing
else.

I'm bisecting at the moment with bpf/master being a "good" checkpoint,
will let you know once I bisect.

[   34.343557] ------------[ cut here ]------------
[   34.343906] WARNING: CPU: 3 PID: 2364 at
kernel/trace/trace_uprobe.c:1109 __probe_event_disable+0x26/0x80
[   34.344468] Modules linked in:
[   34.344488] BUG: unable to handle page fault for address: ffffc90001c5bea0
[   34.345071] #PF: supervisor read access in kernel mode
[   34.345370] #PF: error_code(0x0000) - not-present page
[   34.345700] PGD 100000067 P4D 100000067 PUD ffff88810d86cd40
[   34.346061] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[   34.346377] CPU: 3 UID: 0 PID: 2364 Comm: test_progs Tainted: G
      OE      6.11.0-rc1-00006-g6763ebdb4983 #115
[   34.347052] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
[   34.347392] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04
[   34.348085] RIP: 0010:__wake_up_common+0x3e/0xc0
[   34.348359] Code: 89 d4 55 53 48 89 fb 48 83 ec 08 8b 05 c3 05 a8
02 89 74 24 04 85 c0 75 6d 48 8b 43 48 48 83 c3 48 4f
[   34.349440] RSP: 0018:ffffc900001d0d90 EFLAGS: 00010087
[   34.349796] RAX: ffffc90001c5bea0 RBX: ffff88810138e0e8 RCX: 0000000000000001
[   34.350282] RDX: 0000000080010005 RSI: ffffffff8295f82c RDI: ffffc90001c5be88
[   34.350768] RBP: ffff88810138e0a0 R08: 0000000000000000 R09: 000000000000438f
[   34.351245] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000001
[   34.351703] R13: 0000000000000046 R14: 0000000000000000 R15: 0000000000000000
[   34.352112] FS:  00007fd71c7b3d00(0000) GS:ffff88881ca00000(0000)
knlGS:0000000000000000
[   34.352574] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   34.352911] CR2: ffffc90001c5bea0 CR3: 000000010b456005 CR4: 0000000000370ef0
[   34.353320] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   34.353734] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   34.354144] Call Trace:
[   34.354290]  <IRQ>
[   34.354413]  ? __die+0x1f/0x60
[   34.354601]  ? page_fault_oops+0x14c/0x450
[   34.354848]  ? search_bpf_extables+0xa8/0x150
[   34.355105]  ? fixup_exception+0x22/0x2d0
[   34.355342]  ? exc_page_fault+0x207/0x210
[   34.355579]  ? asm_exc_page_fault+0x22/0x30
[   34.355829]  ? __wake_up_common+0x3e/0xc0
[   34.356065]  __wake_up+0x32/0x60
[   34.356261]  ep_poll_callback+0x13e/0x270
[   34.356502]  __wake_up_common+0x7d/0xc0
[   34.356731]  __wake_up+0x32/0x60
[   34.356961]  irq_work_single+0x67/0x90
[   34.357184]  irq_work_run_list+0x26/0x40
[   34.357442]  update_process_times+0x83/0xa0
[   34.357717]  tick_nohz_handler+0x97/0x140
[   34.357977]  ? __pfx_tick_nohz_handler+0x10/0x10
[   34.358283]  __hrtimer_run_queues+0x19a/0x3b0
[   34.358580]  hrtimer_interrupt+0xfe/0x240
[   34.358864]  __sysvec_apic_timer_interrupt+0x87/0x210
[   34.359189]  sysvec_apic_timer_interrupt+0x98/0xc0
[   34.359487]  </IRQ>
[   34.359614]  <TASK>
[   34.359745]  asm_sysvec_apic_timer_interrupt+0x16/0x20
[   34.360042] RIP: 0010:print_modules+0x27/0xd0
[   34.360301] Code: 90 90 90 0f 1f 44 00 00 53 48 c7 c7 63 22 98 82
48 83 ec 18 e8 da 8d fc ff bf 01 00 00 00 e8 80 d1 f1
[   34.361372] RSP: 0018:ffffc90001e5bc58 EFLAGS: 00000297
[   34.361682] RAX: ffffffffa03f0948 RBX: 0000000000000000 RCX: 0000000000000000
[   34.362091] RDX: 0000000000000002 RSI: 0000000000000027 RDI: 0000000000000001
[   34.362502] RBP: ffffc90001e5bd28 R08: 00000000fffeffff R09: 0000000000000001
[   34.362917] R10: 0000000000000000 R11: ffffffff83299920 R12: ffffffff81233536
[   34.363325] R13: 0000000000000009 R14: 0000000000000455 R15: ffffffff8298f924
[   34.363740]  ? __probe_event_disable+0x26/0x80
[   34.364009]  ? print_modules+0x20/0xd0
[   34.364230]  ? __probe_event_disable+0x26/0x80
[   34.364488]  __warn+0x6f/0x180
[   34.364683]  ? __probe_event_disable+0x26/0x80
[   34.364945]  report_bug+0x18d/0x1c0
[   34.365156]  handle_bug+0x3a/0x70
[   34.365354]  exc_invalid_op+0x13/0x60
[   34.365571]  asm_exc_invalid_op+0x16/0x20
[   34.365838] RIP: 0010:__probe_event_disable+0x26/0x80
[   34.366175] Code: 90 90 90 90 55 48 89 fd 53 48 8b 47 10 8b 90 38
01 00 00 85 d2 75 13 48 8b 88 40 01 00 00 48 8d 90 40
[   34.367340] RSP: 0018:ffffc90001e5bdd0 EFLAGS: 00010287
[   34.367651] RAX: ffff88810d86cc00 RBX: ffff88810d86cce0 RCX: ffff8881075c8168
[   34.368067] RDX: ffff88810d86cd40 RSI: 0000000000000003 RDI: ffff88810a5e1db0
[   34.368482] RBP: ffff88810a5e1db0 R08: 00000000ffffffff R09: 0000000000000000
[   34.368896] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8881075c8250
[   34.369310] R13: ffff8881075c82f0 R14: ffffc90001e5bb80 R15: ffff8881075c8000
[   34.369775]  trace_uprobe_register+0x1a8/0x300
[   34.370053]  perf_trace_event_unreg.isra.0+0x22/0x80
[   34.370342]  perf_uprobe_destroy+0x3a/0x70
[   34.370582]  _free_event+0x114/0x580
[   34.370806]  perf_event_release_kernel+0x282/0x2c0
[   34.371123]  perf_release+0x11/0x20
[   34.371354]  __fput+0x102/0x2e0
[   34.371561]  task_work_run+0x55/0xa0
[   34.371798]  syscall_exit_to_user_mode+0x1dd/0x1f0
[   34.372104]  do_syscall_64+0x70/0x140
[   34.372337]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   34.372659] RIP: 0033:0x7fd71c986a94
[   34.372901] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84
00 00 00 00 00 90 f3 0f 1e fa 80 3d d5 18 0e 00 00 73
[   34.373987] RSP: 002b:00007ffc09d83118 EFLAGS: 00000202 ORIG_RAX:
0000000000000003
[   34.374426] RAX: 0000000000000000 RBX: 00007ffc09d835e8 RCX: 00007fd71c986a94
[   34.374874] RDX: 000000000000000b RSI: 0000000000002401 RDI: 000000000000000c
[   34.375325] RBP: 00007ffc09d83150 R08: 00000000049d39c7 R09: 0000000000000007
[   34.375782] R10: 00000000049d71b0 R11: 0000000000000202 R12: 0000000000000000
[   34.376245] R13: 00007ffc09d83608 R14: 00007fd71cae3000 R15: 000000000103cd90
[   34.376661]  </TASK>
[   34.376794] Modules linked in: bpf_testmod(OE) aesni_intel(E)
crypto_simd(E) cryptd(E) kvm_intel(E) i2c_piix4(E) i2c_s)
[   34.377900] CR2: ffffc90001c5bea0
[   34.378097] ---[ end trace 0000000000000000 ]---
[   34.378366] RIP: 0010:__wake_up_common+0x3e/0xc0
[   34.378637] Code: 89 d4 55 53 48 89 fb 48 83 ec 08 8b 05 c3 05 a8
02 89 74 24 04 85 c0 75 6d 48 8b 43 48 48 83 c3 48 4f
[   34.379703] RSP: 0018:ffffc900001d0d90 EFLAGS: 00010087
[   34.380004] RAX: ffffc90001c5bea0 RBX: ffff88810138e0e8 RCX: 0000000000000001
[   34.380414] RDX: 0000000080010005 RSI: ffffffff8295f82c RDI: ffffc90001c5be88
[   34.380827] RBP: ffff88810138e0a0 R08: 0000000000000000 R09: 000000000000438f
[   34.381284] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000001
[   34.381736] R13: 0000000000000046 R14: 0000000000000000 R15: 0000000000000000
[   34.382198] FS:  00007fd71c7b3d00(0000) GS:ffff88881ca00000(0000)
knlGS:0000000000000000
[   34.382712] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   34.383082] CR2: ffffc90001c5bea0 CR3: 000000010b456005 CR4: 0000000000370ef0
[   34.383540] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   34.383991] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   34.384407] Kernel panic - not syncing: Fatal exception in interrupt
[   35.464708] Shutting down cpus with NMI
[   35.465244] Kernel Offset: disabled
[   35.465474] ---[ end Kernel panic - not syncing: Fatal exception in
interrupt ]---
Andrii Nakryiko Aug. 1, 2024, 9:13 p.m. UTC | #3
On Thu, Aug 1, 2024 at 11:58 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> + bpf
>
> On Thu, Aug 1, 2024 at 6:36 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Thu, Aug 01, 2024 at 03:26:38PM +0200, Oleg Nesterov wrote:
> > > (Andrii, I'll try to look at your new series on Weekend).
> >
> > OK, I dropped all your previous patches and stuffed these in.
> >
> > They should all be visible in queue/perf/core, and provided the robot
> > doesn't scream, I'll push them into tip/perf/core soonish.
>
> Just FYI, it seems like tip/perf/core is currently broken for uprobes
> (and by implication also queue/perf/core). Also torvalds/linux/master
> master is broken. See what I'm getting when running BPF selftests
> dealing with uprobes. Sometimes I only get that WARNING and nothing
> else.
>
> I'm bisecting at the moment with bpf/master being a "good" checkpoint,
> will let you know once I bisect.

Ok, this bisected to:

675ad74989c2 ("perf/core: Add aux_pause, aux_resume, aux_start_paused")

Reverting all (applied to tip/perf/core) four patches from that series:

6763ebdb4983 (tip/perf/core) perf/x86/intel: Do not enable large PEBS
for events with aux actions or aux sampling
6a45d8847597 perf/x86/intel/pt: Add support for pause / resume
675ad74989c2 perf/core: Add aux_pause, aux_resume, aux_start_paused
d92792a4b26e perf/x86/intel/pt: Fix sampling synchronization

... makes everything work again. I'll leave it up to you and Adrian to
figure this out.

>
> [   34.343557] ------------[ cut here ]------------
> [   34.343906] WARNING: CPU: 3 PID: 2364 at
> kernel/trace/trace_uprobe.c:1109 __probe_event_disable+0x26/0x80
> [   34.344468] Modules linked in:
> [   34.344488] BUG: unable to handle page fault for address: ffffc90001c5bea0
> [   34.345071] #PF: supervisor read access in kernel mode
> [   34.345370] #PF: error_code(0x0000) - not-present page
> [   34.345700] PGD 100000067 P4D 100000067 PUD ffff88810d86cd40
> [   34.346061] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
> [   34.346377] CPU: 3 UID: 0 PID: 2364 Comm: test_progs Tainted: G
>       OE      6.11.0-rc1-00006-g6763ebdb4983 #115
> [   34.347052] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
> [   34.347392] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04
> [   34.348085] RIP: 0010:__wake_up_common+0x3e/0xc0
> [   34.348359] Code: 89 d4 55 53 48 89 fb 48 83 ec 08 8b 05 c3 05 a8
> 02 89 74 24 04 85 c0 75 6d 48 8b 43 48 48 83 c3 48 4f
> [   34.349440] RSP: 0018:ffffc900001d0d90 EFLAGS: 00010087
> [   34.349796] RAX: ffffc90001c5bea0 RBX: ffff88810138e0e8 RCX: 0000000000000001
> [   34.350282] RDX: 0000000080010005 RSI: ffffffff8295f82c RDI: ffffc90001c5be88
> [   34.350768] RBP: ffff88810138e0a0 R08: 0000000000000000 R09: 000000000000438f
> [   34.351245] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000001
> [   34.351703] R13: 0000000000000046 R14: 0000000000000000 R15: 0000000000000000
> [   34.352112] FS:  00007fd71c7b3d00(0000) GS:ffff88881ca00000(0000)
> knlGS:0000000000000000
> [   34.352574] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   34.352911] CR2: ffffc90001c5bea0 CR3: 000000010b456005 CR4: 0000000000370ef0
> [   34.353320] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   34.353734] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [   34.354144] Call Trace:
> [   34.354290]  <IRQ>
> [   34.354413]  ? __die+0x1f/0x60
> [   34.354601]  ? page_fault_oops+0x14c/0x450
> [   34.354848]  ? search_bpf_extables+0xa8/0x150
> [   34.355105]  ? fixup_exception+0x22/0x2d0
> [   34.355342]  ? exc_page_fault+0x207/0x210
> [   34.355579]  ? asm_exc_page_fault+0x22/0x30
> [   34.355829]  ? __wake_up_common+0x3e/0xc0
> [   34.356065]  __wake_up+0x32/0x60
> [   34.356261]  ep_poll_callback+0x13e/0x270
> [   34.356502]  __wake_up_common+0x7d/0xc0
> [   34.356731]  __wake_up+0x32/0x60
> [   34.356961]  irq_work_single+0x67/0x90
> [   34.357184]  irq_work_run_list+0x26/0x40
> [   34.357442]  update_process_times+0x83/0xa0
> [   34.357717]  tick_nohz_handler+0x97/0x140
> [   34.357977]  ? __pfx_tick_nohz_handler+0x10/0x10
> [   34.358283]  __hrtimer_run_queues+0x19a/0x3b0
> [   34.358580]  hrtimer_interrupt+0xfe/0x240
> [   34.358864]  __sysvec_apic_timer_interrupt+0x87/0x210
> [   34.359189]  sysvec_apic_timer_interrupt+0x98/0xc0
> [   34.359487]  </IRQ>
> [   34.359614]  <TASK>
> [   34.359745]  asm_sysvec_apic_timer_interrupt+0x16/0x20
> [   34.360042] RIP: 0010:print_modules+0x27/0xd0
> [   34.360301] Code: 90 90 90 0f 1f 44 00 00 53 48 c7 c7 63 22 98 82
> 48 83 ec 18 e8 da 8d fc ff bf 01 00 00 00 e8 80 d1 f1
> [   34.361372] RSP: 0018:ffffc90001e5bc58 EFLAGS: 00000297
> [   34.361682] RAX: ffffffffa03f0948 RBX: 0000000000000000 RCX: 0000000000000000
> [   34.362091] RDX: 0000000000000002 RSI: 0000000000000027 RDI: 0000000000000001
> [   34.362502] RBP: ffffc90001e5bd28 R08: 00000000fffeffff R09: 0000000000000001
> [   34.362917] R10: 0000000000000000 R11: ffffffff83299920 R12: ffffffff81233536
> [   34.363325] R13: 0000000000000009 R14: 0000000000000455 R15: ffffffff8298f924
> [   34.363740]  ? __probe_event_disable+0x26/0x80
> [   34.364009]  ? print_modules+0x20/0xd0
> [   34.364230]  ? __probe_event_disable+0x26/0x80
> [   34.364488]  __warn+0x6f/0x180
> [   34.364683]  ? __probe_event_disable+0x26/0x80
> [   34.364945]  report_bug+0x18d/0x1c0
> [   34.365156]  handle_bug+0x3a/0x70
> [   34.365354]  exc_invalid_op+0x13/0x60
> [   34.365571]  asm_exc_invalid_op+0x16/0x20
> [   34.365838] RIP: 0010:__probe_event_disable+0x26/0x80
> [   34.366175] Code: 90 90 90 90 55 48 89 fd 53 48 8b 47 10 8b 90 38
> 01 00 00 85 d2 75 13 48 8b 88 40 01 00 00 48 8d 90 40
> [   34.367340] RSP: 0018:ffffc90001e5bdd0 EFLAGS: 00010287
> [   34.367651] RAX: ffff88810d86cc00 RBX: ffff88810d86cce0 RCX: ffff8881075c8168
> [   34.368067] RDX: ffff88810d86cd40 RSI: 0000000000000003 RDI: ffff88810a5e1db0
> [   34.368482] RBP: ffff88810a5e1db0 R08: 00000000ffffffff R09: 0000000000000000
> [   34.368896] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8881075c8250
> [   34.369310] R13: ffff8881075c82f0 R14: ffffc90001e5bb80 R15: ffff8881075c8000
> [   34.369775]  trace_uprobe_register+0x1a8/0x300
> [   34.370053]  perf_trace_event_unreg.isra.0+0x22/0x80
> [   34.370342]  perf_uprobe_destroy+0x3a/0x70
> [   34.370582]  _free_event+0x114/0x580
> [   34.370806]  perf_event_release_kernel+0x282/0x2c0
> [   34.371123]  perf_release+0x11/0x20
> [   34.371354]  __fput+0x102/0x2e0
> [   34.371561]  task_work_run+0x55/0xa0
> [   34.371798]  syscall_exit_to_user_mode+0x1dd/0x1f0
> [   34.372104]  do_syscall_64+0x70/0x140
> [   34.372337]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   34.372659] RIP: 0033:0x7fd71c986a94
> [   34.372901] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84
> 00 00 00 00 00 90 f3 0f 1e fa 80 3d d5 18 0e 00 00 73
> [   34.373987] RSP: 002b:00007ffc09d83118 EFLAGS: 00000202 ORIG_RAX:
> 0000000000000003
> [   34.374426] RAX: 0000000000000000 RBX: 00007ffc09d835e8 RCX: 00007fd71c986a94
> [   34.374874] RDX: 000000000000000b RSI: 0000000000002401 RDI: 000000000000000c
> [   34.375325] RBP: 00007ffc09d83150 R08: 00000000049d39c7 R09: 0000000000000007
> [   34.375782] R10: 00000000049d71b0 R11: 0000000000000202 R12: 0000000000000000
> [   34.376245] R13: 00007ffc09d83608 R14: 00007fd71cae3000 R15: 000000000103cd90
> [   34.376661]  </TASK>
> [   34.376794] Modules linked in: bpf_testmod(OE) aesni_intel(E)
> crypto_simd(E) cryptd(E) kvm_intel(E) i2c_piix4(E) i2c_s)
> [   34.377900] CR2: ffffc90001c5bea0
> [   34.378097] ---[ end trace 0000000000000000 ]---
> [   34.378366] RIP: 0010:__wake_up_common+0x3e/0xc0
> [   34.378637] Code: 89 d4 55 53 48 89 fb 48 83 ec 08 8b 05 c3 05 a8
> 02 89 74 24 04 85 c0 75 6d 48 8b 43 48 48 83 c3 48 4f
> [   34.379703] RSP: 0018:ffffc900001d0d90 EFLAGS: 00010087
> [   34.380004] RAX: ffffc90001c5bea0 RBX: ffff88810138e0e8 RCX: 0000000000000001
> [   34.380414] RDX: 0000000080010005 RSI: ffffffff8295f82c RDI: ffffc90001c5be88
> [   34.380827] RBP: ffff88810138e0a0 R08: 0000000000000000 R09: 000000000000438f
> [   34.381284] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000001
> [   34.381736] R13: 0000000000000046 R14: 0000000000000000 R15: 0000000000000000
> [   34.382198] FS:  00007fd71c7b3d00(0000) GS:ffff88881ca00000(0000)
> knlGS:0000000000000000
> [   34.382712] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   34.383082] CR2: ffffc90001c5bea0 CR3: 000000010b456005 CR4: 0000000000370ef0
> [   34.383540] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   34.383991] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [   34.384407] Kernel panic - not syncing: Fatal exception in interrupt
> [   35.464708] Shutting down cpus with NMI
> [   35.465244] Kernel Offset: disabled
> [   35.465474] ---[ end Kernel panic - not syncing: Fatal exception in
> interrupt ]---
Peter Zijlstra Aug. 2, 2024, 8:27 a.m. UTC | #4
On Thu, Aug 01, 2024 at 02:13:41PM -0700, Andrii Nakryiko wrote:
> On Thu, Aug 1, 2024 at 11:58 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > + bpf
> >
> > On Thu, Aug 1, 2024 at 6:36 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Thu, Aug 01, 2024 at 03:26:38PM +0200, Oleg Nesterov wrote:
> > > > (Andrii, I'll try to look at your new series on Weekend).
> > >
> > > OK, I dropped all your previous patches and stuffed these in.
> > >
> > > They should all be visible in queue/perf/core, and provided the robot
> > > doesn't scream, I'll push them into tip/perf/core soonish.
> >
> > Just FYI, it seems like tip/perf/core is currently broken for uprobes
> > (and by implication also queue/perf/core). Also torvalds/linux/master
> > master is broken. See what I'm getting when running BPF selftests
> > dealing with uprobes. Sometimes I only get that WARNING and nothing
> > else.
> >
> > I'm bisecting at the moment with bpf/master being a "good" checkpoint,
> > will let you know once I bisect.
> 
> Ok, this bisected to:
> 
> 675ad74989c2 ("perf/core: Add aux_pause, aux_resume, aux_start_paused")
> 
> Reverting all (applied to tip/perf/core) four patches from that series:
> 
> 6763ebdb4983 (tip/perf/core) perf/x86/intel: Do not enable large PEBS
> for events with aux actions or aux sampling
> 6a45d8847597 perf/x86/intel/pt: Add support for pause / resume
> 675ad74989c2 perf/core: Add aux_pause, aux_resume, aux_start_paused
> d92792a4b26e perf/x86/intel/pt: Fix sampling synchronization
> 
> ... makes everything work again. I'll leave it up to you and Adrian to
> figure this out.

Thanks for catching this. I'll go have a look.
Peter Zijlstra Aug. 2, 2024, 9:25 a.m. UTC | #5
On Thu, Aug 01, 2024 at 02:13:41PM -0700, Andrii Nakryiko wrote:

> Ok, this bisected to:
> 
> 675ad74989c2 ("perf/core: Add aux_pause, aux_resume, aux_start_paused")

Adrian, there are at least two obvious bugs there:

 - aux_action was key's off of PERF_PMU_CAP_AUX_OUTPUT, which is not
   right, that's the capability where events can output to AUX -- aka.
   PEBS-to-PT. It should be PERF_PMU_CAP_ITRACE, which is the
   PT/CoreSight thing.

 - it sets aux_paused unconditionally, which is scribbling in the giant
   union which is overwriting state set by perf_init_event().

But I think there's more problems, we need to do the aux_action
validation after perf_get_aux_event(), we can't know if having those
bits set makes sense before that. This means the perf_event_alloc() site
is wrong in the first place.

I'm going to drop these patches for now. Please rework.
Adrian Hunter Aug. 2, 2024, 11:02 a.m. UTC | #6
On 2/08/24 12:25, Peter Zijlstra wrote:
> On Thu, Aug 01, 2024 at 02:13:41PM -0700, Andrii Nakryiko wrote:
> 
>> Ok, this bisected to:
>>
>> 675ad74989c2 ("perf/core: Add aux_pause, aux_resume, aux_start_paused")
> 
> Adrian, there are at least two obvious bugs there:
> 
>  - aux_action was key's off of PERF_PMU_CAP_AUX_OUTPUT, which is not
>    right, that's the capability where events can output to AUX -- aka.
>    PEBS-to-PT. It should be PERF_PMU_CAP_ITRACE, which is the
>    PT/CoreSight thing.
> 
>  - it sets aux_paused unconditionally, which is scribbling in the giant
>    union which is overwriting state set by perf_init_event().
> 
> But I think there's more problems, we need to do the aux_action
> validation after perf_get_aux_event(), we can't know if having those
> bits set makes sense before that. This means the perf_event_alloc() site
> is wrong in the first place.
> 
> I'm going to drop these patches for now. Please rework.

Yes I will do that.

FWIW, I'd expect the reported issue would go away with just:

diff --git a/kernel/events/core.c b/kernel/events/core.c
index e4cb6e5a5f40..2072aaa4d449 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12151,7 +12151,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		err = -EOPNOTSUPP;
 		goto err_pmu;
 	}
-	event->hw.aux_paused = event->attr.aux_start_paused;
+	if (event->attr.aux_start_paused)
+		event->hw.aux_paused = 1;
 
 	if (cgroup_fd != -1) {
 		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
Adrian Hunter Aug. 2, 2024, 5:14 p.m. UTC | #7
On 2/08/24 14:02, Adrian Hunter wrote:
> On 2/08/24 12:25, Peter Zijlstra wrote:
>> On Thu, Aug 01, 2024 at 02:13:41PM -0700, Andrii Nakryiko wrote:
>>
>>> Ok, this bisected to:
>>>
>>> 675ad74989c2 ("perf/core: Add aux_pause, aux_resume, aux_start_paused")
>>
>> Adrian, there are at least two obvious bugs there:
>>
>>  - aux_action was key's off of PERF_PMU_CAP_AUX_OUTPUT, which is not
>>    right, that's the capability where events can output to AUX -- aka.
>>    PEBS-to-PT. It should be PERF_PMU_CAP_ITRACE, which is the
>>    PT/CoreSight thing.

Not sure about that.

In perf_event_alloc(), there is:

	if (event->attr.aux_output &&
	    (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
	     event->attr.aux_pause || event->attr.aux_resume)) {
		err = -EOPNOTSUPP;
		goto err_pmu;
	}

which is to prevent aux_output with aux_pause or aux_resume.
That is because aux_output (i.e. PEBS-via-PT) has no interrupt
and so does not overflow.  (Instead the PEBS record is written
by hardware to the Intel PT trace)  No overflow => no (software)
aux_pause/aux_resume, so aux_output with aux_pause/aux_resume
does not make sense.

The PMU capability for aux_pause/aux_resume or aux_start_paused
is PERF_PMU_CAP_AUX_PAUSE.  aux_pause/aux_resume are valid for
non-AUX events (member of the group), whereas aux_start_paused
is valid for the AUX event itself (group leader).  For 
aux_pause/aux_resume the group leader's PMU capability is
checked.  For aux_start_paused the event's PMU capability is
checked.

>>
>>  - it sets aux_paused unconditionally, which is scribbling in the giant
>>    union which is overwriting state set by perf_init_event().

That definitely needs fixing, but the fix is just the diff
from my previous reply:

diff --git a/kernel/events/core.c b/kernel/events/core.c
index e4cb6e5a5f40..2072aaa4d449 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12151,7 +12151,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		err = -EOPNOTSUPP;
 		goto err_pmu;
 	}
-	event->hw.aux_paused = event->attr.aux_start_paused;
+	if (event->attr.aux_start_paused)
+		event->hw.aux_paused = 1;
 
 	if (cgroup_fd != -1) {
 		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);

I tested that with:

  # perf probe -x /root/main -a main
  Added new event:
    probe_main:main      (on main in /root/main)

  # perf record -e probe_main:main -- ./main

and it made the problem go away.

>>
>> But I think there's more problems, we need to do the aux_action
>> validation after perf_get_aux_event(), we can't know if having those
>> bits set makes sense before that. This means the perf_event_alloc() site
>> is wrong in the first place.

As above, aux_start_paused is used on the AUX event itself, so the
PMU capability is checked in perf_event_alloc:

	if (event->attr.aux_start_paused &&
	    !(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) {
		err = -EOPNOTSUPP;
		goto err_pmu;
	}

Whereas aux_pause/aux_resume are checked in perf_get_aux_event():

	if ((event->attr.aux_pause || event->attr.aux_resume) &&
	    !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
		return 0;

That all seems OK, so please let me know if there is
something else to change.
diff mbox

Patch

--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -432,7 +432,7 @@  uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func,
 
 struct testmod_uprobe {
 	struct path path;
-	loff_t offset;
+	struct uprobe *uprobe;
 	struct uprobe_consumer consumer;
 };
 
@@ -446,25 +446,25 @@  static int testmod_register_uprobe(loff_t offset)
 {
 	int err = -EBUSY;
 
-	if (uprobe.offset)
+	if (uprobe.uprobe)
 		return -EBUSY;
 
 	mutex_lock(&testmod_uprobe_mutex);
 
-	if (uprobe.offset)
+	if (uprobe.uprobe)
 		goto out;
 
 	err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path);
 	if (err)
 		goto out;
 
-	err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry),
-				     offset, 0, &uprobe.consumer);
-	if (err)
+	uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry),
+					offset, 0, &uprobe.consumer);
+	if (IS_ERR(uprobe.uprobe)) {
+		err = PTR_ERR(uprobe.uprobe);
 		path_put(&uprobe.path);
-	else
-		uprobe.offset = offset;
-
+		uprobe.uprobe = NULL;
+	}
 out:
 	mutex_unlock(&testmod_uprobe_mutex);
 	return err;
@@ -474,10 +474,10 @@  static void testmod_unregister_uprobe(void)
 {
 	mutex_lock(&testmod_uprobe_mutex);
 
-	if (uprobe.offset) {
-		uprobe_unregister(d_real_inode(uprobe.path.dentry),
-				  uprobe.offset, &uprobe.consumer);
-		uprobe.offset = 0;
+	if (uprobe.uprobe) {
+		uprobe_unregister(uprobe.uprobe, &uprobe.consumer);
+		path_put(&uprobe.path);
+		uprobe.uprobe = NULL;
 	}
 
 	mutex_unlock(&testmod_uprobe_mutex);