Message ID: 4b875158-1aa7-402e-8861-860a493c49cd@I-love.SAKURA.ne.jp (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: bpf: don't call mmap_read_trylock() from IRQ context

On 2024/06/08 19:53, Tetsuo Handa wrote:
> inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
Oops, "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage." example was
found at https://syzkaller.appspot.com/text?tag=CrashReport&x=14f0179a980000 .
Then, do we want to
- if (in_hardirq()) {
+ if (!in_task()) {
instead?
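
For reference, a minimal sketch of the difference between the two checks (the helper name below is made up for illustration; the real predicates live in include/linux/preempt.h): in_hardirq() is true only in hard interrupt context, while !in_task() additionally rejects softirq and NMI context, which matches the {SOFTIRQ-ON-W} report above.

#include <linux/preempt.h>

/* Hypothetical helper, for illustration only: "!in_task()" rejects
 * every context the three checks below reject, because in_task() is
 * (roughly) !(in_nmi() | in_hardirq() | in_serving_softirq()).
 */
static inline bool mmap_trylock_context_ok(void)
{
	if (in_hardirq())		/* hard interrupt handler */
		return false;
	if (in_serving_softirq())	/* softirq handler */
		return false;
	if (in_nmi())			/* NMI */
		return false;
	return true;			/* process context, i.e. in_task() */
}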
On 2024/06/08 20:04, Tetsuo Handa wrote:
> On 2024/06/08 19:53, Tetsuo Handa wrote:
>> inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
>
> Oops, "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage." example was
> found at https://syzkaller.appspot.com/text?tag=CrashReport&x=14f0179a980000 .
>
> Then, do we want to
>
> - if (in_hardirq()) {
> + if (!in_task()) {
>
> instead?

"inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage." upon unlock from IRQ work
was reported at https://syzkaller.appspot.com/bug?extid=40905bca570ae6784745 .
On Fri, Jun 14, 2024 at 8:15 AM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> On 2024/06/08 20:04, Tetsuo Handa wrote:
> > On 2024/06/08 19:53, Tetsuo Handa wrote:
> >> inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
> >
> > Oops, "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage." example was
> > found at https://syzkaller.appspot.com/text?tag=CrashReport&x=14f0179a980000 .
> >
> > Then, do we want to
> >
> > - if (in_hardirq()) {
> > + if (!in_task()) {
> >
> > instead?
> >
> "inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage." upon unlock from IRQ work
> was reported at https://syzkaller.appspot.com/bug?extid=40905bca570ae6784745 .

imo the issue is elsewhere. syzbot reports:

local_lock_acquire include/linux/local_lock_internal.h:29 [inline]
__mmap_lock_do_trace_released+0x9c/0x620 mm/mmap_lock.c:243
__mmap_lock_trace_released include/linux/mmap_lock.h:42 [inline]

it complains about:

local_lock(&memcg_paths.lock);

in TRACE_MMAP_LOCK_EVENT, which looks like a false positive.
Hello.

On 2024/06/15 1:40, Alexei Starovoitov wrote:
> On Fri, Jun 14, 2024 at 8:15 AM Tetsuo Handa
> <penguin-kernel@i-love.sakura.ne.jp> wrote:
>> "inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage." upon unlock from IRQ work
>> was reported at https://syzkaller.appspot.com/bug?extid=40905bca570ae6784745 .
>
> imo the issue is elsewhere. syzbot reports:
> local_lock_acquire include/linux/local_lock_internal.h:29 [inline]
> __mmap_lock_do_trace_released+0x9c/0x620 mm/mmap_lock.c:243
> __mmap_lock_trace_released include/linux/mmap_lock.h:42 [inline]
>
> it complains about:
> local_lock(&memcg_paths.lock);
> in TRACE_MMAP_LOCK_EVENT.
> which looks like a false positive.

Commit 2b5067a8143e ("mm: mmap_lock: add tracepoints around lock
acquisition") introduced the TRACE_MMAP_LOCK_EVENT() macro as below.

#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
	do {                                                                   \
		const char *memcg_path;                                        \
		preempt_disable();                                             \
		memcg_path = get_mm_memcg_path(mm);                            \
		trace_mmap_lock_##type(mm,                                     \
				       memcg_path != NULL ? memcg_path : "",   \
				       ##__VA_ARGS__);                         \
		if (likely(memcg_path != NULL))                                \
			put_memcg_path_buf();                                  \
		preempt_enable();                                              \
	} while (0)

Commit d01079f3d0c0 ("mm/mmap_lock: remove dead code for !CONFIG_TRACING
configurations") moved the location of this macro.

Commit 832b50725373 ("mm: mmap_lock: use local locks instead of disabling
preemption") replaced preempt_disable() with local_lock(&memcg_paths.lock),
based on an argument that preempt_disable() has to be avoided because
get_mm_memcg_path() might sleep if PREEMPT_RT=y.

The local_lock() macro is defined as

#define local_lock(lock)	__local_lock(lock)

in include/linux/local_lock.h, and the __local_lock() macro is defined as

#define __local_lock(lock)					\
	do {							\
		preempt_disable();				\
		local_lock_acquire(this_cpu_ptr(lock));		\
	} while (0)

if PREEMPT_RT=n, or

#define __local_lock(__lock)					\
	do {							\
		migrate_disable();				\
		spin_lock(this_cpu_ptr((__lock)));		\
	} while (0)

if PREEMPT_RT=y, in include/linux/local_lock_internal.h .
Note the difference between preempt_disable() and migrate_disable().

Commit d01079f3d0c0 ("mm/mmap_lock: remove dead code for !CONFIG_TRACING
configurations") mistakenly replaced local_lock() with preempt_disable(),
and commit e904c2ccf9b5 ("mm: mmap_lock: fix disabling preemption
directly") replaced preempt_disable() with local_lock().

Now, syzbot started reporting

  inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.

and

  inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.

messages, because local_lock() does not disable IRQs.

I think we can replace local_lock() with local_lock_irqsave(), like

diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..1901ad22ccbd 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -156,14 +156,15 @@ static inline void put_memcg_path_buf(void)
 #define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
 	do {                                                                   \
 		const char *memcg_path;                                        \
-		local_lock(&memcg_paths.lock);                                 \
+		unsigned long flags;                                           \
+		local_lock_irqsave(&memcg_paths.lock, flags);                  \
 		memcg_path = get_mm_memcg_path(mm);                            \
 		trace_mmap_lock_##type(mm,                                     \
 				       memcg_path != NULL ? memcg_path : "",   \
 				       ##__VA_ARGS__);                         \
 		if (likely(memcg_path != NULL))                                \
 			put_memcg_path_buf();                                  \
-		local_unlock(&memcg_paths.lock);                               \
+		local_unlock_irqrestore(&memcg_paths.lock, flags);             \
 	} while (0)
 
 #else /* !CONFIG_MEMCG */

because local_lock_irqsave() is defined as

#define local_lock_irqsave(lock, flags)	__local_lock_irqsave(lock, flags)

in include/linux/local_lock.h, and the __local_lock_irqsave() macro is
defined as

#define __local_lock_irqsave(lock, flags)			\
	do {							\
		local_irq_save(flags);				\
		local_lock_acquire(this_cpu_ptr(lock));		\
	} while (0)

if PREEMPT_RT=n, or

#define __local_lock_irqsave(lock, flags)			\
	do {							\
		typecheck(unsigned long, flags);		\
		flags = 0;					\
		__local_lock(lock);				\
	} while (0)

if PREEMPT_RT=y, in include/linux/local_lock_internal.h .

But a fundamental question arises here: why do we need to hold
memcg_paths.lock here at all? As of commit 44ef20baed8e ("Merge tag
's390-6.10-4' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux"),
"git grep -nF memcg_paths" indicates that memcg_paths.lock is used only
within the TRACE_MMAP_LOCK_EVENT() macro.

mm/mmap_lock.c:48:static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
mm/mmap_lock.c:63:	memcg_path = per_cpu_ptr(&memcg_paths, cpu);
mm/mmap_lock.c:101:	rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
mm/mmap_lock.c:135:	struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
mm/mmap_lock.c:152:	local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
mm/mmap_lock.c:159:		local_lock(&memcg_paths.lock);                                 \
mm/mmap_lock.c:166:		local_unlock(&memcg_paths.lock);                               \

That is, what is the reason we can't revert commit 832b50725373 and make
the TRACE_MMAP_LOCK_EVENT() macro branch directly, like below?

diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..1e440c7d48e6 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -153,19 +153,35 @@ static inline void put_memcg_path_buf(void)
 	rcu_read_unlock();
 }
 
+#ifndef CONFIG_PREEMPT_RT
 #define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
 	do {                                                                   \
 		const char *memcg_path;                                        \
-		local_lock(&memcg_paths.lock);                                 \
+		preempt_disable();                                             \
 		memcg_path = get_mm_memcg_path(mm);                            \
 		trace_mmap_lock_##type(mm,                                     \
 				       memcg_path != NULL ? memcg_path : "",   \
 				       ##__VA_ARGS__);                         \
 		if (likely(memcg_path != NULL))                                \
 			put_memcg_path_buf();                                  \
-		local_unlock(&memcg_paths.lock);                               \
+		preempt_enable();                                              \
 	} while (0)
+#else
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
+	do {                                                                   \
+		const char *memcg_path;                                        \
+		migrate_disable();                                             \
+		memcg_path = get_mm_memcg_path(mm);                            \
+		trace_mmap_lock_##type(mm,                                     \
+				       memcg_path != NULL ? memcg_path : "",   \
+				       ##__VA_ARGS__);                         \
+		if (likely(memcg_path != NULL))                                \
+			put_memcg_path_buf();                                  \
+		migrate_enable();                                              \
+	} while (0)
+#endif
 
 #else /* !CONFIG_MEMCG */
 
 int trace_mmap_lock_reg(void)

Is the reason because &buf[idx] in get_memcg_path_buf() might become out of
bounds due to preemption in normal context if PREEMPT_RT=y? If so, can't we
add an "idx >= 0 && idx < CONTEXT_COUNT" check into get_memcg_path_buf() and
return NULL if preemption (or interrupt or recursion, if any) exhausted the
per-cpu buffer?

/*
 * How many contexts our trace events might be called in: normal, softirq, irq,
 * and NMI.
 */
#define CONTEXT_COUNT 4
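
For illustration, a sketch of the bounds check suggested above (an untested assumption, not a proposed patch): since idx in get_memcg_path_buf() is a byte offset, the slot check "idx >= 0 && idx < CONTEXT_COUNT" translates into a byte bound of MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT.

static inline char *get_memcg_path_buf(void)
{
	struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
	char *buf;
	int idx;

	rcu_read_lock();
	buf = rcu_dereference(memcg_path->buf);
	if (buf == NULL) {
		rcu_read_unlock();
		return NULL;
	}
	idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) -
	      MEMCG_PATH_BUF_SIZE;
	/* Suggested addition: if nested contexts exhausted the buffer,
	 * undo the reservation and make the caller skip the path. */
	if (idx < 0 || idx >= MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT) {
		local_sub(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx);
		rcu_read_unlock();
		return NULL;
	}
	return &buf[idx];
}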
On 2024/06/15 19:59, Tetsuo Handa wrote:
> Is the reason because &buf[idx] in get_memcg_path_buf() might become out of
> bounds due to preemption in normal context if PREEMPT_RT=y? If so, can't we
> add an "idx >= 0 && idx < CONTEXT_COUNT" check into get_memcg_path_buf() and
> return NULL if preemption (or interrupt or recursion, if any) exhausted the
> per-cpu buffer?

More simply, why not use an on-stack buffer, for MEMCG_PATH_BUF_SIZE is only 256?

 mm/mmap_lock.c | 175 ++++++-------------------------------------------
 1 file changed, 20 insertions(+), 155 deletions(-)

diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..59165c01c960 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -19,14 +19,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
 
 #ifdef CONFIG_MEMCG
 
-/*
- * Our various events all share the same buffer (because we don't want or need
- * to allocate a set of buffers *per event type*), so we need to protect against
- * concurrent _reg() and _unreg() calls, and count how many _reg() calls have
- * been made.
- */
-static DEFINE_MUTEX(reg_lock);
-static int reg_refcount; /* Protected by reg_lock. */
+static atomic_t reg_refcount;
 
 /*
  * Size of the buffer for memcg path names. Ignoring stack trace support,
@@ -34,136 +27,22 @@ static int reg_refcount; /* Protected by reg_lock. */
  */
 #define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
 
-/*
- * How many contexts our trace events might be called in: normal, softirq, irq,
- * and NMI.
- */
-#define CONTEXT_COUNT 4
-
-struct memcg_path {
-	local_lock_t lock;
-	char __rcu *buf;
-	local_t buf_idx;
-};
-static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
-	.lock = INIT_LOCAL_LOCK(lock),
-	.buf_idx = LOCAL_INIT(0),
-};
-
-static char **tmp_bufs;
-
-/* Called with reg_lock held. */
-static void free_memcg_path_bufs(void)
-{
-	struct memcg_path *memcg_path;
-	int cpu;
-	char **old = tmp_bufs;
-
-	for_each_possible_cpu(cpu) {
-		memcg_path = per_cpu_ptr(&memcg_paths, cpu);
-		*(old++) = rcu_dereference_protected(memcg_path->buf,
-						     lockdep_is_held(&reg_lock));
-		rcu_assign_pointer(memcg_path->buf, NULL);
-	}
-
-	/* Wait for inflight memcg_path_buf users to finish. */
-	synchronize_rcu();
-
-	old = tmp_bufs;
-	for_each_possible_cpu(cpu) {
-		kfree(*(old++));
-	}
-
-	kfree(tmp_bufs);
-	tmp_bufs = NULL;
-}
-
 int trace_mmap_lock_reg(void)
 {
-	int cpu;
-	char *new;
-
-	mutex_lock(&reg_lock);
-
-	/* If the refcount is going 0->1, proceed with allocating buffers. */
-	if (reg_refcount++)
-		goto out;
-
-	tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs),
-				 GFP_KERNEL);
-	if (tmp_bufs == NULL)
-		goto out_fail;
-
-	for_each_possible_cpu(cpu) {
-		new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL);
-		if (new == NULL)
-			goto out_fail_free;
-		rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
-		/* Don't need to wait for inflights, they'd have gotten NULL. */
-	}
-
-out:
-	mutex_unlock(&reg_lock);
+	atomic_inc(&reg_refcount);
 	return 0;
-
-out_fail_free:
-	free_memcg_path_bufs();
-out_fail:
-	/* Since we failed, undo the earlier ref increment. */
-	--reg_refcount;
-
-	mutex_unlock(&reg_lock);
-	return -ENOMEM;
 }
 
 void trace_mmap_lock_unreg(void)
 {
-	mutex_lock(&reg_lock);
-
-	/* If the refcount is going 1->0, proceed with freeing buffers. */
-	if (--reg_refcount)
-		goto out;
-
-	free_memcg_path_bufs();
-
-out:
-	mutex_unlock(&reg_lock);
-}
-
-static inline char *get_memcg_path_buf(void)
-{
-	struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
-	char *buf;
-	int idx;
-
-	rcu_read_lock();
-	buf = rcu_dereference(memcg_path->buf);
-	if (buf == NULL) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) -
-	      MEMCG_PATH_BUF_SIZE;
-	return &buf[idx];
+	atomic_dec(&reg_refcount);
 }
 
-static inline void put_memcg_path_buf(void)
-{
-	local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
-	rcu_read_unlock();
-}
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
-	do {                                                                   \
-		const char *memcg_path;                                        \
-		local_lock(&memcg_paths.lock);                                 \
-		memcg_path = get_mm_memcg_path(mm);                            \
-		trace_mmap_lock_##type(mm,                                     \
-				       memcg_path != NULL ? memcg_path : "",   \
-				       ##__VA_ARGS__);                         \
-		if (likely(memcg_path != NULL))                                \
-			put_memcg_path_buf();                                  \
-		local_unlock(&memcg_paths.lock);                               \
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                    \
+	do {                                                    \
+		char buf[MEMCG_PATH_BUF_SIZE];                  \
+		get_mm_memcg_path(mm, buf, sizeof(buf));        \
+		trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
 	} while (0)
 
 #else /* !CONFIG_MEMCG */
@@ -185,37 +64,23 @@ void trace_mmap_lock_unreg(void)
 #ifdef CONFIG_TRACING
 #ifdef CONFIG_MEMCG
 /*
- * Write the given mm_struct's memcg path to a percpu buffer, and return a
- * pointer to it. If the path cannot be determined, or no buffer was available
- * (because the trace event is being unregistered), NULL is returned.
- *
- * Note: buffers are allocated per-cpu to avoid locking, so preemption must be
- * disabled by the caller before calling us, and re-enabled only after the
- * caller is done with the pointer.
- *
- * The caller must call put_memcg_path_buf() once the buffer is no longer
- * needed. This must be done while preemption is still disabled.
+ * Write the given mm_struct's memcg path to an on-stack buffer. If the path
+ * cannot be determined or the trace event is being unregistered, an empty
+ * string is written.
  */
-static const char *get_mm_memcg_path(struct mm_struct *mm)
+static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
 {
-	char *buf = NULL;
-	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+	struct mem_cgroup *memcg;
 
+	buf[0] = '\0';
+	/* No need to get path if no trace event is registered. */
+	if (!atomic_read(&reg_refcount))
+		return;
+	memcg = get_mem_cgroup_from_mm(mm);
 	if (memcg == NULL)
-		goto out;
-	if (unlikely(memcg->css.cgroup == NULL))
-		goto out_put;
-
-	buf = get_memcg_path_buf();
-	if (buf == NULL)
-		goto out_put;
-
-	cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE);
-
-out_put:
+		return;
+	if (memcg->css.cgroup)
+		cgroup_path(memcg->css.cgroup, buf, buflen);
 	css_put(&memcg->css);
-out:
-	return buf;
 }
 #endif /* CONFIG_MEMCG */
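
For clarity, what the proposed on-stack variant expands to at a call site (a sketch assuming the patch above; trace_mmap_lock_start_locking() is one of the existing mmap_lock trace events):

	/* Expansion of TRACE_MMAP_LOCK_EVENT(start_locking, mm, true):
	 * no per-cpu buffer and no local_lock, usable from any context,
	 * at the cost of MEMCG_PATH_BUF_SIZE (256) bytes of stack. */
	do {
		char buf[MEMCG_PATH_BUF_SIZE];

		get_mm_memcg_path(mm, buf, sizeof(buf));
		trace_mmap_lock_start_locking(mm, buf, true);
	} while (0);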
diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
index 5d18d7d85bef..337eb314d918 100644
--- a/kernel/bpf/mmap_unlock_work.h
+++ b/kernel/bpf/mmap_unlock_work.h
@@ -26,7 +26,13 @@ static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **wo
 	struct mmap_unlock_irq_work *work = NULL;
 	bool irq_work_busy = false;
 
-	if (irqs_disabled()) {
+	if (in_hardirq()) {
+		/*
+		 * IRQ context does not allow to trylock mmap sem.
+		 * Force the fallback code.
+		 */
+		irq_work_busy = true;
+	} else if (irqs_disabled()) {
 		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
 			work = this_cpu_ptr(&mmap_unlock_work);
 			if (irq_work_is_busy(&work->irq_work)) {
syzbot is reporting that the same local lock is held when trying to hold
mmap sem from both IRQ-enabled context and IRQ context. Since all callers
use bpf_mmap_unlock_get_irq_work() before calling mmap_read_trylock(),
test in_hardirq() in bpf_mmap_unlock_get_irq_work() in order to make sure
that mmap_read_trylock() won't be called from IRQ context.

asm_exc_page_fault() => exc_page_fault() => handle_page_fault() =>
do_user_addr_fault() => handle_mm_fault() => __handle_mm_fault() =>
handle_pte_fault() => do_pte_missing() => do_anonymous_page() =>
vmf_anon_prepare() => mmap_read_trylock() =>
__mmap_lock_trace_start_locking() => __mmap_lock_do_trace_start_locking() =>
local_lock_acquire() => lock_acquire()

sysvec_irq_work() => instr_sysvec_irq_work() => __sysvec_irq_work() =>
irq_work_run() => irq_work_run_list() => irq_work_single() =>
do_bpf_send_signal() => group_send_sig_info() => rcu_read_unlock() =>
rcu_lock_release() => lock_release() => trace_lock_release() =>
perf_trace_lock() => perf_trace_run_bpf_submit() => trace_call_bpf() =>
bpf_prog_run_array() => bpf_prog_run() => __bpf_prog_run() =>
bpf_dispatcher_nop_func() => bpf_prog_e6cf5f9c69743609() =>
__bpf_get_stack() => stack_map_get_build_id_offset() =>
mmap_read_trylock() => __mmap_lock_trace_acquire_returned() =>
__mmap_lock_do_trace_acquire_returned() => local_lock_acquire() =>
lock_acquire()

WARNING: inconsistent lock state
6.10.0-rc2-syzkaller-00222-gd30d0e49da71 #0 Not tainted
--------------------------------
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
syz-executor.2/10910 [HC1[1]:SC0[0]:HE0:SE1] takes:
ffff8880b9538828 (lock#10){?.+.}-{2:2}, at: local_lock_acquire include/linux/local_lock_internal.h:29 [inline]
ffff8880b9538828 (lock#10){?.+.}-{2:2}, at: __mmap_lock_do_trace_acquire_returned+0x8f/0x630 mm/mmap_lock.c:237

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(lock#10);
  <Interrupt>
    lock(lock#10);

 *** DEADLOCK ***

Reported-by: syzbot <syzbot+a225ee3df7e7f9372dbe@syzkaller.appspotmail.com>
Closes: https://syzkaller.appspot.com/bug?extid=a225ee3df7e7f9372dbe
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
An example report is https://syzkaller.appspot.com/text?tag=CrashReport&x=1649a2e2980000 ,
but it is not cited above because the link will disappear eventually.

 kernel/bpf/mmap_unlock_work.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
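
For context, the caller pattern the patch relies on, modeled loosely on stack_map_get_build_id_offset() in kernel/bpf/stackmap.c (a simplified sketch, not the verbatim code):

	/* Every caller asks bpf_mmap_unlock_get_irq_work() first, so
	 * returning "busy" from hard IRQ context forces the fallback
	 * path and keeps mmap_read_trylock() out of IRQ context. */
	struct mmap_unlock_irq_work *work = NULL;
	bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

	if (irq_work_busy || !mmap_read_trylock(current->mm)) {
		/* Cannot take mmap sem: fall back (e.g. report raw ips). */
	} else {
		/* ... walk VMAs, resolve build IDs ... */
		bpf_mmap_unlock_mm(work, current->mm);	/* may defer unlock to irq_work */
	}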