Message ID | 20211015110035.14813389@gandalf.local.home (mailing list archive)
---|---
State | Not Applicable
Series | tracing: Have all levels of checks prevent recursion
On Fri, Oct 15, 2021 at 11:00:35AM -0400, Steven Rostedt wrote:
> From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
>
> While writing an email explaining the "bit = 0" logic for a discussion on

> 	bit = trace_get_context_bit() + start;

While there, you were also going to update that function to match/use
get_recursion_context(). Because your version is still branch hell.
On Fri, 15 Oct 2021 18:17:02 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Oct 15, 2021 at 11:00:35AM -0400, Steven Rostedt wrote:
> > From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
> >
> > While writing an email explaining the "bit = 0" logic for a discussion on
>
> > 	bit = trace_get_context_bit() + start;
>
> While there, you were also going to update that function to match/use
> get_recursion_context(). Because your version is still branch hell.

That would probably be a separate patch. This is just a fix to a design
flaw; changing the context tests would be a performance enhancement.

Thanks for the reminder (as it was on my todo list to update that code).

-- Steve
On Fri, 15 Oct 2021 13:35:04 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Fri, 15 Oct 2021 18:17:02 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
>
> > On Fri, Oct 15, 2021 at 11:00:35AM -0400, Steven Rostedt wrote:
> > > From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
> > >
> > > While writing an email explaining the "bit = 0" logic for a discussion on
> >
> > > 	bit = trace_get_context_bit() + start;
> >
> > While there, you were also going to update that function to match/use
> > get_recursion_context(). Because your version is still branch hell.
>
> That would probably be a separate patch. This is just a fix to a design
> flaw; changing the context tests would be a performance enhancement.
>
> Thanks for the reminder (as it was on my todo list to update that code).

Something like this:

From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Subject: [PATCH] tracing: Reuse logic from perf's get_recursion_context()

Instead of having branches that add noise to the branch prediction, use
the addition logic to set the bit for the level of interrupt context that
the state is currently in. This copies the logic from perf's
get_recursion_context() function.

Link: https://lore.kernel.org/all/20211015161702.GF174703@worktop.programming.kicks-ass.net/

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_recursion.h | 11 ++++++-----
 kernel/trace/ring_buffer.c      | 12 ++++++------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 168fdf07419a..41f5bfd9e93f 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -119,12 +119,13 @@ enum {
 static __always_inline int trace_get_context_bit(void)
 {
 	unsigned long pc = preempt_count();
+	unsigned char bit = 0;
 
-	if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
-		return TRACE_CTX_NORMAL;
-	else
-		return pc & NMI_MASK ? TRACE_CTX_NMI :
-			pc & HARDIRQ_MASK ? TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ;
+	bit += !!(pc & (NMI_MASK));
+	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+
+	return TRACE_CTX_NORMAL - bit;
 }
 
 #ifdef CONFIG_FTRACE_RECORD_RECURSION
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c5a3fbf19617..15d4380006e3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3168,13 +3168,13 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	unsigned int val = cpu_buffer->current_context;
 	unsigned long pc = preempt_count();
-	int bit;
+	int bit = 0;
 
-	if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
-		bit = RB_CTX_NORMAL;
-	else
-		bit = pc & NMI_MASK ? RB_CTX_NMI :
-			pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
+	bit += !!(pc & (NMI_MASK));
+	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+
+	bit = RB_CTX_NORMAL - bit;
 
 	if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) {
 		/*
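The trick in the counting form is that each successive `!!` term tests a superset of the interrupt bits in preempt_count(): NMI context satisfies all three terms, hard IRQ two, softirq one, and plain task context none, so the sum is the context level with no conditional branches. A stand-alone sketch of the same idea, using illustrative mask values rather than the kernel headers (the real NMI_MASK/HARDIRQ_MASK/SOFTIRQ_OFFSET come from linux/preempt.h), would be:

	#include <stdio.h>

	/* Illustrative stand-ins for the preempt_count() layout */
	#define SOFTIRQ_OFFSET	0x00000100UL
	#define HARDIRQ_MASK	0x000f0000UL
	#define NMI_MASK	0x00f00000UL

	static unsigned char context_level(unsigned long pc)
	{
		unsigned char level = 0;

		/* each term tests a superset of the one below it */
		level += !!(pc & (NMI_MASK));
		level += !!(pc & (NMI_MASK | HARDIRQ_MASK));
		level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));

		return level;	/* 0 = normal, 1 = softirq, 2 = hardirq, 3 = NMI */
	}

	int main(void)
	{
		printf("normal:  %d\n", context_level(0));
		printf("softirq: %d\n", context_level(SOFTIRQ_OFFSET));
		printf("hardirq: %d\n", context_level(0x00010000UL));
		printf("nmi:     %d\n", context_level(0x00100000UL));
		return 0;
	}

The three unconditional additions replace the unpredictable if/else chain that Peter objected to; the cost is a couple of extra ALU operations on paths that could otherwise have short-circuited.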
On Fri, Oct 15, 2021 at 01:58:06PM -0400, Steven Rostedt wrote:
> Something like this:
I think having one copy of that in a header is better than having 3
copies. But yes, something along them lines.
On Fri, 15 Oct 2021 20:04:29 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Oct 15, 2021 at 01:58:06PM -0400, Steven Rostedt wrote:
> > Something like this:
>
> I think having one copy of that in a header is better than having 3
> copies. But yes, something along them lines.

I was just about to ask you about this patch ;-)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 4d244e295e85..a6ae329aa654 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -74,6 +74,27 @@
  */
 #define FORK_PREEMPT_COUNT	(2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 
+/**
+ * interrupt_context_level - return interrupt context level
+ *
+ * Returns the current interrupt context level.
+ *  0 - normal context
+ *  1 - softirq context
+ *  2 - hardirq context
+ *  3 - NMI context
+ */
+static __always_inline unsigned char interrupt_context_level(void)
+{
+	unsigned long pc = preempt_count();
+	unsigned char level = 0;
+
+	level += !!(pc & (NMI_MASK));
+	level += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+	level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+
+	return level;
+}
+
 /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
 #include <asm/preempt.h>
 
diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 41f5bfd9e93f..018a04381556 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -118,12 +118,7 @@ enum {
 
 static __always_inline int trace_get_context_bit(void)
 {
-	unsigned long pc = preempt_count();
-	unsigned char bit = 0;
-
-	bit += !!(pc & (NMI_MASK));
-	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+	unsigned char bit = interrupt_context_level();
 
 	return TRACE_CTX_NORMAL - bit;
 }
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 228801e20788..c91711f20cf8 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -206,11 +206,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
 static inline int get_recursion_context(int *recursion)
 {
 	unsigned int pc = preempt_count();
-	unsigned char rctx = 0;
-
-	rctx += !!(pc & (NMI_MASK));
-	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+	unsigned char rctx = interrupt_context_level();
 
 	if (recursion[rctx])
 		return -1;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 15d4380006e3..f6520d0a4c8c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3167,12 +3167,7 @@ static __always_inline int
 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	unsigned int val = cpu_buffer->current_context;
-	unsigned long pc = preempt_count();
-	int bit = 0;
-
-	bit += !!(pc & (NMI_MASK));
-	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+	int bit = interrupt_context_level();
 
 	bit = RB_CTX_NORMAL - bit;
On Fri, Oct 15, 2021 at 02:20:33PM -0400, Steven Rostedt wrote:
> On Fri, 15 Oct 2021 20:04:29 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
>
> > On Fri, Oct 15, 2021 at 01:58:06PM -0400, Steven Rostedt wrote:
> > > Something like this:
> >
> > I think having one copy of that in a header is better than having 3
> > copies. But yes, something along them lines.
>
> I was just about to ask you about this patch ;-)

Much better :-)

> diff --git a/kernel/events/internal.h b/kernel/events/internal.h
> index 228801e20788..c91711f20cf8 100644
> --- a/kernel/events/internal.h
> +++ b/kernel/events/internal.h
> @@ -206,11 +206,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
>  static inline int get_recursion_context(int *recursion)
>  {
>  	unsigned int pc = preempt_count();

Although I think we can do without that ^ line as well :-)

> -	unsigned char rctx = 0;
> -
> -	rctx += !!(pc & (NMI_MASK));
> -	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
> -	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
> +	unsigned char rctx = interrupt_context_level();
>
>  	if (recursion[rctx])
>  		return -1;
On Fri, 15 Oct 2021 14:20:33 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> > I think having one copy of that in a header is better than having 3
> > copies. But yes, something along them lines.
>
> I was just about to ask you about this patch ;-)

Except it doesn't build :-p  (need to move the inlined function down a bit)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 4d244e295e85..b32e3dabe28b 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -77,6 +77,27 @@
 /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
 #include <asm/preempt.h>
 
+/**
+ * interrupt_context_level - return interrupt context level
+ *
+ * Returns the current interrupt context level.
+ *  0 - normal context
+ *  1 - softirq context
+ *  2 - hardirq context
+ *  3 - NMI context
+ */
+static __always_inline unsigned char interrupt_context_level(void)
+{
+	unsigned long pc = preempt_count();
+	unsigned char level = 0;
+
+	level += !!(pc & (NMI_MASK));
+	level += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+	level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+
+	return level;
+}
+
 #define nmi_count()	(preempt_count() & NMI_MASK)
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
 #ifdef CONFIG_PREEMPT_RT
diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 41f5bfd9e93f..018a04381556 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -118,12 +118,7 @@ enum {
 
 static __always_inline int trace_get_context_bit(void)
 {
-	unsigned long pc = preempt_count();
-	unsigned char bit = 0;
-
-	bit += !!(pc & (NMI_MASK));
-	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+	unsigned char bit = interrupt_context_level();
 
 	return TRACE_CTX_NORMAL - bit;
 }
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 228801e20788..c91711f20cf8 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -206,11 +206,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
 static inline int get_recursion_context(int *recursion)
 {
 	unsigned int pc = preempt_count();
-	unsigned char rctx = 0;
-
-	rctx += !!(pc & (NMI_MASK));
-	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+	unsigned char rctx = interrupt_context_level();
 
 	if (recursion[rctx])
 		return -1;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 15d4380006e3..f6520d0a4c8c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3167,12 +3167,7 @@ static __always_inline int
 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	unsigned int val = cpu_buffer->current_context;
-	unsigned long pc = preempt_count();
-	int bit = 0;
-
-	bit += !!(pc & (NMI_MASK));
-	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-	bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+	int bit = interrupt_context_level();
 
 	bit = RB_CTX_NORMAL - bit;
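The subtraction in the two tracing call sites above is only a renumbering step: perf indexes its recursion[] array directly with the returned level (0 = task ... 3 = NMI), while the trace recursion bits are declared in the opposite order (NMI first, normal last), so TRACE_CTX_NORMAL - level and RB_CTX_NORMAL - level recover the per-context bit. A small sketch of that mapping, with stand-in enum values rather than the kernel's definitions:

	#include <assert.h>

	/* Stand-in for the ordering used by the trace recursion enums */
	enum { CTX_NMI, CTX_IRQ, CTX_SOFTIRQ, CTX_NORMAL };

	/* level: 0 = normal, 1 = softirq, 2 = hardirq, 3 = NMI */
	static int context_bit(int level)
	{
		return CTX_NORMAL - level;
	}

	int main(void)
	{
		assert(context_bit(0) == CTX_NORMAL);
		assert(context_bit(1) == CTX_SOFTIRQ);
		assert(context_bit(2) == CTX_IRQ);
		assert(context_bit(3) == CTX_NMI);
		return 0;
	}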
On Fri, 15 Oct 2021 20:24:59 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> > @@ -206,11 +206,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
> >  static inline int get_recursion_context(int *recursion)
> >  {
> >  	unsigned int pc = preempt_count();
>
> Although I think we can do without that ^ line as well :-)

Ah, I could have sworn I deleted it. Oh well, will make a proper patch set.

Thanks!

-- Steve

> > -	unsigned char rctx = 0;
On Fri 2021-10-15 11:00:35, Steven Rostedt wrote: > From: "Steven Rostedt (VMware)" <rostedt@goodmis.org> > > While writing an email explaining the "bit = 0" logic for a discussion on > making ftrace_test_recursion_trylock() disable preemption, I discovered a > path that makes the "not do the logic if bit is zero" unsafe. > > Since we want to encourage architectures to implement all ftrace features, > having them slow down due to this extra logic may encourage the > maintainers to update to the latest ftrace features. And because this > logic is only safe for them, remove it completely. > > [*] There is on layer of recursion that is allowed, and that is to allow > for the transition between interrupt context (normal -> softirq -> > irq -> NMI), because a trace may occur before the context update is > visible to the trace recursion logic. > > diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h > index a9f9c5714e65..168fdf07419a 100644 > --- a/include/linux/trace_recursion.h > +++ b/include/linux/trace_recursion.h > @@ -165,40 +147,29 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign > unsigned int val = READ_ONCE(current->trace_recursion); > int bit; > > - /* A previous recursion check was made */ > - if ((val & TRACE_CONTEXT_MASK) > max) > - return 0; @max parameter is no longer used. > - > bit = trace_get_context_bit() + start; > if (unlikely(val & (1 << bit))) { > /* > * It could be that preempt_count has not been updated during > * a switch between contexts. Allow for a single recursion. > */ > - bit = TRACE_TRANSITION_BIT; > + bit = TRACE_CTX_TRANSITION + start; I just want to be sure that I understand it correctly. The transition bit is the same for all contexts. It will allow one recursion only in one context. IMHO, the same problem (not-yet-updated preempt_count) might happen in any transition between contexts: normal -> soft IRQ -> IRQ -> NMI. Well, I am not sure what exacly it means "preempt_count has not been updated during a switch between contexts." Is it that a function in the interrupt entry code is traced before preempt_count is updated? Or that an interrupt entry is interrupted by a higher level interrupt, e.g. IRQ entry code interrupted by NMI? I guess that it is the first case. It would mean that the above function allows one mistake (one traced funtion in an interrupt entry code before the entry code updates preempt_count). Do I get it correctly? Is this what we want? Could we please update the comment? I mean to say if it is a race or if we trace a function that should not get traced. > if (val & (1 << bit)) { > do_ftrace_record_recursion(ip, pip); > return -1; > } > - } else { > - /* Normal check passed, clear the transition to allow it again */ > - val &= ~(1 << TRACE_TRANSITION_BIT); > } > > val |= 1 << bit; > current->trace_recursion = val; > barrier(); > > - return bit + 1; > + return bit; > } > > static __always_inline void trace_clear_recursion(int bit) > { > - if (!bit) > - return; > - > barrier(); > - bit--; > trace_recursion_clear(bit); > } Otherwise, the change looks great. It simplifies that logic a lot. I think that I start understanding it ;-) Best Regards, Petr
On Mon, 18 Oct 2021 12:19:20 +0200 Petr Mladek <pmladek@suse.com> wrote: > On Fri 2021-10-15 11:00:35, Steven Rostedt wrote: > > From: "Steven Rostedt (VMware)" <rostedt@goodmis.org> > > > > While writing an email explaining the "bit = 0" logic for a discussion on > > making ftrace_test_recursion_trylock() disable preemption, I discovered a > > path that makes the "not do the logic if bit is zero" unsafe. > > > > Since we want to encourage architectures to implement all ftrace features, > > having them slow down due to this extra logic may encourage the > > maintainers to update to the latest ftrace features. And because this > > logic is only safe for them, remove it completely. > > > > [*] There is on layer of recursion that is allowed, and that is to allow > > for the transition between interrupt context (normal -> softirq -> > > irq -> NMI), because a trace may occur before the context update is > > visible to the trace recursion logic. > > > > diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h > > index a9f9c5714e65..168fdf07419a 100644 > > --- a/include/linux/trace_recursion.h > > +++ b/include/linux/trace_recursion.h > > @@ -165,40 +147,29 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign > > unsigned int val = READ_ONCE(current->trace_recursion); > > int bit; > > > > - /* A previous recursion check was made */ > > - if ((val & TRACE_CONTEXT_MASK) > max) > > - return 0; > > @max parameter is no longer used. Thanks for noticing! > > > - > > bit = trace_get_context_bit() + start; > > if (unlikely(val & (1 << bit))) { > > /* > > * It could be that preempt_count has not been updated during > > * a switch between contexts. Allow for a single recursion. > > */ > > - bit = TRACE_TRANSITION_BIT; > > + bit = TRACE_CTX_TRANSITION + start; > > I just want to be sure that I understand it correctly. > > The transition bit is the same for all contexts. It will allow one > recursion only in one context. Right. > > IMHO, the same problem (not-yet-updated preempt_count) might happen > in any transition between contexts: normal -> soft IRQ -> IRQ -> NMI. Yes, and then we will drop the event if it happens twice, otherwise, we will need to have a 4 layer transition bit mask, and allow 4 recursions, which is more than I want to allow. > > Well, I am not sure what exacly it means "preempt_count has not been > updated during a switch between contexts." > > Is it that a function in the interrupt entry code is traced before > preempt_count is updated? > > Or that an interrupt entry is interrupted by a higher level > interrupt, e.g. IRQ entry code interrupted by NMI? Both actually ;-) There are places that can trigger a trace between the time the interrupt is triggered, and the time the preempt_count updates the interrupt context it is in. Thus the tracer will still think it is in the previous context. But that is OK, unless, that interrupt happened while the previous context was in the middle of tracing: trace() { context = get_context(preempt_count); test_and_set_bit(context) <<--- interrupt --->>> trace() { context = get_context(preempt_count); test_and_set_bit(context); <-- detects recursion! } update_preempt_count(irq_context); By allowing a single recursion, it still does the above trace. Yes, if an NMI happens during that first interrupt trace, and it too traces before the preempt_count is updated, it will detect it as a recursion. But this can only happen for that one trace. 
After the trace returns, the transition bit is cleared, and the tracing that happens in the rest of the interrupt is using the proper context. Thus, to drop due to needing two transition bits, it would require that an interrupt triggered during a trace, and while it was tracing before the preempt_count update, it too needed to be interrupted by something (NMI) and that needs to trace before the preempt_count update. Although, I think we were able to remove all the NMI tracing before the update, there's a game of whack-a-mole to handle the interrupt cases. > > > I guess that it is the first case. It would mean that the above > function allows one mistake (one traced funtion in an interrupt entry > code before the entry code updates preempt_count). > > Do I get it correctly? > Is this what we want? Pretty much, which my above explanation to hopefully fill in the details. > > > Could we please update the comment? I mean to say if it is a race > or if we trace a function that should not get traced. Comments can always use some loving ;-) > > > if (val & (1 << bit)) { > > do_ftrace_record_recursion(ip, pip); > > return -1; > > } > > - } else { > > - /* Normal check passed, clear the transition to allow it again */ > > - val &= ~(1 << TRACE_TRANSITION_BIT); > > } > > > > val |= 1 << bit; > > current->trace_recursion = val; > > barrier(); > > > > - return bit + 1; > > + return bit; > > } > > > > static __always_inline void trace_clear_recursion(int bit) > > { > > - if (!bit) > > - return; > > - > > barrier(); > > - bit--; > > trace_recursion_clear(bit); > > } > > Otherwise, the change looks great. It simplifies that logic a lot. > I think that I start understanding it ;-) Awesome. I'll make some more updates. -- Steve
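Put together, the guard described above can be modelled as a per-task bitmask with one bit per context plus a single shared transition bit: one nested trace is tolerated per trylock/unlock pair, and a second one is dropped. The following is a simplified sketch of that behaviour, not the kernel implementation quoted at the end of the thread (which also handles the per-callback `start` offset, READ_ONCE() and barriers); names and bit positions are illustrative:

	/*
	 * Simplified model of the recursion guard: one bit per context plus
	 * a single shared TRANSITION bit.
	 */
	enum { BIT_NMI, BIT_IRQ, BIT_SOFTIRQ, BIT_NORMAL, BIT_TRANSITION };

	static unsigned int recursion;	/* per-task in the real code */

	/* Returns the bit taken, or -1 if this is a genuine recursion. */
	static int recursion_trylock(int ctx_bit)
	{
		if (recursion & (1U << ctx_bit)) {
			/*
			 * Possibly a trace from an interrupt that fired before
			 * preempt_count was updated: allow one extra level.
			 */
			ctx_bit = BIT_TRANSITION;
			if (recursion & (1U << ctx_bit))
				return -1;	/* drop the event */
		}
		recursion |= 1U << ctx_bit;
		return ctx_bit;
	}

	static void recursion_unlock(int bit)
	{
		recursion &= ~(1U << bit);
	}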
On Mon 2021-10-18 09:50:27, Steven Rostedt wrote: > On Mon, 18 Oct 2021 12:19:20 +0200 > Petr Mladek <pmladek@suse.com> wrote: > > > On Fri 2021-10-15 11:00:35, Steven Rostedt wrote: > > > From: "Steven Rostedt (VMware)" <rostedt@goodmis.org> > > > > > > While writing an email explaining the "bit = 0" logic for a discussion on > > > making ftrace_test_recursion_trylock() disable preemption, I discovered a > > > path that makes the "not do the logic if bit is zero" unsafe. > > > > > > Since we want to encourage architectures to implement all ftrace features, > > > having them slow down due to this extra logic may encourage the > > > maintainers to update to the latest ftrace features. And because this > > > logic is only safe for them, remove it completely. > > > > > > [*] There is on layer of recursion that is allowed, and that is to allow > > > for the transition between interrupt context (normal -> softirq -> > > > irq -> NMI), because a trace may occur before the context update is > > > visible to the trace recursion logic. > > > > > > diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h > > > index a9f9c5714e65..168fdf07419a 100644 > > > --- a/include/linux/trace_recursion.h > > > +++ b/include/linux/trace_recursion.h > > > @@ -165,40 +147,29 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign > > > unsigned int val = READ_ONCE(current->trace_recursion); > > > int bit; > > > > > > - /* A previous recursion check was made */ > > > - if ((val & TRACE_CONTEXT_MASK) > max) > > > - return 0; > > > > @max parameter is no longer used. > > Thanks for noticing! > > > > > > - > > > bit = trace_get_context_bit() + start; > > > if (unlikely(val & (1 << bit))) { > > > /* > > > * It could be that preempt_count has not been updated during > > > * a switch between contexts. Allow for a single recursion. > > > */ > > > - bit = TRACE_TRANSITION_BIT; > > > + bit = TRACE_CTX_TRANSITION + start; > > > > I just want to be sure that I understand it correctly. > > > > The transition bit is the same for all contexts. It will allow one > > recursion only in one context. > > Right. > > > > > IMHO, the same problem (not-yet-updated preempt_count) might happen > > in any transition between contexts: normal -> soft IRQ -> IRQ -> NMI. > > Yes, and then we will drop the event if it happens twice, otherwise, we > will need to have a 4 layer transition bit mask, and allow 4 recursions, > which is more than I want to allow. Fair enough. I am still not sure if we want to allow the recursion at all, see below. > > > > > Well, I am not sure what exacly it means "preempt_count has not been > > updated during a switch between contexts." > > > > Is it that a function in the interrupt entry code is traced before > > preempt_count is updated? > > > > Or that an interrupt entry is interrupted by a higher level > > interrupt, e.g. IRQ entry code interrupted by NMI? > > Both actually ;-) I see. But only one is a problem. See below. > There are places that can trigger a trace between the time the interrupt is > triggered, and the time the preempt_count updates the interrupt context it > is in. By other words, these locations trace a function that is called between entering the context and updating preempt_count. > Thus the tracer will still think it is in the previous context. But > that is OK, unless, that interrupt happened while the previous context was > in the middle of tracing: My understanding is that this is _not_ OK. 
Peter Zijlstra fixed the generic entry and no function should be traced there. The problem is that some architectures still allow to trace some code in this entry context code. And this is why you currently tolerate one level of recursion here. Do I get it correctly? But wait. If you tolerate the recursion, you actually tolerate tracing the code between entering context and setting preempt count. It is in your example: > trace() { > context = get_context(preempt_count); > test_and_set_bit(context) > <<--- interrupt --->>> > trace() { > context = get_context(preempt_count); > test_and_set_bit(context); <-- detects recursion! > } > update_preempt_count(irq_context); > > By allowing a single recursion, it still does the above trace. This happens only when the nested trace() is tracing something in the IRQ entry before update_preempt_count(irq_context). My understanding is that Peter Zijlstra do _not_ want to allow the nested trace because this code should not get traced. The outer trace() will work even if we reject the inner trace(). Let me show that interrupting context entry is fine: <<--- normal context -->> trace() context = get_context(preempt_count); // normal context test_and_set_bit(context) <<--- interrupt --->>> interrupt_entry() // tracing is _not_ allowed here update_preempt_count(irq_context); // tracing should work after this point interrupt_handler(); trace() context = get_context(preempt_count); // IRQ context test_and_set_bit(context); // Passes; it sees IRQ context // trace code is allowed <<-- back to normal context --> // outer trace code is allowed. It sees normal context. Best Regards, Petr
On Mon, 18 Oct 2021 12:19:20 +0200
Petr Mladek <pmladek@suse.com> wrote:

> > -
> >  	bit = trace_get_context_bit() + start;
> >  	if (unlikely(val & (1 << bit))) {
> >  		/*
> >  		 * It could be that preempt_count has not been updated during
> >  		 * a switch between contexts. Allow for a single recursion.
> >  		 */
> > -		bit = TRACE_TRANSITION_BIT;
> > +		bit = TRACE_CTX_TRANSITION + start;
>
> [..]
>
> Could we please update the comment? I mean to say if it is a race
> or if we trace a function that should not get traced.

What do you think of this change?

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 1d8cce02c3fb..24f284eb55a7 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -168,8 +168,12 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
 	bit = trace_get_context_bit() + start;
 	if (unlikely(val & (1 << bit))) {
 		/*
-		 * It could be that preempt_count has not been updated during
-		 * a switch between contexts. Allow for a single recursion.
+		 * If an interrupt occurs during a trace, and another trace
+		 * happens in that interrupt but before the preempt_count is
+		 * updated to reflect the new interrupt context, then this
+		 * will think a recursion occurred, and the event will be dropped.
+		 * Let a single instance happen via the TRANSITION_BIT to
+		 * not drop those events.
 		 */
 		bit = TRACE_TRANSITION_BIT;
 		if (val & (1 << bit)) {

-- Steve
On Mon 2021-10-18 22:02:03, Steven Rostedt wrote: > On Mon, 18 Oct 2021 12:19:20 +0200 > Petr Mladek <pmladek@suse.com> wrote: > > > > - > > > bit = trace_get_context_bit() + start; > > > if (unlikely(val & (1 << bit))) { > > > /* > > > * It could be that preempt_count has not been updated during > > > * a switch between contexts. Allow for a single recursion. > > > */ > > > - bit = TRACE_TRANSITION_BIT; > > > + bit = TRACE_CTX_TRANSITION + start; > > > > [..] > > > Could we please update the comment? I mean to say if it is a race > > or if we trace a function that should not get traced. > > What do you think of this change? > > diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h > index 1d8cce02c3fb..24f284eb55a7 100644 > --- a/include/linux/trace_recursion.h > +++ b/include/linux/trace_recursion.h > @@ -168,8 +168,12 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign > bit = trace_get_context_bit() + start; > if (unlikely(val & (1 << bit))) { > /* > - * It could be that preempt_count has not been updated during > - * a switch between contexts. Allow for a single recursion. > + * If an interrupt occurs during a trace, and another trace > + * happens in that interrupt but before the preempt_count is > + * updated to reflect the new interrupt context, then this > + * will think a recursion occurred, and the event will be dropped. > + * Let a single instance happen via the TRANSITION_BIT to > + * not drop those events. > */ > bit = TRACE_TRANSITION_BIT; > if (val & (1 << bit)) { > > Looks good to me. Thanks for the update. Feel free to postpone this change. I do not want to complicate upstreaming the fix for stable. I am sorry if I already complicated it. Best Regards, Petr
On Tue, 19 Oct 2021 08:41:23 +0200
Petr Mladek <pmladek@suse.com> wrote:

> Feel free to postpone this change. I do not want to complicate
> upstreaming the fix for stable. I am sorry if I already
> complicated it.

No problem. It's not that complicated of a merge fix. I'm sure Linus can
handle it ;-)

-- Steve
From: Steven Rostedt <rostedt@goodmis.org> Date: Fri, 15 Oct 2021 14:25:41 -0400 Sorry for such a necroposting :z Just wanted to know if this is a bug, so that I could send a fix, or intended behaviour. > On Fri, 15 Oct 2021 14:20:33 -0400 > Steven Rostedt <rostedt@goodmis.org> wrote: > >>> I think having one copy of that in a header is better than having 3 >>> copies. But yes, something along them lines. >> >> I was just about to ask you about this patch ;-) > > Except it doesn't build :-p (need to move the inlined function down a bit) > > diff --git a/include/linux/preempt.h b/include/linux/preempt.h > index 4d244e295e85..b32e3dabe28b 100644 > --- a/include/linux/preempt.h > +++ b/include/linux/preempt.h > @@ -77,6 +77,27 @@ > /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ > #include <asm/preempt.h> > > +/** > + * interrupt_context_level - return interrupt context level > + * > + * Returns the current interrupt context level. > + * 0 - normal context > + * 1 - softirq context > + * 2 - hardirq context > + * 3 - NMI context > + */ > +static __always_inline unsigned char interrupt_context_level(void) > +{ > + unsigned long pc = preempt_count(); > + unsigned char level = 0; > + > + level += !!(pc & (NMI_MASK)); > + level += !!(pc & (NMI_MASK | HARDIRQ_MASK)); > + level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); This doesn't take into account that we can switch the context manually via local_bh_disable() / local_irq_save() etc. During the testing of the separate issue[0], I've found that the function returns 1 in both just softirq and softirq under local_irq_save(). Is this intended? Shouldn't that be level += !!(pc & (NMI_MASK)); level += !!(pc * (NMI_MASK | HARDIRQ_MASK)) || irqs_disabled(); level += !!(pc * (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)) || in_atomic(); ? Otherwise, the result it returns is not really "context level". > + > + return level; > +} > + [0] https://lore.kernel.org/netdev/b3884ff9-d903-948d-797a-1830a39b1e71@intel.com Thanks, Olek
On Fri, 21 Jul 2023 17:34:41 +0200 Alexander Lobakin <aleksander.lobakin@intel.com> wrote: > From: Steven Rostedt <rostedt@goodmis.org> > Date: Fri, 15 Oct 2021 14:25:41 -0400 > > Sorry for such a necroposting :z > Just wanted to know if this is a bug, so that I could send a fix, or > intended behaviour. > > > On Fri, 15 Oct 2021 14:20:33 -0400 > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > >>> I think having one copy of that in a header is better than having 3 > >>> copies. But yes, something along them lines. > >> > >> I was just about to ask you about this patch ;-) > > > > Except it doesn't build :-p (need to move the inlined function down a bit) > > > > diff --git a/include/linux/preempt.h b/include/linux/preempt.h > > index 4d244e295e85..b32e3dabe28b 100644 > > --- a/include/linux/preempt.h > > +++ b/include/linux/preempt.h > > @@ -77,6 +77,27 @@ > > /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ > > #include <asm/preempt.h> > > > > +/** > > + * interrupt_context_level - return interrupt context level > > + * > > + * Returns the current interrupt context level. > > + * 0 - normal context > > + * 1 - softirq context > > + * 2 - hardirq context > > + * 3 - NMI context > > + */ > > +static __always_inline unsigned char interrupt_context_level(void) > > +{ > > + unsigned long pc = preempt_count(); > > + unsigned char level = 0; > > + > > + level += !!(pc & (NMI_MASK)); > > + level += !!(pc & (NMI_MASK | HARDIRQ_MASK)); > > + level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); > > This doesn't take into account that we can switch the context manually > via local_bh_disable() / local_irq_save() etc. During the testing of the You cannot manually switch interrupt context. > separate issue[0], I've found that the function returns 1 in both just > softirq and softirq under local_irq_save(). > Is this intended? Shouldn't that be That is intended behavior. local_bh_disable() and local_irq_save() is not a context switch. It is just preventing that context from happening. The interrupt_context_level() is to tell us what context we are running in, not what context is disabled. > > level += !!(pc & (NMI_MASK)); > level += !!(pc * (NMI_MASK | HARDIRQ_MASK)) || irqs_disabled(); > level += !!(pc * (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)) || > in_atomic(); > > ? > Otherwise, the result it returns is not really "context level". local_bh_disable() use to (and perhaps still does in some configurations) confuse things. But read the comment in kernel/softirq.c /* * SOFTIRQ_OFFSET usage: * * On !RT kernels 'count' is the preempt counter, on RT kernels this applies * to a per CPU counter and to task::softirqs_disabled_cnt. * * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq * processing. * * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) * on local_bh_disable or local_bh_enable. * * This lets us distinguish between whether we are currently processing * softirq and whether we just have bh disabled. */ Just because you disable interrupts does not mean you are in interrupt context. -- Steve > > > + > > + return level; > > +} > > + > [0] > https://lore.kernel.org/netdev/b3884ff9-d903-948d-797a-1830a39b1e71@intel.com > > Thanks, > Olek
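The distinction Steve draws in the reply below comes from how preempt_count() accounts for softirqs on !RT kernels: running softirq work adds SOFTIRQ_OFFSET, while local_bh_disable() adds SOFTIRQ_DISABLE_OFFSET (twice SOFTIRQ_OFFSET), and interrupt_context_level() only tests the former. A user-space sketch with stand-in values for those constants (they mirror the usual !RT layout but are defined here purely for illustration):

	#include <stdio.h>

	/* Stand-ins for the !RT softirq accounting constants */
	#define SOFTIRQ_OFFSET		0x100UL			/* set while *serving* softirq */
	#define SOFTIRQ_DISABLE_OFFSET	(2 * SOFTIRQ_OFFSET)	/* added by local_bh_disable() */
	#define SOFTIRQ_MASK		0xff00UL

	static int serving_softirq(unsigned long pc)
	{
		return !!(pc & SOFTIRQ_OFFSET);		/* what the level check tests */
	}

	static int bh_disabled(unsigned long pc)
	{
		return !!(pc & SOFTIRQ_MASK);		/* disabled *or* serving */
	}

	int main(void)
	{
		unsigned long task_with_bh_off = SOFTIRQ_DISABLE_OFFSET;
		unsigned long in_softirq_handler = SOFTIRQ_OFFSET;

		/* local_bh_disable() in task context: not softirq context */
		printf("bh off:  serving=%d disabled=%d\n",
		       serving_softirq(task_with_bh_off), bh_disabled(task_with_bh_off));

		/* actually running softirq work: softirq context */
		printf("softirq: serving=%d disabled=%d\n",
		       serving_softirq(in_softirq_handler), bh_disabled(in_softirq_handler));
		return 0;
	}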
From: Steven Rostedt <rostedt@goodmis.org> Date: Fri, 21 Jul 2023 12:00:40 -0400 > On Fri, 21 Jul 2023 17:34:41 +0200 > Alexander Lobakin <aleksander.lobakin@intel.com> wrote: [...] >>> + level += !!(pc & (NMI_MASK)); >>> + level += !!(pc & (NMI_MASK | HARDIRQ_MASK)); >>> + level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); >> >> This doesn't take into account that we can switch the context manually >> via local_bh_disable() / local_irq_save() etc. During the testing of the > > You cannot manually switch interrupt context. > >> separate issue[0], I've found that the function returns 1 in both just >> softirq and softirq under local_irq_save(). >> Is this intended? Shouldn't that be > > That is intended behavior. > > local_bh_disable() and local_irq_save() is not a context switch. It is just > preventing that context from happening. The interrupt_context_level() is to > tell us what context we are running in, not what context is disabled. > >> >> level += !!(pc & (NMI_MASK)); >> level += !!(pc * (NMI_MASK | HARDIRQ_MASK)) || irqs_disabled(); >> level += !!(pc * (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)) || >> in_atomic(); >> >> ? >> Otherwise, the result it returns is not really "context level". > > local_bh_disable() use to (and perhaps still does in some configurations) > confuse things. But read the comment in kernel/softirq.c > > /* > * SOFTIRQ_OFFSET usage: > * > * On !RT kernels 'count' is the preempt counter, on RT kernels this applies > * to a per CPU counter and to task::softirqs_disabled_cnt. > * > * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq > * processing. > * > * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) > * on local_bh_disable or local_bh_enable. > * > * This lets us distinguish between whether we are currently processing > * softirq and whether we just have bh disabled. > */ > > Just because you disable interrupts does not mean you are in interrupt > context. Ah okay, thanks! IOW, if we want to check in some code that we're certainly have interrupts enabled and are not in the interrupt context, we must always do if (!(in_hardirq() || irqs_disabled())) , nothing more elegant / already existing / ...? > > -- Steve > > >> >>> + >>> + return level; >>> +} >>> + >> [0] >> https://lore.kernel.org/netdev/b3884ff9-d903-948d-797a-1830a39b1e71@intel.com >> >> Thanks, >> Olek > Thanks, Olek
On Fri, 21 Jul 2023 18:06:07 +0200
Alexander Lobakin <aleksander.lobakin@intel.com> wrote:

> > Just because you disable interrupts does not mean you are in interrupt
> > context.
>
> Ah okay, thanks! IOW, if we want to check in some code that we certainly
> have interrupts enabled and are not in the interrupt context,
> we must always do
>
> 	if (!(in_hardirq() || irqs_disabled()))

Yeah, probably.

> , nothing more elegant / already existing / ...?

It's not a common check. What would you call that?

	irq_safe() ?

-- Steve
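If that check were ever wanted as a named predicate, a minimal sketch along the lines floated here might look like the following; note that irq_safe() is only the name suggested in this message, not an existing kernel API:

	/*
	 * Hypothetical helper wrapping the check discussed above; neither the
	 * name nor the helper exists in the kernel as of this thread.
	 */
	static inline bool irq_safe(void)
	{
		return !(in_hardirq() || irqs_disabled());
	}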
On Fri, 21 Jul 2023 12:26:32 -0400 Steven Rostedt wrote:
> > 	if (!(in_hardirq() || irqs_disabled()))
> >
>
> Yeah, probably.
>
> > , nothing more elegant / already existing / ...?
>
> It's not a common check. What would you call that?

Looks like Olek started the weekend already so let me answer.
He's trying to check if we can use a fast path cache which takes
a _bh spin lock.
diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index a9f9c5714e65..168fdf07419a 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -16,23 +16,8 @@ * When function tracing occurs, the following steps are made: * If arch does not support a ftrace feature: * call internal function (uses INTERNAL bits) which calls... - * If callback is registered to the "global" list, the list - * function is called and recursion checks the GLOBAL bits. - * then this function calls... * The function callback, which can use the FTRACE bits to * check for recursion. - * - * Now if the arch does not support a feature, and it calls - * the global list function which calls the ftrace callback - * all three of these steps will do a recursion protection. - * There's no reason to do one if the previous caller already - * did. The recursion that we are protecting against will - * go through the same steps again. - * - * To prevent the multiple recursion checks, if a recursion - * bit is set that is higher than the MAX bit of the current - * check, then we know that the check was made by the previous - * caller, and we can skip the current check. */ enum { /* Function recursion bits */ @@ -40,12 +25,14 @@ enum { TRACE_FTRACE_NMI_BIT, TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, + TRACE_FTRACE_TRANSITION_BIT, - /* INTERNAL_BITs must be greater than FTRACE_BITs */ + /* Internal use recursion bits */ TRACE_INTERNAL_BIT, TRACE_INTERNAL_NMI_BIT, TRACE_INTERNAL_IRQ_BIT, TRACE_INTERNAL_SIRQ_BIT, + TRACE_INTERNAL_TRANSITION_BIT, TRACE_BRANCH_BIT, /* @@ -86,12 +73,6 @@ enum { */ TRACE_GRAPH_NOTRACE_BIT, - /* - * When transitioning between context, the preempt_count() may - * not be correct. Allow for a single recursion to cover this case. - */ - TRACE_TRANSITION_BIT, - /* Used to prevent recursion recording from recursing. */ TRACE_RECORD_RECURSION_BIT, }; @@ -132,6 +113,7 @@ enum { TRACE_CTX_IRQ, TRACE_CTX_SOFTIRQ, TRACE_CTX_NORMAL, + TRACE_CTX_TRANSITION, }; static __always_inline int trace_get_context_bit(void) @@ -165,40 +147,29 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign unsigned int val = READ_ONCE(current->trace_recursion); int bit; - /* A previous recursion check was made */ - if ((val & TRACE_CONTEXT_MASK) > max) - return 0; - bit = trace_get_context_bit() + start; if (unlikely(val & (1 << bit))) { /* * It could be that preempt_count has not been updated during * a switch between contexts. Allow for a single recursion. */ - bit = TRACE_TRANSITION_BIT; + bit = TRACE_CTX_TRANSITION + start; if (val & (1 << bit)) { do_ftrace_record_recursion(ip, pip); return -1; } - } else { - /* Normal check passed, clear the transition to allow it again */ - val &= ~(1 << TRACE_TRANSITION_BIT); } val |= 1 << bit; current->trace_recursion = val; barrier(); - return bit + 1; + return bit; } static __always_inline void trace_clear_recursion(int bit) { - if (!bit) - return; - barrier(); - bit--; trace_recursion_clear(bit); }