Message ID | 20241119153502.41361-12-vschneid@redhat.com (mailing list archive) |
---|---|
State | New |
Series | context_tracking,x86: Defer some IPIs until a user->kernel transition |
On Tue, Nov 19, 2024 at 04:34:58PM +0100, Valentin Schneider wrote:
> +bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
> +{
> +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> +	unsigned int old;
> +	bool ret = false;
> +
> +	preempt_disable();
> +
> +	old = atomic_read(&ct->state);
> +	/*
> +	 * Try setting the work until either
> +	 * - the target CPU has entered kernelspace
> +	 * - the work has been set
> +	 */
> +	do {
> +		ret = atomic_try_cmpxchg(&ct->state, &old, old | (work << CT_WORK_START));
> +	} while (!ret && ((old & CT_STATE_MASK) != CT_STATE_KERNEL));
> +
> +	preempt_enable();
> +	return ret;

Does it ignore the IPI even if:

	(ret && (old & CT_STATE_MASK) == CT_STATE_KERNEL)

?

And what about CT_STATE_IDLE?

Is the work ignored in those two cases?

But would it be cleaner to never set the work if the target is elsewhere
than CT_STATE_USER? So you don't need to clear the work on kernel exit
but rather on kernel entry.

That is:

bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
{
	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
	unsigned int old;
	bool ret = false;

	preempt_disable();

	old = atomic_read(&ct->state);

	/* Start with our best wishes */
	old &= ~CT_STATE_MASK;
	old |= CT_STATE_USER;

	/*
	 * Try setting the work until either
	 * - the target CPU has exited userspace
	 * - the work has been set
	 */
	do {
		ret = atomic_try_cmpxchg(&ct->state, &old, old | (work << CT_WORK_START));
	} while (!ret && ((old & CT_STATE_MASK) == CT_STATE_USER));

	preempt_enable();

	return ret;
}

Thanks.
On Wed, Nov 20, 2024 at 11:54:36AM +0100, Frederic Weisbecker wrote:
> On Tue, Nov 19, 2024 at 04:34:58PM +0100, Valentin Schneider wrote:
> > +bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
> > +{
> > +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> > +	unsigned int old;
> > +	bool ret = false;
> > +
> > +	preempt_disable();
> > +
> > +	old = atomic_read(&ct->state);
> > +	/*
> > +	 * Try setting the work until either
> > +	 * - the target CPU has entered kernelspace
> > +	 * - the work has been set
> > +	 */
> > +	do {
> > +		ret = atomic_try_cmpxchg(&ct->state, &old, old | (work << CT_WORK_START));
> > +	} while (!ret && ((old & CT_STATE_MASK) != CT_STATE_KERNEL));
> > +
> > +	preempt_enable();
> > +	return ret;
>
> Does it ignore the IPI even if:
>
> 	(ret && (old & CT_STATE_MASK) == CT_STATE_KERNEL)
>
> ?
>
> And what about CT_STATE_IDLE?
>
> Is the work ignored in those two cases?
>
> But would it be cleaner to never set the work if the target is elsewhere
> than CT_STATE_USER? So you don't need to clear the work on kernel exit
> but rather on kernel entry.
>
> That is:
>
> bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
> {
> 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> 	unsigned int old;
> 	bool ret = false;
>
> 	preempt_disable();
>
> 	old = atomic_read(&ct->state);
>
> 	/* Start with our best wishes */
> 	old &= ~CT_STATE_MASK;
> 	old |= CT_STATE_USER;
>
> 	/*
> 	 * Try setting the work until either
> 	 * - the target CPU has exited userspace
> 	 * - the work has been set
> 	 */
> 	do {
> 		ret = atomic_try_cmpxchg(&ct->state, &old, old | (work << CT_WORK_START));
> 	} while (!ret && ((old & CT_STATE_MASK) == CT_STATE_USER));
>
> 	preempt_enable();
>
> 	return ret;
> }

Ah but there is CT_STATE_GUEST and I see the last patch also applies that to
CT_STATE_IDLE.

So that could be:

bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
{
	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
	unsigned int old;
	bool ret = false;

	preempt_disable();

	old = atomic_read(&ct->state);

	/* CT_STATE_IDLE can be added to last patch here */
	if (!(old & (CT_STATE_USER | CT_STATE_GUEST))) {
		old &= ~CT_STATE_MASK;
		old |= CT_STATE_USER;
	}

	/*
	 * Try setting the work until either
	 * - the target CPU has exited userspace / guest
	 * - the work has been set
	 */
	do {
		ret = atomic_try_cmpxchg(&ct->state, &old, old | (work << CT_WORK_START));
	} while (!ret && old & (CT_STATE_USER | CT_STATE_GUEST));

	preempt_enable();

	return ret;
}

Thanks.
On 20/11/24 15:23, Frederic Weisbecker wrote:
> Ah but there is CT_STATE_GUEST and I see the last patch also applies that to
> CT_STATE_IDLE.
>
> So that could be:
>
> bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
> {
> 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> 	unsigned int old;
> 	bool ret = false;
>
> 	preempt_disable();
>
> 	old = atomic_read(&ct->state);
>
> 	/* CT_STATE_IDLE can be added to last patch here */
> 	if (!(old & (CT_STATE_USER | CT_STATE_GUEST))) {
> 		old &= ~CT_STATE_MASK;
> 		old |= CT_STATE_USER;
> 	}

Hmph, so that lets us leverage the cmpxchg for a !CT_STATE_KERNEL check, but
we get an extra loop iteration if the target CPU exits kernelspace to
somewhere other than userspace (e.g. vcpu or idle) in the meantime - not
great, not terrible.

At the cost of one extra bit for the CT_STATE area, with CT_STATE_KERNEL=1
we could do:

  old = atomic_read(&ct->state);
  old &= ~CT_STATE_KERNEL;
On Wed, Nov 20, 2024 at 06:10:43PM +0100, Valentin Schneider wrote:
> On 20/11/24 15:23, Frederic Weisbecker wrote:
>
> > Ah but there is CT_STATE_GUEST and I see the last patch also applies that to
> > CT_STATE_IDLE.
> >
> > So that could be:
> >
> > bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
> > {
> > 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> > 	unsigned int old;
> > 	bool ret = false;
> >
> > 	preempt_disable();
> >
> > 	old = atomic_read(&ct->state);
> >
> > 	/* CT_STATE_IDLE can be added to last patch here */
> > 	if (!(old & (CT_STATE_USER | CT_STATE_GUEST))) {
> > 		old &= ~CT_STATE_MASK;
> > 		old |= CT_STATE_USER;
> > 	}
>
> Hmph, so that lets us leverage the cmpxchg for a !CT_STATE_KERNEL check, but
> we get an extra loop iteration if the target CPU exits kernelspace to
> somewhere other than userspace (e.g. vcpu or idle) in the meantime - not
> great, not terrible.

The thing is, what you read with atomic_read() should be close to reality.
If it already is != CT_STATE_KERNEL then you're good (minus racy changes).
If it is CT_STATE_KERNEL then you still must do a failing cmpxchg() in any
case, at least to make sure you didn't miss a context tracking change. So
the best you can do is a bet.

> At the cost of one extra bit for the CT_STATE area, with CT_STATE_KERNEL=1
> we could do:
>
>   old = atomic_read(&ct->state);
>   old &= ~CT_STATE_KERNEL;

And perhaps also old |= CT_STATE_IDLE (I'm seeing the last patch now), so you
at least get a chance of making it right (only ~CT_STATE_KERNEL will always
fail) and CPUs usually spend most of their time idle.

Thanks.
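To make the variant discussed above concrete, here is a minimal sketch of what
ct_set_cpu_work() could look like under the assumptions being floated: that
CT_STATE_KERNEL becomes a standalone bit (so it can be cleared directly) and
that the other states are encoded so CT_STATE_IDLE can be OR-ed in as the
initial guess. This is an editorial illustration of the suggestion, not code
from the series:

bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
{
	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
	unsigned int old;
	bool ret = false;

	preempt_disable();

	old = atomic_read(&ct->state);

	/*
	 * Bet on the target CPU being idle: CPUs spend most of their time
	 * there, and a wrong bet only costs one failing cmpxchg before the
	 * loop retries with the value it actually observed.
	 */
	old &= ~CT_STATE_KERNEL;
	old |= CT_STATE_IDLE;

	/*
	 * Try setting the work until either
	 * - the target CPU is seen executing in kernelspace
	 * - the work has been set
	 */
	do {
		ret = atomic_try_cmpxchg(&ct->state, &old, old | (work << CT_WORK_START));
	} while (!ret && !(old & CT_STATE_KERNEL));

	preempt_enable();
	return ret;
}

The point of spending the extra state bit is that the cmpxchg itself performs
the "is the target out of the kernel?" check, so the work only ever needs to be
cleared on kernel entry.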
diff --git a/arch/Kconfig b/arch/Kconfig
index bd9f095d69fa0..e7f3f797a34a4 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -912,6 +912,15 @@ config HAVE_CONTEXT_TRACKING_USER_OFFSTACK
 	  - No use of instrumentation, unless instrumentation_begin() got called.
 
+config HAVE_CONTEXT_TRACKING_WORK
+	bool
+	help
+	  Architecture supports deferring work while not in kernel context.
+	  This is especially useful on setups with isolated CPUs that might
+	  want to avoid being interrupted to perform housekeeping tasks (for
+	  ex. TLB invalidation or icache invalidation). The housekeeping
+	  operations are performed upon re-entering the kernel.
+
 config HAVE_TIF_NOHZ
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 16354dfa6d965..c703376dd326b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -213,6 +213,7 @@ config X86
 	select HAVE_CMPXCHG_LOCAL
 	select HAVE_CONTEXT_TRACKING_USER		if X86_64
 	select HAVE_CONTEXT_TRACKING_USER_OFFSTACK	if HAVE_CONTEXT_TRACKING_USER
+	select HAVE_CONTEXT_TRACKING_WORK		if X86_64
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_OBJTOOL_MCOUNT			if HAVE_OBJTOOL
 	select HAVE_OBJTOOL_NOP_MCOUNT			if HAVE_OBJTOOL_MCOUNT
diff --git a/arch/x86/include/asm/context_tracking_work.h b/arch/x86/include/asm/context_tracking_work.h
new file mode 100644
index 0000000000000..5bc29e6b2ed38
--- /dev/null
+++ b/arch/x86/include/asm/context_tracking_work.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CONTEXT_TRACKING_WORK_H
+#define _ASM_X86_CONTEXT_TRACKING_WORK_H
+
+static __always_inline void arch_context_tracking_work(int work)
+{
+	switch (work) {
+	case CONTEXT_WORK_n:
+		// Do work...
+		break;
+	}
+}
+
+#endif
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index af9fe87a09225..16a2eb7525f1f 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -5,6 +5,7 @@
 #include <linux/sched.h>
 #include <linux/vtime.h>
 #include <linux/context_tracking_state.h>
+#include <linux/context_tracking_work.h>
 #include <linux/instrumentation.h>
 
 #include <asm/ptrace.h>
@@ -137,6 +138,26 @@ static __always_inline unsigned long ct_state_inc(int incby)
 	return raw_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state));
 }
 
+#ifdef CONTEXT_TRACKING_WORK
+static __always_inline unsigned long ct_state_inc_clear_work(int incby)
+{
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+	unsigned long new, old, state;
+
+	state = arch_atomic_read(&ct->state);
+	do {
+		old = state;
+		new = old & ~CONTEXT_WORK_MASK;
+		new += incby;
+		state = arch_atomic_cmpxchg(&ct->state, old, new);
+	} while (old != state);
+
+	return new;
+}
+#else
+#define ct_state_inc_clear_work(x) ct_state_inc(x)
+#endif
+
 static __always_inline bool warn_rcu_enter(void)
 {
 	bool ret = false;
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 0b81248aa03e2..d1d37dbdf7195 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -5,6 +5,7 @@
 #include <linux/percpu.h>
 #include <linux/static_key.h>
 #include <linux/context_tracking_irq.h>
+#include <linux/context_tracking_work.h>
 
 /* Offset to allow distinguishing irq vs. task-based idle entry/exit.  */
 #define CT_NESTING_IRQ_NONIDLE	((LONG_MAX / 2) + 1)
@@ -39,16 +40,19 @@ struct context_tracking {
 };
 
 /*
- * We cram two different things within the same atomic variable:
+ * We cram up to three different things within the same atomic variable:
  *
- *                     CT_RCU_WATCHING_START  CT_STATE_START
- *                                |                |
- *                                v                v
- *     MSB [ RCU watching counter ][ context_state ] LSB
- *         ^                       ^
- *         |                       |
- * CT_RCU_WATCHING_END        CT_STATE_END
+ *                     CT_RCU_WATCHING_START                  CT_STATE_START
+ *                                |       CT_WORK_START           |
+ *                                |            |                  |
+ *                                v            v                  v
+ *     MSB [ RCU watching counter ][ context work ][ context_state ] LSB
+ *         ^                       ^               ^
+ *         |                       |               |
+ *         |                  CT_WORK_END          |
+ * CT_RCU_WATCHING_END                        CT_STATE_END
  *
+ * The [ context work ] region spans 0 bits if CONFIG_CONTEXT_WORK=n
  * Bits are used from the LSB upwards, so unused bits (if any) will always be in
  * upper bits of the variable.
  */
@@ -59,18 +63,24 @@ struct context_tracking {
 #define CT_STATE_START 0
 #define CT_STATE_END   (CT_STATE_START + CT_STATE_WIDTH - 1)
 
-#define CT_RCU_WATCHING_MAX_WIDTH (CT_SIZE - CT_STATE_WIDTH)
+#define CT_WORK_WIDTH  (IS_ENABLED(CONFIG_CONTEXT_TRACKING_WORK) ? CONTEXT_WORK_MAX_OFFSET : 0)
+#define CT_WORK_START  (CT_STATE_END + 1)
+#define CT_WORK_END    (CT_WORK_START + CT_WORK_WIDTH - 1)
+
+#define CT_RCU_WATCHING_MAX_WIDTH (CT_SIZE - CT_WORK_WIDTH - CT_STATE_WIDTH)
 #define CT_RCU_WATCHING_WIDTH     (IS_ENABLED(CONFIG_RCU_DYNTICKS_TORTURE) ? 2 : CT_RCU_WATCHING_MAX_WIDTH)
-#define CT_RCU_WATCHING_START     (CT_STATE_END + 1)
+#define CT_RCU_WATCHING_START     (CT_WORK_END + 1)
 #define CT_RCU_WATCHING_END       (CT_RCU_WATCHING_START + CT_RCU_WATCHING_WIDTH - 1)
 #define CT_RCU_WATCHING           BIT(CT_RCU_WATCHING_START)
 
 #define CT_STATE_MASK        GENMASK(CT_STATE_END,        CT_STATE_START)
+#define CT_WORK_MASK         GENMASK(CT_WORK_END,         CT_WORK_START)
 #define CT_RCU_WATCHING_MASK GENMASK(CT_RCU_WATCHING_END, CT_RCU_WATCHING_START)
 
 #define CT_UNUSED_WIDTH (CT_RCU_WATCHING_MAX_WIDTH - CT_RCU_WATCHING_WIDTH)
 
 static_assert(CT_STATE_WIDTH        +
+	      CT_WORK_WIDTH         +
 	      CT_RCU_WATCHING_WIDTH +
 	      CT_UNUSED_WIDTH == CT_SIZE);
diff --git a/include/linux/context_tracking_work.h b/include/linux/context_tracking_work.h
new file mode 100644
index 0000000000000..fb74db8876dd2
--- /dev/null
+++ b/include/linux/context_tracking_work.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CONTEXT_TRACKING_WORK_H
+#define _LINUX_CONTEXT_TRACKING_WORK_H
+
+#include <linux/bitops.h>
+
+enum {
+	CONTEXT_WORK_n_OFFSET,
+	CONTEXT_WORK_MAX_OFFSET
+};
+
+enum ct_work {
+	CONTEXT_WORK_n		= BIT(CONTEXT_WORK_n_OFFSET),
+	CONTEXT_WORK_MAX	= BIT(CONTEXT_WORK_MAX_OFFSET)
+};
+
+#include <asm/context_tracking_work.h>
+
+#ifdef CONFIG_CONTEXT_TRACKING_WORK
+extern bool ct_set_cpu_work(unsigned int cpu, unsigned int work);
+#else
+static inline bool
+ct_set_cpu_work(unsigned int cpu, unsigned int work) { return false; }
+#endif
+
+#endif
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 938c48952d265..37b094ea56fb6 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -72,6 +72,47 @@ static __always_inline void rcu_task_trace_heavyweight_exit(void)
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 }
 
+#ifdef CONFIG_CONTEXT_TRACKING_WORK
+static noinstr void ct_work_flush(unsigned long seq)
+{
+	int bit;
+
+	seq = (seq & CT_WORK_MASK) >> CT_WORK_START;
+
+	/*
+	 * arch_context_tracking_work() must be noinstr, non-blocking,
+	 * and NMI safe.
+	 */
+	for_each_set_bit(bit, &seq, CONTEXT_WORK_MAX)
+		arch_context_tracking_work(BIT(bit));
+}
+
+bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
+{
+	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
+	unsigned int old;
+	bool ret = false;
+
+	preempt_disable();
+
+	old = atomic_read(&ct->state);
+	/*
+	 * Try setting the work until either
+	 * - the target CPU has entered kernelspace
+	 * - the work has been set
+	 */
+	do {
+		ret = atomic_try_cmpxchg(&ct->state, &old, old | (work << CT_WORK_START));
+	} while (!ret && ((old & CT_STATE_MASK) != CT_STATE_KERNEL));
+
+	preempt_enable();
+	return ret;
+}
+#else
+static __always_inline void ct_work_flush(unsigned long work) { }
+static __always_inline void ct_work_clear(struct context_tracking *ct) { }
+#endif
+
 /*
  * Record entry into an extended quiescent state. This is only to be
  * called when not already in an extended quiescent state, that is,
@@ -88,7 +129,7 @@ static noinstr void ct_kernel_exit_state(int offset)
 	 * next idle sojourn.
 	 */
 	rcu_task_trace_heavyweight_enter();  // Before CT state update!
-	seq = ct_state_inc(offset);
+	seq = ct_state_inc_clear_work(offset);
 	// RCU is no longer watching.  Better be in extended quiescent state!
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING));
 }
@@ -100,7 +141,7 @@ static noinstr void ct_kernel_exit_state(int offset)
  */
 static noinstr void ct_kernel_enter_state(int offset)
 {
-	int seq;
+	unsigned long seq;
 
 	/*
 	 * CPUs seeing atomic_add_return() must see prior idle sojourns,
@@ -108,6 +149,7 @@ static noinstr void ct_kernel_enter_state(int offset)
 	 * critical section.
 	 */
 	seq = ct_state_inc(offset);
+	ct_work_flush(seq);
 	// RCU is now watching.  Better not be in an extended quiescent state!
 	rcu_task_trace_heavyweight_exit();  // After CT state update!
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING));
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8ebb6d5a106be..04efc2b605823 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -186,6 +186,11 @@ config CONTEXT_TRACKING_USER_FORCE
 	  Say N otherwise, this option brings an overhead that you don't want
 	  in production.
 
+config CONTEXT_TRACKING_WORK
+	bool
+	depends on HAVE_CONTEXT_TRACKING_WORK && CONTEXT_TRACKING_USER
+	default y
+
 config NO_HZ
 	bool "Old Idle dynticks config"
 	help
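For illustration, here is a minimal caller-side sketch of how the deferral in
this patch is meant to be consumed, built around the CONTEXT_WORK_n placeholder
above. The helper names defer_or_send_ipi() and do_work_func() are hypothetical
and not part of this series; the fallback simply uses the existing
smp_call_function_single() IPI path:

#include <linux/context_tracking_work.h>
#include <linux/smp.h>

/* Immediate (IPI) version of the operation encoded by CONTEXT_WORK_n. */
static void do_work_func(void *info)
{
	/* Perform the housekeeping operation on the interrupted CPU. */
}

static void defer_or_send_ipi(unsigned int cpu)
{
	/*
	 * If the target CPU is not executing in kernelspace, the work bit is
	 * set in its context_tracking state and will be flushed by
	 * ct_work_flush() on its next kernel entry - no IPI needed.
	 */
	if (ct_set_cpu_work(cpu, CONTEXT_WORK_n))
		return;

	/* Target is in kernelspace: interrupt it the usual way. */
	smp_call_function_single(cpu, do_work_func, NULL, 1);
}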