diff mbox series

[V2,11/11] x86/rcu: Add THUNK rcu_read_unlock_special_thunk

Message ID 20240407090558.3395-12-jiangshanlai@gmail.com (mailing list archive)
State New
Headers show
Series rcu/x86: Use per-cpu rcu preempt count | expand

Commit Message

Lai Jiangshan April 7, 2024, 9:05 a.m. UTC
From: Lai Jiangshan <jiangshan.ljs@antgroup.com>

Add rcu_read_unlock_special_thunk(), so that the inlined rcu_read_unlock()
doesn't need any code to save the caller-saved registers.

Make rcu_read_unlock() only two instructions in the slow path at the
caller site.

Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
---
 arch/x86/entry/thunk.S             | 5 +++++
 arch/x86/include/asm/rcu_preempt.h | 4 +++-
 2 files changed, 8 insertions(+), 1 deletion(-)

Comments

Joel Fernandes April 23, 2024, 5:26 p.m. UTC | #1
Hello Lai,

On Sun, Apr 7, 2024 at 5:07 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>
> From: Lai Jiangshan <jiangshan.ljs@antgroup.com>
>
> Add rcu_read_unlock_special_thunk(), so that the inlined rcu_read_unlock()
> doesn't need any code to save the caller-saved registers.
>
> Make rcu_read_unlock() only two instructions in the slow path at the
> caller site.
>
> Cc: "Paul E. McKenney" <paulmck@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Frederic Weisbecker <frederic@kernel.org>
> Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
> ---
>  arch/x86/entry/thunk.S             | 5 +++++
>  arch/x86/include/asm/rcu_preempt.h | 4 +++-
>  2 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/thunk.S b/arch/x86/entry/thunk.S
> index 119ebdc3d362..10c60369a67c 100644
> --- a/arch/x86/entry/thunk.S
> +++ b/arch/x86/entry/thunk.S
> @@ -13,3 +13,8 @@ THUNK preempt_schedule_thunk, preempt_schedule
>  THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
>  EXPORT_SYMBOL(preempt_schedule_thunk)
>  EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
> +
> +#ifdef CONFIG_PCPU_RCU_PREEMPT_COUNT
> +THUNK rcu_read_unlock_special_thunk, rcu_read_unlock_special
> +EXPORT_SYMBOL_GPL(rcu_read_unlock_special_thunk)
> +#endif /* #ifdef CONFIG_PCPU_RCU_PREEMPT_COUNT */
> diff --git a/arch/x86/include/asm/rcu_preempt.h b/arch/x86/include/asm/rcu_preempt.h
> index cb25ebe038a5..acdd73b74c05 100644
> --- a/arch/x86/include/asm/rcu_preempt.h
> +++ b/arch/x86/include/asm/rcu_preempt.h
> @@ -97,9 +97,11 @@ static __always_inline bool pcpu_rcu_preempt_count_dec_and_test(void)
>                                __percpu_arg([var]));
>  }
>
> +extern asmlinkage void rcu_read_unlock_special_thunk(void);
> +
>  #define pcpu_rcu_read_unlock_special()                                         \
>  do {                                                                           \
> -       rcu_read_unlock_special();

Instead, can you not use __no_caller_saved_registers attribute for
definition of rcu_read_unlock_special() or does that not work for what
you're trying to do here?

Thanks,

 - Joel
Lai Jiangshan April 24, 2024, 2:43 a.m. UTC | #2
On Wed, Apr 24, 2024 at 1:26 AM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> Hello Lai,
>
> On Sun, Apr 7, 2024 at 5:07 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
> >
> > From: Lai Jiangshan <jiangshan.ljs@antgroup.com>
> >
> > Add rcu_read_unlock_special_thunk(), so that the inlined rcu_read_unlock()
> > doesn't need any code to save the caller-saved registers.
> >
> > Make rcu_read_unlock() only two instructions in the slow path at the
> > caller site.
> >
> > Cc: "Paul E. McKenney" <paulmck@kernel.org>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Frederic Weisbecker <frederic@kernel.org>
> > Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
> > ---
> >  arch/x86/entry/thunk.S             | 5 +++++
> >  arch/x86/include/asm/rcu_preempt.h | 4 +++-
> >  2 files changed, 8 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/entry/thunk.S b/arch/x86/entry/thunk.S
> > index 119ebdc3d362..10c60369a67c 100644
> > --- a/arch/x86/entry/thunk.S
> > +++ b/arch/x86/entry/thunk.S
> > @@ -13,3 +13,8 @@ THUNK preempt_schedule_thunk, preempt_schedule
> >  THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
> >  EXPORT_SYMBOL(preempt_schedule_thunk)
> >  EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
> > +
> > +#ifdef CONFIG_PCPU_RCU_PREEMPT_COUNT
> > +THUNK rcu_read_unlock_special_thunk, rcu_read_unlock_special
> > +EXPORT_SYMBOL_GPL(rcu_read_unlock_special_thunk)
> > +#endif /* #ifdef CONFIG_PCPU_RCU_PREEMPT_COUNT */
> > diff --git a/arch/x86/include/asm/rcu_preempt.h b/arch/x86/include/asm/rcu_preempt.h
> > index cb25ebe038a5..acdd73b74c05 100644
> > --- a/arch/x86/include/asm/rcu_preempt.h
> > +++ b/arch/x86/include/asm/rcu_preempt.h
> > @@ -97,9 +97,11 @@ static __always_inline bool pcpu_rcu_preempt_count_dec_and_test(void)
> >                                __percpu_arg([var]));
> >  }
> >
> > +extern asmlinkage void rcu_read_unlock_special_thunk(void);
> > +
> >  #define pcpu_rcu_read_unlock_special()                                         \
> >  do {                                                                           \
> > -       rcu_read_unlock_special();
>
> Instead, can you not use __no_caller_saved_registers attribute for
> definition of rcu_read_unlock_special() or does that not work for what
> you're trying to do here?
>

I think it is paramount to make it the same as preempt_schedule[_thunk]()
when it comes to avoiding the code that saves and restores the
caller-saved registers in the unlock paths.

I was not aware of 'no_caller_saved_registers' before, so I haven't tried it.

And 'no_caller_saved_registers' also has limitations:

https://clang.llvm.org/docs/AttributeReference.html#no-caller-saved-registers:

Functions specified with the ‘no_caller_saved_registers’ attribute
should only call other functions with the ‘no_caller_saved_registers’
attribute, or should be compiled with the ‘-mgeneral-regs-only’ flag
to avoid saving unused non-GPR registers.

https://gcc.gnu.org/onlinedocs/gcc/x86-Function-Attributes.html#index-no_005fcaller_005fsaved_005fregisters-function-attribute_002c-x86:

Since GCC doesn’t preserve SSE, MMX nor x87 states, the GCC option
-mgeneral-regs-only should be used to compile functions with
no_caller_saved_registers attribute.

And I don't think ‘-mgeneral-regs-only’ is being used in the kernel for x86.

Thanks
Lai
diff mbox series

Patch

diff --git a/arch/x86/entry/thunk.S b/arch/x86/entry/thunk.S
index 119ebdc3d362..10c60369a67c 100644
--- a/arch/x86/entry/thunk.S
+++ b/arch/x86/entry/thunk.S
@@ -13,3 +13,8 @@  THUNK preempt_schedule_thunk, preempt_schedule
 THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
 EXPORT_SYMBOL(preempt_schedule_thunk)
 EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
+
+#ifdef CONFIG_PCPU_RCU_PREEMPT_COUNT
+THUNK rcu_read_unlock_special_thunk, rcu_read_unlock_special
+EXPORT_SYMBOL_GPL(rcu_read_unlock_special_thunk)
+#endif /* #ifdef CONFIG_PCPU_RCU_PREEMPT_COUNT */
diff --git a/arch/x86/include/asm/rcu_preempt.h b/arch/x86/include/asm/rcu_preempt.h
index cb25ebe038a5..acdd73b74c05 100644
--- a/arch/x86/include/asm/rcu_preempt.h
+++ b/arch/x86/include/asm/rcu_preempt.h
@@ -97,9 +97,11 @@  static __always_inline bool pcpu_rcu_preempt_count_dec_and_test(void)
 			       __percpu_arg([var]));
 }
 
+extern asmlinkage void rcu_read_unlock_special_thunk(void);
+
 #define pcpu_rcu_read_unlock_special()						\
 do {										\
-	rcu_read_unlock_special();						\
+	asm volatile ("call rcu_read_unlock_special_thunk" : ASM_CALL_CONSTRAINT);\
 } while (0)
 
 #endif // #ifdef CONFIG_PCPU_RCU_PREEMPT_COUNT