diff mbox series

[v6,1/4] x86/smp: Allow calling mwait_play_dead with an arbitrary hint

Message ID 20241127161518.432616-2-patryk.wlazlyn@linux.intel.com (mailing list archive)
State New
Headers show
Series SRF: Fix offline CPU preventing pc6 entry | expand

Commit Message

Patryk Wlazlyn Nov. 27, 2024, 4:15 p.m. UTC
The MWAIT instruction needs different hints on different CPUs to reach
specific idle states. The current hint calculation in mwait_play_dead()
code works in practice on current Intel hardware, but is not documented
and fails on a recent Intel's Sierra Forest and possibly some future
ones. Those newer CPUs' power efficiency suffers when the CPU is put
offline.

Allow cpuidle code to provide mwait_play_dead with a known hint for
efficient play_dead code.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
---
 arch/x86/include/asm/smp.h |  4 +-
 arch/x86/kernel/smpboot.c  | 86 ++++++++++++++++++++------------------
 2 files changed, 49 insertions(+), 41 deletions(-)

Comments

Gautham R. Shenoy Nov. 29, 2024, 4:27 a.m. UTC | #1
Hello Patryk,

On Wed, Nov 27, 2024 at 05:15:15PM +0100, Patryk Wlazlyn wrote:
> The MWAIT instruction needs different hints on different CPUs to reach
> specific idle states. The current hint calculation in mwait_play_dead()
> code works in practice on current Intel hardware, but is not documented
> and fails on a recent Intel's Sierra Forest and possibly some future
> ones. Those newer CPUs' power efficiency suffers when the CPU is put
> offline.
> 
> Allow cpuidle code to provide mwait_play_dead with a known hint for
> efficient play_dead code.


Just a couple of minor nits:

You could just reword this something along the lines of:

"Introduce a helper function to allow offlined CPUs to enter FFh idle
states with a specific MWAIT hint. The new helper will be used in
subsequent patches by the acpi_idle and intel_idle drivers. This patch
should not have any functional impact."

There is no need to mention MWAIT hint calculation and the Sierra
Forest failure in this patch, as this patch is not doing anything to
fix the issue. Very likely you will be covering that in Patch 4.

> 
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
> ---
>  arch/x86/include/asm/smp.h |  4 +-
>  arch/x86/kernel/smpboot.c  | 86 ++++++++++++++++++++------------------
>  2 files changed, 49 insertions(+), 41 deletions(-)
> 
> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index ca073f40698f..ab90b95037f3 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu);
>  int wbinvd_on_all_cpus(void);
>  
>  void smp_kick_mwait_play_dead(void);
> +void mwait_play_dead_with_hint(unsigned int hint);
>  
>  void native_smp_send_reschedule(int cpu);
>  void native_send_call_func_ipi(const struct cpumask *mask);
> @@ -151,7 +152,6 @@ static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
>  {
>  	return per_cpu(cpu_l2c_shared_map, cpu);
>  }
> -
>  #else /* !CONFIG_SMP */
>  #define wbinvd_on_cpu(cpu)     wbinvd()
>  static inline int wbinvd_on_all_cpus(void)

This hunk is not relevant to this patch.

The rest of the patch looks good to me.
--
Thanks and Regards
gautham.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index ca073f40698f..ab90b95037f3 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -114,6 +114,7 @@  void wbinvd_on_cpu(int cpu);
 int wbinvd_on_all_cpus(void);
 
 void smp_kick_mwait_play_dead(void);
+void mwait_play_dead_with_hint(unsigned int hint);
 
 void native_smp_send_reschedule(int cpu);
 void native_send_call_func_ipi(const struct cpumask *mask);
@@ -151,7 +152,6 @@  static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
 {
 	return per_cpu(cpu_l2c_shared_map, cpu);
 }
-
 #else /* !CONFIG_SMP */
 #define wbinvd_on_cpu(cpu)     wbinvd()
 static inline int wbinvd_on_all_cpus(void)
@@ -164,6 +164,8 @@  static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 {
 	return (struct cpumask *)cpumask_of(0);
 }
+
+static inline void mwait_play_dead_with_hint(unsigned int eax_hint) { }
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_DEBUG_NMI_SELFTEST
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b5a8f0891135..ef112143623d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1272,13 +1272,57 @@  void play_dead_common(void)
 	local_irq_disable();
 }
 
+void __noreturn mwait_play_dead_with_hint(unsigned int eax_hint)
+{
+	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
+
+	/* Set up state for the kexec() hack below */
+	md->status = CPUDEAD_MWAIT_WAIT;
+	md->control = CPUDEAD_MWAIT_WAIT;
+
+	wbinvd();
+
+	while (1) {
+		/*
+		 * The CLFLUSH is a workaround for erratum AAI65 for
+		 * the Xeon 7400 series.  It's not clear it is actually
+		 * needed, but it should be harmless in either case.
+		 * The WBINVD is insufficient due to the spurious-wakeup
+		 * case where we return around the loop.
+		 */
+		mb();
+		clflush(md);
+		mb();
+		__monitor(md, 0, 0);
+		mb();
+		__mwait(eax_hint, 0);
+
+		if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+			/*
+			 * Kexec is about to happen. Don't go back into mwait() as
+			 * the kexec kernel might overwrite text and data including
+			 * page tables and stack. So mwait() would resume when the
+			 * monitor cache line is written to and then the CPU goes
+			 * south due to overwritten text, page tables and stack.
+			 *
+			 * Note: This does _NOT_ protect against a stray MCE, NMI,
+			 * SMI. They will resume execution at the instruction
+			 * following the HLT instruction and run into the problem
+			 * which this is trying to prevent.
+			 */
+			WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+			while(1)
+				native_halt();
+		}
+	}
+}
+
 /*
  * We need to flush the caches before going to sleep, lest we have
  * dirty data in our caches when we come back up.
  */
 static inline void mwait_play_dead(void)
 {
-	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
 	unsigned int eax, ebx, ecx, edx;
 	unsigned int highest_cstate = 0;
 	unsigned int highest_subcstate = 0;
@@ -1316,45 +1360,7 @@  static inline void mwait_play_dead(void)
 			(highest_subcstate - 1);
 	}
 
-	/* Set up state for the kexec() hack below */
-	md->status = CPUDEAD_MWAIT_WAIT;
-	md->control = CPUDEAD_MWAIT_WAIT;
-
-	wbinvd();
-
-	while (1) {
-		/*
-		 * The CLFLUSH is a workaround for erratum AAI65 for
-		 * the Xeon 7400 series.  It's not clear it is actually
-		 * needed, but it should be harmless in either case.
-		 * The WBINVD is insufficient due to the spurious-wakeup
-		 * case where we return around the loop.
-		 */
-		mb();
-		clflush(md);
-		mb();
-		__monitor(md, 0, 0);
-		mb();
-		__mwait(eax, 0);
-
-		if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
-			/*
-			 * Kexec is about to happen. Don't go back into mwait() as
-			 * the kexec kernel might overwrite text and data including
-			 * page tables and stack. So mwait() would resume when the
-			 * monitor cache line is written to and then the CPU goes
-			 * south due to overwritten text, page tables and stack.
-			 *
-			 * Note: This does _NOT_ protect against a stray MCE, NMI,
-			 * SMI. They will resume execution at the instruction
-			 * following the HLT instruction and run into the problem
-			 * which this is trying to prevent.
-			 */
-			WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
-			while(1)
-				native_halt();
-		}
-	}
+	mwait_play_dead_with_hint(eax);
 }
 
 /*