diff mbox series

thermal: intel: Fix unchecked MSR issue

Message ID 20230410173501.3743570-1-srinivas.pandruvada@linux.intel.com (mailing list archive)
State Mainlined, archived
Headers show
Series thermal: intel: Fix unchecked MSR issue | expand

Commit Message

srinivas pandruvada April 10, 2023, 5:35 p.m. UTC
Some older processors don't allow BIT(13) and BIT(15) in the current
mask set by "THERM_STATUS_CLEAR_CORE_MASK". This results in:

unchecked MSR access error: WRMSR to 0x19c (tried to
write 0x000000000000aaa8) at rIP: 0xffffffff816f66a6
(throttle_active_work+0xa6/0x1d0)

To avoid unchecked MSR issues, check cpuid for each feature and then
form core mask. Do the same for package mask set by
"THERM_STATUS_CLEAR_PKG_MASK".

Introduce functions thermal_intr_core_clear_mask() and
thermal_intr_pkg_clear_mask() to set core and package mask respectively.
These functions are called during initialization.

Fixes: 6fe1e64b6026 ("thermal: intel: Prevent accidental clearing of HFI status")
Reported-by: Rui Salvaterra <rsalvaterra@gmail.com>
Link: https://lore.kernel.org/lkml/cdf43fb423368ee3994124a9e8c9b4f8d00712c6.camel@linux.intel.com/T/
Tested-by: Rui Salvaterra <rsalvaterra@gmail.com>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: stable@kernel.org # 6.2+
---
 drivers/thermal/intel/therm_throt.c | 73 ++++++++++++++++++++++++++---
 1 file changed, 66 insertions(+), 7 deletions(-)

Comments

Rafael J. Wysocki April 11, 2023, 4:16 p.m. UTC | #1
On Mon, Apr 10, 2023 at 7:35 PM Srinivas Pandruvada
<srinivas.pandruvada@linux.intel.com> wrote:
>
> Some older processors don't allow BIT(13) and BIT(15) in the current
> mask set by "THERM_STATUS_CLEAR_CORE_MASK". This results in:
>
> unchecked MSR access error: WRMSR to 0x19c (tried to
> write 0x000000000000aaa8) at rIP: 0xffffffff816f66a6
> (throttle_active_work+0xa6/0x1d0)
>
> To avoid unchecked MSR issues, check cpuid for each feature and then
> form core mask. Do the same for package mask set by
> "THERM_STATUS_CLEAR_PKG_MASK".
>
> Introduce functions thermal_intr_core_clear_mask() and
> thermal_intr_pkg_clear_mask()

I've renamed these two functions to
thermal_intr_init_core_clear_mask() and
thermal_intr_init_pkg_clear_mask(), respectively.

> to set core and package mask respectively.
> These functions are called during initialization.
>
> Fixes: 6fe1e64b6026 ("thermal: intel: Prevent accidental clearing of HFI status")
> Reported-by: Rui Salvaterra <rsalvaterra@gmail.com>
> Link: https://lore.kernel.org/lkml/cdf43fb423368ee3994124a9e8c9b4f8d00712c6.camel@linux.intel.com/T/
> Tested-by: Rui Salvaterra <rsalvaterra@gmail.com>
> Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
> Cc: stable@kernel.org # 6.2+
> ---
>  drivers/thermal/intel/therm_throt.c | 73 ++++++++++++++++++++++++++---
>  1 file changed, 66 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c
> index 2e22bb82b738..d5047676f3d2 100644
> --- a/drivers/thermal/intel/therm_throt.c
> +++ b/drivers/thermal/intel/therm_throt.c
> @@ -193,8 +193,67 @@ static const struct attribute_group thermal_attr_group = {
>  #define THERM_THROT_POLL_INTERVAL      HZ
>  #define THERM_STATUS_PROCHOT_LOG       BIT(1)
>
> -#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
> -#define THERM_STATUS_CLEAR_PKG_MASK  (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
> +static u64 def_therm_core_clear_mask;
> +static u64 def_therm_pkg_clear_mask;

And I've renamed these two variables to therm_intr_core_clear_mask and
therm_intr_pkg_clear_mask, respectively.

Also I've changed the subject (to "thermal: intel: Avoid updating
unsupported THERM_STATUS_CLEAR mask bits") and made some assorted
changelog edits.

With the above changes, the patch has been queued up for 6.3-rc7.

> +
> +static void thermal_intr_core_clear_mask(void)
> +{
> +       if (def_therm_core_clear_mask)
> +               return;
> +
> +       /*
> +        * Reference: Intel SDM  Volume 4
> +        * "Table 2-2. IA-32 Architectural MSRs", MSR 0x19C
> +        * IA32_THERM_STATUS.
> +        */
> +
> +       /*
> +        * Bit 1, 3, 5: CPUID.01H:EDX[22] = 1. This driver will not
> +        * enable interrupts, when 0 as it checks for X86_FEATURE_ACPI.
> +        */
> +       def_therm_core_clear_mask = (BIT(1) | BIT(3) | BIT(5));
> +
> +       /*
> +        * Bit 7 and 9: Thermal Threshold #1 and #2 log
> +        * If CPUID.01H:ECX[8] = 1
> +        */
> +       if (boot_cpu_has(X86_FEATURE_TM2))
> +               def_therm_core_clear_mask |= (BIT(7) | BIT(9));
> +
> +       /* Bit 11: Power Limitation log (R/WC0) If CPUID.06H:EAX[4] = 1 */
> +       if (boot_cpu_has(X86_FEATURE_PLN))
> +               def_therm_core_clear_mask |= BIT(11);
> +
> +       /*
> +        * Bit 13: Current Limit log (R/WC0) If CPUID.06H:EAX[7] = 1
> +        * Bit 15: Cross Domain Limit log (R/WC0) If CPUID.06H:EAX[7] = 1
> +        */
> +       if (boot_cpu_has(X86_FEATURE_HWP))
> +               def_therm_core_clear_mask |= (BIT(13) | BIT(15));
> +}
> +
> +static void thermal_intr_pkg_clear_mask(void)
> +{
> +       if (def_therm_pkg_clear_mask)
> +               return;
> +
> +       /*
> +        * Reference: Intel SDM  Volume 4
> +        * "Table 2-2. IA-32 Architectural MSRs", MSR 0x1B1
> +        * IA32_PACKAGE_THERM_STATUS.
> +        */
> +
> +       /* All bits except BIT 26 depends on CPUID.06H: EAX[6] = 1 */
> +       if (boot_cpu_has(X86_FEATURE_PTS))
> +               def_therm_pkg_clear_mask = (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11));
> +
> +       /*
> +        * Intel SDM Volume 2A: Thermal and Power Management Leaf
> +        * Bit 26: CPUID.06H: EAX[19] = 1
> +        */
> +       if (boot_cpu_has(X86_FEATURE_HFI))
> +               def_therm_pkg_clear_mask |= BIT(26);
> +}
>
>  /*
>   * Clear the bits in package thermal status register for bit = 1
> @@ -207,13 +266,10 @@ void thermal_clear_package_intr_status(int level, u64 bit_mask)
>
>         if (level == CORE_LEVEL) {
>                 msr  = MSR_IA32_THERM_STATUS;
> -               msr_val = THERM_STATUS_CLEAR_CORE_MASK;
> +               msr_val = def_therm_core_clear_mask;
>         } else {
>                 msr  = MSR_IA32_PACKAGE_THERM_STATUS;
> -               msr_val = THERM_STATUS_CLEAR_PKG_MASK;
> -               if (boot_cpu_has(X86_FEATURE_HFI))
> -                       msr_val |= BIT(26);
> -
> +               msr_val = def_therm_pkg_clear_mask;
>         }
>
>         msr_val &= ~bit_mask;
> @@ -708,6 +764,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
>         h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
>         apic_write(APIC_LVTTHMR, h);
>
> +       thermal_intr_core_clear_mask();
> +       thermal_intr_pkg_clear_mask();
> +
>         rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
>         if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
>                 wrmsr(MSR_IA32_THERM_INTERRUPT,
> --
> 2.39.1
>
srinivas pandruvada April 11, 2023, 5:12 p.m. UTC | #2
On Tue, 2023-04-11 at 18:16 +0200, Rafael J. Wysocki wrote:
> On Mon, Apr 10, 2023 at 7:35 PM Srinivas Pandruvada
> <srinivas.pandruvada@linux.intel.com> wrote:
> > 
> > Some older processors don't allow BIT(13) and BIT(15) in the
> > current
> > mask set by "THERM_STATUS_CLEAR_CORE_MASK". This results in:
> > 
> > unchecked MSR access error: WRMSR to 0x19c (tried to
> > write 0x000000000000aaa8) at rIP: 0xffffffff816f66a6
> > (throttle_active_work+0xa6/0x1d0)
> > 
> > To avoid unchecked MSR issues, check cpuid for each feature and
> > then
> > form core mask. Do the same for package mask set by
> > "THERM_STATUS_CLEAR_PKG_MASK".
> > 
> > Introduce functions thermal_intr_core_clear_mask() and
> > thermal_intr_pkg_clear_mask()
> 
> I've renamed these two functions to
> thermal_intr_init_core_clear_mask() and
> thermal_intr_init_pkg_clear_mask(), respectively.
> 
> > to set core and package mask respectively.
> > These functions are called during initialization.
> > 
> > Fixes: 6fe1e64b6026 ("thermal: intel: Prevent accidental clearing
> > of HFI status")
> > Reported-by: Rui Salvaterra <rsalvaterra@gmail.com>
> > Link:
> > https://lore.kernel.org/lkml/cdf43fb423368ee3994124a9e8c9b4f8d00712c6.camel@linux.intel.com/T/
> > Tested-by: Rui Salvaterra <rsalvaterra@gmail.com>
> > Signed-off-by: Srinivas Pandruvada
> > <srinivas.pandruvada@linux.intel.com>
> > Cc: stable@kernel.org # 6.2+
> > ---
> >  drivers/thermal/intel/therm_throt.c | 73
> > ++++++++++++++++++++++++++---
> >  1 file changed, 66 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/thermal/intel/therm_throt.c
> > b/drivers/thermal/intel/therm_throt.c
> > index 2e22bb82b738..d5047676f3d2 100644
> > --- a/drivers/thermal/intel/therm_throt.c
> > +++ b/drivers/thermal/intel/therm_throt.c
> > @@ -193,8 +193,67 @@ static const struct attribute_group
> > thermal_attr_group = {
> >  #define THERM_THROT_POLL_INTERVAL      HZ
> >  #define THERM_STATUS_PROCHOT_LOG       BIT(1)
> > 
> > -#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) |
> > BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
> > -#define THERM_STATUS_CLEAR_PKG_MASK  (BIT(1) | BIT(3) | BIT(5) |
> > BIT(7) | BIT(9) | BIT(11))
> > +static u64 def_therm_core_clear_mask;
> > +static u64 def_therm_pkg_clear_mask;
> 
> And I've renamed these two variables to therm_intr_core_clear_mask
> and
> therm_intr_pkg_clear_mask, respectively.
> 
> Also I've changed the subject (to "thermal: intel: Avoid updating
> unsupported THERM_STATUS_CLEAR mask bits") and made some assorted
> changelog edits.
> 
> With the above changes, the patch has been queued up for 6.3-rc7.
Thanks!

-Srinivas

> 
> > +
> > +static void thermal_intr_core_clear_mask(void)
> > +{
> > +       if (def_therm_core_clear_mask)
> > +               return;
> > +
> > +       /*
> > +        * Reference: Intel SDM  Volume 4
> > +        * "Table 2-2. IA-32 Architectural MSRs", MSR 0x19C
> > +        * IA32_THERM_STATUS.
> > +        */
> > +
> > +       /*
> > +        * Bit 1, 3, 5: CPUID.01H:EDX[22] = 1. This driver will not
> > +        * enable interrupts, when 0 as it checks for
> > X86_FEATURE_ACPI.
> > +        */
> > +       def_therm_core_clear_mask = (BIT(1) | BIT(3) | BIT(5));
> > +
> > +       /*
> > +        * Bit 7 and 9: Thermal Threshold #1 and #2 log
> > +        * If CPUID.01H:ECX[8] = 1
> > +        */
> > +       if (boot_cpu_has(X86_FEATURE_TM2))
> > +               def_therm_core_clear_mask |= (BIT(7) | BIT(9));
> > +
> > +       /* Bit 11: Power Limitation log (R/WC0) If CPUID.06H:EAX[4]
> > = 1 */
> > +       if (boot_cpu_has(X86_FEATURE_PLN))
> > +               def_therm_core_clear_mask |= BIT(11);
> > +
> > +       /*
> > +        * Bit 13: Current Limit log (R/WC0) If CPUID.06H:EAX[7] =
> > 1
> > +        * Bit 15: Cross Domain Limit log (R/WC0) If
> > CPUID.06H:EAX[7] = 1
> > +        */
> > +       if (boot_cpu_has(X86_FEATURE_HWP))
> > +               def_therm_core_clear_mask |= (BIT(13) | BIT(15));
> > +}
> > +
> > +static void thermal_intr_pkg_clear_mask(void)
> > +{
> > +       if (def_therm_pkg_clear_mask)
> > +               return;
> > +
> > +       /*
> > +        * Reference: Intel SDM  Volume 4
> > +        * "Table 2-2. IA-32 Architectural MSRs", MSR 0x1B1
> > +        * IA32_PACKAGE_THERM_STATUS.
> > +        */
> > +
> > +       /* All bits except BIT 26 depends on CPUID.06H: EAX[6] = 1
> > */
> > +       if (boot_cpu_has(X86_FEATURE_PTS))
> > +               def_therm_pkg_clear_mask = (BIT(1) | BIT(3) |
> > BIT(5) | BIT(7) | BIT(9) | BIT(11));
> > +
> > +       /*
> > +        * Intel SDM Volume 2A: Thermal and Power Management Leaf
> > +        * Bit 26: CPUID.06H: EAX[19] = 1
> > +        */
> > +       if (boot_cpu_has(X86_FEATURE_HFI))
> > +               def_therm_pkg_clear_mask |= BIT(26);
> > +}
> > 
> >  /*
> >   * Clear the bits in package thermal status register for bit = 1
> > @@ -207,13 +266,10 @@ void thermal_clear_package_intr_status(int
> > level, u64 bit_mask)
> > 
> >         if (level == CORE_LEVEL) {
> >                 msr  = MSR_IA32_THERM_STATUS;
> > -               msr_val = THERM_STATUS_CLEAR_CORE_MASK;
> > +               msr_val = def_therm_core_clear_mask;
> >         } else {
> >                 msr  = MSR_IA32_PACKAGE_THERM_STATUS;
> > -               msr_val = THERM_STATUS_CLEAR_PKG_MASK;
> > -               if (boot_cpu_has(X86_FEATURE_HFI))
> > -                       msr_val |= BIT(26);
> > -
> > +               msr_val = def_therm_pkg_clear_mask;
> >         }
> > 
> >         msr_val &= ~bit_mask;
> > @@ -708,6 +764,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
> >         h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
> >         apic_write(APIC_LVTTHMR, h);
> > 
> > +       thermal_intr_core_clear_mask();
> > +       thermal_intr_pkg_clear_mask();
> > +
> >         rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
> >         if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
> >                 wrmsr(MSR_IA32_THERM_INTERRUPT,
> > --
> > 2.39.1
> >
diff mbox series

Patch

diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c
index 2e22bb82b738..d5047676f3d2 100644
--- a/drivers/thermal/intel/therm_throt.c
+++ b/drivers/thermal/intel/therm_throt.c
@@ -193,8 +193,67 @@  static const struct attribute_group thermal_attr_group = {
 #define THERM_THROT_POLL_INTERVAL	HZ
 #define THERM_STATUS_PROCHOT_LOG	BIT(1)
 
-#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
-#define THERM_STATUS_CLEAR_PKG_MASK  (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
+static u64 def_therm_core_clear_mask;
+static u64 def_therm_pkg_clear_mask;
+
+static void thermal_intr_core_clear_mask(void)
+{
+	if (def_therm_core_clear_mask)
+		return;
+
+	/*
+	 * Reference: Intel SDM  Volume 4
+	 * "Table 2-2. IA-32 Architectural MSRs", MSR 0x19C
+	 * IA32_THERM_STATUS.
+	 */
+
+	/*
+	 * Bit 1, 3, 5: CPUID.01H:EDX[22] = 1. This driver will not
+	 * enable interrupts, when 0 as it checks for X86_FEATURE_ACPI.
+	 */
+	def_therm_core_clear_mask = (BIT(1) | BIT(3) | BIT(5));
+
+	/*
+	 * Bit 7 and 9: Thermal Threshold #1 and #2 log
+	 * If CPUID.01H:ECX[8] = 1
+	 */
+	if (boot_cpu_has(X86_FEATURE_TM2))
+		def_therm_core_clear_mask |= (BIT(7) | BIT(9));
+
+	/* Bit 11: Power Limitation log (R/WC0) If CPUID.06H:EAX[4] = 1 */
+	if (boot_cpu_has(X86_FEATURE_PLN))
+		def_therm_core_clear_mask |= BIT(11);
+
+	/*
+	 * Bit 13: Current Limit log (R/WC0) If CPUID.06H:EAX[7] = 1
+	 * Bit 15: Cross Domain Limit log (R/WC0) If CPUID.06H:EAX[7] = 1
+	 */
+	if (boot_cpu_has(X86_FEATURE_HWP))
+		def_therm_core_clear_mask |= (BIT(13) | BIT(15));
+}
+
+static void thermal_intr_pkg_clear_mask(void)
+{
+	if (def_therm_pkg_clear_mask)
+		return;
+
+	/*
+	 * Reference: Intel SDM  Volume 4
+	 * "Table 2-2. IA-32 Architectural MSRs", MSR 0x1B1
+	 * IA32_PACKAGE_THERM_STATUS.
+	 */
+
+	/* All bits except BIT 26 depends on CPUID.06H: EAX[6] = 1 */
+	if (boot_cpu_has(X86_FEATURE_PTS))
+		def_therm_pkg_clear_mask = (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11));
+
+	/*
+	 * Intel SDM Volume 2A: Thermal and Power Management Leaf
+	 * Bit 26: CPUID.06H: EAX[19] = 1
+	 */
+	if (boot_cpu_has(X86_FEATURE_HFI))
+		def_therm_pkg_clear_mask |= BIT(26);
+}
 
 /*
  * Clear the bits in package thermal status register for bit = 1
@@ -207,13 +266,10 @@  void thermal_clear_package_intr_status(int level, u64 bit_mask)
 
 	if (level == CORE_LEVEL) {
 		msr  = MSR_IA32_THERM_STATUS;
-		msr_val = THERM_STATUS_CLEAR_CORE_MASK;
+		msr_val = def_therm_core_clear_mask;
 	} else {
 		msr  = MSR_IA32_PACKAGE_THERM_STATUS;
-		msr_val = THERM_STATUS_CLEAR_PKG_MASK;
-		if (boot_cpu_has(X86_FEATURE_HFI))
-			msr_val |= BIT(26);
-
+		msr_val = def_therm_pkg_clear_mask;
 	}
 
 	msr_val &= ~bit_mask;
@@ -708,6 +764,9 @@  void intel_init_thermal(struct cpuinfo_x86 *c)
 	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
 	apic_write(APIC_LVTTHMR, h);
 
+	thermal_intr_core_clear_mask();
+	thermal_intr_pkg_clear_mask();
+
 	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
 	if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
 		wrmsr(MSR_IA32_THERM_INTERRUPT,