diff mbox series

[v7,2/9] reboot: thermal: Export hardware protection shutdown

Message ID adf417797006c996605a03c8bacfb4961e8f0b42.1618377272.git.matti.vaittinen@fi.rohmeurope.com (mailing list archive)
State New, archived
Delegated to: Daniel Lezcano
Headers show
Series Extend regulator notification support | expand

Commit Message

Vaittinen, Matti April 14, 2021, 5:52 a.m. UTC
Thermal core contains a logic for safety shutdown. System is attempted to
be powered off if temperature exceeds safety limits.

Currently this can be also utilized by regulator subsystem as a final
protection measure if PMICs report dangerous over-voltage, over-current or
over-temperature and if per regulator counter measures fail or do not
exist.

Move this logic to kernel/reboot.c and export the functionality for other
subsystems to use. Also replace the mutex with a spinlock to allow using
the function from any context.

Also the EMIF bus code has implemented a safety shut-down. EMIF does not
attempt orderly_poweroff at all. Thus the EMIF code is not converted to use
this new function.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
---
Changelog
 v7:
  - new patch

Please note - this patch has received only a minimal amount of testing.
(The new API call was tested to shut-down my system at driver probe but
no odd corner-cases have been tested).

Any testing for thermal shutdown is appreciated.
---
 drivers/thermal/thermal_core.c | 63 ++-----------------------
 include/linux/reboot.h         |  1 +
 kernel/reboot.c                | 86 ++++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+), 59 deletions(-)

Comments

Daniel Lezcano April 17, 2021, 4:57 a.m. UTC | #1
On 14/04/2021 07:52, Matti Vaittinen wrote:
> Thermal core contains a logic for safety shutdown. System is attempted to
> be powered off if temperature exceeds safety limits.
> 
> Currently this can be also utilized by regulator subsystem as a final
> protection measure if PMICs report dangerous over-voltage, over-current or
> over-temperature and if per regulator counter measures fail or do not
> exist.
> 
> Move this logic to kernel/reboot.c and export the functionality for other
> subsystems to use. Also replace the mutex with a spinlock to allow using
> the function from any context.
> 
> Also the EMIF bus code has implemented a safety shut-down. EMIF does not
> attempt orderly_poweroff at all. Thus the EMIF code is not converted to use
> this new function.
> 
> Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
> ---
> Changelog
>  v7:
>   - new patch
> 
> Please note - this patch has received only a minimal amount of testing.
> (The new API call was tested to shut-down my system at driver probe but
> no odd corner-cases have been tested).
> 
> Any testing for thermal shutdown is appreciated.

You can test it easily by enabling the option CONFIG_THERMAL_EMULATION

Then in any thermal zone:

Assuming the critical temp is below the one specified in the command:

echo 100000 > /sys/class/thermal/thermal_zone0/emul_temp

> ---
>  drivers/thermal/thermal_core.c | 63 ++-----------------------
>  include/linux/reboot.h         |  1 +
>  kernel/reboot.c                | 86 ++++++++++++++++++++++++++++++++++
>  3 files changed, 91 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
> index 996c038f83a4..b1444845af38 100644
> --- a/drivers/thermal/thermal_core.c
> +++ b/drivers/thermal/thermal_core.c
> @@ -36,10 +36,8 @@ static LIST_HEAD(thermal_governor_list);
>  
>  static DEFINE_MUTEX(thermal_list_lock);
>  static DEFINE_MUTEX(thermal_governor_lock);
> -static DEFINE_MUTEX(poweroff_lock);
>  
>  static atomic_t in_suspend;
> -static bool power_off_triggered;
>  
>  static struct thermal_governor *def_governor;
>  
> @@ -327,70 +325,18 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz, int trip)
>  		       def_governor->throttle(tz, trip);
>  }
>  
> -/**
> - * thermal_emergency_poweroff_func - emergency poweroff work after a known delay
> - * @work: work_struct associated with the emergency poweroff function
> - *
> - * This function is called in very critical situations to force
> - * a kernel poweroff after a configurable timeout value.
> - */
> -static void thermal_emergency_poweroff_func(struct work_struct *work)
> -{
> -	/*
> -	 * We have reached here after the emergency thermal shutdown
> -	 * Waiting period has expired. This means orderly_poweroff has
> -	 * not been able to shut off the system for some reason.
> -	 * Try to shut down the system immediately using kernel_power_off
> -	 * if populated
> -	 */
> -	WARN(1, "Attempting kernel_power_off: Temperature too high\n");
> -	kernel_power_off();
> -
> -	/*
> -	 * Worst of the worst case trigger emergency restart
> -	 */
> -	WARN(1, "Attempting emergency_restart: Temperature too high\n");
> -	emergency_restart();
> -}
> -
> -static DECLARE_DELAYED_WORK(thermal_emergency_poweroff_work,
> -			    thermal_emergency_poweroff_func);
> -
> -/**
> - * thermal_emergency_poweroff - Trigger an emergency system poweroff
> - *
> - * This may be called from any critical situation to trigger a system shutdown
> - * after a known period of time. By default this is not scheduled.
> - */
> -static void thermal_emergency_poweroff(void)
> +void thermal_zone_device_critical(struct thermal_zone_device *tz)
>  {
> -	int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS;
>  	/*
>  	 * poweroff_delay_ms must be a carefully profiled positive value.
> -	 * Its a must for thermal_emergency_poweroff_work to be scheduled
> +	 * Its a must for forced_emergency_poweroff_work to be scheduled.
>  	 */
> -	if (poweroff_delay_ms <= 0)
> -		return;
> -	schedule_delayed_work(&thermal_emergency_poweroff_work,
> -			      msecs_to_jiffies(poweroff_delay_ms));
> -}
> +	int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS;
>  
> -void thermal_zone_device_critical(struct thermal_zone_device *tz)
> -{
>  	dev_emerg(&tz->device, "%s: critical temperature reached, "
>  		  "shutting down\n", tz->type);
>  
> -	mutex_lock(&poweroff_lock);
> -	if (!power_off_triggered) {
> -		/*
> -		 * Queue a backup emergency shutdown in the event of
> -		 * orderly_poweroff failure
> -		 */
> -		thermal_emergency_poweroff();
> -		orderly_poweroff(true);
> -		power_off_triggered = true;
> -	}
> -	mutex_unlock(&poweroff_lock);
> +	hw_protection_shutdown("Temperature too high", poweroff_delay_ms);
>  }
>  EXPORT_SYMBOL(thermal_zone_device_critical);
>  
> @@ -1549,7 +1495,6 @@ static int __init thermal_init(void)
>  	ida_destroy(&thermal_cdev_ida);
>  	mutex_destroy(&thermal_list_lock);
>  	mutex_destroy(&thermal_governor_lock);
> -	mutex_destroy(&poweroff_lock);
>  	return result;
>  }
>  postcore_initcall(thermal_init);
> diff --git a/include/linux/reboot.h b/include/linux/reboot.h
> index 3734cd8f38a8..af907a3d68d1 100644
> --- a/include/linux/reboot.h
> +++ b/include/linux/reboot.h
> @@ -79,6 +79,7 @@ extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
>  
>  extern void orderly_poweroff(bool force);
>  extern void orderly_reboot(void);
> +void hw_protection_shutdown(const char *reason, int ms_until_forced);
>  
>  /*
>   * Emergency restart, callable from an interrupt handler.
> diff --git a/kernel/reboot.c b/kernel/reboot.c
> index a6ad5eb2fa73..1b5fa6d213d4 100644
> --- a/kernel/reboot.c
> +++ b/kernel/reboot.c
> @@ -518,6 +518,92 @@ void orderly_reboot(void)
>  }
>  EXPORT_SYMBOL_GPL(orderly_reboot);
>  
> +/**
> + * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay
> + * @work: work_struct associated with the emergency poweroff function
> + *
> + * This function is called in very critical situations to force
> + * a kernel poweroff after a configurable timeout value.
> + */
> +static void hw_failure_emergency_poweroff_func(struct work_struct *work)
> +{
> +	/*
> +	 * We have reached here after the emergency shutdown waiting period has
> +	 * expired. This means orderly_poweroff has not been able to shut off
> +	 * the system for some reason.
> +	 *
> +	 * Try to shut down the system immediately using kernel_power_off
> +	 * if populated
> +	 */
> +	WARN(1, "Hardware protection timed-out. Trying forced poweroff\n");
> +	kernel_power_off();
> +
> +	/*
> +	 * Worst of the worst case trigger emergency restart
> +	 */
> +	WARN(1,
> +	     "Hardware protection shutdown failed. Trying emergency restart\n");
> +	emergency_restart();
> +}
> +
> +static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work,
> +			    hw_failure_emergency_poweroff_func);
> +
> +/**
> + * hw_failure_emergency_poweroff - Trigger an emergency system poweroff
> + *
> + * This may be called from any critical situation to trigger a system shutdown
> + * after a given period of time. If time is negative this is not scheduled.
> + */
> +static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
> +{
> +	if (poweroff_delay_ms <= 0)
> +		return;
> +	schedule_delayed_work(&hw_failure_emergency_poweroff_work,
> +			      msecs_to_jiffies(poweroff_delay_ms));
> +}
> +
> +static bool prot_power_off_triggered;
> +static DEFINE_SPINLOCK(poweroff_lock);
> +
> +/**
> + * hw_protection_shutdown - Trigger an emergency system poweroff
> + *
> + * @reason:		Reason of emergency shutdown to be printed.
> + * @ms_until_forced:	Time to wait for orderly shutdown before tiggering a
> + *			forced shudown. Negative value disables the forced
> + *			shutdown.
> + *
> + * Initiate an emergency system shutdown in order to protect hardware from
> + * further damage. Usage examples include a thermal protection or a voltage or
> + * current regulator failures.
> + * NOTE: The request is ignored if protection shutdown is already pending even
> + * if the previous request has given a large timeout for forced shutdown.
> + * Can be called from any context.
> + */
> +void hw_protection_shutdown(const char *reason, int ms_until_forced)
> +{
> +	unsigned long flags;
> +
> +	pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);
> +
> +	spin_lock_irqsave(&poweroff_lock, flags);
> +	if (prot_power_off_triggered) {
> +		spin_unlock(&poweroff_lock);
> +		return;
> +	}
> +	prot_power_off_triggered = true;
> +	spin_unlock_irqrestore(&poweroff_lock, flags);
> +
> +	/*
> +	 * Queue a backup emergency shutdown in the event of
> +	 * orderly_poweroff failure
> +	 */
> +	hw_failure_emergency_poweroff(ms_until_forced);
> +	orderly_poweroff(true);
> +}
> +EXPORT_SYMBOL_GPL(hw_protection_shutdown);
> +
>  static int __init reboot_setup(char *str)
>  {
>  	for (;;) {
>
Daniel Lezcano April 17, 2021, 5:32 a.m. UTC | #2
On 14/04/2021 07:52, Matti Vaittinen wrote:
> Thermal core contains a logic for safety shutdown. System is attempted to
> be powered off if temperature exceeds safety limits.
> 
> Currently this can be also utilized by regulator subsystem as a final
> protection measure if PMICs report dangerous over-voltage, over-current or
> over-temperature and if per regulator counter measures fail or do not
> exist.
> 
> Move this logic to kernel/reboot.c and export the functionality for other
> subsystems to use. Also replace the mutex with a spinlock to allow using
> the function from any context.
> 
> Also the EMIF bus code has implemented a safety shut-down. EMIF does not
> attempt orderly_poweroff at all. Thus the EMIF code is not converted to use
> this new function.
> 
> Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
> ---
> Changelog
>  v7:
>   - new patch
> 
> Please note - this patch has received only a minimal amount of testing.
> (The new API call was tested to shut-down my system at driver probe but
> no odd corner-cases have been tested).
> 
> Any testing for thermal shutdown is appreciated.
> ---
>  drivers/thermal/thermal_core.c | 63 ++-----------------------
>  include/linux/reboot.h         |  1 +
>  kernel/reboot.c                | 86 ++++++++++++++++++++++++++++++++++

Please send a patch implementing the reboot/shutdown and then another
one replacing the thermal shutdown code by a call to the new API.

>  3 files changed, 91 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
> index 996c038f83a4..b1444845af38 100644
> --- a/drivers/thermal/thermal_core.c
> +++ b/drivers/thermal/thermal_core.c
> @@ -36,10 +36,8 @@ static LIST_HEAD(thermal_governor_list);
>  
>  static DEFINE_MUTEX(thermal_list_lock);
>  static DEFINE_MUTEX(thermal_governor_lock);
> -static DEFINE_MUTEX(poweroff_lock);
>  
>  static atomic_t in_suspend;
> -static bool power_off_triggered;
>  
>  static struct thermal_governor *def_governor;
>  
> @@ -327,70 +325,18 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz, int trip)
>  		       def_governor->throttle(tz, trip);
>  }
>  
> -/**
> - * thermal_emergency_poweroff_func - emergency poweroff work after a known delay
> - * @work: work_struct associated with the emergency poweroff function
> - *
> - * This function is called in very critical situations to force
> - * a kernel poweroff after a configurable timeout value.
> - */
> -static void thermal_emergency_poweroff_func(struct work_struct *work)
> -{
> -	/*
> -	 * We have reached here after the emergency thermal shutdown
> -	 * Waiting period has expired. This means orderly_poweroff has
> -	 * not been able to shut off the system for some reason.
> -	 * Try to shut down the system immediately using kernel_power_off
> -	 * if populated
> -	 */
> -	WARN(1, "Attempting kernel_power_off: Temperature too high\n");
> -	kernel_power_off();
> -
> -	/*
> -	 * Worst of the worst case trigger emergency restart
> -	 */
> -	WARN(1, "Attempting emergency_restart: Temperature too high\n");
> -	emergency_restart();
> -}
> -
> -static DECLARE_DELAYED_WORK(thermal_emergency_poweroff_work,
> -			    thermal_emergency_poweroff_func);
> -
> -/**
> - * thermal_emergency_poweroff - Trigger an emergency system poweroff
> - *
> - * This may be called from any critical situation to trigger a system shutdown
> - * after a known period of time. By default this is not scheduled.
> - */
> -static void thermal_emergency_poweroff(void)
> +void thermal_zone_device_critical(struct thermal_zone_device *tz)
>  {
> -	int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS;
>  	/*
>  	 * poweroff_delay_ms must be a carefully profiled positive value.
> -	 * Its a must for thermal_emergency_poweroff_work to be scheduled
> +	 * Its a must for forced_emergency_poweroff_work to be scheduled.
>  	 */
> -	if (poweroff_delay_ms <= 0)
> -		return;
> -	schedule_delayed_work(&thermal_emergency_poweroff_work,
> -			      msecs_to_jiffies(poweroff_delay_ms));
> -}
> +	int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS;
>  
> -void thermal_zone_device_critical(struct thermal_zone_device *tz)
> -{
>  	dev_emerg(&tz->device, "%s: critical temperature reached, "
>  		  "shutting down\n", tz->type);
>  
> -	mutex_lock(&poweroff_lock);
> -	if (!power_off_triggered) {
> -		/*
> -		 * Queue a backup emergency shutdown in the event of
> -		 * orderly_poweroff failure
> -		 */
> -		thermal_emergency_poweroff();
> -		orderly_poweroff(true);
> -		power_off_triggered = true;
> -	}
> -	mutex_unlock(&poweroff_lock);
> +	hw_protection_shutdown("Temperature too high", poweroff_delay_ms);
>  }
>  EXPORT_SYMBOL(thermal_zone_device_critical);
>  
> @@ -1549,7 +1495,6 @@ static int __init thermal_init(void)
>  	ida_destroy(&thermal_cdev_ida);
>  	mutex_destroy(&thermal_list_lock);
>  	mutex_destroy(&thermal_governor_lock);
> -	mutex_destroy(&poweroff_lock);
>  	return result;
>  }
>  postcore_initcall(thermal_init);
> diff --git a/include/linux/reboot.h b/include/linux/reboot.h
> index 3734cd8f38a8..af907a3d68d1 100644
> --- a/include/linux/reboot.h
> +++ b/include/linux/reboot.h
> @@ -79,6 +79,7 @@ extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
>  
>  extern void orderly_poweroff(bool force);
>  extern void orderly_reboot(void);
> +void hw_protection_shutdown(const char *reason, int ms_until_forced);
>  
>  /*
>   * Emergency restart, callable from an interrupt handler.
> diff --git a/kernel/reboot.c b/kernel/reboot.c
> index a6ad5eb2fa73..1b5fa6d213d4 100644
> --- a/kernel/reboot.c
> +++ b/kernel/reboot.c
> @@ -518,6 +518,92 @@ void orderly_reboot(void)
>  }
>  EXPORT_SYMBOL_GPL(orderly_reboot);
>  
> +/**
> + * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay
> + * @work: work_struct associated with the emergency poweroff function
> + *
> + * This function is called in very critical situations to force
> + * a kernel poweroff after a configurable timeout value.
> + */
> +static void hw_failure_emergency_poweroff_func(struct work_struct *work)
> +{
> +	/*
> +	 * We have reached here after the emergency shutdown waiting period has
> +	 * expired. This means orderly_poweroff has not been able to shut off
> +	 * the system for some reason.
> +	 *
> +	 * Try to shut down the system immediately using kernel_power_off
> +	 * if populated
> +	 */
> +	WARN(1, "Hardware protection timed-out. Trying forced poweroff\n");
> +	kernel_power_off();
> +
> +	/*
> +	 * Worst of the worst case trigger emergency restart
> +	 */
> +	WARN(1,
> +	     "Hardware protection shutdown failed. Trying emergency restart\n");
> +	emergency_restart();
> +}
> +
> +static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work,
> +			    hw_failure_emergency_poweroff_func);
> +
> +/**
> + * hw_failure_emergency_poweroff - Trigger an emergency system poweroff
> + *
> + * This may be called from any critical situation to trigger a system shutdown
> + * after a given period of time. If time is negative this is not scheduled.
> + */
> +static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
> +{
> +	if (poweroff_delay_ms <= 0)
> +		return;
> +	schedule_delayed_work(&hw_failure_emergency_poweroff_work,
> +			      msecs_to_jiffies(poweroff_delay_ms));
> +}
> +
> +static bool prot_power_off_triggered;
> +static DEFINE_SPINLOCK(poweroff_lock);
> +
> +/**
> + * hw_protection_shutdown - Trigger an emergency system poweroff
> + *
> + * @reason:		Reason of emergency shutdown to be printed.
> + * @ms_until_forced:	Time to wait for orderly shutdown before tiggering a
> + *			forced shudown. Negative value disables the forced
> + *			shutdown.
> + *
> + * Initiate an emergency system shutdown in order to protect hardware from
> + * further damage. Usage examples include a thermal protection or a voltage or
> + * current regulator failures.
> + * NOTE: The request is ignored if protection shutdown is already pending even
> + * if the previous request has given a large timeout for forced shutdown.
> + * Can be called from any context.
> + */
> +void hw_protection_shutdown(const char *reason, int ms_until_forced)
> +{
> +	unsigned long flags;
> +
> +	pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);
> +
> +	spin_lock_irqsave(&poweroff_lock, flags);
> +	if (prot_power_off_triggered) {
> +		spin_unlock(&poweroff_lock);

Why not spin_unlock_irqrestore() ?

> +		return;
> +	}
> +	prot_power_off_triggered = true;
> +	spin_unlock_irqrestore(&poweroff_lock, flags);

Why not take the spin_lock definitively for all the procedure ?

eg.

{
	...

	pr_emerg( ... );

	if (spin_trylock(&lock))
		return;

	hw_failure_emergency_poweroff(ms_until_forced);

	orderly_poweroff(true);
}

No need of prot_power_off_triggered and the spin_lock can be declared
static inside the function.

> +	/*
> +	 * Queue a backup emergency shutdown in the event of
> +	 * orderly_poweroff failure
> +	 */
> +	hw_failure_emergency_poweroff(ms_until_forced);
> +	orderly_poweroff(true);
> +}
> +EXPORT_SYMBOL_GPL(hw_protection_shutdown);
> +
>  static int __init reboot_setup(char *str)
>  {
>  	for (;;) {
>
Vaittinen, Matti April 17, 2021, 8:49 a.m. UTC | #3
On Sat, 2021-04-17 at 06:57 +0200, Daniel Lezcano wrote:
> On 14/04/2021 07:52, Matti Vaittinen wrote:
> > Thermal core contains a logic for safety shutdown. System is
> > attempted to
> > be powered off if temperature exceeds safety limits.
> > 
> > Currently this can be also utilized by regulator subsystem as a
> > final
> > protection measure if PMICs report dangerous over-voltage, over-
> > current or
> > over-temperature and if per regulator counter measures fail or do
> > not
> > exist.
> > 
> > Move this logic to kernel/reboot.c and export the functionality for
> > other
> > subsystems to use. Also replace the mutex with a spinlock to allow
> > using
> > the function from any context.
> > 
> > Also the EMIF bus code has implemented a safety shut-down. EMIF
> > does not
> > attempt orderly_poweroff at all. Thus the EMIF code is not
> > converted to use
> > this new function.
> > 
> > Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
> > ---
> > Changelog
> >  v7:
> >   - new patch
> > 
> > Please note - this patch has received only a minimal amount of
> > testing.
> > (The new API call was tested to shut-down my system at driver probe
> > but
> > no odd corner-cases have been tested).
> > 
> > Any testing for thermal shutdown is appreciated.
> 
> You can test it easily by enabling the option
> CONFIG_THERMAL_EMULATION
> 
> Then in any thermal zone:
> 
> Assuming the critical temp is below the one specified in the command:
> 
> echo 100000 > /sys/class/thermal/thermal_zone0/emul_temp
> 

Thanks Daniel, I will see how that works when I create the next version
:)

Best Regards
	Matti Vaittinen
Vaittinen, Matti April 17, 2021, 8:49 a.m. UTC | #4
Hi Daniel,

Thank you for the review. Much appreciated!

On Sat, 2021-04-17 at 07:32 +0200, Daniel Lezcano wrote:
> On 14/04/2021 07:52, Matti Vaittinen wrote:
> > Thermal core contains a logic for safety shutdown. System is
> > attempted to
> > be powered off if temperature exceeds safety limits.
> > 
> > Currently this can be also utilized by regulator subsystem as a
> > final
> > protection measure if PMICs report dangerous over-voltage, over-
> > current or
> > over-temperature and if per regulator counter measures fail or do
> > not
> > exist.
> > 
> > Move this logic to kernel/reboot.c and export the functionality for
> > other
> > subsystems to use. Also replace the mutex with a spinlock to allow
> > using
> > the function from any context.
> > 
> > Also the EMIF bus code has implemented a safety shut-down. EMIF
> > does not
> > attempt orderly_poweroff at all. Thus the EMIF code is not
> > converted to use
> > this new function.
> > 
> > Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
> > ---
> > Changelog
> >  v7:
> >   - new patch
> > 
> > Please note - this patch has received only a minimal amount of
> > testing.
> > (The new API call was tested to shut-down my system at driver probe
> > but
> > no odd corner-cases have been tested).
> > 
> > Any testing for thermal shutdown is appreciated.
> > ---
> >  drivers/thermal/thermal_core.c | 63 ++-----------------------
> >  include/linux/reboot.h         |  1 +
> >  kernel/reboot.c                | 86
> > ++++++++++++++++++++++++++++++++++
> 
> Please send a patch implementing the reboot/shutdown and then another
> one replacing the thermal shutdown code by a call to the new API.

I guess your suggestion makes sense. That way if the change causes any
problems in thermal-core it can be reverted without impacting other
potential users of this API. My original thinking was that this was
more of an move of functionality than adding an API. Having the move as
one patch makes sense as it shows where the code came from.

> 
> >  3 files changed, 91 insertions(+), 59 deletions(-)
> > 
> > diff --git a/drivers/thermal/thermal_core.c
> > b/drivers/thermal/thermal_core.c
> > index 996c038f83a4..b1444845af38 100644
> > --- a/drivers/thermal/thermal_core.c
> > +++ b/drivers/thermal/thermal_core.c
> > @@ -36,10 +36,8 @@ static LIST_HEAD(thermal_governor_list);
> >  
> > 

...

> > +static bool prot_power_off_triggered;
> > +static DEFINE_SPINLOCK(poweroff_lock);
> > +
> > +/**
> > + * hw_protection_shutdown - Trigger an emergency system poweroff
> > + *
> > + * @reason:		Reason of emergency shutdown to be
> > printed.
> > + * @ms_until_forced:	Time to wait for orderly shutdown
> > before tiggering a
> > + *			forced shudown. Negative value disables the
> > forced
> > + *			shutdown.
> > + *
> > + * Initiate an emergency system shutdown in order to protect
> > hardware from
> > + * further damage. Usage examples include a thermal protection or
> > a voltage or
> > + * current regulator failures.
> > + * NOTE: The request is ignored if protection shutdown is already
> > pending even
> > + * if the previous request has given a large timeout for forced
> > shutdown.
> > + * Can be called from any context.
> > + */
> > +void hw_protection_shutdown(const char *reason, int
> > ms_until_forced)
> > +{
> > +	unsigned long flags;
> > +
> > +	pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);
> > +
> > +	spin_lock_irqsave(&poweroff_lock, flags);
> > +	if (prot_power_off_triggered) {
> > +		spin_unlock(&poweroff_lock);
> 
> Why not spin_unlock_irqrestore() ?
> 

Well spotted It for sure must be spin_unlock_irqrestore. My bad.

> > +		return;
> > +	}
> > +	prot_power_off_triggered = true;
> > +	spin_unlock_irqrestore(&poweroff_lock, flags);
> 
> Why not take the spin_lock definitively for all the procedure ?
> 
> eg.
> 
> {
> 	...
> 
> 	pr_emerg( ... );
> 
> 	if (spin_trylock(&lock))
> 		return;
> 
> 	hw_failure_emergency_poweroff(ms_until_forced);
> 
> 	orderly_poweroff(true);
> }
> 
> No need of prot_power_off_triggered and the spin_lock can be declared
> static inside the function.

I think this makes perfect sense. My thinking just jammed to replacing
the mutex thermal-core used with a spin-lock using similar logic. I
guess this could even be just an atomic cmpxchg (or equivalent, I don't
remember what atomic abstractions we have) just to return if function
has been previously executed. Well, the spin_trylock() should work just
fine as far as I can say. So - thanks.


Best Regards
	Matti Vaittinen
diff mbox series

Patch

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 996c038f83a4..b1444845af38 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -36,10 +36,8 @@  static LIST_HEAD(thermal_governor_list);
 
 static DEFINE_MUTEX(thermal_list_lock);
 static DEFINE_MUTEX(thermal_governor_lock);
-static DEFINE_MUTEX(poweroff_lock);
 
 static atomic_t in_suspend;
-static bool power_off_triggered;
 
 static struct thermal_governor *def_governor;
 
@@ -327,70 +325,18 @@  static void handle_non_critical_trips(struct thermal_zone_device *tz, int trip)
 		       def_governor->throttle(tz, trip);
 }
 
-/**
- * thermal_emergency_poweroff_func - emergency poweroff work after a known delay
- * @work: work_struct associated with the emergency poweroff function
- *
- * This function is called in very critical situations to force
- * a kernel poweroff after a configurable timeout value.
- */
-static void thermal_emergency_poweroff_func(struct work_struct *work)
-{
-	/*
-	 * We have reached here after the emergency thermal shutdown
-	 * Waiting period has expired. This means orderly_poweroff has
-	 * not been able to shut off the system for some reason.
-	 * Try to shut down the system immediately using kernel_power_off
-	 * if populated
-	 */
-	WARN(1, "Attempting kernel_power_off: Temperature too high\n");
-	kernel_power_off();
-
-	/*
-	 * Worst of the worst case trigger emergency restart
-	 */
-	WARN(1, "Attempting emergency_restart: Temperature too high\n");
-	emergency_restart();
-}
-
-static DECLARE_DELAYED_WORK(thermal_emergency_poweroff_work,
-			    thermal_emergency_poweroff_func);
-
-/**
- * thermal_emergency_poweroff - Trigger an emergency system poweroff
- *
- * This may be called from any critical situation to trigger a system shutdown
- * after a known period of time. By default this is not scheduled.
- */
-static void thermal_emergency_poweroff(void)
+void thermal_zone_device_critical(struct thermal_zone_device *tz)
 {
-	int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS;
 	/*
 	 * poweroff_delay_ms must be a carefully profiled positive value.
-	 * Its a must for thermal_emergency_poweroff_work to be scheduled
+	 * Its a must for forced_emergency_poweroff_work to be scheduled.
 	 */
-	if (poweroff_delay_ms <= 0)
-		return;
-	schedule_delayed_work(&thermal_emergency_poweroff_work,
-			      msecs_to_jiffies(poweroff_delay_ms));
-}
+	int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS;
 
-void thermal_zone_device_critical(struct thermal_zone_device *tz)
-{
 	dev_emerg(&tz->device, "%s: critical temperature reached, "
 		  "shutting down\n", tz->type);
 
-	mutex_lock(&poweroff_lock);
-	if (!power_off_triggered) {
-		/*
-		 * Queue a backup emergency shutdown in the event of
-		 * orderly_poweroff failure
-		 */
-		thermal_emergency_poweroff();
-		orderly_poweroff(true);
-		power_off_triggered = true;
-	}
-	mutex_unlock(&poweroff_lock);
+	hw_protection_shutdown("Temperature too high", poweroff_delay_ms);
 }
 EXPORT_SYMBOL(thermal_zone_device_critical);
 
@@ -1549,7 +1495,6 @@  static int __init thermal_init(void)
 	ida_destroy(&thermal_cdev_ida);
 	mutex_destroy(&thermal_list_lock);
 	mutex_destroy(&thermal_governor_lock);
-	mutex_destroy(&poweroff_lock);
 	return result;
 }
 postcore_initcall(thermal_init);
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 3734cd8f38a8..af907a3d68d1 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -79,6 +79,7 @@  extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
 
 extern void orderly_poweroff(bool force);
 extern void orderly_reboot(void);
+void hw_protection_shutdown(const char *reason, int ms_until_forced);
 
 /*
  * Emergency restart, callable from an interrupt handler.
diff --git a/kernel/reboot.c b/kernel/reboot.c
index a6ad5eb2fa73..1b5fa6d213d4 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -518,6 +518,92 @@  void orderly_reboot(void)
 }
 EXPORT_SYMBOL_GPL(orderly_reboot);
 
+/**
+ * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay
+ * @work: work_struct associated with the emergency poweroff function
+ *
+ * This function is called in very critical situations to force
+ * a kernel poweroff after a configurable timeout value.
+ */
+static void hw_failure_emergency_poweroff_func(struct work_struct *work)
+{
+	/*
+	 * We have reached here after the emergency shutdown waiting period has
+	 * expired. This means orderly_poweroff has not been able to shut off
+	 * the system for some reason.
+	 *
+	 * Try to shut down the system immediately using kernel_power_off
+	 * if populated
+	 */
+	WARN(1, "Hardware protection timed-out. Trying forced poweroff\n");
+	kernel_power_off();
+
+	/*
+	 * Worst of the worst case trigger emergency restart
+	 */
+	WARN(1,
+	     "Hardware protection shutdown failed. Trying emergency restart\n");
+	emergency_restart();
+}
+
+static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work,
+			    hw_failure_emergency_poweroff_func);
+
+/**
+ * hw_failure_emergency_poweroff - Trigger an emergency system poweroff
+ *
+ * This may be called from any critical situation to trigger a system shutdown
+ * after a given period of time. If time is negative this is not scheduled.
+ */
+static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
+{
+	if (poweroff_delay_ms <= 0)
+		return;
+	schedule_delayed_work(&hw_failure_emergency_poweroff_work,
+			      msecs_to_jiffies(poweroff_delay_ms));
+}
+
+static bool prot_power_off_triggered;
+static DEFINE_SPINLOCK(poweroff_lock);
+
+/**
+ * hw_protection_shutdown - Trigger an emergency system poweroff
+ *
+ * @reason:		Reason of emergency shutdown to be printed.
+ * @ms_until_forced:	Time to wait for orderly shutdown before tiggering a
+ *			forced shudown. Negative value disables the forced
+ *			shutdown.
+ *
+ * Initiate an emergency system shutdown in order to protect hardware from
+ * further damage. Usage examples include a thermal protection or a voltage or
+ * current regulator failures.
+ * NOTE: The request is ignored if protection shutdown is already pending even
+ * if the previous request has given a large timeout for forced shutdown.
+ * Can be called from any context.
+ */
+void hw_protection_shutdown(const char *reason, int ms_until_forced)
+{
+	unsigned long flags;
+
+	pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);
+
+	spin_lock_irqsave(&poweroff_lock, flags);
+	if (prot_power_off_triggered) {
+		spin_unlock(&poweroff_lock);
+		return;
+	}
+	prot_power_off_triggered = true;
+	spin_unlock_irqrestore(&poweroff_lock, flags);
+
+	/*
+	 * Queue a backup emergency shutdown in the event of
+	 * orderly_poweroff failure
+	 */
+	hw_failure_emergency_poweroff(ms_until_forced);
+	orderly_poweroff(true);
+}
+EXPORT_SYMBOL_GPL(hw_protection_shutdown);
+
 static int __init reboot_setup(char *str)
 {
 	for (;;) {