diff mbox series

[v3] drivers: watchdog: Add support for panic notifier callback

Message ID 20250225140615.2141119-1-george.cherian@marvell.com (mailing list archive)
State New
Headers show
Series [v3] drivers: watchdog: Add support for panic notifier callback | expand

Commit Message

George Cherian Feb. 25, 2025, 2:06 p.m. UTC
Watchdog is not turned off in kernel panic situation.
In certain systems this might prevent the successful loading
of kdump kernel. The kdump kernel might hit a watchdog reset
while it is booting.

To avoid such scenarios add a panic notifier call back function
which can stop the watchdog. This provision can be enabled by
passing watchdog.stop_on_panic=1 via kernel command-line parameter.

Signed-off-by: George Cherian <george.cherian@marvell.com>
---
Changelog:
v1 -> v2
- Remove the per driver flag setting option
- Take the parameter via kernel command-line parameter to watchdog_core.

v2 -> v3
- Remove the helper function watchdog_stop_on_panic() from watchdog.h.
- There are no users for this. 

 drivers/watchdog/watchdog_core.c | 42 ++++++++++++++++++++++++++++++++
 include/linux/watchdog.h         |  2 ++
 2 files changed, 44 insertions(+)

Comments

Guenter Roeck Feb. 25, 2025, 4:04 p.m. UTC | #1
On Tue, Feb 25, 2025 at 02:06:15PM +0000, George Cherian wrote:
> Watchdog is not turned off in kernel panic situation.
> In certain systems this might prevent the successful loading
> of kdump kernel. The kdump kernel might hit a watchdog reset
> while it is booting.
> 
> To avoid such scenarios add a panic notifier call back function
> which can stop the watchdog. This provision can be enabled by
> passing watchdog.stop_on_panic=1 via kernel command-line parameter.
> 
> Signed-off-by: George Cherian <george.cherian@marvell.com>
> ---
> Changelog:
> v1 -> v2
> - Remove the per driver flag setting option
> - Take the parameter via kernel command-line parameter to watchdog_core.
> 
> v2 -> v3
> - Remove the helper function watchdog_stop_on_panic() from watchdog.h.
> - There are no users for this. 
> 
>  drivers/watchdog/watchdog_core.c | 42 ++++++++++++++++++++++++++++++++
>  include/linux/watchdog.h         |  2 ++
>  2 files changed, 44 insertions(+)
> 
> diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c
> index d46d8c8c01f2..8cbebe38b7dd 100644
> --- a/drivers/watchdog/watchdog_core.c
> +++ b/drivers/watchdog/watchdog_core.c
> @@ -34,6 +34,7 @@
>  #include <linux/idr.h>		/* For ida_* macros */
>  #include <linux/err.h>		/* For IS_ERR macros */
>  #include <linux/of.h>		/* For of_get_timeout_sec */
> +#include <linux/panic_notifier.h> /* For panic handler */
>  #include <linux/suspend.h>
>  
>  #include "watchdog_core.h"	/* For watchdog_dev_register/... */
> @@ -47,6 +48,9 @@ static int stop_on_reboot = -1;
>  module_param(stop_on_reboot, int, 0444);
>  MODULE_PARM_DESC(stop_on_reboot, "Stop watchdogs on reboot (0=keep watching, 1=stop)");
>  
> +static int stop_on_panic = -1;
> +module_param(stop_on_panic, int, 0444);

This can now be bool.

> +MODULE_PARM_DESC(stop_on_panic, "Stop watchdogs on panic (0=keep watching, 1=stop)");
>  /*
>   * Deferred Registration infrastructure.
>   *
> @@ -155,6 +159,23 @@ int watchdog_init_timeout(struct watchdog_device *wdd,
>  }
>  EXPORT_SYMBOL_GPL(watchdog_init_timeout);
>  
> +static int watchdog_panic_notify(struct notifier_block *nb,
> +				 unsigned long action, void *data)
> +{
> +	struct watchdog_device *wdd;
> +
> +	wdd = container_of(nb, struct watchdog_device, panic_nb);
> +	if (watchdog_active(wdd)) {
> +		int ret;
> +
> +		ret = wdd->ops->stop(wdd);
> +		if (ret)
> +			return NOTIFY_BAD;
> +	}
> +
> +	return NOTIFY_DONE;
> +}
> +
>  static int watchdog_reboot_notifier(struct notifier_block *nb,
>  				    unsigned long code, void *data)
>  {
> @@ -299,6 +320,14 @@ static int ___watchdog_register_device(struct watchdog_device *wdd)
>  			clear_bit(WDOG_STOP_ON_REBOOT, &wdd->status);
>  	}
>  
> +	/* Module parameter to force watchdog policy on panic. */
> +	if (stop_on_panic != -1) {
> +		if (stop_on_panic &&  !test_bit(WDOG_NO_WAY_OUT, &wdd->status))
> +			set_bit(WDOG_STOP_ON_PANIC, &wdd->status);
> +		else
> +			clear_bit(WDOG_STOP_ON_PANIC, &wdd->status);
> +	}
> +

No longer needed here. See below.

>  	if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
>  		if (!wdd->ops->stop)
>  			pr_warn("watchdog%d: stop_on_reboot not supported\n", wdd->id);
> @@ -334,6 +363,16 @@ static int ___watchdog_register_device(struct watchdog_device *wdd)
>  				wdd->id, ret);
>  	}
>  
> +	if (test_bit(WDOG_STOP_ON_PANIC, &wdd->status)) {
> +		if (!wdd->ops->stop) {
> +			pr_warn("watchdog%d: stop_on_panic not supported\n", wdd->id);
> +		} else {
> +			wdd->panic_nb.notifier_call = watchdog_panic_notify;
> +			atomic_notifier_chain_register(&panic_notifier_list,
> +						       &wdd->panic_nb);
> +		}
> +	}

Simplify to
	if (stop_on_panic) {
		if (!wdd->ops->stop) {
			pr_warn("watchdog%d: stop_on_panic not supported\n", wdd->id);
		} else {
			wdd->panic_nb.notifier_call = watchdog_panic_notify;
			atomic_notifier_chain_register(&panic_notifier_list,
						       &wdd->panic_nb);
			set_bit(WDOG_STOP_ON_PANIC, &wdd->status);
		}
	}

This also fixes the bug where the unregistration function is called
even if the notifier was not actually registered.

One thing I just realized is that we'll have to figure out if atomic
notifiers can be used here unconditionally. Unless I am missing
something, watchdog stop functions can sleep. Of course, sleeping
while panic isn't a good idea. That means we _may_ need a driver
flag indicating either that the stop function can sleep or that it
won't. If we need that, I suggest we add WDIOF_STOP_MAYSLEEP or
similar to the watchdog_info options field.

Thanks,
Guenter
Guenter Roeck Feb. 26, 2025, 1:05 p.m. UTC | #2
On 2/26/25 01:14, George Cherian wrote:
> 
> 
> ________________________________________
> From: Guenter Roeck <groeck7@gmail.com> on behalf of Guenter Roeck <linux@roeck-us.net>
> Sent: Tuesday, February 25, 2025 21:34
> To: George Cherian
> Cc: wim@linux-watchdog.org; corbet@lwn.net; linux-watchdog@vger.kernel.org; linux-doc@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: [EXTERNAL] Re: [PATCH v3] drivers: watchdog: Add support for panic notifier callback
> 
> 
>>> On Tue, Feb 25, 2025 at 02:06:15PM +0000, George Cherian wrote:
>>> Watchdog is not turned off in kernel panic situation.
>>> In certain systems this might prevent the successful loading
>>> of kdump kernel. The kdump kernel might hit a watchdog reset
>>> while it is booting.
>>>
>>> To avoid such scenarios add a panic notifier call back function
>>> which can stop the watchdog. This provision can be enabled by
>>> passing watchdog.stop_on_panic=1 via kernel command-line parameter.
>>>
> v> Signed-off-by: George Cherian <george.cherian@marvell.com>
>>> ---
>>> Changelog:
>>> v1 -> v2
>>> - Remove the per driver flag setting option
>>> - Take the parameter via kernel command-line parameter to watchdog_core.
>>>
>>> v2 -> v3
>>> - Remove the helper function watchdog_stop_on_panic() from watchdog.h.
>>> - There are no users for this.
>>>
>>>   drivers/watchdog/watchdog_core.c | 42 ++++++++++++++++++++++++++++++++
>>>   include/linux/watchdog.h         |  2 ++
>>>   2 files changed, 44 insertions(+)
>>>
>>> diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c
>>> index d46d8c8c01f2..8cbebe38b7dd 100644
>>> --- a/drivers/watchdog/watchdog_core.c
>>> +++ b/drivers/watchdog/watchdog_core.c
>>> @@ -34,6 +34,7 @@
>>>   #include <linux/idr.h>               /* For ida_* macros */
>>>   #include <linux/err.h>               /* For IS_ERR macros */
>>>   #include <linux/of.h>                /* For of_get_timeout_sec */
>>> +#include <linux/panic_notifier.h> /* For panic handler */
>>>   #include <linux/suspend.h>
>>>
>>>   #include "watchdog_core.h"   /* For watchdog_dev_register/... */
>>> @@ -47,6 +48,9 @@ static int stop_on_reboot = -1;
>>>   module_param(stop_on_reboot, int, 0444);
>>>   MODULE_PARM_DESC(stop_on_reboot, "Stop watchdogs on reboot (0=keep watching, 1=stop)");
>>>
>>> +static int stop_on_panic = -1;
>>> +module_param(stop_on_panic, int, 0444);
> 
>> This can now be bool.
> Ack.
>>> +MODULE_PARM_DESC(stop_on_panic, "Stop watchdogs on panic (0=keep watching, 1=stop)");
>>>   /*
>>>    * Deferred Registration infrastructure.
>>>    *
>>> @@ -155,6 +159,23 @@ int watchdog_init_timeout(struct watchdog_device *wdd,
>>>   }
>>>   EXPORT_SYMBOL_GPL(watchdog_init_timeout);
>>>
>>> +static int watchdog_panic_notify(struct notifier_block *nb,
>>> +                              unsigned long action, void *data)
>>> +{
>>> +     struct watchdog_device *wdd;
>>> +
>>> +     wdd = container_of(nb, struct watchdog_device, panic_nb);
>>> +     if (watchdog_active(wdd)) {
>>> +             int ret;
>>> +
>>> +             ret = wdd->ops->stop(wdd);
>>> +             if (ret)
>>> +                     return NOTIFY_BAD;
>>> +     }
>>> +
>>> +     return NOTIFY_DONE;
>>> +}
>>> +
>>>   static int watchdog_reboot_notifier(struct notifier_block *nb,
>>>                                    unsigned long code, void *data)
>>>   {
>>> @@ -299,6 +320,14 @@ static int ___watchdog_register_device(struct watchdog_device *wdd)
>>>                        clear_bit(WDOG_STOP_ON_REBOOT, &wdd->status);
>>>        }
>>>
>>> +     /* Module parameter to force watchdog policy on panic. */
>>> +     if (stop_on_panic != -1) {
>>> +             if (stop_on_panic &&  !test_bit(WDOG_NO_WAY_OUT, &wdd->status))
>>> +                     set_bit(WDOG_STOP_ON_PANIC, &wdd->status);
>>> +             else
>>> +                     clear_bit(WDOG_STOP_ON_PANIC, &wdd->status);
>>> +     }
>>> +
> 
>> No longer needed here. See below.
>>
> Ack Got it.
>>>        if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
>>>                if (!wdd->ops->stop)
>>>                        pr_warn("watchdog%d: stop_on_reboot not supported\n", wdd->id);
>>> @@ -334,6 +363,16 @@ static int ___watchdog_register_device(struct watchdog_device *wdd)
>>>                                wdd->id, ret);
>>>        }
>>>
>>> +     if (test_bit(WDOG_STOP_ON_PANIC, &wdd->status)) {
>>> +             if (!wdd->ops->stop) {
>>> +                     pr_warn("watchdog%d: stop_on_panic not supported\n", wdd->id);
>>> +             } else {
>>> +                     wdd->panic_nb.notifier_call = watchdog_panic_notify;
>>> +                     atomic_notifier_chain_register(&panic_notifier_list,
>>> +                                                    &wdd->panic_nb);
>>> +             }
>>> +     }
>>
>> Simplify to
>>        if (stop_on_panic) {
>>                 if (!wdd->ops->stop) {
>>                       pr_warn("watchdog%d: stop_on_panic not supported\n", wdd->id);
>>                 } else {
>>                         wdd->panic_nb.notifier_call = watchdog_panic_notify;
>>                         atomic_notifier_chain_register(&panic_notifier_list,
>>                                                        &wdd->panic_nb);
>>                         set_bit(WDOG_STOP_ON_PANIC, &wdd->status);
>>                 }
>>         }
> Okay will update to this.
> 
>> This also fixes the bug where the unregistration function is called
>> even if the notifier was not actually registered.
> 
>> One thing I just realized is that we'll have to figure out if atomic
>> notifiers can be used here unconditionally. Unless I am missing
>> something, watchdog stop functions can sleep. Of course, sleeping
>> while panic isn't a good idea. That means we _may_ need a driver
>> flag indicating either that the stop function can sleep or that it
>> won't. If we need that, I suggest we add WDIOF_STOP_MAYSLEEP or
>> similar to the watchdog_info options field.
> 
> Yes, that is correct there are certain .stop implementations which can sleep.
> I will add a new WDIOF_STOP_MAYSLEEP  flag and enable the drivers with
> this new flag. Only those drivers which have non-sleeping stop function will
> be able to have this feature.
> 
> I hope this is what you are expecting.

Yes, that would be great. Please add the flag in a separate patch.

Thanks,
Guenter
diff mbox series

Patch

diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c
index d46d8c8c01f2..8cbebe38b7dd 100644
--- a/drivers/watchdog/watchdog_core.c
+++ b/drivers/watchdog/watchdog_core.c
@@ -34,6 +34,7 @@ 
 #include <linux/idr.h>		/* For ida_* macros */
 #include <linux/err.h>		/* For IS_ERR macros */
 #include <linux/of.h>		/* For of_get_timeout_sec */
+#include <linux/panic_notifier.h> /* For panic handler */
 #include <linux/suspend.h>
 
 #include "watchdog_core.h"	/* For watchdog_dev_register/... */
@@ -47,6 +48,9 @@  static int stop_on_reboot = -1;
 module_param(stop_on_reboot, int, 0444);
 MODULE_PARM_DESC(stop_on_reboot, "Stop watchdogs on reboot (0=keep watching, 1=stop)");
 
+static int stop_on_panic = -1;
+module_param(stop_on_panic, int, 0444);
+MODULE_PARM_DESC(stop_on_panic, "Stop watchdogs on panic (0=keep watching, 1=stop)");
 /*
  * Deferred Registration infrastructure.
  *
@@ -155,6 +159,23 @@  int watchdog_init_timeout(struct watchdog_device *wdd,
 }
 EXPORT_SYMBOL_GPL(watchdog_init_timeout);
 
+static int watchdog_panic_notify(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct watchdog_device *wdd;
+
+	wdd = container_of(nb, struct watchdog_device, panic_nb);
+	if (watchdog_active(wdd)) {
+		int ret;
+
+		ret = wdd->ops->stop(wdd);
+		if (ret)
+			return NOTIFY_BAD;
+	}
+
+	return NOTIFY_DONE;
+}
+
 static int watchdog_reboot_notifier(struct notifier_block *nb,
 				    unsigned long code, void *data)
 {
@@ -299,6 +320,14 @@  static int ___watchdog_register_device(struct watchdog_device *wdd)
 			clear_bit(WDOG_STOP_ON_REBOOT, &wdd->status);
 	}
 
+	/* Module parameter to force watchdog policy on panic. */
+	if (stop_on_panic != -1) {
+		if (stop_on_panic &&  !test_bit(WDOG_NO_WAY_OUT, &wdd->status))
+			set_bit(WDOG_STOP_ON_PANIC, &wdd->status);
+		else
+			clear_bit(WDOG_STOP_ON_PANIC, &wdd->status);
+	}
+
 	if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
 		if (!wdd->ops->stop)
 			pr_warn("watchdog%d: stop_on_reboot not supported\n", wdd->id);
@@ -334,6 +363,16 @@  static int ___watchdog_register_device(struct watchdog_device *wdd)
 				wdd->id, ret);
 	}
 
+	if (test_bit(WDOG_STOP_ON_PANIC, &wdd->status)) {
+		if (!wdd->ops->stop) {
+			pr_warn("watchdog%d: stop_on_panic not supported\n", wdd->id);
+		} else {
+			wdd->panic_nb.notifier_call = watchdog_panic_notify;
+			atomic_notifier_chain_register(&panic_notifier_list,
+						       &wdd->panic_nb);
+		}
+	}
+
 	return 0;
 }
 
@@ -390,6 +429,9 @@  static void __watchdog_unregister_device(struct watchdog_device *wdd)
 	if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status))
 		unregister_reboot_notifier(&wdd->reboot_nb);
 
+	if (test_bit(WDOG_STOP_ON_PANIC, &wdd->status))
+		atomic_notifier_chain_unregister(&panic_notifier_list,
+						 &wdd->panic_nb);
 	watchdog_dev_unregister(wdd);
 	ida_free(&watchdog_ida, wdd->id);
 }
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 99660197a36c..ef6f1136a4c5 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -108,6 +108,7 @@  struct watchdog_device {
 	struct notifier_block reboot_nb;
 	struct notifier_block restart_nb;
 	struct notifier_block pm_nb;
+	struct notifier_block panic_nb;
 	void *driver_data;
 	struct watchdog_core_data *wd_data;
 	unsigned long status;
@@ -118,6 +119,7 @@  struct watchdog_device {
 #define WDOG_HW_RUNNING		3	/* True if HW watchdog running */
 #define WDOG_STOP_ON_UNREGISTER	4	/* Should be stopped on unregister */
 #define WDOG_NO_PING_ON_SUSPEND	5	/* Ping worker should be stopped on suspend */
+#define WDOG_STOP_ON_PANIC	6	/* Should be stopped on panic for loading kdump kernels */
 	struct list_head deferred;
 };