diff mbox

[v2,09/11] misc: throttler: Add core support for non-thermal throttling

Message ID 20180607181214.30338-10-mka@chromium.org (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Matthias Kaehlcke June 7, 2018, 6:12 p.m. UTC
The purpose of the throttler is to provide support for non-thermal
throttling. Throttling is triggered by external event, e.g. the
detection of a high battery discharge current, close to the OCP limit
of the battery. The throttler is only in charge of the throttling, not
the monitoring, which is done by another (possibly platform specific)
driver.

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
---
Changes in v2:
- completely reworked the driver to support configuration through OPPs, which
  requires a more dynamic handling
- added sysfs attribute to set the level for debugging and testing
- Makefile: depend on Kconfig option to traverse throttler directory
- Kconfig: removed 'default n'
- added SPDX line instead of license boiler-plate
- added entry to MAINTAINERS file


 MAINTAINERS                     |   7 +
 drivers/misc/Kconfig            |   1 +
 drivers/misc/Makefile           |   1 +
 drivers/misc/throttler/Kconfig  |  14 +
 drivers/misc/throttler/Makefile |   1 +
 drivers/misc/throttler/core.c   | 642 ++++++++++++++++++++++++++++++++
 include/linux/throttler.h       |  11 +
 7 files changed, 677 insertions(+)
 create mode 100644 drivers/misc/throttler/Kconfig
 create mode 100644 drivers/misc/throttler/Makefile
 create mode 100644 drivers/misc/throttler/core.c
 create mode 100644 include/linux/throttler.h

Comments

kernel test robot June 9, 2018, 4:34 a.m. UTC | #1
Hi Matthias,

I love your patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.17 next-20180608]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Matthias-Kaehlcke/Add-throttler-driver-for-non-thermal-throttling/20180609-061437
config: openrisc-allmodconfig (attached as .config)
compiler: or1k-linux-gcc (GCC) 6.0.0 20160327 (experimental)
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=openrisc 

All errors (new ones prefixed by >>):

   drivers/misc/throttler/core.c: In function 'thr_cpufreq_update_policy':
>> drivers/misc/throttler/core.c:417:3: error: implicit declaration of function 'cpufreq_update_policy' [-Werror=implicit-function-declaration]
      cpufreq_update_policy(cftd->cpu);
      ^~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +/cpufreq_update_policy +417 drivers/misc/throttler/core.c

   391	
   392	static void thr_cpufreq_update_policy(struct throttler *thr)
   393	{
   394		struct cpufreq_thrdev *cftd;
   395	
   396		WARN_ON(!mutex_is_locked(&thr->lock));
   397	
   398		list_for_each_entry(cftd, &thr->cpufreq.list, node) {
   399			struct cpufreq_policy *policy = cpufreq_cpu_get(cftd->cpu);
   400	
   401			if (!policy) {
   402				dev_warn(thr->dev,
   403					 "CPU%d does have no cpufreq policy!\n",
   404					 cftd->cpu);
   405				continue;
   406			}
   407	
   408			/*
   409			 * The lock isn't really needed in this function, the list
   410			 * of cpufreq devices can be extended, but no items are
   411			 * deleted during the lifetime of the throttler. Releasing
   412			 * the lock is necessary since cpufreq_update_policy() ends
   413			 * up calling thr_handle_cpufreq_event(), which needs to
   414			 * acquire the lock.
   415			 */
   416			mutex_unlock(&thr->lock);
 > 417			cpufreq_update_policy(cftd->cpu);
   418			mutex_lock(&thr->lock);
   419	
   420			cpufreq_cpu_put(policy);
   421		}
   422	}
   423	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Brian Norris June 12, 2018, 1:49 a.m. UTC | #2
Hi!

A few comments, but I didn't get to finish a thorough review yet.


On Thu, Jun 07, 2018 at 11:12:12AM -0700, Matthias Kaehlcke wrote:
> The purpose of the throttler is to provide support for non-thermal
> throttling. Throttling is triggered by external event, e.g. the
> detection of a high battery discharge current, close to the OCP limit
> of the battery. The throttler is only in charge of the throttling, not
> the monitoring, which is done by another (possibly platform specific)
> driver.
> 
> Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
> ---
> Changes in v2:
> - completely reworked the driver to support configuration through OPPs, which
>   requires a more dynamic handling
> - added sysfs attribute to set the level for debugging and testing
> - Makefile: depend on Kconfig option to traverse throttler directory
> - Kconfig: removed 'default n'
> - added SPDX line instead of license boiler-plate
> - added entry to MAINTAINERS file
> 
> 
>  MAINTAINERS                     |   7 +
>  drivers/misc/Kconfig            |   1 +
>  drivers/misc/Makefile           |   1 +
>  drivers/misc/throttler/Kconfig  |  14 +
>  drivers/misc/throttler/Makefile |   1 +
>  drivers/misc/throttler/core.c   | 642 ++++++++++++++++++++++++++++++++
>  include/linux/throttler.h       |  11 +
>  7 files changed, 677 insertions(+)
>  create mode 100644 drivers/misc/throttler/Kconfig
>  create mode 100644 drivers/misc/throttler/Makefile
>  create mode 100644 drivers/misc/throttler/core.c
>  create mode 100644 include/linux/throttler.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 92e47b5b0480..f9550e5680ce 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13938,6 +13938,13 @@ T:	git git://repo.or.cz/linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git
>  S:	Maintained
>  F:	drivers/platform/x86/thinkpad_acpi.c
>  
> +THROTTLER DRIVERS
> +M:	Matthias Kaehlcke <mka@chromium.org>
> +L:	linux-pm@vger.kernel.org
> +S:	Maintained
> +F:	drivers/misc/throttler/
> +F:	include/linux/throttler.h
> +
>  THUNDERBOLT DRIVER
>  M:	Andreas Noever <andreas.noever@gmail.com>
>  M:	Michael Jamet <michael.jamet@intel.com>
> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
> index 5d713008749b..691d9625d83c 100644
> --- a/drivers/misc/Kconfig
> +++ b/drivers/misc/Kconfig
> @@ -513,4 +513,5 @@ source "drivers/misc/echo/Kconfig"
>  source "drivers/misc/cxl/Kconfig"
>  source "drivers/misc/ocxl/Kconfig"
>  source "drivers/misc/cardreader/Kconfig"
> +source "drivers/misc/throttler/Kconfig"
>  endmenu
> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
> index 20be70c3f118..b549ccad5215 100644
> --- a/drivers/misc/Makefile
> +++ b/drivers/misc/Makefile
> @@ -57,3 +57,4 @@ obj-$(CONFIG_ASPEED_LPC_SNOOP)	+= aspeed-lpc-snoop.o
>  obj-$(CONFIG_PCI_ENDPOINT_TEST)	+= pci_endpoint_test.o
>  obj-$(CONFIG_OCXL)		+= ocxl/
>  obj-$(CONFIG_MISC_RTSX)		+= cardreader/
> +obj-$(CONFIG_THROTTLER)		+= throttler/
> diff --git a/drivers/misc/throttler/Kconfig b/drivers/misc/throttler/Kconfig
> new file mode 100644
> index 000000000000..e561f1df5085
> --- /dev/null
> +++ b/drivers/misc/throttler/Kconfig
> @@ -0,0 +1,14 @@
> +# SPDX-License-Identifier: GPL-2.0
> +
> +menuconfig THROTTLER
> +	bool "Throttler support"
> +	depends on OF
> +	select CPU_FREQ
> +	select PM_DEVFREQ

I'm curious whether we're actually truly compile-time dependent on
{CPU,DEV}FREQ? It seems like those subsystems mostly stub out stuff so
they fall back to no-ops if not built in.

I know that's not very useful for your existing throttler, since it
wouldn't be very effective if one or both were disabled.

> +	help
> +	  This option enables core support for non-thermal throttling of CPUs
> +	  and devfreq devices.
> +
> +	  Note that you also need a event monitor module usually called
> +	  *_throttler.
> +
> diff --git a/drivers/misc/throttler/Makefile b/drivers/misc/throttler/Makefile
> new file mode 100644
> index 000000000000..c8d920cee315
> --- /dev/null
> +++ b/drivers/misc/throttler/Makefile
> @@ -0,0 +1 @@
> +obj-$(CONFIG_THROTTLER)		+= core.o
> diff --git a/drivers/misc/throttler/core.c b/drivers/misc/throttler/core.c
> new file mode 100644
> index 000000000000..15b50c111032
> --- /dev/null
> +++ b/drivers/misc/throttler/core.c
> @@ -0,0 +1,642 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Core code for non-thermal throttling
> + *
> + * Copyright (C) 2018 Google, Inc.
> + */
> +
> +#include <linux/cpufreq.h>
> +#include <linux/cpumask.h>
> +#include <linux/devfreq.h>
> +#include <linux/kernel.h>
> +#include <linux/list.h>
> +#include <linux/mutex.h>
> +#include <linux/notifier.h>
> +#include <linux/of.h>
> +#include <linux/of_platform.h>
> +#include <linux/platform_device.h>
> +#include <linux/pm_opp.h>
> +#include <linux/sort.h>
> +#include <linux/slab.h>
> +#include <linux/throttler.h>
> +
> +/*
> + * Non-thermal throttling: throttling of system components in response to
> + * external events (e.g. high battery discharge current).
> + *
> + * The throttler supports throttling through cpufreq and devfreq. Multiple
> + * levels of throttling can be configured. At level 0 no throttling is
> + * active on behalf of the throttler, for values > 0 throttling is typically
> + * configured to be increasingly aggressive with each level.
> + * The number of throttling levels is not limited by the throttler (though
> + * it is likely limited by the throttling devices). It is not necessary to
> + * configure the same number of levels for all throttling devices. If the
> + * requested throttling level for a device is higher than the maximum level
> + * of the device the throttler will select the maximum throttling level of
> + * the device.
> + *
> + * Non-thermal throttling is split in two parts:
> + *
> + * - throttler core
> + *   - parses the thermal policy
> + *   - applies throttling settings for a requested level of throttling
> + *
> + * - event monitor driver
> + *   - monitors events that trigger throttling
> + *   - determines the throttling level (often limited to on/off)
> + *   - asks throttler core to apply throttling settings
> + *
> + * It is possible for a system to have more than one throttler and the
> + * throttlers may make use of the same throttling devices, in case of
> + * conflicting settings for a device the more aggressive values will be
> + * applied.
> + *
> + */
> +
> +#define ci_to_throttler(ci) \
> +	container_of(ci, struct throttler, devfreq.class_iface)
> +
> +// #define DEBUG_THROTTLER

Did you mean to leave your debug code in? Seems like you have some
related dead code under #ifdefs.

(If you do want this, maybe it'd be better under debugfs, until somebody
really wants to formalize and document it.)

> +
> +struct thr_freq_table {
> +	uint32_t *freqs;
> +	int n_entries;
> +};
> +
> +struct cpufreq_thrdev {
> +	uint32_t cpu;
> +	struct thr_freq_table freq_table;
> +	struct list_head node;
> +};
> +
> +struct devfreq_thrdev {
> +	struct devfreq *devfreq;
> +	struct thr_freq_table freq_table;
> +	struct throttler *thr;
> +	struct notifier_block nb;
> +	struct list_head node;
> +};
> +
> +struct __thr_cpufreq {
> +	struct list_head list;
> +	cpumask_t cm_initialized;
> +	cpumask_t cm_ignore;
> +	struct notifier_block nb;
> +};
> +
> +struct __thr_devfreq {
> +	struct list_head list;
> +	struct class_interface class_iface;
> +};
> +
> +struct throttler {
> +	struct device *dev;
> +	int level;
> +	struct __thr_cpufreq cpufreq;
> +	struct __thr_devfreq devfreq;
> +	struct mutex lock;
> +};
> +
> +static inline int cmp_freqs(const void *a, const void *b)
> +{
> +	const uint32_t *pa = a, *pb = b;
> +
> +	if (*pa < *pb)
> +		return 1;
> +	else if (*pa > *pb)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int thr_handle_devfreq_event(struct notifier_block *nb,
> +				    unsigned long event, void *data);
> +
> +static unsigned long thr_get_throttling_freq(struct thr_freq_table *ft,
> +					     int level)
> +{
> +	if (level == 0) {
> +		WARN(true, "level == 0");
> +		return ULONG_MAX;
> +	}
> +
> +	if (level <= ft->n_entries)
> +		return ft->freqs[level - 1];
> +	else
> +		return ft->freqs[ft->n_entries - 1];
> +}
> +
> +static int thr_init_freq_table(struct throttler *thr, struct device *opp_dev,
> +			       struct thr_freq_table *ft)
> +{
> +	struct device_node *np_opp_desc, *np_opp;
> +	int nchilds;
> +	uint32_t *freqs;
> +	int nfreqs = 0;
> +	int err = 0;
> +
> +	np_opp_desc = dev_pm_opp_of_get_opp_desc_node(opp_dev);
> +	if (!np_opp_desc)
> +		return -EINVAL;
> +
> +	nchilds = of_get_child_count(np_opp_desc);
> +	if (!nchilds) {
> +		err = -EINVAL;
> +		goto out_node_put;
> +	}
> +
> +	freqs = kzalloc(nchilds * sizeof(uint32_t), GFP_KERNEL);
> +	if (!freqs) {
> +		err = -ENOMEM;
> +		goto out_node_put;
> +	}
> +
> +	/* determine which OPPs are used by this throttler (if any) */
> +	for_each_child_of_node(np_opp_desc, np_opp) {
> +		int num_thr;
> +		int i;
> +
> +		num_thr = of_property_count_u32_elems(np_opp, "opp-throttlers");
> +		if (num_thr < 0)
> +			continue;
> +
> +		for (i = 0; i < num_thr; i++) {
> +			struct device_node *np_thr;
> +			struct platform_device *pdev;
> +
> +			np_thr = of_parse_phandle(np_opp, "opp-throttlers", i);
> +			if (!np_thr) {
> +				dev_err(thr->dev,
> +					"failed to parse phandle %d: %s\n", i,
> +					np_opp->full_name);
> +				continue;
> +			}
> +
> +			pdev = of_find_device_by_node(np_thr);
> +			if (!pdev) {
> +				dev_err(thr->dev,
> +					"could not find throttler dev: %s\n",
> +					 np_thr->full_name);
> +				of_node_put(np_thr);
> +				continue;
> +			}
> +
> +			/* OPP is used by this throttler */
> +			if (&pdev->dev == thr->dev) {

So you're assuming that all throttlers are platform devices? Seems
slightly shaky; I could easily imagine a similar device that's a SPI or
I2C device.

> +				u64 rate;
> +
> +				err = of_property_read_u64(np_opp, "opp-hz",
> +							   &rate);
> +				if (!err) {
> +					freqs[nfreqs] = rate;
> +					nfreqs++;
> +				} else {
> +					dev_err(thr->dev,
> +						"opp-hz not found: %s\n",
> +						np_opp->full_name);
> +				}
> +			}
> +
> +			of_node_put(np_thr);
> +			put_device(&pdev->dev);
> +		}
> +	}
> +
> +	if (nfreqs > 0) {
> +		/* sort frequencies in descending order */
> +		sort(freqs, nfreqs, sizeof(*freqs), cmp_freqs, NULL);
> +
> +		ft->n_entries = nfreqs;
> +		ft->freqs = devm_kzalloc(thr->dev,
> +				  nfreqs * sizeof(*freqs), GFP_KERNEL);
> +		if (!ft->freqs) {
> +			err = -ENOMEM;
> +			goto out_free;
> +		}
> +
> +		memcpy(ft->freqs, freqs, nfreqs * sizeof(*freqs));
> +	} else {
> +		err = -ENODEV;
> +	}
> +
> +out_free:
> +	kfree(freqs);
> +
> +out_node_put:
> +	of_node_put(np_opp_desc);
> +
> +	return err;
> +}

[...]

Brian
Matthias Kaehlcke June 12, 2018, 5:11 p.m. UTC | #3
Hi,

On Mon, Jun 11, 2018 at 06:49:13PM -0700, Brian Norris wrote:
> Hi!
> 
> A few comments, but I didn't get to finish a thorough review yet.
> 
> 
> On Thu, Jun 07, 2018 at 11:12:12AM -0700, Matthias Kaehlcke wrote:
> > The purpose of the throttler is to provide support for non-thermal
> > throttling. Throttling is triggered by external event, e.g. the
> > detection of a high battery discharge current, close to the OCP limit
> > of the battery. The throttler is only in charge of the throttling, not
> > the monitoring, which is done by another (possibly platform specific)
> > driver.
> > 
> > Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
> > ---
> > Changes in v2:
> > - completely reworked the driver to support configuration through OPPs, which
> >   requires a more dynamic handling
> > - added sysfs attribute to set the level for debugging and testing
> > - Makefile: depend on Kconfig option to traverse throttler directory
> > - Kconfig: removed 'default n'
> > - added SPDX line instead of license boiler-plate
> > - added entry to MAINTAINERS file
> > 
> > 
> >  MAINTAINERS                     |   7 +
> >  drivers/misc/Kconfig            |   1 +
> >  drivers/misc/Makefile           |   1 +
> >  drivers/misc/throttler/Kconfig  |  14 +
> >  drivers/misc/throttler/Makefile |   1 +
> >  drivers/misc/throttler/core.c   | 642 ++++++++++++++++++++++++++++++++
> >  include/linux/throttler.h       |  11 +
> >  7 files changed, 677 insertions(+)
> >  create mode 100644 drivers/misc/throttler/Kconfig
> >  create mode 100644 drivers/misc/throttler/Makefile
> >  create mode 100644 drivers/misc/throttler/core.c
> >  create mode 100644 include/linux/throttler.h
> > 
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index 92e47b5b0480..f9550e5680ce 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -13938,6 +13938,13 @@ T:	git git://repo.or.cz/linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git
> >  S:	Maintained
> >  F:	drivers/platform/x86/thinkpad_acpi.c
> >  
> > +THROTTLER DRIVERS
> > +M:	Matthias Kaehlcke <mka@chromium.org>
> > +L:	linux-pm@vger.kernel.org
> > +S:	Maintained
> > +F:	drivers/misc/throttler/
> > +F:	include/linux/throttler.h
> > +
> >  THUNDERBOLT DRIVER
> >  M:	Andreas Noever <andreas.noever@gmail.com>
> >  M:	Michael Jamet <michael.jamet@intel.com>
> > diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
> > index 5d713008749b..691d9625d83c 100644
> > --- a/drivers/misc/Kconfig
> > +++ b/drivers/misc/Kconfig
> > @@ -513,4 +513,5 @@ source "drivers/misc/echo/Kconfig"
> >  source "drivers/misc/cxl/Kconfig"
> >  source "drivers/misc/ocxl/Kconfig"
> >  source "drivers/misc/cardreader/Kconfig"
> > +source "drivers/misc/throttler/Kconfig"
> >  endmenu
> > diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
> > index 20be70c3f118..b549ccad5215 100644
> > --- a/drivers/misc/Makefile
> > +++ b/drivers/misc/Makefile
> > @@ -57,3 +57,4 @@ obj-$(CONFIG_ASPEED_LPC_SNOOP)	+= aspeed-lpc-snoop.o
> >  obj-$(CONFIG_PCI_ENDPOINT_TEST)	+= pci_endpoint_test.o
> >  obj-$(CONFIG_OCXL)		+= ocxl/
> >  obj-$(CONFIG_MISC_RTSX)		+= cardreader/
> > +obj-$(CONFIG_THROTTLER)		+= throttler/
> > diff --git a/drivers/misc/throttler/Kconfig b/drivers/misc/throttler/Kconfig
> > new file mode 100644
> > index 000000000000..e561f1df5085
> > --- /dev/null
> > +++ b/drivers/misc/throttler/Kconfig
> > @@ -0,0 +1,14 @@
> > +# SPDX-License-Identifier: GPL-2.0
> > +
> > +menuconfig THROTTLER
> > +	bool "Throttler support"
> > +	depends on OF
> > +	select CPU_FREQ
> > +	select PM_DEVFREQ
> 
> I'm curious whether we're actually truly compile-time dependent on
> {CPU,DEV}FREQ? It seems like those subsystems mostly stub out stuff so
> they fall back to no-ops if not built in.
>
> I know that's not very useful for your existing throttler, since it
> wouldn't be very effective if one or both were disabled.

The idea is not to depend on both options being enabled, since
throttling of one type might be all that is needed.

As the build bot pointed out cpufreq doesn't stub out all functions.
Probably the cleanest way to work around this is to define a stub for
cpufreq_update_policy() in the throttler when CONFIG_CPU_FREQ is not
defined.

> > diff --git a/drivers/misc/throttler/core.c b/drivers/misc/throttler/core.c
> > new file mode 100644
> > index 000000000000..15b50c111032
> > --- /dev/null
> > +++ b/drivers/misc/throttler/core.c
> > ...
> > +// #define DEBUG_THROTTLER
> 
> Did you mean to leave your debug code in? Seems like you have some
> related dead code under #ifdefs.

Yes, I left it in intentionally.

> (If you do want this, maybe it'd be better under debugfs, until somebody
> really wants to formalize and document it.)

Since it is code that is never enabled in an official kernel build I
found it simpler to use sysfs with a direct association with the
device, instead of having <debugfs>/throttler/<throttler_name>/level.

If folks have strong opinions about using debugfs or not having the
debug code at all I'm fine with changing/dropping it.

> > +static int thr_init_freq_table(struct throttler *thr, struct device *opp_dev,
> > +			       struct thr_freq_table *ft)
> > +{
> > +	struct device_node *np_opp_desc, *np_opp;
> > +	int nchilds;
> > +	uint32_t *freqs;
> > +	int nfreqs = 0;
> > +	int err = 0;
> > +
> > +	np_opp_desc = dev_pm_opp_of_get_opp_desc_node(opp_dev);
> > +	if (!np_opp_desc)
> > +		return -EINVAL;
> > +
> > +	nchilds = of_get_child_count(np_opp_desc);
> > +	if (!nchilds) {
> > +		err = -EINVAL;
> > +		goto out_node_put;
> > +	}
> > +
> > +	freqs = kzalloc(nchilds * sizeof(uint32_t), GFP_KERNEL);
> > +	if (!freqs) {
> > +		err = -ENOMEM;
> > +		goto out_node_put;
> > +	}
> > +
> > +	/* determine which OPPs are used by this throttler (if any) */
> > +	for_each_child_of_node(np_opp_desc, np_opp) {
> > +		int num_thr;
> > +		int i;
> > +
> > +		num_thr = of_property_count_u32_elems(np_opp, "opp-throttlers");
> > +		if (num_thr < 0)
> > +			continue;
> > +
> > +		for (i = 0; i < num_thr; i++) {
> > +			struct device_node *np_thr;
> > +			struct platform_device *pdev;
> > +
> > +			np_thr = of_parse_phandle(np_opp, "opp-throttlers", i);
> > +			if (!np_thr) {
> > +				dev_err(thr->dev,
> > +					"failed to parse phandle %d: %s\n", i,
> > +					np_opp->full_name);
> > +				continue;
> > +			}
> > +
> > +			pdev = of_find_device_by_node(np_thr);
> > +			if (!pdev) {
> > +				dev_err(thr->dev,
> > +					"could not find throttler dev: %s\n",
> > +					 np_thr->full_name);
> > +				of_node_put(np_thr);
> > +				continue;
> > +			}
> > +
> > +			/* OPP is used by this throttler */
> > +			if (&pdev->dev == thr->dev) {
> 
> So you're assuming that all throttlers are platform devices? Seems
> slightly shaky; I could easily imagine a similar device that's a SPI or
> I2C device.

As of now that's the only existing throttler. Adding handling for
throttlers on other buses that might never be implemented seems
premature. If throttlers with other bus types are added the core
code can be extended to support this using
of_find_i2c_device_by_node(), of_find_spi_device_by_node(), etc

Thanks

Matthias
Brian Norris June 12, 2018, 8 p.m. UTC | #4
Hi,

On Tue, Jun 12, 2018 at 10:11:40AM -0700, Matthias Kaehlcke wrote:
> On Mon, Jun 11, 2018 at 06:49:13PM -0700, Brian Norris wrote:
> > On Thu, Jun 07, 2018 at 11:12:12AM -0700, Matthias Kaehlcke wrote:
> > > The purpose of the throttler is to provide support for non-thermal
> > > throttling. Throttling is triggered by external event, e.g. the
> > > detection of a high battery discharge current, close to the OCP limit
> > > of the battery. The throttler is only in charge of the throttling, not
> > > the monitoring, which is done by another (possibly platform specific)
> > > driver.
> > > 
> > > Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
> > > ---
> > > Changes in v2:
> > > - completely reworked the driver to support configuration through OPPs, which
> > >   requires a more dynamic handling
> > > - added sysfs attribute to set the level for debugging and testing
> > > - Makefile: depend on Kconfig option to traverse throttler directory
> > > - Kconfig: removed 'default n'
> > > - added SPDX line instead of license boiler-plate
> > > - added entry to MAINTAINERS file
> > > 
> > > 
> > >  MAINTAINERS                     |   7 +
> > >  drivers/misc/Kconfig            |   1 +
> > >  drivers/misc/Makefile           |   1 +
> > >  drivers/misc/throttler/Kconfig  |  14 +
> > >  drivers/misc/throttler/Makefile |   1 +
> > >  drivers/misc/throttler/core.c   | 642 ++++++++++++++++++++++++++++++++
> > >  include/linux/throttler.h       |  11 +
> > >  7 files changed, 677 insertions(+)
> > >  create mode 100644 drivers/misc/throttler/Kconfig
> > >  create mode 100644 drivers/misc/throttler/Makefile
> > >  create mode 100644 drivers/misc/throttler/core.c
> > >  create mode 100644 include/linux/throttler.h
> > > 

...

> > > diff --git a/drivers/misc/throttler/Kconfig b/drivers/misc/throttler/Kconfig
> > > new file mode 100644
> > > index 000000000000..e561f1df5085
> > > --- /dev/null
> > > +++ b/drivers/misc/throttler/Kconfig
> > > @@ -0,0 +1,14 @@
> > > +# SPDX-License-Identifier: GPL-2.0
> > > +
> > > +menuconfig THROTTLER
> > > +	bool "Throttler support"
> > > +	depends on OF
> > > +	select CPU_FREQ
> > > +	select PM_DEVFREQ
> > 
> > I'm curious whether we're actually truly compile-time dependent on
> > {CPU,DEV}FREQ? It seems like those subsystems mostly stub out stuff so
> > they fall back to no-ops if not built in.
> >
> > I know that's not very useful for your existing throttler, since it
> > wouldn't be very effective if one or both were disabled.
> 
> The idea is not to depend on both options being enabled, since
> throttling of one type might be all that is needed.

OK, then if you fix the build errors...don't do these 'select's here?

> As the build bot pointed out cpufreq doesn't stub out all functions.
> Probably the cleanest way to work around this is to define a stub for
> cpufreq_update_policy() in the throttler when CONFIG_CPU_FREQ is not
> defined.

Might make sense.

Also, how is it that CONFIG_CPU_FREQ managed to not be defined, even
though you 'select'ed it? Was the kbuild error on some oddball
architecture that doesn't support CPU_FREQ?

> > > diff --git a/drivers/misc/throttler/core.c b/drivers/misc/throttler/core.c
> > > new file mode 100644
> > > index 000000000000..15b50c111032
> > > --- /dev/null
> > > +++ b/drivers/misc/throttler/core.c
> > > ...
> > > +// #define DEBUG_THROTTLER
> > 
> > Did you mean to leave your debug code in? Seems like you have some
> > related dead code under #ifdefs.
> 
> Yes, I left it in intentionally.
> 
> > (If you do want this, maybe it'd be better under debugfs, until somebody
> > really wants to formalize and document it.)
> 
> Since it is code that is never enabled in an official kernel build I
> found it simpler to use sysfs with a direct association with the
> device, instead of having <debugfs>/throttler/<throttler_name>/level.
> 
> If folks have strong opinions about using debugfs or not having the
> debug code at all I'm fine with changing/dropping it.

If you ever expect this to actually get merged, I'd think we should go
with one way or the other. Dead code is not appreciated in mainline, so
either make it something people can actually have a chance of using
(e.g., a CONFIG_* option or debugfs), or else drop it.

> > > +static int thr_init_freq_table(struct throttler *thr, struct device *opp_dev,
> > > +			       struct thr_freq_table *ft)
> > > +{
> > > +	struct device_node *np_opp_desc, *np_opp;
> > > +	int nchilds;
> > > +	uint32_t *freqs;
> > > +	int nfreqs = 0;
> > > +	int err = 0;
> > > +
> > > +	np_opp_desc = dev_pm_opp_of_get_opp_desc_node(opp_dev);
> > > +	if (!np_opp_desc)
> > > +		return -EINVAL;
> > > +
> > > +	nchilds = of_get_child_count(np_opp_desc);
> > > +	if (!nchilds) {
> > > +		err = -EINVAL;
> > > +		goto out_node_put;
> > > +	}
> > > +
> > > +	freqs = kzalloc(nchilds * sizeof(uint32_t), GFP_KERNEL);
> > > +	if (!freqs) {
> > > +		err = -ENOMEM;
> > > +		goto out_node_put;
> > > +	}
> > > +
> > > +	/* determine which OPPs are used by this throttler (if any) */
> > > +	for_each_child_of_node(np_opp_desc, np_opp) {
> > > +		int num_thr;
> > > +		int i;
> > > +
> > > +		num_thr = of_property_count_u32_elems(np_opp, "opp-throttlers");
> > > +		if (num_thr < 0)
> > > +			continue;
> > > +
> > > +		for (i = 0; i < num_thr; i++) {
> > > +			struct device_node *np_thr;
> > > +			struct platform_device *pdev;
> > > +
> > > +			np_thr = of_parse_phandle(np_opp, "opp-throttlers", i);
> > > +			if (!np_thr) {
> > > +				dev_err(thr->dev,
> > > +					"failed to parse phandle %d: %s\n", i,
> > > +					np_opp->full_name);
> > > +				continue;
> > > +			}
> > > +
> > > +			pdev = of_find_device_by_node(np_thr);
> > > +			if (!pdev) {
> > > +				dev_err(thr->dev,
> > > +					"could not find throttler dev: %s\n",
> > > +					 np_thr->full_name);
> > > +				of_node_put(np_thr);
> > > +				continue;
> > > +			}
> > > +
> > > +			/* OPP is used by this throttler */
> > > +			if (&pdev->dev == thr->dev) {
> > 
> > So you're assuming that all throttlers are platform devices? Seems
> > slightly shaky; I could easily imagine a similar device that's a SPI or
> > I2C device.
> 
> As of now that's the only existing throttler. Adding handling for
> throttlers on other buses that might never be implemented seems
> premature. If throttlers with other bus types are added the core
> code can be extended to support this using
> of_find_i2c_device_by_node(), of_find_spi_device_by_node(), etc

I suppose...but it feels like there should be a better way to do this
that isn't specific to a particular bus.

If you're not going to fix this, then maybe you should force callers to
pass a plaform_device instead of device, to make it extra clear that
other devices are not supported.

Brian
Matthias Kaehlcke June 13, 2018, 1:48 a.m. UTC | #5
On Tue, Jun 12, 2018 at 01:00:14PM -0700, Brian Norris wrote:
> Hi,
> 
> On Tue, Jun 12, 2018 at 10:11:40AM -0700, Matthias Kaehlcke wrote:
> > On Mon, Jun 11, 2018 at 06:49:13PM -0700, Brian Norris wrote:
> > > On Thu, Jun 07, 2018 at 11:12:12AM -0700, Matthias Kaehlcke wrote:
> > > > The purpose of the throttler is to provide support for non-thermal
> > > > throttling. Throttling is triggered by external event, e.g. the
> > > > detection of a high battery discharge current, close to the OCP limit
> > > > of the battery. The throttler is only in charge of the throttling, not
> > > > the monitoring, which is done by another (possibly platform specific)
> > > > driver.
> > > > 
> > > > Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
> > > > ---
> > > > Changes in v2:
> > > > - completely reworked the driver to support configuration through OPPs, which
> > > >   requires a more dynamic handling
> > > > - added sysfs attribute to set the level for debugging and testing
> > > > - Makefile: depend on Kconfig option to traverse throttler directory
> > > > - Kconfig: removed 'default n'
> > > > - added SPDX line instead of license boiler-plate
> > > > - added entry to MAINTAINERS file
> > > > 
> > > > 
> > > >  MAINTAINERS                     |   7 +
> > > >  drivers/misc/Kconfig            |   1 +
> > > >  drivers/misc/Makefile           |   1 +
> > > >  drivers/misc/throttler/Kconfig  |  14 +
> > > >  drivers/misc/throttler/Makefile |   1 +
> > > >  drivers/misc/throttler/core.c   | 642 ++++++++++++++++++++++++++++++++
> > > >  include/linux/throttler.h       |  11 +
> > > >  7 files changed, 677 insertions(+)
> > > >  create mode 100644 drivers/misc/throttler/Kconfig
> > > >  create mode 100644 drivers/misc/throttler/Makefile
> > > >  create mode 100644 drivers/misc/throttler/core.c
> > > >  create mode 100644 include/linux/throttler.h
> > > > 
> 
> ...
> 
> > > > diff --git a/drivers/misc/throttler/Kconfig b/drivers/misc/throttler/Kconfig
> > > > new file mode 100644
> > > > index 000000000000..e561f1df5085
> > > > --- /dev/null
> > > > +++ b/drivers/misc/throttler/Kconfig
> > > > @@ -0,0 +1,14 @@
> > > > +# SPDX-License-Identifier: GPL-2.0
> > > > +
> > > > +menuconfig THROTTLER
> > > > +	bool "Throttler support"
> > > > +	depends on OF
> > > > +	select CPU_FREQ
> > > > +	select PM_DEVFREQ
> > > 
> > > I'm curious whether we're actually truly compile-time dependent on
> > > {CPU,DEV}FREQ? It seems like those subsystems mostly stub out stuff so
> > > they fall back to no-ops if not built in.
> > >
> > > I know that's not very useful for your existing throttler, since it
> > > wouldn't be very effective if one or both were disabled.
> > 
> > The idea is not to depend on both options being enabled, since
> > throttling of one type might be all that is needed.
> 
> OK, then if you fix the build errors...don't do these 'select's here?

Ok, I'll remove them

> > As the build bot pointed out cpufreq doesn't stub out all functions.
> > Probably the cleanest way to work around this is to define a stub for
> > cpufreq_update_policy() in the throttler when CONFIG_CPU_FREQ is not
> > defined.
> 
> Might make sense.
> 
> Also, how is it that CONFIG_CPU_FREQ managed to not be defined, even
> though you 'select'ed it? Was the kbuild error on some oddball
> architecture that doesn't support CPU_FREQ?

The build error occured with 'openrisc', from a quick grep in
drivers/cpufreq it seems indeed that there is no driver for this
architecture.

> > > > diff --git a/drivers/misc/throttler/core.c b/drivers/misc/throttler/core.c
> > > > new file mode 100644
> > > > index 000000000000..15b50c111032
> > > > --- /dev/null
> > > > +++ b/drivers/misc/throttler/core.c
> > > > ...
> > > > +// #define DEBUG_THROTTLER
> > > 
> > > Did you mean to leave your debug code in? Seems like you have some
> > > related dead code under #ifdefs.
> > 
> > Yes, I left it in intentionally.
> > 
> > > (If you do want this, maybe it'd be better under debugfs, until somebody
> > > really wants to formalize and document it.)
> > 
> > Since it is code that is never enabled in an official kernel build I
> > found it simpler to use sysfs with a direct association with the
> > device, instead of having <debugfs>/throttler/<throttler_name>/level.
> > 
> > If folks have strong opinions about using debugfs or not having the
> > debug code at all I'm fine with changing/dropping it.
> 
> If you ever expect this to actually get merged, I'd think we should go
> with one way or the other. Dead code is not appreciated in mainline, so
> either make it something people can actually have a chance of using
> (e.g., a CONFIG_* option or debugfs), or else drop it.

Ok, will change to debugfs with CONFIG_* option.

> > > > +static int thr_init_freq_table(struct throttler *thr, struct device *opp_dev,
> > > > +			       struct thr_freq_table *ft)
> > > > +{
> > > > +	struct device_node *np_opp_desc, *np_opp;
> > > > +	int nchilds;
> > > > +	uint32_t *freqs;
> > > > +	int nfreqs = 0;
> > > > +	int err = 0;
> > > > +
> > > > +	np_opp_desc = dev_pm_opp_of_get_opp_desc_node(opp_dev);
> > > > +	if (!np_opp_desc)
> > > > +		return -EINVAL;
> > > > +
> > > > +	nchilds = of_get_child_count(np_opp_desc);
> > > > +	if (!nchilds) {
> > > > +		err = -EINVAL;
> > > > +		goto out_node_put;
> > > > +	}
> > > > +
> > > > +	freqs = kzalloc(nchilds * sizeof(uint32_t), GFP_KERNEL);
> > > > +	if (!freqs) {
> > > > +		err = -ENOMEM;
> > > > +		goto out_node_put;
> > > > +	}
> > > > +
> > > > +	/* determine which OPPs are used by this throttler (if any) */
> > > > +	for_each_child_of_node(np_opp_desc, np_opp) {
> > > > +		int num_thr;
> > > > +		int i;
> > > > +
> > > > +		num_thr = of_property_count_u32_elems(np_opp, "opp-throttlers");
> > > > +		if (num_thr < 0)
> > > > +			continue;
> > > > +
> > > > +		for (i = 0; i < num_thr; i++) {
> > > > +			struct device_node *np_thr;
> > > > +			struct platform_device *pdev;
> > > > +
> > > > +			np_thr = of_parse_phandle(np_opp, "opp-throttlers", i);
> > > > +			if (!np_thr) {
> > > > +				dev_err(thr->dev,
> > > > +					"failed to parse phandle %d: %s\n", i,
> > > > +					np_opp->full_name);
> > > > +				continue;
> > > > +			}
> > > > +
> > > > +			pdev = of_find_device_by_node(np_thr);
> > > > +			if (!pdev) {
> > > > +				dev_err(thr->dev,
> > > > +					"could not find throttler dev: %s\n",
> > > > +					 np_thr->full_name);
> > > > +				of_node_put(np_thr);
> > > > +				continue;
> > > > +			}
> > > > +
> > > > +			/* OPP is used by this throttler */
> > > > +			if (&pdev->dev == thr->dev) {
> > > 
> > > So you're assuming that all throttlers are platform devices? Seems
> > > slightly shaky; I could easily imagine a similar device that's a SPI or
> > > I2C device.
> > 
> > As of now that's the only existing throttler. Adding handling for
> > throttlers on other buses that might never be implemented seems
> > premature. If throttlers with other bus types are added the core
> > code can be extended to support this using
> > of_find_i2c_device_by_node(), of_find_spi_device_by_node(), etc
> 
> I suppose...but it feels like there should be a better way to do this
> that isn't specific to a particular bus.

There is actually a better option, I was so focussed on the
of_find_device_by_node() way that I didn't see the obvious solution
right away:

We can just check if 'np_thr == thr->dev->of_node' ...

Thanks for pushing me to give it another look!
diff mbox

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 92e47b5b0480..f9550e5680ce 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13938,6 +13938,13 @@  T:	git git://repo.or.cz/linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git
 S:	Maintained
 F:	drivers/platform/x86/thinkpad_acpi.c
 
+THROTTLER DRIVERS
+M:	Matthias Kaehlcke <mka@chromium.org>
+L:	linux-pm@vger.kernel.org
+S:	Maintained
+F:	drivers/misc/throttler/
+F:	include/linux/throttler.h
+
 THUNDERBOLT DRIVER
 M:	Andreas Noever <andreas.noever@gmail.com>
 M:	Michael Jamet <michael.jamet@intel.com>
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 5d713008749b..691d9625d83c 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -513,4 +513,5 @@  source "drivers/misc/echo/Kconfig"
 source "drivers/misc/cxl/Kconfig"
 source "drivers/misc/ocxl/Kconfig"
 source "drivers/misc/cardreader/Kconfig"
+source "drivers/misc/throttler/Kconfig"
 endmenu
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 20be70c3f118..b549ccad5215 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -57,3 +57,4 @@  obj-$(CONFIG_ASPEED_LPC_SNOOP)	+= aspeed-lpc-snoop.o
 obj-$(CONFIG_PCI_ENDPOINT_TEST)	+= pci_endpoint_test.o
 obj-$(CONFIG_OCXL)		+= ocxl/
 obj-$(CONFIG_MISC_RTSX)		+= cardreader/
+obj-$(CONFIG_THROTTLER)		+= throttler/
diff --git a/drivers/misc/throttler/Kconfig b/drivers/misc/throttler/Kconfig
new file mode 100644
index 000000000000..e561f1df5085
--- /dev/null
+++ b/drivers/misc/throttler/Kconfig
@@ -0,0 +1,14 @@ 
+# SPDX-License-Identifier: GPL-2.0
+
+menuconfig THROTTLER
+	bool "Throttler support"
+	depends on OF
+	select CPU_FREQ
+	select PM_DEVFREQ
+	help
+	  This option enables core support for non-thermal throttling of CPUs
+	  and devfreq devices.
+
+	  Note that you also need a event monitor module usually called
+	  *_throttler.
+
diff --git a/drivers/misc/throttler/Makefile b/drivers/misc/throttler/Makefile
new file mode 100644
index 000000000000..c8d920cee315
--- /dev/null
+++ b/drivers/misc/throttler/Makefile
@@ -0,0 +1 @@ 
+obj-$(CONFIG_THROTTLER)		+= core.o
diff --git a/drivers/misc/throttler/core.c b/drivers/misc/throttler/core.c
new file mode 100644
index 000000000000..15b50c111032
--- /dev/null
+++ b/drivers/misc/throttler/core.c
@@ -0,0 +1,642 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Core code for non-thermal throttling
+ *
+ * Copyright (C) 2018 Google, Inc.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/cpumask.h>
+#include <linux/devfreq.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pm_opp.h>
+#include <linux/sort.h>
+#include <linux/slab.h>
+#include <linux/throttler.h>
+
+/*
+ * Non-thermal throttling: throttling of system components in response to
+ * external events (e.g. high battery discharge current).
+ *
+ * The throttler supports throttling through cpufreq and devfreq. Multiple
+ * levels of throttling can be configured. At level 0 no throttling is
+ * active on behalf of the throttler, for values > 0 throttling is typically
+ * configured to be increasingly aggressive with each level.
+ * The number of throttling levels is not limited by the throttler (though
+ * it is likely limited by the throttling devices). It is not necessary to
+ * configure the same number of levels for all throttling devices. If the
+ * requested throttling level for a device is higher than the maximum level
+ * of the device the throttler will select the maximum throttling level of
+ * the device.
+ *
+ * Non-thermal throttling is split in two parts:
+ *
+ * - throttler core
+ *   - parses the thermal policy
+ *   - applies throttling settings for a requested level of throttling
+ *
+ * - event monitor driver
+ *   - monitors events that trigger throttling
+ *   - determines the throttling level (often limited to on/off)
+ *   - asks throttler core to apply throttling settings
+ *
+ * It is possible for a system to have more than one throttler and the
+ * throttlers may make use of the same throttling devices, in case of
+ * conflicting settings for a device the more aggressive values will be
+ * applied.
+ *
+ */
+
+#define ci_to_throttler(ci) \
+	container_of(ci, struct throttler, devfreq.class_iface)
+
+// #define DEBUG_THROTTLER
+
+struct thr_freq_table {
+	uint32_t *freqs;
+	int n_entries;
+};
+
+struct cpufreq_thrdev {
+	uint32_t cpu;
+	struct thr_freq_table freq_table;
+	struct list_head node;
+};
+
+struct devfreq_thrdev {
+	struct devfreq *devfreq;
+	struct thr_freq_table freq_table;
+	struct throttler *thr;
+	struct notifier_block nb;
+	struct list_head node;
+};
+
+struct __thr_cpufreq {
+	struct list_head list;
+	cpumask_t cm_initialized;
+	cpumask_t cm_ignore;
+	struct notifier_block nb;
+};
+
+struct __thr_devfreq {
+	struct list_head list;
+	struct class_interface class_iface;
+};
+
+struct throttler {
+	struct device *dev;
+	int level;
+	struct __thr_cpufreq cpufreq;
+	struct __thr_devfreq devfreq;
+	struct mutex lock;
+};
+
+static inline int cmp_freqs(const void *a, const void *b)
+{
+	const uint32_t *pa = a, *pb = b;
+
+	if (*pa < *pb)
+		return 1;
+	else if (*pa > *pb)
+		return -1;
+
+	return 0;
+}
+
+static int thr_handle_devfreq_event(struct notifier_block *nb,
+				    unsigned long event, void *data);
+
+static unsigned long thr_get_throttling_freq(struct thr_freq_table *ft,
+					     int level)
+{
+	if (level == 0) {
+		WARN(true, "level == 0");
+		return ULONG_MAX;
+	}
+
+	if (level <= ft->n_entries)
+		return ft->freqs[level - 1];
+	else
+		return ft->freqs[ft->n_entries - 1];
+}
+
+static int thr_init_freq_table(struct throttler *thr, struct device *opp_dev,
+			       struct thr_freq_table *ft)
+{
+	struct device_node *np_opp_desc, *np_opp;
+	int nchilds;
+	uint32_t *freqs;
+	int nfreqs = 0;
+	int err = 0;
+
+	np_opp_desc = dev_pm_opp_of_get_opp_desc_node(opp_dev);
+	if (!np_opp_desc)
+		return -EINVAL;
+
+	nchilds = of_get_child_count(np_opp_desc);
+	if (!nchilds) {
+		err = -EINVAL;
+		goto out_node_put;
+	}
+
+	freqs = kzalloc(nchilds * sizeof(uint32_t), GFP_KERNEL);
+	if (!freqs) {
+		err = -ENOMEM;
+		goto out_node_put;
+	}
+
+	/* determine which OPPs are used by this throttler (if any) */
+	for_each_child_of_node(np_opp_desc, np_opp) {
+		int num_thr;
+		int i;
+
+		num_thr = of_property_count_u32_elems(np_opp, "opp-throttlers");
+		if (num_thr < 0)
+			continue;
+
+		for (i = 0; i < num_thr; i++) {
+			struct device_node *np_thr;
+			struct platform_device *pdev;
+
+			np_thr = of_parse_phandle(np_opp, "opp-throttlers", i);
+			if (!np_thr) {
+				dev_err(thr->dev,
+					"failed to parse phandle %d: %s\n", i,
+					np_opp->full_name);
+				continue;
+			}
+
+			pdev = of_find_device_by_node(np_thr);
+			if (!pdev) {
+				dev_err(thr->dev,
+					"could not find throttler dev: %s\n",
+					 np_thr->full_name);
+				of_node_put(np_thr);
+				continue;
+			}
+
+			/* OPP is used by this throttler */
+			if (&pdev->dev == thr->dev) {
+				u64 rate;
+
+				err = of_property_read_u64(np_opp, "opp-hz",
+							   &rate);
+				if (!err) {
+					freqs[nfreqs] = rate;
+					nfreqs++;
+				} else {
+					dev_err(thr->dev,
+						"opp-hz not found: %s\n",
+						np_opp->full_name);
+				}
+			}
+
+			of_node_put(np_thr);
+			put_device(&pdev->dev);
+		}
+	}
+
+	if (nfreqs > 0) {
+		/* sort frequencies in descending order */
+		sort(freqs, nfreqs, sizeof(*freqs), cmp_freqs, NULL);
+
+		ft->n_entries = nfreqs;
+		ft->freqs = devm_kzalloc(thr->dev,
+				  nfreqs * sizeof(*freqs), GFP_KERNEL);
+		if (!ft->freqs) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+
+		memcpy(ft->freqs, freqs, nfreqs * sizeof(*freqs));
+	} else {
+		err = -ENODEV;
+	}
+
+out_free:
+	kfree(freqs);
+
+out_node_put:
+	of_node_put(np_opp_desc);
+
+	return err;
+}
+
+static void thr_cpufreq_init(struct throttler *thr, int cpu)
+{
+	struct device *cpu_dev;
+	struct thr_freq_table ft;
+	struct cpufreq_thrdev *cpufreq_dev;
+	int err;
+
+	WARN_ON(!mutex_is_locked(&thr->lock));
+
+	cpu_dev = get_cpu_device(cpu);
+	if (!cpu_dev) {
+		dev_err_ratelimited(thr->dev, "failed to get CPU %d\n", cpu);
+		return;
+	}
+
+	err = thr_init_freq_table(thr, cpu_dev, &ft);
+	if (err) {
+		/* CPU is not throttled or initialization failed */
+		if (err != -ENODEV)
+			dev_err(thr->dev,
+				"failed to initialize CPU %d: %d", cpu, err);
+
+		cpumask_set_cpu(cpu, &thr->cpufreq.cm_ignore);
+		return;
+	}
+
+	cpufreq_dev = devm_kzalloc(thr->dev, sizeof(*cpufreq_dev), GFP_KERNEL);
+	if (!cpufreq_dev) {
+		dev_err(thr->dev, "%s: out of memory\n", __func__);
+		return;
+	}
+
+	cpufreq_dev->cpu = cpu;
+	memcpy(&cpufreq_dev->freq_table, &ft, sizeof(ft));
+	list_add_tail(&cpufreq_dev->node, &thr->cpufreq.list);
+
+	cpumask_set_cpu(cpu, &thr->cpufreq.cm_initialized);
+}
+
+static void thr_devfreq_init(struct device *dev, void *data)
+{
+	struct throttler *thr = data;
+	struct thr_freq_table ft;
+	struct devfreq_thrdev *dftd;
+	int err;
+
+	WARN_ON(!mutex_is_locked(&thr->lock));
+
+	err = thr_init_freq_table(thr, dev->parent, &ft);
+	if (err) {
+		if (err == -ENODEV)
+			/* devfreq device not throttled */
+			return;
+
+		dev_err(thr->dev,
+			"failed to init frequency table of device %s: %d",
+			dev_name(dev), err);
+		return;
+	}
+
+	dftd = devm_kzalloc(thr->dev, sizeof(*dftd), GFP_KERNEL);
+	if (!dftd) {
+		dev_err(thr->dev, "%s: out of memory\n", __func__);
+		return;
+	}
+
+	dftd->thr = thr;
+	dftd->devfreq = container_of(dev, struct devfreq, dev);
+	memcpy(&dftd->freq_table, &ft, sizeof(ft));
+
+	dftd->nb.notifier_call = thr_handle_devfreq_event;
+	err = devm_devfreq_register_notifier(thr->dev, dftd->devfreq,
+				     &dftd->nb, DEVFREQ_POLICY_NOTIFIER);
+	if (err < 0) {
+		dev_err(dev, "failed to register devfreq notifier\n");
+		devm_kfree(thr->dev, dftd);
+		return;
+	}
+
+	list_add_tail(&dftd->node, &thr->devfreq.list);
+}
+
+static int thr_handle_cpufreq_event(struct notifier_block *nb,
+				unsigned long event, void *data)
+{
+	struct throttler *thr =
+		container_of(nb, struct throttler, cpufreq.nb);
+	struct cpufreq_policy *policy = data;
+	struct cpufreq_thrdev *cftd;
+
+	if (event != CPUFREQ_ADJUST)
+		return 0;
+
+	mutex_lock(&thr->lock);
+
+	if (cpumask_test_cpu(policy->cpu, &thr->cpufreq.cm_ignore))
+		goto out;
+
+	if (!cpumask_test_cpu(policy->cpu, &thr->cpufreq.cm_initialized)) {
+		thr_cpufreq_init(thr, policy->cpu);
+
+		if (cpumask_test_cpu(policy->cpu, &thr->cpufreq.cm_ignore))
+			goto out;
+	}
+
+	/*
+	 * Can't do this check earlier, otherwise we might miss CPU policies
+	 * that are added after setup().
+	 */
+	if (thr->level == 0)
+		goto out;
+
+	list_for_each_entry(cftd, &thr->cpufreq.list, node) {
+		unsigned long clamp_freq;
+
+		if (cftd->cpu != policy->cpu)
+			continue;
+
+		clamp_freq = thr_get_throttling_freq(&cftd->freq_table,
+						     thr->level) / 1000;
+		if (clamp_freq < policy->max)
+			cpufreq_verify_within_limits(policy, 0, clamp_freq);
+	}
+
+out:
+	mutex_unlock(&thr->lock);
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Notifier called by devfreq. Can't acquire thr->lock since it might
+ * already be held by throttler_set_level(). It isn't necessary to
+ * acquire the lock for the following reasons:
+ *
+ * Only the devfreq_thrdev and thr->level are accessed in this function.
+ * The devfreq device won't go away (or change) during the execution of
+ * this function, since we are called from the devfreq core. Theoretically
+ * thr->level could change and we'd apply an outdated setting, however in
+ * this case the function would run again shortly after and apply the
+ * correct value.
+ */
+static int thr_handle_devfreq_event(struct notifier_block *nb,
+				    unsigned long event, void *data)
+{
+	struct devfreq_thrdev *dftd =
+		container_of(nb, struct devfreq_thrdev, nb);
+	struct throttler *thr = dftd->thr;
+	struct devfreq_policy *policy = data;
+	unsigned long clamp_freq;
+
+	if ((event != DEVFREQ_ADJUST) || (thr->level == 0))
+		return NOTIFY_DONE;
+
+	clamp_freq = thr_get_throttling_freq(&dftd->freq_table, thr->level);
+	if (clamp_freq < policy->max)
+		devfreq_verify_within_limits(policy, 0, clamp_freq);
+
+	return NOTIFY_DONE;
+}
+
+static void thr_cpufreq_update_policy(struct throttler *thr)
+{
+	struct cpufreq_thrdev *cftd;
+
+	WARN_ON(!mutex_is_locked(&thr->lock));
+
+	list_for_each_entry(cftd, &thr->cpufreq.list, node) {
+		struct cpufreq_policy *policy = cpufreq_cpu_get(cftd->cpu);
+
+		if (!policy) {
+			dev_warn(thr->dev,
+				 "CPU%d does have no cpufreq policy!\n",
+				 cftd->cpu);
+			continue;
+		}
+
+		/*
+		 * The lock isn't really needed in this function, the list
+		 * of cpufreq devices can be extended, but no items are
+		 * deleted during the lifetime of the throttler. Releasing
+		 * the lock is necessary since cpufreq_update_policy() ends
+		 * up calling thr_handle_cpufreq_event(), which needs to
+		 * acquire the lock.
+		 */
+		mutex_unlock(&thr->lock);
+		cpufreq_update_policy(cftd->cpu);
+		mutex_lock(&thr->lock);
+
+		cpufreq_cpu_put(policy);
+	}
+}
+
+static int thr_handle_devfreq_added(struct device *dev,
+				    struct class_interface *ci)
+{
+	struct throttler *thr = ci_to_throttler(ci);
+
+	mutex_lock(&thr->lock);
+	thr_devfreq_init(dev, thr);
+	mutex_unlock(&thr->lock);
+
+	return 0;
+}
+
+static void thr_handle_devfreq_removed(struct device *dev,
+				       struct class_interface *ci)
+{
+	struct devfreq_thrdev *dftd;
+	struct throttler *thr = ci_to_throttler(ci);
+
+	mutex_lock(&thr->lock);
+
+	list_for_each_entry(dftd, &thr->devfreq.list, node) {
+		if (dev == &dftd->devfreq->dev) {
+			list_del(&dftd->node);
+			devm_kfree(thr->dev, dftd->freq_table.freqs);
+			devm_kfree(thr->dev, dftd);
+			break;
+		}
+	}
+
+	mutex_unlock(&thr->lock);
+}
+
+void throttler_set_level(struct throttler *thr, int level)
+{
+	struct devfreq_thrdev *dftd;
+
+	if (level == thr->level)
+		return;
+
+	mutex_lock(&thr->lock);
+
+	dev_dbg(thr->dev, "throttling level: %d\n", level);
+	thr->level = level;
+
+	if (!list_empty(&thr->cpufreq.list))
+		thr_cpufreq_update_policy(thr);
+
+	list_for_each_entry(dftd, &thr->devfreq.list, node) {
+		mutex_lock(&dftd->devfreq->lock);
+		update_devfreq(dftd->devfreq);
+		mutex_unlock(&dftd->devfreq->lock);
+	}
+
+	mutex_unlock(&thr->lock);
+}
+EXPORT_SYMBOL_GPL(throttler_set_level);
+
+#ifdef DEBUG_THROTTLER
+
+static ssize_t thr_show_level(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	struct dev_ext_attribute *ea =
+		container_of(attr, struct dev_ext_attribute, attr);
+	struct throttler *thr = ea->var;
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", thr->level);
+}
+
+static ssize_t thr_store_level(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	struct dev_ext_attribute *ea =
+		container_of(attr, struct dev_ext_attribute, attr);
+	struct throttler *thr = ea->var;
+	int level;
+
+	if (!kstrtoint(buf, 10, &level))
+		return -EINVAL;
+
+	throttler_set_level(thr, level);
+
+	return count;
+}
+
+#endif /* DEBUG_THROTTLER */
+
+struct throttler *throttler_setup(struct device *dev)
+{
+	struct throttler *thr;
+	struct device_node *np = dev->of_node;
+	struct class_interface *ci;
+#ifdef DEBUG_THROTTLER
+	struct dev_ext_attribute *attr;
+#endif
+	int cpu;
+	int err;
+
+	if (!np)
+		/* should never happen */
+		return ERR_PTR(-EINVAL);
+
+	thr = devm_kzalloc(dev, sizeof(*thr), GFP_KERNEL);
+	if (!thr)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&thr->lock);
+	thr->dev = dev;
+
+	cpumask_clear(&thr->cpufreq.cm_ignore);
+	cpumask_clear(&thr->cpufreq.cm_initialized);
+
+	INIT_LIST_HEAD(&thr->cpufreq.list);
+	INIT_LIST_HEAD(&thr->devfreq.list);
+
+	thr->cpufreq.nb.notifier_call = thr_handle_cpufreq_event;
+	err = cpufreq_register_notifier(&thr->cpufreq.nb,
+					CPUFREQ_POLICY_NOTIFIER);
+	if (err < 0) {
+		dev_err(dev, "failed to register cpufreq notifier\n");
+		return ERR_PTR(err);
+	}
+
+	/*
+	 * The CPU throttling configuration is parsed at runtime, when the
+	 * cpufreq policy notifier is called for a CPU that hasn't been
+	 * initialized yet.
+	 *
+	 * This is done for two reasons:
+	 * -  when the throttler is probed the CPU might not yet have a policy
+	 * -  CPUs that were offline at probe time might be hotplugged
+	 *
+	 * The notifier is called then the policy is added/set
+	 */
+	for_each_online_cpu(cpu) {
+		struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+
+		if (!policy)
+			continue;
+
+		cpufreq_update_policy(cpu);
+		cpufreq_cpu_put(policy);
+	}
+
+	/*
+	 * devfreq devices can be added and removed at runtime, hence they
+	 * must also be handled dynamically. The class_interface notifies us
+	 * whenever a device is added or removed. When the interface is
+	 * registered ci->add_dev() is called for all existing devfreq
+	 * devices.
+	 */
+	ci = &thr->devfreq.class_iface;
+	ci->class = devfreq_class;
+	ci->add_dev = thr_handle_devfreq_added;
+	ci->remove_dev = thr_handle_devfreq_removed;
+
+	err = class_interface_register(ci);
+	if (err) {
+		dev_err(thr->dev,
+			"failed to register devfreq class interface: %d\n",
+			err);
+		cpufreq_unregister_notifier(&thr->cpufreq.nb,
+					    CPUFREQ_POLICY_NOTIFIER);
+		return ERR_PTR(err);
+	}
+
+#ifdef DEBUG_THROTTLER
+	attr = devm_kzalloc(thr->dev, sizeof(*attr), GFP_KERNEL);
+	if (!attr)
+		goto skip_attr;
+
+	attr->attr.attr.name = "level";
+	sysfs_attr_init(&attr->attr);
+	attr->attr.attr.mode = 0644;
+	attr->attr.show	= thr_show_level;
+	attr->attr.store = thr_store_level;
+	attr->var = thr;
+
+	device_create_file(thr->dev, &attr->attr);
+
+skip_attr:
+#endif
+
+	return thr;
+}
+EXPORT_SYMBOL_GPL(throttler_setup);
+
+void throttler_teardown(struct throttler *thr)
+{
+	struct devfreq_thrdev *dftd;
+	int level;
+
+	mutex_lock(&thr->lock);
+
+	level = thr->level;
+	thr->level = 0;
+
+	class_interface_unregister(&thr->devfreq.class_iface);
+
+	if (level) {
+		/* unthrottle CPUs */
+		if (!list_empty(&thr->cpufreq.list))
+			thr_cpufreq_update_policy(thr);
+
+		/* unthrottle devfreq devices */
+		list_for_each_entry(dftd, &thr->devfreq.list, node) {
+			mutex_lock(&dftd->devfreq->lock);
+			update_devfreq(dftd->devfreq);
+			mutex_unlock(&dftd->devfreq->lock);
+		}
+	}
+
+	cpufreq_unregister_notifier(&thr->cpufreq.nb,
+				    CPUFREQ_POLICY_NOTIFIER);
+
+	mutex_unlock(&thr->lock);
+}
+EXPORT_SYMBOL_GPL(throttler_teardown);
diff --git a/include/linux/throttler.h b/include/linux/throttler.h
new file mode 100644
index 000000000000..3c06054ab39c
--- /dev/null
+++ b/include/linux/throttler.h
@@ -0,0 +1,11 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_THROTTLER_H__
+#define __LINUX_THROTTLER_H__
+
+struct throttler;
+
+extern struct throttler *throttler_setup(struct device *dev);
+extern void throttler_teardown(struct throttler *thr);
+extern void throttler_set_level(struct throttler *thr, int level);
+
+#endif /* __LINUX_THROTTLER_H__ */