diff mbox series

[3/3] thermal: qcom: tsens: Implement re-initialization workaround quirk

Message ID 20220701145815.2037993-4-bhupesh.sharma@linaro.org (mailing list archive)
State New, archived
Delegated to: Daniel Lezcano
Headers show
Series Add support for tsens controller reinit via trustzone | expand

Commit Message

Bhupesh Sharma July 1, 2022, 2:58 p.m. UTC
Since for some QCoM tsens controllers, its suggested to
monitor the controller health periodically and in case an
issue is detected, to re-initialize the tsens controller
via trustzone, add the support for the same in the
qcom tsens driver.

Note that Once the tsens controller is reset using scm call,
all SROT and TM region registers will enter the reset mode.

While all the SROT registers will be re-programmed and
re-enabled in trustzone prior to the scm call exit, the TM
region registers will not re-initialized in trustzone and thus
need to be handled by the tsens driver.

Cc: Amit Kucheria <amitk@kernel.org>
Cc: Thara Gopinath <thara.gopinath@gmail.com>
Cc: linux-pm@vger.kernel.org
Cc: linux-arm-msm@vger.kernel.org
Signed-off-by: Bhupesh Sharma <bhupesh.sharma@linaro.org>
---
 drivers/thermal/qcom/tsens-v2.c |   3 +
 drivers/thermal/qcom/tsens.c    | 237 +++++++++++++++++++++++++++++++-
 drivers/thermal/qcom/tsens.h    |   6 +
 3 files changed, 239 insertions(+), 7 deletions(-)

Comments

kernel test robot July 8, 2022, 11:40 a.m. UTC | #1
Hi Bhupesh,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on rafael-pm/thermal]
[also build test ERROR on linus/master v5.19-rc5 next-20220707]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Bhupesh-Sharma/Add-support-for-tsens-controller-reinit-via-trustzone/20220701-230113
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git thermal
config: arm64-randconfig-r015-20220707 (https://download.01.org/0day-ci/archive/20220708/202207081955.SXcfKpLo-lkp@intel.com/config)
compiler: aarch64-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/32929e13eb338e76b714bb8b4805899e2857734f
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Bhupesh-Sharma/Add-support-for-tsens-controller-reinit-via-trustzone/20220701-230113
        git checkout 32929e13eb338e76b714bb8b4805899e2857734f
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=arm64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   aarch64-linux-ld: Unexpected GOT/PLT entries detected!
   aarch64-linux-ld: Unexpected run-time procedure linkages detected!
   aarch64-linux-ld: ID map text too big or misaligned
   aarch64-linux-ld: drivers/thermal/qcom/tsens.o: in function `tsens_probe':
>> drivers/thermal/qcom/tsens.c:1337: undefined reference to `qcom_scm_is_available'
   aarch64-linux-ld: drivers/thermal/qcom/tsens.o: in function `get_temp_tsens_valid':
>> drivers/thermal/qcom/tsens.c:714: undefined reference to `qcom_scm_tsens_reinit'


vim +1337 drivers/thermal/qcom/tsens.c

  1293	
  1294	static int tsens_probe(struct platform_device *pdev)
  1295	{
  1296		int ret, i;
  1297		struct device *dev;
  1298		struct device_node *np;
  1299		struct tsens_priv *priv;
  1300		const struct tsens_plat_data *data;
  1301		const struct of_device_id *id;
  1302		u32 num_sensors;
  1303	
  1304		if (pdev->dev.of_node)
  1305			dev = &pdev->dev;
  1306		else
  1307			dev = pdev->dev.parent;
  1308	
  1309		np = dev->of_node;
  1310	
  1311		id = of_match_node(tsens_table, np);
  1312		if (id)
  1313			data = id->data;
  1314		else
  1315			data = &data_8960;
  1316	
  1317		num_sensors = data->num_sensors;
  1318	
  1319		if (np)
  1320			of_property_read_u32(np, "#qcom,sensors", &num_sensors);
  1321	
  1322		if (num_sensors <= 0) {
  1323			dev_err(dev, "%s: invalid number of sensors\n", __func__);
  1324			return -EINVAL;
  1325		}
  1326	
  1327		priv = devm_kzalloc(dev,
  1328				     struct_size(priv, sensor, num_sensors),
  1329				     GFP_KERNEL);
  1330		if (!priv)
  1331			return -ENOMEM;
  1332	
  1333		priv->dev = dev;
  1334		priv->num_sensors = num_sensors;
  1335		priv->needs_reinit_wa = data->needs_reinit_wa;
  1336	
> 1337		if (priv->needs_reinit_wa && !qcom_scm_is_available())
  1338			return -EPROBE_DEFER;
  1339	
  1340		if (priv->needs_reinit_wa) {
  1341			priv->reinit_wa_worker = alloc_workqueue("tsens_reinit_work",
  1342								 WQ_HIGHPRI, 0);
  1343			if (!priv->reinit_wa_worker)
  1344				return -ENOMEM;
  1345	
  1346			INIT_WORK(&priv->reinit_wa_notify, tsens_reinit_worker_notify);
  1347		}
  1348	
  1349		priv->ops = data->ops;
  1350		for (i = 0;  i < priv->num_sensors; i++) {
  1351			if (data->hw_ids)
  1352				priv->sensor[i].hw_id = data->hw_ids[i];
  1353			else
  1354				priv->sensor[i].hw_id = i;
  1355		}
  1356		priv->feat = data->feat;
  1357		priv->fields = data->fields;
  1358	
  1359		platform_set_drvdata(pdev, priv);
  1360	
  1361		if (!priv->ops || !priv->ops->init || !priv->ops->get_temp) {
  1362			ret = -EINVAL;
  1363			goto free_wq;
  1364		}
  1365	
  1366		ret = priv->ops->init(priv);
  1367		if (ret < 0) {
  1368			dev_err(dev, "%s: init failed\n", __func__);
  1369			goto free_wq;
  1370		}
  1371	
  1372		if (priv->ops->calibrate) {
  1373			ret = priv->ops->calibrate(priv);
  1374			if (ret < 0) {
  1375				if (ret != -EPROBE_DEFER)
  1376					dev_err(dev, "%s: calibration failed\n", __func__);
  1377	
  1378				goto free_wq;
  1379			}
  1380		}
  1381	
  1382		ret = tsens_register(priv);
  1383		if (ret < 0) {
  1384			dev_err(dev, "%s: registration failed\n", __func__);
  1385			goto free_wq;
  1386		}
  1387	
  1388		list_add_tail(&priv->list, &tsens_device_list);
  1389		return 0;
  1390	
  1391	free_wq:
  1392		destroy_workqueue(priv->reinit_wa_worker);
  1393		return ret;
  1394	}
  1395
Bhupesh Sharma July 12, 2022, 11:04 a.m. UTC | #2
Hi,

On Fri, 8 Jul 2022 at 17:10, kernel test robot <lkp@intel.com> wrote:
>
> Hi Bhupesh,
>
> Thank you for the patch! Yet something to improve:
>
> [auto build test ERROR on rafael-pm/thermal]
> [also build test ERROR on linus/master v5.19-rc5 next-20220707]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/Bhupesh-Sharma/Add-support-for-tsens-controller-reinit-via-trustzone/20220701-230113
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git thermal
> config: arm64-randconfig-r015-20220707 (https://download.01.org/0day-ci/archive/20220708/202207081955.SXcfKpLo-lkp@intel.com/config)
> compiler: aarch64-linux-gcc (GCC) 11.3.0
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # https://github.com/intel-lab-lkp/linux/commit/32929e13eb338e76b714bb8b4805899e2857734f
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Bhupesh-Sharma/Add-support-for-tsens-controller-reinit-via-trustzone/20220701-230113
>         git checkout 32929e13eb338e76b714bb8b4805899e2857734f
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=arm64 SHELL=/bin/bash
>
> If you fix the issue, kindly add following tag where applicable
> Reported-by: kernel test robot <lkp@intel.com>
>
> All errors (new ones prefixed by >>):
>
>    aarch64-linux-ld: Unexpected GOT/PLT entries detected!
>    aarch64-linux-ld: Unexpected run-time procedure linkages detected!
>    aarch64-linux-ld: ID map text too big or misaligned
>    aarch64-linux-ld: drivers/thermal/qcom/tsens.o: in function `tsens_probe':
> >> drivers/thermal/qcom/tsens.c:1337: undefined reference to `qcom_scm_is_available'
>    aarch64-linux-ld: drivers/thermal/qcom/tsens.o: in function `get_temp_tsens_valid':
> >> drivers/thermal/qcom/tsens.c:714: undefined reference to `qcom_scm_tsens_reinit'
>
>
> vim +1337 drivers/thermal/qcom/tsens.c
>
>   1293
>   1294  static int tsens_probe(struct platform_device *pdev)
>   1295  {
>   1296          int ret, i;
>   1297          struct device *dev;
>   1298          struct device_node *np;
>   1299          struct tsens_priv *priv;
>   1300          const struct tsens_plat_data *data;
>   1301          const struct of_device_id *id;
>   1302          u32 num_sensors;
>   1303
>   1304          if (pdev->dev.of_node)
>   1305                  dev = &pdev->dev;
>   1306          else
>   1307                  dev = pdev->dev.parent;
>   1308
>   1309          np = dev->of_node;
>   1310
>   1311          id = of_match_node(tsens_table, np);
>   1312          if (id)
>   1313                  data = id->data;
>   1314          else
>   1315                  data = &data_8960;
>   1316
>   1317          num_sensors = data->num_sensors;
>   1318
>   1319          if (np)
>   1320                  of_property_read_u32(np, "#qcom,sensors", &num_sensors);
>   1321
>   1322          if (num_sensors <= 0) {
>   1323                  dev_err(dev, "%s: invalid number of sensors\n", __func__);
>   1324                  return -EINVAL;
>   1325          }
>   1326
>   1327          priv = devm_kzalloc(dev,
>   1328                               struct_size(priv, sensor, num_sensors),
>   1329                               GFP_KERNEL);
>   1330          if (!priv)
>   1331                  return -ENOMEM;
>   1332
>   1333          priv->dev = dev;
>   1334          priv->num_sensors = num_sensors;
>   1335          priv->needs_reinit_wa = data->needs_reinit_wa;
>   1336
> > 1337          if (priv->needs_reinit_wa && !qcom_scm_is_available())
>   1338                  return -EPROBE_DEFER;
>   1339
>   1340          if (priv->needs_reinit_wa) {
>   1341                  priv->reinit_wa_worker = alloc_workqueue("tsens_reinit_work",
>   1342                                                           WQ_HIGHPRI, 0);
>   1343                  if (!priv->reinit_wa_worker)
>   1344                          return -ENOMEM;
>   1345
>   1346                  INIT_WORK(&priv->reinit_wa_notify, tsens_reinit_worker_notify);
>   1347          }
>   1348
>   1349          priv->ops = data->ops;
>   1350          for (i = 0;  i < priv->num_sensors; i++) {
>   1351                  if (data->hw_ids)
>   1352                          priv->sensor[i].hw_id = data->hw_ids[i];
>   1353                  else
>   1354                          priv->sensor[i].hw_id = i;
>   1355          }
>   1356          priv->feat = data->feat;
>   1357          priv->fields = data->fields;
>   1358
>   1359          platform_set_drvdata(pdev, priv);
>   1360
>   1361          if (!priv->ops || !priv->ops->init || !priv->ops->get_temp) {
>   1362                  ret = -EINVAL;
>   1363                  goto free_wq;
>   1364          }
>   1365
>   1366          ret = priv->ops->init(priv);
>   1367          if (ret < 0) {
>   1368                  dev_err(dev, "%s: init failed\n", __func__);
>   1369                  goto free_wq;
>   1370          }
>   1371
>   1372          if (priv->ops->calibrate) {
>   1373                  ret = priv->ops->calibrate(priv);
>   1374                  if (ret < 0) {
>   1375                          if (ret != -EPROBE_DEFER)
>   1376                                  dev_err(dev, "%s: calibration failed\n", __func__);
>   1377
>   1378                          goto free_wq;
>   1379                  }
>   1380          }
>   1381
>   1382          ret = tsens_register(priv);
>   1383          if (ret < 0) {
>   1384                  dev_err(dev, "%s: registration failed\n", __func__);
>   1385                  goto free_wq;
>   1386          }
>   1387
>   1388          list_add_tail(&priv->list, &tsens_device_list);
>   1389          return 0;
>   1390
>   1391  free_wq:
>   1392          destroy_workqueue(priv->reinit_wa_worker);
>   1393          return ret;
>   1394  }
>   1395

Thanks, I will fix this in v2.

Regards,
Bhupesh
Konrad Dybcio July 15, 2022, 2:56 p.m. UTC | #3
On 1.07.2022 16:58, Bhupesh Sharma wrote:
> Since for some QCoM tsens controllers, its suggested to
> monitor the controller health periodically and in case an
> issue is detected, to re-initialize the tsens controller
> via trustzone, add the support for the same in the
> qcom tsens driver.
> 
> Note that Once the tsens controller is reset using scm call,
> all SROT and TM region registers will enter the reset mode.
> 
> While all the SROT registers will be re-programmed and
> re-enabled in trustzone prior to the scm call exit, the TM
> region registers will not re-initialized in trustzone and thus
> need to be handled by the tsens driver.
> 
> Cc: Amit Kucheria <amitk@kernel.org>
> Cc: Thara Gopinath <thara.gopinath@gmail.com>
> Cc: linux-pm@vger.kernel.org
> Cc: linux-arm-msm@vger.kernel.org
> Signed-off-by: Bhupesh Sharma <bhupesh.sharma@linaro.org>
> Reported-by: kernel test robot <lkp@intel.com>
> ---
Hi, I think this should be also checked and applied on init. This
seems required for at least SM6375, as the controller starts (or
well, doesn't start...) in an unknown state and the driver does
not like it, as the TSENS_EN indicates it is disabled.
Downstream runs this right at probe..

Konrad
Bhupesh Sharma July 18, 2022, 6:34 a.m. UTC | #4
Hi Konrad,

On 7/15/22 8:26 PM, Konrad Dybcio <konrad.dybcio@somainline.org> wrote:
> 
> 
> On 1.07.2022 16:58, Bhupesh Sharma wrote:
> > Since for some QCoM tsens controllers, its suggested to
> > monitor the controller health periodically and in case an
> > issue is detected, to re-initialize the tsens controller
> > via trustzone, add the support for the same in the
> > qcom tsens driver.
> >
> > Note that Once the tsens controller is reset using scm call,
> > all SROT and TM region registers will enter the reset mode.
> >
> > While all the SROT registers will be re-programmed and
> > re-enabled in trustzone prior to the scm call exit, the TM
> > region registers will not re-initialized in trustzone and thus
> > need to be handled by the tsens driver.
> >
> > Cc: Amit Kucheria <amitk@kernel.org>
> > Cc: Thara Gopinath <thara.gopinath@gmail.com>
> > Cc: linux-pm@vger.kernel.org
> > Cc: linux-arm-msm@vger.kernel.org
> > Signed-off-by: Bhupesh Sharma <bhupesh.sharma@linaro.org>
> > Reported-by: kernel test robot <lkp@intel.com>
> > ---
> Hi, I think this should be also checked and applied on init. This
> seems required for at least SM6375, as the controller starts (or
> well, doesn't start...) in an unknown state and the driver does
> not like it, as the TSENS_EN indicates it is disabled.
> Downstream runs this right at probe..

Hmm.. very interesting. I was not aware of the SM6375 case, as for SM8150
the controller starts in a valid state but may require reinit during operation.

So, I did not use the downstream approach to do it right at _probe() and then
later while get_temp() is called.

Let me add that in v2. BTW do you want me to set the need_reinit_wa as true
for SM6375 as well, or would you like to add that with a followup-patch ?

Regards,
Bhupesh
Bjorn Andersson July 19, 2022, 3:30 a.m. UTC | #5
On Fri 01 Jul 09:58 CDT 2022, Bhupesh Sharma wrote:

> Since for some QCoM tsens controllers, its suggested to
> monitor the controller health periodically and in case an
> issue is detected, to re-initialize the tsens controller
> via trustzone, add the support for the same in the
> qcom tsens driver.
> 
> Note that Once the tsens controller is reset using scm call,
> all SROT and TM region registers will enter the reset mode.
> 
> While all the SROT registers will be re-programmed and
> re-enabled in trustzone prior to the scm call exit, the TM
> region registers will not re-initialized in trustzone and thus
> need to be handled by the tsens driver.
> 
> Cc: Amit Kucheria <amitk@kernel.org>
> Cc: Thara Gopinath <thara.gopinath@gmail.com>
> Cc: linux-pm@vger.kernel.org
> Cc: linux-arm-msm@vger.kernel.org
> Signed-off-by: Bhupesh Sharma <bhupesh.sharma@linaro.org>
> ---
>  drivers/thermal/qcom/tsens-v2.c |   3 +
>  drivers/thermal/qcom/tsens.c    | 237 +++++++++++++++++++++++++++++++-
>  drivers/thermal/qcom/tsens.h    |   6 +
>  3 files changed, 239 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/thermal/qcom/tsens-v2.c b/drivers/thermal/qcom/tsens-v2.c
> index 61d38a56d29a..9bb542f16482 100644
> --- a/drivers/thermal/qcom/tsens-v2.c
> +++ b/drivers/thermal/qcom/tsens-v2.c
> @@ -88,6 +88,9 @@ static const struct reg_field tsens_v2_regfields[MAX_REGFIELDS] = {
>  
>  	/* TRDY: 1=ready, 0=in progress */
>  	[TRDY] = REG_FIELD(TM_TRDY_OFF, 0, 0),
> +
> +	/* FIRST_ROUND_COMPLETE: 1=complete, 0=not complete */
> +	[FIRST_ROUND_COMPLETE] = REG_FIELD(TM_TRDY_OFF, 3, 3),
>  };
>  
>  static const struct tsens_ops ops_generic_v2 = {
> diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c
> index 97f4d4454f20..28d42ae0eb47 100644
> --- a/drivers/thermal/qcom/tsens.c
> +++ b/drivers/thermal/qcom/tsens.c
> @@ -7,6 +7,7 @@
>  #include <linux/debugfs.h>
>  #include <linux/err.h>
>  #include <linux/io.h>
> +#include <linux/qcom_scm.h>
>  #include <linux/module.h>
>  #include <linux/nvmem-consumer.h>
>  #include <linux/of.h>
> @@ -21,6 +22,8 @@
>  #include "../thermal_hwmon.h"
>  #include "tsens.h"
>  
> +LIST_HEAD(tsens_device_list);
> +
>  /**
>   * struct tsens_irq_data - IRQ status and temperature violations
>   * @up_viol:        upper threshold violated
> @@ -594,19 +597,159 @@ static void tsens_disable_irq(struct tsens_priv *priv)
>  	regmap_field_write(priv->rf[INT_EN], 0);
>  }
>  
> +static int tsens_reenable_hw_after_scm(struct tsens_priv *priv)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&priv->ul_lock, flags);
> +
> +	/* Re-enable watchdog, unmask the bark and
> +	 * disable cycle completion monitoring.
> +	 */
> +	regmap_field_write(priv->rf[WDOG_BARK_CLEAR], 1);
> +	regmap_field_write(priv->rf[WDOG_BARK_CLEAR], 0);
> +	regmap_field_write(priv->rf[WDOG_BARK_MASK], 0);
> +	regmap_field_write(priv->rf[CC_MON_MASK], 1);
> +
> +	/* Re-enable interrupts */
> +	tsens_enable_irq(priv);
> +
> +	spin_unlock_irqrestore(&priv->ul_lock, flags);
> +
> +	return 0;
> +}
> +
>  int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp)
>  {
> -	struct tsens_priv *priv = s->priv;
> +	struct tsens_priv *priv = s->priv, *priv_reinit;
>  	int hw_id = s->hw_id;
>  	u32 temp_idx = LAST_TEMP_0 + hw_id;
>  	u32 valid_idx = VALID_0 + hw_id;
>  	u32 valid;
> -	int ret;
> +	int ret, trdy, first_round, tsens_ret, sw_reg;
> +	unsigned long timeout;
> +	static atomic_t in_tsens_reinit;

This is a global state, I suggest you move it to the top of the file to
make that obvious.

>  
>  	/* VER_0 doesn't have VALID bit */
>  	if (tsens_version(priv) == VER_0)
>  		goto get_temp;
>  
> +	/* For some tsens controllers, its suggested to
> +	 * monitor the controller health periodically
> +	 * and in case an issue is detected to reinit
> +	 * tsens controller via trustzone.
> +	 */
> +	if (priv->needs_reinit_wa) {

I would suggest that you move all this entire block to a separate
function, maybe something:

int tsens_health_check_and_reinit()

> +		/* First check if TRDY is SET */
> +		timeout = jiffies + usecs_to_jiffies(TIMEOUT_US);
> +		do {
> +			ret = regmap_field_read(priv->rf[TRDY], &trdy);
> +			if (ret)
> +				goto err;
> +			if (!trdy)
> +				continue;
> +		} while (time_before(jiffies, timeout));

This looks like a regmap_field_read()

> +
> +		if (!trdy) {
> +			ret = regmap_field_read(priv->rf[FIRST_ROUND_COMPLETE], &first_round);
> +			if (ret)
> +				goto err;
> +
> +			if (!first_round) {
> +				if (atomic_read(&in_tsens_reinit)) {
> +					dev_dbg(priv->dev, "tsens re-init is in progress\n");
> +					ret = -EAGAIN;

Is it preferred to return -EAGAIN here, over just serializing this whole
thing using a mutex?

> +					goto err;
> +				}
> +
> +				/* Wait for 2 ms for tsens controller to recover */
> +				timeout = jiffies + msecs_to_jiffies(RESET_TIMEOUT_MS);
> +				do {
> +					ret = regmap_field_read(priv->rf[FIRST_ROUND_COMPLETE],
> +								&first_round);
> +					if (ret)
> +						goto err;
> +
> +					if (first_round) {
> +						dev_dbg(priv->dev, "tsens controller recovered\n");
> +						goto sensor_read;
> +					}
> +				} while (time_before(jiffies, timeout));
> +
> +				/*
> +				 * tsens controller did not recover,
> +				 * proceed with SCM call to re-init it
> +				 */
> +				if (atomic_read(&in_tsens_reinit)) {
> +					dev_dbg(priv->dev, "tsens re-init is in progress\n");
> +					ret = -EAGAIN;
> +					goto err;
> +				}
> +
> +				atomic_set(&in_tsens_reinit, 1);

Afaict nothing prevents two different processes to run the remainder of
the recovery in parallel. I think you need some locking here.

> +
> +				/*
> +				 * Invoke scm call only if SW register write is
> +				 * reflecting in controller. Try it for 2 ms.
> +				 */
> +				timeout = jiffies + msecs_to_jiffies(RESET_TIMEOUT_MS);
> +				do {
> +					ret = regmap_field_write(priv->rf[INT_EN], BIT(2));

Do we know what BIT(2) is and would we be allowed to give it a define?

> +					if (ret)
> +						goto err_unset;
> +
> +					ret = regmap_field_read(priv->rf[INT_EN], &sw_reg);
> +					if (ret)
> +						goto err_unset;
> +
> +					if (!(sw_reg & BIT(2)))
> +						continue;

Why not:

} while (sw_reg & BIT(2) && time_before(jiffies, timeout));

> +				} while (time_before(jiffies, timeout));
> +
> +				if (!(sw_reg & BIT(2))) {
> +					ret = -ENOTRECOVERABLE;
> +					goto err_unset;
> +				}
> +
> +				ret = qcom_scm_tsens_reinit(&tsens_ret);
> +				if (ret || tsens_ret) {
> +					dev_err(priv->dev, "tsens reinit scm call failed (%d : %d)\n",
> +							ret, tsens_ret);
> +					if (tsens_ret)
> +						ret = -ENOTRECOVERABLE;

If that's the api for the SCM, feel free to move the -ENOTRECOVERABLE to
the scm function.

> +
> +					goto err_unset;
> +				}
> +
> +				/* After the SCM call, we need to re-enable
> +				 * the interrupts and also set active threshold
> +				 * for each sensor.
> +				 */
> +				list_for_each_entry(priv_reinit,
> +						&tsens_device_list, list) {
> +					ret = tsens_reenable_hw_after_scm(priv_reinit);
> +					if (ret) {
> +						dev_err(priv->dev,
> +							"tsens re-enable after scm call failed (%d)\n",
> +							ret);
> +						ret = -ENOTRECOVERABLE;
> +						goto err_unset;
> +					}
> +				}
> +
> +				atomic_set(&in_tsens_reinit, 0);
> +
> +				/* Notify reinit wa worker */
> +				list_for_each_entry(priv_reinit,

Do you need to loop twice over the tsens instances?

> +						&tsens_device_list, list) {
> +					queue_work(priv_reinit->reinit_wa_worker,
> +							&priv_reinit->reinit_wa_notify);
> +				}
> +			}
> +		}
> +	}
> +
> +sensor_read:
>  	/* Valid bit is 0 for 6 AHB clock cycles.
>  	 * At 19.2MHz, 1 AHB clock is ~60ns.
>  	 * We should enter this loop very, very rarely.
> @@ -623,6 +766,12 @@ int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp)
>  	*temp = tsens_hw_to_mC(s, temp_idx);
>  
>  	return 0;
> +
> +err_unset:
> +	atomic_set(&in_tsens_reinit, 0);
> +
> +err:
> +	return ret;
>  }
>  
>  int get_temp_common(const struct tsens_sensor *s, int *temp)
> @@ -860,6 +1009,14 @@ int __init init_common(struct tsens_priv *priv)
>  		goto err_put_device;
>  	}
>  
> +	priv->rf[FIRST_ROUND_COMPLETE] = devm_regmap_field_alloc(dev,
> +								priv->tm_map,
> +								priv->fields[FIRST_ROUND_COMPLETE]);
> +	if (IS_ERR(priv->rf[FIRST_ROUND_COMPLETE])) {
> +		ret = PTR_ERR(priv->rf[FIRST_ROUND_COMPLETE]);
> +		goto err_put_device;
> +	}
> +
>  	/* This loop might need changes if enum regfield_ids is reordered */
>  	for (j = LAST_TEMP_0; j <= UP_THRESH_15; j += 16) {
>  		for (i = 0; i < priv->feat->max_sensors; i++) {
> @@ -1097,6 +1254,43 @@ static int tsens_register(struct tsens_priv *priv)
>  	return ret;
>  }
>  
> +static void tsens_reinit_worker_notify(struct work_struct *work)
> +{
> +	int i, ret, temp;

priv->num_sensors is unsigned, so i could be too.

> +	struct tsens_irq_data d;
> +	struct tsens_priv *priv = container_of(work, struct tsens_priv,
> +					       reinit_wa_notify);
> +
> +	for (i = 0; i < priv->num_sensors; i++) {
> +		const struct tsens_sensor *s = &priv->sensor[i];
> +		u32 hw_id = s->hw_id;
> +
> +		if (!s->tzd)
> +			continue;
> +		if (!tsens_threshold_violated(priv, hw_id, &d))
> +			continue;
> +
> +		ret = get_temp_tsens_valid(s, &temp);
> +		if (ret) {
> +			dev_err(priv->dev, "[%u] %s: error reading sensor\n",
> +				hw_id, __func__);

Please express yourself in the message, instead of using __func__.

> +			continue;
> +		}
> +
> +		tsens_read_irq_state(priv, hw_id, s, &d);
> +
> +		if ((d.up_thresh < temp) || (d.low_thresh > temp)) {
> +			dev_dbg(priv->dev, "[%u] %s: TZ update trigger (%d mC)\n",
> +				hw_id, __func__, temp);
> +			thermal_zone_device_update(s->tzd,
> +						   THERMAL_EVENT_UNSPECIFIED);

This is just 86 chars long, no need to wrap the line.

> +		} else {
> +			dev_dbg(priv->dev, "[%u] %s: no violation:  %d\n",

Double space after ':'

> +				hw_id, __func__, temp);
> +		}
> +	}
> +}
> +
>  static int tsens_probe(struct platform_device *pdev)
>  {
>  	int ret, i;
> @@ -1139,6 +1333,19 @@ static int tsens_probe(struct platform_device *pdev)
>  	priv->dev = dev;
>  	priv->num_sensors = num_sensors;
>  	priv->needs_reinit_wa = data->needs_reinit_wa;
> +
> +	if (priv->needs_reinit_wa && !qcom_scm_is_available())
> +		return -EPROBE_DEFER;
> +
> +	if (priv->needs_reinit_wa) {
> +		priv->reinit_wa_worker = alloc_workqueue("tsens_reinit_work",
> +							 WQ_HIGHPRI, 0);

Do you really need your own work queue for this, how about just
scheduling the work on system_highpri_wq?

Regards,
Bjorn
Konrad Dybcio July 19, 2022, 10:39 a.m. UTC | #6
On 18.07.2022 08:34, bhupesh.sharma@linaro.org wrote:
> Hi Konrad,
> 
> On 7/15/22 8:26 PM, Konrad Dybcio <konrad.dybcio@somainline.org> wrote:
>>
>>
>> On 1.07.2022 16:58, Bhupesh Sharma wrote:
>> > Since for some QCoM tsens controllers, its suggested to
>> > monitor the controller health periodically and in case an
>> > issue is detected, to re-initialize the tsens controller
>> > via trustzone, add the support for the same in the
>> > qcom tsens driver.
>> >
>> > Note that Once the tsens controller is reset using scm call,
>> > all SROT and TM region registers will enter the reset mode.
>> >
>> > While all the SROT registers will be re-programmed and
>> > re-enabled in trustzone prior to the scm call exit, the TM
>> > region registers will not re-initialized in trustzone and thus
>> > need to be handled by the tsens driver.
>> >
>> > Cc: Amit Kucheria <amitk@kernel.org>
>> > Cc: Thara Gopinath <thara.gopinath@gmail.com>
>> > Cc: linux-pm@vger.kernel.org
>> > Cc: linux-arm-msm@vger.kernel.org
>> > Signed-off-by: Bhupesh Sharma <bhupesh.sharma@linaro.org>
>> > Reported-by: kernel test robot <lkp@intel.com>
>> > ---
>> Hi, I think this should be also checked and applied on init. This
>> seems required for at least SM6375, as the controller starts (or
>> well, doesn't start...) in an unknown state and the driver does
>> not like it, as the TSENS_EN indicates it is disabled.
>> Downstream runs this right at probe..
> 
> Hmm.. very interesting. I was not aware of the SM6375 case, as for SM8150
> the controller starts in a valid state but may require reinit during operation.
> 
> So, I did not use the downstream approach to do it right at _probe() and then
> later while get_temp() is called.
> 
> Let me add that in v2. BTW do you want me to set the need_reinit_wa as true
> for SM6375 as well, or would you like to add that with a followup-patch ?
Please set it, I'll happily test it!

Konrad
> 
> Regards,
> Bhupesh
Bhupesh Sharma July 20, 2022, 8:16 a.m. UTC | #7
On 7/19/22 4:09 PM, Konrad Dybcio wrote:
> 
> 
> On 18.07.2022 08:34, bhupesh.sharma@linaro.org wrote:
>> Hi Konrad,
>>
>> On 7/15/22 8:26 PM, Konrad Dybcio <konrad.dybcio@somainline.org> wrote:
>>>
>>>
>>> On 1.07.2022 16:58, Bhupesh Sharma wrote:
>>>> Since for some QCoM tsens controllers, its suggested to
>>>> monitor the controller health periodically and in case an
>>>> issue is detected, to re-initialize the tsens controller
>>>> via trustzone, add the support for the same in the
>>>> qcom tsens driver.
>>>>
>>>> Note that Once the tsens controller is reset using scm call,
>>>> all SROT and TM region registers will enter the reset mode.
>>>>
>>>> While all the SROT registers will be re-programmed and
>>>> re-enabled in trustzone prior to the scm call exit, the TM
>>>> region registers will not re-initialized in trustzone and thus
>>>> need to be handled by the tsens driver.
>>>>
>>>> Cc: Amit Kucheria <amitk@kernel.org>
>>>> Cc: Thara Gopinath <thara.gopinath@gmail.com>
>>>> Cc: linux-pm@vger.kernel.org
>>>> Cc: linux-arm-msm@vger.kernel.org
>>>> Signed-off-by: Bhupesh Sharma <bhupesh.sharma@linaro.org>
>>>> Reported-by: kernel test robot <lkp@intel.com>
>>>> ---
>>> Hi, I think this should be also checked and applied on init. This
>>> seems required for at least SM6375, as the controller starts (or
>>> well, doesn't start...) in an unknown state and the driver does
>>> not like it, as the TSENS_EN indicates it is disabled.
>>> Downstream runs this right at probe..
>>
>> Hmm.. very interesting. I was not aware of the SM6375 case, as for SM8150
>> the controller starts in a valid state but may require reinit during operation.
>>
>> So, I did not use the downstream approach to do it right at _probe() and then
>> later while get_temp() is called.
>>
>> Let me add that in v2. BTW do you want me to set the need_reinit_wa as true
>> for SM6375 as well, or would you like to add that with a followup-patch ?
> Please set it, I'll happily test it!

Thanks. Will share v2 shortly.

Regards,
Bhupesh
Bhupesh Sharma July 20, 2022, 8:27 a.m. UTC | #8
Hi Bjorn,

Thanks for your review.

On 7/19/22 9:00 AM, Bjorn Andersson wrote:
> On Fri 01 Jul 09:58 CDT 2022, Bhupesh Sharma wrote:
> 
>> Since for some QCoM tsens controllers, its suggested to
>> monitor the controller health periodically and in case an
>> issue is detected, to re-initialize the tsens controller
>> via trustzone, add the support for the same in the
>> qcom tsens driver.
>>
>> Note that Once the tsens controller is reset using scm call,
>> all SROT and TM region registers will enter the reset mode.
>>
>> While all the SROT registers will be re-programmed and
>> re-enabled in trustzone prior to the scm call exit, the TM
>> region registers will not re-initialized in trustzone and thus
>> need to be handled by the tsens driver.
>>
>> Cc: Amit Kucheria <amitk@kernel.org>
>> Cc: Thara Gopinath <thara.gopinath@gmail.com>
>> Cc: linux-pm@vger.kernel.org
>> Cc: linux-arm-msm@vger.kernel.org
>> Signed-off-by: Bhupesh Sharma <bhupesh.sharma@linaro.org>
>> ---
>>   drivers/thermal/qcom/tsens-v2.c |   3 +
>>   drivers/thermal/qcom/tsens.c    | 237 +++++++++++++++++++++++++++++++-
>>   drivers/thermal/qcom/tsens.h    |   6 +
>>   3 files changed, 239 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/thermal/qcom/tsens-v2.c b/drivers/thermal/qcom/tsens-v2.c
>> index 61d38a56d29a..9bb542f16482 100644
>> --- a/drivers/thermal/qcom/tsens-v2.c
>> +++ b/drivers/thermal/qcom/tsens-v2.c
>> @@ -88,6 +88,9 @@ static const struct reg_field tsens_v2_regfields[MAX_REGFIELDS] = {
>>   
>>   	/* TRDY: 1=ready, 0=in progress */
>>   	[TRDY] = REG_FIELD(TM_TRDY_OFF, 0, 0),
>> +
>> +	/* FIRST_ROUND_COMPLETE: 1=complete, 0=not complete */
>> +	[FIRST_ROUND_COMPLETE] = REG_FIELD(TM_TRDY_OFF, 3, 3),
>>   };
>>   
>>   static const struct tsens_ops ops_generic_v2 = {
>> diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c
>> index 97f4d4454f20..28d42ae0eb47 100644
>> --- a/drivers/thermal/qcom/tsens.c
>> +++ b/drivers/thermal/qcom/tsens.c
>> @@ -7,6 +7,7 @@
>>   #include <linux/debugfs.h>
>>   #include <linux/err.h>
>>   #include <linux/io.h>
>> +#include <linux/qcom_scm.h>
>>   #include <linux/module.h>
>>   #include <linux/nvmem-consumer.h>
>>   #include <linux/of.h>
>> @@ -21,6 +22,8 @@
>>   #include "../thermal_hwmon.h"
>>   #include "tsens.h"
>>   
>> +LIST_HEAD(tsens_device_list);
>> +
>>   /**
>>    * struct tsens_irq_data - IRQ status and temperature violations
>>    * @up_viol:        upper threshold violated
>> @@ -594,19 +597,159 @@ static void tsens_disable_irq(struct tsens_priv *priv)
>>   	regmap_field_write(priv->rf[INT_EN], 0);
>>   }
>>   
>> +static int tsens_reenable_hw_after_scm(struct tsens_priv *priv)
>> +{
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&priv->ul_lock, flags);
>> +
>> +	/* Re-enable watchdog, unmask the bark and
>> +	 * disable cycle completion monitoring.
>> +	 */
>> +	regmap_field_write(priv->rf[WDOG_BARK_CLEAR], 1);
>> +	regmap_field_write(priv->rf[WDOG_BARK_CLEAR], 0);
>> +	regmap_field_write(priv->rf[WDOG_BARK_MASK], 0);
>> +	regmap_field_write(priv->rf[CC_MON_MASK], 1);
>> +
>> +	/* Re-enable interrupts */
>> +	tsens_enable_irq(priv);
>> +
>> +	spin_unlock_irqrestore(&priv->ul_lock, flags);
>> +
>> +	return 0;
>> +}
>> +
>>   int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp)
>>   {
>> -	struct tsens_priv *priv = s->priv;
>> +	struct tsens_priv *priv = s->priv, *priv_reinit;
>>   	int hw_id = s->hw_id;
>>   	u32 temp_idx = LAST_TEMP_0 + hw_id;
>>   	u32 valid_idx = VALID_0 + hw_id;
>>   	u32 valid;
>> -	int ret;
>> +	int ret, trdy, first_round, tsens_ret, sw_reg;
>> +	unsigned long timeout;
>> +	static atomic_t in_tsens_reinit;
> 
> This is a global state, I suggest you move it to the top of the file to
> make that obvious.

Sure.

>>   	/* VER_0 doesn't have VALID bit */
>>   	if (tsens_version(priv) == VER_0)
>>   		goto get_temp;
>>   
>> +	/* For some tsens controllers, its suggested to
>> +	 * monitor the controller health periodically
>> +	 * and in case an issue is detected to reinit
>> +	 * tsens controller via trustzone.
>> +	 */
>> +	if (priv->needs_reinit_wa) {
> 
> I would suggest that you move all this entire block to a separate
> function, maybe something:
> 
> int tsens_health_check_and_reinit()

Ok. Will fix in v2.

>> +		/* First check if TRDY is SET */
>> +		timeout = jiffies + usecs_to_jiffies(TIMEOUT_US);
>> +		do {
>> +			ret = regmap_field_read(priv->rf[TRDY], &trdy);
>> +			if (ret)
>> +				goto err;
>> +			if (!trdy)
>> +				continue;
>> +		} while (time_before(jiffies, timeout));
> 
> This looks like a regmap_field_read()

Not sure, I completely understand this comment. Can you please elaborate?

>> +
>> +		if (!trdy) {
>> +			ret = regmap_field_read(priv->rf[FIRST_ROUND_COMPLETE], &first_round);
>> +			if (ret)
>> +				goto err;
>> +
>> +			if (!first_round) {
>> +				if (atomic_read(&in_tsens_reinit)) {
>> +					dev_dbg(priv->dev, "tsens re-init is in progress\n");
>> +					ret = -EAGAIN;
> 
> Is it preferred to return -EAGAIN here, over just serializing this whole
> thing using a mutex?

Right, using a mutex to serialize here makes sense. Will fix in v2.

>> +					goto err;
>> +				}
>> +
>> +				/* Wait for 2 ms for tsens controller to recover */
>> +				timeout = jiffies + msecs_to_jiffies(RESET_TIMEOUT_MS);
>> +				do {
>> +					ret = regmap_field_read(priv->rf[FIRST_ROUND_COMPLETE],
>> +								&first_round);
>> +					if (ret)
>> +						goto err;
>> +
>> +					if (first_round) {
>> +						dev_dbg(priv->dev, "tsens controller recovered\n");
>> +						goto sensor_read;
>> +					}
>> +				} while (time_before(jiffies, timeout));
>> +
>> +				/*
>> +				 * tsens controller did not recover,
>> +				 * proceed with SCM call to re-init it
>> +				 */
>> +				if (atomic_read(&in_tsens_reinit)) {
>> +					dev_dbg(priv->dev, "tsens re-init is in progress\n");
>> +					ret = -EAGAIN;
>> +					goto err;
>> +				}
>> +
>> +				atomic_set(&in_tsens_reinit, 1);
> 
> Afaict nothing prevents two different processes to run the remainder of
> the recovery in parallel. I think you need some locking here.

Ack.

>> +
>> +				/*
>> +				 * Invoke scm call only if SW register write is
>> +				 * reflecting in controller. Try it for 2 ms.
>> +				 */
>> +				timeout = jiffies + msecs_to_jiffies(RESET_TIMEOUT_MS);
>> +				do {
>> +					ret = regmap_field_write(priv->rf[INT_EN], BIT(2));
> 
> Do we know what BIT(2) is and would we be allowed to give it a define?

Sure, I will add a define here.

>> +					if (ret)
>> +						goto err_unset;
>> +
>> +					ret = regmap_field_read(priv->rf[INT_EN], &sw_reg);
>> +					if (ret)
>> +						goto err_unset;
>> +
>> +					if (!(sw_reg & BIT(2)))
>> +						continue;
> 
> Why not:
> 
> } while (sw_reg & BIT(2) && time_before(jiffies, timeout));

Sure.

>> +				} while (time_before(jiffies, timeout));
>> +
>> +				if (!(sw_reg & BIT(2))) {
>> +					ret = -ENOTRECOVERABLE;
>> +					goto err_unset;
>> +				}
>> +
>> +				ret = qcom_scm_tsens_reinit(&tsens_ret);
>> +				if (ret || tsens_ret) {
>> +					dev_err(priv->dev, "tsens reinit scm call failed (%d : %d)\n",
>> +							ret, tsens_ret);
>> +					if (tsens_ret)
>> +						ret = -ENOTRECOVERABLE;
> 
> If that's the api for the SCM, feel free to move the -ENOTRECOVERABLE to
> the scm function.

Ok, let me check and fix this in v2.

>> +
>> +					goto err_unset;
>> +				}
>> +
>> +				/* After the SCM call, we need to re-enable
>> +				 * the interrupts and also set active threshold
>> +				 * for each sensor.
>> +				 */
>> +				list_for_each_entry(priv_reinit,
>> +						&tsens_device_list, list) {
>> +					ret = tsens_reenable_hw_after_scm(priv_reinit);
>> +					if (ret) {
>> +						dev_err(priv->dev,
>> +							"tsens re-enable after scm call failed (%d)\n",
>> +							ret);
>> +						ret = -ENOTRECOVERABLE;
>> +						goto err_unset;
>> +					}
>> +				}
>> +
>> +				atomic_set(&in_tsens_reinit, 0);
>> +
>> +				/* Notify reinit wa worker */
>> +				list_for_each_entry(priv_reinit,
> 
> Do you need to loop twice over the tsens instances?
> 
>> +						&tsens_device_list, list) {
>> +					queue_work(priv_reinit->reinit_wa_worker,
>> +							&priv_reinit->reinit_wa_notify);
>> +				}
>> +			}
>> +		}
>> +	}
>> +
>> +sensor_read:
>>   	/* Valid bit is 0 for 6 AHB clock cycles.
>>   	 * At 19.2MHz, 1 AHB clock is ~60ns.
>>   	 * We should enter this loop very, very rarely.
>> @@ -623,6 +766,12 @@ int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp)
>>   	*temp = tsens_hw_to_mC(s, temp_idx);
>>   
>>   	return 0;
>> +
>> +err_unset:
>> +	atomic_set(&in_tsens_reinit, 0);
>> +
>> +err:
>> +	return ret;
>>   }
>>   
>>   int get_temp_common(const struct tsens_sensor *s, int *temp)
>> @@ -860,6 +1009,14 @@ int __init init_common(struct tsens_priv *priv)
>>   		goto err_put_device;
>>   	}
>>   
>> +	priv->rf[FIRST_ROUND_COMPLETE] = devm_regmap_field_alloc(dev,
>> +								priv->tm_map,
>> +								priv->fields[FIRST_ROUND_COMPLETE]);
>> +	if (IS_ERR(priv->rf[FIRST_ROUND_COMPLETE])) {
>> +		ret = PTR_ERR(priv->rf[FIRST_ROUND_COMPLETE]);
>> +		goto err_put_device;
>> +	}
>> +
>>   	/* This loop might need changes if enum regfield_ids is reordered */
>>   	for (j = LAST_TEMP_0; j <= UP_THRESH_15; j += 16) {
>>   		for (i = 0; i < priv->feat->max_sensors; i++) {
>> @@ -1097,6 +1254,43 @@ static int tsens_register(struct tsens_priv *priv)
>>   	return ret;
>>   }
>>   
>> +static void tsens_reinit_worker_notify(struct work_struct *work)
>> +{
>> +	int i, ret, temp;
> 
> priv->num_sensors is unsigned, so i could be too.

Ok.

>> +	struct tsens_irq_data d;
>> +	struct tsens_priv *priv = container_of(work, struct tsens_priv,
>> +					       reinit_wa_notify);
>> +
>> +	for (i = 0; i < priv->num_sensors; i++) {
>> +		const struct tsens_sensor *s = &priv->sensor[i];
>> +		u32 hw_id = s->hw_id;
>> +
>> +		if (!s->tzd)
>> +			continue;
>> +		if (!tsens_threshold_violated(priv, hw_id, &d))
>> +			continue;
>> +
>> +		ret = get_temp_tsens_valid(s, &temp);
>> +		if (ret) {
>> +			dev_err(priv->dev, "[%u] %s: error reading sensor\n",
>> +				hw_id, __func__);
> 
> Please express yourself in the message, instead of using __func__.

This was a reuse from the existing tsens irq handler code, but I agree.
Let me fix it in v2.

>> +			continue;
>> +		}
>> +
>> +		tsens_read_irq_state(priv, hw_id, s, &d);
>> +
>> +		if ((d.up_thresh < temp) || (d.low_thresh > temp)) {
>> +			dev_dbg(priv->dev, "[%u] %s: TZ update trigger (%d mC)\n",
>> +				hw_id, __func__, temp);
>> +			thermal_zone_device_update(s->tzd,
>> +						   THERMAL_EVENT_UNSPECIFIED);
> 
> This is just 86 chars long, no need to wrap the line.

Sure.

>> +		} else {
>> +			dev_dbg(priv->dev, "[%u] %s: no violation:  %d\n",
> 
> Double space after ':'

Again this is a reuse from the existing tsens irq handler code, but I 
agree. Let me fix it in v2.

>> +				hw_id, __func__, temp);
>> +		}
>> +	}
>> +}
>> +
>>   static int tsens_probe(struct platform_device *pdev)
>>   {
>>   	int ret, i;
>> @@ -1139,6 +1333,19 @@ static int tsens_probe(struct platform_device *pdev)
>>   	priv->dev = dev;
>>   	priv->num_sensors = num_sensors;
>>   	priv->needs_reinit_wa = data->needs_reinit_wa;
>> +
>> +	if (priv->needs_reinit_wa && !qcom_scm_is_available())
>> +		return -EPROBE_DEFER;
>> +
>> +	if (priv->needs_reinit_wa) {
>> +		priv->reinit_wa_worker = alloc_workqueue("tsens_reinit_work",
>> +							 WQ_HIGHPRI, 0);
> 
> Do you really need your own work queue for this, how about just
> scheduling the work on system_highpri_wq?

Ok, let me use 'system_highpri_wq' in v2.

Regards,
Bhupesh
diff mbox series

Patch

diff --git a/drivers/thermal/qcom/tsens-v2.c b/drivers/thermal/qcom/tsens-v2.c
index 61d38a56d29a..9bb542f16482 100644
--- a/drivers/thermal/qcom/tsens-v2.c
+++ b/drivers/thermal/qcom/tsens-v2.c
@@ -88,6 +88,9 @@  static const struct reg_field tsens_v2_regfields[MAX_REGFIELDS] = {
 
 	/* TRDY: 1=ready, 0=in progress */
 	[TRDY] = REG_FIELD(TM_TRDY_OFF, 0, 0),
+
+	/* FIRST_ROUND_COMPLETE: 1=complete, 0=not complete */
+	[FIRST_ROUND_COMPLETE] = REG_FIELD(TM_TRDY_OFF, 3, 3),
 };
 
 static const struct tsens_ops ops_generic_v2 = {
diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c
index 97f4d4454f20..28d42ae0eb47 100644
--- a/drivers/thermal/qcom/tsens.c
+++ b/drivers/thermal/qcom/tsens.c
@@ -7,6 +7,7 @@ 
 #include <linux/debugfs.h>
 #include <linux/err.h>
 #include <linux/io.h>
+#include <linux/qcom_scm.h>
 #include <linux/module.h>
 #include <linux/nvmem-consumer.h>
 #include <linux/of.h>
@@ -21,6 +22,8 @@ 
 #include "../thermal_hwmon.h"
 #include "tsens.h"
 
+LIST_HEAD(tsens_device_list);
+
 /**
  * struct tsens_irq_data - IRQ status and temperature violations
  * @up_viol:        upper threshold violated
@@ -594,19 +597,159 @@  static void tsens_disable_irq(struct tsens_priv *priv)
 	regmap_field_write(priv->rf[INT_EN], 0);
 }
 
+static int tsens_reenable_hw_after_scm(struct tsens_priv *priv)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->ul_lock, flags);
+
+	/* Re-enable watchdog, unmask the bark and
+	 * disable cycle completion monitoring.
+	 */
+	regmap_field_write(priv->rf[WDOG_BARK_CLEAR], 1);
+	regmap_field_write(priv->rf[WDOG_BARK_CLEAR], 0);
+	regmap_field_write(priv->rf[WDOG_BARK_MASK], 0);
+	regmap_field_write(priv->rf[CC_MON_MASK], 1);
+
+	/* Re-enable interrupts */
+	tsens_enable_irq(priv);
+
+	spin_unlock_irqrestore(&priv->ul_lock, flags);
+
+	return 0;
+}
+
 int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp)
 {
-	struct tsens_priv *priv = s->priv;
+	struct tsens_priv *priv = s->priv, *priv_reinit;
 	int hw_id = s->hw_id;
 	u32 temp_idx = LAST_TEMP_0 + hw_id;
 	u32 valid_idx = VALID_0 + hw_id;
 	u32 valid;
-	int ret;
+	int ret, trdy, first_round, tsens_ret, sw_reg;
+	unsigned long timeout;
+	static atomic_t in_tsens_reinit;
 
 	/* VER_0 doesn't have VALID bit */
 	if (tsens_version(priv) == VER_0)
 		goto get_temp;
 
+	/* For some tsens controllers, its suggested to
+	 * monitor the controller health periodically
+	 * and in case an issue is detected to reinit
+	 * tsens controller via trustzone.
+	 */
+	if (priv->needs_reinit_wa) {
+		/* First check if TRDY is SET */
+		timeout = jiffies + usecs_to_jiffies(TIMEOUT_US);
+		do {
+			ret = regmap_field_read(priv->rf[TRDY], &trdy);
+			if (ret)
+				goto err;
+			if (!trdy)
+				continue;
+		} while (time_before(jiffies, timeout));
+
+		if (!trdy) {
+			ret = regmap_field_read(priv->rf[FIRST_ROUND_COMPLETE], &first_round);
+			if (ret)
+				goto err;
+
+			if (!first_round) {
+				if (atomic_read(&in_tsens_reinit)) {
+					dev_dbg(priv->dev, "tsens re-init is in progress\n");
+					ret = -EAGAIN;
+					goto err;
+				}
+
+				/* Wait for 2 ms for tsens controller to recover */
+				timeout = jiffies + msecs_to_jiffies(RESET_TIMEOUT_MS);
+				do {
+					ret = regmap_field_read(priv->rf[FIRST_ROUND_COMPLETE],
+								&first_round);
+					if (ret)
+						goto err;
+
+					if (first_round) {
+						dev_dbg(priv->dev, "tsens controller recovered\n");
+						goto sensor_read;
+					}
+				} while (time_before(jiffies, timeout));
+
+				/*
+				 * tsens controller did not recover,
+				 * proceed with SCM call to re-init it
+				 */
+				if (atomic_read(&in_tsens_reinit)) {
+					dev_dbg(priv->dev, "tsens re-init is in progress\n");
+					ret = -EAGAIN;
+					goto err;
+				}
+
+				atomic_set(&in_tsens_reinit, 1);
+
+				/*
+				 * Invoke scm call only if SW register write is
+				 * reflecting in controller. Try it for 2 ms.
+				 */
+				timeout = jiffies + msecs_to_jiffies(RESET_TIMEOUT_MS);
+				do {
+					ret = regmap_field_write(priv->rf[INT_EN], BIT(2));
+					if (ret)
+						goto err_unset;
+
+					ret = regmap_field_read(priv->rf[INT_EN], &sw_reg);
+					if (ret)
+						goto err_unset;
+
+					if (!(sw_reg & BIT(2)))
+						continue;
+				} while (time_before(jiffies, timeout));
+
+				if (!(sw_reg & BIT(2))) {
+					ret = -ENOTRECOVERABLE;
+					goto err_unset;
+				}
+
+				ret = qcom_scm_tsens_reinit(&tsens_ret);
+				if (ret || tsens_ret) {
+					dev_err(priv->dev, "tsens reinit scm call failed (%d : %d)\n",
+							ret, tsens_ret);
+					if (tsens_ret)
+						ret = -ENOTRECOVERABLE;
+
+					goto err_unset;
+				}
+
+				/* After the SCM call, we need to re-enable
+				 * the interrupts and also set active threshold
+				 * for each sensor.
+				 */
+				list_for_each_entry(priv_reinit,
+						&tsens_device_list, list) {
+					ret = tsens_reenable_hw_after_scm(priv_reinit);
+					if (ret) {
+						dev_err(priv->dev,
+							"tsens re-enable after scm call failed (%d)\n",
+							ret);
+						ret = -ENOTRECOVERABLE;
+						goto err_unset;
+					}
+				}
+
+				atomic_set(&in_tsens_reinit, 0);
+
+				/* Notify reinit wa worker */
+				list_for_each_entry(priv_reinit,
+						&tsens_device_list, list) {
+					queue_work(priv_reinit->reinit_wa_worker,
+							&priv_reinit->reinit_wa_notify);
+				}
+			}
+		}
+	}
+
+sensor_read:
 	/* Valid bit is 0 for 6 AHB clock cycles.
 	 * At 19.2MHz, 1 AHB clock is ~60ns.
 	 * We should enter this loop very, very rarely.
@@ -623,6 +766,12 @@  int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp)
 	*temp = tsens_hw_to_mC(s, temp_idx);
 
 	return 0;
+
+err_unset:
+	atomic_set(&in_tsens_reinit, 0);
+
+err:
+	return ret;
 }
 
 int get_temp_common(const struct tsens_sensor *s, int *temp)
@@ -860,6 +1009,14 @@  int __init init_common(struct tsens_priv *priv)
 		goto err_put_device;
 	}
 
+	priv->rf[FIRST_ROUND_COMPLETE] = devm_regmap_field_alloc(dev,
+								priv->tm_map,
+								priv->fields[FIRST_ROUND_COMPLETE]);
+	if (IS_ERR(priv->rf[FIRST_ROUND_COMPLETE])) {
+		ret = PTR_ERR(priv->rf[FIRST_ROUND_COMPLETE]);
+		goto err_put_device;
+	}
+
 	/* This loop might need changes if enum regfield_ids is reordered */
 	for (j = LAST_TEMP_0; j <= UP_THRESH_15; j += 16) {
 		for (i = 0; i < priv->feat->max_sensors; i++) {
@@ -1097,6 +1254,43 @@  static int tsens_register(struct tsens_priv *priv)
 	return ret;
 }
 
+static void tsens_reinit_worker_notify(struct work_struct *work)
+{
+	int i, ret, temp;
+	struct tsens_irq_data d;
+	struct tsens_priv *priv = container_of(work, struct tsens_priv,
+					       reinit_wa_notify);
+
+	for (i = 0; i < priv->num_sensors; i++) {
+		const struct tsens_sensor *s = &priv->sensor[i];
+		u32 hw_id = s->hw_id;
+
+		if (!s->tzd)
+			continue;
+		if (!tsens_threshold_violated(priv, hw_id, &d))
+			continue;
+
+		ret = get_temp_tsens_valid(s, &temp);
+		if (ret) {
+			dev_err(priv->dev, "[%u] %s: error reading sensor\n",
+				hw_id, __func__);
+			continue;
+		}
+
+		tsens_read_irq_state(priv, hw_id, s, &d);
+
+		if ((d.up_thresh < temp) || (d.low_thresh > temp)) {
+			dev_dbg(priv->dev, "[%u] %s: TZ update trigger (%d mC)\n",
+				hw_id, __func__, temp);
+			thermal_zone_device_update(s->tzd,
+						   THERMAL_EVENT_UNSPECIFIED);
+		} else {
+			dev_dbg(priv->dev, "[%u] %s: no violation:  %d\n",
+				hw_id, __func__, temp);
+		}
+	}
+}
+
 static int tsens_probe(struct platform_device *pdev)
 {
 	int ret, i;
@@ -1139,6 +1333,19 @@  static int tsens_probe(struct platform_device *pdev)
 	priv->dev = dev;
 	priv->num_sensors = num_sensors;
 	priv->needs_reinit_wa = data->needs_reinit_wa;
+
+	if (priv->needs_reinit_wa && !qcom_scm_is_available())
+		return -EPROBE_DEFER;
+
+	if (priv->needs_reinit_wa) {
+		priv->reinit_wa_worker = alloc_workqueue("tsens_reinit_work",
+							 WQ_HIGHPRI, 0);
+		if (!priv->reinit_wa_worker)
+			return -ENOMEM;
+
+		INIT_WORK(&priv->reinit_wa_notify, tsens_reinit_worker_notify);
+	}
+
 	priv->ops = data->ops;
 	for (i = 0;  i < priv->num_sensors; i++) {
 		if (data->hw_ids)
@@ -1151,13 +1358,15 @@  static int tsens_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, priv);
 
-	if (!priv->ops || !priv->ops->init || !priv->ops->get_temp)
-		return -EINVAL;
+	if (!priv->ops || !priv->ops->init || !priv->ops->get_temp) {
+		ret = -EINVAL;
+		goto free_wq;
+	}
 
 	ret = priv->ops->init(priv);
 	if (ret < 0) {
 		dev_err(dev, "%s: init failed\n", __func__);
-		return ret;
+		goto free_wq;
 	}
 
 	if (priv->ops->calibrate) {
@@ -1165,11 +1374,23 @@  static int tsens_probe(struct platform_device *pdev)
 		if (ret < 0) {
 			if (ret != -EPROBE_DEFER)
 				dev_err(dev, "%s: calibration failed\n", __func__);
-			return ret;
+
+			goto free_wq;
 		}
 	}
 
-	return tsens_register(priv);
+	ret = tsens_register(priv);
+	if (ret < 0) {
+		dev_err(dev, "%s: registration failed\n", __func__);
+		goto free_wq;
+	}
+
+	list_add_tail(&priv->list, &tsens_device_list);
+	return 0;
+
+free_wq:
+	destroy_workqueue(priv->reinit_wa_worker);
+	return ret;
 }
 
 static int tsens_remove(struct platform_device *pdev)
@@ -1181,6 +1402,8 @@  static int tsens_remove(struct platform_device *pdev)
 	if (priv->ops->disable)
 		priv->ops->disable(priv);
 
+	destroy_workqueue(priv->reinit_wa_worker);
+
 	return 0;
 }
 
diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h
index 48a7bda902c1..c7279a50cf9b 100644
--- a/drivers/thermal/qcom/tsens.h
+++ b/drivers/thermal/qcom/tsens.h
@@ -14,6 +14,7 @@ 
 #define SLOPE_FACTOR		1000
 #define SLOPE_DEFAULT		3200
 #define TIMEOUT_US		100
+#define RESET_TIMEOUT_MS	2
 #define THRESHOLD_MAX_ADC_CODE	0x3ff
 #define THRESHOLD_MIN_ADC_CODE	0x0
 
@@ -167,6 +168,7 @@  enum regfield_ids {
 	/* ----- TM ------ */
 	/* TRDY */
 	TRDY,
+	FIRST_ROUND_COMPLETE,
 	/* INTERRUPT ENABLE */
 	INT_EN,	/* v2+ has separate enables for crit, upper and lower irq */
 	/* STATUS */
@@ -565,6 +567,10 @@  struct tsens_priv {
 	struct regmap			*srot_map;
 	u32				tm_offset;
 	bool				needs_reinit_wa;
+	struct workqueue_struct		*reinit_wa_worker;
+	struct work_struct		reinit_wa_notify;
+
+	struct list_head		list;
 
 	/* lock for upper/lower threshold interrupts */
 	spinlock_t			ul_lock;